Fix total review count detection - use robust selector on Overview tab
- Detect total BEFORE clicking reviews tab (element is on Overview)
- Use span[role="img"][aria-label*="review"] (robust, no class names)
- Extract count from aria-label (e.g., "260 reviews" → 260)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -234,6 +234,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
break
|
break
|
||||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||||
|
|
||||||
|
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
||||||
|
# ROBUST: Use aria-label="X reviews" on span[role="img"]
|
||||||
|
total_reviews = None
|
||||||
|
try:
|
||||||
|
total_reviews = driver.execute_script("""
|
||||||
|
// ROBUST: Find span[role="img"][aria-label*="review"] - contains "(X)" text
|
||||||
|
// aria-label format: "260 reviews" or "1,234 reviews"
|
||||||
|
var reviewSpans = document.querySelectorAll('span[role="img"][aria-label*="review"]');
|
||||||
|
for (var i = 0; i < reviewSpans.length; i++) {
|
||||||
|
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||||
|
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||||
|
if (match) {
|
||||||
|
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
""")
|
||||||
|
if total_reviews:
|
||||||
|
print(f"📊 Total reviews on page: {total_reviews}")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Click reviews tab - poll until found
|
# Click reviews tab - poll until found
|
||||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||||
start = time.time()
|
start = time.time()
|
||||||
@@ -299,47 +321,6 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
print("✅ Found scroll container")
|
print("✅ Found scroll container")
|
||||||
|
|
||||||
# Extract total review count from page (look in specific places)
|
|
||||||
total_reviews = None
|
|
||||||
try:
|
|
||||||
total_reviews = driver.execute_script("""
|
|
||||||
// Method 1: Sum up star rating counts (most accurate)
|
|
||||||
// Look for aria-labels like "5 stars, 171 reviews"
|
|
||||||
var total = 0;
|
|
||||||
var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]');
|
|
||||||
if (starLabels.length >= 5) {
|
|
||||||
for (var i = 0; i < starLabels.length; i++) {
|
|
||||||
var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i);
|
|
||||||
if (match) total += parseInt(match[1]);
|
|
||||||
}
|
|
||||||
if (total > 0) return total;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Method 2: Look in reviews tab text (e.g., "Reviews (247)")
|
|
||||||
var tabs = document.querySelectorAll('button[role="tab"]');
|
|
||||||
for (var i = 0; i < tabs.length; i++) {
|
|
||||||
var text = tabs[i].textContent || '';
|
|
||||||
if (/review|reseña/i.test(text)) {
|
|
||||||
var match = text.match(/\\(([\\d,\\.]+)\\)/);
|
|
||||||
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Method 3: Look for "X reviews" near rating
|
|
||||||
var spans = document.querySelectorAll('span');
|
|
||||||
for (var i = 0; i < spans.length; i++) {
|
|
||||||
var text = spans[i].textContent || '';
|
|
||||||
var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i);
|
|
||||||
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
|
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
|
||||||
""")
|
|
||||||
if total_reviews:
|
|
||||||
print(f"📊 Total reviews on page: {total_reviews}")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# PHASE 2: Inject API interceptor for scroll-loaded reviews
|
# PHASE 2: Inject API interceptor for scroll-loaded reviews
|
||||||
print("🔌 Injecting API interceptor...")
|
print("🔌 Injecting API interceptor...")
|
||||||
driver.execute_script("""
|
driver.execute_script("""
|
||||||
|
|||||||
Reference in New Issue
Block a user