Fix total review count detection - use robust selector on Overview tab

- Detect total BEFORE clicking reviews tab (element is on Overview)
- Use span[role="img"][aria-label*="review"] (robust: does not depend on class names)
- Extract count from aria-label (e.g., "260 reviews" → 260)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 12:23:00 +00:00
parent 10b32244d7
commit 94240ef2cc

View File

@@ -234,6 +234,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break break
time.sleep(0.01) # 10ms - responsive but low CPU time.sleep(0.01) # 10ms - responsive but low CPU
# Extract total review count BEFORE clicking the reviews tab (the count element is on the Overview tab)
# ROBUST: Use aria-label="X reviews" on span[role="img"]
total_reviews = None
try:
total_reviews = driver.execute_script("""
// ROBUST: Find span[role="img"][aria-label*="review"] - contains "(X)" text
// aria-label format: "260 reviews" or "1,234 reviews"
var reviewSpans = document.querySelectorAll('span[role="img"][aria-label*="review"]');
for (var i = 0; i < reviewSpans.length; i++) {
var label = reviewSpans[i].getAttribute('aria-label') || '';
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
if (match) {
return parseInt(match[1].replace(/[,\\.]/g, ''));
}
}
return null;
""")
if total_reviews:
print(f"📊 Total reviews on page: {total_reviews}")
except:
pass
# Click reviews tab - poll until found # Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time() start = time.time()
@@ -299,47 +321,6 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
print("✅ Found scroll container") print("✅ Found scroll container")
# Extract total review count from page (look in specific places)
total_reviews = None
try:
total_reviews = driver.execute_script("""
// Method 1: Sum up star rating counts (most accurate)
// Look for aria-labels like "5 stars, 171 reviews"
var total = 0;
var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]');
if (starLabels.length >= 5) {
for (var i = 0; i < starLabels.length; i++) {
var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i);
if (match) total += parseInt(match[1]);
}
if (total > 0) return total;
}
// Method 2: Look in reviews tab text (e.g., "Reviews (247)")
var tabs = document.querySelectorAll('button[role="tab"]');
for (var i = 0; i < tabs.length; i++) {
var text = tabs[i].textContent || '';
if (/review|reseña/i.test(text)) {
var match = text.match(/\\(([\\d,\\.]+)\\)/);
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
}
}
// Method 3: Look for "X reviews" near rating
var spans = document.querySelectorAll('span');
for (var i = 0; i < spans.length; i++) {
var text = spans[i].textContent || '';
var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i);
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
}
return null;
""")
if total_reviews:
print(f"📊 Total reviews on page: {total_reviews}")
except:
pass
# PHASE 2: Inject API interceptor for scroll-loaded reviews # PHASE 2: Inject API interceptor for scroll-loaded reviews
print("🔌 Injecting API interceptor...") print("🔌 Injecting API interceptor...")
driver.execute_script(""" driver.execute_script("""