Add polling for total count detection on page load

- Poll for up to 5s for a span[role="img"] whose aria-label starts with the review count (e.g. "260 reviews")
- Element may not be present immediately after consent handling
- Tested on Soho Club: 247/247 reviews in 31.4s, with the correct total detected

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 12:30:17 +00:00
parent 94240ef2cc
commit b4fae38027

View File

@@ -236,25 +236,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Extract total review count BEFORE clicking reviews tab (it's on Overview) # Extract total review count BEFORE clicking reviews tab (it's on Overview)
# ROBUST: Use aria-label="X reviews" on span[role="img"] # ROBUST: Use aria-label="X reviews" on span[role="img"]
# Poll for up to 5s since page might still be loading after consent
total_reviews = None total_reviews = None
try: start = time.time()
total_reviews = driver.execute_script(""" while time.time() - start < 5:
// ROBUST: Find span[role="img"][aria-label*="review"] - contains "(X)" text try:
// aria-label format: "260 reviews" or "1,234 reviews" total_reviews = driver.execute_script("""
var reviewSpans = document.querySelectorAll('span[role="img"][aria-label*="review"]'); // ROBUST: Find span[role="img"] with aria-label starting with number + "review"
for (var i = 0; i < reviewSpans.length; i++) { var reviewSpans = document.querySelectorAll('span[role="img"]');
var label = reviewSpans[i].getAttribute('aria-label') || ''; for (var i = 0; i < reviewSpans.length; i++) {
var match = label.match(/^([\\d,\\.]+)\\s*review/i); var label = reviewSpans[i].getAttribute('aria-label') || '';
if (match) { var match = label.match(/^([\\d,\\.]+)\\s*review/i);
return parseInt(match[1].replace(/[,\\.]/g, '')); if (match) {
return parseInt(match[1].replace(/[,\\.]/g, ''));
}
} }
} return null;
return null; """)
""") if total_reviews:
if total_reviews: print(f"📊 Total reviews on page: {total_reviews}")
print(f"📊 Total reviews on page: {total_reviews}") break
except: except:
pass pass
time.sleep(0.1)
# Click reviews tab - poll until found # Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]