Add polling for total review count detection on page load

- Poll for up to 5s for the span[role="img"][aria-label*="review"] element to appear
- The element may not be present immediately after consent handling
- Tested: Soho Club 247/247 reviews in 31.4s with correct total

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 12:30:17 +00:00
parent 94240ef2cc
commit b4fae38027

View File

@@ -236,12 +236,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
# ROBUST: Use aria-label="X reviews" on span[role="img"]
# Poll for up to 5s since page might still be loading after consent
total_reviews = None
start = time.time()
while time.time() - start < 5:
try:
total_reviews = driver.execute_script("""
// ROBUST: Find span[role="img"][aria-label*="review"] - contains "(X)" text
// aria-label format: "260 reviews" or "1,234 reviews"
var reviewSpans = document.querySelectorAll('span[role="img"][aria-label*="review"]');
// ROBUST: Find span[role="img"] with aria-label starting with number + "review"
var reviewSpans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < reviewSpans.length; i++) {
var label = reviewSpans[i].getAttribute('aria-label') || '';
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
@@ -253,8 +255,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
""")
if total_reviews:
print(f"📊 Total reviews on page: {total_reviews}")
break
except:
pass
time.sleep(0.1)
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]