From b4fae38027965c870107e92095f881d96e026efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:30:17 +0000 Subject: [PATCH] Add polling for total count detection on page load - Poll for up to 5s waiting for span[role="img"][aria-label*="review"] - Element may not be present immediately after consent handling - Tested: Soho Club 247/247 reviews in 31.4s with correct total Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 3dacea2..4abef09 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -236,25 +236,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Extract total review count BEFORE clicking reviews tab (it's on Overview) # ROBUST: Use aria-label="X reviews" on span[role="img"] + # Poll for up to 5s since page might still be loading after consent total_reviews = None - try: - total_reviews = driver.execute_script(""" - // ROBUST: Find span[role="img"][aria-label*="review"] - contains "(X)" text - // aria-label format: "260 reviews" or "1,234 reviews" - var reviewSpans = document.querySelectorAll('span[role="img"][aria-label*="review"]'); - for (var i = 0; i < reviewSpans.length; i++) { - var label = reviewSpans[i].getAttribute('aria-label') || ''; - var match = label.match(/^([\\d,\\.]+)\\s*review/i); - if (match) { - return parseInt(match[1].replace(/[,\\.]/g, '')); + start = time.time() + while time.time() - start < 5: + try: + total_reviews = driver.execute_script(""" + // ROBUST: Find span[role="img"] with aria-label starting with number + "review" + var reviewSpans = document.querySelectorAll('span[role="img"]'); + for (var i = 0; i < reviewSpans.length; i++) { + var label = reviewSpans[i].getAttribute('aria-label') || ''; + var match = label.match(/^([\\d,\\.]+)\\s*review/i); + if (match) { + return parseInt(match[1].replace(/[,\\.]/g, '')); + } } - } - return null; - """) - if total_reviews: - print(f"📊 Total reviews on page: {total_reviews}") - except: - pass + return null; + """) + if total_reviews: + print(f"📊 Total reviews on page: {total_reviews}") + break + except: + pass + time.sleep(0.1) # Click reviews tab - poll until found review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]