diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 9772333..3dacea2 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -234,6 +234,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in break time.sleep(0.01) # 10ms - responsive but low CPU + # Extract total review count BEFORE clicking reviews tab (it's on Overview) + # ROBUST: Use aria-label="X reviews" on span[role="img"] + total_reviews = None + try: + total_reviews = driver.execute_script(""" + // ROBUST: Find span[role="img"][aria-label*="review"] - contains "(X)" text + // aria-label format: "260 reviews" or "1,234 reviews" + var reviewSpans = document.querySelectorAll('span[role="img"][aria-label*="review"]'); + for (var i = 0; i < reviewSpans.length; i++) { + var label = reviewSpans[i].getAttribute('aria-label') || ''; + var match = label.match(/^([\\d,\\.]+)\\s*review/i); + if (match) { + return parseInt(match[1].replace(/[,\\.]/g, '')); + } + } + return null; + """) + if total_reviews: + print(f"📊 Total reviews on page: {total_reviews}") + except: + pass + # Click reviews tab - poll until found review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] start = time.time() @@ -299,47 +321,6 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in print("✅ Found scroll container") - # Extract total review count from page (look in specific places) - total_reviews = None - try: - total_reviews = driver.execute_script(""" - // Method 1: Sum up star rating counts (most accurate) - // Look for aria-labels like "5 stars, 171 reviews" - var total = 0; - var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]'); - if (starLabels.length >= 5) { - for (var i = 0; i < starLabels.length; i++) { - var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i); - if (match) total += parseInt(match[1]); - } - if (total > 0) return total; - } - - // Method 2: Look in reviews tab text (e.g., "Reviews (247)") - var tabs = document.querySelectorAll('button[role="tab"]'); - for (var i = 0; i < tabs.length; i++) { - var text = tabs[i].textContent || ''; - if (/review|reseña/i.test(text)) { - var match = text.match(/\\(([\\d,\\.]+)\\)/); - if (match) return parseInt(match[1].replace(/[,\\.]/g, '')); - } - } - - // Method 3: Look for "X reviews" near rating - var spans = document.querySelectorAll('span'); - for (var i = 0; i < spans.length; i++) { - var text = spans[i].textContent || ''; - var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i); - if (match) return parseInt(match[1].replace(/[,\\.]/g, '')); - } - - return null; - """) - if total_reviews: - print(f"📊 Total reviews on page: {total_reviews}") - except: - pass - # PHASE 2: Inject API interceptor for scroll-loaded reviews print("🔌 Injecting API interceptor...") driver.execute_script("""