From 0778b2e07dd1fe6e7097538001b74ad85e4032af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Wed, 21 Jan 2026 22:50:06 +0000 Subject: [PATCH] Fix total review count detection - sum star ratings Previous detection was matching wrong elements (partial counts). Now sums "X stars, Y reviews" aria-labels for accurate total. Fallback methods: 1. Sum star rating counts (most accurate) 2. Reviews tab text like "Reviews (247)" 3. Span with "X reviews" text Tested: Soho Club 247/247 correctly detected Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 54 ++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 1f3b15d..7881b00 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -293,26 +293,44 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in print("✅ Found scroll container") - # Extract total review count from page + # Extract total review count from page (look in specific places) total_reviews = None try: - page_text = driver.page_source - # Look for "XX reviews" pattern - patterns = [ - r'(\d{1,3}(?:,\d{3})*)\s+reviews?', - r'(\d+\.?\d*K)\s+reviews?', - r'(\d{1,3}(?:,\d{3})*)\s+reseñas?', - ] - for pattern in patterns: - matches = re.findall(pattern, page_text, re.IGNORECASE) - if matches: - count_str = matches[0] - if 'K' in count_str.upper(): - total_reviews = int(float(count_str.upper().replace('K', '')) * 1000) - else: - total_reviews = int(count_str.replace(',', '')) - print(f"📊 Total reviews on page: {total_reviews}") - break + total_reviews = driver.execute_script(""" + // Method 1: Sum up star rating counts (most accurate) + // Look for aria-labels like "5 stars, 171 reviews" + var total = 0; + var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]'); + if (starLabels.length >= 5) { + for (var i = 0; i < starLabels.length; i++) { + var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i); + if (match) total += parseInt(match[1]); + } + if (total > 0) return total; + } + + // Method 2: Look in reviews tab text (e.g., "Reviews (247)") + var tabs = document.querySelectorAll('button[role="tab"]'); + for (var i = 0; i < tabs.length; i++) { + var text = tabs[i].textContent || ''; + if (/review|reseña/i.test(text)) { + var match = text.match(/\\(([\\d,\\.]+)\\)/); + if (match) return parseInt(match[1].replace(/[,\\.]/g, '')); + } + } + + // Method 3: Look for "X reviews" near rating + var spans = document.querySelectorAll('span'); + for (var i = 0; i < spans.length; i++) { + var text = spans[i].textContent || ''; + var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i); + if (match) return parseInt(match[1].replace(/[,\\.]/g, '')); + } + + return null; + """) + if total_reviews: + print(f"📊 Total reviews on page: {total_reviews}") except: pass