From 0778b2e07dd1fe6e7097538001b74ad85e4032af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?=
 <35082514+alezmad@users.noreply.github.com>
Date: Wed, 21 Jan 2026 22:50:06 +0000
Subject: [PATCH] Fix total review count detection - sum star ratings

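The previous detection regex-searched the whole page source and took the
first "N reviews" match, which could be a partial count rather than the total.
It now sums the per-star "X stars, Y reviews" aria-labels for an accurate total.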

Detection methods, tried in order (each later one is a fallback):
1. Sum star rating counts (most accurate; needs at least 5 star labels)
2. Reviews tab text like "Reviews (247)"
3. Span whose entire text is "X reviews"

Tested: Soho Club 247/247 correctly detected

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 modules/scraper_clean.py | 54 ++++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py
index 1f3b15d..7881b00 100644
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -293,26 +293,44 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
 
     print("✅ Found scroll container")
 
-    # Extract total review count from page
+    # Extract total review count from page (look in specific places)
     total_reviews = None
     try:
-        page_text = driver.page_source
-        # Look for "XX reviews" pattern
-        patterns = [
-            r'(\d{1,3}(?:,\d{3})*)\s+reviews?',
-            r'(\d+\.?\d*K)\s+reviews?',
-            r'(\d{1,3}(?:,\d{3})*)\s+reseñas?',
-        ]
-        for pattern in patterns:
-            matches = re.findall(pattern, page_text, re.IGNORECASE)
-            if matches:
-                count_str = matches[0]
-                if 'K' in count_str.upper():
-                    total_reviews = int(float(count_str.upper().replace('K', '')) * 1000)
-                else:
-                    total_reviews = int(count_str.replace(',', ''))
-                print(f"📊 Total reviews on page: {total_reviews}")
-                break
+        total_reviews = driver.execute_script("""
+            // Method 1: Sum up star rating counts (most accurate)
+            // Look for aria-labels like "5 stars, 171 reviews"
+            var total = 0;
+            var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]');
+            if (starLabels.length >= 5) {
+                for (var i = 0; i < starLabels.length; i++) {
+                    var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i);
+                    if (match) total += parseInt(match[1]);
+                }
+                if (total > 0) return total;
+            }
+
+            // Method 2: Look in reviews tab text (e.g., "Reviews (247)")
+            var tabs = document.querySelectorAll('button[role="tab"]');
+            for (var i = 0; i < tabs.length; i++) {
+                var text = tabs[i].textContent || '';
+                if (/review|reseña/i.test(text)) {
+                    var match = text.match(/\\(([\\d,\\.]+)\\)/);
+                    if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
+                }
+            }
+
+            // Method 3: Look for "X reviews" near rating
+            var spans = document.querySelectorAll('span');
+            for (var i = 0; i < spans.length; i++) {
+                var text = spans[i].textContent || '';
+                var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i);
+                if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
+            }
+
+            return null;
+        """)
+        if total_reviews:
+            print(f"📊 Total reviews on page: {total_reviews}")
     except:
         pass