Fix total review count detection - sum star ratings

The previous detection matched the wrong elements and picked up partial counts.
It now sums the counts from the "X stars, Y reviews" aria-labels to get an accurate total.

Detection methods, tried in order (each later one is a fallback for the previous):
1. Sum the per-star rating counts (most accurate)
2. Reviews tab text like "Reviews (247)"
3. A span whose entire text is "X reviews"

Tested: Soho Club 247/247 correctly detected

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-21 22:50:06 +00:00
parent 6934838a69
commit 0778b2e07d

View File

@@ -293,26 +293,44 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
print("✅ Found scroll container") print("✅ Found scroll container")
# Extract total review count from page # Extract total review count from page (look in specific places)
total_reviews = None total_reviews = None
try: try:
page_text = driver.page_source total_reviews = driver.execute_script("""
# Look for "XX reviews" pattern // Method 1: Sum up star rating counts (most accurate)
patterns = [ // Look for aria-labels like "5 stars, 171 reviews"
r'(\d{1,3}(?:,\d{3})*)\s+reviews?', var total = 0;
r'(\d+\.?\d*K)\s+reviews?', var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]');
r'(\d{1,3}(?:,\d{3})*)\s+reseñas?', if (starLabels.length >= 5) {
] for (var i = 0; i < starLabels.length; i++) {
for pattern in patterns: var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i);
matches = re.findall(pattern, page_text, re.IGNORECASE) if (match) total += parseInt(match[1]);
if matches: }
count_str = matches[0] if (total > 0) return total;
if 'K' in count_str.upper(): }
total_reviews = int(float(count_str.upper().replace('K', '')) * 1000)
else: // Method 2: Look in reviews tab text (e.g., "Reviews (247)")
total_reviews = int(count_str.replace(',', '')) var tabs = document.querySelectorAll('button[role="tab"]');
for (var i = 0; i < tabs.length; i++) {
var text = tabs[i].textContent || '';
if (/review|reseña/i.test(text)) {
var match = text.match(/\\(([\\d,\\.]+)\\)/);
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
}
}
// Method 3: Look for "X reviews" near rating
var spans = document.querySelectorAll('span');
for (var i = 0; i < spans.length; i++) {
var text = spans[i].textContent || '';
var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i);
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
}
return null;
""")
if total_reviews:
print(f"📊 Total reviews on page: {total_reviews}") print(f"📊 Total reviews on page: {total_reviews}")
break
except: except:
pass pass