Fix total review count detection - sum star ratings
The previous detection was matching the wrong elements (partial counts). It now sums the "X stars, Y reviews" aria-labels to get an accurate total. Detection methods, in fallback order: (1) sum the star rating counts (most accurate); (2) Reviews tab text such as "Reviews (247)"; (3) a span with "X reviews" text. Tested: Soho Club, 247/247 correctly detected. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -293,26 +293,44 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
print("✅ Found scroll container")
|
||||
|
||||
# Extract total review count from page
|
||||
# Extract total review count from page (look in specific places)
|
||||
total_reviews = None
|
||||
try:
|
||||
page_text = driver.page_source
|
||||
# Look for "XX reviews" pattern
|
||||
patterns = [
|
||||
r'(\d{1,3}(?:,\d{3})*)\s+reviews?',
|
||||
r'(\d+\.?\d*K)\s+reviews?',
|
||||
r'(\d{1,3}(?:,\d{3})*)\s+reseñas?',
|
||||
]
|
||||
for pattern in patterns:
|
||||
matches = re.findall(pattern, page_text, re.IGNORECASE)
|
||||
if matches:
|
||||
count_str = matches[0]
|
||||
if 'K' in count_str.upper():
|
||||
total_reviews = int(float(count_str.upper().replace('K', '')) * 1000)
|
||||
else:
|
||||
total_reviews = int(count_str.replace(',', ''))
|
||||
print(f"📊 Total reviews on page: {total_reviews}")
|
||||
break
|
||||
total_reviews = driver.execute_script("""
|
||||
// Method 1: Sum up star rating counts (most accurate)
|
||||
// Look for aria-labels like "5 stars, 171 reviews"
|
||||
var total = 0;
|
||||
var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]');
|
||||
if (starLabels.length >= 5) {
|
||||
for (var i = 0; i < starLabels.length; i++) {
|
||||
var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i);
|
||||
if (match) total += parseInt(match[1]);
|
||||
}
|
||||
if (total > 0) return total;
|
||||
}
|
||||
|
||||
// Method 2: Look in reviews tab text (e.g., "Reviews (247)")
|
||||
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (var i = 0; i < tabs.length; i++) {
|
||||
var text = tabs[i].textContent || '';
|
||||
if (/review|reseña/i.test(text)) {
|
||||
var match = text.match(/\\(([\\d,\\.]+)\\)/);
|
||||
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
}
|
||||
}
|
||||
|
||||
// Method 3: Look for "X reviews" near rating
|
||||
var spans = document.querySelectorAll('span');
|
||||
for (var i = 0; i < spans.length; i++) {
|
||||
var text = spans[i].textContent || '';
|
||||
var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i);
|
||||
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
}
|
||||
|
||||
return null;
|
||||
""")
|
||||
if total_reviews:
|
||||
print(f"📊 Total reviews on page: {total_reviews}")
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user