Fix total review count detection - sum star ratings
Previous detection was matching the wrong elements (partial counts). It now sums the "X stars, Y reviews" aria-labels for an accurate total.

Detection methods, in fallback order:
1. Sum star rating counts (most accurate)
2. Reviews tab text like "Reviews (247)"
3. Span with "X reviews" text

Tested: Soho Club 247/247 correctly detected

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -293,26 +293,44 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
print("✅ Found scroll container")
|
print("✅ Found scroll container")
|
||||||
|
|
||||||
# Extract total review count from page
|
# Extract total review count from page (look in specific places)
|
||||||
total_reviews = None
|
total_reviews = None
|
||||||
try:
|
try:
|
||||||
page_text = driver.page_source
|
total_reviews = driver.execute_script("""
|
||||||
# Look for "XX reviews" pattern
|
// Method 1: Sum up star rating counts (most accurate)
|
||||||
patterns = [
|
// Look for aria-labels like "5 stars, 171 reviews"
|
||||||
r'(\d{1,3}(?:,\d{3})*)\s+reviews?',
|
var total = 0;
|
||||||
r'(\d+\.?\d*K)\s+reviews?',
|
var starLabels = document.querySelectorAll('[aria-label*="stars,"][aria-label*="review"]');
|
||||||
r'(\d{1,3}(?:,\d{3})*)\s+reseñas?',
|
if (starLabels.length >= 5) {
|
||||||
]
|
for (var i = 0; i < starLabels.length; i++) {
|
||||||
for pattern in patterns:
|
var match = starLabels[i].getAttribute('aria-label').match(/(\\d+)\\s*review/i);
|
||||||
matches = re.findall(pattern, page_text, re.IGNORECASE)
|
if (match) total += parseInt(match[1]);
|
||||||
if matches:
|
}
|
||||||
count_str = matches[0]
|
if (total > 0) return total;
|
||||||
if 'K' in count_str.upper():
|
}
|
||||||
total_reviews = int(float(count_str.upper().replace('K', '')) * 1000)
|
|
||||||
else:
|
// Method 2: Look in reviews tab text (e.g., "Reviews (247)")
|
||||||
total_reviews = int(count_str.replace(',', ''))
|
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||||
|
for (var i = 0; i < tabs.length; i++) {
|
||||||
|
var text = tabs[i].textContent || '';
|
||||||
|
if (/review|reseña/i.test(text)) {
|
||||||
|
var match = text.match(/\\(([\\d,\\.]+)\\)/);
|
||||||
|
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Method 3: Look for "X reviews" near rating
|
||||||
|
var spans = document.querySelectorAll('span');
|
||||||
|
for (var i = 0; i < spans.length; i++) {
|
||||||
|
var text = spans[i].textContent || '';
|
||||||
|
var match = text.match(/^([\\d,\\.]+)\\s*reviews?$/i);
|
||||||
|
if (match) return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
""")
|
||||||
|
if total_reviews:
|
||||||
print(f"📊 Total reviews on page: {total_reviews}")
|
print(f"📊 Total reviews on page: {total_reviews}")
|
||||||
break
|
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user