Add polling for total count detection on page load
- Poll for up to 5s waiting for span[role="img"][aria-label*="review"]
- Element may not be present immediately after consent handling
- Tested: Soho Club 247/247 reviews in 31.4s with correct total

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -236,25 +236,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
||||
# ROBUST: Use aria-label="X reviews" on span[role="img"]
|
||||
# Poll for up to 5s since page might still be loading after consent
|
||||
total_reviews = None
|
||||
try:
|
||||
total_reviews = driver.execute_script("""
|
||||
// ROBUST: Find span[role="img"][aria-label*="review"] - contains "(X)" text
|
||||
// aria-label format: "260 reviews" or "1,234 reviews"
|
||||
var reviewSpans = document.querySelectorAll('span[role="img"][aria-label*="review"]');
|
||||
for (var i = 0; i < reviewSpans.length; i++) {
|
||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
start = time.time()
|
||||
while time.time() - start < 5:
|
||||
try:
|
||||
total_reviews = driver.execute_script("""
|
||||
// ROBUST: Find span[role="img"] with aria-label starting with number + "review"
|
||||
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < reviewSpans.length; i++) {
|
||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
""")
|
||||
if total_reviews:
|
||||
print(f"📊 Total reviews on page: {total_reviews}")
|
||||
except:
|
||||
pass
|
||||
return null;
|
||||
""")
|
||||
if total_reviews:
|
||||
print(f"📊 Total reviews on page: {total_reviews}")
|
||||
break
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
|
||||
# Click reviews tab - poll until found
|
||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||
|
||||
Reference in New Issue
Block a user