Stop immediately when all reviews are collected
- Check total_reviews before recovery attempts
- Exit the loop as soon as current_count >= total_reviews
- Reduces scrape time significantly (13s vs 56s for 247 reviews)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -785,18 +785,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
else:
|
||||
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
||||
|
||||
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
||||
# Trigger at 3s, 6s, 9s... (every 3 seconds while stuck)
|
||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||
unstick_scroll()
|
||||
|
||||
# Stop conditions
|
||||
# Stop conditions - check BEFORE recovery attempts
|
||||
if current_count >= max_reviews:
|
||||
print(f"✅ Reached max: {current_count}")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
# Also stop if we have all reviews from the page
|
||||
if total_reviews and current_count >= total_reviews:
|
||||
print(f"✅ All {current_count} reviews collected")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
||||
# Only if we haven't collected all reviews yet
|
||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||
unstick_scroll()
|
||||
|
||||
# Check scroll state - track if content is still being added
|
||||
try:
|
||||
scroll_state = driver.execute_script("""
|
||||
|
||||
Reference in New Issue
Block a user