Stop immediately when all reviews are collected

- Check total_reviews before recovery attempts
- Exit loop as soon as current_count >= total_reviews
- Reduces scrape time significantly (from 56s down to 13s for 247 reviews)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 13:19:45 +00:00
parent f1f1aa0785
commit 5db277ad2f

View File

@@ -785,18 +785,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
else: else:
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
# STUCK DETECTION: If no new reviews for 3s+, try to unstick # Stop conditions - check BEFORE recovery attempts
# Trigger at 3s, 6s, 9s... (every 3 seconds while stuck)
if elapsed >= 3 and int(elapsed) % 3 == 0:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
unstick_scroll()
# Stop conditions
if current_count >= max_reviews: if current_count >= max_reviews:
print(f"✅ Reached max: {current_count}") print(f"✅ Reached max: {current_count}")
stop_scrolling.set() stop_scrolling.set()
break break
# Also stop if we have all reviews from the page
if total_reviews and current_count >= total_reviews:
print(f"✅ All {current_count} reviews collected")
stop_scrolling.set()
break
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
# Only if we haven't collected all reviews yet
if elapsed >= 3 and int(elapsed) % 3 == 0:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
unstick_scroll()
# Check scroll state - track if content is still being added # Check scroll state - track if content is still being added
try: try:
scroll_state = driver.execute_script(""" scroll_state = driver.execute_script("""