Real-time parsing + image blocking for large datasets

Key improvements:
- Parse reviews immediately during scroll (not at end)
- Fixes the virtual-scroll issue where reviews were lost after ~1000 (Google Maps removes off-screen review cards from the DOM)
- Block images via CDP for faster loading
- Smart recovery: 4 methods (keys, wheel, scroll up/down, click card)
- Dynamic timeout based on scroll state and content growth
- Spinner + network activity detection resets idle timer
- Sort by newest first option

Results: 1930 reviews collected (previously 990) on a location with 2433 reviews

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-21 22:25:26 +00:00
parent 6a75159ebe
commit 6934838a69

View File

@@ -200,9 +200,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Storage - use review ID as key # Storage - use review ID as key
reviews = {} # review_id -> review reviews = {} # review_id -> review
# Force English language # Don't force language - let Google show all reviews in user's locale
if "hl=" not in url:
url = url + ("&" if "?" in url else "?") + "hl=en"
# Navigate to URL # Navigate to URL
print(f"🌐 Loading: {url[:80]}...") print(f"🌐 Loading: {url[:80]}...")
@@ -380,10 +378,47 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass pass
return api_revs return api_revs
# Store pane in window for scroll thread # Sort by newest first (helps with loading)
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) try:
sort_btn = driver.execute_script("""
var btns = document.querySelectorAll('button[data-value="sort"]');
if (btns.length) return btns[0];
// Try aria-label
var all = document.querySelectorAll('button[aria-label*="Sort"]');
if (all.length) return all[0];
return null;
""")
if sort_btn:
sort_btn.click()
time.sleep(0.3)
# Click "Newest" option
driver.execute_script("""
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
for (var i = 0; i < items.length; i++) {
var txt = items[i].textContent.toLowerCase();
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
items[i].click();
break;
}
}
""")
time.sleep(0.5)
print(" 📅 Sorted by newest")
except:
pass
# Background scroll thread (fast, continuous) # Block images to speed up scrolling (use CDP)
try:
driver.execute_cdp_cmd('Network.setBlockedURLs', {
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
})
driver.execute_cdp_cmd('Network.enable', {})
print(" 🚫 Blocking images for faster scrolling")
except Exception as e:
pass # CDP might not be available in all setups
# Simple scroll - scrollTop = scrollHeight (proven to work)
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
stop_scrolling = threading.Event() stop_scrolling = threading.Event()
def scroll_worker(): def scroll_worker():
@@ -395,11 +430,57 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
""") """)
except: except:
pass pass
time.sleep(0.1) # 10x per second time.sleep(0.1)
scroll_thread = threading.Thread(target=scroll_worker, daemon=True) scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
scroll_thread.start() scroll_thread.start()
# Recovery function - use real mouse actions when stuck
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
recovery_count = [0]
def unstick_scroll():
    """Try one recovery technique to get a stalled review pane moving again.

    Every call increments ``recovery_count[0]`` and selects a technique from
    ``recovery_count[0] % 4``, so consecutive calls rotate through all four:

    * 1 - click the pane and send two Page Down key presses
    * 2 - real mouse-wheel scroll over the pane
    * 3 - scroll up by 2000px, pause, then jump back to the bottom
    * 0 - scroll the last review card into view, click it, jump to bottom

    The attempt is best-effort: ordinary errors raised by the browser calls
    are swallowed so a failed recovery never aborts the caller. Only
    ``Exception`` is caught (not a bare ``except``) so KeyboardInterrupt and
    SystemExit still propagate and the scraper stays interruptible.
    """
    recovery_count[0] += 1
    method = recovery_count[0] % 4
    try:
        if method == 1:
            # Method 1: Click pane and send Page Down keys
            scroll_container.click()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        elif method == 2:
            # Method 2: Real mouse wheel scroll
            (
                ActionChains(driver)
                .move_to_element(scroll_container)
                .scroll_by_amount(0, 800)
                .perform()
            )
        elif method == 3:
            # Method 3: Scroll up significantly then back down (force reload)
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000);
            """)
            # Short pause between the upward jump and the return to bottom.
            time.sleep(0.3)
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = p.scrollHeight;
            """)
        else:
            # Method 4 (method == 0): Click last review card to focus, then scroll
            driver.execute_script("""
                var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
                if (cards.length > 0) {
                    cards[cards.length - 1].scrollIntoView({block: 'end'});
                    cards[cards.length - 1].click();
                }
            """)
            time.sleep(0.2)
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = p.scrollHeight;
            """)
    except Exception:
        # Deliberately ignored: the caller retries with the next technique.
        pass
# Main collection loop # Main collection loop
last_new_time = time.time() last_new_time = time.time()
last_count = len(reviews) last_count = len(reviews)
@@ -417,19 +498,17 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
key = f"api_{rev['author'][:20]}_{rev['rating']}" key = f"api_{rev['author'][:20]}_{rev['rating']}"
reviews[key] = rev reviews[key] = rev
# Collect review IDs via JavaScript (doesn't affect scroll position!) # Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!)
# Use specific selector to only get actual review cards, not buttons # We must parse NOW, not later
try: try:
review_ids = driver.execute_script(""" cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
var ids = []; for card in cards:
document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) { rid = card.get_attribute("data-review-id")
ids.push(el.getAttribute('data-review-id'));
});
return ids;
""")
for rid in (review_ids or []):
if rid and rid not in reviews: if rid and rid not in reviews:
reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True} # Parse immediately - element may be gone later!
review = parse_dom_review(card)
if review:
reviews[rid] = review
except: except:
pass pass
@@ -440,6 +519,30 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
last_new_time = time.time() last_new_time = time.time()
last_count = current_count last_count = current_count
# Check if loading (spinner visible OR network activity)
try:
loading_status = driver.execute_script("""
var status = {spinner: false, network: false};
// Check for Google's loading indicators
var spinner = document.querySelector('div[role="progressbar"]');
if (spinner && spinner.offsetParent !== null) status.spinner = true;
var loading = document.querySelector('.qjESne, .loading');
if (loading && loading.offsetParent !== null) status.spinner = true;
// Check for recent network activity (API interceptor)
var responses = window.__interceptedResponses || [];
var lastCount = window.__lastResponseCount || 0;
if (responses.length > lastCount) {
status.network = true;
window.__lastResponseCount = responses.length;
}
return status;
""")
is_loading = loading_status.get('spinner') or loading_status.get('network')
if is_loading:
last_new_time = time.time() # Reset timer while loading
except:
is_loading = False
# Progress update # Progress update
elapsed = time.time() - last_new_time elapsed = time.time() - last_new_time
if total_reviews: if total_reviews:
@@ -448,37 +551,58 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
else: else:
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
# Trigger at 3s, 6s, 9s... (every 3 seconds while stuck)
if elapsed >= 3 and int(elapsed) % 3 == 0:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
unstick_scroll()
# Stop conditions # Stop conditions
if current_count >= max_reviews: if current_count >= max_reviews:
print(f"✅ Reached max: {current_count}") print(f"✅ Reached max: {current_count}")
stop_scrolling.set() stop_scrolling.set()
break break
if total_reviews and current_count >= total_reviews: # Check scroll state - track if content is still being added
print(f"✅ Got all {total_reviews} reviews!") try:
scroll_state = driver.execute_script("""
var p = window.scrollablePane;
if (!p) return {atBottom: true, height: 0};
var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50);
var height = p.scrollHeight;
var lastHeight = window.__lastScrollHeight || 0;
var growing = height > lastHeight;
window.__lastScrollHeight = height;
return {atBottom: atBottom, height: height, growing: growing};
""")
at_bottom = scroll_state.get('atBottom', True)
content_growing = scroll_state.get('growing', False)
except:
at_bottom = True
content_growing = False
# Reset timer if content is growing (new reviews loading)
if content_growing:
last_new_time = time.time()
# Dynamic timeout based on state and recovery attempts
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
# - 15s max otherwise (keep trying)
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
truly_done = at_bottom and not content_growing and recovery_failed
timeout_hit = elapsed >= 15
if truly_done or timeout_hit:
print(f"✅ All reviews loaded: {current_count}")
stop_scrolling.set() stop_scrolling.set()
break break
if time.time() - last_new_time >= timeout_no_new: # Reviews already parsed during scrolling (real-time parsing)
print(f"⏱️ Timeout: no new reviews for {timeout_no_new}s") print("📝 Finalizing review data...")
stop_scrolling.set()
break
# FINAL PHASE: Parse full review data from DOM (scroll is stopped) # Separate API and DOM reviews
print("📝 Parsing full review data...")
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"} api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
reviews.clear() dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"}
# Parse all DOM cards now that scrolling is done
# Use specific selector to only get actual review cards (div.jftiEf), not buttons
try:
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
for card in cards:
review = parse_dom_review(card)
if review and review.get("id"):
reviews[review["id"]] = review
except Exception as e:
print(f" Warning: DOM parse error: {e}")
# Merge API reviews (only add if not already in DOM) # Merge API reviews (only add if not already in DOM)
api_added = 0 api_added = 0