Real-time parsing + image blocking for large datasets
Key improvements:
- Parse reviews immediately during scroll (not at end)
- Fixes virtual scroll issue - was losing reviews after ~1000
- Block images via CDP for faster loading
- Smart recovery: 4 methods (keys, wheel, scroll up/down, click card)
- Dynamic timeout based on scroll state and content growth
- Spinner + network activity detection resets idle timer
- Sort by newest first option

Results: 1930 reviews (was 990) on 2433-review location

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -200,9 +200,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# Storage - use review ID as key
|
||||
reviews = {} # review_id -> review
|
||||
|
||||
# Force English language
|
||||
if "hl=" not in url:
|
||||
url = url + ("&" if "?" in url else "?") + "hl=en"
|
||||
# Don't force language - let Google show all reviews in user's locale
|
||||
|
||||
# Navigate to URL
|
||||
print(f"🌐 Loading: {url[:80]}...")
|
||||
@@ -380,10 +378,47 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
pass
|
||||
return api_revs
|
||||
|
||||
# Store pane in window for scroll thread
|
||||
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
||||
# Sort by newest first (helps with loading)
|
||||
try:
|
||||
sort_btn = driver.execute_script("""
|
||||
var btns = document.querySelectorAll('button[data-value="sort"]');
|
||||
if (btns.length) return btns[0];
|
||||
// Try aria-label
|
||||
var all = document.querySelectorAll('button[aria-label*="Sort"]');
|
||||
if (all.length) return all[0];
|
||||
return null;
|
||||
""")
|
||||
if sort_btn:
|
||||
sort_btn.click()
|
||||
time.sleep(0.3)
|
||||
# Click "Newest" option
|
||||
driver.execute_script("""
|
||||
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
|
||||
for (var i = 0; i < items.length; i++) {
|
||||
var txt = items[i].textContent.toLowerCase();
|
||||
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
|
||||
items[i].click();
|
||||
break;
|
||||
}
|
||||
}
|
||||
""")
|
||||
time.sleep(0.5)
|
||||
print(" 📅 Sorted by newest")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Background scroll thread (fast, continuous)
|
||||
# Block images to speed up scrolling (use CDP)
|
||||
try:
|
||||
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
|
||||
})
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
print(" 🚫 Blocking images for faster scrolling")
|
||||
except Exception as e:
|
||||
pass # CDP might not be available in all setups
|
||||
|
||||
# Simple scroll - scrollTop = scrollHeight (proven to work)
|
||||
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
||||
stop_scrolling = threading.Event()
|
||||
|
||||
def scroll_worker():
|
||||
@@ -395,11 +430,57 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
""")
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1) # 10x per second
|
||||
time.sleep(0.1)
|
||||
|
||||
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
||||
scroll_thread.start()
|
||||
|
||||
# Recovery function - use real mouse actions when stuck
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
recovery_count = [0]
|
||||
|
||||
def unstick_scroll():
    """Try one recovery technique to get a stalled review pane moving again.

    Every call increments ``recovery_count[0]`` and then picks one of four
    techniques from ``recovery_count[0] % 4``, so successive calls rotate
    through them in the order 1, 2, 3, 4 (remainder 0), 1, ...

    1. Click the scroll container and send two Page Down key presses.
    2. Move the mouse over the scroll container and wheel-scroll by 800px.
    3. Scroll the pane up by 2000px, pause, then jump back to the bottom.
    4. Scroll the last review card into view and click it, pause, then
       jump to the bottom.

    Recovery is best effort: a failing technique is reported on stdout and
    otherwise ignored so that the caller's collection loop keeps running.
    Relies on ``driver``, ``scroll_container`` and ``recovery_count`` from
    the enclosing scope, and on ``window.scrollablePane`` having been set
    in the page.
    """
    recovery_count[0] += 1
    method = recovery_count[0] % 4

    # Shared by methods 3 and 4: snap the stored pane to its very end.
    jump_to_bottom_js = """
        var p = window.scrollablePane;
        if (p) p.scrollTop = p.scrollHeight;
    """

    try:
        if method == 1:
            # Method 1: Click pane and send Page Down keys
            scroll_container.click()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        elif method == 2:
            # Method 2: Real mouse wheel scroll
            wheel = ActionChains(driver).move_to_element(scroll_container)
            wheel.scroll_by_amount(0, 800).perform()
        elif method == 3:
            # Method 3: Scroll up significantly then back down (force reload)
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000);
            """)
            time.sleep(0.3)
            driver.execute_script(jump_to_bottom_js)
        else:
            # Method 4: Click last review card to focus, then scroll
            driver.execute_script("""
                var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
                if (cards.length > 0) {
                    cards[cards.length - 1].scrollIntoView({block: 'end'});
                    cards[cards.length - 1].click();
                }
            """)
            time.sleep(0.2)
            driver.execute_script(jump_to_bottom_js)
    except Exception as e:
        # Deliberately non-fatal, but no longer a bare ``except``: that form
        # also swallowed KeyboardInterrupt/SystemExit, which made the scrape
        # impossible to abort while a recovery attempt was in flight.
        print(f"  ⚠️ Recovery method failed: {type(e).__name__}", flush=True)
|
||||
|
||||
# Main collection loop
|
||||
last_new_time = time.time()
|
||||
last_count = len(reviews)
|
||||
@@ -417,19 +498,17 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
||||
reviews[key] = rev
|
||||
|
||||
# Collect review IDs via JavaScript (doesn't affect scroll position!)
|
||||
# Use specific selector to only get actual review cards, not buttons
|
||||
# Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!)
|
||||
# We must parse NOW, not later
|
||||
try:
|
||||
review_ids = driver.execute_script("""
|
||||
var ids = [];
|
||||
document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) {
|
||||
ids.push(el.getAttribute('data-review-id'));
|
||||
});
|
||||
return ids;
|
||||
""")
|
||||
for rid in (review_ids or []):
|
||||
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
||||
for card in cards:
|
||||
rid = card.get_attribute("data-review-id")
|
||||
if rid and rid not in reviews:
|
||||
reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True}
|
||||
# Parse immediately - element may be gone later!
|
||||
review = parse_dom_review(card)
|
||||
if review:
|
||||
reviews[rid] = review
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -440,6 +519,30 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
last_new_time = time.time()
|
||||
last_count = current_count
|
||||
|
||||
# Check if loading (spinner visible OR network activity)
|
||||
try:
|
||||
loading_status = driver.execute_script("""
|
||||
var status = {spinner: false, network: false};
|
||||
// Check for Google's loading indicators
|
||||
var spinner = document.querySelector('div[role="progressbar"]');
|
||||
if (spinner && spinner.offsetParent !== null) status.spinner = true;
|
||||
var loading = document.querySelector('.qjESne, .loading');
|
||||
if (loading && loading.offsetParent !== null) status.spinner = true;
|
||||
// Check for recent network activity (API interceptor)
|
||||
var responses = window.__interceptedResponses || [];
|
||||
var lastCount = window.__lastResponseCount || 0;
|
||||
if (responses.length > lastCount) {
|
||||
status.network = true;
|
||||
window.__lastResponseCount = responses.length;
|
||||
}
|
||||
return status;
|
||||
""")
|
||||
is_loading = loading_status.get('spinner') or loading_status.get('network')
|
||||
if is_loading:
|
||||
last_new_time = time.time() # Reset timer while loading
|
||||
except:
|
||||
is_loading = False
|
||||
|
||||
# Progress update
|
||||
elapsed = time.time() - last_new_time
|
||||
if total_reviews:
|
||||
@@ -448,37 +551,58 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
else:
|
||||
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
||||
|
||||
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
||||
# Trigger at 3s, 6s, 9s... (every 3 seconds while stuck)
|
||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||
unstick_scroll()
|
||||
|
||||
# Stop conditions
|
||||
if current_count >= max_reviews:
|
||||
print(f"✅ Reached max: {current_count}")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
if total_reviews and current_count >= total_reviews:
|
||||
print(f"✅ Got all {total_reviews} reviews!")
|
||||
# Check scroll state - track if content is still being added
|
||||
try:
|
||||
scroll_state = driver.execute_script("""
|
||||
var p = window.scrollablePane;
|
||||
if (!p) return {atBottom: true, height: 0};
|
||||
var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50);
|
||||
var height = p.scrollHeight;
|
||||
var lastHeight = window.__lastScrollHeight || 0;
|
||||
var growing = height > lastHeight;
|
||||
window.__lastScrollHeight = height;
|
||||
return {atBottom: atBottom, height: height, growing: growing};
|
||||
""")
|
||||
at_bottom = scroll_state.get('atBottom', True)
|
||||
content_growing = scroll_state.get('growing', False)
|
||||
except:
|
||||
at_bottom = True
|
||||
content_growing = False
|
||||
|
||||
# Reset timer if content is growing (new reviews loading)
|
||||
if content_growing:
|
||||
last_new_time = time.time()
|
||||
|
||||
# Dynamic timeout based on state and recovery attempts
|
||||
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
|
||||
# - 15s max otherwise (keep trying)
|
||||
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
|
||||
truly_done = at_bottom and not content_growing and recovery_failed
|
||||
timeout_hit = elapsed >= 15
|
||||
|
||||
if truly_done or timeout_hit:
|
||||
print(f"✅ All reviews loaded: {current_count}")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
if time.time() - last_new_time >= timeout_no_new:
|
||||
print(f"⏱️ Timeout: no new reviews for {timeout_no_new}s")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
# Reviews already parsed during scrolling (real-time parsing)
|
||||
print("📝 Finalizing review data...")
|
||||
|
||||
# FINAL PHASE: Parse full review data from DOM (scroll is stopped)
|
||||
print("📝 Parsing full review data...")
|
||||
# Separate API and DOM reviews
|
||||
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
|
||||
reviews.clear()
|
||||
|
||||
# Parse all DOM cards now that scrolling is done
|
||||
# Use specific selector to only get actual review cards (div.jftiEf), not buttons
|
||||
try:
|
||||
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
||||
for card in cards:
|
||||
review = parse_dom_review(card)
|
||||
if review and review.get("id"):
|
||||
reviews[review["id"]] = review
|
||||
except Exception as e:
|
||||
print(f" Warning: DOM parse error: {e}")
|
||||
dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"}
|
||||
|
||||
# Merge API reviews (only add if not already in DOM)
|
||||
api_added = 0
|
||||
|
||||
Reference in New Issue
Block a user