Real-time parsing + image blocking for large datasets
Key improvements:
- Parse reviews immediately during scroll (not at end)
- Fixes virtual scroll issue - was losing reviews after ~1000
- Block images via CDP for faster loading
- Smart recovery: 4 methods (keys, wheel, scroll up/down, click card)
- Dynamic timeout based on scroll state and content growth
- Spinner + network activity detection resets idle timer
- Sort by newest first option

Results: 1930 reviews (was 990) on 2433-review location

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -200,9 +200,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
# Storage - use review ID as key
|
# Storage - use review ID as key
|
||||||
reviews = {} # review_id -> review
|
reviews = {} # review_id -> review
|
||||||
|
|
||||||
# Force English language
|
# Don't force language - let Google show all reviews in user's locale
|
||||||
if "hl=" not in url:
|
|
||||||
url = url + ("&" if "?" in url else "?") + "hl=en"
|
|
||||||
|
|
||||||
# Navigate to URL
|
# Navigate to URL
|
||||||
print(f"🌐 Loading: {url[:80]}...")
|
print(f"🌐 Loading: {url[:80]}...")
|
||||||
@@ -380,10 +378,47 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
pass
|
pass
|
||||||
return api_revs
|
return api_revs
|
||||||
|
|
||||||
# Store pane in window for scroll thread
|
# Sort by newest first (helps with loading)
|
||||||
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
try:
|
||||||
|
sort_btn = driver.execute_script("""
|
||||||
|
var btns = document.querySelectorAll('button[data-value="sort"]');
|
||||||
|
if (btns.length) return btns[0];
|
||||||
|
// Try aria-label
|
||||||
|
var all = document.querySelectorAll('button[aria-label*="Sort"]');
|
||||||
|
if (all.length) return all[0];
|
||||||
|
return null;
|
||||||
|
""")
|
||||||
|
if sort_btn:
|
||||||
|
sort_btn.click()
|
||||||
|
time.sleep(0.3)
|
||||||
|
# Click "Newest" option
|
||||||
|
driver.execute_script("""
|
||||||
|
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
|
||||||
|
for (var i = 0; i < items.length; i++) {
|
||||||
|
var txt = items[i].textContent.toLowerCase();
|
||||||
|
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
|
||||||
|
items[i].click();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""")
|
||||||
|
time.sleep(0.5)
|
||||||
|
print(" 📅 Sorted by newest")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Background scroll thread (fast, continuous)
|
# Block images to speed up scrolling (use CDP)
|
||||||
|
try:
|
||||||
|
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||||
|
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
|
||||||
|
})
|
||||||
|
driver.execute_cdp_cmd('Network.enable', {})
|
||||||
|
print(" 🚫 Blocking images for faster scrolling")
|
||||||
|
except Exception as e:
|
||||||
|
pass # CDP might not be available in all setups
|
||||||
|
|
||||||
|
# Simple scroll - scrollTop = scrollHeight (proven to work)
|
||||||
|
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
||||||
stop_scrolling = threading.Event()
|
stop_scrolling = threading.Event()
|
||||||
|
|
||||||
def scroll_worker():
|
def scroll_worker():
|
||||||
@@ -395,11 +430,57 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
""")
|
""")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
time.sleep(0.1) # 10x per second
|
time.sleep(0.1)
|
||||||
|
|
||||||
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
||||||
scroll_thread.start()
|
scroll_thread.start()
|
||||||
|
|
||||||
|
# Recovery function - use real mouse actions when stuck
|
||||||
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
recovery_count = [0]
|
||||||
|
|
||||||
|
def unstick_scroll():
|
||||||
|
recovery_count[0] += 1
|
||||||
|
method = recovery_count[0] % 4
|
||||||
|
try:
|
||||||
|
if method == 1:
|
||||||
|
# Method 1: Click pane and send Page Down keys
|
||||||
|
scroll_container.click()
|
||||||
|
ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
|
||||||
|
ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
|
||||||
|
elif method == 2:
|
||||||
|
# Method 2: Real mouse wheel scroll
|
||||||
|
ActionChains(driver).move_to_element(scroll_container)\
|
||||||
|
.scroll_by_amount(0, 800).perform()
|
||||||
|
elif method == 3:
|
||||||
|
# Method 3: Scroll up significantly then back down (force reload)
|
||||||
|
driver.execute_script("""
|
||||||
|
var p = window.scrollablePane;
|
||||||
|
if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000);
|
||||||
|
""")
|
||||||
|
time.sleep(0.3)
|
||||||
|
driver.execute_script("""
|
||||||
|
var p = window.scrollablePane;
|
||||||
|
if (p) p.scrollTop = p.scrollHeight;
|
||||||
|
""")
|
||||||
|
else:
|
||||||
|
# Method 4: Click last review card to focus, then scroll
|
||||||
|
driver.execute_script("""
|
||||||
|
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
|
||||||
|
if (cards.length > 0) {
|
||||||
|
cards[cards.length - 1].scrollIntoView({block: 'end'});
|
||||||
|
cards[cards.length - 1].click();
|
||||||
|
}
|
||||||
|
""")
|
||||||
|
time.sleep(0.2)
|
||||||
|
driver.execute_script("""
|
||||||
|
var p = window.scrollablePane;
|
||||||
|
if (p) p.scrollTop = p.scrollHeight;
|
||||||
|
""")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Main collection loop
|
# Main collection loop
|
||||||
last_new_time = time.time()
|
last_new_time = time.time()
|
||||||
last_count = len(reviews)
|
last_count = len(reviews)
|
||||||
@@ -417,19 +498,17 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
||||||
reviews[key] = rev
|
reviews[key] = rev
|
||||||
|
|
||||||
# Collect review IDs via JavaScript (doesn't affect scroll position!)
|
# Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!)
|
||||||
# Use specific selector to only get actual review cards, not buttons
|
# We must parse NOW, not later
|
||||||
try:
|
try:
|
||||||
review_ids = driver.execute_script("""
|
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
||||||
var ids = [];
|
for card in cards:
|
||||||
document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) {
|
rid = card.get_attribute("data-review-id")
|
||||||
ids.push(el.getAttribute('data-review-id'));
|
|
||||||
});
|
|
||||||
return ids;
|
|
||||||
""")
|
|
||||||
for rid in (review_ids or []):
|
|
||||||
if rid and rid not in reviews:
|
if rid and rid not in reviews:
|
||||||
reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True}
|
# Parse immediately - element may be gone later!
|
||||||
|
review = parse_dom_review(card)
|
||||||
|
if review:
|
||||||
|
reviews[rid] = review
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -440,6 +519,30 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
last_new_time = time.time()
|
last_new_time = time.time()
|
||||||
last_count = current_count
|
last_count = current_count
|
||||||
|
|
||||||
|
# Check if loading (spinner visible OR network activity)
|
||||||
|
try:
|
||||||
|
loading_status = driver.execute_script("""
|
||||||
|
var status = {spinner: false, network: false};
|
||||||
|
// Check for Google's loading indicators
|
||||||
|
var spinner = document.querySelector('div[role="progressbar"]');
|
||||||
|
if (spinner && spinner.offsetParent !== null) status.spinner = true;
|
||||||
|
var loading = document.querySelector('.qjESne, .loading');
|
||||||
|
if (loading && loading.offsetParent !== null) status.spinner = true;
|
||||||
|
// Check for recent network activity (API interceptor)
|
||||||
|
var responses = window.__interceptedResponses || [];
|
||||||
|
var lastCount = window.__lastResponseCount || 0;
|
||||||
|
if (responses.length > lastCount) {
|
||||||
|
status.network = true;
|
||||||
|
window.__lastResponseCount = responses.length;
|
||||||
|
}
|
||||||
|
return status;
|
||||||
|
""")
|
||||||
|
is_loading = loading_status.get('spinner') or loading_status.get('network')
|
||||||
|
if is_loading:
|
||||||
|
last_new_time = time.time() # Reset timer while loading
|
||||||
|
except:
|
||||||
|
is_loading = False
|
||||||
|
|
||||||
# Progress update
|
# Progress update
|
||||||
elapsed = time.time() - last_new_time
|
elapsed = time.time() - last_new_time
|
||||||
if total_reviews:
|
if total_reviews:
|
||||||
@@ -448,37 +551,58 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
else:
|
else:
|
||||||
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
||||||
|
|
||||||
|
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
||||||
|
# Trigger at 3s, 6s, 9s... (every 3 seconds while stuck)
|
||||||
|
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||||
|
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||||
|
unstick_scroll()
|
||||||
|
|
||||||
# Stop conditions
|
# Stop conditions
|
||||||
if current_count >= max_reviews:
|
if current_count >= max_reviews:
|
||||||
print(f"✅ Reached max: {current_count}")
|
print(f"✅ Reached max: {current_count}")
|
||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
if total_reviews and current_count >= total_reviews:
|
# Check scroll state - track if content is still being added
|
||||||
print(f"✅ Got all {total_reviews} reviews!")
|
try:
|
||||||
|
scroll_state = driver.execute_script("""
|
||||||
|
var p = window.scrollablePane;
|
||||||
|
if (!p) return {atBottom: true, height: 0};
|
||||||
|
var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50);
|
||||||
|
var height = p.scrollHeight;
|
||||||
|
var lastHeight = window.__lastScrollHeight || 0;
|
||||||
|
var growing = height > lastHeight;
|
||||||
|
window.__lastScrollHeight = height;
|
||||||
|
return {atBottom: atBottom, height: height, growing: growing};
|
||||||
|
""")
|
||||||
|
at_bottom = scroll_state.get('atBottom', True)
|
||||||
|
content_growing = scroll_state.get('growing', False)
|
||||||
|
except:
|
||||||
|
at_bottom = True
|
||||||
|
content_growing = False
|
||||||
|
|
||||||
|
# Reset timer if content is growing (new reviews loading)
|
||||||
|
if content_growing:
|
||||||
|
last_new_time = time.time()
|
||||||
|
|
||||||
|
# Dynamic timeout based on state and recovery attempts
|
||||||
|
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
|
||||||
|
# - 15s max otherwise (keep trying)
|
||||||
|
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
|
||||||
|
truly_done = at_bottom and not content_growing and recovery_failed
|
||||||
|
timeout_hit = elapsed >= 15
|
||||||
|
|
||||||
|
if truly_done or timeout_hit:
|
||||||
|
print(f"✅ All reviews loaded: {current_count}")
|
||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
if time.time() - last_new_time >= timeout_no_new:
|
# Reviews already parsed during scrolling (real-time parsing)
|
||||||
print(f"⏱️ Timeout: no new reviews for {timeout_no_new}s")
|
print("📝 Finalizing review data...")
|
||||||
stop_scrolling.set()
|
|
||||||
break
|
|
||||||
|
|
||||||
# FINAL PHASE: Parse full review data from DOM (scroll is stopped)
|
# Separate API and DOM reviews
|
||||||
print("📝 Parsing full review data...")
|
|
||||||
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
|
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
|
||||||
reviews.clear()
|
dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"}
|
||||||
|
|
||||||
# Parse all DOM cards now that scrolling is done
|
|
||||||
# Use specific selector to only get actual review cards (div.jftiEf), not buttons
|
|
||||||
try:
|
|
||||||
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
|
||||||
for card in cards:
|
|
||||||
review = parse_dom_review(card)
|
|
||||||
if review and review.get("id"):
|
|
||||||
reviews[review["id"]] = review
|
|
||||||
except Exception as e:
|
|
||||||
print(f" Warning: DOM parse error: {e}")
|
|
||||||
|
|
||||||
# Merge API reviews (only add if not already in DOM)
|
# Merge API reviews (only add if not already in DOM)
|
||||||
api_added = 0
|
api_added = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user