Add hard refresh recovery for stuck scraper

When the scraper gets stuck (8+ failed soft recovery attempts), it now
performs a hard page refresh and re-runs the full page setup:
- Reloads the page
- Re-clicks reviews tab
- Re-sorts by newest
- Re-injects API interceptor
- Continues collecting with existing seen_ids for deduplication

Key changes:
- Extract page setup into reusable setup_reviews_page() function
- Add do_hard_refresh(), which stops the scroll worker and re-runs setup_reviews_page(is_refresh=True)
- Trigger hard refresh after 8 failed soft recoveries
- Attempt a hard refresh before giving up when the idle timeout is reached
- Max 3 hard refreshes before truly giving up
- Reset recovery counter after successful hard refresh

This ensures the scraper can recover from browser issues, DOM detachment,
or other problems that soft recovery (scroll tricks) can't fix.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 13:42:54 +00:00
parent b55a7a0fb1
commit ff03a4a1b7

View File

@@ -259,10 +259,48 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
review_order = {} # review_id -> position (DOM visual order for sorting)
order_counter = [0] # Current order position
# Don't force language - let Google show all reviews in user's locale
# Track total reviews (persists across refreshes)
total_reviews = [None] # Use list for closure mutation
# Navigate to URL
# Hard refresh counter
hard_refresh_count = [0]
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
# Find scrollable reviews container helper
def find_scroll_container():
    """Return the first usable reviews scroll container, or None.

    Tries each CSS selector in the listed order and returns the first
    matching element that is displayed and taller than 100px. Uses
    ``driver`` and ``By`` from the enclosing scope.

    Returns:
        The matching element, or None when no selector yields one.
    """
    selectors = [
        "div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
        "div.m6QErb.DxyBCb.kA9KIf",
        "div.m6QErb.DxyBCb",
        "div.m6QErb[aria-label]",
        "div.DxyBCb.kA9KIf.dS8AEf",
        "div[role='main'] div.m6QErb",
    ]
    for sel in selectors:
        try:
            candidates = driver.find_elements(By.CSS_SELECTOR, sel)
        except Exception:
            # Best-effort lookup: a failing selector just means "try the next
            # one". Catching Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            continue
        for el in candidates:
            try:
                if el.is_displayed() and el.size['height'] > 100:
                    return el
            except Exception:
                # One unusable element (presumably stale/detached between the
                # lookup and the check) must not hide later candidates that
                # matched the same selector.
                continue
    return None
def setup_reviews_page(is_refresh=False):
"""
Setup the reviews page for scraping.
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
Can be called after initial load or after a hard refresh.
"""
nonlocal total_reviews
refresh_label = " (after refresh)" if is_refresh else ""
# Navigate to URL (only on initial load or refresh)
if not is_refresh:
print(f"🌐 Loading: {url[:80]}...")
else:
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
driver.get(url)
# Handle consent popup if redirected (poll with tiny sleep)
@@ -288,14 +326,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
time.sleep(0.01) # 10ms - responsive but low CPU
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
# ROBUST: Use aria-label="X reviews" on span[role="img"]
# Poll for up to 5s since page might still be loading after consent
total_reviews = None
# Only on first load (don't overwrite if we already have it)
if total_reviews[0] is None:
start = time.time()
while time.time() - start < 5:
try:
total_reviews = driver.execute_script("""
// ROBUST: Find span[role="img"] with aria-label starting with number + "review"
count = driver.execute_script("""
var reviewSpans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < reviewSpans.length; i++) {
var label = reviewSpans[i].getAttribute('aria-label') || '';
@@ -306,8 +342,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}
return null;
""")
if total_reviews:
print(f"📊 Total reviews on page: {total_reviews}")
if count:
total_reviews[0] = count
print(f"📊 Total reviews on page: {count}")
break
except:
pass
@@ -316,42 +353,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time()
tab_clicked = False
while time.time() - start < 5: # Max 5s for tabs
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
if not is_refresh:
print(f" Clicking reviews tab: '{tab.text}'")
tab.click()
tab_clicked = True
break
if tab_clicked:
break
else:
time.sleep(0.01) # 10ms between polls
continue
break # Found and clicked
except:
time.sleep(0.01)
# Find scrollable reviews container
def find_scroll_container():
selectors = [
"div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
"div.m6QErb.DxyBCb.kA9KIf",
"div.m6QErb.DxyBCb",
"div.m6QErb[aria-label]",
"div.DxyBCb.kA9KIf.dS8AEf",
"div[role='main'] div.m6QErb",
]
for sel in selectors:
try:
els = driver.find_elements(By.CSS_SELECTOR, sel)
for el in els:
if el.is_displayed() and el.size['height'] > 100:
return el
except:
pass
return None
# Poll for scroll container (10ms intervals - fast but low CPU)
scroll_container = None
start = time.time()
@@ -362,30 +381,32 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break
elapsed = int(time.time() - start)
if elapsed > last_print:
print(f" Waiting for reviews panel... ({elapsed}s)")
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
last_print = elapsed
time.sleep(0.01) # 10ms - responsive but low CPU
if not scroll_container:
print("❌ Could not find reviews scroll container")
# Debug: print page source snippet
print(f"❌ Could not find reviews scroll container{refresh_label}")
try:
print("Page title:", driver.title)
print("Current URL:", driver.current_url[:100])
except:
pass
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
return None, None
print("✅ Found scroll container")
print(f"✅ Found scroll container{refresh_label}")
# PHASE 2: Inject API interceptor for scroll-loaded reviews
# Inject API interceptor (needs to be re-injected after refresh)
if not is_refresh:
print("🔌 Injecting API interceptor...")
driver.execute_script("""
if (window.__reviewInterceptorInjected) return;
// Always re-setup on refresh
window.__reviewInterceptorInjected = true;
window.__interceptedResponses = [];
window.__interceptedResponses = window.__interceptedResponses || [];
// Intercept fetch
// Intercept fetch (only if not already patched)
if (!window.__fetchPatched) {
window.__fetchPatched = true;
const originalFetch = window.fetch;
window.fetch = async function(...args) {
const url = args[0].toString();
@@ -399,8 +420,11 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}
return response;
};
}
// Intercept XHR
// Intercept XHR (only if not already patched)
if (!window.__xhrPatched) {
window.__xhrPatched = true;
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
@@ -422,30 +446,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
}
}
""")
def get_api_reviews():
"""Get reviews from intercepted API responses."""
api_revs = []
try:
responses = driver.execute_script("""
var r = window.__interceptedResponses || [];
window.__interceptedResponses = [];
return r;
""")
for resp in (responses or []):
body = resp.get("body", "")
api_revs.extend(extract_reviews_from_api_body(body))
except:
pass
return api_revs
# Sort by newest first (helps with loading)
# Sort by newest first
try:
sort_btn = driver.execute_script("""
var btns = document.querySelectorAll('button[data-value="sort"]');
if (btns.length) return btns[0];
// Try aria-label
var all = document.querySelectorAll('button[aria-label*="Sort"]');
if (all.length) return all[0];
return null;
@@ -453,7 +461,6 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if sort_btn:
sort_btn.click()
time.sleep(0.3)
# Click "Newest" option
driver.execute_script("""
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
for (var i = 0; i < items.length; i++) {
@@ -474,8 +481,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
except:
pass
# EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
# This batch-clicks all "More" buttons at once (fast, no waiting per button)
# Expand "More" buttons for full text
try:
expanded = driver.execute_script("""
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
@@ -499,12 +505,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
})
driver.execute_cdp_cmd('Network.enable', {})
if not is_refresh:
print(" 🚫 Blocking images for faster scrolling")
except Exception as e:
pass # CDP might not be available in all setups
except:
pass
# Simple scroll - scrollTop = scrollHeight (proven to work)
# Setup scrollable pane reference
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
# Create scroll worker
stop_scrolling = threading.Event()
def scroll_worker():
@@ -521,12 +530,36 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
scroll_thread.start()
return scroll_container, stop_scrolling
# Initial page setup
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
if not scroll_container:
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
def get_api_reviews():
    """Get reviews from intercepted API responses.

    Drains ``window.__interceptedResponses`` in the browser (the same script
    resets the buffer to an empty array) and passes each response body to
    ``extract_reviews_from_api_body``.

    Returns:
        A list of the extracted reviews; empty when nothing was intercepted
        or the browser call failed.
    """
    try:
        responses = driver.execute_script("""
            var r = window.__interceptedResponses || [];
            window.__interceptedResponses = [];
            return r;
        """)
    except Exception:
        # Best-effort: the page may be mid-navigation; try again next poll.
        return []

    api_revs = []
    for resp in (responses or []):
        # The browser-side buffer has already been cleared, so a response
        # skipped here is gone for good. Isolate failures per response so one
        # malformed entry does not discard the rest of the batch.
        try:
            body = resp.get("body", "")
            api_revs.extend(extract_reviews_from_api_body(body))
        except Exception:
            continue
    return api_revs
# Recovery function - use real mouse actions when stuck
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
recovery_count = [0]
def unstick_scroll():
nonlocal scroll_container
recovery_count[0] += 1
method = recovery_count[0] % 4
try:
@@ -566,6 +599,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
except:
pass
def do_hard_refresh():
"""Hard refresh the page and re-setup everything. Returns True on success."""
nonlocal scroll_container, stop_scrolling
hard_refresh_count[0] += 1
if hard_refresh_count[0] > max_hard_refreshes:
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
return False
# Stop current scroll worker
stop_scrolling.set()
time.sleep(0.2)
# Re-setup page
new_container, new_stop = setup_reviews_page(is_refresh=True)
if new_container:
scroll_container = new_container
stop_scrolling = new_stop
recovery_count[0] = 0 # Reset recovery count after successful refresh
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
return True
else:
print(f" ❌ Hard refresh failed to find scroll container")
return False
# Main collection loop
last_new_time = time.time()
last_count = len(reviews)
@@ -784,9 +842,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Progress update
elapsed = time.time() - last_new_time
if total_reviews:
pct = (current_count / total_reviews) * 100
print(f" 📊 {current_count}/{total_reviews} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
if total_reviews[0]:
pct = (current_count / total_reviews[0]) * 100
print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
else:
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
@@ -797,7 +855,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break
# Also stop if we have all reviews from the page
if total_reviews and current_count >= total_reviews:
if total_reviews[0] and current_count >= total_reviews[0]:
print(f"✅ All {current_count} reviews collected")
stop_scrolling.set()
break
@@ -805,6 +863,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
# Only if we haven't collected all reviews yet
if elapsed >= 3 and int(elapsed) % 3 == 0:
# After 8+ failed recovery attempts, try hard refresh
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
if do_hard_refresh():
last_new_time = time.time() # Reset timer after refresh
continue # Skip to next iteration
else:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
unstick_scroll()
@@ -831,13 +896,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
last_new_time = time.time()
# Dynamic timeout based on state and recovery attempts
# - Try hard refresh before giving up if we still have refreshes left
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
# - 15s max otherwise (keep trying)
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
truly_done = at_bottom and not content_growing and recovery_failed
timeout_hit = elapsed >= 15
timeout_hit = elapsed >= timeout_no_new
if truly_done or timeout_hit:
# Last chance: try hard refresh before giving up
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
if do_hard_refresh():
last_new_time = time.time()
continue # Keep trying
print(f"✅ All reviews loaded: {current_count}")
stop_scrolling.set()
break