Add hard refresh recovery for stuck scraper

When the scraper gets stuck (8+ failed soft recovery attempts), it now
performs a hard page refresh and re-runs the full page setup:
- Reloads the page
- Re-clicks reviews tab
- Re-sorts by newest
- Re-injects API interceptor
- Continues collecting with existing seen_ids for deduplication

Key changes:
- Extract page setup into reusable setup_reviews_page() function
- Add do_hard_refresh() that calls setup on refresh
- Trigger hard refresh after 8 failed soft recoveries
- Try a hard refresh as a last resort before giving up on timeout
- Max 3 hard refreshes before truly giving up
- Reset recovery counter after successful hard refresh

This ensures the scraper can recover from browser issues, DOM detachment,
or other problems that soft recovery (scroll tricks) can't fix.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 13:42:54 +00:00
parent b55a7a0fb1
commit ff03a4a1b7

View File

@@ -259,80 +259,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
review_order = {} # review_id -> position (DOM visual order for sorting) review_order = {} # review_id -> position (DOM visual order for sorting)
order_counter = [0] # Current order position order_counter = [0] # Current order position
# Don't force language - let Google show all reviews in user's locale # Track total reviews (persists across refreshes)
total_reviews = [None] # Use list for closure mutation
# Navigate to URL # Hard refresh counter
print(f"🌐 Loading: {url[:80]}...") hard_refresh_count = [0]
driver.get(url) max_hard_refreshes = 3 # Max number of hard refreshes before giving up
# Handle consent popup if redirected (poll with tiny sleep) # Find scrollable reviews container helper
start = time.time()
while time.time() - start < 5: # Max 5s for consent
if "consent.google" in driver.current_url:
print(" Handling consent popup...")
try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
# Reload original URL after consent
print(" Reloading after consent...")
driver.get(url)
break
except:
pass
break
# Check if we're already on the target page
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
# ROBUST: Use aria-label="X reviews" on span[role="img"]
# Poll for up to 5s since page might still be loading after consent
total_reviews = None
start = time.time()
while time.time() - start < 5:
try:
total_reviews = driver.execute_script("""
// ROBUST: Find span[role="img"] with aria-label starting with number + "review"
var reviewSpans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < reviewSpans.length; i++) {
var label = reviewSpans[i].getAttribute('aria-label') || '';
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
if (match) {
return parseInt(match[1].replace(/[,\\.]/g, ''));
}
}
return null;
""")
if total_reviews:
print(f"📊 Total reviews on page: {total_reviews}")
break
except:
pass
time.sleep(0.1)
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time()
while time.time() - start < 5: # Max 5s for tabs
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
print(f" Clicking reviews tab: '{tab.text}'")
tab.click()
break
else:
time.sleep(0.01) # 10ms between polls
continue
break # Found and clicked
except:
time.sleep(0.01)
# Find scrollable reviews container
def find_scroll_container(): def find_scroll_container():
selectors = [ selectors = [
"div.m6QErb.DxyBCb.kA9KIf.dS8AEf", "div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
@@ -352,77 +286,256 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass pass
return None return None
# Poll for scroll container (10ms intervals - fast but low CPU) def setup_reviews_page(is_refresh=False):
scroll_container = None """
start = time.time() Setup the reviews page for scraping.
last_print = 0 Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
while time.time() - start < 10: # Max 10s Can be called after initial load or after a hard refresh.
scroll_container = find_scroll_container() """
if scroll_container: nonlocal total_reviews
break
elapsed = int(time.time() - start)
if elapsed > last_print:
print(f" Waiting for reviews panel... ({elapsed}s)")
last_print = elapsed
time.sleep(0.01) # 10ms - responsive but low CPU
if not scroll_container: refresh_label = " (after refresh)" if is_refresh else ""
print("❌ Could not find reviews scroll container")
# Debug: print page source snippet # Navigate to URL (only on initial load or refresh)
if not is_refresh:
print(f"🌐 Loading: {url[:80]}...")
else:
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
driver.get(url)
# Handle consent popup if redirected (poll with tiny sleep)
start = time.time()
while time.time() - start < 5: # Max 5s for consent
if "consent.google" in driver.current_url:
print(" Handling consent popup...")
try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
# Reload original URL after consent
print(" Reloading after consent...")
driver.get(url)
break
except:
pass
break
# Check if we're already on the target page
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
# Only on first load (don't overwrite if we already have it)
if total_reviews[0] is None:
start = time.time()
while time.time() - start < 5:
try:
count = driver.execute_script("""
var reviewSpans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < reviewSpans.length; i++) {
var label = reviewSpans[i].getAttribute('aria-label') || '';
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
if (match) {
return parseInt(match[1].replace(/[,\\.]/g, ''));
}
}
return null;
""")
if count:
total_reviews[0] = count
print(f"📊 Total reviews on page: {count}")
break
except:
pass
time.sleep(0.1)
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time()
tab_clicked = False
while time.time() - start < 5: # Max 5s for tabs
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
if not is_refresh:
print(f" Clicking reviews tab: '{tab.text}'")
tab.click()
tab_clicked = True
break
if tab_clicked:
break
time.sleep(0.01) # 10ms between polls
except:
time.sleep(0.01)
# Poll for scroll container (10ms intervals - fast but low CPU)
scroll_container = None
start = time.time()
last_print = 0
while time.time() - start < 10: # Max 10s
scroll_container = find_scroll_container()
if scroll_container:
break
elapsed = int(time.time() - start)
if elapsed > last_print:
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
last_print = elapsed
time.sleep(0.01) # 10ms - responsive but low CPU
if not scroll_container:
print(f"❌ Could not find reviews scroll container{refresh_label}")
try:
print("Page title:", driver.title)
print("Current URL:", driver.current_url[:100])
except:
pass
return None, None
print(f"✅ Found scroll container{refresh_label}")
# Inject API interceptor (needs to be re-injected after refresh)
if not is_refresh:
print("🔌 Injecting API interceptor...")
driver.execute_script("""
// Always re-setup on refresh
window.__reviewInterceptorInjected = true;
window.__interceptedResponses = window.__interceptedResponses || [];
// Intercept fetch (only if not already patched)
if (!window.__fetchPatched) {
window.__fetchPatched = true;
const originalFetch = window.fetch;
window.fetch = async function(...args) {
const url = args[0].toString();
const response = await originalFetch.apply(this, args);
if (url.includes('listugcposts') || url.includes('review')) {
try {
const clone = response.clone();
const text = await clone.text();
window.__interceptedResponses.push({url: url, body: text});
} catch(e) {}
}
return response;
};
}
// Intercept XHR (only if not already patched)
if (!window.__xhrPatched) {
window.__xhrPatched = true;
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
const originalOpen = xhr.open;
let reqUrl = '';
xhr.open = function(method, url, ...rest) {
reqUrl = url;
return originalOpen.apply(this, [method, url, ...rest]);
};
xhr.addEventListener('load', function() {
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
try {
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
} catch(e) {}
}
});
return xhr;
};
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
}
}
""")
# Sort by newest first
try: try:
print("Page title:", driver.title) sort_btn = driver.execute_script("""
print("Current URL:", driver.current_url[:100]) var btns = document.querySelectorAll('button[data-value="sort"]');
if (btns.length) return btns[0];
var all = document.querySelectorAll('button[aria-label*="Sort"]');
if (all.length) return all[0];
return null;
""")
if sort_btn:
sort_btn.click()
time.sleep(0.3)
driver.execute_script("""
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
for (var i = 0; i < items.length; i++) {
var txt = items[i].textContent.toLowerCase();
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
items[i].click();
break;
}
}
""")
time.sleep(0.5)
print(" 📅 Sorted by newest")
# Re-find scroll container after sorting (DOM may be recreated)
new_container = find_scroll_container()
if new_container:
scroll_container = new_container
print(" 🔄 Refreshed scroll container reference")
except: except:
pass pass
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
print("✅ Found scroll container") # Expand "More" buttons for full text
try:
# PHASE 2: Inject API interceptor for scroll-loaded reviews expanded = driver.execute_script("""
print("🔌 Injecting API interceptor...") var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
driver.execute_script(""" var count = 0;
if (window.__reviewInterceptorInjected) return; for (var i = 0; i < buttons.length; i++) {
window.__reviewInterceptorInjected = true; if (buttons[i].textContent.trim() === 'More') {
window.__interceptedResponses = []; buttons[i].click();
count++;
// Intercept fetch }
const originalFetch = window.fetch;
window.fetch = async function(...args) {
const url = args[0].toString();
const response = await originalFetch.apply(this, args);
if (url.includes('listugcposts') || url.includes('review')) {
try {
const clone = response.clone();
const text = await clone.text();
window.__interceptedResponses.push({url: url, body: text});
} catch(e) {}
}
return response;
};
// Intercept XHR
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
const originalOpen = xhr.open;
let reqUrl = '';
xhr.open = function(method, url, ...rest) {
reqUrl = url;
return originalOpen.apply(this, [method, url, ...rest]);
};
xhr.addEventListener('load', function() {
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
try {
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
} catch(e) {}
} }
}); return count;
return xhr; """)
}; if expanded > 0:
for (let prop of Object.getOwnPropertyNames(originalXHR)) { print(f" 📝 Expanded {expanded} truncated reviews")
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {} except:
} pass
""")
# Block images to speed up scrolling (use CDP)
try:
driver.execute_cdp_cmd('Network.setBlockedURLs', {
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
})
driver.execute_cdp_cmd('Network.enable', {})
if not is_refresh:
print(" 🚫 Blocking images for faster scrolling")
except:
pass
# Setup scrollable pane reference
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
# Create scroll worker
stop_scrolling = threading.Event()
def scroll_worker():
while not stop_scrolling.is_set():
try:
driver.execute_script("""
var p = window.scrollablePane;
if (p) p.scrollTop = p.scrollHeight;
""")
except:
pass
time.sleep(0.1)
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
scroll_thread.start()
return scroll_container, stop_scrolling
# Initial page setup
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
if not scroll_container:
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
def get_api_reviews(): def get_api_reviews():
"""Get reviews from intercepted API responses.""" """Get reviews from intercepted API responses."""
@@ -440,93 +553,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass pass
return api_revs return api_revs
# Sort by newest first (helps with loading)
try:
sort_btn = driver.execute_script("""
var btns = document.querySelectorAll('button[data-value="sort"]');
if (btns.length) return btns[0];
// Try aria-label
var all = document.querySelectorAll('button[aria-label*="Sort"]');
if (all.length) return all[0];
return null;
""")
if sort_btn:
sort_btn.click()
time.sleep(0.3)
# Click "Newest" option
driver.execute_script("""
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
for (var i = 0; i < items.length; i++) {
var txt = items[i].textContent.toLowerCase();
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
items[i].click();
break;
}
}
""")
time.sleep(0.5)
print(" 📅 Sorted by newest")
# Re-find scroll container after sorting (DOM may be recreated)
new_container = find_scroll_container()
if new_container:
scroll_container = new_container
print(" 🔄 Refreshed scroll container reference")
except:
pass
# EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
# This batch-clicks all "More" buttons at once (fast, no waiting per button)
try:
expanded = driver.execute_script("""
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
var count = 0;
for (var i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === 'More') {
buttons[i].click();
count++;
}
}
return count;
""")
if expanded > 0:
print(f" 📝 Expanded {expanded} truncated reviews")
except:
pass
# Block images to speed up scrolling (use CDP)
try:
driver.execute_cdp_cmd('Network.setBlockedURLs', {
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
})
driver.execute_cdp_cmd('Network.enable', {})
print(" 🚫 Blocking images for faster scrolling")
except Exception as e:
pass # CDP might not be available in all setups
# Simple scroll - scrollTop = scrollHeight (proven to work)
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
stop_scrolling = threading.Event()
def scroll_worker():
while not stop_scrolling.is_set():
try:
driver.execute_script("""
var p = window.scrollablePane;
if (p) p.scrollTop = p.scrollHeight;
""")
except:
pass
time.sleep(0.1)
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
scroll_thread.start()
# Recovery function - use real mouse actions when stuck # Recovery function - use real mouse actions when stuck
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.keys import Keys
recovery_count = [0] recovery_count = [0]
def unstick_scroll(): def unstick_scroll():
nonlocal scroll_container
recovery_count[0] += 1 recovery_count[0] += 1
method = recovery_count[0] % 4 method = recovery_count[0] % 4
try: try:
@@ -566,6 +599,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
except: except:
pass pass
def do_hard_refresh():
"""Hard refresh the page and re-setup everything. Returns True on success."""
nonlocal scroll_container, stop_scrolling
hard_refresh_count[0] += 1
if hard_refresh_count[0] > max_hard_refreshes:
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
return False
# Stop current scroll worker
stop_scrolling.set()
time.sleep(0.2)
# Re-setup page
new_container, new_stop = setup_reviews_page(is_refresh=True)
if new_container:
scroll_container = new_container
stop_scrolling = new_stop
recovery_count[0] = 0 # Reset recovery count after successful refresh
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
return True
else:
print(f" ❌ Hard refresh failed to find scroll container")
return False
# Main collection loop # Main collection loop
last_new_time = time.time() last_new_time = time.time()
last_count = len(reviews) last_count = len(reviews)
@@ -784,9 +842,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Progress update # Progress update
elapsed = time.time() - last_new_time elapsed = time.time() - last_new_time
if total_reviews: if total_reviews[0]:
pct = (current_count / total_reviews) * 100 pct = (current_count / total_reviews[0]) * 100
print(f" 📊 {current_count}/{total_reviews} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True) print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
else: else:
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
@@ -797,7 +855,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break break
# Also stop if we have all reviews from the page # Also stop if we have all reviews from the page
if total_reviews and current_count >= total_reviews: if total_reviews[0] and current_count >= total_reviews[0]:
print(f"✅ All {current_count} reviews collected") print(f"✅ All {current_count} reviews collected")
stop_scrolling.set() stop_scrolling.set()
break break
@@ -805,8 +863,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# STUCK DETECTION: If no new reviews for 3s+, try to unstick # STUCK DETECTION: If no new reviews for 3s+, try to unstick
# Only if we haven't collected all reviews yet # Only if we haven't collected all reviews yet
if elapsed >= 3 and int(elapsed) % 3 == 0: if elapsed >= 3 and int(elapsed) % 3 == 0:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True) # After 8+ failed recovery attempts, try hard refresh
unstick_scroll() if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
if do_hard_refresh():
last_new_time = time.time() # Reset timer after refresh
continue # Skip to next iteration
else:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
unstick_scroll()
# Check scroll state - track if content is still being added # Check scroll state - track if content is still being added
try: try:
@@ -831,13 +896,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
last_new_time = time.time() last_new_time = time.time()
# Dynamic timeout based on state and recovery attempts # Dynamic timeout based on state and recovery attempts
# - Try hard refresh before giving up if we still have refreshes left
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
# - 15s max otherwise (keep trying) # - 15s max otherwise (keep trying)
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5 recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
truly_done = at_bottom and not content_growing and recovery_failed truly_done = at_bottom and not content_growing and recovery_failed
timeout_hit = elapsed >= 15 timeout_hit = elapsed >= timeout_no_new
if truly_done or timeout_hit: if truly_done or timeout_hit:
# Last chance: try hard refresh before giving up
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
if do_hard_refresh():
last_new_time = time.time()
continue # Keep trying
print(f"✅ All reviews loaded: {current_count}") print(f"✅ All reviews loaded: {current_count}")
stop_scrolling.set() stop_scrolling.set()
break break