Add hard refresh recovery for stuck scraper
When the scraper gets stuck (8+ failed soft recovery attempts), it now does a hard page refresh and sets everything up again:

- Reloads the page
- Re-clicks the reviews tab
- Re-sorts by newest
- Re-injects the API interceptor
- Continues collecting with the existing seen_ids for deduplication

Key changes:

- Extract page setup into a reusable setup_reviews_page() function
- Add do_hard_refresh(), which calls the setup function on refresh
- Trigger a hard refresh after 8 failed soft recoveries
- Try a hard refresh before the timeout gives up completely
- Allow at most 3 hard refreshes before truly giving up
- Reset the recovery counter after a successful hard refresh

This ensures the scraper can recover from browser issues, DOM detachment, or other problems that soft recovery (scroll tricks) can't fix.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -259,80 +259,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
review_order = {} # review_id -> position (DOM visual order for sorting)
|
||||
order_counter = [0] # Current order position
|
||||
|
||||
# Don't force language - let Google show all reviews in user's locale
|
||||
# Track total reviews (persists across refreshes)
|
||||
total_reviews = [None] # Use list for closure mutation
|
||||
|
||||
# Navigate to URL
|
||||
print(f"🌐 Loading: {url[:80]}...")
|
||||
driver.get(url)
|
||||
# Hard refresh counter
|
||||
hard_refresh_count = [0]
|
||||
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
|
||||
|
||||
# Handle consent popup if redirected (poll with tiny sleep)
|
||||
start = time.time()
|
||||
while time.time() - start < 5: # Max 5s for consent
|
||||
if "consent.google" in driver.current_url:
|
||||
print(" Handling consent popup...")
|
||||
try:
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||
txt = btn.text.lower()
|
||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
# Reload original URL after consent
|
||||
print(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
break
|
||||
except:
|
||||
pass
|
||||
break
|
||||
# Check if we're already on the target page
|
||||
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
|
||||
break
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
||||
# ROBUST: Use aria-label="X reviews" on span[role="img"]
|
||||
# Poll for up to 5s since page might still be loading after consent
|
||||
total_reviews = None
|
||||
start = time.time()
|
||||
while time.time() - start < 5:
|
||||
try:
|
||||
total_reviews = driver.execute_script("""
|
||||
// ROBUST: Find span[role="img"] with aria-label starting with number + "review"
|
||||
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < reviewSpans.length; i++) {
|
||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
}
|
||||
}
|
||||
return null;
|
||||
""")
|
||||
if total_reviews:
|
||||
print(f"📊 Total reviews on page: {total_reviews}")
|
||||
break
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
|
||||
# Click reviews tab - poll until found
|
||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||
start = time.time()
|
||||
while time.time() - start < 5: # Max 5s for tabs
|
||||
try:
|
||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||
for tab in tabs:
|
||||
tab_text = tab.text.lower()
|
||||
if any(kw in tab_text for kw in review_keywords):
|
||||
print(f" Clicking reviews tab: '{tab.text}'")
|
||||
tab.click()
|
||||
break
|
||||
else:
|
||||
time.sleep(0.01) # 10ms between polls
|
||||
continue
|
||||
break # Found and clicked
|
||||
except:
|
||||
time.sleep(0.01)
|
||||
|
||||
# Find scrollable reviews container
|
||||
# Find scrollable reviews container helper
|
||||
def find_scroll_container():
|
||||
selectors = [
|
||||
"div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
|
||||
@@ -352,77 +286,256 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
pass
|
||||
return None
|
||||
|
||||
# Poll for scroll container (10ms intervals - fast but low CPU)
|
||||
scroll_container = None
|
||||
start = time.time()
|
||||
last_print = 0
|
||||
while time.time() - start < 10: # Max 10s
|
||||
scroll_container = find_scroll_container()
|
||||
if scroll_container:
|
||||
break
|
||||
elapsed = int(time.time() - start)
|
||||
if elapsed > last_print:
|
||||
print(f" Waiting for reviews panel... ({elapsed}s)")
|
||||
last_print = elapsed
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
def setup_reviews_page(is_refresh=False):
|
||||
"""
|
||||
Setup the reviews page for scraping.
|
||||
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
|
||||
Can be called after initial load or after a hard refresh.
|
||||
"""
|
||||
nonlocal total_reviews
|
||||
|
||||
if not scroll_container:
|
||||
print("❌ Could not find reviews scroll container")
|
||||
# Debug: print page source snippet
|
||||
refresh_label = " (after refresh)" if is_refresh else ""
|
||||
|
||||
# Navigate to URL (only on initial load or refresh)
|
||||
if not is_refresh:
|
||||
print(f"🌐 Loading: {url[:80]}...")
|
||||
else:
|
||||
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||
driver.get(url)
|
||||
|
||||
# Handle consent popup if redirected (poll with tiny sleep)
|
||||
start = time.time()
|
||||
while time.time() - start < 5: # Max 5s for consent
|
||||
if "consent.google" in driver.current_url:
|
||||
print(" Handling consent popup...")
|
||||
try:
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||
txt = btn.text.lower()
|
||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
# Reload original URL after consent
|
||||
print(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
break
|
||||
except:
|
||||
pass
|
||||
break
|
||||
# Check if we're already on the target page
|
||||
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
|
||||
break
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
||||
# Only on first load (don't overwrite if we already have it)
|
||||
if total_reviews[0] is None:
|
||||
start = time.time()
|
||||
while time.time() - start < 5:
|
||||
try:
|
||||
count = driver.execute_script("""
|
||||
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < reviewSpans.length; i++) {
|
||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
}
|
||||
}
|
||||
return null;
|
||||
""")
|
||||
if count:
|
||||
total_reviews[0] = count
|
||||
print(f"📊 Total reviews on page: {count}")
|
||||
break
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
|
||||
# Click reviews tab - poll until found
|
||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||
start = time.time()
|
||||
tab_clicked = False
|
||||
while time.time() - start < 5: # Max 5s for tabs
|
||||
try:
|
||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||
for tab in tabs:
|
||||
tab_text = tab.text.lower()
|
||||
if any(kw in tab_text for kw in review_keywords):
|
||||
if not is_refresh:
|
||||
print(f" Clicking reviews tab: '{tab.text}'")
|
||||
tab.click()
|
||||
tab_clicked = True
|
||||
break
|
||||
if tab_clicked:
|
||||
break
|
||||
time.sleep(0.01) # 10ms between polls
|
||||
except:
|
||||
time.sleep(0.01)
|
||||
|
||||
# Poll for scroll container (10ms intervals - fast but low CPU)
|
||||
scroll_container = None
|
||||
start = time.time()
|
||||
last_print = 0
|
||||
while time.time() - start < 10: # Max 10s
|
||||
scroll_container = find_scroll_container()
|
||||
if scroll_container:
|
||||
break
|
||||
elapsed = int(time.time() - start)
|
||||
if elapsed > last_print:
|
||||
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
|
||||
last_print = elapsed
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
if not scroll_container:
|
||||
print(f"❌ Could not find reviews scroll container{refresh_label}")
|
||||
try:
|
||||
print("Page title:", driver.title)
|
||||
print("Current URL:", driver.current_url[:100])
|
||||
except:
|
||||
pass
|
||||
return None, None
|
||||
|
||||
print(f"✅ Found scroll container{refresh_label}")
|
||||
|
||||
# Inject API interceptor (needs to be re-injected after refresh)
|
||||
if not is_refresh:
|
||||
print("🔌 Injecting API interceptor...")
|
||||
driver.execute_script("""
|
||||
// Always re-setup on refresh
|
||||
window.__reviewInterceptorInjected = true;
|
||||
window.__interceptedResponses = window.__interceptedResponses || [];
|
||||
|
||||
// Intercept fetch (only if not already patched)
|
||||
if (!window.__fetchPatched) {
|
||||
window.__fetchPatched = true;
|
||||
const originalFetch = window.fetch;
|
||||
window.fetch = async function(...args) {
|
||||
const url = args[0].toString();
|
||||
const response = await originalFetch.apply(this, args);
|
||||
if (url.includes('listugcposts') || url.includes('review')) {
|
||||
try {
|
||||
const clone = response.clone();
|
||||
const text = await clone.text();
|
||||
window.__interceptedResponses.push({url: url, body: text});
|
||||
} catch(e) {}
|
||||
}
|
||||
return response;
|
||||
};
|
||||
}
|
||||
|
||||
// Intercept XHR (only if not already patched)
|
||||
if (!window.__xhrPatched) {
|
||||
window.__xhrPatched = true;
|
||||
const originalXHR = window.XMLHttpRequest;
|
||||
window.XMLHttpRequest = function() {
|
||||
const xhr = new originalXHR();
|
||||
const originalOpen = xhr.open;
|
||||
let reqUrl = '';
|
||||
xhr.open = function(method, url, ...rest) {
|
||||
reqUrl = url;
|
||||
return originalOpen.apply(this, [method, url, ...rest]);
|
||||
};
|
||||
xhr.addEventListener('load', function() {
|
||||
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
|
||||
try {
|
||||
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
|
||||
} catch(e) {}
|
||||
}
|
||||
});
|
||||
return xhr;
|
||||
};
|
||||
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
|
||||
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
|
||||
}
|
||||
}
|
||||
""")
|
||||
|
||||
# Sort by newest first
|
||||
try:
|
||||
print("Page title:", driver.title)
|
||||
print("Current URL:", driver.current_url[:100])
|
||||
sort_btn = driver.execute_script("""
|
||||
var btns = document.querySelectorAll('button[data-value="sort"]');
|
||||
if (btns.length) return btns[0];
|
||||
var all = document.querySelectorAll('button[aria-label*="Sort"]');
|
||||
if (all.length) return all[0];
|
||||
return null;
|
||||
""")
|
||||
if sort_btn:
|
||||
sort_btn.click()
|
||||
time.sleep(0.3)
|
||||
driver.execute_script("""
|
||||
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
|
||||
for (var i = 0; i < items.length; i++) {
|
||||
var txt = items[i].textContent.toLowerCase();
|
||||
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
|
||||
items[i].click();
|
||||
break;
|
||||
}
|
||||
}
|
||||
""")
|
||||
time.sleep(0.5)
|
||||
print(" 📅 Sorted by newest")
|
||||
# Re-find scroll container after sorting (DOM may be recreated)
|
||||
new_container = find_scroll_container()
|
||||
if new_container:
|
||||
scroll_container = new_container
|
||||
print(" 🔄 Refreshed scroll container reference")
|
||||
except:
|
||||
pass
|
||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
||||
|
||||
print("✅ Found scroll container")
|
||||
|
||||
# PHASE 2: Inject API interceptor for scroll-loaded reviews
|
||||
print("🔌 Injecting API interceptor...")
|
||||
driver.execute_script("""
|
||||
if (window.__reviewInterceptorInjected) return;
|
||||
window.__reviewInterceptorInjected = true;
|
||||
window.__interceptedResponses = [];
|
||||
|
||||
// Intercept fetch
|
||||
const originalFetch = window.fetch;
|
||||
window.fetch = async function(...args) {
|
||||
const url = args[0].toString();
|
||||
const response = await originalFetch.apply(this, args);
|
||||
if (url.includes('listugcposts') || url.includes('review')) {
|
||||
try {
|
||||
const clone = response.clone();
|
||||
const text = await clone.text();
|
||||
window.__interceptedResponses.push({url: url, body: text});
|
||||
} catch(e) {}
|
||||
}
|
||||
return response;
|
||||
};
|
||||
|
||||
// Intercept XHR
|
||||
const originalXHR = window.XMLHttpRequest;
|
||||
window.XMLHttpRequest = function() {
|
||||
const xhr = new originalXHR();
|
||||
const originalOpen = xhr.open;
|
||||
let reqUrl = '';
|
||||
xhr.open = function(method, url, ...rest) {
|
||||
reqUrl = url;
|
||||
return originalOpen.apply(this, [method, url, ...rest]);
|
||||
};
|
||||
xhr.addEventListener('load', function() {
|
||||
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
|
||||
try {
|
||||
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
|
||||
} catch(e) {}
|
||||
# Expand "More" buttons for full text
|
||||
try:
|
||||
expanded = driver.execute_script("""
|
||||
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
||||
var count = 0;
|
||||
for (var i = 0; i < buttons.length; i++) {
|
||||
if (buttons[i].textContent.trim() === 'More') {
|
||||
buttons[i].click();
|
||||
count++;
|
||||
}
|
||||
}
|
||||
});
|
||||
return xhr;
|
||||
};
|
||||
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
|
||||
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
|
||||
}
|
||||
""")
|
||||
return count;
|
||||
""")
|
||||
if expanded > 0:
|
||||
print(f" 📝 Expanded {expanded} truncated reviews")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Block images to speed up scrolling (use CDP)
|
||||
try:
|
||||
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
|
||||
})
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
if not is_refresh:
|
||||
print(" 🚫 Blocking images for faster scrolling")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Setup scrollable pane reference
|
||||
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
||||
|
||||
# Create scroll worker
|
||||
stop_scrolling = threading.Event()
|
||||
|
||||
def scroll_worker():
    """Keep the reviews pane scrolled to the bottom until told to stop.

    Runs in a background thread. Each pass sets ``scrollTop`` of
    ``window.scrollablePane`` to its ``scrollHeight`` and then pauses for up
    to 0.1s. The loop ends once the ``stop_scrolling`` event is set.
    """
    while not stop_scrolling.is_set():
        try:
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = p.scrollHeight;
            """)
        except Exception:
            # Best effort: a failed scroll call is simply retried on the
            # next pass. Narrowed from a bare ``except`` so interpreter-exit
            # signals are not swallowed.
            pass
        # Wait on the event rather than sleeping so that a stop request
        # wakes the worker immediately instead of after the full pause.
        stop_scrolling.wait(0.1)
|
||||
|
||||
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
||||
scroll_thread.start()
|
||||
|
||||
return scroll_container, stop_scrolling
|
||||
|
||||
# Initial page setup
|
||||
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
|
||||
if not scroll_container:
|
||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
||||
|
||||
def get_api_reviews():
|
||||
"""Get reviews from intercepted API responses."""
|
||||
@@ -440,93 +553,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
pass
|
||||
return api_revs
|
||||
|
||||
# Sort by newest first (helps with loading)
|
||||
try:
|
||||
sort_btn = driver.execute_script("""
|
||||
var btns = document.querySelectorAll('button[data-value="sort"]');
|
||||
if (btns.length) return btns[0];
|
||||
// Try aria-label
|
||||
var all = document.querySelectorAll('button[aria-label*="Sort"]');
|
||||
if (all.length) return all[0];
|
||||
return null;
|
||||
""")
|
||||
if sort_btn:
|
||||
sort_btn.click()
|
||||
time.sleep(0.3)
|
||||
# Click "Newest" option
|
||||
driver.execute_script("""
|
||||
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
|
||||
for (var i = 0; i < items.length; i++) {
|
||||
var txt = items[i].textContent.toLowerCase();
|
||||
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
|
||||
items[i].click();
|
||||
break;
|
||||
}
|
||||
}
|
||||
""")
|
||||
time.sleep(0.5)
|
||||
print(" 📅 Sorted by newest")
|
||||
# Re-find scroll container after sorting (DOM may be recreated)
|
||||
new_container = find_scroll_container()
|
||||
if new_container:
|
||||
scroll_container = new_container
|
||||
print(" 🔄 Refreshed scroll container reference")
|
||||
except:
|
||||
pass
|
||||
|
||||
# EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
|
||||
# This batch-clicks all "More" buttons at once (fast, no waiting per button)
|
||||
try:
|
||||
expanded = driver.execute_script("""
|
||||
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
||||
var count = 0;
|
||||
for (var i = 0; i < buttons.length; i++) {
|
||||
if (buttons[i].textContent.trim() === 'More') {
|
||||
buttons[i].click();
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
""")
|
||||
if expanded > 0:
|
||||
print(f" 📝 Expanded {expanded} truncated reviews")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Block images to speed up scrolling (use CDP)
|
||||
try:
|
||||
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
|
||||
})
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
print(" 🚫 Blocking images for faster scrolling")
|
||||
except Exception as e:
|
||||
pass # CDP might not be available in all setups
|
||||
|
||||
# Simple scroll - scrollTop = scrollHeight (proven to work)
|
||||
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
||||
stop_scrolling = threading.Event()
|
||||
|
||||
def scroll_worker():
|
||||
while not stop_scrolling.is_set():
|
||||
try:
|
||||
driver.execute_script("""
|
||||
var p = window.scrollablePane;
|
||||
if (p) p.scrollTop = p.scrollHeight;
|
||||
""")
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
|
||||
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
||||
scroll_thread.start()
|
||||
|
||||
# Recovery function - use real mouse actions when stuck
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
recovery_count = [0]
|
||||
|
||||
def unstick_scroll():
|
||||
nonlocal scroll_container
|
||||
recovery_count[0] += 1
|
||||
method = recovery_count[0] % 4
|
||||
try:
|
||||
@@ -566,6 +599,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
except:
|
||||
pass
|
||||
|
||||
def do_hard_refresh():
|
||||
"""Hard refresh the page and re-setup everything. Returns True on success."""
|
||||
nonlocal scroll_container, stop_scrolling
|
||||
hard_refresh_count[0] += 1
|
||||
|
||||
if hard_refresh_count[0] > max_hard_refreshes:
|
||||
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
|
||||
return False
|
||||
|
||||
# Stop current scroll worker
|
||||
stop_scrolling.set()
|
||||
time.sleep(0.2)
|
||||
|
||||
# Re-setup page
|
||||
new_container, new_stop = setup_reviews_page(is_refresh=True)
|
||||
if new_container:
|
||||
scroll_container = new_container
|
||||
stop_scrolling = new_stop
|
||||
recovery_count[0] = 0 # Reset recovery count after successful refresh
|
||||
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
|
||||
return True
|
||||
else:
|
||||
print(f" ❌ Hard refresh failed to find scroll container")
|
||||
return False
|
||||
|
||||
# Main collection loop
|
||||
last_new_time = time.time()
|
||||
last_count = len(reviews)
|
||||
@@ -784,9 +842,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# Progress update
|
||||
elapsed = time.time() - last_new_time
|
||||
if total_reviews:
|
||||
pct = (current_count / total_reviews) * 100
|
||||
print(f" 📊 {current_count}/{total_reviews} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
|
||||
if total_reviews[0]:
|
||||
pct = (current_count / total_reviews[0]) * 100
|
||||
print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
|
||||
else:
|
||||
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
||||
|
||||
@@ -797,7 +855,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
break
|
||||
|
||||
# Also stop if we have all reviews from the page
|
||||
if total_reviews and current_count >= total_reviews:
|
||||
if total_reviews[0] and current_count >= total_reviews[0]:
|
||||
print(f"✅ All {current_count} reviews collected")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
@@ -805,8 +863,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
||||
# Only if we haven't collected all reviews yet
|
||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||
unstick_scroll()
|
||||
# After 8+ failed recovery attempts, try hard refresh
|
||||
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
|
||||
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
|
||||
if do_hard_refresh():
|
||||
last_new_time = time.time() # Reset timer after refresh
|
||||
continue # Skip to next iteration
|
||||
else:
|
||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||
unstick_scroll()
|
||||
|
||||
# Check scroll state - track if content is still being added
|
||||
try:
|
||||
@@ -831,13 +896,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
last_new_time = time.time()
|
||||
|
||||
# Dynamic timeout based on state and recovery attempts
|
||||
# - Try hard refresh before giving up if we still have refreshes left
|
||||
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
|
||||
# - 15s max otherwise (keep trying)
|
||||
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
|
||||
truly_done = at_bottom and not content_growing and recovery_failed
|
||||
timeout_hit = elapsed >= 15
|
||||
timeout_hit = elapsed >= timeout_no_new
|
||||
|
||||
if truly_done or timeout_hit:
|
||||
# Last chance: try hard refresh before giving up
|
||||
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
||||
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
|
||||
if do_hard_refresh():
|
||||
last_new_time = time.time()
|
||||
continue # Keep trying
|
||||
print(f"✅ All reviews loaded: {current_count}")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user