Add hard refresh recovery for stuck scraper
When the scraper gets stuck (8+ failed soft recovery attempts), it now performs a hard page refresh and re-runs the full page setup:
- Reloads the page
- Re-clicks the reviews tab
- Re-sorts by newest
- Re-injects the API interceptor
- Continues collecting, using the existing seen_ids for deduplication

Key changes:
- Extract page setup into a reusable setup_reviews_page() function
- Add do_hard_refresh(), which calls that setup on refresh
- Trigger a hard refresh after 8 failed soft recoveries
- Try a hard refresh before the timeout gives up completely
- Allow at most 3 hard refreshes before truly giving up
- Reset the recovery counter after a successful hard refresh

This ensures the scraper can recover from browser issues, DOM detachment, or other problems that soft recovery (scroll tricks) can't fix.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -259,80 +259,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
review_order = {} # review_id -> position (DOM visual order for sorting)
|
review_order = {} # review_id -> position (DOM visual order for sorting)
|
||||||
order_counter = [0] # Current order position
|
order_counter = [0] # Current order position
|
||||||
|
|
||||||
# Don't force language - let Google show all reviews in user's locale
|
# Track total reviews (persists across refreshes)
|
||||||
|
total_reviews = [None] # Use list for closure mutation
|
||||||
|
|
||||||
# Navigate to URL
|
# Hard refresh counter
|
||||||
print(f"🌐 Loading: {url[:80]}...")
|
hard_refresh_count = [0]
|
||||||
driver.get(url)
|
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
|
||||||
|
|
||||||
# Handle consent popup if redirected (poll with tiny sleep)
|
# Find scrollable reviews container helper
|
||||||
start = time.time()
|
|
||||||
while time.time() - start < 5: # Max 5s for consent
|
|
||||||
if "consent.google" in driver.current_url:
|
|
||||||
print(" Handling consent popup...")
|
|
||||||
try:
|
|
||||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
|
||||||
txt = btn.text.lower()
|
|
||||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
|
||||||
btn.click()
|
|
||||||
# Reload original URL after consent
|
|
||||||
print(" Reloading after consent...")
|
|
||||||
driver.get(url)
|
|
||||||
break
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
break
|
|
||||||
# Check if we're already on the target page
|
|
||||||
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
|
|
||||||
break
|
|
||||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
|
||||||
|
|
||||||
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
|
||||||
# ROBUST: Use aria-label="X reviews" on span[role="img"]
|
|
||||||
# Poll for up to 5s since page might still be loading after consent
|
|
||||||
total_reviews = None
|
|
||||||
start = time.time()
|
|
||||||
while time.time() - start < 5:
|
|
||||||
try:
|
|
||||||
total_reviews = driver.execute_script("""
|
|
||||||
// ROBUST: Find span[role="img"] with aria-label starting with number + "review"
|
|
||||||
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
|
||||||
for (var i = 0; i < reviewSpans.length; i++) {
|
|
||||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
|
||||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
|
||||||
if (match) {
|
|
||||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
""")
|
|
||||||
if total_reviews:
|
|
||||||
print(f"📊 Total reviews on page: {total_reviews}")
|
|
||||||
break
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
time.sleep(0.1)
|
|
||||||
|
|
||||||
# Click reviews tab - poll until found
|
|
||||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
|
||||||
start = time.time()
|
|
||||||
while time.time() - start < 5: # Max 5s for tabs
|
|
||||||
try:
|
|
||||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
|
||||||
for tab in tabs:
|
|
||||||
tab_text = tab.text.lower()
|
|
||||||
if any(kw in tab_text for kw in review_keywords):
|
|
||||||
print(f" Clicking reviews tab: '{tab.text}'")
|
|
||||||
tab.click()
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
time.sleep(0.01) # 10ms between polls
|
|
||||||
continue
|
|
||||||
break # Found and clicked
|
|
||||||
except:
|
|
||||||
time.sleep(0.01)
|
|
||||||
|
|
||||||
# Find scrollable reviews container
|
|
||||||
def find_scroll_container():
|
def find_scroll_container():
|
||||||
selectors = [
|
selectors = [
|
||||||
"div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
|
"div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
|
||||||
@@ -352,77 +286,256 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
pass
|
pass
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Poll for scroll container (10ms intervals - fast but low CPU)
|
def setup_reviews_page(is_refresh=False):
|
||||||
scroll_container = None
|
"""
|
||||||
start = time.time()
|
Setup the reviews page for scraping.
|
||||||
last_print = 0
|
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
|
||||||
while time.time() - start < 10: # Max 10s
|
Can be called after initial load or after a hard refresh.
|
||||||
scroll_container = find_scroll_container()
|
"""
|
||||||
if scroll_container:
|
nonlocal total_reviews
|
||||||
break
|
|
||||||
elapsed = int(time.time() - start)
|
|
||||||
if elapsed > last_print:
|
|
||||||
print(f" Waiting for reviews panel... ({elapsed}s)")
|
|
||||||
last_print = elapsed
|
|
||||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
|
||||||
|
|
||||||
if not scroll_container:
|
refresh_label = " (after refresh)" if is_refresh else ""
|
||||||
print("❌ Could not find reviews scroll container")
|
|
||||||
# Debug: print page source snippet
|
# Navigate to URL (done on both the initial load and a hard refresh; only the log message differs)
|
||||||
|
if not is_refresh:
|
||||||
|
print(f"🌐 Loading: {url[:80]}...")
|
||||||
|
else:
|
||||||
|
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||||
|
driver.get(url)
|
||||||
|
|
||||||
|
# Handle consent popup if redirected (poll with tiny sleep)
|
||||||
|
start = time.time()
|
||||||
|
while time.time() - start < 5: # Max 5s for consent
|
||||||
|
if "consent.google" in driver.current_url:
|
||||||
|
print(" Handling consent popup...")
|
||||||
|
try:
|
||||||
|
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||||
|
txt = btn.text.lower()
|
||||||
|
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||||
|
btn.click()
|
||||||
|
# Reload original URL after consent
|
||||||
|
print(" Reloading after consent...")
|
||||||
|
driver.get(url)
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
# Check if we're already on the target page
|
||||||
|
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
|
||||||
|
break
|
||||||
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||||
|
|
||||||
|
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
||||||
|
# Only while the count is still unknown (don't overwrite a value we already have)
|
||||||
|
if total_reviews[0] is None:
|
||||||
|
start = time.time()
|
||||||
|
while time.time() - start < 5:
|
||||||
|
try:
|
||||||
|
count = driver.execute_script("""
|
||||||
|
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
||||||
|
for (var i = 0; i < reviewSpans.length; i++) {
|
||||||
|
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||||
|
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||||
|
if (match) {
|
||||||
|
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
""")
|
||||||
|
if count:
|
||||||
|
total_reviews[0] = count
|
||||||
|
print(f"📊 Total reviews on page: {count}")
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
# Click reviews tab - poll until found
|
||||||
|
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||||
|
start = time.time()
|
||||||
|
tab_clicked = False
|
||||||
|
while time.time() - start < 5: # Max 5s for tabs
|
||||||
|
try:
|
||||||
|
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||||
|
for tab in tabs:
|
||||||
|
tab_text = tab.text.lower()
|
||||||
|
if any(kw in tab_text for kw in review_keywords):
|
||||||
|
if not is_refresh:
|
||||||
|
print(f" Clicking reviews tab: '{tab.text}'")
|
||||||
|
tab.click()
|
||||||
|
tab_clicked = True
|
||||||
|
break
|
||||||
|
if tab_clicked:
|
||||||
|
break
|
||||||
|
time.sleep(0.01) # 10ms between polls
|
||||||
|
except:
|
||||||
|
time.sleep(0.01)
|
||||||
|
|
||||||
|
# Poll for scroll container (10ms intervals - fast but low CPU)
|
||||||
|
scroll_container = None
|
||||||
|
start = time.time()
|
||||||
|
last_print = 0
|
||||||
|
while time.time() - start < 10: # Max 10s
|
||||||
|
scroll_container = find_scroll_container()
|
||||||
|
if scroll_container:
|
||||||
|
break
|
||||||
|
elapsed = int(time.time() - start)
|
||||||
|
if elapsed > last_print:
|
||||||
|
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
|
||||||
|
last_print = elapsed
|
||||||
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||||
|
|
||||||
|
if not scroll_container:
|
||||||
|
print(f"❌ Could not find reviews scroll container{refresh_label}")
|
||||||
|
try:
|
||||||
|
print("Page title:", driver.title)
|
||||||
|
print("Current URL:", driver.current_url[:100])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
print(f"✅ Found scroll container{refresh_label}")
|
||||||
|
|
||||||
|
# Inject API interceptor (needs to be re-injected after refresh)
|
||||||
|
if not is_refresh:
|
||||||
|
print("🔌 Injecting API interceptor...")
|
||||||
|
driver.execute_script("""
|
||||||
|
// Always re-setup on refresh
|
||||||
|
window.__reviewInterceptorInjected = true;
|
||||||
|
window.__interceptedResponses = window.__interceptedResponses || [];
|
||||||
|
|
||||||
|
// Intercept fetch (only if not already patched)
|
||||||
|
if (!window.__fetchPatched) {
|
||||||
|
window.__fetchPatched = true;
|
||||||
|
const originalFetch = window.fetch;
|
||||||
|
window.fetch = async function(...args) {
|
||||||
|
const url = args[0].toString();
|
||||||
|
const response = await originalFetch.apply(this, args);
|
||||||
|
if (url.includes('listugcposts') || url.includes('review')) {
|
||||||
|
try {
|
||||||
|
const clone = response.clone();
|
||||||
|
const text = await clone.text();
|
||||||
|
window.__interceptedResponses.push({url: url, body: text});
|
||||||
|
} catch(e) {}
|
||||||
|
}
|
||||||
|
return response;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Intercept XHR (only if not already patched)
|
||||||
|
if (!window.__xhrPatched) {
|
||||||
|
window.__xhrPatched = true;
|
||||||
|
const originalXHR = window.XMLHttpRequest;
|
||||||
|
window.XMLHttpRequest = function() {
|
||||||
|
const xhr = new originalXHR();
|
||||||
|
const originalOpen = xhr.open;
|
||||||
|
let reqUrl = '';
|
||||||
|
xhr.open = function(method, url, ...rest) {
|
||||||
|
reqUrl = url;
|
||||||
|
return originalOpen.apply(this, [method, url, ...rest]);
|
||||||
|
};
|
||||||
|
xhr.addEventListener('load', function() {
|
||||||
|
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
|
||||||
|
try {
|
||||||
|
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
|
||||||
|
} catch(e) {}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return xhr;
|
||||||
|
};
|
||||||
|
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
|
||||||
|
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Sort by newest first
|
||||||
try:
|
try:
|
||||||
print("Page title:", driver.title)
|
sort_btn = driver.execute_script("""
|
||||||
print("Current URL:", driver.current_url[:100])
|
var btns = document.querySelectorAll('button[data-value="sort"]');
|
||||||
|
if (btns.length) return btns[0];
|
||||||
|
var all = document.querySelectorAll('button[aria-label*="Sort"]');
|
||||||
|
if (all.length) return all[0];
|
||||||
|
return null;
|
||||||
|
""")
|
||||||
|
if sort_btn:
|
||||||
|
sort_btn.click()
|
||||||
|
time.sleep(0.3)
|
||||||
|
driver.execute_script("""
|
||||||
|
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
|
||||||
|
for (var i = 0; i < items.length; i++) {
|
||||||
|
var txt = items[i].textContent.toLowerCase();
|
||||||
|
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
|
||||||
|
items[i].click();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""")
|
||||||
|
time.sleep(0.5)
|
||||||
|
print(" 📅 Sorted by newest")
|
||||||
|
# Re-find scroll container after sorting (DOM may be recreated)
|
||||||
|
new_container = find_scroll_container()
|
||||||
|
if new_container:
|
||||||
|
scroll_container = new_container
|
||||||
|
print(" 🔄 Refreshed scroll container reference")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
|
||||||
|
|
||||||
print("✅ Found scroll container")
|
# Expand "More" buttons for full text
|
||||||
|
try:
|
||||||
# PHASE 2: Inject API interceptor for scroll-loaded reviews
|
expanded = driver.execute_script("""
|
||||||
print("🔌 Injecting API interceptor...")
|
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
||||||
driver.execute_script("""
|
var count = 0;
|
||||||
if (window.__reviewInterceptorInjected) return;
|
for (var i = 0; i < buttons.length; i++) {
|
||||||
window.__reviewInterceptorInjected = true;
|
if (buttons[i].textContent.trim() === 'More') {
|
||||||
window.__interceptedResponses = [];
|
buttons[i].click();
|
||||||
|
count++;
|
||||||
// Intercept fetch
|
}
|
||||||
const originalFetch = window.fetch;
|
|
||||||
window.fetch = async function(...args) {
|
|
||||||
const url = args[0].toString();
|
|
||||||
const response = await originalFetch.apply(this, args);
|
|
||||||
if (url.includes('listugcposts') || url.includes('review')) {
|
|
||||||
try {
|
|
||||||
const clone = response.clone();
|
|
||||||
const text = await clone.text();
|
|
||||||
window.__interceptedResponses.push({url: url, body: text});
|
|
||||||
} catch(e) {}
|
|
||||||
}
|
|
||||||
return response;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Intercept XHR
|
|
||||||
const originalXHR = window.XMLHttpRequest;
|
|
||||||
window.XMLHttpRequest = function() {
|
|
||||||
const xhr = new originalXHR();
|
|
||||||
const originalOpen = xhr.open;
|
|
||||||
let reqUrl = '';
|
|
||||||
xhr.open = function(method, url, ...rest) {
|
|
||||||
reqUrl = url;
|
|
||||||
return originalOpen.apply(this, [method, url, ...rest]);
|
|
||||||
};
|
|
||||||
xhr.addEventListener('load', function() {
|
|
||||||
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
|
|
||||||
try {
|
|
||||||
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
|
|
||||||
} catch(e) {}
|
|
||||||
}
|
}
|
||||||
});
|
return count;
|
||||||
return xhr;
|
""")
|
||||||
};
|
if expanded > 0:
|
||||||
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
|
print(f" 📝 Expanded {expanded} truncated reviews")
|
||||||
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
|
except:
|
||||||
}
|
pass
|
||||||
""")
|
|
||||||
|
# Block images to speed up scrolling (use CDP)
|
||||||
|
try:
|
||||||
|
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||||
|
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
|
||||||
|
})
|
||||||
|
driver.execute_cdp_cmd('Network.enable', {})
|
||||||
|
if not is_refresh:
|
||||||
|
print(" 🚫 Blocking images for faster scrolling")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Setup scrollable pane reference
|
||||||
|
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
||||||
|
|
||||||
|
# Create scroll worker
|
||||||
|
stop_scrolling = threading.Event()
|
||||||
|
|
||||||
|
def scroll_worker():
|
||||||
|
while not stop_scrolling.is_set():
|
||||||
|
try:
|
||||||
|
driver.execute_script("""
|
||||||
|
var p = window.scrollablePane;
|
||||||
|
if (p) p.scrollTop = p.scrollHeight;
|
||||||
|
""")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
||||||
|
scroll_thread.start()
|
||||||
|
|
||||||
|
return scroll_container, stop_scrolling
|
||||||
|
|
||||||
|
# Initial page setup
|
||||||
|
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
|
||||||
|
if not scroll_container:
|
||||||
|
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
||||||
|
|
||||||
def get_api_reviews():
|
def get_api_reviews():
|
||||||
"""Get reviews from intercepted API responses."""
|
"""Get reviews from intercepted API responses."""
|
||||||
@@ -440,93 +553,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
pass
|
pass
|
||||||
return api_revs
|
return api_revs
|
||||||
|
|
||||||
# Sort by newest first (helps with loading)
|
|
||||||
try:
|
|
||||||
sort_btn = driver.execute_script("""
|
|
||||||
var btns = document.querySelectorAll('button[data-value="sort"]');
|
|
||||||
if (btns.length) return btns[0];
|
|
||||||
// Try aria-label
|
|
||||||
var all = document.querySelectorAll('button[aria-label*="Sort"]');
|
|
||||||
if (all.length) return all[0];
|
|
||||||
return null;
|
|
||||||
""")
|
|
||||||
if sort_btn:
|
|
||||||
sort_btn.click()
|
|
||||||
time.sleep(0.3)
|
|
||||||
# Click "Newest" option
|
|
||||||
driver.execute_script("""
|
|
||||||
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
|
|
||||||
for (var i = 0; i < items.length; i++) {
|
|
||||||
var txt = items[i].textContent.toLowerCase();
|
|
||||||
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
|
|
||||||
items[i].click();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
time.sleep(0.5)
|
|
||||||
print(" 📅 Sorted by newest")
|
|
||||||
# Re-find scroll container after sorting (DOM may be recreated)
|
|
||||||
new_container = find_scroll_container()
|
|
||||||
if new_container:
|
|
||||||
scroll_container = new_container
|
|
||||||
print(" 🔄 Refreshed scroll container reference")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
|
|
||||||
# This batch-clicks all "More" buttons at once (fast, no waiting per button)
|
|
||||||
try:
|
|
||||||
expanded = driver.execute_script("""
|
|
||||||
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
|
||||||
var count = 0;
|
|
||||||
for (var i = 0; i < buttons.length; i++) {
|
|
||||||
if (buttons[i].textContent.trim() === 'More') {
|
|
||||||
buttons[i].click();
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return count;
|
|
||||||
""")
|
|
||||||
if expanded > 0:
|
|
||||||
print(f" 📝 Expanded {expanded} truncated reviews")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Block images to speed up scrolling (use CDP)
|
|
||||||
try:
|
|
||||||
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
|
||||||
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
|
|
||||||
})
|
|
||||||
driver.execute_cdp_cmd('Network.enable', {})
|
|
||||||
print(" 🚫 Blocking images for faster scrolling")
|
|
||||||
except Exception as e:
|
|
||||||
pass # CDP might not be available in all setups
|
|
||||||
|
|
||||||
# Simple scroll - scrollTop = scrollHeight (proven to work)
|
|
||||||
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
|
||||||
stop_scrolling = threading.Event()
|
|
||||||
|
|
||||||
def scroll_worker():
|
|
||||||
while not stop_scrolling.is_set():
|
|
||||||
try:
|
|
||||||
driver.execute_script("""
|
|
||||||
var p = window.scrollablePane;
|
|
||||||
if (p) p.scrollTop = p.scrollHeight;
|
|
||||||
""")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
time.sleep(0.1)
|
|
||||||
|
|
||||||
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
|
||||||
scroll_thread.start()
|
|
||||||
|
|
||||||
# Recovery function - use real mouse actions when stuck
|
# Recovery function - use real mouse actions when stuck
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
from selenium.webdriver.common.keys import Keys
|
from selenium.webdriver.common.keys import Keys
|
||||||
recovery_count = [0]
|
recovery_count = [0]
|
||||||
|
|
||||||
def unstick_scroll():
|
def unstick_scroll():
|
||||||
|
nonlocal scroll_container
|
||||||
recovery_count[0] += 1
|
recovery_count[0] += 1
|
||||||
method = recovery_count[0] % 4
|
method = recovery_count[0] % 4
|
||||||
try:
|
try:
|
||||||
@@ -566,6 +599,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def do_hard_refresh():
|
||||||
|
"""Hard refresh the page and re-setup everything. Returns True on success."""
|
||||||
|
nonlocal scroll_container, stop_scrolling
|
||||||
|
hard_refresh_count[0] += 1
|
||||||
|
|
||||||
|
if hard_refresh_count[0] > max_hard_refreshes:
|
||||||
|
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Stop current scroll worker
|
||||||
|
stop_scrolling.set()
|
||||||
|
time.sleep(0.2)
|
||||||
|
|
||||||
|
# Re-setup page
|
||||||
|
new_container, new_stop = setup_reviews_page(is_refresh=True)
|
||||||
|
if new_container:
|
||||||
|
scroll_container = new_container
|
||||||
|
stop_scrolling = new_stop
|
||||||
|
recovery_count[0] = 0 # Reset recovery count after successful refresh
|
||||||
|
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f" ❌ Hard refresh failed to find scroll container")
|
||||||
|
return False
|
||||||
|
|
||||||
# Main collection loop
|
# Main collection loop
|
||||||
last_new_time = time.time()
|
last_new_time = time.time()
|
||||||
last_count = len(reviews)
|
last_count = len(reviews)
|
||||||
@@ -784,9 +842,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
# Progress update
|
# Progress update
|
||||||
elapsed = time.time() - last_new_time
|
elapsed = time.time() - last_new_time
|
||||||
if total_reviews:
|
if total_reviews[0]:
|
||||||
pct = (current_count / total_reviews) * 100
|
pct = (current_count / total_reviews[0]) * 100
|
||||||
print(f" 📊 {current_count}/{total_reviews} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
|
print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
|
||||||
else:
|
else:
|
||||||
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
||||||
|
|
||||||
@@ -797,7 +855,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
break
|
break
|
||||||
|
|
||||||
# Also stop if we have all reviews from the page
|
# Also stop if we have all reviews from the page
|
||||||
if total_reviews and current_count >= total_reviews:
|
if total_reviews[0] and current_count >= total_reviews[0]:
|
||||||
print(f"✅ All {current_count} reviews collected")
|
print(f"✅ All {current_count} reviews collected")
|
||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
@@ -805,8 +863,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
||||||
# Only if we haven't collected all reviews yet
|
# Only if we haven't collected all reviews yet
|
||||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
# After 8+ failed recovery attempts, try hard refresh
|
||||||
unstick_scroll()
|
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
|
||||||
|
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
|
||||||
|
if do_hard_refresh():
|
||||||
|
last_new_time = time.time() # Reset timer after refresh
|
||||||
|
continue # Skip to next iteration
|
||||||
|
else:
|
||||||
|
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||||
|
unstick_scroll()
|
||||||
|
|
||||||
# Check scroll state - track if content is still being added
|
# Check scroll state - track if content is still being added
|
||||||
try:
|
try:
|
||||||
@@ -831,13 +896,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
last_new_time = time.time()
|
last_new_time = time.time()
|
||||||
|
|
||||||
# Dynamic timeout based on state and recovery attempts
|
# Dynamic timeout based on state and recovery attempts
|
||||||
|
# - Try hard refresh before giving up if we still have refreshes left
|
||||||
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
|
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
|
||||||
# - 15s max otherwise (keep trying)
|
# - timeout_no_new seconds max otherwise (keep trying)
|
||||||
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
|
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
|
||||||
truly_done = at_bottom and not content_growing and recovery_failed
|
truly_done = at_bottom and not content_growing and recovery_failed
|
||||||
timeout_hit = elapsed >= 15
|
timeout_hit = elapsed >= timeout_no_new
|
||||||
|
|
||||||
if truly_done or timeout_hit:
|
if truly_done or timeout_hit:
|
||||||
|
# Last chance: try hard refresh before giving up
|
||||||
|
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
||||||
|
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
|
||||||
|
if do_hard_refresh():
|
||||||
|
last_new_time = time.time()
|
||||||
|
continue # Keep trying
|
||||||
print(f"✅ All reviews loaded: {current_count}")
|
print(f"✅ All reviews loaded: {current_count}")
|
||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|||||||
Reference in New Issue
Block a user