Add hard refresh recovery for stuck scraper

When the scraper gets stuck (8+ failed soft recovery attempts), it now does a hard page refresh and re-runs the full page setup:

- Reloads the page
- Re-clicks the reviews tab
- Re-sorts by newest
- Re-injects the API interceptor
- Continues collecting with the existing seen_ids for deduplication

Key changes:

- Extract page setup into a reusable setup_reviews_page() function
- Add do_hard_refresh(), which calls that setup again on refresh
- Trigger a hard refresh after 8 failed soft recoveries
- Try a hard refresh before the timeout gives up completely
- Allow a maximum of 3 hard refreshes before truly giving up
- Reset the recovery counter after a successful hard refresh

This ensures the scraper can recover from browser issues, DOM detachment, or other problems that soft recovery (scroll tricks) can't fix.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 13:42:54 +00:00
parent b55a7a0fb1
commit ff03a4a1b7
1 changed files with 297 additions and 225 deletions
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -259,10 +259,48 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    review_order = {}  # review_id -> position (DOM visual order for sorting)
    order_counter = [0]  # Current order position

-    # Don't force language - let Google show all reviews in user's locale
+    # Track total reviews (persists across refreshes)
+    total_reviews = [None]  # Use list for closure mutation

-    # Navigate to URL
+    # Hard refresh counter
+    hard_refresh_count = [0]
+    max_hard_refreshes = 3  # Max number of hard refreshes before giving up
+
+    # Find scrollable reviews container helper
+    def find_scroll_container():
+        selectors = [
+            "div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
+            "div.m6QErb.DxyBCb.kA9KIf",
+            "div.m6QErb.DxyBCb",
+            "div.m6QErb[aria-label]",
+            "div.DxyBCb.kA9KIf.dS8AEf",
+            "div[role='main'] div.m6QErb",
+        ]
+        for sel in selectors:
+            try:
+                els = driver.find_elements(By.CSS_SELECTOR, sel)
+                for el in els:
+                    if el.is_displayed() and el.size['height'] > 100:
+                        return el
+            except:
+                pass
+        return None
+
+    def setup_reviews_page(is_refresh=False):
+        """
+        Setup the reviews page for scraping.
+        Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
+        Can be called after initial load or after a hard refresh.
+        """
+        nonlocal total_reviews
+
+        refresh_label = " (after refresh)" if is_refresh else ""
+
+        # Navigate to URL (only on initial load or refresh)
+        if not is_refresh:
            print(f"🌐 Loading: {url[:80]}...")
+        else:
+            print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
        driver.get(url)

        # Handle consent popup if redirected (poll with tiny sleep)
@@ -288,14 +326,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            time.sleep(0.01)  # 10ms - responsive but low CPU

        # Extract total review count BEFORE clicking reviews tab (it's on Overview)
-    # ROBUST: Use aria-label="X reviews" on span[role="img"]
-    # Poll for up to 5s since page might still be loading after consent
-    total_reviews = None
+        # Only on first load (don't overwrite if we already have it)
+        if total_reviews[0] is None:
            start = time.time()
            while time.time() - start < 5:
                try:
-            total_reviews = driver.execute_script("""
-                // ROBUST: Find span[role="img"] with aria-label starting with number + "review"
+                    count = driver.execute_script("""
                        var reviewSpans = document.querySelectorAll('span[role="img"]');
                        for (var i = 0; i < reviewSpans.length; i++) {
                            var label = reviewSpans[i].getAttribute('aria-label') || '';
@@ -306,8 +342,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                        }
                        return null;
                    """)
-            if total_reviews:
-                print(f"📊 Total reviews on page: {total_reviews}")
+                    if count:
+                        total_reviews[0] = count
+                        print(f"📊 Total reviews on page: {count}")
                        break
                except:
                    pass
@@ -316,42 +353,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        # Click reviews tab - poll until found
        review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
        start = time.time()
+        tab_clicked = False
        while time.time() - start < 5:  # Max 5s for tabs
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
                for tab in tabs:
                    tab_text = tab.text.lower()
                    if any(kw in tab_text for kw in review_keywords):
+                        if not is_refresh:
                            print(f"  Clicking reviews tab: '{tab.text}'")
                        tab.click()
+                        tab_clicked = True
+                        break
+                if tab_clicked:
                    break
-            else:
                time.sleep(0.01)  # 10ms between polls
-                continue
-            break  # Found and clicked
            except:
                time.sleep(0.01)

-    # Find scrollable reviews container
-    def find_scroll_container():
-        selectors = [
-            "div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
-            "div.m6QErb.DxyBCb.kA9KIf",
-            "div.m6QErb.DxyBCb",
-            "div.m6QErb[aria-label]",
-            "div.DxyBCb.kA9KIf.dS8AEf",
-            "div[role='main'] div.m6QErb",
-        ]
-        for sel in selectors:
-            try:
-                els = driver.find_elements(By.CSS_SELECTOR, sel)
-                for el in els:
-                    if el.is_displayed() and el.size['height'] > 100:
-                        return el
-            except:
-                pass
-        return None
-
        # Poll for scroll container (10ms intervals - fast but low CPU)
        scroll_container = None
        start = time.time()
@@ -362,30 +381,32 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                break
            elapsed = int(time.time() - start)
            if elapsed > last_print:
-            print(f"  Waiting for reviews panel... ({elapsed}s)")
+                print(f"  Waiting for reviews panel...{refresh_label} ({elapsed}s)")
                last_print = elapsed
            time.sleep(0.01)  # 10ms - responsive but low CPU

        if not scroll_container:
-        print("❌ Could not find reviews scroll container")
-        # Debug: print page source snippet
+            print(f"❌ Could not find reviews scroll container{refresh_label}")
            try:
                print("Page title:", driver.title)
                print("Current URL:", driver.current_url[:100])
            except:
                pass
-        return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
+            return None, None

-    print("✅ Found scroll container")
+        print(f"✅ Found scroll container{refresh_label}")

-    # PHASE 2: Inject API interceptor for scroll-loaded reviews
+        # Inject API interceptor (needs to be re-injected after refresh)
+        if not is_refresh:
            print("🔌 Injecting API interceptor...")
        driver.execute_script("""
-        if (window.__reviewInterceptorInjected) return;
+            // Always re-setup on refresh
            window.__reviewInterceptorInjected = true;
-        window.__interceptedResponses = [];
+            window.__interceptedResponses = window.__interceptedResponses || [];

-        // Intercept fetch
+            // Intercept fetch (only if not already patched)
+            if (!window.__fetchPatched) {
+                window.__fetchPatched = true;
                const originalFetch = window.fetch;
                window.fetch = async function(...args) {
                    const url = args[0].toString();
@@ -399,8 +420,11 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    }
                    return response;
                };
+            }

-        // Intercept XHR
+            // Intercept XHR (only if not already patched)
+            if (!window.__xhrPatched) {
+                window.__xhrPatched = true;
                const originalXHR = window.XMLHttpRequest;
                window.XMLHttpRequest = function() {
                    const xhr = new originalXHR();
@@ -422,30 +446,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                for (let prop of Object.getOwnPropertyNames(originalXHR)) {
                    try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
                }
+            }
        """)

-    def get_api_reviews():
-        """Get reviews from intercepted API responses."""
-        api_revs = []
-        try:
-            responses = driver.execute_script("""
-                var r = window.__interceptedResponses || [];
-                window.__interceptedResponses = [];
-                return r;
-            """)
-            for resp in (responses or []):
-                body = resp.get("body", "")
-                api_revs.extend(extract_reviews_from_api_body(body))
-        except:
-            pass
-        return api_revs
-
-    # Sort by newest first (helps with loading)
+        # Sort by newest first
        try:
            sort_btn = driver.execute_script("""
                var btns = document.querySelectorAll('button[data-value="sort"]');
                if (btns.length) return btns[0];
-            // Try aria-label
                var all = document.querySelectorAll('button[aria-label*="Sort"]');
                if (all.length) return all[0];
                return null;
@@ -453,7 +461,6 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            if sort_btn:
                sort_btn.click()
                time.sleep(0.3)
-            # Click "Newest" option
                driver.execute_script("""
                    var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
                    for (var i = 0; i < items.length; i++) {
@@ -474,8 +481,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        except:
            pass

-    # EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
-    # This batch-clicks all "More" buttons at once (fast, no waiting per button)
+        # Expand "More" buttons for full text
        try:
            expanded = driver.execute_script("""
                var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
@@ -499,12 +505,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
            })
            driver.execute_cdp_cmd('Network.enable', {})
+            if not is_refresh:
                print("  🚫 Blocking images for faster scrolling")
-    except Exception as e:
-        pass  # CDP might not be available in all setups
+        except:
+            pass

-    # Simple scroll - scrollTop = scrollHeight (proven to work)
+        # Setup scrollable pane reference
        driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
+
+        # Create scroll worker
        stop_scrolling = threading.Event()

        def scroll_worker():
@@ -521,12 +530,36 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
        scroll_thread.start()

+        return scroll_container, stop_scrolling
+
+    # Initial page setup
+    scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
+    if not scroll_container:
+        return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
+
+    def get_api_reviews():
+        """Get reviews from intercepted API responses."""
+        api_revs = []
+        try:
+            responses = driver.execute_script("""
+                var r = window.__interceptedResponses || [];
+                window.__interceptedResponses = [];
+                return r;
+            """)
+            for resp in (responses or []):
+                body = resp.get("body", "")
+                api_revs.extend(extract_reviews_from_api_body(body))
+        except:
+            pass
+        return api_revs
+
    # Recovery function - use real mouse actions when stuck
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.keys import Keys
    recovery_count = [0]

    def unstick_scroll():
+        nonlocal scroll_container
        recovery_count[0] += 1
        method = recovery_count[0] % 4
        try:
@@ -566,6 +599,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        except:
            pass

+    def do_hard_refresh():
+        """Hard refresh the page and re-setup everything. Returns True on success."""
+        nonlocal scroll_container, stop_scrolling
+        hard_refresh_count[0] += 1
+
+        if hard_refresh_count[0] > max_hard_refreshes:
+            print(f"  ⚠️  Max hard refreshes ({max_hard_refreshes}) reached, giving up")
+            return False
+
+        # Stop current scroll worker
+        stop_scrolling.set()
+        time.sleep(0.2)
+
+        # Re-setup page
+        new_container, new_stop = setup_reviews_page(is_refresh=True)
+        if new_container:
+            scroll_container = new_container
+            stop_scrolling = new_stop
+            recovery_count[0] = 0  # Reset recovery count after successful refresh
+            print(f"  ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
+            return True
+        else:
+            print(f"  ❌ Hard refresh failed to find scroll container")
+            return False
+
    # Main collection loop
    last_new_time = time.time()
    last_count = len(reviews)
@@ -784,9 +842,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in

        # Progress update
        elapsed = time.time() - last_new_time
-        if total_reviews:
-            pct = (current_count / total_reviews) * 100
-            print(f"  📊 {current_count}/{total_reviews} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
+        if total_reviews[0]:
+            pct = (current_count / total_reviews[0]) * 100
+            print(f"  📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
        else:
            print(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)

@@ -797,7 +855,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            break

        # Also stop if we have all reviews from the page
-        if total_reviews and current_count >= total_reviews:
+        if total_reviews[0] and current_count >= total_reviews[0]:
            print(f"✅ All {current_count} reviews collected")
            stop_scrolling.set()
            break
@@ -805,6 +863,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        # STUCK DETECTION: If no new reviews for 3s+, try to unstick
        # Only if we haven't collected all reviews yet
        if elapsed >= 3 and int(elapsed) % 3 == 0:
+            # After 8+ failed recovery attempts, try hard refresh
+            if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
+                print(f"  🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
+                if do_hard_refresh():
+                    last_new_time = time.time()  # Reset timer after refresh
+                    continue  # Skip to next iteration
+            else:
                print(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
                unstick_scroll()

@@ -831,13 +896,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            last_new_time = time.time()

        # Dynamic timeout based on state and recovery attempts
+        # - Try hard refresh before giving up if we still have refreshes left
        # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
        # - 15s max otherwise (keep trying)
        recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
        truly_done = at_bottom and not content_growing and recovery_failed
-        timeout_hit = elapsed >= 15
+        timeout_hit = elapsed >= timeout_no_new

        if truly_done or timeout_hit:
+            # Last chance: try hard refresh before giving up
+            if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
+                print(f"  🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
+                if do_hard_refresh():
+                    last_new_time = time.time()
+                    continue  # Keep trying
            print(f"✅ All reviews loaded: {current_count}")
            stop_scrolling.set()
            break