Real-time parsing + image blocking for large datasets

Key improvements:
- Parse reviews immediately during scroll (not at end)
- Fix virtual-scroll issue that was losing reviews after ~1000
- Block images via CDP for faster loading
- Smart recovery: 4 methods (keys, wheel, scroll up/down, click card)
- Dynamic timeout based on scroll state and content growth
- Spinner + network activity detection resets idle timer
- Sort by newest first option

Results: 1930 reviews (was 990) on 2433-review location

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-21 22:25:26 +00:00
parent 6a75159ebe
commit 6934838a69
1 changed files with 162 additions and 38 deletions
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -200,9 +200,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    # Storage - use review ID as key
    reviews = {}  # review_id -> review
-    # Force English language
+    # Default to English (hl=en) only when the URL does not already specify a language
    if "hl=" not in url:
        url = url + ("&" if "?" in url else "?") + "hl=en"
    # Navigate to URL
    print(f"🌐 Loading: {url[:80]}...")
@@ -380,10 +378,47 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            pass
        return api_revs
-    # Store pane in window for scroll thread
+    # Sort by newest first (helps with loading)
-    driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
+    try:
        sort_btn = driver.execute_script("""
            var btns = document.querySelectorAll('button[data-value="sort"]');
            if (btns.length) return btns[0];
            // Try aria-label
            var all = document.querySelectorAll('button[aria-label*="Sort"]');
            if (all.length) return all[0];
            return null;
        """)
        if sort_btn:
            sort_btn.click()
            time.sleep(0.3)
            # Click "Newest" option
            driver.execute_script("""
                var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
                for (var i = 0; i < items.length; i++) {
                    var txt = items[i].textContent.toLowerCase();
                    if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
                        items[i].click();
                        break;
                    }
                }
            """)
            time.sleep(0.5)
            print("  📅 Sorted by newest")
    except:
        pass
-    # Background scroll thread (fast, continuous)
+    # Block images to speed up scrolling (use CDP)
    try:
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
            'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
        })
        driver.execute_cdp_cmd('Network.enable', {})
        print("  🚫 Blocking images for faster scrolling")
    except Exception as e:
        pass  # CDP might not be available in all setups
    # Simple scroll - scrollTop = scrollHeight (proven to work)
    driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
    stop_scrolling = threading.Event()
    def scroll_worker():
@@ -395,11 +430,57 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                """)
            except:
                pass
-            time.sleep(0.1)  # 10x per second
+            time.sleep(0.1)
    scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
    scroll_thread.start()
    # Recovery function - use real mouse actions when stuck
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.keys import Keys
    recovery_count = [0]
    def unstick_scroll():
        """Try one recovery strategy to get a stalled review pane moving again.

        Every call increments ``recovery_count[0]`` and selects a strategy
        from ``recovery_count[0] % 4``, so successive calls rotate through:

        * ``1`` - click the pane and send two Page Down key presses
        * ``2`` - hover the pane and issue a real mouse-wheel scroll
        * ``3`` - jump up 2000px, pause, then jump back to the bottom
        * ``0`` - scroll the last review card into view, click it, then
          jump to the bottom

        The call is best-effort: a failure of the chosen strategy is
        reported and swallowed so the collection loop keeps running.
        """
        recovery_count[0] += 1
        method = recovery_count[0] % 4
        try:
            if method == 1:
                # Method 1: Click pane and send Page Down keys
                scroll_container.click()
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            elif method == 2:
                # Method 2: Real mouse wheel scroll
                ActionChains(driver).move_to_element(scroll_container)\
                    .scroll_by_amount(0, 800).perform()
            elif method == 3:
                # Method 3: Scroll up significantly then back down (force reload)
                driver.execute_script("""
                    var p = window.scrollablePane;
                    if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000);
                """)
                time.sleep(0.3)
                driver.execute_script("""
                    var p = window.scrollablePane;
                    if (p) p.scrollTop = p.scrollHeight;
                """)
            else:
                # Method 4 (method == 0, i.e. every 4th attempt):
                # click last review card to focus, then scroll
                driver.execute_script("""
                    var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
                    if (cards.length > 0) {
                        cards[cards.length - 1].scrollIntoView({block: 'end'});
                        cards[cards.length - 1].click();
                    }
                """)
                time.sleep(0.2)
                driver.execute_script("""
                    var p = window.scrollablePane;
                    if (p) p.scrollTop = p.scrollHeight;
                """)
        except Exception as exc:
            # Recovery is opportunistic; keep going, but do not trap
            # KeyboardInterrupt/SystemExit and do not hide the failure entirely.
            print(f"  ⚠️ Recovery step failed: {type(exc).__name__}", flush=True)
    # Main collection loop
    last_new_time = time.time()
    last_count = len(reviews)
@@ -417,19 +498,17 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                key = f"api_{rev['author'][:20]}_{rev['rating']}"
                reviews[key] = rev
-        # Collect review IDs via JavaScript (doesn't affect scroll position!)
+        # Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!)
-        # Use specific selector to only get actual review cards, not buttons
+        # We must parse NOW, not later
        try:
-            review_ids = driver.execute_script("""
+            cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
-                var ids = [];
+            for card in cards:
-                document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) {
+                rid = card.get_attribute("data-review-id")
                    ids.push(el.getAttribute('data-review-id'));
                });
                return ids;
            """)
            for rid in (review_ids or []):
                if rid and rid not in reviews:
-                    reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True}
+                    # Parse immediately - element may be gone later!
                    review = parse_dom_review(card)
                    if review:
                        reviews[rid] = review
        except:
            pass
@@ -440,6 +519,30 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            last_new_time = time.time()
            last_count = current_count
        # Check if loading (spinner visible OR network activity)
        try:
            loading_status = driver.execute_script("""
                var status = {spinner: false, network: false};
                // Check for Google's loading indicators
                var spinner = document.querySelector('div[role="progressbar"]');
                if (spinner && spinner.offsetParent !== null) status.spinner = true;
                var loading = document.querySelector('.qjESne, .loading');
                if (loading && loading.offsetParent !== null) status.spinner = true;
                // Check for recent network activity (API interceptor)
                var responses = window.__interceptedResponses || [];
                var lastCount = window.__lastResponseCount || 0;
                if (responses.length > lastCount) {
                    status.network = true;
                    window.__lastResponseCount = responses.length;
                }
                return status;
            """)
            is_loading = loading_status.get('spinner') or loading_status.get('network')
            if is_loading:
                last_new_time = time.time()  # Reset timer while loading
        except:
            is_loading = False
        # Progress update
        elapsed = time.time() - last_new_time
        if total_reviews:
@@ -448,37 +551,58 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        else:
            print(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
        # STUCK DETECTION: If no new reviews for 3s+, try to unstick
        # Trigger at 3s, 6s, 9s... (every 3 seconds while stuck)
        if elapsed >= 3 and int(elapsed) % 3 == 0:
            print(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
            unstick_scroll()
        # Stop conditions
        if current_count >= max_reviews:
            print(f"✅ Reached max: {current_count}")
            stop_scrolling.set()
            break
-        if total_reviews and current_count >= total_reviews:
+        # Check scroll state - track if content is still being added
-            print(f"✅ Got all {total_reviews} reviews!")
+        try:
            scroll_state = driver.execute_script("""
                var p = window.scrollablePane;
                if (!p) return {atBottom: true, height: 0};
                var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50);
                var height = p.scrollHeight;
                var lastHeight = window.__lastScrollHeight || 0;
                var growing = height > lastHeight;
                window.__lastScrollHeight = height;
                return {atBottom: atBottom, height: height, growing: growing};
            """)
            at_bottom = scroll_state.get('atBottom', True)
            content_growing = scroll_state.get('growing', False)
        except:
            at_bottom = True
            content_growing = False
        # Reset timer if content is growing (new reviews loading)
        if content_growing:
            last_new_time = time.time()
        # Dynamic timeout based on state and recovery attempts
        # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
        # - 15s max otherwise (keep trying)
        recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
        truly_done = at_bottom and not content_growing and recovery_failed
        timeout_hit = elapsed >= 15
        if truly_done or timeout_hit:
            print(f"✅ All reviews loaded: {current_count}")
            stop_scrolling.set()
            break
-        if time.time() - last_new_time >= timeout_no_new:
+    # Reviews already parsed during scrolling (real-time parsing)
-            print(f"⏱️ Timeout: no new reviews for {timeout_no_new}s")
+    print("📝 Finalizing review data...")
            stop_scrolling.set()
            break
-    # FINAL PHASE: Parse full review data from DOM (scroll is stopped)
+    # Separate API and DOM reviews
    print("📝 Parsing full review data...")
    api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
-    reviews.clear()
+    dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"}
    # Parse all DOM cards now that scrolling is done
    # Use specific selector to only get actual review cards (div.jftiEf), not buttons
    try:
        cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
        for card in cards:
            review = parse_dom_review(card)
            if review and review.get("id"):
                reviews[review["id"]] = review
    except Exception as e:
        print(f"  Warning: DOM parse error: {e}")
    # Merge API reviews (only add if not already in DOM)
    api_added = 0