Fix DOM cleanup: hide cards from API interception too

The continue statement was skipping the card.style.display='none' and card.innerHTML='' cleanup for cards already seen via API interception, so those cards were never hidden and the DOM grew without bound during long scrapes. Now ALL processed cards are hidden regardless of data source. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 17:23:51 +00:00
parent 01ea18d91d
commit 80e7771c00
1 changed files with 125 additions and 46 deletions
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -8,9 +8,41 @@ import re
 import json
 import time
 import threading
+from datetime import datetime
 from selenium.webdriver.common.by import By


+class LogCapture:  # in-memory log sink; every message is also echoed to stdout
+    """Captures scraper logs for storage and viewing."""
+
+    def __init__(self):
+        self.logs = []  # entry dicts (timestamp, level, source, message) in logging order
+
+    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
+        """Add a log entry with timestamp."""
+        entry = {
+            "timestamp": datetime.utcnow().isoformat() + "Z",  # NOTE(review): utcnow() is naive and deprecated since Python 3.12
+            "level": level,
+            "source": source,
+            "message": message
+        }
+        self.logs.append(entry)
+        # Also print for console visibility
+        print(message, flush=True)  # only the bare message is printed, without level or timestamp
+
+    def info(self, message: str, source: str = "scraper"):  # shorthand: log() at level INFO
+        self.log(message, "INFO", source)
+
+    def warning(self, message: str, source: str = "scraper"):  # shorthand: log() at level WARNING
+        self.log(message, "WARNING", source)
+
+    def error(self, message: str, source: str = "scraper"):  # shorthand: log() at level ERROR
+        self.log(message, "ERROR", source)
+
+    def get_logs(self):
+        return self.logs  # the live list itself, not a copy; later log() calls remain visible to the caller
+
+
 def parse_api_review(raw: list) -> dict:
    """Parse a review from API response array."""
    try:
@@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict:


 def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
-                   flush_callback=None, flush_batch_size: int = 500) -> dict:
+                   flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
+                   progress_callback=None) -> dict:
    """
    Scrape Google Maps reviews.

@@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
                       This allows streaming data to disk and freeing memory
        flush_batch_size: Number of reviews to collect before flushing (default 500)
+        log_capture: Optional LogCapture instance for storing logs
+        progress_callback: Optional callback(current_count, total_count) called every iteration

    Returns:
        dict with reviews list and metadata
    """
+    # Use provided log_capture, or fall back to a fresh LogCapture (stores entries and prints them)
+    log = log_capture or LogCapture()

    # Storage - use review ID as key
    reviews = {}  # review_id -> review
@@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in

        # Navigate to URL (only on initial load or refresh)
        if not is_refresh:
-            print(f"🌐 Loading: {url[:80]}...")
+            log.info(f"🌐 Loading: {url[:80]}...")
        else:
-            print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
+            log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
        driver.get(url)

        # Handle consent popup if redirected (poll with tiny sleep)
        start = time.time()
        while time.time() - start < 5:  # Max 5s for consent
            if "consent.google" in driver.current_url:
-                print("  Handling consent popup...")
+                log.info("  Handling consent popup...")
                try:
                    for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                        txt = btn.text.lower()
                        if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
                            btn.click()
                            # Reload original URL after consent
-                            print("  Reloading after consent...")
+                            log.info("  Reloading after consent...")
                            driver.get(url)
                            break
                except:
@@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    """)
                    if count:
                        total_reviews[0] = count
-                        print(f"📊 Total reviews on page: {count}")
+                        log.info(f"📊 Total reviews on page: {count}")
                        break
                except:
                    pass
@@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    tab_text = tab.text.lower()
                    if any(kw in tab_text for kw in review_keywords):
                        if not is_refresh:
-                            print(f"  Clicking reviews tab: '{tab.text}'")
+                            log.info(f"  Clicking reviews tab: '{tab.text}'")
                        tab.click()
                        tab_clicked = True
                        break
@@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                break
            elapsed = int(time.time() - start)
            if elapsed > last_print:
-                print(f"  Waiting for reviews panel...{refresh_label} ({elapsed}s)")
+                log.info(f"  Waiting for reviews panel...{refresh_label} ({elapsed}s)")
                last_print = elapsed
            time.sleep(0.01)  # 10ms - responsive but low CPU

        if not scroll_container:
-            print(f"❌ Could not find reviews scroll container{refresh_label}")
+            log.error(f"❌ Could not find reviews scroll container{refresh_label}")
            try:
-                print("Page title:", driver.title)
-                print("Current URL:", driver.current_url[:100])
+                log.error(f"Page title: {driver.title}")
+                log.error(f"Current URL: {driver.current_url[:100]}")
            except:
                pass
            return None, None

-        print(f"✅ Found scroll container{refresh_label}")
+        log.info(f"✅ Found scroll container{refresh_label}")

        # Inject API interceptor (needs to be re-injected after refresh)
        if not is_refresh:
-            print("🔌 Injecting API interceptor...")
+            log.info("🔌 Injecting API interceptor...")
        driver.execute_script("""
            // Always re-setup on refresh
            window.__reviewInterceptorInjected = true;
@@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    }
                """)
                time.sleep(0.5)
-                print("  📅 Sorted by newest")
+                log.info("  📅 Sorted by newest")
                # Re-find scroll container after sorting (DOM may be recreated)
                new_container = find_scroll_container()
                if new_container:
                    scroll_container = new_container
-                    print("  🔄 Refreshed scroll container reference")
+                    log.info("  🔄 Refreshed scroll container reference")
        except:
            pass

@@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                return count;
            """)
            if expanded > 0:
-                print(f"  📝 Expanded {expanded} truncated reviews")
+                log.info(f"  📝 Expanded {expanded} truncated reviews")
        except:
            pass

@@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            })
            driver.execute_cdp_cmd('Network.enable', {})
            if not is_refresh:
-                print("  🚫 Blocking images for faster scrolling")
+                log.info("  🚫 Blocking images for faster scrolling")
        except:
            pass

@@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        hard_refresh_count[0] += 1

        if hard_refresh_count[0] > max_hard_refreshes:
-            print(f"  ⚠️  Max hard refreshes ({max_hard_refreshes}) reached, giving up")
+            log.warning(f"  ⚠️  Max hard refreshes ({max_hard_refreshes}) reached, giving up")
            return False

        # Stop current scroll worker
@@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            scroll_container = new_container
            stop_scrolling = new_stop
            recovery_count[0] = 0  # Reset recovery count after successful refresh
-            print(f"  ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
+            log.info(f"  ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
            return True
        else:
-            print(f"  ❌ Hard refresh failed to find scroll container")
+            log.error(f"  ❌ Hard refresh failed to find scroll container")
            return False

    # Main collection loop
@@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    last_count = len(reviews)
    check_num = 0

-    print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
+    log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)")

    cycle_start = time.time()
    while True:
@@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    processedIds.add(rid);

                    // Already seen from API - just track order, skip content
+                    // BUT still hide the card to keep DOM light!
                    if (seenSet.has(rid)) {
                        results.push({id: rid, orderOnly: true});
+                        // Hide this card since we already have its data from API
+                        card.style.display = 'none';
+                        card.innerHTML = '';
                        continue;
                    }

@@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                            timestamp: timestamp,
                            source: 'dom'
                        });
-                        // Hide processed card (separators removed on next cycle)
-                        card.style.display = 'none';
-                        card.innerHTML = '';
                    }
+
+                    // ALWAYS hide processed cards to keep DOM light
+                    // (even if extraction failed - we've seen this card)
+                    card.style.display = 'none';
+                    card.innerHTML = '';
                }
                return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
            """, seen_list)
@@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    reviews[rid] = rev
                    seen_ids.add(rid)
        except Exception as e:
-            print(f"  ❌ DOM parse error: {e}")
+            log.error(f"  ❌ DOM parse error: {e}")
        dom_time = time.time() - t2

        # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
        # Sort by DOM order before flushing
        t3 = time.time()
        if flush_callback and len(reviews) >= flush_batch_size:
-            print(f"  💾 Flushing {len(reviews)} reviews to disk...")
+            log.info(f"  💾 Flushing {len(reviews)} reviews to disk...")
            sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
            flush_callback([r for _, r in sorted_reviews])
            total_flushed[0] += len(reviews)
@@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in

        # TIMING: Print if cycle is slow (>2s)
        if cycle_delta > 2.0:
-            print(f"  ⚠️  SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
+            log.warning(f"  ⚠️  SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")

        # Check for new reviews
        if current_count > last_count:
@@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        elapsed = time.time() - last_new_time
        if total_reviews[0]:
            pct = (current_count / total_reviews[0]) * 100
-            print(f"  📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
+            log.info(f"  📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s")
        else:
-            print(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
+            log.info(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s")
+
+        # Call progress callback on every iteration (for real-time log updates)
+        if progress_callback:
+            progress_callback(current_count, total_reviews[0])

        # Stop conditions - check BEFORE recovery attempts
        if current_count >= max_reviews:
-            print(f"✅ Reached max: {current_count}")
+            log.info(f"✅ Reached max: {current_count}")
            stop_scrolling.set()
            break

        # Also stop if we have all reviews from the page
        if total_reviews[0] and current_count >= total_reviews[0]:
-            print(f"✅ All {current_count} reviews collected")
+            log.info(f"✅ All {current_count} reviews collected")
            stop_scrolling.set()
            break

@@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        if elapsed >= 3 and int(elapsed) % 3 == 0:
            # After 8+ failed recovery attempts, try hard refresh
            if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
-                print(f"  🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
+                log.info(f"  🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...")
                if do_hard_refresh():
                    last_new_time = time.time()  # Reset timer after refresh
                    continue  # Skip to next iteration
            else:
-                print(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
+                log.info(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...")
                unstick_scroll()

        # Check scroll state - track if content is still being added
@@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        if truly_done or timeout_hit:
            # Last chance: try hard refresh before giving up
            if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
-                print(f"  🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
+                log.info(f"  🔄 Timeout reached, trying hard refresh before giving up...")
                if do_hard_refresh():
                    last_new_time = time.time()
                    continue  # Keep trying
-            print(f"✅ All reviews loaded: {current_count}")
+            log.info(f"✅ All reviews loaded: {current_count}")
            stop_scrolling.set()
            break

    # Flush any remaining reviews (sorted by DOM order)
    if flush_callback and reviews:
-        print(f"  💾 Final flush: {len(reviews)} reviews...")
+        log.info(f"  💾 Final flush: {len(reviews)} reviews...")
        sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
        flush_callback([r for _, r in sorted_reviews])
        total_flushed[0] += len(reviews)
        reviews.clear()

    # Reviews already parsed during scrolling (real-time parsing)
-    print("📝 Finalizing review data...")
+    log.info("📝 Finalizing review data...")

    # Final results (sorted by DOM order)
    sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
@@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    api_count = sum(1 for r in review_list if r.get("source") == "api")

    if total_flushed[0] > 0:
-        print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
+        log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
    else:
-        print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
+        log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")

    return {
        "reviews": review_list,  # Only unflushed reviews (flushed already sent to callback)
        "total": grand_total,
        "total_flushed": total_flushed[0],
        "checks": check_num,
-        "url": url
+        "url": url,
+        "logs": log.get_logs()
    }


 def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
-                        progress_callback=None, driver=None, return_driver: bool = False):
+                        progress_callback=None, driver=None, return_driver: bool = False,
+                        log_capture: LogCapture = None):
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.
@@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        progress_callback: Optional callback(current_count, total_count) for progress
        driver: Existing driver instance to reuse
        return_driver: If True, return driver in result
+        log_capture: Optional LogCapture instance for real-time log access

    Returns:
-        Dictionary with: reviews, count, total_reviews, time, success, error, driver
+        Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
    """
    from seleniumbase import Driver

@@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
    driver_provided = driver is not None
    should_close_driver = not return_driver and not driver_provided

+    # Use provided log_capture or create new one
+    log_capture = log_capture or LogCapture()
+
    try:
        # Create driver if not provided
        if not driver:
@@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            )
            driver.set_window_size(1200, 900)  # Proper viewport for Google Maps

+        # Set Chrome geolocation to US (Boston, MA) using CDP
+        # This ensures Google Maps shows US results regardless of server location
+        try:
+            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+                'latitude': 42.3601,
+                'longitude': -71.0589,
+                'accuracy': 100
+            })
+            log_capture.info("Set geolocation to US (Boston, MA)")
+        except Exception as e:
+            log_capture.warning(f"Could not set geolocation: {e}")
+
+        # Add URL parameters for consistent results
+        if 'hl=' not in url:
+            separator = '&' if '?' in url else '?'
+            url = f"{url}{separator}hl=en"
+        if 'gl=' not in url:
+            url = f"{url}&gl=us"
+
        # Create progress wrapper if callback provided
        flush_callback = None
        if progress_callback:
@@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
                progress_callback(collected[0], None)
            flush_callback = flush_with_progress

-        # Run the scraper
+        # Run the scraper with progress callback for real-time updates
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
            flush_callback=flush_callback,
-            flush_batch_size=100  # Smaller batches for more frequent progress
+            flush_batch_size=100,  # Smaller batches for more frequent progress
+            log_capture=log_capture,
+            progress_callback=progress_callback  # Pass through for real-time log updates
        )

        elapsed = time.time() - start_time
@@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "total_reviews": result.get("total", 0),
            "time": elapsed,
            "success": True,
-            "error": None
+            "error": None,
+            "logs": result.get("logs", [])
        }

        if return_driver:
@@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            except:
                pass

+        # Log error to the existing log_capture
+        log_capture.error(f"Scraper failed: {str(e)}")
+
        return {
            "reviews": [],
            "count": 0,
@@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "time": elapsed,
            "success": False,
            "error": str(e),
-            "driver": driver if return_driver else None
+            "driver": driver if return_driver else None,
+            "logs": log_capture.get_logs()
        }