Fix DOM cleanup: hide cards from API interception too

The `continue` statement was skipping the `card.style.display='none'` and `card.innerHTML=''` cleanup for cards already seen via API interception. This caused the DOM to grow without bound during long scrapes. Now ALL processed cards are hidden regardless of data source.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 17:23:51 +00:00
parent 01ea18d91d
commit 80e7771c00
1 changed files with 125 additions and 46 deletions
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -8,9 +8,41 @@ import re
 import json
 import time
 import threading
 from datetime import datetime
 from selenium.webdriver.common.by import By
 class LogCapture:
    """Captures scraper logs for storage and viewing."""
    def __init__(self):
        self.logs = []
    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
        """Add a log entry with timestamp."""
        entry = {
            "timestamp": datetime.utcnow().isoformat() + "Z",
            "level": level,
            "source": source,
            "message": message
        }
        self.logs.append(entry)
        # Also print for console visibility
        print(message, flush=True)
    def info(self, message: str, source: str = "scraper"):
        self.log(message, "INFO", source)
    def warning(self, message: str, source: str = "scraper"):
        self.log(message, "WARNING", source)
    def error(self, message: str, source: str = "scraper"):
        self.log(message, "ERROR", source)
    def get_logs(self):
        return self.logs
 def parse_api_review(raw: list) -> dict:
    """Parse a review from API response array."""
    try:
@@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict:
 def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
-                   flush_callback=None, flush_batch_size: int = 500) -> dict:
+                   flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
                   progress_callback=None) -> dict:
    """
    Scrape Google Maps reviews.
@@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
                       This allows streaming data to disk and freeing memory
        flush_batch_size: Number of reviews to collect before flushing (default 500)
        log_capture: Optional LogCapture instance for storing logs
        progress_callback: Optional callback(current_count, total_count) called every iteration
    Returns:
        dict with reviews list and metadata
    """
    # Use provided log_capture or create a dummy that just prints
    log = log_capture or LogCapture()
    # Storage - use review ID as key
    reviews = {}  # review_id -> review
@@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        # Navigate to URL (only on initial load or refresh)
        if not is_refresh:
-            print(f"🌐 Loading: {url[:80]}...")
+            log.info(f"🌐 Loading: {url[:80]}...")
        else:
-            print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
+            log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
        driver.get(url)
        # Handle consent popup if redirected (poll with tiny sleep)
        start = time.time()
        while time.time() - start < 5:  # Max 5s for consent
            if "consent.google" in driver.current_url:
-                print("  Handling consent popup...")
+                log.info("  Handling consent popup...")
                try:
                    for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                        txt = btn.text.lower()
                        if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
                            btn.click()
                            # Reload original URL after consent
-                            print("  Reloading after consent...")
+                            log.info("  Reloading after consent...")
                            driver.get(url)
                            break
                except:
@@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    """)
                    if count:
                        total_reviews[0] = count
-                        print(f"📊 Total reviews on page: {count}")
+                        log.info(f"📊 Total reviews on page: {count}")
                        break
                except:
                    pass
@@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    tab_text = tab.text.lower()
                    if any(kw in tab_text for kw in review_keywords):
                        if not is_refresh:
-                            print(f"  Clicking reviews tab: '{tab.text}'")
+                            log.info(f"  Clicking reviews tab: '{tab.text}'")
                        tab.click()
                        tab_clicked = True
                        break
@@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                break
            elapsed = int(time.time() - start)
            if elapsed > last_print:
-                print(f"  Waiting for reviews panel...{refresh_label} ({elapsed}s)")
+                log.info(f"  Waiting for reviews panel...{refresh_label} ({elapsed}s)")
                last_print = elapsed
            time.sleep(0.01)  # 10ms - responsive but low CPU
        if not scroll_container:
-            print(f"❌ Could not find reviews scroll container{refresh_label}")
+            log.error(f"❌ Could not find reviews scroll container{refresh_label}")
            try:
-                print("Page title:", driver.title)
+                log.error(f"Page title: {driver.title}")
-                print("Current URL:", driver.current_url[:100])
+                log.error(f"Current URL: {driver.current_url[:100]}")
            except:
                pass
            return None, None
-        print(f"✅ Found scroll container{refresh_label}")
+        log.info(f"✅ Found scroll container{refresh_label}")
        # Inject API interceptor (needs to be re-injected after refresh)
        if not is_refresh:
-            print("🔌 Injecting API interceptor...")
+            log.info("🔌 Injecting API interceptor...")
        driver.execute_script("""
            // Always re-setup on refresh
            window.__reviewInterceptorInjected = true;
@@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    }
                """)
                time.sleep(0.5)
-                print("  📅 Sorted by newest")
+                log.info("  📅 Sorted by newest")
                # Re-find scroll container after sorting (DOM may be recreated)
                new_container = find_scroll_container()
                if new_container:
                    scroll_container = new_container
-                    print("  🔄 Refreshed scroll container reference")
+                    log.info("  🔄 Refreshed scroll container reference")
        except:
            pass
@@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                return count;
            """)
            if expanded > 0:
-                print(f"  📝 Expanded {expanded} truncated reviews")
+                log.info(f"  📝 Expanded {expanded} truncated reviews")
        except:
            pass
@@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            })
            driver.execute_cdp_cmd('Network.enable', {})
            if not is_refresh:
-                print("  🚫 Blocking images for faster scrolling")
+                log.info("  🚫 Blocking images for faster scrolling")
        except:
            pass
@@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        hard_refresh_count[0] += 1
        if hard_refresh_count[0] > max_hard_refreshes:
-            print(f"  ⚠️  Max hard refreshes ({max_hard_refreshes}) reached, giving up")
+            log.warning(f"  ⚠️  Max hard refreshes ({max_hard_refreshes}) reached, giving up")
            return False
        # Stop current scroll worker
@@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            scroll_container = new_container
            stop_scrolling = new_stop
            recovery_count[0] = 0  # Reset recovery count after successful refresh
-            print(f"  ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
+            log.info(f"  ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
            return True
        else:
-            print(f"  ❌ Hard refresh failed to find scroll container")
+            log.error(f"  ❌ Hard refresh failed to find scroll container")
            return False
    # Main collection loop
@@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    last_count = len(reviews)
    check_num = 0
-    print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
+    log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)")
    cycle_start = time.time()
    while True:
@@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    processedIds.add(rid);
                    // Already seen from API - just track order, skip content
                    // BUT still hide the card to keep DOM light!
                    if (seenSet.has(rid)) {
                        results.push({id: rid, orderOnly: true});
                        // Hide this card since we already have its data from API
                        card.style.display = 'none';
                        card.innerHTML = '';
                        continue;
                    }
@@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                            timestamp: timestamp,
                            source: 'dom'
                        });
                        // Hide processed card (separators removed on next cycle)
                        card.style.display = 'none';
                        card.innerHTML = '';
                    }
                    // ALWAYS hide processed cards to keep DOM light
                    // (even if extraction failed - we've seen this card)
                    card.style.display = 'none';
                    card.innerHTML = '';
                }
                return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
            """, seen_list)
@@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    reviews[rid] = rev
                    seen_ids.add(rid)
        except Exception as e:
-            print(f"  ❌ DOM parse error: {e}")
+            log.error(f"  ❌ DOM parse error: {e}")
        dom_time = time.time() - t2
        # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
        # Sort by DOM order before flushing
        t3 = time.time()
        if flush_callback and len(reviews) >= flush_batch_size:
-            print(f"  💾 Flushing {len(reviews)} reviews to disk...")
+            log.info(f"  💾 Flushing {len(reviews)} reviews to disk...")
            sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
            flush_callback([r for _, r in sorted_reviews])
            total_flushed[0] += len(reviews)
@@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        # TIMING: Print if cycle is slow (>2s)
        if cycle_delta > 2.0:
-            print(f"  ⚠️  SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
+            log.warning(f"  ⚠️  SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
        # Check for new reviews
        if current_count > last_count:
@@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        elapsed = time.time() - last_new_time
        if total_reviews[0]:
            pct = (current_count / total_reviews[0]) * 100
-            print(f"  📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
+            log.info(f"  📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s")
        else:
-            print(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
+            log.info(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s")
        # Call progress callback on every iteration (for real-time log updates)
        if progress_callback:
            progress_callback(current_count, total_reviews[0])
        # Stop conditions - check BEFORE recovery attempts
        if current_count >= max_reviews:
-            print(f"✅ Reached max: {current_count}")
+            log.info(f"✅ Reached max: {current_count}")
            stop_scrolling.set()
            break
        # Also stop if we have all reviews from the page
        if total_reviews[0] and current_count >= total_reviews[0]:
-            print(f"✅ All {current_count} reviews collected")
+            log.info(f"✅ All {current_count} reviews collected")
            stop_scrolling.set()
            break
@@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        if elapsed >= 3 and int(elapsed) % 3 == 0:
            # After 8+ failed recovery attempts, try hard refresh
            if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
-                print(f"  🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
+                log.info(f"  🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...")
                if do_hard_refresh():
                    last_new_time = time.time()  # Reset timer after refresh
                    continue  # Skip to next iteration
            else:
-                print(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
+                log.info(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...")
                unstick_scroll()
        # Check scroll state - track if content is still being added
@@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        if truly_done or timeout_hit:
            # Last chance: try hard refresh before giving up
            if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
-                print(f"  🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
+                log.info(f"  🔄 Timeout reached, trying hard refresh before giving up...")
                if do_hard_refresh():
                    last_new_time = time.time()
                    continue  # Keep trying
-            print(f"✅ All reviews loaded: {current_count}")
+            log.info(f"✅ All reviews loaded: {current_count}")
            stop_scrolling.set()
            break
    # Flush any remaining reviews (sorted by DOM order)
    if flush_callback and reviews:
-        print(f"  💾 Final flush: {len(reviews)} reviews...")
+        log.info(f"  💾 Final flush: {len(reviews)} reviews...")
        sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
        flush_callback([r for _, r in sorted_reviews])
        total_flushed[0] += len(reviews)
        reviews.clear()
    # Reviews already parsed during scrolling (real-time parsing)
-    print("📝 Finalizing review data...")
+    log.info("📝 Finalizing review data...")
    # Final results (sorted by DOM order)
    sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
@@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    api_count = sum(1 for r in review_list if r.get("source") == "api")
    if total_flushed[0] > 0:
-        print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
+        log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
    else:
-        print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
+        log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
    return {
        "reviews": review_list,  # Only unflushed reviews (flushed already sent to callback)
        "total": grand_total,
        "total_flushed": total_flushed[0],
        "checks": check_num,
-        "url": url
+        "url": url,
        "logs": log.get_logs()
    }
 def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
-                        progress_callback=None, driver=None, return_driver: bool = False):
+                        progress_callback=None, driver=None, return_driver: bool = False,
                        log_capture: LogCapture = None):
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.
@@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        progress_callback: Optional callback(current_count, total_count) for progress
        driver: Existing driver instance to reuse
        return_driver: If True, return driver in result
        log_capture: Optional LogCapture instance for real-time log access
    Returns:
-        Dictionary with: reviews, count, total_reviews, time, success, error, driver
+        Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
    """
    from seleniumbase import Driver
@@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
    driver_provided = driver is not None
    should_close_driver = not return_driver and not driver_provided
    # Use provided log_capture or create new one
    log_capture = log_capture or LogCapture()
    try:
        # Create driver if not provided
        if not driver:
@@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            )
            driver.set_window_size(1200, 900)  # Proper viewport for Google Maps
        # Set Chrome geolocation to US (Boston, MA) using CDP
        # This ensures Google Maps shows US results regardless of server location
        try:
            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                'latitude': 42.3601,
                'longitude': -71.0589,
                'accuracy': 100
            })
            log_capture.info("Set geolocation to US (Boston, MA)")
        except Exception as e:
            log_capture.warning(f"Could not set geolocation: {e}")
        # Add URL parameters for consistent results
        if 'hl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}hl=en"
        if 'gl=' not in url:
            url = f"{url}&gl=us"
        # Create progress wrapper if callback provided
        flush_callback = None
        if progress_callback:
@@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
                progress_callback(collected[0], None)
            flush_callback = flush_with_progress
-        # Run the scraper
+        # Run the scraper with progress callback for real-time updates
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
            flush_callback=flush_callback,
-            flush_batch_size=100  # Smaller batches for more frequent progress
+            flush_batch_size=100,  # Smaller batches for more frequent progress
            log_capture=log_capture,
            progress_callback=progress_callback  # Pass through for real-time log updates
        )
        elapsed = time.time() - start_time
@@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "total_reviews": result.get("total", 0),
            "time": elapsed,
            "success": True,
-            "error": None
+            "error": None,
            "logs": result.get("logs", [])
        }
        if return_driver:
@@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            except:
                pass
        # Log error to the existing log_capture
        log_capture.error(f"Scraper failed: {str(e)}")
        return {
            "reviews": [],
            "count": 0,
@@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "time": elapsed,
            "success": False,
            "error": str(e),
-            "driver": driver if return_driver else None
+            "driver": driver if return_driver else None,
            "logs": log_capture.get_logs()
        }