From 80e7771c00af15947d7fab3eba7323b6cb2aab5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?=
 <35082514+alezmad@users.noreply.github.com>
Date: Fri, 23 Jan 2026 17:23:51 +0000
Subject: [PATCH] Fix DOM cleanup: hide cards from API interception too

The `continue` statement skipped the card.style.display='none' and
card.innerHTML='' cleanup for cards already seen via API interception.
This caused the DOM to grow without bound during long scrapes.

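Now ALL processed cards are hidden regardless of data source, including
cards whose extraction failed.

This patch also adds a LogCapture class that replaces the direct print()
calls, a progress_callback parameter on scrape_reviews, a CDP geolocation
override, and default hl=en / gl=us URL parameters in fast_scrape_reviews.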

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 modules/scraper_clean.py | 171 ++++++++++++++++++++++++++++-----------
 1 file changed, 125 insertions(+), 46 deletions(-)

diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py
index 126de46..1648749 100644
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -8,9 +8,41 @@ import re
 import json
 import time
 import threading
+from datetime import datetime
 from selenium.webdriver.common.by import By
 
 
+class LogCapture:
+    """In-memory log collector: every entry is appended to ``self.logs`` and its message is echoed to stdout."""
+
+    def __init__(self):
+        self.logs = []  # entry dicts in insertion order; keys are built in log()
+
+    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
+        """Append a {timestamp, level, source, message} entry to ``self.logs`` and print ``message``."""
+        entry = {
+            "timestamp": datetime.utcnow().isoformat() + "Z",  # naive UTC + literal "Z"; NOTE(review): utcnow() is deprecated since Python 3.12
+            "level": level,
+            "source": source,
+            "message": message
+        }
+        self.logs.append(entry)
+        # Echo only the raw message (no timestamp/level/source) to stdout, flushed immediately
+        print(message, flush=True)
+
+    def info(self, message: str, source: str = "scraper"):
+        self.log(message, "INFO", source)  # shorthand for level="INFO"
+
+    def warning(self, message: str, source: str = "scraper"):
+        self.log(message, "WARNING", source)  # shorthand for level="WARNING"
+
+    def error(self, message: str, source: str = "scraper"):
+        self.log(message, "ERROR", source)  # shorthand for level="ERROR"
+
+    def get_logs(self):
+        return self.logs  # the live internal list, not a copy
+
+
 def parse_api_review(raw: list) -> dict:
     """Parse a review from API response array."""
     try:
@@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict:
 
 
 def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
-                   flush_callback=None, flush_batch_size: int = 500) -> dict:
+                   flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
+                   progress_callback=None) -> dict:
     """
     Scrape Google Maps reviews.
 
@@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
         flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
                        This allows streaming data to disk and freeing memory
         flush_batch_size: Number of reviews to collect before flushing (default 500)
+        log_capture: Optional LogCapture instance for storing logs
+        progress_callback: Optional callback(current_count, total_count) called every iteration
 
     Returns:
         dict with reviews list and metadata
     """
+    # Use provided log_capture, or fall back to a fresh LogCapture (it both stores entries and prints them)
+    log = log_capture or LogCapture()
 
     # Storage - use review ID as key
     reviews = {}  # review_id -> review
@@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
 
         # Navigate to URL (only on initial load or refresh)
         if not is_refresh:
-            print(f"🌐 Loading: {url[:80]}...")
+            log.info(f"🌐 Loading: {url[:80]}...")
         else:
-            print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
+            log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
         driver.get(url)
 
         # Handle consent popup if redirected (poll with tiny sleep)
         start = time.time()
         while time.time() - start < 5:  # Max 5s for consent
             if "consent.google" in driver.current_url:
-                print("  Handling consent popup...")
+                log.info("  Handling consent popup...")
                 try:
                     for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                         txt = btn.text.lower()
                         if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
                             btn.click()
                             # Reload original URL after consent
-                            print("  Reloading after consent...")
+                            log.info("  Reloading after consent...")
                             driver.get(url)
                             break
                 except:
@@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                     """)
                     if count:
                         total_reviews[0] = count
-                        print(f"📊 Total reviews on page: {count}")
+                        log.info(f"📊 Total reviews on page: {count}")
                         break
                 except:
                     pass
@@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                     tab_text = tab.text.lower()
                     if any(kw in tab_text for kw in review_keywords):
                         if not is_refresh:
-                            print(f"  Clicking reviews tab: '{tab.text}'")
+                            log.info(f"  Clicking reviews tab: '{tab.text}'")
                         tab.click()
                         tab_clicked = True
                         break
@@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                 break
             elapsed = int(time.time() - start)
             if elapsed > last_print:
-                print(f"  Waiting for reviews panel...{refresh_label} ({elapsed}s)")
+                log.info(f"  Waiting for reviews panel...{refresh_label} ({elapsed}s)")
                 last_print = elapsed
             time.sleep(0.01)  # 10ms - responsive but low CPU
 
         if not scroll_container:
-            print(f"❌ Could not find reviews scroll container{refresh_label}")
+            log.error(f"❌ Could not find reviews scroll container{refresh_label}")
             try:
-                print("Page title:", driver.title)
-                print("Current URL:", driver.current_url[:100])
+                log.error(f"Page title: {driver.title}")
+                log.error(f"Current URL: {driver.current_url[:100]}")
             except:
                 pass
             return None, None
 
-        print(f"✅ Found scroll container{refresh_label}")
+        log.info(f"✅ Found scroll container{refresh_label}")
 
         # Inject API interceptor (needs to be re-injected after refresh)
         if not is_refresh:
-            print("🔌 Injecting API interceptor...")
+            log.info("🔌 Injecting API interceptor...")
         driver.execute_script("""
             // Always re-setup on refresh
             window.__reviewInterceptorInjected = true;
@@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                     }
                 """)
                 time.sleep(0.5)
-                print("  📅 Sorted by newest")
+                log.info("  📅 Sorted by newest")
                 # Re-find scroll container after sorting (DOM may be recreated)
                 new_container = find_scroll_container()
                 if new_container:
                     scroll_container = new_container
-                    print("  🔄 Refreshed scroll container reference")
+                    log.info("  🔄 Refreshed scroll container reference")
         except:
             pass
 
@@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                 return count;
             """)
             if expanded > 0:
-                print(f"  📝 Expanded {expanded} truncated reviews")
+                log.info(f"  📝 Expanded {expanded} truncated reviews")
         except:
             pass
 
@@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
             })
             driver.execute_cdp_cmd('Network.enable', {})
             if not is_refresh:
-                print("  🚫 Blocking images for faster scrolling")
+                log.info("  🚫 Blocking images for faster scrolling")
         except:
             pass
 
@@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
         hard_refresh_count[0] += 1
 
         if hard_refresh_count[0] > max_hard_refreshes:
-            print(f"  ⚠️  Max hard refreshes ({max_hard_refreshes}) reached, giving up")
+            log.warning(f"  ⚠️  Max hard refreshes ({max_hard_refreshes}) reached, giving up")
             return False
 
         # Stop current scroll worker
@@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
             scroll_container = new_container
             stop_scrolling = new_stop
             recovery_count[0] = 0  # Reset recovery count after successful refresh
-            print(f"  ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
+            log.info(f"  ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
             return True
         else:
-            print(f"  ❌ Hard refresh failed to find scroll container")
+            log.error(f"  ❌ Hard refresh failed to find scroll container")
             return False
 
     # Main collection loop
@@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
     last_count = len(reviews)
     check_num = 0
 
-    print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
+    log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)")
 
     cycle_start = time.time()
     while True:
@@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                     processedIds.add(rid);
 
                     // Already seen from API - just track order, skip content
+                    // BUT still hide the card to keep DOM light!
                     if (seenSet.has(rid)) {
                         results.push({id: rid, orderOnly: true});
+                        // Hide this card since we already have its data from API
+                        card.style.display = 'none';
+                        card.innerHTML = '';
                         continue;
                     }
 
@@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                             timestamp: timestamp,
                             source: 'dom'
                         });
-                        // Hide processed card (separators removed on next cycle)
-                        card.style.display = 'none';
-                        card.innerHTML = '';
                     }
+
+                    // ALWAYS hide processed cards to keep DOM light
+                    // (even if extraction failed - we've seen this card)
+                    card.style.display = 'none';
+                    card.innerHTML = '';
                 }
                 return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
             """, seen_list)
@@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                     reviews[rid] = rev
                     seen_ids.add(rid)
         except Exception as e:
-            print(f"  ❌ DOM parse error: {e}")
+            log.error(f"  ❌ DOM parse error: {e}")
         dom_time = time.time() - t2
 
         # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
         # Sort by DOM order before flushing
         t3 = time.time()
         if flush_callback and len(reviews) >= flush_batch_size:
-            print(f"  💾 Flushing {len(reviews)} reviews to disk...")
+            log.info(f"  💾 Flushing {len(reviews)} reviews to disk...")
             sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
             flush_callback([r for _, r in sorted_reviews])
             total_flushed[0] += len(reviews)
@@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
 
         # TIMING: Print if cycle is slow (>2s)
         if cycle_delta > 2.0:
-            print(f"  ⚠️  SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
+            log.warning(f"  ⚠️  SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
 
         # Check for new reviews
         if current_count > last_count:
@@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
         elapsed = time.time() - last_new_time
         if total_reviews[0]:
             pct = (current_count / total_reviews[0]) * 100
-            print(f"  📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
+            log.info(f"  📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s")
         else:
-            print(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
+            log.info(f"  📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s")
+
+        # Call progress callback on every iteration (for real-time log updates)
+        if progress_callback:
+            progress_callback(current_count, total_reviews[0])
 
         # Stop conditions - check BEFORE recovery attempts
         if current_count >= max_reviews:
-            print(f"✅ Reached max: {current_count}")
+            log.info(f"✅ Reached max: {current_count}")
             stop_scrolling.set()
             break
 
         # Also stop if we have all reviews from the page
         if total_reviews[0] and current_count >= total_reviews[0]:
-            print(f"✅ All {current_count} reviews collected")
+            log.info(f"✅ All {current_count} reviews collected")
             stop_scrolling.set()
             break
 
@@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
         if elapsed >= 3 and int(elapsed) % 3 == 0:
             # After 8+ failed recovery attempts, try hard refresh
             if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
-                print(f"  🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
+                log.info(f"  🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...")
                 if do_hard_refresh():
                     last_new_time = time.time()  # Reset timer after refresh
                     continue  # Skip to next iteration
             else:
-                print(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
+                log.info(f"  🔧 Recovery attempt #{recovery_count[0] + 1}...")
                 unstick_scroll()
 
         # Check scroll state - track if content is still being added
@@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
         if truly_done or timeout_hit:
             # Last chance: try hard refresh before giving up
             if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
-                print(f"  🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
+                log.info(f"  🔄 Timeout reached, trying hard refresh before giving up...")
                 if do_hard_refresh():
                     last_new_time = time.time()
                     continue  # Keep trying
-            print(f"✅ All reviews loaded: {current_count}")
+            log.info(f"✅ All reviews loaded: {current_count}")
             stop_scrolling.set()
             break
 
     # Flush any remaining reviews (sorted by DOM order)
     if flush_callback and reviews:
-        print(f"  💾 Final flush: {len(reviews)} reviews...")
+        log.info(f"  💾 Final flush: {len(reviews)} reviews...")
         sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
         flush_callback([r for _, r in sorted_reviews])
         total_flushed[0] += len(reviews)
         reviews.clear()
 
     # Reviews already parsed during scrolling (real-time parsing)
-    print("📝 Finalizing review data...")
+    log.info("📝 Finalizing review data...")
 
     # Final results (sorted by DOM order)
     sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
@@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
     api_count = sum(1 for r in review_list if r.get("source") == "api")
 
     if total_flushed[0] > 0:
-        print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
+        log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
     else:
-        print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
+        log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
 
     return {
         "reviews": review_list,  # Only unflushed reviews (flushed already sent to callback)
         "total": grand_total,
         "total_flushed": total_flushed[0],
         "checks": check_num,
-        "url": url
+        "url": url,
+        "logs": log.get_logs()
     }
 
 
 def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
-                        progress_callback=None, driver=None, return_driver: bool = False):
+                        progress_callback=None, driver=None, return_driver: bool = False,
+                        log_capture: LogCapture = None):
     """
     Production-compatible wrapper for scrape_reviews.
     Matches the API expected by job_manager.py.
@@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
         progress_callback: Optional callback(current_count, total_count) for progress
         driver: Existing driver instance to reuse
         return_driver: If True, return driver in result
+        log_capture: Optional LogCapture instance for real-time log access
 
     Returns:
-        Dictionary with: reviews, count, total_reviews, time, success, error, driver
+        Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
     """
     from seleniumbase import Driver
 
@@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
     driver_provided = driver is not None
     should_close_driver = not return_driver and not driver_provided
 
+    # Use provided log_capture or create new one
+    log_capture = log_capture or LogCapture()
+
     try:
         # Create driver if not provided
         if not driver:
@@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
             )
             driver.set_window_size(1200, 900)  # Proper viewport for Google Maps
 
+        # Set Chrome geolocation to US (Boston, MA) using CDP
+        # This ensures Google Maps shows US results regardless of server location
+        try:
+            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+                'latitude': 42.3601,
+                'longitude': -71.0589,
+                'accuracy': 100
+            })
+            log_capture.info("Set geolocation to US (Boston, MA)")
+        except Exception as e:
+            log_capture.warning(f"Could not set geolocation: {e}")
+
+        # Add URL parameters for consistent results
+        if 'hl=' not in url:
+            separator = '&' if '?' in url else '?'
+            url = f"{url}{separator}hl=en"
+        if 'gl=' not in url:
+            url = f"{url}&gl=us"
+
         # Create progress wrapper if callback provided
         flush_callback = None
         if progress_callback:
@@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
                 progress_callback(collected[0], None)
             flush_callback = flush_with_progress
 
-        # Run the scraper
+        # Run the scraper with progress callback for real-time updates
         result = scrape_reviews(
             driver=driver,
             url=url,
             max_reviews=999999,  # Effectively unlimited
             timeout_no_new=15,
             flush_callback=flush_callback,
-            flush_batch_size=100  # Smaller batches for more frequent progress
+            flush_batch_size=100,  # Smaller batches for more frequent progress
+            log_capture=log_capture,
+            progress_callback=progress_callback  # Pass through for real-time log updates
         )
 
         elapsed = time.time() - start_time
@@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
             "total_reviews": result.get("total", 0),
             "time": elapsed,
             "success": True,
-            "error": None
+            "error": None,
+            "logs": result.get("logs", [])
         }
 
         if return_driver:
@@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
             except:
                 pass
 
+        # Log error to the existing log_capture
+        log_capture.error(f"Scraper failed: {str(e)}")
+
         return {
             "reviews": [],
             "count": 0,
@@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
             "time": elapsed,
             "success": False,
             "error": str(e),
-            "driver": driver if return_driver else None
+            "driver": driver if return_driver else None,
+            "logs": log_capture.get_logs()
         }