From 80e7771c00af15947d7fab3eba7323b6cb2aab5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:23:51 +0000 Subject: [PATCH] Fix DOM cleanup: hide cards from API interception too The continue statement was skipping the card.style.display='none' and card.innerHTML='' cleanup for cards already seen via API interception. This caused DOM to grow unbounded during long scrapes. Now ALL processed cards are hidden regardless of data source. Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 171 ++++++++++++++++++++++++++++----------- 1 file changed, 125 insertions(+), 46 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 126de46..1648749 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -8,9 +8,41 @@ import re import json import time import threading +from datetime import datetime from selenium.webdriver.common.by import By +class LogCapture: + """Captures scraper logs for storage and viewing.""" + + def __init__(self): + self.logs = [] + + def log(self, message: str, level: str = "INFO", source: str = "scraper"): + """Add a log entry with timestamp.""" + entry = { + "timestamp": datetime.utcnow().isoformat() + "Z", + "level": level, + "source": source, + "message": message + } + self.logs.append(entry) + # Also print for console visibility + print(message, flush=True) + + def info(self, message: str, source: str = "scraper"): + self.log(message, "INFO", source) + + def warning(self, message: str, source: str = "scraper"): + self.log(message, "WARNING", source) + + def error(self, message: str, source: str = "scraper"): + self.log(message, "ERROR", source) + + def get_logs(self): + return self.logs + + def parse_api_review(raw: list) -> dict: """Parse a review from API response array.""" try: @@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict: def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, - flush_callback=None, flush_batch_size: int = 500) -> dict: + flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None, + progress_callback=None) -> dict: """ Scrape Google Maps reviews. @@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews This allows streaming data to disk and freeing memory flush_batch_size: Number of reviews to collect before flushing (default 500) + log_capture: Optional LogCapture instance for storing logs + progress_callback: Optional callback(current_count, total_count) called every iteration Returns: dict with reviews list and metadata """ + # Use provided log_capture or create a dummy that just prints + log = log_capture or LogCapture() # Storage - use review ID as key reviews = {} # review_id -> review @@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Navigate to URL (only on initial load or refresh) if not is_refresh: - print(f"🌐 Loading: {url[:80]}...") + log.info(f"🌐 Loading: {url[:80]}...") else: - print(f"šŸ”„ Hard refresh #{hard_refresh_count[0]}: reloading page...") + log.info(f"šŸ”„ Hard refresh #{hard_refresh_count[0]}: reloading page...") driver.get(url) # Handle consent popup if redirected (poll with tiny sleep) start = time.time() while time.time() - start < 5: # Max 5s for consent if "consent.google" in driver.current_url: - print(" Handling consent popup...") + log.info(" Handling consent popup...") try: for btn in driver.find_elements(By.CSS_SELECTOR, "button"): txt = btn.text.lower() if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: btn.click() # Reload original URL after consent - print(" Reloading after consent...") + log.info(" Reloading after consent...") driver.get(url) break except: @@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in """) if count: total_reviews[0] = count - print(f"šŸ“Š Total reviews on page: {count}") + log.info(f"šŸ“Š Total reviews on page: {count}") break except: pass @@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in tab_text = tab.text.lower() if any(kw in tab_text for kw in review_keywords): if not is_refresh: - print(f" Clicking reviews tab: '{tab.text}'") + log.info(f" Clicking reviews tab: '{tab.text}'") tab.click() tab_clicked = True break @@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in break elapsed = int(time.time() - start) if elapsed > last_print: - print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)") + log.info(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)") last_print = elapsed time.sleep(0.01) # 10ms - responsive but low CPU if not scroll_container: - print(f"āŒ Could not find reviews scroll container{refresh_label}") + log.error(f"āŒ Could not find reviews scroll container{refresh_label}") try: - print("Page title:", driver.title) - print("Current URL:", driver.current_url[:100]) + log.error(f"Page title: {driver.title}") + log.error(f"Current URL: {driver.current_url[:100]}") except: pass return None, None - print(f"āœ… Found scroll container{refresh_label}") + log.info(f"āœ… Found scroll container{refresh_label}") # Inject API interceptor (needs to be re-injected after refresh) if not is_refresh: - print("šŸ”Œ Injecting API interceptor...") + log.info("šŸ”Œ Injecting API interceptor...") driver.execute_script(""" // Always re-setup on refresh window.__reviewInterceptorInjected = true; @@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in } """) time.sleep(0.5) - print(" šŸ“… Sorted by newest") + log.info(" šŸ“… Sorted by newest") # Re-find scroll container after sorting (DOM may be recreated) new_container = find_scroll_container() if new_container: scroll_container = new_container - print(" šŸ”„ Refreshed scroll container reference") + log.info(" šŸ”„ Refreshed scroll container reference") except: pass @@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in return count; """) if expanded > 0: - print(f" šŸ“ Expanded {expanded} truncated reviews") + log.info(f" šŸ“ Expanded {expanded} truncated reviews") except: pass @@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in }) driver.execute_cdp_cmd('Network.enable', {}) if not is_refresh: - print(" 🚫 Blocking images for faster scrolling") + log.info(" 🚫 Blocking images for faster scrolling") except: pass @@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in hard_refresh_count[0] += 1 if hard_refresh_count[0] > max_hard_refreshes: - print(f" āš ļø Max hard refreshes ({max_hard_refreshes}) reached, giving up") + log.warning(f" āš ļø Max hard refreshes ({max_hard_refreshes}) reached, giving up") return False # Stop current scroll worker @@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in scroll_container = new_container stop_scrolling = new_stop recovery_count[0] = 0 # Reset recovery count after successful refresh - print(f" āœ… Hard refresh successful, resuming with {len(seen_ids)} reviews already collected") + log.info(f" āœ… Hard refresh successful, resuming with {len(seen_ids)} reviews already collected") return True else: - print(f" āŒ Hard refresh failed to find scroll container") + log.error(f" āŒ Hard refresh failed to find scroll container") return False # Main collection loop @@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in last_count = len(reviews) check_num = 0 - print(f"šŸ”„ Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True) + log.info(f"šŸ”„ Scrolling... (timeout: {timeout_no_new}s with no new)") cycle_start = time.time() while True: @@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in processedIds.add(rid); // Already seen from API - just track order, skip content + // BUT still hide the card to keep DOM light! if (seenSet.has(rid)) { results.push({id: rid, orderOnly: true}); + // Hide this card since we already have its data from API + card.style.display = 'none'; + card.innerHTML = ''; continue; } @@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in timestamp: timestamp, source: 'dom' }); - // Hide processed card (separators removed on next cycle) - card.style.display = 'none'; - card.innerHTML = ''; } + + // ALWAYS hide processed cards to keep DOM light + // (even if extraction failed - we've seen this card) + card.style.display = 'none'; + card.innerHTML = ''; } return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved}; """, seen_list) @@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in reviews[rid] = rev seen_ids.add(rid) except Exception as e: - print(f" āŒ DOM parse error: {e}") + log.error(f" āŒ DOM parse error: {e}") dom_time = time.time() - t2 # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory # Sort by DOM order before flushing t3 = time.time() if flush_callback and len(reviews) >= flush_batch_size: - print(f" šŸ’¾ Flushing {len(reviews)} reviews to disk...") + log.info(f" šŸ’¾ Flushing {len(reviews)} reviews to disk...") sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) @@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # TIMING: Print if cycle is slow (>2s) if cycle_delta > 2.0: - print(f" āš ļø SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})") + log.warning(f" āš ļø SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})") # Check for new reviews if current_count > last_count: @@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in elapsed = time.time() - last_new_time if total_reviews[0]: pct = (current_count / total_reviews[0]) * 100 - print(f" šŸ“Š {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True) + log.info(f" šŸ“Š {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s") else: - print(f" šŸ“Š {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) + log.info(f" šŸ“Š {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s") + + # Call progress callback on every iteration (for real-time log updates) + if progress_callback: + progress_callback(current_count, total_reviews[0]) # Stop conditions - check BEFORE recovery attempts if current_count >= max_reviews: - print(f"āœ… Reached max: {current_count}") + log.info(f"āœ… Reached max: {current_count}") stop_scrolling.set() break # Also stop if we have all reviews from the page if total_reviews[0] and current_count >= total_reviews[0]: - print(f"āœ… All {current_count} reviews collected") + log.info(f"āœ… All {current_count} reviews collected") stop_scrolling.set() break @@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in if elapsed >= 3 and int(elapsed) % 3 == 0: # After 8+ failed recovery attempts, try hard refresh if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: - print(f" šŸ”„ Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True) + log.info(f" šŸ”„ Soft recovery failed {recovery_count[0]} times, trying hard refresh...") if do_hard_refresh(): last_new_time = time.time() # Reset timer after refresh continue # Skip to next iteration else: - print(f" šŸ”§ Recovery attempt #{recovery_count[0] + 1}...", flush=True) + log.info(f" šŸ”§ Recovery attempt #{recovery_count[0] + 1}...") unstick_scroll() # Check scroll state - track if content is still being added @@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in if truly_done or timeout_hit: # Last chance: try hard refresh before giving up if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): - print(f" šŸ”„ Timeout reached, trying hard refresh before giving up...", flush=True) + log.info(f" šŸ”„ Timeout reached, trying hard refresh before giving up...") if do_hard_refresh(): last_new_time = time.time() continue # Keep trying - print(f"āœ… All reviews loaded: {current_count}") + log.info(f"āœ… All reviews loaded: {current_count}") stop_scrolling.set() break # Flush any remaining reviews (sorted by DOM order) if flush_callback and reviews: - print(f" šŸ’¾ Final flush: {len(reviews)} reviews...") + log.info(f" šŸ’¾ Final flush: {len(reviews)} reviews...") sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) reviews.clear() # Reviews already parsed during scrolling (real-time parsing) - print("šŸ“ Finalizing review data...") + log.info("šŸ“ Finalizing review data...") # Final results (sorted by DOM order) sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) @@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in api_count = sum(1 for r in review_list if r.get("source") == "api") if total_flushed[0] > 0: - print(f"\nšŸ“‹ Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})") + log.info(f"šŸ“‹ Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})") else: - print(f"\nšŸ“‹ Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") + log.info(f"šŸ“‹ Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") return { "reviews": review_list, # Only unflushed reviews (flushed already sent to callback) "total": grand_total, "total_flushed": total_flushed[0], "checks": check_num, - "url": url + "url": url, + "logs": log.get_logs() } def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, - progress_callback=None, driver=None, return_driver: bool = False): + progress_callback=None, driver=None, return_driver: bool = False, + log_capture: LogCapture = None): """ Production-compatible wrapper for scrape_reviews. Matches the API expected by job_manager.py. @@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 progress_callback: Optional callback(current_count, total_count) for progress driver: Existing driver instance to reuse return_driver: If True, return driver in result + log_capture: Optional LogCapture instance for real-time log access Returns: - Dictionary with: reviews, count, total_reviews, time, success, error, driver + Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs """ from seleniumbase import Driver @@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 driver_provided = driver is not None should_close_driver = not return_driver and not driver_provided + # Use provided log_capture or create new one + log_capture = log_capture or LogCapture() + try: # Create driver if not provided if not driver: @@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 ) driver.set_window_size(1200, 900) # Proper viewport for Google Maps + # Set Chrome geolocation to US (Boston, MA) using CDP + # This ensures Google Maps shows US results regardless of server location + try: + driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { + 'latitude': 42.3601, + 'longitude': -71.0589, + 'accuracy': 100 + }) + log_capture.info("Set geolocation to US (Boston, MA)") + except Exception as e: + log_capture.warning(f"Could not set geolocation: {e}") + + # Add URL parameters for consistent results + if 'hl=' not in url: + separator = '&' if '?' in url else '?' + url = f"{url}{separator}hl=en" + if 'gl=' not in url: + url = f"{url}&gl=us" + # Create progress wrapper if callback provided flush_callback = None if progress_callback: @@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 progress_callback(collected[0], None) flush_callback = flush_with_progress - # Run the scraper + # Run the scraper with progress callback for real-time updates result = scrape_reviews( driver=driver, url=url, max_reviews=999999, # Effectively unlimited timeout_no_new=15, flush_callback=flush_callback, - flush_batch_size=100 # Smaller batches for more frequent progress + flush_batch_size=100, # Smaller batches for more frequent progress + log_capture=log_capture, + progress_callback=progress_callback # Pass through for real-time log updates ) elapsed = time.time() - start_time @@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 "total_reviews": result.get("total", 0), "time": elapsed, "success": True, - "error": None + "error": None, + "logs": result.get("logs", []) } if return_driver: @@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 except: pass + # Log error to the existing log_capture + log_capture.error(f"Scraper failed: {str(e)}") + return { "reviews": [], "count": 0, @@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 "time": elapsed, "success": False, "error": str(e), - "driver": driver if return_driver else None + "driver": driver if return_driver else None, + "logs": log_capture.get_logs() }