Fix DOM cleanup: hide cards from API interception too

The continue statement was skipping the card.style.display='none'
and card.innerHTML='' cleanup for cards already seen via API
interception. This caused the DOM to grow unbounded during long scrapes.

Now ALL processed cards are hidden regardless of data source.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:23:51 +00:00
parent 01ea18d91d
commit 80e7771c00

View File

@@ -8,9 +8,41 @@ import re
import json import json
import time import time
import threading import threading
from datetime import datetime
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
class LogCapture:
    """Captures scraper logs for storage and viewing.

    Every entry is a dict with the keys ``timestamp``, ``level``,
    ``source`` and ``message``. Entries are appended to ``self.logs`` in
    the order they are logged, and each message is also printed to stdout.
    """

    def __init__(self):
        # Accumulated log entries, oldest first.
        self.logs = []

    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
        """Add a log entry with a UTC timestamp and echo the message.

        Args:
            message: Text stored on the entry and printed to the console.
            level: Severity label stored verbatim on the entry.
            source: Origin label stored verbatim on the entry.
        """
        # Imported here so this class does not depend on any extra
        # module-level import beyond `datetime` itself.
        from datetime import timezone

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware
        # UTC "now" and drop tzinfo so isoformat() keeps producing the same
        # offset-free string, to which the literal "Z" suffix is appended.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        entry = {
            "timestamp": now_utc.isoformat() + "Z",
            "level": level,
            "source": source,
            "message": message,
        }
        self.logs.append(entry)
        # Also print for console visibility
        print(message, flush=True)

    def info(self, message: str, source: str = "scraper"):
        """Log ``message`` at level ``"INFO"``."""
        self.log(message, "INFO", source)

    def warning(self, message: str, source: str = "scraper"):
        """Log ``message`` at level ``"WARNING"``."""
        self.log(message, "WARNING", source)

    def error(self, message: str, source: str = "scraper"):
        """Log ``message`` at level ``"ERROR"``."""
        self.log(message, "ERROR", source)

    def get_logs(self):
        """Return the list of entries collected so far.

        This is the same list object held in ``self.logs`` (not a copy), so
        entries logged later become visible through a previously returned
        reference.
        """
        return self.logs
def parse_api_review(raw: list) -> dict: def parse_api_review(raw: list) -> dict:
"""Parse a review from API response array.""" """Parse a review from API response array."""
try: try:
@@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict:
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500) -> dict: flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
progress_callback=None) -> dict:
""" """
Scrape Google Maps reviews. Scrape Google Maps reviews.
@@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
This allows streaming data to disk and freeing memory This allows streaming data to disk and freeing memory
flush_batch_size: Number of reviews to collect before flushing (default 500) flush_batch_size: Number of reviews to collect before flushing (default 500)
log_capture: Optional LogCapture instance for storing logs
progress_callback: Optional callback(current_count, total_count) called every iteration
Returns: Returns:
dict with reviews list and metadata dict with reviews list and metadata
""" """
# Use provided log_capture or create a dummy that just prints
log = log_capture or LogCapture()
# Storage - use review ID as key # Storage - use review ID as key
reviews = {} # review_id -> review reviews = {} # review_id -> review
@@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Navigate to URL (only on initial load or refresh) # Navigate to URL (only on initial load or refresh)
if not is_refresh: if not is_refresh:
print(f"🌐 Loading: {url[:80]}...") log.info(f"🌐 Loading: {url[:80]}...")
else: else:
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...") log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
driver.get(url) driver.get(url)
# Handle consent popup if redirected (poll with tiny sleep) # Handle consent popup if redirected (poll with tiny sleep)
start = time.time() start = time.time()
while time.time() - start < 5: # Max 5s for consent while time.time() - start < 5: # Max 5s for consent
if "consent.google" in driver.current_url: if "consent.google" in driver.current_url:
print(" Handling consent popup...") log.info(" Handling consent popup...")
try: try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"): for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower() txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click() btn.click()
# Reload original URL after consent # Reload original URL after consent
print(" Reloading after consent...") log.info(" Reloading after consent...")
driver.get(url) driver.get(url)
break break
except: except:
@@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
""") """)
if count: if count:
total_reviews[0] = count total_reviews[0] = count
print(f"📊 Total reviews on page: {count}") log.info(f"📊 Total reviews on page: {count}")
break break
except: except:
pass pass
@@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
tab_text = tab.text.lower() tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords): if any(kw in tab_text for kw in review_keywords):
if not is_refresh: if not is_refresh:
print(f" Clicking reviews tab: '{tab.text}'") log.info(f" Clicking reviews tab: '{tab.text}'")
tab.click() tab.click()
tab_clicked = True tab_clicked = True
break break
@@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break break
elapsed = int(time.time() - start) elapsed = int(time.time() - start)
if elapsed > last_print: if elapsed > last_print:
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)") log.info(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
last_print = elapsed last_print = elapsed
time.sleep(0.01) # 10ms - responsive but low CPU time.sleep(0.01) # 10ms - responsive but low CPU
if not scroll_container: if not scroll_container:
print(f"❌ Could not find reviews scroll container{refresh_label}") log.error(f"❌ Could not find reviews scroll container{refresh_label}")
try: try:
print("Page title:", driver.title) log.error(f"Page title: {driver.title}")
print("Current URL:", driver.current_url[:100]) log.error(f"Current URL: {driver.current_url[:100]}")
except: except:
pass pass
return None, None return None, None
print(f"✅ Found scroll container{refresh_label}") log.info(f"✅ Found scroll container{refresh_label}")
# Inject API interceptor (needs to be re-injected after refresh) # Inject API interceptor (needs to be re-injected after refresh)
if not is_refresh: if not is_refresh:
print("🔌 Injecting API interceptor...") log.info("🔌 Injecting API interceptor...")
driver.execute_script(""" driver.execute_script("""
// Always re-setup on refresh // Always re-setup on refresh
window.__reviewInterceptorInjected = true; window.__reviewInterceptorInjected = true;
@@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
} }
""") """)
time.sleep(0.5) time.sleep(0.5)
print(" 📅 Sorted by newest") log.info(" 📅 Sorted by newest")
# Re-find scroll container after sorting (DOM may be recreated) # Re-find scroll container after sorting (DOM may be recreated)
new_container = find_scroll_container() new_container = find_scroll_container()
if new_container: if new_container:
scroll_container = new_container scroll_container = new_container
print(" 🔄 Refreshed scroll container reference") log.info(" 🔄 Refreshed scroll container reference")
except: except:
pass pass
@@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
return count; return count;
""") """)
if expanded > 0: if expanded > 0:
print(f" 📝 Expanded {expanded} truncated reviews") log.info(f" 📝 Expanded {expanded} truncated reviews")
except: except:
pass pass
@@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}) })
driver.execute_cdp_cmd('Network.enable', {}) driver.execute_cdp_cmd('Network.enable', {})
if not is_refresh: if not is_refresh:
print(" 🚫 Blocking images for faster scrolling") log.info(" 🚫 Blocking images for faster scrolling")
except: except:
pass pass
@@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
hard_refresh_count[0] += 1 hard_refresh_count[0] += 1
if hard_refresh_count[0] > max_hard_refreshes: if hard_refresh_count[0] > max_hard_refreshes:
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up") log.warning(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
return False return False
# Stop current scroll worker # Stop current scroll worker
@@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
scroll_container = new_container scroll_container = new_container
stop_scrolling = new_stop stop_scrolling = new_stop
recovery_count[0] = 0 # Reset recovery count after successful refresh recovery_count[0] = 0 # Reset recovery count after successful refresh
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected") log.info(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
return True return True
else: else:
print(f" ❌ Hard refresh failed to find scroll container") log.error(f" ❌ Hard refresh failed to find scroll container")
return False return False
# Main collection loop # Main collection loop
@@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
last_count = len(reviews) last_count = len(reviews)
check_num = 0 check_num = 0
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True) log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)")
cycle_start = time.time() cycle_start = time.time()
while True: while True:
@@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
processedIds.add(rid); processedIds.add(rid);
// Already seen from API - just track order, skip content // Already seen from API - just track order, skip content
// BUT still hide the card to keep DOM light!
if (seenSet.has(rid)) { if (seenSet.has(rid)) {
results.push({id: rid, orderOnly: true}); results.push({id: rid, orderOnly: true});
// Hide this card since we already have its data from API
card.style.display = 'none';
card.innerHTML = '';
continue; continue;
} }
@@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
timestamp: timestamp, timestamp: timestamp,
source: 'dom' source: 'dom'
}); });
// Hide processed card (separators removed on next cycle)
card.style.display = 'none';
card.innerHTML = '';
} }
// ALWAYS hide processed cards to keep DOM light
// (even if extraction failed - we've seen this card)
card.style.display = 'none';
card.innerHTML = '';
} }
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved}; return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
""", seen_list) """, seen_list)
@@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
reviews[rid] = rev reviews[rid] = rev
seen_ids.add(rid) seen_ids.add(rid)
except Exception as e: except Exception as e:
print(f" ❌ DOM parse error: {e}") log.error(f" ❌ DOM parse error: {e}")
dom_time = time.time() - t2 dom_time = time.time() - t2
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
# Sort by DOM order before flushing # Sort by DOM order before flushing
t3 = time.time() t3 = time.time()
if flush_callback and len(reviews) >= flush_batch_size: if flush_callback and len(reviews) >= flush_batch_size:
print(f" 💾 Flushing {len(reviews)} reviews to disk...") log.info(f" 💾 Flushing {len(reviews)} reviews to disk...")
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews]) flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews) total_flushed[0] += len(reviews)
@@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# TIMING: Print if cycle is slow (>2s) # TIMING: Print if cycle is slow (>2s)
if cycle_delta > 2.0: if cycle_delta > 2.0:
print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})") log.warning(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
# Check for new reviews # Check for new reviews
if current_count > last_count: if current_count > last_count:
@@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
elapsed = time.time() - last_new_time elapsed = time.time() - last_new_time
if total_reviews[0]: if total_reviews[0]:
pct = (current_count / total_reviews[0]) * 100 pct = (current_count / total_reviews[0]) * 100
print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True) log.info(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s")
else: else:
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) log.info(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s")
# Call progress callback on every iteration (for real-time log updates)
if progress_callback:
progress_callback(current_count, total_reviews[0])
# Stop conditions - check BEFORE recovery attempts # Stop conditions - check BEFORE recovery attempts
if current_count >= max_reviews: if current_count >= max_reviews:
print(f"✅ Reached max: {current_count}") log.info(f"✅ Reached max: {current_count}")
stop_scrolling.set() stop_scrolling.set()
break break
# Also stop if we have all reviews from the page # Also stop if we have all reviews from the page
if total_reviews[0] and current_count >= total_reviews[0]: if total_reviews[0] and current_count >= total_reviews[0]:
print(f"✅ All {current_count} reviews collected") log.info(f"✅ All {current_count} reviews collected")
stop_scrolling.set() stop_scrolling.set()
break break
@@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if elapsed >= 3 and int(elapsed) % 3 == 0: if elapsed >= 3 and int(elapsed) % 3 == 0:
# After 8+ failed recovery attempts, try hard refresh # After 8+ failed recovery attempts, try hard refresh
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True) log.info(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...")
if do_hard_refresh(): if do_hard_refresh():
last_new_time = time.time() # Reset timer after refresh last_new_time = time.time() # Reset timer after refresh
continue # Skip to next iteration continue # Skip to next iteration
else: else:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True) log.info(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...")
unstick_scroll() unstick_scroll()
# Check scroll state - track if content is still being added # Check scroll state - track if content is still being added
@@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if truly_done or timeout_hit: if truly_done or timeout_hit:
# Last chance: try hard refresh before giving up # Last chance: try hard refresh before giving up
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True) log.info(f" 🔄 Timeout reached, trying hard refresh before giving up...")
if do_hard_refresh(): if do_hard_refresh():
last_new_time = time.time() last_new_time = time.time()
continue # Keep trying continue # Keep trying
print(f"✅ All reviews loaded: {current_count}") log.info(f"✅ All reviews loaded: {current_count}")
stop_scrolling.set() stop_scrolling.set()
break break
# Flush any remaining reviews (sorted by DOM order) # Flush any remaining reviews (sorted by DOM order)
if flush_callback and reviews: if flush_callback and reviews:
print(f" 💾 Final flush: {len(reviews)} reviews...") log.info(f" 💾 Final flush: {len(reviews)} reviews...")
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews]) flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews) total_flushed[0] += len(reviews)
reviews.clear() reviews.clear()
# Reviews already parsed during scrolling (real-time parsing) # Reviews already parsed during scrolling (real-time parsing)
print("📝 Finalizing review data...") log.info("📝 Finalizing review data...")
# Final results (sorted by DOM order) # Final results (sorted by DOM order)
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
@@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
api_count = sum(1 for r in review_list if r.get("source") == "api") api_count = sum(1 for r in review_list if r.get("source") == "api")
if total_flushed[0] > 0: if total_flushed[0] > 0:
print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})") log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
else: else:
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
return { return {
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback) "reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
"total": grand_total, "total": grand_total,
"total_flushed": total_flushed[0], "total_flushed": total_flushed[0],
"checks": check_num, "checks": check_num,
"url": url "url": url,
"logs": log.get_logs()
} }
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False): progress_callback=None, driver=None, return_driver: bool = False,
log_capture: LogCapture = None):
""" """
Production-compatible wrapper for scrape_reviews. Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py. Matches the API expected by job_manager.py.
@@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
progress_callback: Optional callback(current_count, total_count) for progress progress_callback: Optional callback(current_count, total_count) for progress
driver: Existing driver instance to reuse driver: Existing driver instance to reuse
return_driver: If True, return driver in result return_driver: If True, return driver in result
log_capture: Optional LogCapture instance for real-time log access
Returns: Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
""" """
from seleniumbase import Driver from seleniumbase import Driver
@@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
driver_provided = driver is not None driver_provided = driver is not None
should_close_driver = not return_driver and not driver_provided should_close_driver = not return_driver and not driver_provided
# Use provided log_capture or create new one
log_capture = log_capture or LogCapture()
try: try:
# Create driver if not provided # Create driver if not provided
if not driver: if not driver:
@@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
) )
driver.set_window_size(1200, 900) # Proper viewport for Google Maps driver.set_window_size(1200, 900) # Proper viewport for Google Maps
# Set Chrome geolocation to US (Boston, MA) using CDP
# This ensures Google Maps shows US results regardless of server location
try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log_capture.info("Set geolocation to US (Boston, MA)")
except Exception as e:
log_capture.warning(f"Could not set geolocation: {e}")
# Add URL parameters for consistent results
if 'hl=' not in url:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
if 'gl=' not in url:
url = f"{url}&gl=us"
# Create progress wrapper if callback provided # Create progress wrapper if callback provided
flush_callback = None flush_callback = None
if progress_callback: if progress_callback:
@@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
progress_callback(collected[0], None) progress_callback(collected[0], None)
flush_callback = flush_with_progress flush_callback = flush_with_progress
# Run the scraper # Run the scraper with progress callback for real-time updates
result = scrape_reviews( result = scrape_reviews(
driver=driver, driver=driver,
url=url, url=url,
max_reviews=999999, # Effectively unlimited max_reviews=999999, # Effectively unlimited
timeout_no_new=15, timeout_no_new=15,
flush_callback=flush_callback, flush_callback=flush_callback,
flush_batch_size=100 # Smaller batches for more frequent progress flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture,
progress_callback=progress_callback # Pass through for real-time log updates
) )
elapsed = time.time() - start_time elapsed = time.time() - start_time
@@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"total_reviews": result.get("total", 0), "total_reviews": result.get("total", 0),
"time": elapsed, "time": elapsed,
"success": True, "success": True,
"error": None "error": None,
"logs": result.get("logs", [])
} }
if return_driver: if return_driver:
@@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
except: except:
pass pass
# Log error to the existing log_capture
log_capture.error(f"Scraper failed: {str(e)}")
return { return {
"reviews": [], "reviews": [],
"count": 0, "count": 0,
@@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"time": elapsed, "time": elapsed,
"success": False, "success": False,
"error": str(e), "error": str(e),
"driver": driver if return_driver else None "driver": driver if return_driver else None,
"logs": log_capture.get_logs()
} }