Fix DOM cleanup: hide cards from API interception too

The continue statement was skipping the card.style.display='none'
and card.innerHTML='' cleanup for cards already seen via API
interception. This caused DOM to grow unbounded during long scrapes.

Now ALL processed cards are hidden regardless of data source.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:23:51 +00:00
parent 01ea18d91d
commit 80e7771c00

View File

@@ -8,9 +8,41 @@ import re
import json
import time
import threading
from datetime import datetime
from selenium.webdriver.common.by import By
class LogCapture:
"""Captures scraper logs for storage and viewing."""
def __init__(self):
self.logs = []
def log(self, message: str, level: str = "INFO", source: str = "scraper"):
"""Add a log entry with timestamp."""
entry = {
"timestamp": datetime.utcnow().isoformat() + "Z",
"level": level,
"source": source,
"message": message
}
self.logs.append(entry)
# Also print for console visibility
print(message, flush=True)
def info(self, message: str, source: str = "scraper"):
self.log(message, "INFO", source)
def warning(self, message: str, source: str = "scraper"):
self.log(message, "WARNING", source)
def error(self, message: str, source: str = "scraper"):
self.log(message, "ERROR", source)
def get_logs(self):
return self.logs
def parse_api_review(raw: list) -> dict:
"""Parse a review from API response array."""
try:
@@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict:
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500) -> dict:
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
progress_callback=None) -> dict:
"""
Scrape Google Maps reviews.
@@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
This allows streaming data to disk and freeing memory
flush_batch_size: Number of reviews to collect before flushing (default 500)
log_capture: Optional LogCapture instance for storing logs
progress_callback: Optional callback(current_count, total_count) called every iteration
Returns:
dict with reviews list and metadata
"""
# Use provided log_capture or create a dummy that just prints
log = log_capture or LogCapture()
# Storage - use review ID as key
reviews = {} # review_id -> review
@@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Navigate to URL (only on initial load or refresh)
if not is_refresh:
print(f"🌐 Loading: {url[:80]}...")
log.info(f"🌐 Loading: {url[:80]}...")
else:
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
driver.get(url)
# Handle consent popup if redirected (poll with tiny sleep)
start = time.time()
while time.time() - start < 5: # Max 5s for consent
if "consent.google" in driver.current_url:
print(" Handling consent popup...")
log.info(" Handling consent popup...")
try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
# Reload original URL after consent
print(" Reloading after consent...")
log.info(" Reloading after consent...")
driver.get(url)
break
except:
@@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
""")
if count:
total_reviews[0] = count
print(f"📊 Total reviews on page: {count}")
log.info(f"📊 Total reviews on page: {count}")
break
except:
pass
@@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
if not is_refresh:
print(f" Clicking reviews tab: '{tab.text}'")
log.info(f" Clicking reviews tab: '{tab.text}'")
tab.click()
tab_clicked = True
break
@@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break
elapsed = int(time.time() - start)
if elapsed > last_print:
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
log.info(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
last_print = elapsed
time.sleep(0.01) # 10ms - responsive but low CPU
if not scroll_container:
print(f"❌ Could not find reviews scroll container{refresh_label}")
log.error(f"❌ Could not find reviews scroll container{refresh_label}")
try:
print("Page title:", driver.title)
print("Current URL:", driver.current_url[:100])
log.error(f"Page title: {driver.title}")
log.error(f"Current URL: {driver.current_url[:100]}")
except:
pass
return None, None
print(f"✅ Found scroll container{refresh_label}")
log.info(f"✅ Found scroll container{refresh_label}")
# Inject API interceptor (needs to be re-injected after refresh)
if not is_refresh:
print("🔌 Injecting API interceptor...")
log.info("🔌 Injecting API interceptor...")
driver.execute_script("""
// Always re-setup on refresh
window.__reviewInterceptorInjected = true;
@@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}
""")
time.sleep(0.5)
print(" 📅 Sorted by newest")
log.info(" 📅 Sorted by newest")
# Re-find scroll container after sorting (DOM may be recreated)
new_container = find_scroll_container()
if new_container:
scroll_container = new_container
print(" 🔄 Refreshed scroll container reference")
log.info(" 🔄 Refreshed scroll container reference")
except:
pass
@@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
return count;
""")
if expanded > 0:
print(f" 📝 Expanded {expanded} truncated reviews")
log.info(f" 📝 Expanded {expanded} truncated reviews")
except:
pass
@@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
})
driver.execute_cdp_cmd('Network.enable', {})
if not is_refresh:
print(" 🚫 Blocking images for faster scrolling")
log.info(" 🚫 Blocking images for faster scrolling")
except:
pass
@@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
hard_refresh_count[0] += 1
if hard_refresh_count[0] > max_hard_refreshes:
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
log.warning(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
return False
# Stop current scroll worker
@@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
scroll_container = new_container
stop_scrolling = new_stop
recovery_count[0] = 0 # Reset recovery count after successful refresh
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
log.info(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
return True
else:
print(f" ❌ Hard refresh failed to find scroll container")
log.error(f" ❌ Hard refresh failed to find scroll container")
return False
# Main collection loop
@@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
last_count = len(reviews)
check_num = 0
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)")
cycle_start = time.time()
while True:
@@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
processedIds.add(rid);
// Already seen from API - just track order, skip content
// BUT still hide the card to keep DOM light!
if (seenSet.has(rid)) {
results.push({id: rid, orderOnly: true});
// Hide this card since we already have its data from API
card.style.display = 'none';
card.innerHTML = '';
continue;
}
@@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
timestamp: timestamp,
source: 'dom'
});
// Hide processed card (separators removed on next cycle)
card.style.display = 'none';
card.innerHTML = '';
}
// ALWAYS hide processed cards to keep DOM light
// (even if extraction failed - we've seen this card)
card.style.display = 'none';
card.innerHTML = '';
}
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
""", seen_list)
@@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
reviews[rid] = rev
seen_ids.add(rid)
except Exception as e:
print(f" ❌ DOM parse error: {e}")
log.error(f" ❌ DOM parse error: {e}")
dom_time = time.time() - t2
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
# Sort by DOM order before flushing
t3 = time.time()
if flush_callback and len(reviews) >= flush_batch_size:
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
log.info(f" 💾 Flushing {len(reviews)} reviews to disk...")
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews)
@@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# TIMING: Print if cycle is slow (>2s)
if cycle_delta > 2.0:
print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
log.warning(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
# Check for new reviews
if current_count > last_count:
@@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
elapsed = time.time() - last_new_time
if total_reviews[0]:
pct = (current_count / total_reviews[0]) * 100
print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
log.info(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s")
else:
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
log.info(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s")
# Call progress callback on every iteration (for real-time log updates)
if progress_callback:
progress_callback(current_count, total_reviews[0])
# Stop conditions - check BEFORE recovery attempts
if current_count >= max_reviews:
print(f"✅ Reached max: {current_count}")
log.info(f"✅ Reached max: {current_count}")
stop_scrolling.set()
break
# Also stop if we have all reviews from the page
if total_reviews[0] and current_count >= total_reviews[0]:
print(f"✅ All {current_count} reviews collected")
log.info(f"✅ All {current_count} reviews collected")
stop_scrolling.set()
break
@@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if elapsed >= 3 and int(elapsed) % 3 == 0:
# After 8+ failed recovery attempts, try hard refresh
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
log.info(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...")
if do_hard_refresh():
last_new_time = time.time() # Reset timer after refresh
continue # Skip to next iteration
else:
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
log.info(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...")
unstick_scroll()
# Check scroll state - track if content is still being added
@@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if truly_done or timeout_hit:
# Last chance: try hard refresh before giving up
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
log.info(f" 🔄 Timeout reached, trying hard refresh before giving up...")
if do_hard_refresh():
last_new_time = time.time()
continue # Keep trying
print(f"✅ All reviews loaded: {current_count}")
log.info(f"✅ All reviews loaded: {current_count}")
stop_scrolling.set()
break
# Flush any remaining reviews (sorted by DOM order)
if flush_callback and reviews:
print(f" 💾 Final flush: {len(reviews)} reviews...")
log.info(f" 💾 Final flush: {len(reviews)} reviews...")
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews)
reviews.clear()
# Reviews already parsed during scrolling (real-time parsing)
print("📝 Finalizing review data...")
log.info("📝 Finalizing review data...")
# Final results (sorted by DOM order)
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
@@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
api_count = sum(1 for r in review_list if r.get("source") == "api")
if total_flushed[0] > 0:
print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
else:
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
return {
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
"total": grand_total,
"total_flushed": total_flushed[0],
"checks": check_num,
"url": url
"url": url,
"logs": log.get_logs()
}
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False):
progress_callback=None, driver=None, return_driver: bool = False,
log_capture: LogCapture = None):
"""
Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py.
@@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
progress_callback: Optional callback(current_count, total_count) for progress
driver: Existing driver instance to reuse
return_driver: If True, return driver in result
log_capture: Optional LogCapture instance for real-time log access
Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
"""
from seleniumbase import Driver
@@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
driver_provided = driver is not None
should_close_driver = not return_driver and not driver_provided
# Use provided log_capture or create new one
log_capture = log_capture or LogCapture()
try:
# Create driver if not provided
if not driver:
@@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
)
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
# Set Chrome geolocation to US (Boston, MA) using CDP
# This ensures Google Maps shows US results regardless of server location
try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log_capture.info("Set geolocation to US (Boston, MA)")
except Exception as e:
log_capture.warning(f"Could not set geolocation: {e}")
# Add URL parameters for consistent results
if 'hl=' not in url:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
if 'gl=' not in url:
url = f"{url}&gl=us"
# Create progress wrapper if callback provided
flush_callback = None
if progress_callback:
@@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
progress_callback(collected[0], None)
flush_callback = flush_with_progress
# Run the scraper
# Run the scraper with progress callback for real-time updates
result = scrape_reviews(
driver=driver,
url=url,
max_reviews=999999, # Effectively unlimited
timeout_no_new=15,
flush_callback=flush_callback,
flush_batch_size=100 # Smaller batches for more frequent progress
flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture,
progress_callback=progress_callback # Pass through for real-time log updates
)
elapsed = time.time() - start_time
@@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"total_reviews": result.get("total", 0),
"time": elapsed,
"success": True,
"error": None
"error": None,
"logs": result.get("logs", [])
}
if return_driver:
@@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
except:
pass
# Log error to the existing log_capture
log_capture.error(f"Scraper failed: {str(e)}")
return {
"reviews": [],
"count": 0,
@@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"time": elapsed,
"success": False,
"error": str(e),
"driver": driver if return_driver else None
"driver": driver if return_driver else None,
"logs": log_capture.get_logs()
}