Fix DOM cleanup: hide cards from API interception too
The `continue` statement was skipping the card.style.display='none' and card.innerHTML='' cleanup for cards that had already been seen via API interception, which caused the DOM to grow without bound during long scrapes. Now ALL processed cards are hidden regardless of data source. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -8,9 +8,41 @@ import re
|
||||
import json
|
||||
import time
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
|
||||
class LogCapture:
    """Captures scraper logs for storage and viewing.

    Each entry is kept in memory as a dict with the keys ``timestamp``,
    ``level``, ``source`` and ``message``, and every message is also
    echoed to stdout.
    """

    def __init__(self):
        # Entries are stored in the order they were logged.
        self.logs = []

    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
        """Add a log entry with timestamp.

        Args:
            message: Text to record; it is also printed to stdout.
            level: Severity label stored verbatim with the entry.
            source: Label stored verbatim to identify where the message
                came from.
        """
        # Imported locally so this class does not depend on the file's
        # top-level import list providing ``timezone``.
        from datetime import timezone

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware
        # UTC "now" and drop tzinfo so isoformat() keeps producing the same
        # naive "YYYY-MM-DDTHH:MM:SS[.ffffff]" text that gets a "Z" suffix.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        entry = {
            "timestamp": now_utc.isoformat() + "Z",
            "level": level,
            "source": source,
            "message": message,
        }
        self.logs.append(entry)
        # Also print for console visibility
        print(message, flush=True)

    def info(self, message: str, source: str = "scraper"):
        """Record ``message`` with level ``"INFO"``."""
        self.log(message, "INFO", source)

    def warning(self, message: str, source: str = "scraper"):
        """Record ``message`` with level ``"WARNING"``."""
        self.log(message, "WARNING", source)

    def error(self, message: str, source: str = "scraper"):
        """Record ``message`` with level ``"ERROR"``."""
        self.log(message, "ERROR", source)

    def get_logs(self):
        """Return the list of captured entries.

        This is the internal list itself, not a copy, so entries logged
        later are visible through the returned reference.
        """
        return self.logs
|
||||
|
||||
|
||||
def parse_api_review(raw: list) -> dict:
|
||||
"""Parse a review from API response array."""
|
||||
try:
|
||||
@@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict:
|
||||
|
||||
|
||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
||||
flush_callback=None, flush_batch_size: int = 500) -> dict:
|
||||
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
|
||||
progress_callback=None) -> dict:
|
||||
"""
|
||||
Scrape Google Maps reviews.
|
||||
|
||||
@@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
|
||||
This allows streaming data to disk and freeing memory
|
||||
flush_batch_size: Number of reviews to collect before flushing (default 500)
|
||||
log_capture: Optional LogCapture instance for storing logs
|
||||
progress_callback: Optional callback(current_count, total_count) called every iteration
|
||||
|
||||
Returns:
|
||||
dict with reviews list and metadata
|
||||
"""
|
||||
# Use provided log_capture or create a dummy that just prints
|
||||
log = log_capture or LogCapture()
|
||||
|
||||
# Storage - use review ID as key
|
||||
reviews = {} # review_id -> review
|
||||
@@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# Navigate to URL (only on initial load or refresh)
|
||||
if not is_refresh:
|
||||
print(f"🌐 Loading: {url[:80]}...")
|
||||
log.info(f"🌐 Loading: {url[:80]}...")
|
||||
else:
|
||||
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||
driver.get(url)
|
||||
|
||||
# Handle consent popup if redirected (poll with tiny sleep)
|
||||
start = time.time()
|
||||
while time.time() - start < 5: # Max 5s for consent
|
||||
if "consent.google" in driver.current_url:
|
||||
print(" Handling consent popup...")
|
||||
log.info(" Handling consent popup...")
|
||||
try:
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||
txt = btn.text.lower()
|
||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
# Reload original URL after consent
|
||||
print(" Reloading after consent...")
|
||||
log.info(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
break
|
||||
except:
|
||||
@@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
""")
|
||||
if count:
|
||||
total_reviews[0] = count
|
||||
print(f"📊 Total reviews on page: {count}")
|
||||
log.info(f"📊 Total reviews on page: {count}")
|
||||
break
|
||||
except:
|
||||
pass
|
||||
@@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
tab_text = tab.text.lower()
|
||||
if any(kw in tab_text for kw in review_keywords):
|
||||
if not is_refresh:
|
||||
print(f" Clicking reviews tab: '{tab.text}'")
|
||||
log.info(f" Clicking reviews tab: '{tab.text}'")
|
||||
tab.click()
|
||||
tab_clicked = True
|
||||
break
|
||||
@@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
break
|
||||
elapsed = int(time.time() - start)
|
||||
if elapsed > last_print:
|
||||
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
|
||||
log.info(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
|
||||
last_print = elapsed
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
if not scroll_container:
|
||||
print(f"❌ Could not find reviews scroll container{refresh_label}")
|
||||
log.error(f"❌ Could not find reviews scroll container{refresh_label}")
|
||||
try:
|
||||
print("Page title:", driver.title)
|
||||
print("Current URL:", driver.current_url[:100])
|
||||
log.error(f"Page title: {driver.title}")
|
||||
log.error(f"Current URL: {driver.current_url[:100]}")
|
||||
except:
|
||||
pass
|
||||
return None, None
|
||||
|
||||
print(f"✅ Found scroll container{refresh_label}")
|
||||
log.info(f"✅ Found scroll container{refresh_label}")
|
||||
|
||||
# Inject API interceptor (needs to be re-injected after refresh)
|
||||
if not is_refresh:
|
||||
print("🔌 Injecting API interceptor...")
|
||||
log.info("🔌 Injecting API interceptor...")
|
||||
driver.execute_script("""
|
||||
// Always re-setup on refresh
|
||||
window.__reviewInterceptorInjected = true;
|
||||
@@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
}
|
||||
""")
|
||||
time.sleep(0.5)
|
||||
print(" 📅 Sorted by newest")
|
||||
log.info(" 📅 Sorted by newest")
|
||||
# Re-find scroll container after sorting (DOM may be recreated)
|
||||
new_container = find_scroll_container()
|
||||
if new_container:
|
||||
scroll_container = new_container
|
||||
print(" 🔄 Refreshed scroll container reference")
|
||||
log.info(" 🔄 Refreshed scroll container reference")
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
return count;
|
||||
""")
|
||||
if expanded > 0:
|
||||
print(f" 📝 Expanded {expanded} truncated reviews")
|
||||
log.info(f" 📝 Expanded {expanded} truncated reviews")
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
})
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
if not is_refresh:
|
||||
print(" 🚫 Blocking images for faster scrolling")
|
||||
log.info(" 🚫 Blocking images for faster scrolling")
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
hard_refresh_count[0] += 1
|
||||
|
||||
if hard_refresh_count[0] > max_hard_refreshes:
|
||||
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
|
||||
log.warning(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
|
||||
return False
|
||||
|
||||
# Stop current scroll worker
|
||||
@@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
scroll_container = new_container
|
||||
stop_scrolling = new_stop
|
||||
recovery_count[0] = 0 # Reset recovery count after successful refresh
|
||||
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
|
||||
log.info(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
|
||||
return True
|
||||
else:
|
||||
print(f" ❌ Hard refresh failed to find scroll container")
|
||||
log.error(f" ❌ Hard refresh failed to find scroll container")
|
||||
return False
|
||||
|
||||
# Main collection loop
|
||||
@@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
last_count = len(reviews)
|
||||
check_num = 0
|
||||
|
||||
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
|
||||
log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)")
|
||||
|
||||
cycle_start = time.time()
|
||||
while True:
|
||||
@@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
processedIds.add(rid);
|
||||
|
||||
// Already seen from API - just track order, skip content
|
||||
// BUT still hide the card to keep DOM light!
|
||||
if (seenSet.has(rid)) {
|
||||
results.push({id: rid, orderOnly: true});
|
||||
// Hide this card since we already have its data from API
|
||||
card.style.display = 'none';
|
||||
card.innerHTML = '';
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
timestamp: timestamp,
|
||||
source: 'dom'
|
||||
});
|
||||
// Hide processed card (separators removed on next cycle)
|
||||
card.style.display = 'none';
|
||||
card.innerHTML = '';
|
||||
}
|
||||
|
||||
// ALWAYS hide processed cards to keep DOM light
|
||||
// (even if extraction failed - we've seen this card)
|
||||
card.style.display = 'none';
|
||||
card.innerHTML = '';
|
||||
}
|
||||
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
|
||||
""", seen_list)
|
||||
@@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
reviews[rid] = rev
|
||||
seen_ids.add(rid)
|
||||
except Exception as e:
|
||||
print(f" ❌ DOM parse error: {e}")
|
||||
log.error(f" ❌ DOM parse error: {e}")
|
||||
dom_time = time.time() - t2
|
||||
|
||||
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
||||
# Sort by DOM order before flushing
|
||||
t3 = time.time()
|
||||
if flush_callback and len(reviews) >= flush_batch_size:
|
||||
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
||||
log.info(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
||||
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||
flush_callback([r for _, r in sorted_reviews])
|
||||
total_flushed[0] += len(reviews)
|
||||
@@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# TIMING: Print if cycle is slow (>2s)
|
||||
if cycle_delta > 2.0:
|
||||
print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
|
||||
log.warning(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
|
||||
|
||||
# Check for new reviews
|
||||
if current_count > last_count:
|
||||
@@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
elapsed = time.time() - last_new_time
|
||||
if total_reviews[0]:
|
||||
pct = (current_count / total_reviews[0]) * 100
|
||||
print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
|
||||
log.info(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s")
|
||||
else:
|
||||
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
||||
log.info(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s")
|
||||
|
||||
# Call progress callback on every iteration (for real-time log updates)
|
||||
if progress_callback:
|
||||
progress_callback(current_count, total_reviews[0])
|
||||
|
||||
# Stop conditions - check BEFORE recovery attempts
|
||||
if current_count >= max_reviews:
|
||||
print(f"✅ Reached max: {current_count}")
|
||||
log.info(f"✅ Reached max: {current_count}")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
# Also stop if we have all reviews from the page
|
||||
if total_reviews[0] and current_count >= total_reviews[0]:
|
||||
print(f"✅ All {current_count} reviews collected")
|
||||
log.info(f"✅ All {current_count} reviews collected")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
@@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||
# After 8+ failed recovery attempts, try hard refresh
|
||||
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
|
||||
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
|
||||
log.info(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...")
|
||||
if do_hard_refresh():
|
||||
last_new_time = time.time() # Reset timer after refresh
|
||||
continue # Skip to next iteration
|
||||
else:
|
||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
||||
log.info(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...")
|
||||
unstick_scroll()
|
||||
|
||||
# Check scroll state - track if content is still being added
|
||||
@@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
if truly_done or timeout_hit:
|
||||
# Last chance: try hard refresh before giving up
|
||||
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
||||
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
|
||||
log.info(f" 🔄 Timeout reached, trying hard refresh before giving up...")
|
||||
if do_hard_refresh():
|
||||
last_new_time = time.time()
|
||||
continue # Keep trying
|
||||
print(f"✅ All reviews loaded: {current_count}")
|
||||
log.info(f"✅ All reviews loaded: {current_count}")
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
# Flush any remaining reviews (sorted by DOM order)
|
||||
if flush_callback and reviews:
|
||||
print(f" 💾 Final flush: {len(reviews)} reviews...")
|
||||
log.info(f" 💾 Final flush: {len(reviews)} reviews...")
|
||||
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||
flush_callback([r for _, r in sorted_reviews])
|
||||
total_flushed[0] += len(reviews)
|
||||
reviews.clear()
|
||||
|
||||
# Reviews already parsed during scrolling (real-time parsing)
|
||||
print("📝 Finalizing review data...")
|
||||
log.info("📝 Finalizing review data...")
|
||||
|
||||
# Final results (sorted by DOM order)
|
||||
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||
@@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
||||
|
||||
if total_flushed[0] > 0:
|
||||
print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
|
||||
log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
|
||||
else:
|
||||
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||
log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||
|
||||
return {
|
||||
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
||||
"total": grand_total,
|
||||
"total_flushed": total_flushed[0],
|
||||
"checks": check_num,
|
||||
"url": url
|
||||
"url": url,
|
||||
"logs": log.get_logs()
|
||||
}
|
||||
|
||||
|
||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||
progress_callback=None, driver=None, return_driver: bool = False):
|
||||
progress_callback=None, driver=None, return_driver: bool = False,
|
||||
log_capture: LogCapture = None):
|
||||
"""
|
||||
Production-compatible wrapper for scrape_reviews.
|
||||
Matches the API expected by job_manager.py.
|
||||
@@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
progress_callback: Optional callback(current_count, total_count) for progress
|
||||
driver: Existing driver instance to reuse
|
||||
return_driver: If True, return driver in result
|
||||
log_capture: Optional LogCapture instance for real-time log access
|
||||
|
||||
Returns:
|
||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver
|
||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
||||
"""
|
||||
from seleniumbase import Driver
|
||||
|
||||
@@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
driver_provided = driver is not None
|
||||
should_close_driver = not return_driver and not driver_provided
|
||||
|
||||
# Use provided log_capture or create new one
|
||||
log_capture = log_capture or LogCapture()
|
||||
|
||||
try:
|
||||
# Create driver if not provided
|
||||
if not driver:
|
||||
@@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
)
|
||||
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) using CDP
|
||||
# This ensures Google Maps shows US results regardless of server location
|
||||
try:
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log_capture.info("Set geolocation to US (Boston, MA)")
|
||||
except Exception as e:
|
||||
log_capture.warning(f"Could not set geolocation: {e}")
|
||||
|
||||
# Add URL parameters for consistent results
|
||||
if 'hl=' not in url:
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Create progress wrapper if callback provided
|
||||
flush_callback = None
|
||||
if progress_callback:
|
||||
@@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
progress_callback(collected[0], None)
|
||||
flush_callback = flush_with_progress
|
||||
|
||||
# Run the scraper
|
||||
# Run the scraper with progress callback for real-time updates
|
||||
result = scrape_reviews(
|
||||
driver=driver,
|
||||
url=url,
|
||||
max_reviews=999999, # Effectively unlimited
|
||||
timeout_no_new=15,
|
||||
flush_callback=flush_callback,
|
||||
flush_batch_size=100 # Smaller batches for more frequent progress
|
||||
flush_batch_size=100, # Smaller batches for more frequent progress
|
||||
log_capture=log_capture,
|
||||
progress_callback=progress_callback # Pass through for real-time log updates
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
@@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
"total_reviews": result.get("total", 0),
|
||||
"time": elapsed,
|
||||
"success": True,
|
||||
"error": None
|
||||
"error": None,
|
||||
"logs": result.get("logs", [])
|
||||
}
|
||||
|
||||
if return_driver:
|
||||
@@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
except:
|
||||
pass
|
||||
|
||||
# Log error to the existing log_capture
|
||||
log_capture.error(f"Scraper failed: {str(e)}")
|
||||
|
||||
return {
|
||||
"reviews": [],
|
||||
"count": 0,
|
||||
@@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
"time": elapsed,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"driver": driver if return_driver else None
|
||||
"driver": driver if return_driver else None,
|
||||
"logs": log_capture.get_logs()
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user