Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
359
modules/chrome_pool.py
Normal file
359
modules/chrome_pool.py
Normal file
@@ -0,0 +1,359 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Chrome Worker Pool Manager
|
||||
|
||||
Maintains a pool of idle Chrome instances for faster scraping.
|
||||
Pre-warms browsers on startup to eliminate cold-start delays.
|
||||
"""
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional, Dict, Any
|
||||
from seleniumbase import Driver
|
||||
from queue import Queue, Empty
|
||||
import threading
|
||||
|
||||
# Module-level logger, named after this module, used by every class and function below.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ChromeWorker:
    """Single Chrome worker instance.

    Wraps one SeleniumBase ``Driver`` together with the bookkeeping used to
    decide when the browser should be recycled: creation time, last-use time
    and use count.
    """

    def __init__(self, worker_id: str, headless: bool = True):
        """Record the worker's identity; no browser is launched here.

        Args:
            worker_id: Label used in log messages.
            headless: Run Chrome in headless mode.
        """
        self.worker_id = worker_id
        self.headless = headless
        self.driver: Optional[Driver] = None
        # Both timestamps are time.time() values set by a successful initialize().
        self.created_at: Optional[float] = None
        self.last_used: Optional[float] = None
        self.use_count = 0
        self.is_busy = False

    def initialize(self):
        """Initialize Chrome driver with stability flags for unlimited scraping.

        Returns:
            True when the browser is ready. False when any step failed; in
            that case a browser that had already been launched is shut down
            again so no Chrome process is left behind.
        """
        try:
            log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")

            # SeleniumBase Driver automatically includes UC mode anti-detection
            # Initialize with longer timeouts for large scraping jobs
            self.driver = Driver(
                uc=True,
                headless=self.headless,
                page_load_strategy="normal"
            )

            # Set generous timeouts for large scraping jobs
            self.driver.set_page_load_timeout(120)  # 2 minutes for slow networks
            self.driver.set_script_timeout(60)  # 1 minute for complex extraction

            self.driver.maximize_window()
            self.created_at = time.time()
            self.last_used = time.time()
            log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
            return True
        except Exception as e:
            log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
            # Driver(...) may have succeeded before a later step raised;
            # quit it here, otherwise the half-configured browser is orphaned.
            self.shutdown()
            return False

    def reset(self):
        """Reset worker to clean state.

        Deletes all cookies and clears localStorage and sessionStorage of the
        current page.

        Returns:
            True when every step succeeded. False when the worker has no
            driver or a step raised (the error is logged, never propagated),
            so callers can tell that the browser was not cleaned.
        """
        if not self.driver:
            return False

        try:
            # Clear cookies, cache, local storage
            self.driver.delete_all_cookies()
            self.driver.execute_script("window.localStorage.clear();")
            self.driver.execute_script("window.sessionStorage.clear();")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Reset failed: {e}")
            return False

        log.debug(f"Worker {self.worker_id}: Reset complete")
        return True

    def shutdown(self):
        """Shutdown worker.

        Quits the driver if there is one. Errors from ``quit()`` are logged
        and swallowed; ``self.driver`` is always cleared afterwards.
        """
        try:
            if self.driver:
                self.driver.quit()
                log.info(f"Worker {self.worker_id}: Shutdown complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
        finally:
            self.driver = None

    def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50):
        """Check if worker should be recycled.

        Args:
            max_age_seconds: Age, measured from ``created_at``, above which
                the worker is recycled.
            max_uses: Use count above which the worker is recycled.

        Returns:
            True when the worker has no driver, is older than
            ``max_age_seconds`` or has been used more than ``max_uses``
            times; False otherwise.
        """
        if not self.driver:
            return True

        # A worker that never recorded a creation time is treated as age 0.
        age = time.time() - self.created_at if self.created_at else 0
        if age > max_age_seconds:
            log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
            return True

        if self.use_count > max_uses:
            log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
            return True

        return False
|
||||
|
||||
|
||||
class ChromeWorkerPool:
    """
    Pool of Chrome worker instances for faster scraping.

    Maintains idle workers ready to execute tasks immediately.
    Workers are recycled after max age or max uses to prevent memory leaks.

    Idle workers live in a bounded queue (``workers``); workers handed out by
    ``acquire_worker`` are tracked in ``active_workers`` until released.
    """

    def __init__(self, pool_size: int = 2, headless: bool = True):
        """
        Initialize worker pool.

        Args:
            pool_size: Number of idle workers to maintain
            headless: Run Chrome in headless mode
        """
        self.pool_size = pool_size
        self.headless = headless
        self.workers: Queue[ChromeWorker] = Queue(maxsize=pool_size)
        self.active_workers: Dict[str, ChromeWorker] = {}
        self.worker_counter = 0
        # Guards active_workers, worker_counter and the running-check-then-queue
        # step in _create_worker().
        self.lock = threading.Lock()
        self.running = False
        self.maintenance_thread = None
        # Set by stop() so the maintenance loop wakes up immediately instead
        # of finishing a fixed sleep.
        self._stop_event = threading.Event()

    def start(self):
        """Start the worker pool.

        Creates ``pool_size`` workers synchronously, then starts the
        background maintenance thread. Calling this on a pool that is already
        running is a no-op, so no second maintenance thread is spawned.
        """
        if self.running:
            log.warning("Chrome worker pool already running, ignoring start()")
            return

        log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
        self._stop_event.clear()
        self.running = True

        # Pre-warm workers
        for _ in range(self.pool_size):
            self._create_worker()

        # Start maintenance thread
        self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.maintenance_thread.start()

        log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")

    def stop(self):
        """Stop the worker pool.

        Clears the running flag, wakes the maintenance thread and waits up to
        5 seconds for it, then shuts down every idle and every active worker.
        """
        log.info("Stopping Chrome worker pool...")
        # Flip the flag under the lock: a concurrent _create_worker() either
        # queued its worker before this point (and it is drained below) or
        # sees running == False and discards its worker.
        with self.lock:
            self.running = False
        self._stop_event.set()

        if self.maintenance_thread:
            self.maintenance_thread.join(timeout=5)

        # Shutdown all workers
        while True:
            try:
                worker = self.workers.get_nowait()
            except Empty:
                break
            worker.shutdown()

        # Shutdown active workers
        with self.lock:
            for worker in self.active_workers.values():
                worker.shutdown()
            self.active_workers.clear()

        log.info("Chrome worker pool stopped")

    def _create_worker(self) -> Optional["ChromeWorker"]:
        """Create a new worker and add to pool.

        Returns:
            The queued worker, or None when the pool is not running, Chrome
            failed to initialize, or the idle queue was already full. A
            worker that cannot be queued is shut down.
        """
        # Local import: the module's import block brings in only Queue and Empty.
        from queue import Full

        if not self.running:
            return None

        with self.lock:
            self.worker_counter += 1
            worker_id = f"worker-{self.worker_counter}"

        worker = ChromeWorker(worker_id, headless=self.headless)
        if not worker.initialize():
            # Make sure a partially started browser is not left behind.
            worker.shutdown()
            return None

        with self.lock:
            # Launching Chrome is slow, so stop() may have run in the
            # meantime; re-check before handing the worker to the queue.
            if self.running:
                try:
                    self.workers.put_nowait(worker)
                    return worker
                except Full:
                    pass

        worker.shutdown()
        return None

    def _recycle_worker(self, worker: "ChromeWorker"):
        """Shut a worker down and, while the pool runs, create a replacement in the background."""
        worker.shutdown()
        if self.running:
            threading.Thread(target=self._create_worker, daemon=True).start()

    def acquire_worker(self, timeout: float = 30) -> Optional["ChromeWorker"]:
        """
        Acquire a worker from the pool.

        Args:
            timeout: Maximum time to wait for a worker

        Returns:
            ChromeWorker instance or None if timeout
        """
        try:
            worker = self.workers.get(timeout=timeout)
        except Empty:
            log.warning(f"Failed to acquire worker within {timeout}s")
            return None

        worker.is_busy = True
        worker.last_used = time.time()
        worker.use_count += 1

        with self.lock:
            self.active_workers[worker.worker_id] = worker

        log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")

        # No replacement is created here; the maintenance thread tops the
        # idle queue back up to pool_size.
        return worker

    def release_worker(self, worker: "ChromeWorker", recycle: bool = False):
        """
        Release a worker back to the pool.

        The worker is shut down and replaced when ``recycle`` is set, when
        ``worker.should_recycle()`` says so, or when ``worker.reset()``
        explicitly returns False. Otherwise it is reset and put back on the
        idle queue, or shut down if the queue already holds ``pool_size``
        workers.

        Args:
            worker: Worker to release
            recycle: Force worker recycling
        """
        with self.lock:
            self.active_workers.pop(worker.worker_id, None)

        worker.is_busy = False

        # Check if worker should be recycled
        if recycle or worker.should_recycle():
            log.info(f"Recycling {worker.worker_id}")
            self._recycle_worker(worker)
            return

        # Only an explicit False counts as failure; a reset() that returns
        # nothing (None) is taken as success.
        if worker.reset() is False:
            log.warning(f"Reset failed for {worker.worker_id}, recycling")
            self._recycle_worker(worker)
            return

        try:
            current_size = self.workers.qsize()
            if current_size < self.pool_size:
                self.workers.put_nowait(worker)
                log.debug(f"Released {worker.worker_id} back to pool ({current_size + 1}/{self.pool_size})")
            else:
                # Pool already at capacity, recycle this extra worker
                log.debug(f"Pool at capacity ({current_size}/{self.pool_size}), recycling extra {worker.worker_id}")
                worker.shutdown()
        except Exception as e:
            # Unexpected error (including losing a race for the last queue
            # slot), shutdown worker
            log.error(f"Failed to release {worker.worker_id}: {e}")
            worker.shutdown()

    def _maintenance_loop(self):
        """Background maintenance thread.

        Every 10 seconds (5 after an error) tops the idle queue up to
        ``pool_size``. Only idle workers are counted, so while workers are
        checked out additional browsers are launched; ``release_worker``
        shuts the surplus down when the queue is full. The loop ends as soon
        as ``stop()`` is called.
        """
        while self.running:
            try:
                # Ensure pool is at capacity
                current_size = self.workers.qsize()
                needed = self.pool_size - current_size

                if needed > 0:
                    log.debug(f"Pool needs {needed} more workers")
                    for _ in range(needed):
                        if not self.running:
                            break
                        self._create_worker()

                delay = 10
            except Exception as e:
                log.error(f"Maintenance loop error: {e}")
                delay = 5

            # Interruptible sleep: returns early once stop() sets the event.
            self._stop_event.wait(delay)

    def get_stats(self) -> Dict[str, Any]:
        """Get pool statistics.

        Returns:
            Dict with the configured pool size, the number of idle and active
            workers, the total number of workers created so far and the
            headless setting.
        """
        with self.lock:
            active_count = len(self.active_workers)

        return {
            "pool_size": self.pool_size,
            "idle_workers": self.workers.qsize(),
            "active_workers": active_count,
            "total_workers_created": self.worker_counter,
            "headless": self.headless
        }
|
||||
|
||||
|
||||
# Global worker pool instances
|
||||
# Pool used for validation checks; assigned by start_worker_pools() and set back to None by stop_worker_pools().
validation_pool: Optional[ChromeWorkerPool] = None
|
||||
# Pool used for scraping jobs; assigned by start_worker_pools() and set back to None by stop_worker_pools().
scraping_pool: Optional[ChromeWorkerPool] = None
|
||||
|
||||
|
||||
def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
    """
    Start global worker pools.

    If pools from an earlier call are still registered they are stopped
    first; simply overwriting the globals would leave their browsers and
    maintenance threads running with nothing left to stop them.

    Args:
        validation_size: Number of workers for validation checks
        scraping_size: Number of workers for scraping jobs
        headless: Run Chrome in headless mode
    """
    global validation_pool, scraping_pool

    if validation_pool or scraping_pool:
        log.warning("Global Chrome worker pools already started, stopping them before restart")
        if validation_pool:
            validation_pool.stop()
            validation_pool = None
        if scraping_pool:
            scraping_pool.stop()
            scraping_pool = None

    log.info("Starting global Chrome worker pools...")

    validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
    validation_pool.start()

    scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
    scraping_pool.start()

    log.info("Global Chrome worker pools started")
|
||||
|
||||
|
||||
def stop_worker_pools():
    """Stop whichever global pools are registered and clear their globals.

    The validation pool is handled first, then the scraping pool. A global
    that holds no pool is left untouched.
    """
    global validation_pool, scraping_pool

    log.info("Stopping global Chrome worker pools...")

    # Validation pool: stop, then forget it.
    if validation_pool:
        validation_pool.stop()
        validation_pool = None

    # Scraping pool: same treatment.
    if scraping_pool:
        scraping_pool.stop()
        scraping_pool = None

    log.info("Global Chrome worker pools stopped")
|
||||
|
||||
|
||||
def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
    """Acquire a worker from the global validation pool.

    Args:
        timeout: Passed through to the pool's ``acquire_worker``.

    Returns:
        Whatever the pool's ``acquire_worker`` returns, or None when no
        validation pool is registered.
    """
    if not validation_pool:
        return None
    return validation_pool.acquire_worker(timeout=timeout)
|
||||
|
||||
|
||||
def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
    """Hand a worker back to the global validation pool.

    Does nothing when no validation pool is registered.

    Args:
        worker: Worker to release.
        recycle: Passed through to the pool's ``release_worker``.
    """
    if not validation_pool:
        return
    validation_pool.release_worker(worker, recycle=recycle)
|
||||
|
||||
|
||||
def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
    """Acquire a worker from the global scraping pool.

    Args:
        timeout: Passed through to the pool's ``acquire_worker``.

    Returns:
        Whatever the pool's ``acquire_worker`` returns, or None when no
        scraping pool is registered.
    """
    if not scraping_pool:
        return None
    return scraping_pool.acquire_worker(timeout=timeout)
|
||||
|
||||
|
||||
def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
    """Hand a worker back to the global scraping pool.

    Does nothing when no scraping pool is registered.

    Args:
        worker: Worker to release.
        recycle: Passed through to the pool's ``release_worker``.
    """
    if not scraping_pool:
        return
    scraping_pool.release_worker(worker, recycle=recycle)
|
||||
|
||||
|
||||
def get_pool_stats() -> Dict[str, Any]:
    """Collect ``get_stats()`` output from every registered global pool.

    Returns:
        Dict with a ``'validation'`` and/or ``'scraping'`` entry, in that
        order; a pool that is not registered contributes no entry.
    """
    labelled_pools = (
        ('validation', validation_pool),
        ('scraping', scraping_pool),
    )
    return {label: pool.get_stats() for label, pool in labelled_pools if pool}
|
||||
Reference in New Issue
Block a user