Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
360 lines
12 KiB
Python
360 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Chrome Worker Pool Manager
|
|
|
|
Maintains a pool of idle Chrome instances for faster scraping.
|
|
Pre-warms browsers on startup to eliminate cold-start delays.
|
|
"""
|
|
import logging
|
|
import asyncio
|
|
import time
|
|
from typing import Optional, Dict, Any
|
|
from seleniumbase import Driver
|
|
from queue import Queue, Empty
|
|
import threading
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
class ChromeWorker:
    """Single Chrome worker instance.

    Owns one SeleniumBase ``Driver`` plus the usage bookkeeping (creation
    time, last use, use count, busy flag) that is consulted when deciding
    whether the browser should be recycled.
    """

    def __init__(self, worker_id: str, headless: bool = True):
        """Record the worker's identity; no browser is launched here.

        Args:
            worker_id: Label used for this worker in log messages.
            headless: Passed to ``Driver`` when ``initialize()`` runs.
        """
        self.worker_id = worker_id
        self.headless = headless
        self.driver: Optional[Driver] = None  # set by initialize(), cleared by shutdown()
        self.created_at = None  # time.time() of the last successful initialize()
        self.last_used = None
        self.use_count = 0
        self.is_busy = False

    def initialize(self):
        """Initialize Chrome driver with stability flags for unlimited scraping.

        Returns:
            True when the browser was launched and configured, False
            otherwise. On failure a browser that had already been launched is
            quit and ``self.driver`` is left as None, so a failed worker never
            keeps a Chrome process alive.
        """
        try:
            log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")

            # SeleniumBase Driver automatically includes UC mode anti-detection
            self.driver = Driver(
                uc=True,
                headless=self.headless,
                page_load_strategy="normal",
            )

            # Generous timeouts for large scraping jobs
            self.driver.set_page_load_timeout(120)  # 2 minutes for slow networks
            self.driver.set_script_timeout(60)  # 1 minute for complex extraction

            self.driver.maximize_window()
            launched_at = time.time()
            self.created_at = launched_at
            self.last_used = launched_at
            log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
            return True
        except Exception as e:
            log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
            # The failure may have happened after Driver(...) succeeded;
            # shutdown() quits whatever was launched and clears self.driver.
            self.shutdown()
            return False

    def reset(self):
        """Reset worker to clean state.

        Deletes all cookies and clears ``localStorage`` and ``sessionStorage``
        of the current page.

        Returns:
            True when every step succeeded. False when the worker has no
            driver or any driver call raised; the error is logged as a
            warning rather than propagated.
        """
        if not self.driver:
            return False
        try:
            self.driver.delete_all_cookies()
            self.driver.execute_script("window.localStorage.clear();")
            self.driver.execute_script("window.sessionStorage.clear();")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Reset failed: {e}")
            return False
        log.debug(f"Worker {self.worker_id}: Reset complete")
        return True

    def shutdown(self):
        """Shutdown worker.

        Quits the driver if there is one. Errors from ``quit()`` are logged
        and swallowed; ``self.driver`` is always None afterwards, so calling
        this repeatedly is harmless.
        """
        try:
            if self.driver:
                self.driver.quit()
                log.info(f"Worker {self.worker_id}: Shutdown complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
        finally:
            self.driver = None

    def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50):
        """Check if worker should be recycled.

        Args:
            max_age_seconds: Recycle once the browser is older than this.
            max_uses: Recycle once ``use_count`` has reached this value, so a
                worker serves at most ``max_uses`` tasks.

        Returns:
            True if the worker has no driver, is too old, or has been used
            ``max_uses`` times; False otherwise.
        """
        if not self.driver:
            return True

        # A worker without a recorded creation time is treated as brand new.
        age = time.time() - self.created_at if self.created_at else 0
        if age > max_age_seconds:
            log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
            return True

        # ">=" (not ">") so the limit is inclusive: the check runs after a
        # use has been counted, and the max_uses-th use must be the last one.
        if self.use_count >= max_uses:
            log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
            return True

        return False
|
|
|
|
|
|
class ChromeWorkerPool:
    """
    Pool of Chrome worker instances for faster scraping.

    Maintains idle workers ready to execute tasks immediately.
    Workers are recycled after max age or max uses to prevent memory leaks.

    Idle workers wait in ``self.workers``; workers handed out by
    ``acquire_worker`` are tracked in ``self.active_workers`` until released.
    Workers are only added to the idle queue while holding ``self.lock`` and
    while ``running`` is true, and ``stop()`` clears ``running`` under the
    same lock before draining, so no worker can slip into the queue of a
    stopped pool.
    """

    def __init__(self, pool_size: int = 2, headless: bool = True):
        """
        Initialize worker pool.

        Args:
            pool_size: Number of idle workers to maintain
            headless: Run Chrome in headless mode
        """
        self.pool_size = pool_size
        self.headless = headless
        self.workers: "Queue[ChromeWorker]" = Queue(maxsize=pool_size)
        self.active_workers: "Dict[str, ChromeWorker]" = {}
        self.worker_counter = 0
        self.lock = threading.Lock()
        self.running = False
        self.maintenance_thread = None
        # Set by stop() to wake the maintenance thread out of its wait.
        self._stop_event = threading.Event()

    def start(self):
        """Start the worker pool.

        Pre-warms ``pool_size`` workers and launches the maintenance thread.
        Calling this on a pool that is already running is a logged no-op.
        """
        with self.lock:
            already_running = self.running
            self.running = True
        if already_running:
            log.warning("Chrome worker pool already started; ignoring start()")
            return

        log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
        # Fresh event per run: a maintenance thread left over from an earlier
        # run keeps its own (already set) event and still terminates.
        self._stop_event = threading.Event()

        # Pre-warm workers
        for _ in range(self.pool_size):
            self._create_worker()

        self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.maintenance_thread.start()

        log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")

    def stop(self):
        """Stop the worker pool.

        Wakes and joins the maintenance thread (waiting at most 5 seconds),
        then shuts down every idle and every active worker.
        """
        log.info("Stopping Chrome worker pool...")
        with self.lock:
            self.running = False
        self._stop_event.set()

        if self.maintenance_thread:
            self.maintenance_thread.join(timeout=5)

        # Shutdown all idle workers
        while True:
            try:
                idle_worker = self.workers.get_nowait()
            except Empty:
                break
            idle_worker.shutdown()

        # Shutdown active workers; quit the browsers outside the lock.
        with self.lock:
            still_active = list(self.active_workers.values())
            self.active_workers.clear()
        for active_worker in still_active:
            active_worker.shutdown()

        log.info("Chrome worker pool stopped")

    def _offer(self, worker: "ChromeWorker") -> bool:
        """Queue ``worker`` as idle if the pool is running and has a free slot.

        Returns:
            True if the worker was queued; False if it was not, in which case
            the caller still owns it and must shut it down.
        """
        with self.lock:
            if not self.running or self.workers.qsize() >= self.pool_size:
                return False
            # Every put goes through this method under the lock, so the size
            # check above cannot be invalidated by another producer.
            self.workers.put_nowait(worker)
            return True

    def _spawn_replacement(self):
        """Create a replacement worker on a background daemon thread."""
        threading.Thread(target=self._create_worker, daemon=True).start()

    def _create_worker(self) -> "Optional[ChromeWorker]":
        """Create a new worker and add to pool.

        Returns:
            The new worker once it is queued as idle, or None when the pool is
            stopped or already full, when initialization fails, or when the
            pool filled up or stopped while the browser was launching. No
            browser is left running in any of the None cases.
        """
        with self.lock:
            # Do not launch a browser that could not be queued anyway.
            if not self.running or self.workers.qsize() >= self.pool_size:
                return None
            self.worker_counter += 1
            worker_id = f"worker-{self.worker_counter}"

        worker = ChromeWorker(worker_id, headless=self.headless)
        if worker.initialize() and self._offer(worker):
            return worker

        # shutdown() is a no-op when there is no driver.
        worker.shutdown()
        return None

    def acquire_worker(self, timeout: float = 30) -> "Optional[ChromeWorker]":
        """
        Acquire a worker from the pool.

        Args:
            timeout: Maximum time to wait for a worker

        Returns:
            ChromeWorker instance or None if timeout
        """
        try:
            worker = self.workers.get(timeout=timeout)
        except Empty:
            log.warning(f"Failed to acquire worker within {timeout}s")
            return None

        worker.is_busy = True
        worker.last_used = time.time()
        worker.use_count += 1

        with self.lock:
            self.active_workers[worker.worker_id] = worker

        log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")

        # No replacement is created here; the maintenance thread tops the
        # idle queue back up.
        return worker

    def release_worker(self, worker: "ChromeWorker", recycle: bool = False):
        """
        Release a worker back to the pool.

        The worker is shut down instead of re-queued when the pool has been
        stopped, when recycling is requested or due, when its reset reports
        failure, or when the idle queue is already full.

        Args:
            worker: Worker to release
            recycle: Force worker recycling
        """
        with self.lock:
            self.active_workers.pop(worker.worker_id, None)
            pool_running = self.running

        worker.is_busy = False

        if not pool_running:
            # A stopped pool must neither keep this browser nor start a new one.
            log.debug(f"Pool stopped, shutting down released {worker.worker_id}")
            worker.shutdown()
            return

        if recycle or worker.should_recycle():
            log.info(f"Recycling {worker.worker_id}")
            worker.shutdown()
            self._spawn_replacement()
            return

        try:
            # Only an explicit False counts as failure; a reset() that
            # reports nothing (returns None) is treated as successful.
            reset_failed = worker.reset() is False
            requeued = (not reset_failed) and self._offer(worker)
        except Exception as e:
            # Unexpected error, shutdown worker
            log.error(f"Failed to release {worker.worker_id}: {e}")
            worker.shutdown()
            return

        if reset_failed:
            log.warning(f"Reset of {worker.worker_id} failed, recycling")
            worker.shutdown()
            self._spawn_replacement()
        elif requeued:
            log.debug(f"Released {worker.worker_id} back to pool ({self.workers.qsize()}/{self.pool_size})")
        else:
            # Pool already at capacity, recycle this extra worker
            log.debug(f"Pool at capacity ({self.workers.qsize()}/{self.pool_size}), recycling extra {worker.worker_id}")
            worker.shutdown()

    def _maintenance_loop(self):
        """Background maintenance thread.

        Every 10 seconds tops the idle queue back up to ``pool_size``; after
        an error it retries after 5 seconds. The wait is interruptible, so
        ``stop()`` ends the loop promptly.
        """
        stop_event = self._stop_event
        while self.running and not stop_event.is_set():
            delay = 10
            try:
                # Ensure pool is at capacity
                needed = self.pool_size - self.workers.qsize()
                if needed > 0:
                    log.debug(f"Pool needs {needed} more workers")
                    for _ in range(needed):
                        if stop_event.is_set():
                            break
                        self._create_worker()
            except Exception as e:
                log.error(f"Maintenance loop error: {e}")
                delay = 5
            stop_event.wait(delay)

    def get_stats(self) -> Dict[str, Any]:
        """Get pool statistics.

        Returns:
            Dict with the configured ``pool_size``, the current number of
            ``idle_workers`` and ``active_workers``, the running
            ``total_workers_created`` counter and the ``headless`` flag.
        """
        with self.lock:
            active_count = len(self.active_workers)

        return {
            "pool_size": self.pool_size,
            "idle_workers": self.workers.qsize(),
            "active_workers": active_count,
            "total_workers_created": self.worker_counter,
            "headless": self.headless,
        }
|
|
|
|
|
|
# Process-wide pool singletons. Both are None until start_worker_pools()
# assigns them; stop_worker_pools() sets them back to None.
validation_pool: Optional[ChromeWorkerPool] = None  # used by get_validation_worker()
scraping_pool: Optional[ChromeWorkerPool] = None  # used by get_scraping_worker()
|
|
|
|
|
|
def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
    """
    Start global worker pools.

    If pools from an earlier call are still registered they are stopped
    first, so their browsers and maintenance threads are not orphaned when
    the module-level references are replaced.

    Args:
        validation_size: Number of workers for validation checks
        scraping_size: Number of workers for scraping jobs
        headless: Run Chrome in headless mode
    """
    global validation_pool, scraping_pool

    log.info("Starting global Chrome worker pools...")

    for previous_pool in (validation_pool, scraping_pool):
        if previous_pool is not None:
            log.warning("Worker pool already running; stopping it before restart")
            previous_pool.stop()
    validation_pool = None
    scraping_pool = None

    validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
    validation_pool.start()

    scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
    scraping_pool.start()

    log.info("Global Chrome worker pools started")
|
|
|
|
|
|
def stop_worker_pools():
    """Stop global worker pools.

    Each pool that is currently set is stopped and its module-level
    reference is then reset to None; the validation pool is handled first.
    """
    global validation_pool, scraping_pool

    log.info("Stopping global Chrome worker pools...")

    # The reference is cleared only after stop() has returned.
    if validation_pool:
        validation_pool.stop()
        validation_pool = None

    if scraping_pool:
        scraping_pool.stop()
        scraping_pool = None

    log.info("Global Chrome worker pools stopped")
|
|
|
|
|
|
def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
    """Get a worker for validation check.

    Args:
        timeout: Forwarded to the validation pool's ``acquire_worker``.

    Returns:
        Whatever ``acquire_worker`` returns, or None when no validation pool
        is set.
    """
    if not validation_pool:
        return None
    return validation_pool.acquire_worker(timeout=timeout)
|
|
|
|
|
|
def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
    """Release a validation worker.

    Does nothing when no validation pool is set.

    Args:
        worker: Worker to hand back to the validation pool.
        recycle: Forwarded to the pool's ``release_worker``.
    """
    if not validation_pool:
        return
    validation_pool.release_worker(worker, recycle=recycle)
|
|
|
|
|
|
def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
    """Get a worker for scraping.

    Args:
        timeout: Forwarded to the scraping pool's ``acquire_worker``.

    Returns:
        Whatever ``acquire_worker`` returns, or None when no scraping pool
        is set.
    """
    if not scraping_pool:
        return None
    return scraping_pool.acquire_worker(timeout=timeout)
|
|
|
|
|
|
def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
    """Release a scraping worker.

    Does nothing when no scraping pool is set.

    Args:
        worker: Worker to hand back to the scraping pool.
        recycle: Forwarded to the pool's ``release_worker``.
    """
    if not scraping_pool:
        return
    scraping_pool.release_worker(worker, recycle=recycle)
|
|
|
|
|
|
def get_pool_stats() -> Dict[str, Any]:
    """Get statistics for all pools.

    Returns:
        Dict holding the ``get_stats()`` result of each pool that is
        currently set, under the keys ``'validation'`` and ``'scraping'``;
        empty when neither pool is set.
    """
    labelled_pools = (
        ('validation', validation_pool),
        ('scraping', scraping_pool),
    )
    return {label: pool.get_stats() for label, pool in labelled_pools if pool}
|