#!/usr/bin/env python3
"""
Chrome Worker Pool Manager

Maintains a pool of idle Chrome instances for faster scraping.
Pre-warms browsers on startup to eliminate cold-start delays.
"""

import asyncio
import logging
import threading
import time
from queue import Empty, Full, Queue
from typing import Any, Dict, Optional

from seleniumbase import Driver

log = logging.getLogger(__name__)


class ChromeWorker:
    """Single Chrome worker instance.

    Wraps one SeleniumBase ``Driver`` together with the bookkeeping
    (creation time, last use, use count) that the pool reads when deciding
    whether the browser should be recycled.
    """

    def __init__(self, worker_id: str, headless: bool = True):
        """
        Create an uninitialized worker; no browser is launched here.

        Args:
            worker_id: Identifier used in log messages and as the pool key
            headless: Run Chrome in headless mode
        """
        self.worker_id = worker_id
        self.headless = headless
        self.driver: Optional[Driver] = None  # populated by initialize()
        self.created_at: Optional[float] = None
        self.last_used: Optional[float] = None
        self.use_count = 0
        self.is_busy = False

    def initialize(self):
        """
        Initialize Chrome driver with stability flags for unlimited scraping.

        Returns:
            True when the browser is ready, False when any step failed.
            On failure a partially created browser is quit so that it does
            not linger as an orphan process.
        """
        try:
            log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")

            # uc=True selects SeleniumBase's UC (anti-detection) mode.
            self.driver = Driver(
                uc=True,
                headless=self.headless,
                page_load_strategy="normal"
            )

            # Set generous timeouts for large scraping jobs
            self.driver.set_page_load_timeout(120)  # 2 minutes for slow networks
            self.driver.set_script_timeout(60)  # 1 minute for complex extraction
            self.driver.maximize_window()

            self.created_at = time.time()
            self.last_used = time.time()
            log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
            return True
        except Exception as e:
            log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
            # The driver may already exist if a later setup call raised;
            # quit it here because callers drop the worker on False.
            self.shutdown()
            return False

    def reset(self):
        """Reset worker to clean state (cookies, local and session storage).

        Best effort: failures are logged as warnings and not raised.
        """
        try:
            if self.driver:
                self.driver.delete_all_cookies()
                self.driver.execute_script("window.localStorage.clear();")
                self.driver.execute_script("window.sessionStorage.clear();")
                log.debug(f"Worker {self.worker_id}: Reset complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Reset failed: {e}")

    def shutdown(self):
        """Shutdown worker.

        Quits the browser if one exists. Errors are logged and swallowed,
        and ``self.driver`` is always cleared, so calling this repeatedly
        is safe.
        """
        try:
            if self.driver:
                self.driver.quit()
                log.info(f"Worker {self.worker_id}: Shutdown complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
        finally:
            self.driver = None

    def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50):
        """
        Check if worker should be recycled.

        Args:
            max_age_seconds: Recycle once the browser is older than this
            max_uses: Recycle once the worker has served this many uses

        Returns:
            True when the worker has no driver, is too old, or has reached
            its use budget; False otherwise.
        """
        if not self.driver:
            return True

        age = time.time() - self.created_at if self.created_at else 0
        if age > max_age_seconds:
            log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
            return True

        # ">=" so a worker serves exactly max_uses jobs; ">" allowed one extra.
        if self.use_count >= max_uses:
            log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
            return True

        return False


class ChromeWorkerPool:
    """
    Pool of Chrome worker instances for faster scraping.

    Maintains idle workers ready to execute tasks immediately.
    Workers are recycled after max age or max uses to prevent memory leaks.
    """

    def __init__(self, pool_size: int = 2, headless: bool = True):
        """
        Initialize worker pool.

        Args:
            pool_size: Number of idle workers to maintain
            headless: Run Chrome in headless mode
        """
        self.pool_size = pool_size
        self.headless = headless
        self.workers: Queue[ChromeWorker] = Queue(maxsize=pool_size)
        self.active_workers: Dict[str, ChromeWorker] = {}
        self.worker_counter = 0
        # Guards worker_counter, active_workers, and the "running + enqueue"
        # decision. Not reentrant: never call a locking method while held.
        self.lock = threading.Lock()
        self.running = False
        self.maintenance_thread: Optional[threading.Thread] = None
        # Lets stop() wake the maintenance thread instead of waiting out a sleep.
        self._stop_event = threading.Event()

    def start(self):
        """Start the worker pool: pre-warm workers and launch maintenance."""
        log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
        self._stop_event.clear()
        self.running = True

        # Pre-warm workers
        for _ in range(self.pool_size):
            self._create_worker()

        # Start maintenance thread
        self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.maintenance_thread.start()

        log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")

    def stop(self):
        """Stop the worker pool and shut down every idle and active worker."""
        log.info("Stopping Chrome worker pool...")
        # Flip the flag under the lock so a concurrent enqueue either lands
        # before the drain below or is refused.
        with self.lock:
            self.running = False
        self._stop_event.set()

        if self.maintenance_thread:
            self.maintenance_thread.join(timeout=5)

        # Shutdown all idle workers
        while True:
            try:
                idle_worker = self.workers.get_nowait()
            except Empty:
                break
            idle_worker.shutdown()

        # Snapshot active workers under the lock, quit them outside it so
        # slow browser shutdowns do not block other pool operations.
        with self.lock:
            in_use = list(self.active_workers.values())
            self.active_workers.clear()
        for busy_worker in in_use:
            busy_worker.shutdown()

        log.info("Chrome worker pool stopped")

    def _create_worker(self) -> Optional[ChromeWorker]:
        """
        Create a new worker and add to pool.

        Returns:
            The queued worker, or None when initialization failed, the idle
            queue was already full, or the pool is not running. In the last
            two cases the freshly started browser is shut down again.
        """
        with self.lock:
            self.worker_counter += 1
            worker_id = f"worker-{self.worker_counter}"

        # Browser start-up is slow; do it without holding the lock.
        worker = ChromeWorker(worker_id, headless=self.headless)
        if not worker.initialize():
            return None

        with self.lock:
            if self.running:
                try:
                    self.workers.put_nowait(worker)
                    return worker
                except Full:
                    pass  # pool already at capacity; discard below

        worker.shutdown()
        return None

    def acquire_worker(self, timeout: float = 30) -> Optional[ChromeWorker]:
        """
        Acquire a worker from the pool.

        Args:
            timeout: Maximum time to wait for a worker

        Returns:
            ChromeWorker instance or None if timeout
        """
        try:
            worker = self.workers.get(timeout=timeout)
        except Empty:
            log.warning(f"Failed to acquire worker within {timeout}s")
            return None

        worker.is_busy = True
        worker.last_used = time.time()
        worker.use_count += 1

        with self.lock:
            self.active_workers[worker.worker_id] = worker

        log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")

        # No replacement is created here; the maintenance thread refills
        # the idle queue.
        return worker

    def release_worker(self, worker: ChromeWorker, recycle: bool = False):
        """
        Release a worker back to the pool.

        Args:
            worker: Worker to release
            recycle: Force worker recycling
        """
        with self.lock:
            self.active_workers.pop(worker.worker_id, None)
        worker.is_busy = False

        # Check if worker should be recycled
        if recycle or worker.should_recycle():
            log.info(f"Recycling {worker.worker_id}")
            worker.shutdown()
            # Only replace while running; a stopped pool must not start
            # new browsers.
            if self.running:
                threading.Thread(target=self._create_worker, daemon=True).start()
            return

        # Reset and try to return to pool
        worker.reset()
        try:
            with self.lock:
                pool_running = self.running
                current_size = self.workers.qsize()
                returned = pool_running and current_size < self.pool_size
                if returned:
                    self.workers.put_nowait(worker)
        except Exception as e:
            # Unexpected error, shutdown worker
            log.error(f"Failed to release {worker.worker_id}: {e}")
            worker.shutdown()
            return

        if returned:
            log.debug(f"Released {worker.worker_id} back to pool ({current_size + 1}/{self.pool_size})")
            return

        if pool_running:
            # Pool already at capacity, recycle this extra worker
            log.debug(f"Pool at capacity ({current_size}/{self.pool_size}), recycling extra {worker.worker_id}")
        else:
            log.debug(f"Pool stopped, shutting down {worker.worker_id}")
        worker.shutdown()

    def _maintenance_loop(self):
        """Background maintenance thread: keep the idle queue at capacity."""
        while self.running:
            try:
                needed = self.pool_size - self.workers.qsize()
                if needed > 0:
                    log.debug(f"Pool needs {needed} more workers")
                    for _ in range(needed):
                        if not self.running:
                            break
                        self._create_worker()

                # Wait up to 10 seconds; returns early once stop() sets the event.
                self._stop_event.wait(10)
            except Exception as e:
                log.error(f"Maintenance loop error: {e}")
                self._stop_event.wait(5)

    def get_stats(self) -> Dict[str, Any]:
        """Get pool statistics."""
        with self.lock:
            active_count = len(self.active_workers)

        return {
            "pool_size": self.pool_size,
            "idle_workers": self.workers.qsize(),
            "active_workers": active_count,
            "total_workers_created": self.worker_counter,
            "headless": self.headless
        }


# Global worker pool instances
validation_pool: Optional[ChromeWorkerPool] = None
scraping_pool: Optional[ChromeWorkerPool] = None


def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
    """
    Start global worker pools.

    Args:
        validation_size: Number of workers for validation checks
        scraping_size: Number of workers for scraping jobs
        headless: Run Chrome in headless mode
    """
    global validation_pool, scraping_pool

    log.info("Starting global Chrome worker pools...")

    validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
    validation_pool.start()

    scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
    scraping_pool.start()

    log.info("Global Chrome worker pools started")


def stop_worker_pools():
    """Stop global worker pools and reset the module-level references."""
    global validation_pool, scraping_pool

    log.info("Stopping global Chrome worker pools...")

    if validation_pool:
        validation_pool.stop()
        validation_pool = None

    if scraping_pool:
        scraping_pool.stop()
        scraping_pool = None

    log.info("Global Chrome worker pools stopped")


def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
    """Get a worker for validation check, or None if no pool or timeout."""
    if validation_pool:
        return validation_pool.acquire_worker(timeout=timeout)
    return None


def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
    """Release a validation worker (no-op when the pool is not started)."""
    if validation_pool:
        validation_pool.release_worker(worker, recycle=recycle)


def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
    """Get a worker for scraping, or None if no pool or timeout."""
    if scraping_pool:
        return scraping_pool.acquire_worker(timeout=timeout)
    return None


def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
    """Release a scraping worker (no-op when the pool is not started)."""
    if scraping_pool:
        scraping_pool.release_worker(worker, recycle=recycle)


def get_pool_stats() -> Dict[str, Any]:
    """Get statistics for all pools that are currently started."""
    stats = {}

    if validation_pool:
        stats['validation'] = validation_pool.get_stats()

    if scraping_pool:
        stats['scraping'] = scraping_pool.get_stats()

    return stats