Files
whyrating-engine-legacy/modules/chrome_pool.py
Alejandro Gutiérrez 8ccf72a489 Remove old scraper files - consolidate to scraper_clean
Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 17:25:00 +00:00

389 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Chrome Worker Pool Manager
Maintains a pool of idle Chrome instances for faster scraping.
Pre-warms browsers on startup to eliminate cold-start delays.
"""
import asyncio
import logging
import threading
import time
from queue import Empty, Full, Queue
from typing import Any, Dict, Optional

from seleniumbase import Driver
log = logging.getLogger(__name__)
class ChromeWorker:
    """One Chrome browser instance plus the bookkeeping a pool needs to manage it.

    The browser is not started by the constructor; call initialize() to launch
    it. ``driver`` is None before initialize() succeeds and after shutdown().
    """

    # Chrome flags passed to every worker (chosen for stability inside Docker).
    _CHROME_ARGS = (
        "--disable-dev-shm-usage",  # Use /tmp instead of /dev/shm (critical for Docker)
        "--disable-gpu",  # Disable GPU acceleration
        "--no-sandbox",  # Required for Docker
        "--disable-software-rasterizer",
        "--disable-extensions",
        "--disable-background-networking",
        "--disable-default-apps",
        "--disable-sync",
        "--metrics-recording-only",
        "--mute-audio",
        "--no-first-run",
        "--safebrowsing-disable-auto-update",
    )

    def __init__(self, worker_id: str, headless: bool = True):
        """Record the worker's identity; no browser is launched here.

        Args:
            worker_id: Label used in every log line for this worker.
            headless: Whether initialize() should start Chrome headless.
        """
        self.worker_id = worker_id
        self.headless = headless
        self.driver: Optional[Driver] = None
        self.created_at: Optional[float] = None  # time.time() of successful initialize()
        self.last_used: Optional[float] = None  # time.time() of the latest use
        self.use_count = 0  # incremented by the pool on every acquire
        self.is_busy = False

    def initialize(self) -> bool:
        """Launch Chrome and configure timeouts, geolocation and window size.

        Returns:
            True when the browser is ready, False when any step failed. On
            failure a browser that was already launched is quit again so no
            orphaned Chrome process is left behind.
        """
        try:
            log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")
            # uc=True turns on SeleniumBase's undetected-chromedriver mode.
            self.driver = Driver(
                uc=True,
                headless=self.headless,
                page_load_strategy="normal",
                chromium_arg=",".join(self._CHROME_ARGS),
            )
            # Generous timeouts for large scraping jobs.
            self.driver.set_page_load_timeout(120)  # 2 minutes for slow networks
            self.driver.set_script_timeout(60)  # 1 minute for complex extraction

            # Pin geolocation to Boston, MA so location-dependent results stay
            # consistent. Best effort: a failure here is only logged.
            try:
                self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': 42.3601,
                    'longitude': -71.0589,
                    'accuracy': 100
                })
                log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
            except Exception as e:
                log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")

            self.driver.maximize_window()
            self.created_at = time.time()
            self.last_used = time.time()
            log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
            return True
        except Exception as e:
            log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
            # The browser may have launched before a later step raised; quit it
            # instead of leaking the process.
            if self.driver is not None:
                self.shutdown()
            return False

    def reset(self):
        """Clear cookies, localStorage and sessionStorage (best effort).

        Failures are logged as warnings and never raised.
        """
        try:
            if self.driver:
                self.driver.delete_all_cookies()
                self.driver.execute_script("window.localStorage.clear();")
                self.driver.execute_script("window.sessionStorage.clear();")
            log.debug(f"Worker {self.worker_id}: Reset complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Reset failed: {e}")

    def shutdown(self):
        """Quit the browser if there is one; ``driver`` is always None afterwards."""
        try:
            if self.driver:
                self.driver.quit()
            log.info(f"Worker {self.worker_id}: Shutdown complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
        finally:
            self.driver = None

    def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50):
        """Tell whether this worker should be replaced by a fresh one.

        Args:
            max_age_seconds: Recycle once the browser is older than this.
            max_uses: Recycle once the worker has been used this many times.

        Returns:
            True when there is no browser, the browser is too old, or the use
            budget is spent; False otherwise.
        """
        if not self.driver:
            return True
        age = time.time() - self.created_at if self.created_at is not None else 0
        if age > max_age_seconds:
            log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
            return True
        # ">=" so a worker serves at most max_uses jobs (">" allowed one extra).
        if self.use_count >= max_uses:
            log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
            return True
        return False
class ChromeWorkerPool:
    """
    Pool of Chrome worker instances for faster scraping.

    Idle workers wait in a bounded queue so a task can start immediately.
    Workers handed out by acquire_worker() are tracked in ``active_workers``
    until release_worker() returns them. Workers are recycled after max age
    or max uses to prevent memory leaks.

    NOTE(review): the maintenance loop tops up the *idle* queue to pool_size
    without counting checked-out workers, so the number of live browsers can
    exceed pool_size while jobs are running; surplus workers are shut down on
    release. This matches the documented "idle workers to maintain" meaning of
    pool_size - confirm it is intended.
    """

    _MAINTENANCE_INTERVAL = 10  # seconds between top-up passes
    _MAINTENANCE_ERROR_DELAY = 5  # seconds to wait after a failed pass

    def __init__(self, pool_size: int = 2, headless: bool = True):
        """
        Initialize worker pool (no browser is started until start()).

        Args:
            pool_size: Number of idle workers to maintain
            headless: Run Chrome in headless mode
        """
        self.pool_size = pool_size
        self.headless = headless
        self.workers: Queue[ChromeWorker] = Queue(maxsize=pool_size)
        self.active_workers: Dict[str, ChromeWorker] = {}
        self.worker_counter = 0
        self.lock = threading.Lock()  # guards active_workers and worker_counter
        self.running = False
        self.maintenance_thread = None
        # Set by stop() so the maintenance thread wakes up at once instead of
        # sleeping out its full interval.
        self._stop_event = threading.Event()

    def start(self):
        """Pre-warm pool_size workers and start the maintenance thread."""
        if self.running:
            # A second start() would spawn a second maintenance thread.
            log.warning("Chrome worker pool already running; ignoring start()")
            return
        log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
        self._stop_event.clear()
        self.running = True
        # Pre-warm workers
        for _ in range(self.pool_size):
            self._create_worker()
        # Start maintenance thread
        self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.maintenance_thread.start()
        log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")

    def stop(self):
        """Stop maintenance and shut down every idle and active worker."""
        log.info("Stopping Chrome worker pool...")
        self.running = False
        self._stop_event.set()  # interrupt the maintenance thread's wait
        if self.maintenance_thread:
            self.maintenance_thread.join(timeout=5)
        # Shutdown all idle workers
        while True:
            try:
                worker = self.workers.get_nowait()
            except Empty:
                break
            worker.shutdown()
        # Snapshot the active workers under the lock, quit them outside it so
        # slow browser shutdowns do not block other threads on the lock.
        with self.lock:
            active = list(self.active_workers.values())
            self.active_workers.clear()
        for worker in active:
            worker.shutdown()
        log.info("Chrome worker pool stopped")

    def _create_worker(self) -> Optional[ChromeWorker]:
        """Create a worker and add it to the idle queue.

        Returns:
            The new worker, or None when the pool is stopped, Chrome failed to
            start, or the idle queue was already full (the surplus worker is
            shut down).
        """
        if not self.running:
            return None
        with self.lock:
            self.worker_counter += 1
            worker_id = f"worker-{self.worker_counter}"
        worker = ChromeWorker(worker_id, headless=self.headless)
        if not worker.initialize():
            return None
        # Launching Chrome takes a while; if stop() ran meanwhile, do not put
        # a live browser into a pool nobody will drain again.
        if not self.running:
            worker.shutdown()
            return None
        try:
            self.workers.put_nowait(worker)
        except Full:
            worker.shutdown()
            return None
        return worker

    def acquire_worker(self, timeout: float = 30) -> Optional[ChromeWorker]:
        """
        Acquire a worker from the pool.

        Args:
            timeout: Maximum time to wait for a worker

        Returns:
            ChromeWorker instance or None if timeout
        """
        try:
            worker = self.workers.get(timeout=timeout)
        except Empty:
            log.warning(f"Failed to acquire worker within {timeout}s")
            return None
        worker.is_busy = True
        worker.last_used = time.time()
        worker.use_count += 1
        with self.lock:
            self.active_workers[worker.worker_id] = worker
        log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")
        # No replacement is created here: the worker comes back on release and
        # the maintenance thread tops the idle queue up.
        return worker

    def release_worker(self, worker: ChromeWorker, recycle: bool = False):
        """
        Release a worker back to the pool.

        A worker that is not currently checked out from this pool (double
        release, release after stop(), worker of another pool) is ignored.

        Args:
            worker: Worker to release
            recycle: Force worker recycling
        """
        with self.lock:
            # Identity check: worker ids are only unique within one pool.
            tracked = self.active_workers.get(worker.worker_id) is worker
            if tracked:
                del self.active_workers[worker.worker_id]
        if not tracked:
            log.warning(f"Ignoring release of {worker.worker_id}: not checked out from this pool")
            return
        worker.is_busy = False
        # Check if worker should be recycled
        if recycle or worker.should_recycle():
            log.info(f"Recycling {worker.worker_id}")
            worker.shutdown()
            if self.running:
                # Create replacement worker in background
                threading.Thread(target=self._create_worker, daemon=True).start()
            return
        if not self.running:
            # Pool was stopped while this worker was out; nothing to return to.
            worker.shutdown()
            return
        # Reset and return to pool
        worker.reset()
        try:
            self.workers.put_nowait(worker)
        except Full:
            # Maintenance already refilled the idle queue; drop the surplus.
            log.debug(f"Pool at capacity ({self.workers.qsize()}/{self.pool_size}), recycling extra {worker.worker_id}")
            worker.shutdown()
        else:
            log.debug(f"Released {worker.worker_id} back to pool ({self.workers.qsize()}/{self.pool_size})")

    def _maintenance_loop(self):
        """Background thread body: keep the idle queue topped up until stopped."""
        while self.running:
            delay = self._MAINTENANCE_INTERVAL
            try:
                # Ensure pool is at capacity
                needed = self.pool_size - self.workers.qsize()
                if needed > 0:
                    log.debug(f"Pool needs {needed} more workers")
                    for _ in range(needed):
                        if not self.running:
                            break
                        self._create_worker()
            except Exception as e:
                log.error(f"Maintenance loop error: {e}")
                delay = self._MAINTENANCE_ERROR_DELAY
            # wait() returns True as soon as stop() sets the event.
            if self._stop_event.wait(delay):
                break

    def get_stats(self) -> Dict[str, Any]:
        """Get pool statistics"""
        with self.lock:
            active_count = len(self.active_workers)
        return {
            "pool_size": self.pool_size,
            "idle_workers": self.workers.qsize(),
            "active_workers": active_count,
            "total_workers_created": self.worker_counter,
            "headless": self.headless
        }
# Global worker pool instances.
# Both are None until start_worker_pools() assigns them, and stop_worker_pools()
# sets them back to None. The module-level get_*/release_* helpers do nothing
# (or return None) while the corresponding pool is unset.
validation_pool: Optional[ChromeWorkerPool] = None  # serves validation checks
scraping_pool: Optional[ChromeWorkerPool] = None  # serves scraping jobs
def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
    """
    Start global worker pools.

    If pools from an earlier call are still registered they are stopped first,
    so their browsers and maintenance threads are not leaked when the globals
    are replaced.

    Args:
        validation_size: Number of workers for validation checks
        scraping_size: Number of workers for scraping jobs
        headless: Run Chrome in headless mode
    """
    global validation_pool, scraping_pool
    log.info("Starting global Chrome worker pools...")
    # Overwriting a live pool would orphan its Chrome processes, so shut the
    # previous ones down before creating replacements.
    for stale_pool in (validation_pool, scraping_pool):
        if stale_pool is not None:
            log.warning("Global Chrome worker pool already started; stopping it before restart")
            stale_pool.stop()
    validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
    validation_pool.start()
    scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
    scraping_pool.start()
    log.info("Global Chrome worker pools started")
def stop_worker_pools():
    """Stop whichever global pools are set and reset their globals to None."""
    global validation_pool, scraping_pool
    log.info("Stopping global Chrome worker pools...")

    current_validation = validation_pool
    if current_validation:
        current_validation.stop()
        validation_pool = None

    current_scraping = scraping_pool
    if current_scraping:
        current_scraping.stop()
        scraping_pool = None

    log.info("Global Chrome worker pools stopped")
def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
    """Acquire a worker from the global validation pool.

    Args:
        timeout: Seconds passed on to the pool's acquire_worker().

    Returns:
        Whatever the pool's acquire_worker() returns, or None when the
        validation pool is not set.
    """
    if not validation_pool:
        return None
    return validation_pool.acquire_worker(timeout=timeout)
def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
    """Hand a worker back to the global validation pool.

    Does nothing when the validation pool is not set.

    Args:
        worker: Worker to release.
        recycle: Passed through to the pool's release_worker().
    """
    if not validation_pool:
        return
    validation_pool.release_worker(worker, recycle=recycle)
def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
    """Acquire a worker from the global scraping pool.

    Args:
        timeout: Seconds passed on to the pool's acquire_worker().

    Returns:
        Whatever the pool's acquire_worker() returns, or None when the
        scraping pool is not set.
    """
    if not scraping_pool:
        return None
    return scraping_pool.acquire_worker(timeout=timeout)
def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
    """Hand a worker back to the global scraping pool.

    Does nothing when the scraping pool is not set.

    Args:
        worker: Worker to release.
        recycle: Passed through to the pool's release_worker().
    """
    if not scraping_pool:
        return
    scraping_pool.release_worker(worker, recycle=recycle)
def get_pool_stats() -> Dict[str, Any]:
    """Collect get_stats() from every global pool that is currently set.

    Returns:
        A dict with a 'validation' and/or 'scraping' entry; a pool that is
        not set contributes no entry, so the result may be empty.
    """
    labelled_pools = (
        ('validation', validation_pool),
        ('scraping', scraping_pool),
    )
    return {label: pool.get_stats() for label, pool in labelled_pools if pool}