""" Scraper Registry This module provides a database-backed registry for managing and routing scraper requests. It supports: - Version-based routing (exact version or latest for variant) - A/B testing via traffic_pct weighted selection - Priority-based scraper filtering - Caching with TTL for performance """ import asyncio import logging import random import time from dataclasses import dataclass from typing import Any, Dict, List, Optional, Type from scrapers.base import BaseScraper log = logging.getLogger(__name__) @dataclass class ScraperInfo: """Information about a registered scraper.""" id: str job_type: str version: str variant: str module_path: str function_name: str is_default: bool traffic_pct: int min_priority: int config: Optional[Dict[str, Any]] deprecated_at: Optional[str] class ScraperRegistry: """ Routes scraping requests to appropriate scraper versions. Supports A/B testing via traffic_pct and variant selection. This registry is backed by the scraper_registry database table and provides weighted random selection for A/B testing scenarios. Usage: registry = ScraperRegistry(db) scraper_info = await registry.get_scraper("google_reviews") # scraper_info contains module_path, function_name, config, etc. """ def __init__(self, db: "DatabaseManager"): # noqa: F821 - forward reference """ Initialize the scraper registry. Args: db: DatabaseManager instance for database access """ self.db = db self._cache: Dict[str, List[ScraperInfo]] = {} # Cache by job_type self._cache_timestamp: float = 0 self._cache_ttl: int = 60 # Refresh cache every 60 seconds self._cache_lock = asyncio.Lock() async def get_scraper( self, job_type: str, version: str = None, variant: str = None, priority: int = 0 ) -> Optional[Dict[str, Any]]: """ Get scraper info for a job. Priority order: 1. If version specified, return exact match 2. If variant specified, return latest active scraper of that variant 3. Otherwise, use A/B routing based on traffic_pct Args: job_type: Type of scraping job (e.g., "google_reviews") version: Optional specific version to use (e.g., "1.0.0") variant: Optional variant filter ("stable", "beta", "canary") priority: Job priority level for min_priority filtering Returns: Dictionary containing scraper info: { "version": "1.0.0", "variant": "stable", "module_path": "scrapers.google_reviews.v1_0_0", "function_name": "fast_scrape_reviews", "config": {...} } Returns None if no matching scraper found. """ # Ensure cache is fresh await self._ensure_cache_fresh() # Get all scrapers for this job type scrapers = self._cache.get(job_type, []) if not scrapers: log.warning(f"No scrapers registered for job_type: {job_type}") return None # Filter out deprecated scrapers active_scrapers = [s for s in scrapers if s.deprecated_at is None] if not active_scrapers: log.warning(f"All scrapers for job_type {job_type} are deprecated") return None selected: Optional[ScraperInfo] = None # Priority 1: Exact version match if version: selected = self._find_by_version(active_scrapers, version) if selected: log.debug(f"Selected scraper by exact version: {version}") else: log.warning(f"Requested version {version} not found for {job_type}") return None # Priority 2: Latest for variant elif variant: selected = self._find_latest_for_variant(active_scrapers, variant) if selected: log.debug(f"Selected latest scraper for variant {variant}: {selected.version}") else: log.warning(f"No active scrapers found for variant {variant} in {job_type}") return None # Priority 3: A/B weighted selection else: selected = await self._get_weighted_scraper(job_type, priority) if selected: log.debug(f"Selected scraper via A/B routing: {selected.version} ({selected.variant})") if not selected: return None return self._scraper_to_dict(selected) async def _get_weighted_scraper( self, job_type: str, priority: int ) -> Optional[ScraperInfo]: """ Select scraper based on traffic weights. Uses random selection weighted by traffic_pct. Filters by min_priority. Args: job_type: Type of scraping job priority: Job priority level Returns: Selected ScraperInfo or None if no eligible scrapers """ scrapers = self._cache.get(job_type, []) # Filter: active, has traffic allocation, and meets priority requirement eligible = [ s for s in scrapers if s.deprecated_at is None and s.traffic_pct > 0 and s.min_priority <= priority ] if not eligible: # Fall back to default scraper default = self._find_default(scrapers) if default and default.min_priority <= priority: log.debug(f"No eligible A/B scrapers, using default: {default.version}") return default log.warning(f"No eligible scrapers for job_type {job_type} with priority {priority}") return None # Weighted random selection total_weight = sum(s.traffic_pct for s in eligible) if total_weight == 0: # Equal probability if all have 0 traffic_pct return random.choice(eligible) # Generate random number in range [0, total_weight) rand_value = random.random() * total_weight cumulative = 0 for scraper in eligible: cumulative += scraper.traffic_pct if rand_value < cumulative: return scraper # Fallback (shouldn't reach here, but safety) return eligible[-1] async def refresh_cache(self) -> None: """ Reload registry from database. This method forces a cache refresh regardless of TTL. Thread-safe via asyncio lock. """ async with self._cache_lock: await self._load_cache() async def _ensure_cache_fresh(self) -> None: """Ensure cache is loaded and not stale.""" current_time = time.time() if ( not self._cache or (current_time - self._cache_timestamp) > self._cache_ttl ): async with self._cache_lock: # Double-check after acquiring lock if ( not self._cache or (current_time - self._cache_timestamp) > self._cache_ttl ): await self._load_cache() async def _load_cache(self) -> None: """Load all scraper registry entries from database.""" try: async with self.db.pool.acquire() as conn: rows = await conn.fetch(""" SELECT id, job_type, version, variant, module_path, function_name, is_default, traffic_pct, min_priority, config, deprecated_at FROM scraper_registry ORDER BY job_type, version DESC """) # Group by job_type self._cache.clear() for row in rows: scraper_info = ScraperInfo( id=str(row['id']), job_type=row['job_type'], version=row['version'], variant=row['variant'], module_path=row['module_path'], function_name=row['function_name'], is_default=row['is_default'], traffic_pct=row['traffic_pct'], min_priority=row['min_priority'], config=row['config'], deprecated_at=str(row['deprecated_at']) if row['deprecated_at'] else None ) if scraper_info.job_type not in self._cache: self._cache[scraper_info.job_type] = [] self._cache[scraper_info.job_type].append(scraper_info) self._cache_timestamp = time.time() log.info(f"Scraper registry cache loaded: {sum(len(v) for v in self._cache.values())} entries") except Exception as e: log.error(f"Failed to load scraper registry cache: {e}") raise async def list_scrapers( self, job_type: str = None, include_deprecated: bool = False ) -> List[Dict[str, Any]]: """ List registered scrapers, optionally filtered by job_type. Args: job_type: Optional job type filter include_deprecated: Whether to include deprecated scrapers Returns: List of scraper info dictionaries """ await self._ensure_cache_fresh() result = [] if job_type: scrapers = self._cache.get(job_type, []) else: scrapers = [s for scrapers_list in self._cache.values() for s in scrapers_list] for scraper in scrapers: if not include_deprecated and scraper.deprecated_at: continue result.append(self._scraper_to_dict(scraper)) return result async def get_scraper_by_id(self, scraper_id: str) -> Optional[Dict[str, Any]]: """ Get a specific scraper by its database ID. Args: scraper_id: UUID of the scraper registry entry Returns: Scraper info dictionary or None if not found """ await self._ensure_cache_fresh() for scrapers_list in self._cache.values(): for scraper in scrapers_list: if scraper.id == scraper_id: return self._scraper_to_dict(scraper) return None async def register_scraper( self, job_type: str, version: str, variant: str, module_path: str, function_name: str, is_default: bool = False, traffic_pct: int = 0, min_priority: int = 0, config: Optional[Dict[str, Any]] = None ) -> str: """ Register a new scraper version in the database. Args: job_type: Type of scraping job version: Semantic version string variant: Release channel ("stable", "beta", "canary") module_path: Python module path function_name: Entry function name is_default: Whether this is the default fallback traffic_pct: Traffic percentage for A/B testing (0-100) min_priority: Minimum job priority required config: Optional configuration dictionary Returns: UUID of created registry entry Raises: ValueError: If variant is invalid or traffic_pct out of range """ if variant not in ('stable', 'beta', 'canary'): raise ValueError(f"Invalid variant: {variant}. Must be 'stable', 'beta', or 'canary'") if not 0 <= traffic_pct <= 100: raise ValueError(f"traffic_pct must be between 0 and 100, got: {traffic_pct}") import json async with self.db.pool.acquire() as conn: scraper_id = await conn.fetchval(""" INSERT INTO scraper_registry ( job_type, version, variant, module_path, function_name, is_default, traffic_pct, min_priority, config ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::jsonb) RETURNING id """, job_type, version, variant, module_path, function_name, is_default, traffic_pct, min_priority, json.dumps(config) if config else None) # Invalidate cache self._cache_timestamp = 0 log.info(f"Registered scraper: {job_type} v{version} ({variant})") return str(scraper_id) async def deprecate_scraper( self, job_type: str, version: str ) -> bool: """ Deprecate a scraper version (soft delete). Args: job_type: Type of scraping job version: Version to deprecate Returns: True if deprecated, False if not found """ async with self.db.pool.acquire() as conn: result = await conn.execute(""" UPDATE scraper_registry SET deprecated_at = NOW() WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL """, job_type, version) updated = result.split()[-1] == "1" if updated: self._cache_timestamp = 0 # Invalidate cache log.info(f"Deprecated scraper: {job_type} v{version}") return updated async def update_traffic_allocation( self, job_type: str, allocations: Dict[str, int] ) -> None: """ Update traffic allocations for multiple scrapers atomically. Args: job_type: Type of scraping job allocations: Dict mapping version to traffic_pct e.g., {"1.0.0": 90, "1.1.0-beta": 10} Raises: ValueError: If total exceeds 100 or any value is invalid """ total = sum(allocations.values()) if total > 100: raise ValueError(f"Total traffic allocation cannot exceed 100, got: {total}") for version, pct in allocations.items(): if not 0 <= pct <= 100: raise ValueError(f"Invalid traffic_pct for {version}: {pct}") async with self.db.pool.acquire() as conn: async with conn.transaction(): for version, traffic_pct in allocations.items(): await conn.execute(""" UPDATE scraper_registry SET traffic_pct = $3 WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL """, job_type, version, traffic_pct) # Invalidate cache self._cache_timestamp = 0 log.info(f"Updated traffic allocations for {job_type}: {allocations}") # ==================== Helper Methods ==================== def _find_by_version( self, scrapers: List[ScraperInfo], version: str ) -> Optional[ScraperInfo]: """Find scraper by exact version match.""" for scraper in scrapers: if scraper.version == version: return scraper return None def _find_latest_for_variant( self, scrapers: List[ScraperInfo], variant: str ) -> Optional[ScraperInfo]: """Find latest (first in sorted list) scraper for a variant.""" for scraper in scrapers: if scraper.variant == variant: return scraper return None def _find_default( self, scrapers: List[ScraperInfo] ) -> Optional[ScraperInfo]: """Find default scraper for fallback.""" for scraper in scrapers: if scraper.is_default and scraper.deprecated_at is None: return scraper return None def _scraper_to_dict(self, scraper: ScraperInfo) -> Dict[str, Any]: """Convert ScraperInfo to dictionary for API responses.""" return { "id": scraper.id, "version": scraper.version, "variant": scraper.variant, "module_path": scraper.module_path, "function_name": scraper.function_name, "is_default": scraper.is_default, "traffic_pct": scraper.traffic_pct, "min_priority": scraper.min_priority, "config": scraper.config, "deprecated": scraper.deprecated_at is not None } # ==================== Legacy Singleton Registry ==================== # Kept for backward compatibility with existing code that uses # the old class-based scraper registration pattern. class LegacyScraperRegistry: """ Legacy registry for managing scraper implementations. This class provides backward compatibility with the old scraper registration pattern using class-based scrapers. New code should use the database-backed ScraperRegistry instead. The registry allows: - Registering scrapers by name and version - Looking up scrapers by domain or name - Listing all available scrapers Usage: registry = LegacyScraperRegistry() registry.register(GoogleReviewsScraper) scraper = registry.get_scraper_for_url("https://google.com/maps/place/...") """ _instance: Optional["LegacyScraperRegistry"] = None _scrapers: Dict[str, Type[BaseScraper]] def __new__(cls) -> "LegacyScraperRegistry": """Singleton pattern to ensure one global registry.""" if cls._instance is None: cls._instance = super().__new__(cls) cls._instance._scrapers = {} cls._instance._domain_map = {} return cls._instance def register(self, scraper_class: Type[BaseScraper], name: Optional[str] = None) -> None: """ Register a scraper class with the registry. Args: scraper_class: The scraper class to register (must inherit from BaseScraper) name: Optional name override, defaults to scraper_class.name property """ # Create a temporary instance to get properties # Note: In production, we might want scraper_class to have class-level properties instance = scraper_class.__new__(scraper_class) scraper_name = name or instance.name scraper_version = instance.version key = f"{scraper_name}:{scraper_version}" self._scrapers[key] = scraper_class # Map domains to this scraper for domain in instance.supported_domains: if domain not in self._domain_map: self._domain_map[domain] = [] self._domain_map[domain].append(key) def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type[BaseScraper]]: """ Get a scraper class by name and optional version. Args: name: The scraper name version: Optional version string. If not provided, returns the latest. Returns: The scraper class, or None if not found """ if version: key = f"{name}:{version}" return self._scrapers.get(key) # Find latest version for this name matching = [k for k in self._scrapers.keys() if k.startswith(f"{name}:")] if not matching: return None # Sort by version and return latest matching.sort(reverse=True) return self._scrapers.get(matching[0]) def get_scraper_for_url(self, url: str) -> Optional[Type[BaseScraper]]: """ Find a suitable scraper for the given URL. Args: url: The URL to find a scraper for Returns: The scraper class that can handle this URL, or None if no match """ from urllib.parse import urlparse parsed = urlparse(url) domain = parsed.netloc.lower() # Remove www. prefix for matching if domain.startswith("www."): domain = domain[4:] scraper_keys = self._domain_map.get(domain, []) if not scraper_keys: return None # Return the latest version scraper_keys.sort(reverse=True) return self._scrapers.get(scraper_keys[0]) def list_scrapers(self) -> List[Dict[str, str]]: """ List all registered scrapers. Returns: List of dictionaries with scraper info (name, version, domains) """ result = [] for key, scraper_class in self._scrapers.items(): instance = scraper_class.__new__(scraper_class) result.append({ "name": instance.name, "version": instance.version, "domains": instance.supported_domains }) return result def clear(self) -> None: """Clear all registered scrapers. Useful for testing.""" self._scrapers.clear() self._domain_map.clear() # Global legacy registry instance (for backward compatibility) registry = LegacyScraperRegistry()