# whyrating-engine-legacy/scrapers/registry.py

"""
Scraper Registry

This module provides a database-backed registry for managing and routing
scraper requests. It supports:
- Version-based routing (exact version or latest for variant)
- A/B testing via traffic_pct weighted selection
- Priority-based scraper filtering
- Caching with TTL for performance
"""

import asyncio
import logging
import random
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Type

from scrapers.base import BaseScraper

log = logging.getLogger(__name__)


@dataclass
class ScraperInfo:
    """
    Information about a registered scraper.

    One instance mirrors one row of the ``scraper_registry`` table as it is
    held in the registry's in-memory cache.
    """
    id: str  # Registry row id, converted to a string when loaded
    job_type: str  # Job type this scraper serves (e.g. "google_reviews")
    version: str  # Version string of this registry entry
    variant: str  # Release channel: "stable", "beta" or "canary"
    module_path: str  # Python module path of the scraper implementation
    function_name: str  # Entry function name inside module_path
    is_default: bool  # Fallback entry used when A/B routing has no eligible scraper
    traffic_pct: int  # Weight for A/B selection; 0 keeps the entry out of weighted routing
    min_priority: int  # Lowest job priority that may be routed to this scraper
    config: Optional[Dict[str, Any]]  # Optional scraper-specific configuration
    deprecated_at: Optional[str]  # String form of the deprecation timestamp; None while active


class ScraperRegistry:
    """
    Routes scraping requests to appropriate scraper versions.
    Supports A/B testing via traffic_pct and variant selection.

    This registry is backed by the scraper_registry database table and
    provides weighted random selection for A/B testing scenarios.

    Usage:
        registry = ScraperRegistry(db)
        scraper_info = await registry.get_scraper("google_reviews")
        # scraper_info contains module_path, function_name, config, etc.
    """

    def __init__(self, db: "DatabaseManager"):  # noqa: F821 - forward reference
        """
        Initialize the scraper registry.

        Args:
            db: DatabaseManager instance for database access
        """
        self.db = db
        self._cache: Dict[str, List[ScraperInfo]] = {}  # Cache by job_type
        self._cache_timestamp: float = 0
        self._cache_ttl: int = 60  # Refresh cache every 60 seconds
        self._cache_lock = asyncio.Lock()

    async def get_scraper(
        self,
        job_type: str,
        version: str = None,
        variant: str = None,
        priority: int = 0
    ) -> Optional[Dict[str, Any]]:
        """
        Get scraper info for a job.

        Priority order:
        1. If version specified, return exact match
        2. If variant specified, return latest active scraper of that variant
        3. Otherwise, use A/B routing based on traffic_pct

        Args:
            job_type: Type of scraping job (e.g., "google_reviews")
            version: Optional specific version to use (e.g., "1.0.0")
            variant: Optional variant filter ("stable", "beta", "canary")
            priority: Job priority level for min_priority filtering

        Returns:
            Dictionary containing scraper info:
            {
                "version": "1.0.0",
                "variant": "stable",
                "module_path": "scrapers.google_reviews.v1_0_0",
                "function_name": "fast_scrape_reviews",
                "config": {...}
            }

            Returns None if no matching scraper found.
        """
        # Ensure cache is fresh
        await self._ensure_cache_fresh()

        # Get all scrapers for this job type
        scrapers = self._cache.get(job_type, [])
        if not scrapers:
            log.warning(f"No scrapers registered for job_type: {job_type}")
            return None

        # Filter out deprecated scrapers
        active_scrapers = [s for s in scrapers if s.deprecated_at is None]
        if not active_scrapers:
            log.warning(f"All scrapers for job_type {job_type} are deprecated")
            return None

        selected: Optional[ScraperInfo] = None

        # Priority 1: Exact version match
        if version:
            selected = self._find_by_version(active_scrapers, version)
            if selected:
                log.debug(f"Selected scraper by exact version: {version}")
            else:
                log.warning(f"Requested version {version} not found for {job_type}")
                return None

        # Priority 2: Latest for variant
        elif variant:
            selected = self._find_latest_for_variant(active_scrapers, variant)
            if selected:
                log.debug(f"Selected latest scraper for variant {variant}: {selected.version}")
            else:
                log.warning(f"No active scrapers found for variant {variant} in {job_type}")
                return None

        # Priority 3: A/B weighted selection
        else:
            selected = await self._get_weighted_scraper(job_type, priority)
            if selected:
                log.debug(f"Selected scraper via A/B routing: {selected.version} ({selected.variant})")

        if not selected:
            return None

        return self._scraper_to_dict(selected)

    async def _get_weighted_scraper(
        self,
        job_type: str,
        priority: int
    ) -> Optional[ScraperInfo]:
        """
        Select scraper based on traffic weights.
        Uses random selection weighted by traffic_pct.
        Filters by min_priority.

        Args:
            job_type: Type of scraping job
            priority: Job priority level

        Returns:
            Selected ScraperInfo or None if no eligible scrapers
        """
        scrapers = self._cache.get(job_type, [])

        # Filter: active, has traffic allocation, and meets priority requirement
        eligible = [
            s for s in scrapers
            if s.deprecated_at is None
            and s.traffic_pct > 0
            and s.min_priority <= priority
        ]

        if not eligible:
            # Fall back to default scraper
            default = self._find_default(scrapers)
            if default and default.min_priority <= priority:
                log.debug(f"No eligible A/B scrapers, using default: {default.version}")
                return default
            log.warning(f"No eligible scrapers for job_type {job_type} with priority {priority}")
            return None

        # Weighted random selection
        total_weight = sum(s.traffic_pct for s in eligible)
        if total_weight == 0:
            # Equal probability if all have 0 traffic_pct
            return random.choice(eligible)

        # Generate random number in range [0, total_weight)
        rand_value = random.random() * total_weight
        cumulative = 0

        for scraper in eligible:
            cumulative += scraper.traffic_pct
            if rand_value < cumulative:
                return scraper

        # Fallback (shouldn't reach here, but safety)
        return eligible[-1]

    async def refresh_cache(self) -> None:
        """
        Reload registry from database.

        This method forces a cache refresh regardless of TTL.
        Thread-safe via asyncio lock.
        """
        async with self._cache_lock:
            await self._load_cache()

    async def _ensure_cache_fresh(self) -> None:
        """Ensure cache is loaded and not stale."""
        current_time = time.time()
        if (
            not self._cache
            or (current_time - self._cache_timestamp) > self._cache_ttl
        ):
            async with self._cache_lock:
                # Double-check after acquiring lock
                if (
                    not self._cache
                    or (current_time - self._cache_timestamp) > self._cache_ttl
                ):
                    await self._load_cache()

    async def _load_cache(self) -> None:
        """Load all scraper registry entries from database."""
        try:
            async with self.db.pool.acquire() as conn:
                rows = await conn.fetch("""
                    SELECT
                        id,
                        job_type,
                        version,
                        variant,
                        module_path,
                        function_name,
                        is_default,
                        traffic_pct,
                        min_priority,
                        config,
                        deprecated_at
                    FROM scraper_registry
                    ORDER BY job_type, version DESC
                """)

            # Group by job_type
            self._cache.clear()
            for row in rows:
                scraper_info = ScraperInfo(
                    id=str(row['id']),
                    job_type=row['job_type'],
                    version=row['version'],
                    variant=row['variant'],
                    module_path=row['module_path'],
                    function_name=row['function_name'],
                    is_default=row['is_default'],
                    traffic_pct=row['traffic_pct'],
                    min_priority=row['min_priority'],
                    config=row['config'],
                    deprecated_at=str(row['deprecated_at']) if row['deprecated_at'] else None
                )

                if scraper_info.job_type not in self._cache:
                    self._cache[scraper_info.job_type] = []
                self._cache[scraper_info.job_type].append(scraper_info)

            self._cache_timestamp = time.time()
            log.info(f"Scraper registry cache loaded: {sum(len(v) for v in self._cache.values())} entries")

        except Exception as e:
            log.error(f"Failed to load scraper registry cache: {e}")
            raise

    async def list_scrapers(
        self,
        job_type: str = None,
        include_deprecated: bool = False
    ) -> List[Dict[str, Any]]:
        """
        List registered scrapers, optionally filtered by job_type.

        Args:
            job_type: Optional job type filter
            include_deprecated: Whether to include deprecated scrapers

        Returns:
            List of scraper info dictionaries
        """
        await self._ensure_cache_fresh()

        result = []

        if job_type:
            scrapers = self._cache.get(job_type, [])
        else:
            scrapers = [s for scrapers_list in self._cache.values() for s in scrapers_list]

        for scraper in scrapers:
            if not include_deprecated and scraper.deprecated_at:
                continue
            result.append(self._scraper_to_dict(scraper))

        return result

    async def get_scraper_by_id(self, scraper_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific scraper by its database ID.

        Args:
            scraper_id: UUID of the scraper registry entry

        Returns:
            Scraper info dictionary or None if not found
        """
        await self._ensure_cache_fresh()

        for scrapers_list in self._cache.values():
            for scraper in scrapers_list:
                if scraper.id == scraper_id:
                    return self._scraper_to_dict(scraper)

        return None

    async def register_scraper(
        self,
        job_type: str,
        version: str,
        variant: str,
        module_path: str,
        function_name: str,
        is_default: bool = False,
        traffic_pct: int = 0,
        min_priority: int = 0,
        config: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Register a new scraper version in the database.

        Args:
            job_type: Type of scraping job
            version: Semantic version string
            variant: Release channel ("stable", "beta", "canary")
            module_path: Python module path
            function_name: Entry function name
            is_default: Whether this is the default fallback
            traffic_pct: Traffic percentage for A/B testing (0-100)
            min_priority: Minimum job priority required
            config: Optional configuration dictionary

        Returns:
            UUID of created registry entry

        Raises:
            ValueError: If variant is invalid or traffic_pct out of range
        """
        if variant not in ('stable', 'beta', 'canary'):
            raise ValueError(f"Invalid variant: {variant}. Must be 'stable', 'beta', or 'canary'")

        if not 0 <= traffic_pct <= 100:
            raise ValueError(f"traffic_pct must be between 0 and 100, got: {traffic_pct}")

        import json

        async with self.db.pool.acquire() as conn:
            scraper_id = await conn.fetchval("""
                INSERT INTO scraper_registry (
                    job_type, version, variant, module_path, function_name,
                    is_default, traffic_pct, min_priority, config
                )
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::jsonb)
                RETURNING id
            """, job_type, version, variant, module_path, function_name,
                is_default, traffic_pct, min_priority,
                json.dumps(config) if config else None)

        # Invalidate cache
        self._cache_timestamp = 0

        log.info(f"Registered scraper: {job_type} v{version} ({variant})")
        return str(scraper_id)

    async def deprecate_scraper(
        self,
        job_type: str,
        version: str
    ) -> bool:
        """
        Deprecate a scraper version (soft delete).

        Args:
            job_type: Type of scraping job
            version: Version to deprecate

        Returns:
            True if deprecated, False if not found
        """
        async with self.db.pool.acquire() as conn:
            result = await conn.execute("""
                UPDATE scraper_registry
                SET deprecated_at = NOW()
                WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL
            """, job_type, version)

        updated = result.split()[-1] == "1"
        if updated:
            self._cache_timestamp = 0  # Invalidate cache
            log.info(f"Deprecated scraper: {job_type} v{version}")

        return updated

    async def update_traffic_allocation(
        self,
        job_type: str,
        allocations: Dict[str, int]
    ) -> None:
        """
        Update traffic allocations for multiple scrapers atomically.

        Args:
            job_type: Type of scraping job
            allocations: Dict mapping version to traffic_pct
                        e.g., {"1.0.0": 90, "1.1.0-beta": 10}

        Raises:
            ValueError: If total exceeds 100 or any value is invalid
        """
        total = sum(allocations.values())
        if total > 100:
            raise ValueError(f"Total traffic allocation cannot exceed 100, got: {total}")

        for version, pct in allocations.items():
            if not 0 <= pct <= 100:
                raise ValueError(f"Invalid traffic_pct for {version}: {pct}")

        async with self.db.pool.acquire() as conn:
            async with conn.transaction():
                for version, traffic_pct in allocations.items():
                    await conn.execute("""
                        UPDATE scraper_registry
                        SET traffic_pct = $3
                        WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL
                    """, job_type, version, traffic_pct)

        # Invalidate cache
        self._cache_timestamp = 0
        log.info(f"Updated traffic allocations for {job_type}: {allocations}")

    # ==================== Helper Methods ====================

    def _find_by_version(
        self,
        scrapers: List[ScraperInfo],
        version: str
    ) -> Optional[ScraperInfo]:
        """Find scraper by exact version match."""
        for scraper in scrapers:
            if scraper.version == version:
                return scraper
        return None

    def _find_latest_for_variant(
        self,
        scrapers: List[ScraperInfo],
        variant: str
    ) -> Optional[ScraperInfo]:
        """Find latest (first in sorted list) scraper for a variant."""
        for scraper in scrapers:
            if scraper.variant == variant:
                return scraper
        return None

    def _find_default(
        self,
        scrapers: List[ScraperInfo]
    ) -> Optional[ScraperInfo]:
        """Find default scraper for fallback."""
        for scraper in scrapers:
            if scraper.is_default and scraper.deprecated_at is None:
                return scraper
        return None

    def _scraper_to_dict(self, scraper: ScraperInfo) -> Dict[str, Any]:
        """Convert ScraperInfo to dictionary for API responses."""
        return {
            "id": scraper.id,
            "version": scraper.version,
            "variant": scraper.variant,
            "module_path": scraper.module_path,
            "function_name": scraper.function_name,
            "is_default": scraper.is_default,
            "traffic_pct": scraper.traffic_pct,
            "min_priority": scraper.min_priority,
            "config": scraper.config,
            "deprecated": scraper.deprecated_at is not None
        }


# ==================== Legacy Singleton Registry ====================
# Kept for backward compatibility with existing code that uses
# the old class-based scraper registration pattern.


class LegacyScraperRegistry:
    """
    Legacy registry for managing scraper implementations.

    This class provides backward compatibility with the old scraper
    registration pattern using class-based scrapers. New code should
    use the database-backed ScraperRegistry instead.

    The registry allows:
    - Registering scrapers by name and version
    - Looking up scrapers by domain or name
    - Listing all available scrapers

    Usage:
        registry = LegacyScraperRegistry()
        registry.register(GoogleReviewsScraper)
        scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
    """

    _instance: Optional["LegacyScraperRegistry"] = None
    _scrapers: Dict[str, Type[BaseScraper]]

    def __new__(cls) -> "LegacyScraperRegistry":
        """Singleton pattern to ensure one global registry."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._scrapers = {}
            cls._instance._domain_map = {}
        return cls._instance

    def register(self, scraper_class: Type[BaseScraper], name: Optional[str] = None) -> None:
        """
        Register a scraper class with the registry.

        Args:
            scraper_class: The scraper class to register (must inherit from BaseScraper)
            name: Optional name override, defaults to scraper_class.name property
        """
        # Create a temporary instance to get properties
        # Note: In production, we might want scraper_class to have class-level properties
        instance = scraper_class.__new__(scraper_class)

        scraper_name = name or instance.name
        scraper_version = instance.version
        key = f"{scraper_name}:{scraper_version}"

        self._scrapers[key] = scraper_class

        # Map domains to this scraper
        for domain in instance.supported_domains:
            if domain not in self._domain_map:
                self._domain_map[domain] = []
            self._domain_map[domain].append(key)

    def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type[BaseScraper]]:
        """
        Get a scraper class by name and optional version.

        Args:
            name: The scraper name
            version: Optional version string. If not provided, returns the latest.

        Returns:
            The scraper class, or None if not found
        """
        if version:
            key = f"{name}:{version}"
            return self._scrapers.get(key)

        # Find latest version for this name
        matching = [k for k in self._scrapers.keys() if k.startswith(f"{name}:")]
        if not matching:
            return None

        # Sort by version and return latest
        matching.sort(reverse=True)
        return self._scrapers.get(matching[0])

    def get_scraper_for_url(self, url: str) -> Optional[Type[BaseScraper]]:
        """
        Find a suitable scraper for the given URL.

        Args:
            url: The URL to find a scraper for

        Returns:
            The scraper class that can handle this URL, or None if no match
        """
        from urllib.parse import urlparse

        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Remove www. prefix for matching
        if domain.startswith("www."):
            domain = domain[4:]

        scraper_keys = self._domain_map.get(domain, [])
        if not scraper_keys:
            return None

        # Return the latest version
        scraper_keys.sort(reverse=True)
        return self._scrapers.get(scraper_keys[0])

    def list_scrapers(self) -> List[Dict[str, str]]:
        """
        List all registered scrapers.

        Returns:
            List of dictionaries with scraper info (name, version, domains)
        """
        result = []
        for key, scraper_class in self._scrapers.items():
            instance = scraper_class.__new__(scraper_class)
            result.append({
                "name": instance.name,
                "version": instance.version,
                "domains": instance.supported_domains
            })
        return result

    def clear(self) -> None:
        """Clear all registered scrapers. Useful for testing."""
        self._scrapers.clear()
        self._domain_map.clear()


# Global legacy registry instance (for backward compatibility)
registry = LegacyScraperRegistry()