""" Scraper Registry This module provides a registry for managing and discovering scrapers. It allows dynamic registration and lookup of scraper implementations. """ from typing import Dict, List, Optional, Type from scrapers.base import BaseScraper class ScraperRegistry: """ Registry for managing scraper implementations. The registry allows: - Registering scrapers by name and version - Looking up scrapers by domain or name - Listing all available scrapers Usage: registry = ScraperRegistry() registry.register(GoogleReviewsScraper) scraper = registry.get_scraper_for_url("https://google.com/maps/place/...") """ _instance: Optional["ScraperRegistry"] = None _scrapers: Dict[str, Type[BaseScraper]] def __new__(cls) -> "ScraperRegistry": """Singleton pattern to ensure one global registry.""" if cls._instance is None: cls._instance = super().__new__(cls) cls._instance._scrapers = {} cls._instance._domain_map = {} return cls._instance def register(self, scraper_class: Type[BaseScraper], name: Optional[str] = None) -> None: """ Register a scraper class with the registry. Args: scraper_class: The scraper class to register (must inherit from BaseScraper) name: Optional name override, defaults to scraper_class.name property """ # Create a temporary instance to get properties # Note: In production, we might want scraper_class to have class-level properties instance = scraper_class.__new__(scraper_class) scraper_name = name or instance.name scraper_version = instance.version key = f"{scraper_name}:{scraper_version}" self._scrapers[key] = scraper_class # Map domains to this scraper for domain in instance.supported_domains: if domain not in self._domain_map: self._domain_map[domain] = [] self._domain_map[domain].append(key) def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type[BaseScraper]]: """ Get a scraper class by name and optional version. Args: name: The scraper name version: Optional version string. If not provided, returns the latest. Returns: The scraper class, or None if not found """ if version: key = f"{name}:{version}" return self._scrapers.get(key) # Find latest version for this name matching = [k for k in self._scrapers.keys() if k.startswith(f"{name}:")] if not matching: return None # Sort by version and return latest matching.sort(reverse=True) return self._scrapers.get(matching[0]) def get_scraper_for_url(self, url: str) -> Optional[Type[BaseScraper]]: """ Find a suitable scraper for the given URL. Args: url: The URL to find a scraper for Returns: The scraper class that can handle this URL, or None if no match """ from urllib.parse import urlparse parsed = urlparse(url) domain = parsed.netloc.lower() # Remove www. prefix for matching if domain.startswith("www."): domain = domain[4:] scraper_keys = self._domain_map.get(domain, []) if not scraper_keys: return None # Return the latest version scraper_keys.sort(reverse=True) return self._scrapers.get(scraper_keys[0]) def list_scrapers(self) -> List[Dict[str, str]]: """ List all registered scrapers. Returns: List of dictionaries with scraper info (name, version, domains) """ result = [] for key, scraper_class in self._scrapers.items(): instance = scraper_class.__new__(scraper_class) result.append({ "name": instance.name, "version": instance.version, "domains": instance.supported_domains }) return result def clear(self) -> None: """Clear all registered scrapers. Useful for testing.""" self._scrapers.clear() self._domain_map.clear() # Global registry instance registry = ScraperRegistry()