New structure: - scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py) - scrapers/base.py (BaseScraper interface) - scrapers/registry.py (ScraperRegistry for version routing) - core/database.py, models.py, config.py, enums.py - utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py - workers/chrome_pool.py - services/webhook_service.py - api/ routes structure (empty, ready for Phase 2) - tests/ structure mirroring source All imports updated in: - api_server_production.py (7 import paths updated) - utils/health_checks.py (scraper import path) Legacy modules moved to modules/_legacy/: - data_storage.py, image_handler.py, s3_handler.py (unused) Syntax verified, frontend build passing. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
139 lines
4.3 KiB
Python
139 lines
4.3 KiB
Python
"""
Scraper Registry

This module provides a registry for managing and discovering scrapers.
It allows dynamic registration and lookup of scraper implementations.
"""
|
|
|
|
from typing import Dict, List, Optional, Type
|
|
|
|
from scrapers.base import BaseScraper
|
|
|
|
|
|
class ScraperRegistry:
    """
    Process-wide registry of scraper implementations.

    The registry allows:
    - Registering scrapers by name and version
    - Looking up scrapers by domain or name
    - Listing all available scrapers

    Scrapers are stored under the key ``"<name>:<version>"``. When several
    versions are registered, "latest" is decided by comparing the numeric
    components of the version (so ``1.10.0`` is newer than ``1.9.0``);
    non-numeric suffixes such as ``-beta`` are only used as a tie-breaker.

    Usage:
        registry = ScraperRegistry()
        registry.register(GoogleReviewsScraper)
        scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
    """

    # The single shared instance, created lazily by __new__.
    _instance: Optional["ScraperRegistry"] = None
    # "<name>:<version>" -> scraper class.
    _scrapers: Dict[str, Type[BaseScraper]]
    # Normalised domain -> registry keys of the scrapers that support it.
    _domain_map: Dict[str, List[str]]

    def __new__(cls) -> "ScraperRegistry":
        """Singleton pattern to ensure one global registry."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._scrapers = {}
            cls._instance._domain_map = {}
        return cls._instance

    @staticmethod
    def _normalize_domain(domain: str) -> str:
        """Lowercase *domain* and drop a leading ``www.`` so keys compare equal."""
        domain = domain.strip().lower()
        if domain.startswith("www."):
            domain = domain[4:]
        return domain

    @staticmethod
    def _version_order(version: str) -> tuple:
        """
        Build a sort key that orders version strings numerically.

        Every run of ASCII digits becomes an int, so "1.10.0" sorts after
        "1.9.0". The raw string is appended as a tie-breaker for versions
        whose numeric parts are identical.
        """
        digits = "".join(ch if "0" <= ch <= "9" else " " for ch in version)
        return tuple(int(part) for part in digits.split()), version

    @classmethod
    def _key_order(cls, key: str) -> tuple:
        """
        Sort key for a full ``"<name>:<version>"`` registry key.

        Orders by name first and numeric version second. The version is taken
        to be everything after the last colon (assumes versions contain no
        colon).
        """
        name, _, version = key.rpartition(":")
        return (name, *cls._version_order(version))

    def register(self, scraper_class: Type[BaseScraper], name: Optional[str] = None) -> None:
        """
        Register a scraper class with the registry.

        Registering the same name/version again replaces the stored class and
        does not duplicate its domain entries.

        Args:
            scraper_class: The scraper class to register (must inherit from BaseScraper)
            name: Optional name override, defaults to scraper_class.name property
        """
        # Create an uninitialised instance to read the descriptive properties
        # without running the scraper's __init__.
        # Note: In production, we might want scraper_class to have class-level properties
        instance = scraper_class.__new__(scraper_class)

        scraper_name = name or instance.name
        key = f"{scraper_name}:{instance.version}"
        self._scrapers[key] = scraper_class

        # Map domains to this scraper, using the same normalisation as lookup
        # so that "www.Example.com" and "example.com" land in one bucket.
        for domain in instance.supported_domains:
            keys = self._domain_map.setdefault(self._normalize_domain(domain), [])
            if key not in keys:
                keys.append(key)

    def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type[BaseScraper]]:
        """
        Get a scraper class by name and optional version.

        Args:
            name: The scraper name
            version: Optional version string. If not provided, returns the latest.

        Returns:
            The scraper class, or None if not found
        """
        if version:
            return self._scrapers.get(f"{name}:{version}")

        # Find the latest version for this name. Versions are compared
        # numerically; a plain string sort would rank "1.9.0" above "1.10.0".
        prefix = f"{name}:"
        candidates = [key for key in self._scrapers if key.startswith(prefix)]
        if not candidates:
            return None

        latest = max(candidates, key=lambda key: self._version_order(key[len(prefix):]))
        return self._scrapers[latest]

    def get_scraper_for_url(self, url: str) -> Optional[Type[BaseScraper]]:
        """
        Find a suitable scraper for the given URL.

        Only the host is matched: scheme, credentials, port, path and a
        leading ``www.`` are ignored.

        Args:
            url: The URL to find a scraper for

        Returns:
            The scraper class that can handle this URL, or None if no match
            (including when the URL has no host or cannot be parsed)
        """
        from urllib.parse import urlparse

        try:
            # hostname (unlike netloc) excludes "user:pass@" and ":port".
            host = urlparse(url).hostname
        except ValueError:
            return None
        if not host:
            return None

        scraper_keys = self._domain_map.get(self._normalize_domain(host), [])
        if not scraper_keys:
            return None

        # Pick the latest version without reordering the stored list.
        best = max(scraper_keys, key=self._key_order)
        return self._scrapers.get(best)

    def list_scrapers(self) -> List[Dict[str, object]]:
        """
        List all registered scrapers.

        Returns:
            List of dictionaries with scraper info (name, version, domains).
            ``name`` is the name the scraper was registered under, so it can
            be passed back to ``get_scraper``.
        """
        result = []
        for key, scraper_class in self._scrapers.items():
            instance = scraper_class.__new__(scraper_class)
            # Recover the registered name (which may be an override) from the key.
            suffix = f":{instance.version}"
            registered_name = key[: -len(suffix)] if key.endswith(suffix) else instance.name
            result.append({
                "name": registered_name,
                "version": instance.version,
                "domains": instance.supported_domains,
            })
        return result

    def clear(self) -> None:
        """Clear all registered scrapers. Useful for testing."""
        self._scrapers.clear()
        self._domain_map.clear()
|
|
|
|
|
|
# Shared module-level registry; ScraperRegistry is a singleton, so this is the
# same object any later ScraperRegistry() call returns.
registry = ScraperRegistry()
|