Phases 2-4: Requester support, batches, webhooks, scraper registry
Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
- POST /api/scrape/google-reviews/batch (submit batch)
- GET /api/batches (list batches)
- GET /api/batches/{id} (batch detail)
- DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews
Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
- JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
- JobCallbackDispatcher: Background worker for callback monitoring
- Payload formats per spec (job.completed, job.failed, batch.completed)
- Exponential backoff for retries
- Error classification for failure payloads
Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
- get_scraper(): Version/variant/A/B routing
- _get_weighted_scraper(): Traffic-weighted random selection
- 60-second TTL cache for performance
- register_scraper, deprecate_scraper, update_traffic_allocation
- LegacyScraperRegistry preserved for backwards compatibility
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,18 +1,512 @@
|
||||
"""
|
||||
Scraper Registry
|
||||
|
||||
This module provides a registry for managing and discovering scrapers.
|
||||
It allows dynamic registration and lookup of scraper implementations.
|
||||
This module provides a database-backed registry for managing and routing
|
||||
scraper requests. It supports:
|
||||
- Version-based routing (exact version or latest for variant)
|
||||
- A/B testing via traffic_pct weighted selection
|
||||
- Priority-based scraper filtering
|
||||
- Caching with TTL for performance
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Type
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
from scrapers.base import BaseScraper
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ScraperInfo:
    """One row of the scraper registry, held in the in-memory cache."""

    # Registry entry identifier (stored as a string).
    id: str
    # Kind of scraping job this entry serves, e.g. "google_reviews".
    job_type: str
    # Version string of the scraper implementation.
    version: str
    # Release channel of the entry ("stable", "beta" or "canary").
    variant: str
    # Module path and entry function name of the implementation.
    module_path: str
    function_name: str
    # Whether this entry is the fallback when no A/B candidate is eligible.
    is_default: bool
    # Weight used for traffic-weighted (A/B) selection.
    traffic_pct: int
    # Lowest job priority allowed to be routed to this entry.
    min_priority: int
    # Optional scraper-specific configuration.
    config: Optional[Dict[str, Any]]
    # Deprecation timestamp rendered as a string; None while still active.
    deprecated_at: Optional[str]
|
||||
|
||||
|
||||
class ScraperRegistry:
    """
    Route scraping requests to a registered scraper version.

    Entries are read from the ``scraper_registry`` database table and kept
    in an in-memory cache that is reloaded once it is older than
    ``_cache_ttl`` seconds. Selection supports an exact version, the latest
    version of a variant, or traffic-weighted (A/B) random selection.

    Usage:
        registry = ScraperRegistry(db)
        scraper_info = await registry.get_scraper("google_reviews")
        # scraper_info contains module_path, function_name, config, etc.
    """

    def __init__(self, db: "DatabaseManager"):  # noqa: F821 - forward reference
        """
        Initialize the scraper registry.

        Args:
            db: DatabaseManager instance; its ``pool`` attribute is used to
                acquire connections.
        """
        self.db = db
        self._cache: Dict[str, List["ScraperInfo"]] = {}  # Cache by job_type
        # 0 means "never loaded / invalidated": the next lookup reloads.
        self._cache_timestamp: float = 0
        self._cache_ttl: int = 60  # Refresh cache every 60 seconds
        self._cache_lock = asyncio.Lock()

    async def get_scraper(
        self,
        job_type: str,
        version: Optional[str] = None,
        variant: Optional[str] = None,
        priority: int = 0
    ) -> Optional[Dict[str, Any]]:
        """
        Get scraper info for a job.

        Selection order:
        1. If version is given, return the exact match among active entries.
        2. Else if variant is given, return the latest active entry of that
           variant.
        3. Otherwise, use A/B routing weighted by traffic_pct.

        Args:
            job_type: Type of scraping job (e.g., "google_reviews")
            version: Optional specific version to use (e.g., "1.0.0")
            variant: Optional variant filter ("stable", "beta", "canary")
            priority: Job priority; only consulted by the A/B routing path

        Returns:
            Dictionary as produced by ``_scraper_to_dict`` (id, version,
            variant, module_path, function_name, is_default, traffic_pct,
            min_priority, config, deprecated), or None if nothing matches.
        """
        await self._ensure_cache_fresh()

        scrapers = self._cache.get(job_type, [])
        if not scrapers:
            log.warning(f"No scrapers registered for job_type: {job_type}")
            return None

        # Deprecated entries are never handed out.
        active_scrapers = [s for s in scrapers if s.deprecated_at is None]
        if not active_scrapers:
            log.warning(f"All scrapers for job_type {job_type} are deprecated")
            return None

        if version:
            # Priority 1: exact version match
            selected = self._find_by_version(active_scrapers, version)
            if selected is None:
                log.warning(f"Requested version {version} not found for {job_type}")
                return None
            log.debug(f"Selected scraper by exact version: {version}")
        elif variant:
            # Priority 2: latest for variant
            selected = self._find_latest_for_variant(active_scrapers, variant)
            if selected is None:
                log.warning(f"No active scrapers found for variant {variant} in {job_type}")
                return None
            log.debug(f"Selected latest scraper for variant {variant}: {selected.version}")
        else:
            # Priority 3: A/B weighted selection
            selected = await self._get_weighted_scraper(job_type, priority)
            if selected is None:
                return None
            log.debug(f"Selected scraper via A/B routing: {selected.version} ({selected.variant})")

        return self._scraper_to_dict(selected)

    async def _get_weighted_scraper(
        self,
        job_type: str,
        priority: int
    ) -> Optional["ScraperInfo"]:
        """
        Select a scraper at random, weighted by traffic_pct.

        Only entries that are active, have traffic_pct > 0 and whose
        min_priority does not exceed ``priority`` take part. When none
        qualifies, the active default entry is used if the job priority
        allows it.

        Args:
            job_type: Type of scraping job
            priority: Job priority level

        Returns:
            Selected ScraperInfo or None if no eligible scrapers
        """
        scrapers = self._cache.get(job_type, [])

        eligible = [
            s for s in scrapers
            if s.deprecated_at is None
            and s.traffic_pct > 0
            and s.min_priority <= priority
        ]

        if not eligible:
            # Fall back to default scraper
            default = self._find_default(scrapers)
            if default and default.min_priority <= priority:
                log.debug(f"No eligible A/B scrapers, using default: {default.version}")
                return default
            log.warning(f"No eligible scrapers for job_type {job_type} with priority {priority}")
            return None

        # Every eligible entry has traffic_pct > 0, so the total is positive.
        total_weight = sum(s.traffic_pct for s in eligible)

        # Draw a point in [0, total_weight) and walk the cumulative weights.
        rand_value = random.random() * total_weight
        cumulative = 0
        for scraper in eligible:
            cumulative += scraper.traffic_pct
            if rand_value < cumulative:
                return scraper

        # Safety net; the loop above is expected to return.
        return eligible[-1]

    async def refresh_cache(self) -> None:
        """
        Reload registry from database.

        Forces a cache reload regardless of TTL. Reloads are serialized
        across coroutines by an asyncio lock (not a cross-thread lock).
        """
        async with self._cache_lock:
            await self._load_cache()

    def _cache_is_stale(self) -> bool:
        """Return True when the cache is older than the TTL (or invalidated)."""
        return (time.time() - self._cache_timestamp) > self._cache_ttl

    async def _ensure_cache_fresh(self) -> None:
        """
        Reload the cache if it is older than the TTL.

        Staleness is decided by the timestamp alone, so an empty registry
        table is cached like any other result instead of hitting the
        database on every lookup.
        """
        if not self._cache_is_stale():
            return

        async with self._cache_lock:
            # Re-check: another coroutine may have reloaded while we waited.
            if self._cache_is_stale():
                await self._load_cache()

    async def _load_cache(self) -> None:
        """
        Load all scraper registry entries from database.

        The new cache is built aside and swapped in only after every row
        converted successfully, so a failure keeps the previous contents.

        Raises:
            Exception: Any error from the query or row conversion is logged
                and re-raised.
        """
        import json

        try:
            async with self.db.pool.acquire() as conn:
                rows = await conn.fetch("""
                    SELECT
                        id,
                        job_type,
                        version,
                        variant,
                        module_path,
                        function_name,
                        is_default,
                        traffic_pct,
                        min_priority,
                        config,
                        deprecated_at
                    FROM scraper_registry
                    ORDER BY job_type, version DESC
                """)

            # Group by job_type
            fresh: Dict[str, List["ScraperInfo"]] = {}
            for row in rows:
                config = row['config']
                # NOTE(review): the driver appears to hand jsonb back as text
                # (register_scraper also sends it as text) - decode it so
                # callers get a dict; already-decoded values pass through.
                if isinstance(config, str):
                    config = json.loads(config)

                scraper_info = ScraperInfo(
                    id=str(row['id']),
                    job_type=row['job_type'],
                    version=row['version'],
                    variant=row['variant'],
                    module_path=row['module_path'],
                    function_name=row['function_name'],
                    is_default=row['is_default'],
                    traffic_pct=row['traffic_pct'],
                    min_priority=row['min_priority'],
                    config=config,
                    deprecated_at=str(row['deprecated_at']) if row['deprecated_at'] else None
                )
                fresh.setdefault(scraper_info.job_type, []).append(scraper_info)

            self._cache = fresh
            self._cache_timestamp = time.time()
            log.info(f"Scraper registry cache loaded: {sum(len(v) for v in self._cache.values())} entries")

        except Exception as e:
            log.error(f"Failed to load scraper registry cache: {e}")
            raise

    async def list_scrapers(
        self,
        job_type: str = None,
        include_deprecated: bool = False
    ) -> List[Dict[str, Any]]:
        """
        List registered scrapers, optionally filtered by job_type.

        Args:
            job_type: Optional job type filter
            include_deprecated: Whether to include deprecated scrapers

        Returns:
            List of scraper info dictionaries
        """
        await self._ensure_cache_fresh()

        if job_type:
            scrapers = self._cache.get(job_type, [])
        else:
            scrapers = [s for scrapers_list in self._cache.values() for s in scrapers_list]

        return [
            self._scraper_to_dict(scraper)
            for scraper in scrapers
            if include_deprecated or not scraper.deprecated_at
        ]

    async def get_scraper_by_id(self, scraper_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific scraper by its database ID.

        Args:
            scraper_id: ID of the scraper registry entry, as a string

        Returns:
            Scraper info dictionary or None if not found
        """
        await self._ensure_cache_fresh()

        for scrapers_list in self._cache.values():
            for scraper in scrapers_list:
                if scraper.id == scraper_id:
                    return self._scraper_to_dict(scraper)

        return None

    async def register_scraper(
        self,
        job_type: str,
        version: str,
        variant: str,
        module_path: str,
        function_name: str,
        is_default: bool = False,
        traffic_pct: int = 0,
        min_priority: int = 0,
        config: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Register a new scraper version in the database.

        Args:
            job_type: Type of scraping job
            version: Semantic version string
            variant: Release channel ("stable", "beta", "canary")
            module_path: Python module path
            function_name: Entry function name
            is_default: Whether this is the default fallback
            traffic_pct: Traffic percentage for A/B testing (0-100)
            min_priority: Minimum job priority required
            config: Optional configuration dictionary; an empty or missing
                dict is stored as NULL

        Returns:
            ID of the created registry entry, as a string

        Raises:
            ValueError: If variant is invalid or traffic_pct out of range
        """
        if variant not in ('stable', 'beta', 'canary'):
            raise ValueError(f"Invalid variant: {variant}. Must be 'stable', 'beta', or 'canary'")

        if not 0 <= traffic_pct <= 100:
            raise ValueError(f"traffic_pct must be between 0 and 100, got: {traffic_pct}")

        import json

        async with self.db.pool.acquire() as conn:
            scraper_id = await conn.fetchval("""
                INSERT INTO scraper_registry (
                    job_type, version, variant, module_path, function_name,
                    is_default, traffic_pct, min_priority, config
                )
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::jsonb)
                RETURNING id
            """, job_type, version, variant, module_path, function_name,
                is_default, traffic_pct, min_priority,
                json.dumps(config) if config else None)

        # Invalidate cache so the next lookup sees the new entry
        self._cache_timestamp = 0

        log.info(f"Registered scraper: {job_type} v{version} ({variant})")
        return str(scraper_id)

    async def deprecate_scraper(
        self,
        job_type: str,
        version: str
    ) -> bool:
        """
        Deprecate a scraper version (soft delete).

        Every still-active row matching job_type and version is marked, so
        the same version registered under several variants is handled too.

        Args:
            job_type: Type of scraping job
            version: Version to deprecate

        Returns:
            True if at least one row was deprecated, False if none matched
        """
        async with self.db.pool.acquire() as conn:
            result = await conn.execute("""
                UPDATE scraper_registry
                SET deprecated_at = NOW()
                WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL
            """, job_type, version)

        # The status tag ends with the affected-row count, e.g. "UPDATE 2".
        try:
            affected = int(result.split()[-1])
        except (ValueError, IndexError):
            affected = 0

        updated = affected > 0
        if updated:
            self._cache_timestamp = 0  # Invalidate cache
            log.info(f"Deprecated scraper: {job_type} v{version}")

        return updated

    async def update_traffic_allocation(
        self,
        job_type: str,
        allocations: Dict[str, int]
    ) -> None:
        """
        Update traffic allocations for multiple scrapers in one transaction.

        Args:
            job_type: Type of scraping job
            allocations: Dict mapping version to traffic_pct
                e.g., {"1.0.0": 90, "1.1.0-beta": 10}

        Raises:
            ValueError: If total exceeds 100 or any value is outside 0-100
        """
        total = sum(allocations.values())
        if total > 100:
            raise ValueError(f"Total traffic allocation cannot exceed 100, got: {total}")

        for version, pct in allocations.items():
            if not 0 <= pct <= 100:
                raise ValueError(f"Invalid traffic_pct for {version}: {pct}")

        async with self.db.pool.acquire() as conn:
            # One transaction so the new split is applied all-or-nothing.
            async with conn.transaction():
                for version, traffic_pct in allocations.items():
                    await conn.execute("""
                        UPDATE scraper_registry
                        SET traffic_pct = $3
                        WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL
                    """, job_type, version, traffic_pct)

        # Invalidate cache
        self._cache_timestamp = 0
        log.info(f"Updated traffic allocations for {job_type}: {allocations}")

    # ==================== Helper Methods ====================

    def _find_by_version(
        self,
        scrapers: List["ScraperInfo"],
        version: str
    ) -> Optional["ScraperInfo"]:
        """Find scraper by exact version match."""
        return next((s for s in scrapers if s.version == version), None)

    @staticmethod
    def _version_sort_key(version: str) -> tuple:
        """
        Build an ordering key for a version string.

        Dotted numeric parts compare as integers (so "1.10.0" > "1.9.0"),
        a release sorts after its own pre-releases ("1.1.0" > "1.1.0-beta"),
        and "+build" metadata is ignored. Non-numeric parts count as 0.
        """
        without_build = version.partition("+")[0]
        core, _, prerelease = without_build.partition("-")
        numbers = tuple(
            int(part) if part.isdecimal() else 0
            for part in core.lstrip("vV").split(".")
        )
        return (numbers, prerelease == "", prerelease)

    def _find_latest_for_variant(
        self,
        scrapers: List["ScraperInfo"],
        variant: str
    ) -> Optional["ScraperInfo"]:
        """
        Find the highest-versioned scraper of a variant.

        Versions are compared numerically rather than by list position,
        because the cache order comes from a textual ``ORDER BY version``
        that ranks "1.9.0" above "1.10.0". Ties keep the earliest entry.
        """
        candidates = [s for s in scrapers if s.variant == variant]
        if not candidates:
            return None
        return max(candidates, key=lambda s: self._version_sort_key(s.version))

    def _find_default(
        self,
        scrapers: List["ScraperInfo"]
    ) -> Optional["ScraperInfo"]:
        """Find the first active scraper flagged as default, for fallback."""
        for scraper in scrapers:
            if scraper.is_default and scraper.deprecated_at is None:
                return scraper
        return None

    def _scraper_to_dict(self, scraper: "ScraperInfo") -> Dict[str, Any]:
        """Convert ScraperInfo to dictionary for API responses."""
        return {
            "id": scraper.id,
            "version": scraper.version,
            "variant": scraper.variant,
            "module_path": scraper.module_path,
            "function_name": scraper.function_name,
            "is_default": scraper.is_default,
            "traffic_pct": scraper.traffic_pct,
            "min_priority": scraper.min_priority,
            "config": scraper.config,
            "deprecated": scraper.deprecated_at is not None
        }
|
||||
|
||||
|
||||
# ==================== Legacy Singleton Registry ====================
|
||||
# Kept for backward compatibility with existing code that uses
|
||||
# the old class-based scraper registration pattern.
|
||||
|
||||
|
||||
class LegacyScraperRegistry:
|
||||
"""
|
||||
Legacy registry for managing scraper implementations.
|
||||
|
||||
This class provides backward compatibility with the old scraper
|
||||
registration pattern using class-based scrapers. New code should
|
||||
use the database-backed ScraperRegistry instead.
|
||||
|
||||
The registry allows:
|
||||
- Registering scrapers by name and version
|
||||
@@ -20,15 +514,15 @@ class ScraperRegistry:
|
||||
- Listing all available scrapers
|
||||
|
||||
Usage:
|
||||
registry = ScraperRegistry()
|
||||
registry = LegacyScraperRegistry()
|
||||
registry.register(GoogleReviewsScraper)
|
||||
scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
|
||||
"""
|
||||
|
||||
_instance: Optional["ScraperRegistry"] = None
|
||||
_instance: Optional["LegacyScraperRegistry"] = None
|
||||
_scrapers: Dict[str, Type[BaseScraper]]
|
||||
|
||||
def __new__(cls) -> "ScraperRegistry":
|
||||
def __new__(cls) -> "LegacyScraperRegistry":
|
||||
"""Singleton pattern to ensure one global registry."""
|
||||
if cls._instance is None:
|
||||
cls._instance = super().__new__(cls)
|
||||
@@ -134,5 +628,5 @@ class ScraperRegistry:
|
||||
self._domain_map.clear()
|
||||
|
||||
|
||||
# Global registry instance
|
||||
registry = ScraperRegistry()
|
||||
# Global legacy registry instance (for backward compatibility)
|
||||
registry = LegacyScraperRegistry()
|
||||
|
||||
Reference in New Issue
Block a user