Phases 2-4: Requester support, batches, webhooks, scraper registry

Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
  - POST /api/scrape/google-reviews/batch (submit batch)
  - GET /api/batches (list batches)
  - GET /api/batches/{id} (batch detail)
  - DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews

Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
  - JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
  - JobCallbackDispatcher: Background worker for callback monitoring
  - Payload formats per spec (job.completed, job.failed, batch.completed)
  - Exponential backoff for retries
  - Error classification for failure payloads

Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
  - get_scraper(): Version/variant/A/B routing
  - _get_weighted_scraper(): Traffic-weighted random selection
  - 60-second TTL cache for performance
  - register_scraper, deprecate_scraper, update_traffic_allocation
  - LegacyScraperRegistry preserved for backwards compatibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:35:58 +00:00
parent 2412996c54
commit 788ef84756
8 changed files with 2503 additions and 98 deletions

View File

@@ -1,9 +1,14 @@
#!/usr/bin/env python3
"""
Production Google Reviews Scraper API Server with Phase 1 features:
Production Google Reviews Scraper API Server with Phase 1 & 2 features:
- PostgreSQL storage with JSONB
- Webhook delivery with retries
- Smart health checks with canary testing
- Phase 2: Requester tracking (client_id, source, purpose)
- Phase 2: Job priority for queue ordering
- Phase 2: Callback URL alternative to webhooks
- Phase 2: Scraper version/variant selection for A/B testing
- Phase 2: Explicit job type endpoint (/api/scrape/google-reviews)
"""
import asyncio
import json
@@ -35,6 +40,7 @@ from workers.chrome_pool import (
release_scraping_worker,
get_pool_stats
)
from api.routes import batches_router, set_batches_db
# Configure logging
logging.basicConfig(
@@ -78,6 +84,9 @@ async def lifespan(app: FastAPI):
await db.initialize_schema()
log.info("Database initialized")
# Inject database into route modules
set_batches_db(db)
# Initialize health check system with canary monitoring
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
# health_system = HealthCheckSystem(db)
@@ -134,6 +143,9 @@ app.add_middleware(
allow_headers=["*"],
)
# Include routers from api/routes/
app.include_router(batches_router)
# ==================== Request/Response Models ====================
@@ -159,14 +171,44 @@ class BrowserFingerprintModel(BaseModel):
platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
class RequesterModel(BaseModel):
    """Information about the requester of a scrape job"""

    # Every field is optional: a request may identify its origin as fully or
    # as sparsely as the caller wishes.
    client_id: Optional[str] = Field(
        default=None,
        description="Client identifier",
    )
    source: Optional[str] = Field(
        default=None,
        description="Source of the request (e.g., 'web', 'api', 'internal')",
    )
    purpose: Optional[str] = Field(
        default=None,
        description="Purpose of the scrape (e.g., 'competitor_analysis', 'review_monitoring')",
    )
    # Free-form bag for anything the three named fields do not cover.
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Additional requester metadata",
    )
class ScrapeRequest(BaseModel):
    """Request model for starting a scrape job (legacy endpoint, routes to google-reviews)"""

    # NOTE: there must be exactly one docstring literal above. With two
    # consecutive string literals only the first becomes __doc__ (and hence the
    # schema description); the second is a no-op expression statement.

    # --- Core job definition ---
    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
    webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
    webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
    geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")

    # --- Phase 2: New optional fields for enhanced job tracking ---
    requester: Optional[RequesterModel] = Field(None, description="Information about who requested this job")
    # Validated to the inclusive range 0..100; defaults to 0.
    priority: Optional[int] = Field(0, description="Job priority (higher = more important)", ge=0, le=100)
    callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
    scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
    scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
class GoogleReviewsScrapeRequest(BaseModel):
    """Request model for Google Reviews scraping - explicit job type endpoint"""

    # Target of the scrape; the only required field.
    url: HttpUrl = Field(
        ...,
        description="Google Maps URL to scrape",
    )

    # Completion notification via webhook, with an optional signing secret.
    webhook_url: Optional[HttpUrl] = Field(
        default=None,
        description="Webhook URL for async notifications",
    )
    webhook_secret: Optional[str] = Field(
        default=None,
        description="Secret for webhook HMAC signature",
    )

    # Caller-supplied context and browser environment hints.
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional custom metadata",
    )
    geolocation: Optional[GeolocationModel] = Field(
        default=None,
        description="User's geolocation for Chrome",
    )
    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(
        default=None,
        description="User's browser fingerprint",
    )

    # Phase 2: New optional fields for enhanced job tracking
    requester: Optional[RequesterModel] = Field(
        default=None,
        description="Information about who requested this job",
    )
    # Constrained to the inclusive range 0..100; 0 when omitted.
    priority: Optional[int] = Field(
        default=0,
        ge=0,
        le=100,
        description="Job priority (higher = more important)",
    )
    callback_url: Optional[HttpUrl] = Field(
        default=None,
        description="URL to call when job completes (alternative to webhook)",
    )
    scraper_version: Optional[str] = Field(
        default=None,
        description="Specific scraper version to use",
    )
    scraper_variant: Optional[str] = Field(
        default=None,
        description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')",
    )
class JobResponse(BaseModel):
@@ -267,10 +309,146 @@ async def root():
}
# Strong references to in-flight scraping tasks. The event loop itself only
# keeps weak references to tasks, so a fire-and-forget task that nothing else
# references may be garbage-collected before it finishes.
_background_tasks = set()


def _build_job_metadata(
    metadata: Optional[Dict[str, Any]],
    browser_fingerprint: Optional[BrowserFingerprintModel],
    geolocation: Optional[GeolocationModel],
    requester: Optional[RequesterModel],
    priority: int,
    callback_url: Optional[str],
    scraper_version: Optional[str],
    scraper_variant: Optional[str],
    job_type: str,
) -> Dict[str, Any]:
    """Assemble the metadata dict stored with a job from the request fields."""
    # Shallow copy so the caller's dict is never mutated.
    job_metadata = metadata.copy() if metadata else {}

    # A full browser fingerprint takes precedence; the standalone geolocation
    # is only recorded when no fingerprint was supplied.
    if browser_fingerprint:
        fp = browser_fingerprint
        fingerprint = {
            "userAgent": fp.userAgent,
            "timezone": fp.timezone,
            "language": fp.language,
            "platform": fp.platform,
        }
        if fp.viewport:
            fingerprint['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
        if fp.geolocation:
            fingerprint['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
        job_metadata['browser_fingerprint'] = fingerprint
    elif geolocation:
        job_metadata['geolocation'] = {
            'lat': geolocation.lat,
            'lng': geolocation.lng
        }

    # Phase 2: requester info, if provided
    if requester:
        job_metadata['requester'] = {
            'client_id': requester.client_id,
            'source': requester.source,
            'purpose': requester.purpose,
            'metadata': requester.metadata
        }

    # Phase 2: job type and priority are always written, replacing any keys of
    # the same name that arrived in the caller's metadata.
    job_metadata['job_type'] = job_type
    job_metadata['priority'] = priority

    # Phase 2: the remaining optional fields are only stored when set.
    if callback_url:
        job_metadata['callback_url'] = callback_url
    if scraper_version:
        job_metadata['scraper_version'] = scraper_version
    if scraper_variant:
        job_metadata['scraper_variant'] = scraper_variant

    return job_metadata


# NOTE: deliberately NOT decorated with @app.post — this is a private helper
# called by the route handlers, not a route itself.
async def _create_google_reviews_job(
    url: str,
    webhook_url: Optional[str] = None,
    webhook_secret: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    browser_fingerprint: Optional[BrowserFingerprintModel] = None,
    geolocation: Optional[GeolocationModel] = None,
    requester: Optional[RequesterModel] = None,
    priority: int = 0,
    callback_url: Optional[str] = None,
    scraper_version: Optional[str] = None,
    scraper_variant: Optional[str] = None,
    job_type: str = "google-reviews"
) -> Dict[str, str]:
    """
    Core logic for creating a Google Reviews scraping job.

    This is the shared implementation used by the /scrape, /api/scrape and
    /api/scrape/google-reviews endpoints. It folds the request fields into a
    single metadata dict, persists the job via ``db.create_job`` and schedules
    ``run_scraping_job`` as a background task.

    Args:
        url: Google Maps URL to scrape.
        webhook_url: Optional webhook URL passed through to ``db.create_job``.
        webhook_secret: Optional webhook secret passed through to ``db.create_job``.
        metadata: Optional caller metadata; copied, never mutated.
        browser_fingerprint: Optional fingerprint stored under ``browser_fingerprint``.
        geolocation: Optional geolocation, stored only when no fingerprint is given.
        requester: Optional requester info stored under ``requester``.
        priority: Job priority stored under ``priority``.
        callback_url: Optional callback URL stored under ``callback_url``.
        scraper_version: Optional scraper version stored under ``scraper_version``.
        scraper_variant: Optional scraper variant stored under ``scraper_variant``.
        job_type: Job type stored under ``job_type`` and echoed in the response.

    Returns:
        Dict with job_id, status, message and job_type.

    Raises:
        HTTPException: 500 if the database is not initialized or if creating or
            scheduling the job fails.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    try:
        job_metadata = _build_job_metadata(
            metadata=metadata,
            browser_fingerprint=browser_fingerprint,
            geolocation=geolocation,
            requester=requester,
            priority=priority,
            callback_url=callback_url,
            scraper_version=scraper_version,
            scraper_variant=scraper_variant,
            job_type=job_type,
        )

        # Create job in database
        job_id = await db.create_job(
            url=url,
            webhook_url=webhook_url,
            webhook_secret=webhook_secret,
            metadata=job_metadata
        )

        # Start scraping job in background. Keep a strong reference until the
        # task completes so it cannot be garbage-collected mid-run.
        task = asyncio.create_task(run_scraping_job(job_id))
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)

        log.info(f"Created and started job {job_id} (type={job_type}, priority={priority})")

        return {
            "job_id": str(job_id),
            "status": "started",
            "message": "Scraping job started successfully",
            "job_type": job_type
        }
    except Exception as e:
        # Top-level boundary for the endpoint: log with traceback, then surface
        # a 500 to the client while preserving the original cause.
        log.exception(f"Error creating scraping job: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}") from e
@app.post("/api/scrape/google-reviews", response_model=Dict[str, str], summary="Start Google Reviews Scraping Job")
async def scrape_google_reviews(request: GoogleReviewsScrapeRequest):
    """
    Start a new Google Reviews scraping job.

    This is the primary endpoint for Phase 2 onwards. It explicitly creates a job
    of type 'google-reviews' with full support for all Phase 2 features:
    - Requester tracking (client_id, source, purpose)
    - Job priority for queue ordering
    - Callback URL (alternative to webhook)
    - Scraper version/variant selection for A/B testing

    The job runs asynchronously in the background. You can:
    - Poll GET /jobs/{job_id} for status
    - Provide webhook_url for automatic notification when complete
    - Subscribe to SSE at /jobs/{job_id}/stream for real-time updates

    Returns the job ID for tracking.
    """
    # The shared helper takes plain strings, so validated URL objects are
    # stringified here; absent URLs stay None.
    webhook_target = str(request.webhook_url) if request.webhook_url else None
    callback_target = str(request.callback_url) if request.callback_url else None

    job_kwargs = {
        "url": str(request.url),
        "webhook_url": webhook_target,
        "webhook_secret": request.webhook_secret,
        "metadata": request.metadata,
        "browser_fingerprint": request.browser_fingerprint,
        "geolocation": request.geolocation,
        "requester": request.requester,
        # priority is Optional on the model; an explicit null collapses to 0.
        "priority": request.priority or 0,
        "callback_url": callback_target,
        "scraper_version": request.scraper_version,
        "scraper_variant": request.scraper_variant,
        "job_type": "google-reviews",
    }
    return await _create_google_reviews_job(**job_kwargs)
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job (Legacy)")
async def start_scrape(request: ScrapeRequest):
"""
Start a new scraping job.
Start a new scraping job (legacy endpoint, routes to google-reviews).
**NOTE**: This endpoint is maintained for backwards compatibility.
For new integrations, use POST /api/scrape/google-reviews instead.
The job runs asynchronously in the background. You can:
- Poll GET /jobs/{job_id} for status
@@ -278,52 +456,51 @@ async def start_scrape(request: ScrapeRequest):
Returns the job ID for tracking.
"""
if not db:
raise HTTPException(status_code=500, detail="Database not initialized")
return await _create_google_reviews_job(
url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret,
metadata=request.metadata,
browser_fingerprint=request.browser_fingerprint,
geolocation=request.geolocation,
requester=request.requester,
priority=request.priority or 0,
callback_url=str(request.callback_url) if request.callback_url else None,
scraper_version=request.scraper_version,
scraper_variant=request.scraper_variant,
job_type="google-reviews"
)
try:
# Merge browser fingerprint into metadata if provided
metadata = request.metadata or {}
if request.browser_fingerprint:
fp = request.browser_fingerprint
metadata['browser_fingerprint'] = {
"userAgent": fp.userAgent,
"timezone": fp.timezone,
"language": fp.language,
"platform": fp.platform,
}
if fp.viewport:
metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
if fp.geolocation:
metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
elif request.geolocation:
metadata['geolocation'] = {
'lat': request.geolocation.lat,
'lng': request.geolocation.lng
}
# Create job in database
job_id = await db.create_job(
url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret,
metadata=metadata
)
@app.post("/api/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
async def api_start_scrape(request: ScrapeRequest):
"""
Start a new scraping job via the /api/scrape endpoint.
# Start scraping job in background
asyncio.create_task(run_scraping_job(job_id))
This endpoint accepts the same request body as /scrape and routes to google-reviews.
For explicit job type control, use POST /api/scrape/google-reviews instead.
log.info(f"Created and started job {job_id}")
The job runs asynchronously in the background. You can:
- Poll GET /jobs/{job_id} for status
- Provide webhook_url for automatic notification when complete
- Subscribe to SSE at /jobs/{job_id}/stream for real-time updates
return {
"job_id": str(job_id),
"status": "started",
"message": "Scraping job started successfully"
}
except Exception as e:
log.error(f"Error creating scraping job: {e}")
raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
Returns the job ID for tracking.
"""
return await _create_google_reviews_job(
url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret,
metadata=request.metadata,
browser_fingerprint=request.browser_fingerprint,
geolocation=request.geolocation,
requester=request.requester,
priority=request.priority or 0,
callback_url=str(request.callback_url) if request.callback_url else None,
scraper_version=request.scraper_version,
scraper_variant=request.scraper_variant,
job_type="google-reviews"
)
@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")