Phases 2-4: Requester support, batches, webhooks, scraper registry
Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
- POST /api/scrape/google-reviews/batch (submit batch)
- GET /api/batches (list batches)
- GET /api/batches/{id} (batch detail)
- DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews
Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
- JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
- JobCallbackDispatcher: Background worker for callback monitoring
- Payload formats per spec (job.completed, job.failed, batch.completed)
- Exponential backoff for retries
- Error classification for failure payloads
Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
- get_scraper(): Version/variant/A/B routing
- _get_weighted_scraper(): Traffic-weighted random selection
- 60-second TTL cache for performance
- register_scraper, deprecate_scraper, update_traffic_allocation
- LegacyScraperRegistry preserved for backwards compatibility
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,9 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Production Google Reviews Scraper API Server with Phase 1 features:
|
||||
Production Google Reviews Scraper API Server with Phase 1 & 2 features:
|
||||
- PostgreSQL storage with JSONB
|
||||
- Webhook delivery with retries
|
||||
- Smart health checks with canary testing
|
||||
- Phase 2: Requester tracking (client_id, source, purpose)
|
||||
- Phase 2: Job priority for queue ordering
|
||||
- Phase 2: Callback URL alternative to webhooks
|
||||
- Phase 2: Scraper version/variant selection for A/B testing
|
||||
- Phase 2: Explicit job type endpoint (/api/scrape/google-reviews)
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
@@ -35,6 +40,7 @@ from workers.chrome_pool import (
|
||||
release_scraping_worker,
|
||||
get_pool_stats
|
||||
)
|
||||
from api.routes import batches_router, set_batches_db
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
@@ -78,6 +84,9 @@ async def lifespan(app: FastAPI):
|
||||
await db.initialize_schema()
|
||||
log.info("Database initialized")
|
||||
|
||||
# Inject database into route modules
|
||||
set_batches_db(db)
|
||||
|
||||
# Initialize health check system with canary monitoring
|
||||
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
|
||||
# health_system = HealthCheckSystem(db)
|
||||
@@ -134,6 +143,9 @@ app.add_middleware(
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# Include routers from api/routes/
|
||||
app.include_router(batches_router)
|
||||
|
||||
|
||||
# ==================== Request/Response Models ====================
|
||||
|
||||
@@ -159,14 +171,44 @@ class BrowserFingerprintModel(BaseModel):
|
||||
platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
|
||||
|
||||
|
||||
class RequesterModel(BaseModel):
    """Describes who asked for a scrape job and why.

    Every field is optional, so a caller may send any subset of them.
    """

    client_id: Optional[str] = Field(
        default=None,
        description="Client identifier",
    )
    source: Optional[str] = Field(
        default=None,
        description="Source of the request (e.g., 'web', 'api', 'internal')",
    )
    purpose: Optional[str] = Field(
        default=None,
        description="Purpose of the scrape (e.g., 'competitor_analysis', 'review_monitoring')",
    )
    # Free-form dictionary; no schema is enforced on its contents here.
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Additional requester metadata",
    )
|
||||
|
||||
|
||||
class ScrapeRequest(BaseModel):
    """Request body for the legacy scrape endpoints, which route to google-reviews.

    Only ``url`` is required; every other field is optional.
    """

    # --- Original fields ---
    url: HttpUrl = Field(
        ...,
        description="Google Maps URL to scrape",
    )
    webhook_url: Optional[HttpUrl] = Field(
        default=None,
        description="Webhook URL for async notifications",
    )
    webhook_secret: Optional[str] = Field(
        default=None,
        description="Secret for webhook HMAC signature",
    )
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional custom metadata",
    )
    geolocation: Optional[GeolocationModel] = Field(
        default=None,
        description="User's geolocation for Chrome",
    )
    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(
        default=None,
        description="User's browser fingerprint",
    )

    # --- Phase 2: optional fields for enhanced job tracking ---
    requester: Optional[RequesterModel] = Field(
        default=None,
        description="Information about who requested this job",
    )
    # Bounded to the inclusive range 0..100 by the ge/le constraints.
    priority: Optional[int] = Field(
        default=0,
        description="Job priority (higher = more important)",
        ge=0,
        le=100,
    )
    callback_url: Optional[HttpUrl] = Field(
        default=None,
        description="URL to call when job completes (alternative to webhook)",
    )
    scraper_version: Optional[str] = Field(
        default=None,
        description="Specific scraper version to use",
    )
    scraper_variant: Optional[str] = Field(
        default=None,
        description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')",
    )
|
||||
|
||||
|
||||
class GoogleReviewsScrapeRequest(BaseModel):
    """Request body for the explicit Google Reviews job-type endpoint.

    Only ``url`` is required; every other field is optional.
    """

    # --- Core fields ---
    url: HttpUrl = Field(
        ...,
        description="Google Maps URL to scrape",
    )
    webhook_url: Optional[HttpUrl] = Field(
        default=None,
        description="Webhook URL for async notifications",
    )
    webhook_secret: Optional[str] = Field(
        default=None,
        description="Secret for webhook HMAC signature",
    )
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional custom metadata",
    )
    geolocation: Optional[GeolocationModel] = Field(
        default=None,
        description="User's geolocation for Chrome",
    )
    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(
        default=None,
        description="User's browser fingerprint",
    )

    # --- Phase 2: optional fields for enhanced job tracking ---
    requester: Optional[RequesterModel] = Field(
        default=None,
        description="Information about who requested this job",
    )
    # Bounded to the inclusive range 0..100 by the ge/le constraints.
    priority: Optional[int] = Field(
        default=0,
        description="Job priority (higher = more important)",
        ge=0,
        le=100,
    )
    callback_url: Optional[HttpUrl] = Field(
        default=None,
        description="URL to call when job completes (alternative to webhook)",
    )
    scraper_version: Optional[str] = Field(
        default=None,
        description="Specific scraper version to use",
    )
    scraper_variant: Optional[str] = Field(
        default=None,
        description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')",
    )
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
@@ -267,10 +309,146 @@ async def root():
|
||||
}
|
||||
|
||||
|
||||
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
|
||||
# Strong references to in-flight scraping tasks. The event loop keeps only weak
# references to tasks, so a task whose handle is dropped may be garbage-collected
# before it finishes. Each task removes itself from this set when it completes.
_background_job_tasks: set = set()


def _build_job_metadata(
    metadata: Optional[Dict[str, Any]],
    browser_fingerprint: Optional["BrowserFingerprintModel"],
    geolocation: Optional["GeolocationModel"],
    requester: Optional["RequesterModel"],
    priority: int,
    callback_url: Optional[str],
    scraper_version: Optional[str],
    scraper_variant: Optional[str],
    job_type: str,
) -> Dict[str, Any]:
    """Assemble the metadata dict stored with a job, without mutating the caller's dict."""
    # Shallow copy so the request's own metadata dict is never modified.
    job_metadata: Dict[str, Any] = metadata.copy() if metadata else {}

    if browser_fingerprint:
        fp = browser_fingerprint
        fingerprint: Dict[str, Any] = {
            "userAgent": fp.userAgent,
            "timezone": fp.timezone,
            "language": fp.language,
            "platform": fp.platform,
        }
        if fp.viewport:
            fingerprint['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
        if fp.geolocation:
            fingerprint['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
        job_metadata['browser_fingerprint'] = fingerprint
    elif geolocation:
        # The top-level geolocation is recorded only when no browser fingerprint
        # was supplied; a fingerprint carries its own (optional) geolocation.
        job_metadata['geolocation'] = {
            'lat': geolocation.lat,
            'lng': geolocation.lng
        }

    # Phase 2: requester info
    if requester:
        job_metadata['requester'] = {
            'client_id': requester.client_id,
            'source': requester.source,
            'purpose': requester.purpose,
            'metadata': requester.metadata
        }

    # Phase 2: job type and priority are always recorded
    job_metadata['job_type'] = job_type
    job_metadata['priority'] = priority

    # Phase 2: callback URL and scraper version/variant are recorded only when given
    if callback_url:
        job_metadata['callback_url'] = callback_url
    if scraper_version:
        job_metadata['scraper_version'] = scraper_version
    if scraper_variant:
        job_metadata['scraper_variant'] = scraper_variant

    return job_metadata


async def _create_google_reviews_job(
    url: str,
    webhook_url: Optional[str] = None,
    webhook_secret: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    browser_fingerprint: Optional[BrowserFingerprintModel] = None,
    geolocation: Optional[GeolocationModel] = None,
    requester: Optional[RequesterModel] = None,
    priority: int = 0,
    callback_url: Optional[str] = None,
    scraper_version: Optional[str] = None,
    scraper_variant: Optional[str] = None,
    job_type: str = "google-reviews"
) -> Dict[str, str]:
    """
    Core logic for creating a Google Reviews scraping job.

    Shared implementation behind the scrape endpoints: it folds the Phase 2
    fields into the job metadata, stores the job via ``db.create_job`` and
    schedules ``run_scraping_job`` as a background task.

    Args:
        url: Google Maps URL to scrape.
        webhook_url: Optional webhook URL passed through to ``db.create_job``.
        webhook_secret: Optional webhook secret passed through to ``db.create_job``.
        metadata: Optional caller metadata; copied, never mutated.
        browser_fingerprint: Optional fingerprint stored under ``browser_fingerprint``.
        geolocation: Optional geolocation, stored only when no fingerprint is given.
        requester: Optional requester info stored under ``requester``.
        priority: Job priority stored under ``priority``.
        callback_url: Optional callback URL stored under ``callback_url``.
        scraper_version: Optional scraper version stored under ``scraper_version``.
        scraper_variant: Optional scraper variant stored under ``scraper_variant``.
        job_type: Job type stored under ``job_type`` and echoed in the response.

    Returns:
        Dict with job_id, status, message and job_type.

    Raises:
        HTTPException: 500 if the database is not initialized or job creation fails.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    try:
        job_metadata = _build_job_metadata(
            metadata=metadata,
            browser_fingerprint=browser_fingerprint,
            geolocation=geolocation,
            requester=requester,
            priority=priority,
            callback_url=callback_url,
            scraper_version=scraper_version,
            scraper_variant=scraper_variant,
            job_type=job_type,
        )

        # Create job in database
        job_id = await db.create_job(
            url=url,
            webhook_url=webhook_url,
            webhook_secret=webhook_secret,
            metadata=job_metadata
        )

        # Start scraping job in background, keeping a strong reference until it
        # finishes so the task cannot be garbage-collected mid-execution.
        task = asyncio.create_task(run_scraping_job(job_id))
        _background_job_tasks.add(task)
        task.add_done_callback(_background_job_tasks.discard)

        log.info(f"Created and started job {job_id} (type={job_type}, priority={priority})")

        return {
            "job_id": str(job_id),
            "status": "started",
            "message": "Scraping job started successfully",
            "job_type": job_type
        }

    except Exception as e:
        # log.exception records the traceback alongside the message.
        log.exception(f"Error creating scraping job: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}") from e
|
||||
|
||||
|
||||
@app.post("/api/scrape/google-reviews", response_model=Dict[str, str], summary="Start Google Reviews Scraping Job")
async def scrape_google_reviews(request: GoogleReviewsScrapeRequest):
    """
    Start a new Google Reviews scraping job.

    Primary endpoint from Phase 2 onwards. It creates a job of type
    'google-reviews' and forwards every Phase 2 option from the request:
    - Requester tracking (client_id, source, purpose)
    - Job priority for queue ordering
    - Callback URL (alternative to webhook)
    - Scraper version/variant selection for A/B testing

    The job runs asynchronously in the background. You can:
    - Poll GET /jobs/{job_id} for status
    - Provide webhook_url for automatic notification when complete
    - Subscribe to SSE at /jobs/{job_id}/stream for real-time updates

    Returns the job ID for tracking.
    """
    # URL fields are validated URL objects on the model; the shared job
    # creator takes plain strings (or None when the field was omitted).
    webhook_target = str(request.webhook_url) if request.webhook_url else None
    callback_target = str(request.callback_url) if request.callback_url else None

    # priority is Optional on the model; fall back to 0 when it is None.
    job_priority = request.priority or 0

    result = await _create_google_reviews_job(
        url=str(request.url),
        webhook_url=webhook_target,
        webhook_secret=request.webhook_secret,
        metadata=request.metadata,
        browser_fingerprint=request.browser_fingerprint,
        geolocation=request.geolocation,
        requester=request.requester,
        priority=job_priority,
        callback_url=callback_target,
        scraper_version=request.scraper_version,
        scraper_variant=request.scraper_variant,
        job_type="google-reviews",
    )
    return result
|
||||
|
||||
|
||||
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job (Legacy)")
|
||||
async def start_scrape(request: ScrapeRequest):
|
||||
"""
|
||||
Start a new scraping job.
|
||||
Start a new scraping job (legacy endpoint, routes to google-reviews).
|
||||
|
||||
**NOTE**: This endpoint is maintained for backwards compatibility.
|
||||
For new integrations, use POST /api/scrape/google-reviews instead.
|
||||
|
||||
The job runs asynchronously in the background. You can:
|
||||
- Poll GET /jobs/{job_id} for status
|
||||
@@ -278,52 +456,51 @@ async def start_scrape(request: ScrapeRequest):
|
||||
|
||||
Returns the job ID for tracking.
|
||||
"""
|
||||
if not db:
|
||||
raise HTTPException(status_code=500, detail="Database not initialized")
|
||||
return await _create_google_reviews_job(
|
||||
url=str(request.url),
|
||||
webhook_url=str(request.webhook_url) if request.webhook_url else None,
|
||||
webhook_secret=request.webhook_secret,
|
||||
metadata=request.metadata,
|
||||
browser_fingerprint=request.browser_fingerprint,
|
||||
geolocation=request.geolocation,
|
||||
requester=request.requester,
|
||||
priority=request.priority or 0,
|
||||
callback_url=str(request.callback_url) if request.callback_url else None,
|
||||
scraper_version=request.scraper_version,
|
||||
scraper_variant=request.scraper_variant,
|
||||
job_type="google-reviews"
|
||||
)
|
||||
|
||||
try:
|
||||
# Merge browser fingerprint into metadata if provided
|
||||
metadata = request.metadata or {}
|
||||
if request.browser_fingerprint:
|
||||
fp = request.browser_fingerprint
|
||||
metadata['browser_fingerprint'] = {
|
||||
"userAgent": fp.userAgent,
|
||||
"timezone": fp.timezone,
|
||||
"language": fp.language,
|
||||
"platform": fp.platform,
|
||||
}
|
||||
if fp.viewport:
|
||||
metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
|
||||
if fp.geolocation:
|
||||
metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
|
||||
elif request.geolocation:
|
||||
metadata['geolocation'] = {
|
||||
'lat': request.geolocation.lat,
|
||||
'lng': request.geolocation.lng
|
||||
}
|
||||
|
||||
# Create job in database
|
||||
job_id = await db.create_job(
|
||||
url=str(request.url),
|
||||
webhook_url=str(request.webhook_url) if request.webhook_url else None,
|
||||
webhook_secret=request.webhook_secret,
|
||||
metadata=metadata
|
||||
)
|
||||
@app.post("/api/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
|
||||
async def api_start_scrape(request: ScrapeRequest):
|
||||
"""
|
||||
Start a new scraping job via the /api/scrape endpoint.
|
||||
|
||||
# Start scraping job in background
|
||||
asyncio.create_task(run_scraping_job(job_id))
|
||||
This endpoint accepts the same request body as /scrape and routes to google-reviews.
|
||||
For explicit job type control, use POST /api/scrape/google-reviews instead.
|
||||
|
||||
log.info(f"Created and started job {job_id}")
|
||||
The job runs asynchronously in the background. You can:
|
||||
- Poll GET /jobs/{job_id} for status
|
||||
- Provide webhook_url for automatic notification when complete
|
||||
- Subscribe to SSE at /jobs/{job_id}/stream for real-time updates
|
||||
|
||||
return {
|
||||
"job_id": str(job_id),
|
||||
"status": "started",
|
||||
"message": "Scraping job started successfully"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error creating scraping job: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
|
||||
Returns the job ID for tracking.
|
||||
"""
|
||||
return await _create_google_reviews_job(
|
||||
url=str(request.url),
|
||||
webhook_url=str(request.webhook_url) if request.webhook_url else None,
|
||||
webhook_secret=request.webhook_secret,
|
||||
metadata=request.metadata,
|
||||
browser_fingerprint=request.browser_fingerprint,
|
||||
geolocation=request.geolocation,
|
||||
requester=request.requester,
|
||||
priority=request.priority or 0,
|
||||
callback_url=str(request.callback_url) if request.callback_url else None,
|
||||
scraper_version=request.scraper_version,
|
||||
scraper_variant=request.scraper_variant,
|
||||
job_type="google-reviews"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")
|
||||
|
||||
Reference in New Issue
Block a user