Phases 2-4: Requester support, batches, webhooks, scraper registry

Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
  - POST /api/scrape/google-reviews/batch (submit batch)
  - GET /api/batches (list batches)
  - GET /api/batches/{id} (batch detail)
  - DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews

Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
  - JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
  - JobCallbackDispatcher: Background worker for callback monitoring
  - Payload formats per spec (job.completed, job.failed, batch.completed)
  - Exponential backoff for retries
  - Error classification for failure payloads

Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
  - get_scraper(): Version/variant/A/B routing
  - _get_weighted_scraper(): Traffic-weighted random selection
  - 60-second TTL cache for performance
  - register_scraper, deprecate_scraper, update_traffic_allocation
  - LegacyScraperRegistry preserved for backwards compatibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 15:35:58 +00:00
parent 2412996c54
commit 788ef84756
8 changed files with 2503 additions and 98 deletions
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -1,9 +1,14 @@
 #!/usr/bin/env python3
 """
-Production Google Reviews Scraper API Server with Phase 1 features:
+Production Google Reviews Scraper API Server with Phase 1 & 2 features:
 - PostgreSQL storage with JSONB
 - Webhook delivery with retries
 - Smart health checks with canary testing
+- Phase 2: Requester tracking (client_id, source, purpose)
+- Phase 2: Job priority for queue ordering
+- Phase 2: Callback URL alternative to webhooks
+- Phase 2: Scraper version/variant selection for A/B testing
+- Phase 2: Explicit job type endpoint (/api/scrape/google-reviews)
 """
 import asyncio
 import json
@@ -35,6 +40,7 @@ from workers.chrome_pool import (
    release_scraping_worker,
    get_pool_stats
 )
+from api.routes import batches_router, set_batches_db

 # Configure logging
 logging.basicConfig(
@@ -78,6 +84,9 @@ async def lifespan(app: FastAPI):
    await db.initialize_schema()
    log.info("Database initialized")

+    # Inject database into route modules
+    set_batches_db(db)
+
    # Initialize health check system with canary monitoring
    # DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
    # health_system = HealthCheckSystem(db)
@@ -134,6 +143,9 @@ app.add_middleware(
    allow_headers=["*"],
 )

+# Include routers from api/routes/
+app.include_router(batches_router)
+

 # ==================== Request/Response Models ====================

@@ -159,14 +171,44 @@ class BrowserFingerprintModel(BaseModel):
    platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")


+class RequesterModel(BaseModel):
+    """Information about the requester of a scrape job"""
+    client_id: Optional[str] = Field(None, description="Client identifier")
+    source: Optional[str] = Field(None, description="Source of the request (e.g., 'web', 'api', 'internal')")
+    purpose: Optional[str] = Field(None, description="Purpose of the scrape (e.g., 'competitor_analysis', 'review_monitoring')")
+    metadata: Optional[Dict[str, Any]] = Field(None, description="Additional requester metadata")
+
+
 class ScrapeRequest(BaseModel):
-    """Request model for starting a scrape job"""
+    """Request model for starting a scrape job (legacy endpoint, routes to google-reviews)"""
    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
    webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
    webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
    geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
+    # Phase 2: New optional fields for enhanced job tracking
+    requester: Optional[RequesterModel] = Field(None, description="Information about who requested this job")
+    priority: Optional[int] = Field(0, description="Job priority (higher = more important)", ge=0, le=100)
+    callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
+    scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
+    scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
+
+
+class GoogleReviewsScrapeRequest(BaseModel):
+    """Request model for Google Reviews scraping - explicit job type endpoint"""
+    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
+    webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
+    webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
+    metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
+    geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
+    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
+    # Phase 2: New optional fields for enhanced job tracking
+    requester: Optional[RequesterModel] = Field(None, description="Information about who requested this job")
+    priority: Optional[int] = Field(0, description="Job priority (higher = more important)", ge=0, le=100)
+    callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
+    scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
+    scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")


 class JobResponse(BaseModel):
@@ -267,10 +309,146 @@ async def root():
    }


-@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
+async def _create_google_reviews_job(
+    url: str,
+    webhook_url: Optional[str] = None,
+    webhook_secret: Optional[str] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    browser_fingerprint: Optional[BrowserFingerprintModel] = None,
+    geolocation: Optional[GeolocationModel] = None,
+    requester: Optional[RequesterModel] = None,
+    priority: int = 0,
+    callback_url: Optional[str] = None,
+    scraper_version: Optional[str] = None,
+    scraper_variant: Optional[str] = None,
+    job_type: str = "google-reviews"
+) -> Dict[str, str]:
+    """
+    Core logic for creating a Google Reviews scraping job.
+
+    This is the shared implementation used by the /scrape, /api/scrape, and /api/scrape/google-reviews endpoints.
+
+    Returns:
+        Dict with job_id, status, and message
+    """
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    try:
+        # Build metadata with all Phase 2 fields
+        job_metadata = metadata.copy() if metadata else {}
+
+        # Add browser fingerprint if provided
+        if browser_fingerprint:
+            fp = browser_fingerprint
+            job_metadata['browser_fingerprint'] = {
+                "userAgent": fp.userAgent,
+                "timezone": fp.timezone,
+                "language": fp.language,
+                "platform": fp.platform,
+            }
+            if fp.viewport:
+                job_metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
+            if fp.geolocation:
+                job_metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
+        elif geolocation:
+            job_metadata['geolocation'] = {
+                'lat': geolocation.lat,
+                'lng': geolocation.lng
+            }
+
+        # Phase 2: Add requester info if provided
+        if requester:
+            job_metadata['requester'] = {
+                'client_id': requester.client_id,
+                'source': requester.source,
+                'purpose': requester.purpose,
+                'metadata': requester.metadata
+            }
+
+        # Phase 2: Add job type for multi-scraper support
+        job_metadata['job_type'] = job_type
+
+        # Phase 2: Add priority for job queue ordering
+        job_metadata['priority'] = priority
+
+        # Phase 2: Add callback_url (alternative to webhook)
+        if callback_url:
+            job_metadata['callback_url'] = callback_url
+
+        # Phase 2: Add scraper version/variant for A/B testing and version control
+        if scraper_version:
+            job_metadata['scraper_version'] = scraper_version
+        if scraper_variant:
+            job_metadata['scraper_variant'] = scraper_variant
+
+        # Create job in database
+        job_id = await db.create_job(
+            url=url,
+            webhook_url=webhook_url,
+            webhook_secret=webhook_secret,
+            metadata=job_metadata
+        )
+
+        # Start scraping job in background
+        asyncio.create_task(run_scraping_job(job_id))
+
+        log.info(f"Created and started job {job_id} (type={job_type}, priority={priority})")
+
+        return {
+            "job_id": str(job_id),
+            "status": "started",
+            "message": "Scraping job started successfully",
+            "job_type": job_type
+        }
+
+    except Exception as e:
+        log.error(f"Error creating scraping job: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
+
+
+@app.post("/api/scrape/google-reviews", response_model=Dict[str, str], summary="Start Google Reviews Scraping Job")
+async def scrape_google_reviews(request: GoogleReviewsScrapeRequest):
+    """
+    Start a new Google Reviews scraping job.
+
+    This is the primary endpoint for Phase 2 onwards. It explicitly creates a job
+    of type 'google-reviews' with full support for all Phase 2 features:
+    - Requester tracking (client_id, source, purpose)
+    - Job priority for queue ordering
+    - Callback URL (alternative to webhook)
+    - Scraper version/variant selection for A/B testing
+
+    The job runs asynchronously in the background. You can:
+    - Poll GET /jobs/{job_id} for status
+    - Provide webhook_url for automatic notification when complete
+    - Subscribe to SSE at /jobs/{job_id}/stream for real-time updates
+
+    Returns the job ID for tracking.
+    """
+    return await _create_google_reviews_job(
+        url=str(request.url),
+        webhook_url=str(request.webhook_url) if request.webhook_url else None,
+        webhook_secret=request.webhook_secret,
+        metadata=request.metadata,
+        browser_fingerprint=request.browser_fingerprint,
+        geolocation=request.geolocation,
+        requester=request.requester,
+        priority=request.priority or 0,
+        callback_url=str(request.callback_url) if request.callback_url else None,
+        scraper_version=request.scraper_version,
+        scraper_variant=request.scraper_variant,
+        job_type="google-reviews"
+    )
+
+
+@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job (Legacy)")
 async def start_scrape(request: ScrapeRequest):
    """
-    Start a new scraping job.
+    Start a new scraping job (legacy endpoint, routes to google-reviews).
+
+    **NOTE**: This endpoint is maintained for backwards compatibility.
+    For new integrations, use POST /api/scrape/google-reviews instead.

    The job runs asynchronously in the background. You can:
    - Poll GET /jobs/{job_id} for status
@@ -278,52 +456,51 @@ async def start_scrape(request: ScrapeRequest):

    Returns the job ID for tracking.
    """
-    if not db:
-        raise HTTPException(status_code=500, detail="Database not initialized")
+    return await _create_google_reviews_job(
+        url=str(request.url),
+        webhook_url=str(request.webhook_url) if request.webhook_url else None,
+        webhook_secret=request.webhook_secret,
+        metadata=request.metadata,
+        browser_fingerprint=request.browser_fingerprint,
+        geolocation=request.geolocation,
+        requester=request.requester,
+        priority=request.priority or 0,
+        callback_url=str(request.callback_url) if request.callback_url else None,
+        scraper_version=request.scraper_version,
+        scraper_variant=request.scraper_variant,
+        job_type="google-reviews"
+    )

-    try:
-        # Merge browser fingerprint into metadata if provided
-        metadata = request.metadata or {}
-        if request.browser_fingerprint:
-            fp = request.browser_fingerprint
-            metadata['browser_fingerprint'] = {
-                "userAgent": fp.userAgent,
-                "timezone": fp.timezone,
-                "language": fp.language,
-                "platform": fp.platform,
-            }
-            if fp.viewport:
-                metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
-            if fp.geolocation:
-                metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
-        elif request.geolocation:
-            metadata['geolocation'] = {
-                'lat': request.geolocation.lat,
-                'lng': request.geolocation.lng
-            }

-        # Create job in database
-        job_id = await db.create_job(
-            url=str(request.url),
-            webhook_url=str(request.webhook_url) if request.webhook_url else None,
-            webhook_secret=request.webhook_secret,
-            metadata=metadata
-        )
+@app.post("/api/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
+async def api_start_scrape(request: ScrapeRequest):
+    """
+    Start a new scraping job via the /api/scrape endpoint.

-        # Start scraping job in background
-        asyncio.create_task(run_scraping_job(job_id))
+    This endpoint accepts the same request body as /scrape and routes to google-reviews.
+    For explicit job type control, use POST /api/scrape/google-reviews instead.

-        log.info(f"Created and started job {job_id}")
+    The job runs asynchronously in the background. You can:
+    - Poll GET /jobs/{job_id} for status
+    - Provide webhook_url for automatic notification when complete
+    - Subscribe to SSE at /jobs/{job_id}/stream for real-time updates

-        return {
-            "job_id": str(job_id),
-            "status": "started",
-            "message": "Scraping job started successfully"
-        }
-
-    except Exception as e:
-        log.error(f"Error creating scraping job: {e}")
-        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
+    Returns the job ID for tracking.
+    """
+    return await _create_google_reviews_job(
+        url=str(request.url),
+        webhook_url=str(request.webhook_url) if request.webhook_url else None,
+        webhook_secret=request.webhook_secret,
+        metadata=request.metadata,
+        browser_fingerprint=request.browser_fingerprint,
+        geolocation=request.geolocation,
+        requester=request.requester,
+        priority=request.priority or 0,
+        callback_url=str(request.callback_url) if request.callback_url else None,
+        scraper_version=request.scraper_version,
+        scraper_variant=request.scraper_variant,
+        job_type="google-reviews"
+    )


@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")