Phases 2-4: Requester support, batches, webhooks, scraper registry

Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
  - POST /api/scrape/google-reviews/batch (submit batch)
  - GET /api/batches (list batches)
  - GET /api/batches/{id} (batch detail)
  - DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews

Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
  - JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
  - JobCallbackDispatcher: Background worker for callback monitoring
  - Payload formats per spec (job.completed, job.failed, batch.completed)
  - Exponential backoff for retries
  - Error classification for failure payloads

Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
  - get_scraper(): Version/variant/A/B routing
  - _get_weighted_scraper(): Traffic-weighted random selection
  - 60-second TTL cache for performance
  - register_scraper, deprecate_scraper, update_traffic_allocation
  - LegacyScraperRegistry preserved for backwards compatibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:35:58 +00:00
parent 2412996c54
commit 788ef84756
8 changed files with 2503 additions and 98 deletions

View File

@@ -104,7 +104,7 @@ reviewiq/ # Will rename from google-reviews-scraper-pro
| Phase | Description | Status | | Phase | Description | Status |
|-------|-------------|--------| |-------|-------------|--------|
| 0 | Project restructure (move files to new locations) | ✅ COMPLETE | | 0 | Project restructure (move files to new locations) | ✅ COMPLETE |
| 1 | Database migrations (new fields + tables) | Not started | | 1 | Database migrations (new fields + tables) | ✅ COMPLETE |
| 2 | Requester & batch support | Not started | | 2 | Requester & batch support | Not started |
| 3 | Webhooks | Not started | | 3 | Webhooks | Not started |
| 4 | Scraper versioning & registry | Not started | | 4 | Scraper versioning & registry | Not started |

View File

@@ -0,0 +1,11 @@
"""
API Routes for ReviewIQ.
This module exports all route modules for easy import into the main server.
"""
from api.routes.batches import router as batches_router, set_database as set_batches_db
__all__ = [
'batches_router',
'set_batches_db',
]

672
api/routes/batches.py Normal file
View File

@@ -0,0 +1,672 @@
#!/usr/bin/env python3
"""
Batch submission API for ReviewIQ Phase 2.
Enables submitting multiple URLs as a batch with:
- Batch-level tracking and callback
- Individual job creation for each URL
- Batch status aggregation
- Batch cancellation
"""
import asyncio
import json
import logging
from datetime import datetime
from typing import Optional, List, Dict, Any
from uuid import UUID, uuid4
from fastapi import APIRouter, HTTPException, Query, Depends
from pydantic import BaseModel, HttpUrl, Field, validator
from core.database import DatabaseManager
from core.enums import JobStatus
log = logging.getLogger(__name__)
# Create router
router = APIRouter(prefix="/api", tags=["batches"])
# ==================== Pydantic Models ====================
class RequesterModel(BaseModel):
"""Requester information for batch tracking"""
client_id: str = Field(..., description="Client identifier (e.g., 'veritas_123')")
source: str = Field(..., description="Source of the request (e.g., 'veritasreview.com')")
purpose: Optional[str] = Field(None, description="Purpose of the batch (e.g., 'prospect_screening')")
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Additional requester metadata")
class BatchSubmitRequest(BaseModel):
"""Request model for submitting a batch of URLs"""
name: str = Field(..., description="Batch name for identification", min_length=1, max_length=255)
urls: List[HttpUrl] = Field(..., description="List of Google Maps URLs to scrape", min_items=1, max_items=1000)
requester: RequesterModel = Field(..., description="Requester information")
priority: int = Field(default=0, description="Priority level (higher = more urgent)", ge=-10, le=10)
callback_url: Optional[HttpUrl] = Field(None, description="Webhook URL called when ALL jobs complete")
@validator('urls')
def validate_unique_urls(cls, v):
"""Ensure all URLs are unique"""
unique_urls = list(set(str(url) for url in v))
if len(unique_urls) != len(v):
raise ValueError('Duplicate URLs are not allowed in a batch')
return v
class BatchSubmitResponse(BaseModel):
"""Response model for batch submission"""
batch_id: str = Field(..., description="Unique batch identifier")
job_ids: List[str] = Field(..., description="List of created job IDs")
total_jobs: int = Field(..., description="Total number of jobs in the batch")
class BatchJobSummary(BaseModel):
"""Summary of a job within a batch"""
job_id: str
url: str
status: str
reviews_count: Optional[int] = None
error_message: Optional[str] = None
created_at: str
completed_at: Optional[str] = None
class BatchSummary(BaseModel):
"""Summary model for batch listing"""
batch_id: str
name: str
client_id: str
status: str # pending, running, completed, partial, failed
total_jobs: int
pending_jobs: int
running_jobs: int
completed_jobs: int
failed_jobs: int
total_reviews: int
created_at: str
updated_at: Optional[str] = None
completed_at: Optional[str] = None
class BatchDetailResponse(BaseModel):
"""Detailed batch information with job list"""
batch_id: str
name: str
requester: RequesterModel
priority: int
callback_url: Optional[str] = None
status: str
total_jobs: int
pending_jobs: int
running_jobs: int
completed_jobs: int
failed_jobs: int
total_reviews: int
created_at: str
updated_at: Optional[str] = None
completed_at: Optional[str] = None
jobs: List[BatchJobSummary]
class BatchCancelResponse(BaseModel):
"""Response model for batch cancellation"""
batch_id: str
cancelled_jobs: int
already_completed: int
message: str
# ==================== Database Helper Functions ====================
# These will be added to core.database in a future refactor,
# but for now we implement them here for the batch functionality.
async def create_batch(
db: DatabaseManager,
name: str,
requester: RequesterModel,
priority: int = 0,
callback_url: Optional[str] = None
) -> UUID:
"""
Create a new batch record in the database.
Returns:
UUID of created batch
"""
async with db.pool.acquire() as conn:
# First ensure batch table exists
await conn.execute("""
CREATE TABLE IF NOT EXISTS batches (
batch_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(255) NOT NULL,
client_id VARCHAR(255) NOT NULL,
requester JSONB NOT NULL,
priority INTEGER NOT NULL DEFAULT 0,
callback_url TEXT,
callback_delivered BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP,
completed_at TIMESTAMP
);
""")
# Create index on client_id if not exists
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_batches_client_id ON batches(client_id);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_batches_created_at ON batches(created_at DESC);
""")
# Ensure batch_id column exists in jobs table
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS batch_id UUID REFERENCES batches(batch_id) ON DELETE SET NULL;
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_batch_id ON jobs(batch_id) WHERE batch_id IS NOT NULL;
""")
batch_id = await conn.fetchval("""
INSERT INTO batches (name, client_id, requester, priority, callback_url)
VALUES ($1, $2, $3, $4, $5)
RETURNING batch_id
""", name, requester.client_id, json.dumps(requester.dict()), priority, callback_url)
log.info(f"Created batch {batch_id}: '{name}' for client {requester.client_id}")
return batch_id
async def create_job_in_batch(
db: DatabaseManager,
batch_id: UUID,
url: str,
priority: int = 0,
metadata: Optional[Dict[str, Any]] = None
) -> UUID:
"""
Create a job that belongs to a batch.
Returns:
UUID of created job
"""
job_metadata = metadata or {}
job_metadata['batch_id'] = str(batch_id)
job_metadata['priority'] = priority
async with db.pool.acquire() as conn:
job_id = await conn.fetchval("""
INSERT INTO jobs (url, batch_id, metadata)
VALUES ($1, $2, $3)
RETURNING job_id
""", url, batch_id, json.dumps(job_metadata))
return job_id
async def get_batch(db: DatabaseManager, batch_id: UUID) -> Optional[Dict[str, Any]]:
"""
Get batch by ID with aggregated job stats.
Returns:
Batch dictionary with job stats or None if not found
"""
async with db.pool.acquire() as conn:
# Get batch basic info
batch_row = await conn.fetchrow("""
SELECT
batch_id,
name,
client_id,
requester,
priority,
callback_url,
callback_delivered,
created_at,
updated_at,
completed_at
FROM batches
WHERE batch_id = $1
""", batch_id)
if not batch_row:
return None
# Get job stats for this batch
stats = await conn.fetchrow("""
SELECT
COUNT(*) as total_jobs,
COUNT(*) FILTER (WHERE status = 'pending') as pending_jobs,
COUNT(*) FILTER (WHERE status = 'running') as running_jobs,
COUNT(*) FILTER (WHERE status = 'completed') as completed_jobs,
COUNT(*) FILTER (WHERE status IN ('failed', 'partial')) as failed_jobs,
COALESCE(SUM(reviews_count), 0) as total_reviews
FROM jobs
WHERE batch_id = $1
""", batch_id)
result = dict(batch_row)
result.update(dict(stats))
# Calculate batch status
total = stats['total_jobs']
completed = stats['completed_jobs']
failed = stats['failed_jobs']
running = stats['running_jobs']
if total == 0:
result['status'] = 'pending'
elif completed + failed == total:
if failed == total:
result['status'] = 'failed'
elif failed > 0:
result['status'] = 'partial'
else:
result['status'] = 'completed'
elif running > 0 or completed > 0:
result['status'] = 'running'
else:
result['status'] = 'pending'
return result
async def get_batch_jobs(db: DatabaseManager, batch_id: UUID) -> List[Dict[str, Any]]:
"""
Get all jobs in a batch.
Returns:
List of job dictionaries
"""
async with db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
job_id,
url,
status,
reviews_count,
error_message,
created_at,
completed_at
FROM jobs
WHERE batch_id = $1
ORDER BY created_at ASC
""", batch_id)
return [dict(row) for row in rows]
async def list_batches(
db: DatabaseManager,
client_id: Optional[str] = None,
status: Optional[str] = None,
limit: int = 100,
offset: int = 0
) -> List[Dict[str, Any]]:
"""
List batches with optional filtering.
Returns:
List of batch summary dictionaries
"""
async with db.pool.acquire() as conn:
# Build query dynamically based on filters
where_clauses = []
params = []
param_idx = 1
if client_id:
where_clauses.append(f"b.client_id = ${param_idx}")
params.append(client_id)
param_idx += 1
where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
query = f"""
SELECT
b.batch_id,
b.name,
b.client_id,
b.created_at,
b.updated_at,
b.completed_at,
COUNT(j.job_id) as total_jobs,
COUNT(j.job_id) FILTER (WHERE j.status = 'pending') as pending_jobs,
COUNT(j.job_id) FILTER (WHERE j.status = 'running') as running_jobs,
COUNT(j.job_id) FILTER (WHERE j.status = 'completed') as completed_jobs,
COUNT(j.job_id) FILTER (WHERE j.status IN ('failed', 'partial')) as failed_jobs,
COALESCE(SUM(j.reviews_count), 0) as total_reviews
FROM batches b
LEFT JOIN jobs j ON j.batch_id = b.batch_id
{where_sql}
GROUP BY b.batch_id
ORDER BY b.created_at DESC
LIMIT ${param_idx} OFFSET ${param_idx + 1}
"""
params.extend([limit, offset])
rows = await conn.fetch(query, *params)
results = []
for row in rows:
batch = dict(row)
# Calculate status
total = batch['total_jobs']
completed = batch['completed_jobs']
failed = batch['failed_jobs']
running = batch['running_jobs']
if total == 0:
batch['status'] = 'pending'
elif completed + failed == total:
if failed == total:
batch['status'] = 'failed'
elif failed > 0:
batch['status'] = 'partial'
else:
batch['status'] = 'completed'
elif running > 0 or completed > 0:
batch['status'] = 'running'
else:
batch['status'] = 'pending'
# Filter by status if specified
if status and batch['status'] != status:
continue
results.append(batch)
return results
async def cancel_batch_jobs(db: DatabaseManager, batch_id: UUID) -> tuple:
"""
Cancel all pending jobs in a batch.
Returns:
Tuple of (cancelled_count, already_completed_count)
"""
async with db.pool.acquire() as conn:
# Count already completed/failed jobs
already_done = await conn.fetchval("""
SELECT COUNT(*)
FROM jobs
WHERE batch_id = $1 AND status IN ('completed', 'failed', 'partial')
""", batch_id)
# Cancel pending and running jobs
result = await conn.execute("""
UPDATE jobs
SET status = 'cancelled', completed_at = NOW()
WHERE batch_id = $1 AND status IN ('pending', 'running')
""", batch_id)
cancelled_count = int(result.split()[-1])
# Update batch completed_at if all jobs are now done
await conn.execute("""
UPDATE batches
SET updated_at = NOW(),
completed_at = CASE
WHEN NOT EXISTS (
SELECT 1 FROM jobs
WHERE batch_id = $1 AND status IN ('pending', 'running')
) THEN NOW()
ELSE completed_at
END
WHERE batch_id = $1
""", batch_id)
log.info(f"Cancelled {cancelled_count} jobs in batch {batch_id}")
return cancelled_count, already_done or 0
# ==================== Dependency Injection ====================
# Database instance will be injected from the main app
# This is a placeholder that should be overridden when including the router
_db: Optional[DatabaseManager] = None
def set_database(db: DatabaseManager):
"""Set the database instance for the router"""
global _db
_db = db
def get_db() -> DatabaseManager:
"""Dependency to get database instance"""
if _db is None:
raise HTTPException(status_code=500, detail="Database not initialized")
return _db
# ==================== API Endpoints ====================
@router.post(
"/scrape/google-reviews/batch",
response_model=BatchSubmitResponse,
summary="Submit Batch of URLs",
description="Submit multiple Google Maps URLs as a batch for scraping"
)
async def submit_batch(
request: BatchSubmitRequest,
db: DatabaseManager = Depends(get_db)
):
"""
Submit multiple URLs as a batch for scraping.
Each URL becomes an individual job that can be tracked separately,
while the batch provides aggregate status and an optional callback
when all jobs complete.
- **name**: Descriptive name for the batch (e.g., "Q1 Prospects")
- **urls**: List of Google Maps URLs (1-1000 URLs)
- **requester**: Client identification for tracking
- **priority**: Higher priority batches may be processed first (default: 0)
- **callback_url**: Webhook called when ALL jobs in the batch complete
Returns batch_id and list of individual job_ids for tracking.
"""
try:
# Create the batch record
batch_id = await create_batch(
db=db,
name=request.name,
requester=request.requester,
priority=request.priority,
callback_url=str(request.callback_url) if request.callback_url else None
)
# Create individual jobs for each URL
job_ids = []
for url in request.urls:
job_id = await create_job_in_batch(
db=db,
batch_id=batch_id,
url=str(url),
priority=request.priority,
metadata={
'requester': request.requester.dict(),
'batch_name': request.name
}
)
job_ids.append(str(job_id))
log.info(f"Created batch {batch_id} with {len(job_ids)} jobs for client {request.requester.client_id}")
return BatchSubmitResponse(
batch_id=str(batch_id),
job_ids=job_ids,
total_jobs=len(job_ids)
)
except Exception as e:
log.error(f"Error creating batch: {e}")
raise HTTPException(status_code=500, detail=f"Failed to create batch: {str(e)}")
@router.get(
"/batches",
response_model=List[BatchSummary],
summary="List Batches",
description="List batches with optional filtering by client_id and status"
)
async def list_batches_endpoint(
client_id: Optional[str] = Query(None, description="Filter by client ID"),
status: Optional[str] = Query(None, description="Filter by batch status (pending, running, completed, partial, failed)"),
limit: int = Query(100, description="Maximum number of batches to return", ge=1, le=1000),
offset: int = Query(0, description="Number of batches to skip", ge=0),
db: DatabaseManager = Depends(get_db)
):
"""
List batches with optional filters.
- **client_id**: Filter to only show batches for a specific client
- **status**: Filter by batch status:
- pending: No jobs have started yet
- running: At least one job is running or completed
- completed: All jobs completed successfully
- partial: All jobs done, but some failed
- failed: All jobs failed
- **limit**: Maximum results (default: 100)
- **offset**: Skip first N results for pagination
"""
try:
batches = await list_batches(
db=db,
client_id=client_id,
status=status,
limit=limit,
offset=offset
)
return [
BatchSummary(
batch_id=str(b['batch_id']),
name=b['name'],
client_id=b['client_id'],
status=b['status'],
total_jobs=b['total_jobs'],
pending_jobs=b['pending_jobs'],
running_jobs=b['running_jobs'],
completed_jobs=b['completed_jobs'],
failed_jobs=b['failed_jobs'],
total_reviews=b['total_reviews'],
created_at=b['created_at'].isoformat(),
updated_at=b['updated_at'].isoformat() if b.get('updated_at') else None,
completed_at=b['completed_at'].isoformat() if b.get('completed_at') else None
)
for b in batches
]
except Exception as e:
log.error(f"Error listing batches: {e}")
raise HTTPException(status_code=500, detail=f"Failed to list batches: {str(e)}")
@router.get(
"/batches/{batch_id}",
response_model=BatchDetailResponse,
summary="Get Batch Details",
description="Get detailed batch information including all jobs"
)
async def get_batch_details(
batch_id: UUID,
db: DatabaseManager = Depends(get_db)
):
"""
Get detailed information about a specific batch.
Returns batch metadata, aggregate statistics, and a list of all jobs
with their individual statuses.
"""
try:
batch = await get_batch(db, batch_id)
if not batch:
raise HTTPException(status_code=404, detail="Batch not found")
jobs = await get_batch_jobs(db, batch_id)
# Parse requester from JSONB
requester_data = batch['requester']
if isinstance(requester_data, str):
requester_data = json.loads(requester_data)
return BatchDetailResponse(
batch_id=str(batch['batch_id']),
name=batch['name'],
requester=RequesterModel(**requester_data),
priority=batch['priority'],
callback_url=batch.get('callback_url'),
status=batch['status'],
total_jobs=batch['total_jobs'],
pending_jobs=batch['pending_jobs'],
running_jobs=batch['running_jobs'],
completed_jobs=batch['completed_jobs'],
failed_jobs=batch['failed_jobs'],
total_reviews=batch['total_reviews'],
created_at=batch['created_at'].isoformat(),
updated_at=batch['updated_at'].isoformat() if batch.get('updated_at') else None,
completed_at=batch['completed_at'].isoformat() if batch.get('completed_at') else None,
jobs=[
BatchJobSummary(
job_id=str(j['job_id']),
url=j['url'],
status=j['status'],
reviews_count=j.get('reviews_count'),
error_message=j.get('error_message'),
created_at=j['created_at'].isoformat(),
completed_at=j['completed_at'].isoformat() if j.get('completed_at') else None
)
for j in jobs
]
)
except HTTPException:
raise
except Exception as e:
log.error(f"Error getting batch {batch_id}: {e}")
raise HTTPException(status_code=500, detail=f"Failed to get batch: {str(e)}")
@router.delete(
"/batches/{batch_id}",
response_model=BatchCancelResponse,
summary="Cancel Batch",
description="Cancel all pending jobs in a batch"
)
async def cancel_batch(
batch_id: UUID,
db: DatabaseManager = Depends(get_db)
):
"""
Cancel all pending and running jobs in a batch.
Already completed or failed jobs cannot be cancelled and will remain
with their current status.
Returns the count of cancelled jobs and already-completed jobs.
"""
try:
# Verify batch exists
batch = await get_batch(db, batch_id)
if not batch:
raise HTTPException(status_code=404, detail="Batch not found")
# Cancel the jobs
cancelled, already_done = await cancel_batch_jobs(db, batch_id)
return BatchCancelResponse(
batch_id=str(batch_id),
cancelled_jobs=cancelled,
already_completed=already_done,
message=f"Cancelled {cancelled} jobs. {already_done} jobs were already completed/failed."
)
except HTTPException:
raise
except Exception as e:
log.error(f"Error cancelling batch {batch_id}: {e}")
raise HTTPException(status_code=500, detail=f"Failed to cancel batch: {str(e)}")

View File

@@ -1,9 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Production Google Reviews Scraper API Server with Phase 1 features: Production Google Reviews Scraper API Server with Phase 1 & 2 features:
- PostgreSQL storage with JSONB - PostgreSQL storage with JSONB
- Webhook delivery with retries - Webhook delivery with retries
- Smart health checks with canary testing - Smart health checks with canary testing
- Phase 2: Requester tracking (client_id, source, purpose)
- Phase 2: Job priority for queue ordering
- Phase 2: Callback URL alternative to webhooks
- Phase 2: Scraper version/variant selection for A/B testing
- Phase 2: Explicit job type endpoint (/api/scrape/google-reviews)
""" """
import asyncio import asyncio
import json import json
@@ -35,6 +40,7 @@ from workers.chrome_pool import (
release_scraping_worker, release_scraping_worker,
get_pool_stats get_pool_stats
) )
from api.routes import batches_router, set_batches_db
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
@@ -78,6 +84,9 @@ async def lifespan(app: FastAPI):
await db.initialize_schema() await db.initialize_schema()
log.info("Database initialized") log.info("Database initialized")
# Inject database into route modules
set_batches_db(db)
# Initialize health check system with canary monitoring # Initialize health check system with canary monitoring
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting # DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
# health_system = HealthCheckSystem(db) # health_system = HealthCheckSystem(db)
@@ -134,6 +143,9 @@ app.add_middleware(
allow_headers=["*"], allow_headers=["*"],
) )
# Include routers from api/routes/
app.include_router(batches_router)
# ==================== Request/Response Models ==================== # ==================== Request/Response Models ====================
@@ -159,14 +171,44 @@ class BrowserFingerprintModel(BaseModel):
platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)") platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
class RequesterModel(BaseModel):
"""Information about the requester of a scrape job"""
client_id: Optional[str] = Field(None, description="Client identifier")
source: Optional[str] = Field(None, description="Source of the request (e.g., 'web', 'api', 'internal')")
purpose: Optional[str] = Field(None, description="Purpose of the scrape (e.g., 'competitor_analysis', 'review_monitoring')")
metadata: Optional[Dict[str, Any]] = Field(None, description="Additional requester metadata")
class ScrapeRequest(BaseModel): class ScrapeRequest(BaseModel):
"""Request model for starting a scrape job""" """Request model for starting a scrape job (legacy endpoint, routes to google-reviews)"""
url: HttpUrl = Field(..., description="Google Maps URL to scrape") url: HttpUrl = Field(..., description="Google Maps URL to scrape")
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications") webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature") webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata") metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome") geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint") browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
# Phase 2: New optional fields for enhanced job tracking
requester: Optional[RequesterModel] = Field(None, description="Information about who requested this job")
priority: Optional[int] = Field(0, description="Job priority (higher = more important)", ge=0, le=100)
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
class GoogleReviewsScrapeRequest(BaseModel):
"""Request model for Google Reviews scraping - explicit job type endpoint"""
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
# Phase 2: New optional fields for enhanced job tracking
requester: Optional[RequesterModel] = Field(None, description="Information about who requested this job")
priority: Optional[int] = Field(0, description="Job priority (higher = more important)", ge=0, le=100)
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
class JobResponse(BaseModel): class JobResponse(BaseModel):
@@ -267,10 +309,146 @@ async def root():
} }
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job") async def _create_google_reviews_job(
url: str,
webhook_url: Optional[str] = None,
webhook_secret: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
browser_fingerprint: Optional[BrowserFingerprintModel] = None,
geolocation: Optional[GeolocationModel] = None,
requester: Optional[RequesterModel] = None,
priority: int = 0,
callback_url: Optional[str] = None,
scraper_version: Optional[str] = None,
scraper_variant: Optional[str] = None,
job_type: str = "google-reviews"
) -> Dict[str, str]:
"""
Core logic for creating a Google Reviews scraping job.
This is the shared implementation used by both /scrape and /api/scrape/google-reviews endpoints.
Returns:
Dict with job_id, status, and message
"""
if not db:
raise HTTPException(status_code=500, detail="Database not initialized")
try:
# Build metadata with all Phase 2 fields
job_metadata = metadata.copy() if metadata else {}
# Add browser fingerprint if provided
if browser_fingerprint:
fp = browser_fingerprint
job_metadata['browser_fingerprint'] = {
"userAgent": fp.userAgent,
"timezone": fp.timezone,
"language": fp.language,
"platform": fp.platform,
}
if fp.viewport:
job_metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
if fp.geolocation:
job_metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
elif geolocation:
job_metadata['geolocation'] = {
'lat': geolocation.lat,
'lng': geolocation.lng
}
# Phase 2: Add requester info if provided
if requester:
job_metadata['requester'] = {
'client_id': requester.client_id,
'source': requester.source,
'purpose': requester.purpose,
'metadata': requester.metadata
}
# Phase 2: Add job type for multi-scraper support
job_metadata['job_type'] = job_type
# Phase 2: Add priority for job queue ordering
job_metadata['priority'] = priority
# Phase 2: Add callback_url (alternative to webhook)
if callback_url:
job_metadata['callback_url'] = callback_url
# Phase 2: Add scraper version/variant for A/B testing and version control
if scraper_version:
job_metadata['scraper_version'] = scraper_version
if scraper_variant:
job_metadata['scraper_variant'] = scraper_variant
# Create job in database
job_id = await db.create_job(
url=url,
webhook_url=webhook_url,
webhook_secret=webhook_secret,
metadata=job_metadata
)
# Start scraping job in background
asyncio.create_task(run_scraping_job(job_id))
log.info(f"Created and started job {job_id} (type={job_type}, priority={priority})")
return {
"job_id": str(job_id),
"status": "started",
"message": "Scraping job started successfully",
"job_type": job_type
}
except Exception as e:
log.error(f"Error creating scraping job: {e}")
raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
@app.post("/api/scrape/google-reviews", response_model=Dict[str, str], summary="Start Google Reviews Scraping Job")
async def scrape_google_reviews(request: GoogleReviewsScrapeRequest):
"""
Start a new Google Reviews scraping job.
This is the primary endpoint for Phase 2 onwards. It explicitly creates a job
of type 'google-reviews' with full support for all Phase 2 features:
- Requester tracking (client_id, source, purpose)
- Job priority for queue ordering
- Callback URL (alternative to webhook)
- Scraper version/variant selection for A/B testing
The job runs asynchronously in the background. You can:
- Poll GET /jobs/{job_id} for status
- Provide webhook_url for automatic notification when complete
- Subscribe to SSE at /jobs/{job_id}/stream for real-time updates
Returns the job ID for tracking.
"""
return await _create_google_reviews_job(
url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret,
metadata=request.metadata,
browser_fingerprint=request.browser_fingerprint,
geolocation=request.geolocation,
requester=request.requester,
priority=request.priority or 0,
callback_url=str(request.callback_url) if request.callback_url else None,
scraper_version=request.scraper_version,
scraper_variant=request.scraper_variant,
job_type="google-reviews"
)
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job (Legacy)")
async def start_scrape(request: ScrapeRequest): async def start_scrape(request: ScrapeRequest):
""" """
Start a new scraping job. Start a new scraping job (legacy endpoint, routes to google-reviews).
**NOTE**: This endpoint is maintained for backwards compatibility.
For new integrations, use POST /api/scrape/google-reviews instead.
The job runs asynchronously in the background. You can: The job runs asynchronously in the background. You can:
- Poll GET /jobs/{job_id} for status - Poll GET /jobs/{job_id} for status
@@ -278,52 +456,51 @@ async def start_scrape(request: ScrapeRequest):
Returns the job ID for tracking. Returns the job ID for tracking.
""" """
if not db: return await _create_google_reviews_job(
raise HTTPException(status_code=500, detail="Database not initialized")
try:
# Merge browser fingerprint into metadata if provided
metadata = request.metadata or {}
if request.browser_fingerprint:
fp = request.browser_fingerprint
metadata['browser_fingerprint'] = {
"userAgent": fp.userAgent,
"timezone": fp.timezone,
"language": fp.language,
"platform": fp.platform,
}
if fp.viewport:
metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
if fp.geolocation:
metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
elif request.geolocation:
metadata['geolocation'] = {
'lat': request.geolocation.lat,
'lng': request.geolocation.lng
}
# Create job in database
job_id = await db.create_job(
url=str(request.url), url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None, webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret, webhook_secret=request.webhook_secret,
metadata=metadata metadata=request.metadata,
browser_fingerprint=request.browser_fingerprint,
geolocation=request.geolocation,
requester=request.requester,
priority=request.priority or 0,
callback_url=str(request.callback_url) if request.callback_url else None,
scraper_version=request.scraper_version,
scraper_variant=request.scraper_variant,
job_type="google-reviews"
) )
# Start scraping job in background
asyncio.create_task(run_scraping_job(job_id))
log.info(f"Created and started job {job_id}") @app.post("/api/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
async def api_start_scrape(request: ScrapeRequest):
"""
Start a new scraping job via the /api/scrape endpoint.
return { This endpoint accepts the same request body as /scrape and routes to google-reviews.
"job_id": str(job_id), For explicit job type control, use POST /api/scrape/google-reviews instead.
"status": "started",
"message": "Scraping job started successfully"
}
except Exception as e: The job runs asynchronously in the background. You can:
log.error(f"Error creating scraping job: {e}") - Poll GET /jobs/{job_id} for status
raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}") - Provide webhook_url for automatic notification when complete
- Subscribe to SSE at /jobs/{job_id}/stream for real-time updates
Returns the job ID for tracking.
"""
return await _create_google_reviews_job(
url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret,
metadata=request.metadata,
browser_fingerprint=request.browser_fingerprint,
geolocation=request.geolocation,
requester=request.requester,
priority=request.priority or 0,
callback_url=str(request.callback_url) if request.callback_url else None,
scraper_version=request.scraper_version,
scraper_variant=request.scraper_variant,
job_type="google-reviews"
)
@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status") @app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")

View File

@@ -189,7 +189,22 @@ class DatabaseManager:
url: str, url: str,
webhook_url: Optional[str] = None, webhook_url: Optional[str] = None,
webhook_secret: Optional[str] = None, webhook_secret: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None metadata: Optional[Dict[str, Any]] = None,
# Phase 2: New fields for requester tracking
requester_client_id: Optional[str] = None,
requester_source: Optional[str] = None,
scrape_purpose: Optional[str] = None,
requester_metadata: Optional[Dict[str, Any]] = None,
# Phase 2: Batch support
batch_id: Optional[UUID] = None,
batch_index: Optional[int] = None,
# Phase 2: Job configuration
job_type: str = 'google_reviews',
priority: int = 0,
callback_url: Optional[str] = None,
# Phase 2: Scraper versioning
scraper_version: Optional[str] = None,
scraper_variant: Optional[str] = None
) -> UUID: ) -> UUID:
""" """
Create a new scraping job. Create a new scraping job.
@@ -199,16 +214,41 @@ class DatabaseManager:
webhook_url: Optional webhook URL for notifications webhook_url: Optional webhook URL for notifications
webhook_secret: Optional secret for webhook signature webhook_secret: Optional secret for webhook signature
metadata: Optional additional metadata metadata: Optional additional metadata
requester_client_id: Client ID of the requester (for tracking)
requester_source: Source of the request (e.g., 'api', 'web', 'batch')
scrape_purpose: Purpose of the scrape (e.g., 'competitor_analysis')
requester_metadata: Additional requester-specific metadata
batch_id: ID of the batch this job belongs to
batch_index: Index of this job within the batch
job_type: Type of job (default: 'google_reviews')
priority: Job priority (higher = more urgent, default: 0)
callback_url: URL to call when job completes
scraper_version: Version of the scraper to use
scraper_variant: Variant of the scraper (e.g., 'stealth', 'fast')
Returns: Returns:
UUID of created job UUID of created job
""" """
async with self.pool.acquire() as conn: async with self.pool.acquire() as conn:
job_id = await conn.fetchval(""" job_id = await conn.fetchval("""
INSERT INTO jobs (url, webhook_url, webhook_secret, metadata) INSERT INTO jobs (
VALUES ($1, $2, $3, $4) url, webhook_url, webhook_secret, metadata,
requester_client_id, requester_source, scrape_purpose, requester_metadata,
batch_id, batch_index,
job_type, priority, callback_url,
scraper_version, scraper_variant
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
RETURNING job_id RETURNING job_id
""", url, webhook_url, webhook_secret, json.dumps(metadata) if metadata else None) """,
url, webhook_url, webhook_secret,
json.dumps(metadata) if metadata else None,
requester_client_id, requester_source, scrape_purpose,
json.dumps(requester_metadata) if requester_metadata else None,
batch_id, batch_index,
job_type, priority, callback_url,
scraper_version, scraper_variant
)
log.info(f"Created job {job_id} for URL: {url[:80]}...") log.info(f"Created job {job_id} for URL: {url[:80]}...")
return job_id return job_id
@@ -241,7 +281,20 @@ class DatabaseManager:
error_message, error_message,
metadata, metadata,
scrape_logs, scrape_logs,
review_topics review_topics,
requester_client_id,
requester_source,
scrape_purpose,
requester_metadata,
batch_id,
batch_index,
job_type,
priority,
callback_url,
callback_status,
callback_attempts,
scraper_version,
scraper_variant
FROM jobs FROM jobs
WHERE job_id = $1 WHERE job_id = $1
""", job_id) """, job_id)
@@ -493,7 +546,9 @@ class DatabaseManager:
self, self,
status: Optional[JobStatus] = None, status: Optional[JobStatus] = None,
limit: int = 100, limit: int = 100,
offset: int = 0 offset: int = 0,
requester_client_id: Optional[str] = None,
batch_id: Optional[UUID] = None
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
""" """
List jobs with optional filtering. List jobs with optional filtering.
@@ -502,49 +557,299 @@ class DatabaseManager:
status: Optional status filter status: Optional status filter
limit: Maximum number of jobs to return limit: Maximum number of jobs to return
offset: Number of jobs to skip offset: Number of jobs to skip
requester_client_id: Optional filter by requester client ID
batch_id: Optional filter by batch ID
Returns: Returns:
List of job dictionaries List of job dictionaries
""" """
async with self.pool.acquire() as conn: async with self.pool.acquire() as conn:
if status: # Build dynamic WHERE clause
rows = await conn.fetch(""" conditions = []
SELECT params = []
job_id, param_idx = 1
status,
url,
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message,
metadata,
review_topics
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
LIMIT $2 OFFSET $3
""", status.value, limit, offset)
else:
rows = await conn.fetch("""
SELECT
job_id,
status,
url,
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message,
metadata,
review_topics
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
""", limit, offset)
if status:
conditions.append(f"status = ${param_idx}")
params.append(status.value)
param_idx += 1
if requester_client_id:
conditions.append(f"requester_client_id = ${param_idx}")
params.append(requester_client_id)
param_idx += 1
if batch_id:
conditions.append(f"batch_id = ${param_idx}")
params.append(batch_id)
param_idx += 1
where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
# Add limit and offset params
params.extend([limit, offset])
query = f"""
SELECT
job_id,
status,
url,
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message,
metadata,
review_topics,
requester_client_id,
requester_source,
scrape_purpose,
requester_metadata,
batch_id,
batch_index,
job_type,
priority,
callback_url,
callback_status,
callback_attempts,
scraper_version,
scraper_variant
FROM jobs
{where_clause}
ORDER BY created_at DESC
LIMIT ${param_idx} OFFSET ${param_idx + 1}
"""
rows = await conn.fetch(query, *params)
return [dict(row) for row in rows]
async def update_job_callback(
self,
job_id: UUID,
status: str,
attempts: Optional[int] = None
):
"""
Update callback status for a job.
Args:
job_id: Job UUID
status: Callback status ('pending', 'success', 'failed', 'skipped')
attempts: Number of callback attempts (if not provided, increments by 1)
"""
async with self.pool.acquire() as conn:
if attempts is not None:
await conn.execute("""
UPDATE jobs
SET callback_status = $2, callback_attempts = $3, updated_at = NOW()
WHERE job_id = $1
""", job_id, status, attempts)
else:
await conn.execute("""
UPDATE jobs
SET callback_status = $2,
callback_attempts = COALESCE(callback_attempts, 0) + 1,
updated_at = NOW()
WHERE job_id = $1
""", job_id, status)
log.debug(f"Updated callback status for job {job_id}: {status}")
# ==================== Batch Operations ====================
async def create_batch(
self,
name: str,
requester_client_id: Optional[str] = None,
requester_source: Optional[str] = None,
scrape_purpose: Optional[str] = None,
total_jobs: int = 0,
callback_url: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
) -> UUID:
"""
Create a new batch for grouping jobs.
Args:
name: Batch name/description
requester_client_id: Client ID of the requester
requester_source: Source of the batch request
scrape_purpose: Purpose of the scrape
total_jobs: Expected total number of jobs in batch
callback_url: URL to call when batch completes
metadata: Additional batch metadata
Returns:
UUID of created batch
"""
async with self.pool.acquire() as conn:
batch_id = await conn.fetchval("""
INSERT INTO batches (
name, requester_client_id, requester_source, scrape_purpose,
total_jobs, callback_url, metadata
)
VALUES ($1, $2, $3, $4, $5, $6, $7)
RETURNING batch_id
""",
name, requester_client_id, requester_source, scrape_purpose,
total_jobs, callback_url,
json.dumps(metadata) if metadata else None
)
log.info(f"Created batch {batch_id}: {name} ({total_jobs} jobs)")
return batch_id
async def get_batch(self, batch_id: UUID) -> Optional[Dict[str, Any]]:
"""
Get batch by ID with job counts.
Args:
batch_id: Batch UUID
Returns:
Batch dictionary with job counts or None if not found
"""
async with self.pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT
b.batch_id,
b.name,
b.status,
b.requester_client_id,
b.requester_source,
b.scrape_purpose,
b.total_jobs,
b.completed_jobs,
b.failed_jobs,
b.callback_url,
b.callback_status,
b.metadata,
b.created_at,
b.updated_at,
b.completed_at,
COUNT(j.job_id) FILTER (WHERE j.status = 'pending') as pending_jobs,
COUNT(j.job_id) FILTER (WHERE j.status = 'running') as running_jobs,
COUNT(j.job_id) as actual_total_jobs
FROM batches b
LEFT JOIN jobs j ON j.batch_id = b.batch_id
WHERE b.batch_id = $1
GROUP BY b.batch_id
""", batch_id)
if not row:
return None
return dict(row)
async def update_batch_progress(self, batch_id: UUID):
"""
Recalculate and update batch progress from jobs table.
Args:
batch_id: Batch UUID
"""
async with self.pool.acquire() as conn:
# Calculate job counts
counts = await conn.fetchrow("""
SELECT
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
COUNT(*) FILTER (WHERE status = 'partial') as partial,
COUNT(*) as total
FROM jobs
WHERE batch_id = $1
""", batch_id)
completed = counts['completed'] or 0
failed = counts['failed'] or 0
partial = counts['partial'] or 0
total = counts['total'] or 0
# Determine batch status
if total == 0:
status = 'pending'
elif completed + failed + partial >= total:
status = 'completed'
elif completed > 0 or failed > 0 or partial > 0:
status = 'running'
else:
status = 'pending'
# Update batch
await conn.execute("""
UPDATE batches
SET
completed_jobs = $2,
failed_jobs = $3,
status = $4,
updated_at = NOW(),
completed_at = CASE WHEN $4 = 'completed' THEN NOW() ELSE completed_at END
WHERE batch_id = $1
""", batch_id, completed, failed, status)
log.debug(f"Updated batch {batch_id} progress: {completed}/{total} completed, {failed} failed")
async def get_batches(
self,
requester_client_id: Optional[str] = None,
status: Optional[str] = None,
limit: int = 50
) -> List[Dict[str, Any]]:
"""
List batches with optional filtering.
Args:
requester_client_id: Optional filter by requester client ID
status: Optional filter by batch status
limit: Maximum number of batches to return
Returns:
List of batch dictionaries
"""
async with self.pool.acquire() as conn:
# Build dynamic WHERE clause
conditions = []
params = []
param_idx = 1
if requester_client_id:
conditions.append(f"requester_client_id = ${param_idx}")
params.append(requester_client_id)
param_idx += 1
if status:
conditions.append(f"status = ${param_idx}")
params.append(status)
param_idx += 1
where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
params.append(limit)
query = f"""
SELECT
batch_id,
name,
status,
requester_client_id,
requester_source,
scrape_purpose,
total_jobs,
completed_jobs,
failed_jobs,
callback_url,
callback_status,
metadata,
created_at,
updated_at,
completed_at
FROM batches
{where_clause}
ORDER BY created_at DESC
LIMIT ${param_idx}
"""
rows = await conn.fetch(query, *params)
return [dict(row) for row in rows] return [dict(row) for row in rows]
async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]: async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:

View File

@@ -1,18 +1,512 @@
""" """
Scraper Registry Scraper Registry
This module provides a registry for managing and discovering scrapers. This module provides a database-backed registry for managing and routing
It allows dynamic registration and lookup of scraper implementations. scraper requests. It supports:
- Version-based routing (exact version or latest for variant)
- A/B testing via traffic_pct weighted selection
- Priority-based scraper filtering
- Caching with TTL for performance
""" """
from typing import Dict, List, Optional, Type import asyncio
import logging
import random
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Type
from scrapers.base import BaseScraper from scrapers.base import BaseScraper
log = logging.getLogger(__name__)
@dataclass
class ScraperInfo:
"""Information about a registered scraper."""
id: str
job_type: str
version: str
variant: str
module_path: str
function_name: str
is_default: bool
traffic_pct: int
min_priority: int
config: Optional[Dict[str, Any]]
deprecated_at: Optional[str]
class ScraperRegistry: class ScraperRegistry:
""" """
Registry for managing scraper implementations. Routes scraping requests to appropriate scraper versions.
Supports A/B testing via traffic_pct and variant selection.
This registry is backed by the scraper_registry database table and
provides weighted random selection for A/B testing scenarios.
Usage:
registry = ScraperRegistry(db)
scraper_info = await registry.get_scraper("google_reviews")
# scraper_info contains module_path, function_name, config, etc.
"""
def __init__(self, db: "DatabaseManager"): # noqa: F821 - forward reference
"""
Initialize the scraper registry.
Args:
db: DatabaseManager instance for database access
"""
self.db = db
self._cache: Dict[str, List[ScraperInfo]] = {} # Cache by job_type
self._cache_timestamp: float = 0
self._cache_ttl: int = 60 # Refresh cache every 60 seconds
self._cache_lock = asyncio.Lock()
async def get_scraper(
self,
job_type: str,
version: str = None,
variant: str = None,
priority: int = 0
) -> Optional[Dict[str, Any]]:
"""
Get scraper info for a job.
Priority order:
1. If version specified, return exact match
2. If variant specified, return latest active scraper of that variant
3. Otherwise, use A/B routing based on traffic_pct
Args:
job_type: Type of scraping job (e.g., "google_reviews")
version: Optional specific version to use (e.g., "1.0.0")
variant: Optional variant filter ("stable", "beta", "canary")
priority: Job priority level for min_priority filtering
Returns:
Dictionary containing scraper info:
{
"version": "1.0.0",
"variant": "stable",
"module_path": "scrapers.google_reviews.v1_0_0",
"function_name": "fast_scrape_reviews",
"config": {...}
}
Returns None if no matching scraper found.
"""
# Ensure cache is fresh
await self._ensure_cache_fresh()
# Get all scrapers for this job type
scrapers = self._cache.get(job_type, [])
if not scrapers:
log.warning(f"No scrapers registered for job_type: {job_type}")
return None
# Filter out deprecated scrapers
active_scrapers = [s for s in scrapers if s.deprecated_at is None]
if not active_scrapers:
log.warning(f"All scrapers for job_type {job_type} are deprecated")
return None
selected: Optional[ScraperInfo] = None
# Priority 1: Exact version match
if version:
selected = self._find_by_version(active_scrapers, version)
if selected:
log.debug(f"Selected scraper by exact version: {version}")
else:
log.warning(f"Requested version {version} not found for {job_type}")
return None
# Priority 2: Latest for variant
elif variant:
selected = self._find_latest_for_variant(active_scrapers, variant)
if selected:
log.debug(f"Selected latest scraper for variant {variant}: {selected.version}")
else:
log.warning(f"No active scrapers found for variant {variant} in {job_type}")
return None
# Priority 3: A/B weighted selection
else:
selected = await self._get_weighted_scraper(job_type, priority)
if selected:
log.debug(f"Selected scraper via A/B routing: {selected.version} ({selected.variant})")
if not selected:
return None
return self._scraper_to_dict(selected)
async def _get_weighted_scraper(
self,
job_type: str,
priority: int
) -> Optional[ScraperInfo]:
"""
Select scraper based on traffic weights.
Uses random selection weighted by traffic_pct.
Filters by min_priority.
Args:
job_type: Type of scraping job
priority: Job priority level
Returns:
Selected ScraperInfo or None if no eligible scrapers
"""
scrapers = self._cache.get(job_type, [])
# Filter: active, has traffic allocation, and meets priority requirement
eligible = [
s for s in scrapers
if s.deprecated_at is None
and s.traffic_pct > 0
and s.min_priority <= priority
]
if not eligible:
# Fall back to default scraper
default = self._find_default(scrapers)
if default and default.min_priority <= priority:
log.debug(f"No eligible A/B scrapers, using default: {default.version}")
return default
log.warning(f"No eligible scrapers for job_type {job_type} with priority {priority}")
return None
# Weighted random selection
total_weight = sum(s.traffic_pct for s in eligible)
if total_weight == 0:
# Equal probability if all have 0 traffic_pct
return random.choice(eligible)
# Generate random number in range [0, total_weight)
rand_value = random.random() * total_weight
cumulative = 0
for scraper in eligible:
cumulative += scraper.traffic_pct
if rand_value < cumulative:
return scraper
# Fallback (shouldn't reach here, but safety)
return eligible[-1]
async def refresh_cache(self) -> None:
"""
Reload registry from database.
This method forces a cache refresh regardless of TTL.
Thread-safe via asyncio lock.
"""
async with self._cache_lock:
await self._load_cache()
async def _ensure_cache_fresh(self) -> None:
"""Ensure cache is loaded and not stale."""
current_time = time.time()
if (
not self._cache
or (current_time - self._cache_timestamp) > self._cache_ttl
):
async with self._cache_lock:
# Double-check after acquiring lock
if (
not self._cache
or (current_time - self._cache_timestamp) > self._cache_ttl
):
await self._load_cache()
async def _load_cache(self) -> None:
"""Load all scraper registry entries from database."""
try:
async with self.db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
id,
job_type,
version,
variant,
module_path,
function_name,
is_default,
traffic_pct,
min_priority,
config,
deprecated_at
FROM scraper_registry
ORDER BY job_type, version DESC
""")
# Group by job_type
self._cache.clear()
for row in rows:
scraper_info = ScraperInfo(
id=str(row['id']),
job_type=row['job_type'],
version=row['version'],
variant=row['variant'],
module_path=row['module_path'],
function_name=row['function_name'],
is_default=row['is_default'],
traffic_pct=row['traffic_pct'],
min_priority=row['min_priority'],
config=row['config'],
deprecated_at=str(row['deprecated_at']) if row['deprecated_at'] else None
)
if scraper_info.job_type not in self._cache:
self._cache[scraper_info.job_type] = []
self._cache[scraper_info.job_type].append(scraper_info)
self._cache_timestamp = time.time()
log.info(f"Scraper registry cache loaded: {sum(len(v) for v in self._cache.values())} entries")
except Exception as e:
log.error(f"Failed to load scraper registry cache: {e}")
raise
async def list_scrapers(
self,
job_type: str = None,
include_deprecated: bool = False
) -> List[Dict[str, Any]]:
"""
List registered scrapers, optionally filtered by job_type.
Args:
job_type: Optional job type filter
include_deprecated: Whether to include deprecated scrapers
Returns:
List of scraper info dictionaries
"""
await self._ensure_cache_fresh()
result = []
if job_type:
scrapers = self._cache.get(job_type, [])
else:
scrapers = [s for scrapers_list in self._cache.values() for s in scrapers_list]
for scraper in scrapers:
if not include_deprecated and scraper.deprecated_at:
continue
result.append(self._scraper_to_dict(scraper))
return result
async def get_scraper_by_id(self, scraper_id: str) -> Optional[Dict[str, Any]]:
"""
Get a specific scraper by its database ID.
Args:
scraper_id: UUID of the scraper registry entry
Returns:
Scraper info dictionary or None if not found
"""
await self._ensure_cache_fresh()
for scrapers_list in self._cache.values():
for scraper in scrapers_list:
if scraper.id == scraper_id:
return self._scraper_to_dict(scraper)
return None
async def register_scraper(
self,
job_type: str,
version: str,
variant: str,
module_path: str,
function_name: str,
is_default: bool = False,
traffic_pct: int = 0,
min_priority: int = 0,
config: Optional[Dict[str, Any]] = None
) -> str:
"""
Register a new scraper version in the database.
Args:
job_type: Type of scraping job
version: Semantic version string
variant: Release channel ("stable", "beta", "canary")
module_path: Python module path
function_name: Entry function name
is_default: Whether this is the default fallback
traffic_pct: Traffic percentage for A/B testing (0-100)
min_priority: Minimum job priority required
config: Optional configuration dictionary
Returns:
UUID of created registry entry
Raises:
ValueError: If variant is invalid or traffic_pct out of range
"""
if variant not in ('stable', 'beta', 'canary'):
raise ValueError(f"Invalid variant: {variant}. Must be 'stable', 'beta', or 'canary'")
if not 0 <= traffic_pct <= 100:
raise ValueError(f"traffic_pct must be between 0 and 100, got: {traffic_pct}")
import json
async with self.db.pool.acquire() as conn:
scraper_id = await conn.fetchval("""
INSERT INTO scraper_registry (
job_type, version, variant, module_path, function_name,
is_default, traffic_pct, min_priority, config
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::jsonb)
RETURNING id
""", job_type, version, variant, module_path, function_name,
is_default, traffic_pct, min_priority,
json.dumps(config) if config else None)
# Invalidate cache
self._cache_timestamp = 0
log.info(f"Registered scraper: {job_type} v{version} ({variant})")
return str(scraper_id)
async def deprecate_scraper(
self,
job_type: str,
version: str
) -> bool:
"""
Deprecate a scraper version (soft delete).
Args:
job_type: Type of scraping job
version: Version to deprecate
Returns:
True if deprecated, False if not found
"""
async with self.db.pool.acquire() as conn:
result = await conn.execute("""
UPDATE scraper_registry
SET deprecated_at = NOW()
WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL
""", job_type, version)
updated = result.split()[-1] == "1"
if updated:
self._cache_timestamp = 0 # Invalidate cache
log.info(f"Deprecated scraper: {job_type} v{version}")
return updated
async def update_traffic_allocation(
self,
job_type: str,
allocations: Dict[str, int]
) -> None:
"""
Update traffic allocations for multiple scrapers atomically.
Args:
job_type: Type of scraping job
allocations: Dict mapping version to traffic_pct
e.g., {"1.0.0": 90, "1.1.0-beta": 10}
Raises:
ValueError: If total exceeds 100 or any value is invalid
"""
total = sum(allocations.values())
if total > 100:
raise ValueError(f"Total traffic allocation cannot exceed 100, got: {total}")
for version, pct in allocations.items():
if not 0 <= pct <= 100:
raise ValueError(f"Invalid traffic_pct for {version}: {pct}")
async with self.db.pool.acquire() as conn:
async with conn.transaction():
for version, traffic_pct in allocations.items():
await conn.execute("""
UPDATE scraper_registry
SET traffic_pct = $3
WHERE job_type = $1 AND version = $2 AND deprecated_at IS NULL
""", job_type, version, traffic_pct)
# Invalidate cache
self._cache_timestamp = 0
log.info(f"Updated traffic allocations for {job_type}: {allocations}")
# ==================== Helper Methods ====================
def _find_by_version(
self,
scrapers: List[ScraperInfo],
version: str
) -> Optional[ScraperInfo]:
"""Find scraper by exact version match."""
for scraper in scrapers:
if scraper.version == version:
return scraper
return None
def _find_latest_for_variant(
self,
scrapers: List[ScraperInfo],
variant: str
) -> Optional[ScraperInfo]:
"""Find latest (first in sorted list) scraper for a variant."""
for scraper in scrapers:
if scraper.variant == variant:
return scraper
return None
def _find_default(
self,
scrapers: List[ScraperInfo]
) -> Optional[ScraperInfo]:
"""Find default scraper for fallback."""
for scraper in scrapers:
if scraper.is_default and scraper.deprecated_at is None:
return scraper
return None
def _scraper_to_dict(self, scraper: ScraperInfo) -> Dict[str, Any]:
"""Convert ScraperInfo to dictionary for API responses."""
return {
"id": scraper.id,
"version": scraper.version,
"variant": scraper.variant,
"module_path": scraper.module_path,
"function_name": scraper.function_name,
"is_default": scraper.is_default,
"traffic_pct": scraper.traffic_pct,
"min_priority": scraper.min_priority,
"config": scraper.config,
"deprecated": scraper.deprecated_at is not None
}
# ==================== Legacy Singleton Registry ====================
# Kept for backward compatibility with existing code that uses
# the old class-based scraper registration pattern.
class LegacyScraperRegistry:
"""
Legacy registry for managing scraper implementations.
This class provides backward compatibility with the old scraper
registration pattern using class-based scrapers. New code should
use the database-backed ScraperRegistry instead.
The registry allows: The registry allows:
- Registering scrapers by name and version - Registering scrapers by name and version
@@ -20,15 +514,15 @@ class ScraperRegistry:
- Listing all available scrapers - Listing all available scrapers
Usage: Usage:
registry = ScraperRegistry() registry = LegacyScraperRegistry()
registry.register(GoogleReviewsScraper) registry.register(GoogleReviewsScraper)
scraper = registry.get_scraper_for_url("https://google.com/maps/place/...") scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
""" """
_instance: Optional["ScraperRegistry"] = None _instance: Optional["LegacyScraperRegistry"] = None
_scrapers: Dict[str, Type[BaseScraper]] _scrapers: Dict[str, Type[BaseScraper]]
def __new__(cls) -> "ScraperRegistry": def __new__(cls) -> "LegacyScraperRegistry":
"""Singleton pattern to ensure one global registry.""" """Singleton pattern to ensure one global registry."""
if cls._instance is None: if cls._instance is None:
cls._instance = super().__new__(cls) cls._instance = super().__new__(cls)
@@ -134,5 +628,5 @@ class ScraperRegistry:
self._domain_map.clear() self._domain_map.clear()
# Global registry instance # Global legacy registry instance (for backward compatibility)
registry = ScraperRegistry() registry = LegacyScraperRegistry()

View File

@@ -0,0 +1,29 @@
"""
Services module for ReviewIQ platform.
Contains service classes for webhook delivery, job callbacks, and other
business logic.
"""
from services.webhook_service import (
WebhookManager,
WebhookDispatcher,
WebhookDeliveryError,
verify_webhook_signature
)
from services.job_callback_service import (
JobCallbackService,
JobCallbackDispatcher,
CallbackStatus
)
__all__ = [
# Webhook service
'WebhookManager',
'WebhookDispatcher',
'WebhookDeliveryError',
'verify_webhook_signature',
# Job callback service
'JobCallbackService',
'JobCallbackDispatcher',
'CallbackStatus',
]

View File

@@ -0,0 +1,717 @@
#!/usr/bin/env python3
"""
Job Callback Service for webhook delivery on job and batch completion.
This service handles sending webhooks when jobs complete or fail,
as well as batch-level completion callbacks.
"""
import asyncio
import json
import logging
from datetime import datetime
from typing import Dict, Any, Optional, List
from uuid import UUID
import httpx
from services.webhook_service import WebhookManager
log = logging.getLogger(__name__)
# Scraper version (should match the deployed scraper)
SCRAPER_VERSION = "1.0.0"
class CallbackStatus:
"""Callback status constants"""
PENDING = "pending"
SENT = "sent"
FAILED = "failed"
class JobCallbackService:
"""
Handles webhook callbacks for job and batch completion.
This service is responsible for:
- Sending callbacks when individual jobs complete or fail
- Sending callbacks when entire batches complete
- Retrying failed callbacks with exponential backoff
- Tracking callback status and attempts in the database
"""
def __init__(
self,
db,
max_retries: int = 3,
timeout: float = 10.0,
initial_retry_delay: float = 2.0
):
"""
Initialize the job callback service.
Args:
db: DatabaseManager instance
max_retries: Maximum number of delivery attempts per callback
timeout: HTTP request timeout in seconds
initial_retry_delay: Initial delay between retries (exponential backoff)
"""
self.db = db
self.webhook_manager = WebhookManager(
max_retries=max_retries,
timeout=timeout,
initial_retry_delay=initial_retry_delay
)
self.max_retries = max_retries
self.timeout = timeout
self.initial_retry_delay = initial_retry_delay
async def send_job_callback(self, job_id: UUID) -> bool:
"""
Send callback for completed/failed job.
This method:
- Fetches the job from the database
- Builds the appropriate payload based on job status
- POSTs to the callback_url
- Updates callback_status in the database
Args:
job_id: UUID of the job
Returns:
True if callback was sent successfully, False otherwise
"""
# Fetch job from database
job = await self.db.get_job(job_id)
if not job:
log.error(f"Job {job_id} not found for callback")
return False
callback_url = job.get('callback_url')
if not callback_url:
log.debug(f"Job {job_id} has no callback_url configured")
return True # No callback needed, consider success
status = job.get('status')
if status not in ('completed', 'failed', 'partial'):
log.warning(f"Job {job_id} has status '{status}', not sending callback")
return False
# Build payload based on status
payload = self._build_job_payload(job)
# Get webhook secret if available (reuse webhook_secret for callbacks)
secret = job.get('webhook_secret')
# Send the callback
log.info(f"Sending job callback to {callback_url} for job {job_id} (status: {status})")
success = await self._send_callback(
url=callback_url,
payload=payload,
secret=secret,
job_id=job_id
)
# Update callback status in database
await self._update_callback_status(job_id, success)
if success:
log.info(f"Job callback sent successfully for job {job_id}")
else:
log.error(f"Job callback failed for job {job_id}")
return success
async def send_batch_callback(self, batch_id: UUID) -> bool:
"""
Send callback when batch completes.
This method:
- Fetches all jobs in the batch from the database
- Checks if all jobs have completed (success or failure)
- Builds a summary payload
- POSTs to the batch callback_url
- Updates callback_status
Args:
batch_id: UUID of the batch
Returns:
True if callback was sent successfully, False otherwise
"""
# Get batch info (from first job with this batch_id)
batch_info = await self._get_batch_info(batch_id)
if not batch_info:
log.error(f"Batch {batch_id} not found or has no jobs")
return False
callback_url = batch_info.get('callback_url')
if not callback_url:
log.debug(f"Batch {batch_id} has no callback_url configured")
return True # No callback needed
# Check if batch is complete
if not batch_info.get('is_complete'):
log.debug(f"Batch {batch_id} is not yet complete")
return False
# Build batch payload
payload = self._build_batch_payload(batch_info)
# Get webhook secret (from first job's webhook_secret)
secret = batch_info.get('webhook_secret')
# Send the callback
log.info(f"Sending batch callback to {callback_url} for batch {batch_id}")
success = await self._send_callback(
url=callback_url,
payload=payload,
secret=secret,
job_id=None # Batch callback, no single job_id
)
# Update batch callback status (on all jobs in the batch)
await self._update_batch_callback_status(batch_id, success)
if success:
log.info(f"Batch callback sent successfully for batch {batch_id}")
else:
log.error(f"Batch callback failed for batch {batch_id}")
return success
async def retry_failed_callbacks(self, max_attempts: int = 5) -> Dict[str, int]:
"""
Find jobs with callback_status='failed' and attempts < max.
Retry sending callbacks with exponential backoff.
Args:
max_attempts: Maximum number of total attempts before giving up
Returns:
Dict with counts: {'retried': n, 'succeeded': n, 'failed': n}
"""
# Get jobs with failed callbacks that haven't exceeded max attempts
jobs = await self._get_failed_callbacks(max_attempts)
results = {
'retried': 0,
'succeeded': 0,
'failed': 0
}
if not jobs:
log.debug("No failed callbacks to retry")
return results
log.info(f"Retrying {len(jobs)} failed callbacks")
for job in jobs:
job_id = job['job_id']
attempts = job.get('callback_attempts', 0)
# Calculate delay based on attempt number (exponential backoff)
delay = self.initial_retry_delay * (2 ** attempts)
log.info(f"Retrying callback for job {job_id} (attempt {attempts + 1}), delay: {delay:.1f}s")
# Wait with backoff
await asyncio.sleep(delay)
# Retry the callback
success = await self.send_job_callback(job_id)
results['retried'] += 1
if success:
results['succeeded'] += 1
else:
results['failed'] += 1
log.info(f"Callback retry complete: {results}")
return results
async def check_and_send_batch_callbacks(self) -> Dict[str, int]:
"""
Check for completed batches and send their callbacks.
This should be called periodically to detect when batches complete.
Returns:
Dict with counts: {'checked': n, 'sent': n, 'failed': n}
"""
# Get distinct batch_ids that might be complete
batch_ids = await self._get_potentially_complete_batches()
results = {
'checked': 0,
'sent': 0,
'failed': 0
}
for batch_id in batch_ids:
results['checked'] += 1
success = await self.send_batch_callback(batch_id)
if success:
results['sent'] += 1
else:
results['failed'] += 1
return results
def _build_job_payload(self, job: Dict[str, Any]) -> Dict[str, Any]:
"""
Build webhook payload for job completion/failure.
Args:
job: Job dictionary from database
Returns:
Webhook payload dictionary
"""
status = job.get('status')
job_id = str(job.get('job_id'))
job_type = job.get('job_type', 'google_reviews')
# Base payload
payload = {
"job_id": job_id,
"job_type": job_type,
"status": status,
"url": job.get('url'),
"scraper_version": job.get('scraper_version') or SCRAPER_VERSION,
}
if status == 'completed':
# Completed job payload
payload["event"] = "job.completed"
# Calculate result summary
reviews_count = job.get('reviews_count') or 0
# Try to extract primary metric (average rating) from reviews_data
primary_metric = None
reviews_data = job.get('reviews_data')
if reviews_data:
if isinstance(reviews_data, str):
try:
reviews_data = json.loads(reviews_data)
except json.JSONDecodeError:
reviews_data = []
if reviews_data:
ratings = [r.get('rating', 0) for r in reviews_data if r.get('rating')]
if ratings:
primary_metric = round(sum(ratings) / len(ratings), 2)
payload["result_summary"] = {
"item_count": reviews_count,
"primary_metric": primary_metric
}
# Duration
started_at = job.get('started_at')
completed_at = job.get('completed_at')
if started_at and completed_at:
if isinstance(started_at, str):
started_at = datetime.fromisoformat(started_at.replace('Z', '+00:00'))
if isinstance(completed_at, str):
completed_at = datetime.fromisoformat(completed_at.replace('Z', '+00:00'))
duration = (completed_at - started_at).total_seconds()
payload["duration_seconds"] = round(duration, 2)
elif job.get('scrape_time'):
payload["duration_seconds"] = round(job.get('scrape_time'), 2)
# Completed timestamp
if completed_at:
if isinstance(completed_at, datetime):
payload["completed_at"] = completed_at.isoformat() + 'Z'
else:
payload["completed_at"] = completed_at
elif status in ('failed', 'partial'):
# Failed job payload
payload["event"] = "job.failed"
error_message = job.get('error_message', 'Unknown error')
# Determine error type from message
error_type = self._classify_error(error_message)
payload["error"] = {
"type": error_type,
"message": error_message
}
# Include partial results info if applicable
if status == 'partial':
payload["status"] = "partial"
payload["result_summary"] = {
"item_count": job.get('reviews_count') or 0,
"primary_metric": None
}
# Failed timestamp
completed_at = job.get('completed_at')
if completed_at:
if isinstance(completed_at, datetime):
payload["completed_at"] = completed_at.isoformat() + 'Z'
else:
payload["completed_at"] = completed_at
return payload
def _build_batch_payload(self, batch_info: Dict[str, Any]) -> Dict[str, Any]:
"""
Build webhook payload for batch completion.
Args:
batch_info: Batch info dictionary with job summaries
Returns:
Webhook payload dictionary
"""
batch_id = str(batch_info.get('batch_id'))
jobs = batch_info.get('jobs', [])
# Count successes and failures
succeeded = sum(1 for j in jobs if j.get('status') == 'completed')
failed = sum(1 for j in jobs if j.get('status') in ('failed', 'partial'))
failed_job_ids = [str(j.get('job_id')) for j in jobs if j.get('status') in ('failed', 'partial')]
# Find latest completed_at
completed_times = [
j.get('completed_at') for j in jobs
if j.get('completed_at')
]
latest_completed = max(completed_times) if completed_times else datetime.utcnow()
if isinstance(latest_completed, datetime):
latest_completed = latest_completed.isoformat() + 'Z'
payload = {
"event": "batch.completed",
"batch_id": batch_id,
"name": batch_info.get('name', f'Batch {batch_id[:8]}'),
"total_jobs": len(jobs),
"succeeded": succeeded,
"failed": failed,
"completed_at": latest_completed,
"failed_job_ids": failed_job_ids
}
return payload
def _classify_error(self, error_message: str) -> str:
"""
Classify error message into a type category.
Args:
error_message: Error message string
Returns:
Error type string
"""
if not error_message:
return "unknown"
error_lower = error_message.lower()
if 'rate' in error_lower and 'limit' in error_lower:
return "rate_limited"
elif 'timeout' in error_lower:
return "timeout"
elif 'captcha' in error_lower or 'robot' in error_lower:
return "captcha_detected"
elif 'blocked' in error_lower or 'denied' in error_lower:
return "blocked"
elif 'network' in error_lower or 'connection' in error_lower:
return "network_error"
elif 'not found' in error_lower or '404' in error_lower:
return "not_found"
elif 'invalid' in error_lower:
return "invalid_input"
elif 'element' in error_lower or 'selector' in error_lower:
return "scrape_error"
else:
return "unknown"
async def _send_callback(
self,
url: str,
payload: Dict[str, Any],
secret: Optional[str] = None,
job_id: Optional[UUID] = None
) -> bool:
"""
Send a callback to the specified URL.
Uses the WebhookManager for retry logic and HMAC signing.
Args:
url: Callback URL
payload: Payload dictionary
secret: Optional secret for HMAC signature
job_id: Optional job ID for logging
Returns:
True if sent successfully
"""
return await self.webhook_manager.send_webhook(
webhook_url=url,
payload=payload,
secret=secret,
job_id=job_id,
db=self.db
)
async def _update_callback_status(self, job_id: UUID, success: bool):
"""
Update the callback_status and callback_attempts for a job.
Args:
job_id: Job UUID
success: Whether the callback was sent successfully
"""
async with self.db.pool.acquire() as conn:
if success:
await conn.execute("""
UPDATE jobs
SET callback_status = 'sent',
callback_attempts = COALESCE(callback_attempts, 0) + 1
WHERE job_id = $1
""", job_id)
else:
await conn.execute("""
UPDATE jobs
SET callback_status = 'failed',
callback_attempts = COALESCE(callback_attempts, 0) + 1
WHERE job_id = $1
""", job_id)
async def _update_batch_callback_status(self, batch_id: UUID, success: bool):
"""
Update callback status for all jobs in a batch.
Args:
batch_id: Batch UUID
success: Whether the callback was sent successfully
"""
status = 'sent' if success else 'failed'
async with self.db.pool.acquire() as conn:
await conn.execute("""
UPDATE jobs
SET callback_status = $2,
callback_attempts = COALESCE(callback_attempts, 0) + 1
WHERE batch_id = $1
""", batch_id, status)
async def _get_failed_callbacks(self, max_attempts: int) -> List[Dict[str, Any]]:
"""
Get jobs with failed callbacks that can be retried.
Args:
max_attempts: Maximum attempts before giving up
Returns:
List of job dictionaries
"""
async with self.db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
job_id,
status,
callback_url,
callback_status,
callback_attempts,
webhook_secret
FROM jobs
WHERE callback_url IS NOT NULL
AND callback_status = 'failed'
AND COALESCE(callback_attempts, 0) < $1
AND status IN ('completed', 'failed', 'partial')
ORDER BY completed_at ASC
LIMIT 100
""", max_attempts)
return [dict(row) for row in rows]
async def _get_batch_info(self, batch_id: UUID) -> Optional[Dict[str, Any]]:
"""
Get batch information including all jobs.
Args:
batch_id: Batch UUID
Returns:
Batch info dictionary with jobs list, or None if not found
"""
async with self.db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
job_id,
status,
batch_index,
callback_url,
callback_status,
webhook_secret,
completed_at,
reviews_count,
error_message,
metadata
FROM jobs
WHERE batch_id = $1
ORDER BY batch_index ASC
""", batch_id)
if not rows:
return None
jobs = [dict(row) for row in rows]
# Determine if batch is complete (all jobs finished)
pending_statuses = ('pending', 'running')
is_complete = all(
j.get('status') not in pending_statuses
for j in jobs
)
# Get batch name from first job's metadata if available
batch_name = None
first_metadata = jobs[0].get('metadata')
if first_metadata:
if isinstance(first_metadata, str):
try:
first_metadata = json.loads(first_metadata)
except json.JSONDecodeError:
first_metadata = {}
batch_name = first_metadata.get('batch_name')
return {
'batch_id': batch_id,
'name': batch_name,
'jobs': jobs,
'is_complete': is_complete,
'callback_url': jobs[0].get('callback_url'),
'webhook_secret': jobs[0].get('webhook_secret')
}
async def _get_potentially_complete_batches(self) -> List[UUID]:
"""
Get batch IDs that might have recently completed.
Returns:
List of batch UUIDs to check
"""
async with self.db.pool.acquire() as conn:
# Find batches where:
# 1. At least one job has callback_url set
# 2. callback_status is null or pending (not yet sent)
# 3. No jobs are still running
rows = await conn.fetch("""
SELECT DISTINCT batch_id
FROM jobs
WHERE batch_id IS NOT NULL
AND callback_url IS NOT NULL
AND COALESCE(callback_status, 'pending') = 'pending'
AND batch_id NOT IN (
SELECT DISTINCT batch_id
FROM jobs
WHERE batch_id IS NOT NULL
AND status IN ('pending', 'running')
)
LIMIT 100
""")
return [row['batch_id'] for row in rows]
class JobCallbackDispatcher:
"""
Background dispatcher that monitors for jobs needing callbacks.
Runs in background and processes callbacks for completed jobs.
"""
def __init__(
self,
db,
interval_seconds: int = 30,
retry_interval_seconds: int = 300
):
"""
Initialize the callback dispatcher.
Args:
db: DatabaseManager instance
interval_seconds: How often to check for pending callbacks
retry_interval_seconds: How often to retry failed callbacks
"""
self.db = db
self.interval = interval_seconds
self.retry_interval = retry_interval_seconds
self.callback_service = JobCallbackService(db)
self.running = False
self._last_retry = datetime.utcnow()
async def start(self):
"""Start the background callback dispatcher"""
self.running = True
log.info("Job callback dispatcher started")
while self.running:
try:
# Process pending job callbacks
await self._process_pending_callbacks()
# Check for completed batches
await self.callback_service.check_and_send_batch_callbacks()
# Periodically retry failed callbacks
now = datetime.utcnow()
if (now - self._last_retry).total_seconds() >= self.retry_interval:
await self.callback_service.retry_failed_callbacks(max_attempts=5)
self._last_retry = now
except Exception as e:
log.error(f"Error in callback dispatcher: {e}")
await asyncio.sleep(self.interval)
def stop(self):
"""Stop the background callback dispatcher"""
self.running = False
log.info("Job callback dispatcher stopped")
async def _process_pending_callbacks(self):
"""
Process all pending callbacks.
Fetches jobs with callback_url set and callback_status null/pending.
"""
async with self.db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT job_id
FROM jobs
WHERE callback_url IS NOT NULL
AND COALESCE(callback_status, 'pending') = 'pending'
AND status IN ('completed', 'failed', 'partial')
ORDER BY completed_at ASC
LIMIT 100
""")
if not rows:
return
log.info(f"Processing {len(rows)} pending job callbacks")
for row in rows:
job_id = row['job_id']
try:
await self.callback_service.send_job_callback(job_id)
except Exception as e:
log.error(f"Error sending callback for job {job_id}: {e}")
log.info(f"Processed {len(rows)} callbacks")