Phases 2-4: Requester support, batches, webhooks, scraper registry

Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
  - POST /api/scrape/google-reviews/batch (submit batch)
  - GET /api/batches (list batches)
  - GET /api/batches/{id} (batch detail)
  - DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews

Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
  - JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
  - JobCallbackDispatcher: Background worker for callback monitoring
  - Payload formats per spec (job.completed, job.failed, batch.completed)
  - Exponential backoff for retries
  - Error classification for failure payloads

Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
  - get_scraper(): Version, variant, and A/B routing
  - _get_weighted_scraper(): Traffic-weighted random selection
  - 60-second TTL cache for performance
  - register_scraper, deprecate_scraper, update_traffic_allocation
  - LegacyScraperRegistry preserved for backwards compatibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:35:58 +00:00
parent 2412996c54
commit 788ef84756
8 changed files with 2503 additions and 98 deletions

View File

@@ -189,7 +189,22 @@ class DatabaseManager:
url: str,
webhook_url: Optional[str] = None,
webhook_secret: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
metadata: Optional[Dict[str, Any]] = None,
# Phase 2: New fields for requester tracking
requester_client_id: Optional[str] = None,
requester_source: Optional[str] = None,
scrape_purpose: Optional[str] = None,
requester_metadata: Optional[Dict[str, Any]] = None,
# Phase 2: Batch support
batch_id: Optional[UUID] = None,
batch_index: Optional[int] = None,
# Phase 2: Job configuration
job_type: str = 'google_reviews',
priority: int = 0,
callback_url: Optional[str] = None,
# Phase 2: Scraper versioning
scraper_version: Optional[str] = None,
scraper_variant: Optional[str] = None
) -> UUID:
"""
Create a new scraping job.
@@ -199,16 +214,41 @@ class DatabaseManager:
webhook_url: Optional webhook URL for notifications
webhook_secret: Optional secret for webhook signature
metadata: Optional additional metadata
requester_client_id: Client ID of the requester (for tracking)
requester_source: Source of the request (e.g., 'api', 'web', 'batch')
scrape_purpose: Purpose of the scrape (e.g., 'competitor_analysis')
requester_metadata: Additional requester-specific metadata
batch_id: ID of the batch this job belongs to
batch_index: Index of this job within the batch
job_type: Type of job (default: 'google_reviews')
priority: Job priority (higher = more urgent, default: 0)
callback_url: URL to call when job completes
scraper_version: Version of the scraper to use
scraper_variant: Variant of the scraper (e.g., 'stealth', 'fast')
Returns:
UUID of created job
"""
async with self.pool.acquire() as conn:
job_id = await conn.fetchval("""
INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
VALUES ($1, $2, $3, $4)
INSERT INTO jobs (
url, webhook_url, webhook_secret, metadata,
requester_client_id, requester_source, scrape_purpose, requester_metadata,
batch_id, batch_index,
job_type, priority, callback_url,
scraper_version, scraper_variant
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
RETURNING job_id
""", url, webhook_url, webhook_secret, json.dumps(metadata) if metadata else None)
""",
url, webhook_url, webhook_secret,
json.dumps(metadata) if metadata else None,
requester_client_id, requester_source, scrape_purpose,
json.dumps(requester_metadata) if requester_metadata else None,
batch_id, batch_index,
job_type, priority, callback_url,
scraper_version, scraper_variant
)
log.info(f"Created job {job_id} for URL: {url[:80]}...")
return job_id
@@ -241,7 +281,20 @@ class DatabaseManager:
error_message,
metadata,
scrape_logs,
review_topics
review_topics,
requester_client_id,
requester_source,
scrape_purpose,
requester_metadata,
batch_id,
batch_index,
job_type,
priority,
callback_url,
callback_status,
callback_attempts,
scraper_version,
scraper_variant
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -493,7 +546,9 @@ class DatabaseManager:
self,
status: Optional[JobStatus] = None,
limit: int = 100,
offset: int = 0
offset: int = 0,
requester_client_id: Optional[str] = None,
batch_id: Optional[UUID] = None
) -> List[Dict[str, Any]]:
"""
List jobs with optional filtering.
@@ -502,49 +557,299 @@ class DatabaseManager:
status: Optional status filter
limit: Maximum number of jobs to return
offset: Number of jobs to skip
requester_client_id: Optional filter by requester client ID
batch_id: Optional filter by batch ID
Returns:
List of job dictionaries
"""
async with self.pool.acquire() as conn:
if status:
rows = await conn.fetch("""
SELECT
job_id,
status,
url,
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message,
metadata,
review_topics
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
LIMIT $2 OFFSET $3
""", status.value, limit, offset)
else:
rows = await conn.fetch("""
SELECT
job_id,
status,
url,
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message,
metadata,
review_topics
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
""", limit, offset)
# Build dynamic WHERE clause
conditions = []
params = []
param_idx = 1
if status:
conditions.append(f"status = ${param_idx}")
params.append(status.value)
param_idx += 1
if requester_client_id:
conditions.append(f"requester_client_id = ${param_idx}")
params.append(requester_client_id)
param_idx += 1
if batch_id:
conditions.append(f"batch_id = ${param_idx}")
params.append(batch_id)
param_idx += 1
where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
# Add limit and offset params
params.extend([limit, offset])
query = f"""
SELECT
job_id,
status,
url,
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message,
metadata,
review_topics,
requester_client_id,
requester_source,
scrape_purpose,
requester_metadata,
batch_id,
batch_index,
job_type,
priority,
callback_url,
callback_status,
callback_attempts,
scraper_version,
scraper_variant
FROM jobs
{where_clause}
ORDER BY created_at DESC
LIMIT ${param_idx} OFFSET ${param_idx + 1}
"""
rows = await conn.fetch(query, *params)
return [dict(row) for row in rows]
async def update_job_callback(
    self,
    job_id: UUID,
    status: str,
    attempts: Optional[int] = None
):
    """
    Record the outcome of a callback attempt on a job row.

    Writes ``callback_status`` and ``callback_attempts`` and refreshes
    ``updated_at`` for the given job.

    Args:
        job_id: UUID of the job to update
        status: Callback status to store ('pending', 'success', 'failed', 'skipped')
        attempts: Attempt count to store verbatim. When omitted, the stored
            counter is incremented by one (a NULL counter is treated as 0).
    """
    async with self.pool.acquire() as conn:
        if attempts is None:
            # No explicit count supplied: bump the stored counter in SQL.
            statement = """
UPDATE jobs
SET callback_status = $2,
callback_attempts = COALESCE(callback_attempts, 0) + 1,
updated_at = NOW()
WHERE job_id = $1
"""
            arguments = (job_id, status)
        else:
            # Caller knows the exact attempt count: store it as given.
            statement = """
UPDATE jobs
SET callback_status = $2, callback_attempts = $3, updated_at = NOW()
WHERE job_id = $1
"""
            arguments = (job_id, status, attempts)

        await conn.execute(statement, *arguments)
        log.debug(f"Updated callback status for job {job_id}: {status}")
# ==================== Batch Operations ====================
async def create_batch(
    self,
    name: str,
    requester_client_id: Optional[str] = None,
    requester_source: Optional[str] = None,
    scrape_purpose: Optional[str] = None,
    total_jobs: int = 0,
    callback_url: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None
) -> UUID:
    """
    Insert a new row into ``batches`` and return its generated ID.

    Args:
        name: Batch name/description
        requester_client_id: Client ID of the requester
        requester_source: Source of the batch request
        scrape_purpose: Purpose of the scrape
        total_jobs: Expected number of jobs in the batch
        callback_url: URL to call when the batch completes
        metadata: Extra batch metadata; serialized to JSON text, with a
            falsy value (None or an empty dict) stored as NULL

    Returns:
        UUID of the created batch (the ``batch_id`` returned by the INSERT)
    """
    insert_sql = """
INSERT INTO batches (
name, requester_client_id, requester_source, scrape_purpose,
total_jobs, callback_url, metadata
)
VALUES ($1, $2, $3, $4, $5, $6, $7)
RETURNING batch_id
"""
    async with self.pool.acquire() as conn:
        metadata_json = json.dumps(metadata) if metadata else None
        row_values = (
            name,
            requester_client_id,
            requester_source,
            scrape_purpose,
            total_jobs,
            callback_url,
            metadata_json,
        )
        batch_id = await conn.fetchval(insert_sql, *row_values)
        log.info(f"Created batch {batch_id}: {name} ({total_jobs} jobs)")
        return batch_id
async def get_batch(self, batch_id: UUID) -> Optional[Dict[str, Any]]:
    """
    Fetch one batch together with live job counts.

    The stored ``batches`` columns are returned alongside three values
    aggregated from the joined ``jobs`` rows: ``pending_jobs``,
    ``running_jobs`` and ``actual_total_jobs``.

    Args:
        batch_id: Batch UUID

    Returns:
        The batch as a dictionary, or None when no batch has this ID
    """
    # LEFT JOIN keeps a batch that has no jobs yet; its counts are then 0.
    batch_sql = """
SELECT
b.batch_id,
b.name,
b.status,
b.requester_client_id,
b.requester_source,
b.scrape_purpose,
b.total_jobs,
b.completed_jobs,
b.failed_jobs,
b.callback_url,
b.callback_status,
b.metadata,
b.created_at,
b.updated_at,
b.completed_at,
COUNT(j.job_id) FILTER (WHERE j.status = 'pending') as pending_jobs,
COUNT(j.job_id) FILTER (WHERE j.status = 'running') as running_jobs,
COUNT(j.job_id) as actual_total_jobs
FROM batches b
LEFT JOIN jobs j ON j.batch_id = b.batch_id
WHERE b.batch_id = $1
GROUP BY b.batch_id
"""
    async with self.pool.acquire() as conn:
        record = await conn.fetchrow(batch_sql, batch_id)
        return dict(record) if record else None
async def update_batch_progress(self, batch_id: UUID):
    """
    Recalculate a batch's counters and status from the jobs table.

    Counts the batch's jobs by status, derives the batch status from those
    counts, and writes ``completed_jobs``, ``failed_jobs``, ``status``,
    ``updated_at`` and ``completed_at`` back to the ``batches`` row.

    Status rules, evaluated in order:
        - no jobs exist for the batch          -> 'pending'
        - every job is completed/failed/partial -> 'completed'
        - any job has finished or is running    -> 'running'
        - otherwise                             -> 'pending'

    ``completed_at`` is set the first time the batch is seen as completed
    and is left untouched by later recomputations.

    Args:
        batch_id: Batch UUID
    """
    async with self.pool.acquire() as conn:
        # Tally the batch's jobs per status in a single pass.
        counts = await conn.fetchrow("""
            SELECT
                COUNT(*) FILTER (WHERE status = 'completed') as completed,
                COUNT(*) FILTER (WHERE status = 'failed') as failed,
                COUNT(*) FILTER (WHERE status = 'partial') as partial,
                COUNT(*) FILTER (WHERE status = 'running') as running,
                COUNT(*) as total
            FROM jobs
            WHERE batch_id = $1
        """, batch_id)

        completed = counts['completed'] or 0
        failed = counts['failed'] or 0
        partial = counts['partial'] or 0
        running = counts['running'] or 0
        total = counts['total'] or 0

        # NOTE(review): 'partial' jobs count towards finishing the batch but
        # are stored in neither completed_jobs nor failed_jobs - confirm
        # that this is the intended accounting.
        finished = completed + failed + partial

        if total == 0:
            status = 'pending'
        elif finished >= total:
            status = 'completed'
        elif finished > 0 or running > 0:
            # In-flight jobs make the batch 'running' even before the
            # first job has finished.
            status = 'running'
        else:
            status = 'pending'

        # The completion flag travels as its own boolean parameter instead
        # of comparing $4 inside the CASE: using one parameter both as an
        # assigned value and in a comparison can make PostgreSQL deduce two
        # different types for it when the status column is not TEXT.
        # COALESCE keeps the first completion time on repeated calls.
        await conn.execute("""
            UPDATE batches
            SET
                completed_jobs = $2,
                failed_jobs = $3,
                status = $4,
                updated_at = NOW(),
                completed_at = CASE
                    WHEN $5::boolean THEN COALESCE(completed_at, NOW())
                    ELSE completed_at
                END
            WHERE batch_id = $1
        """, batch_id, completed, failed, status, status == 'completed')

        log.debug(f"Updated batch {batch_id} progress: {completed}/{total} completed, {failed} failed")
async def get_batches(
    self,
    requester_client_id: Optional[str] = None,
    status: Optional[str] = None,
    limit: int = 50
) -> List[Dict[str, Any]]:
    """
    Return batches, newest first, optionally narrowed by filters.

    Args:
        requester_client_id: Only return batches of this requester client;
            ignored when falsy
        status: Only return batches with this status; ignored when falsy
        limit: Maximum number of batches to return

    Returns:
        List of batch dictionaries ordered by ``created_at`` descending
    """
    async with self.pool.acquire() as conn:
        # Collect the active filters as (column, value) pairs; the column
        # names are fixed literals, only the values are user-supplied.
        active_filters = []
        if requester_client_id:
            active_filters.append(("requester_client_id", requester_client_id))
        if status:
            active_filters.append(("status", status))

        # Number the placeholders $1, $2, ... in filter order.
        conditions = [
            f"{column} = ${position}"
            for position, (column, _value) in enumerate(active_filters, start=1)
        ]
        params = [value for _column, value in active_filters]

        where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""

        # LIMIT takes the next free placeholder after the filters.
        param_idx = len(params) + 1
        params.append(limit)

        query = f"""
SELECT
batch_id,
name,
status,
requester_client_id,
requester_source,
scrape_purpose,
total_jobs,
completed_jobs,
failed_jobs,
callback_url,
callback_status,
metadata,
created_at,
updated_at,
completed_at
FROM batches
{where_clause}
ORDER BY created_at DESC
LIMIT ${param_idx}
"""
        records = await conn.fetch(query, *params)
        return [dict(record) for record in records]
async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]: