Phases 2-4: Requester support, batches, webhooks, scraper registry

Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
  - POST /api/scrape/google-reviews/batch (submit batch)
  - GET /api/batches (list batches)
  - GET /api/batches/{id} (batch detail)
  - DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews

Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
  - JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
  - JobCallbackDispatcher: Background worker for callback monitoring
  - Payload formats per spec (job.completed, job.failed, batch.completed)
  - Exponential backoff for retries
  - Error classification for failure payloads

Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
  - get_scraper(): Version/variant/A/B routing
  - _get_weighted_scraper(): Traffic-weighted random selection
  - 60-second TTL cache for performance
  - register_scraper, deprecate_scraper, update_traffic_allocation
  - LegacyScraperRegistry preserved for backwards compatibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:35:58 +00:00
parent 2412996c54
commit 788ef84756
8 changed files with 2503 additions and 98 deletions

View File

@@ -0,0 +1,29 @@
"""
Services module for ReviewIQ platform.
Contains service classes for webhook delivery, job callbacks, and other
business logic.
"""
from services.webhook_service import (
WebhookManager,
WebhookDispatcher,
WebhookDeliveryError,
verify_webhook_signature
)
from services.job_callback_service import (
JobCallbackService,
JobCallbackDispatcher,
CallbackStatus
)
__all__ = [
# Webhook service
'WebhookManager',
'WebhookDispatcher',
'WebhookDeliveryError',
'verify_webhook_signature',
# Job callback service
'JobCallbackService',
'JobCallbackDispatcher',
'CallbackStatus',
]

View File

@@ -0,0 +1,717 @@
#!/usr/bin/env python3
"""
Job Callback Service for webhook delivery on job and batch completion.
This service handles sending webhooks when jobs complete or fail,
as well as batch-level completion callbacks.
"""
import asyncio
import json
import logging
from datetime import datetime
from typing import Dict, Any, Optional, List
from uuid import UUID
import httpx
from services.webhook_service import WebhookManager
log = logging.getLogger(__name__)
# Scraper version (should match the deployed scraper)
SCRAPER_VERSION = "1.0.0"
class CallbackStatus:
"""Callback status constants"""
PENDING = "pending"
SENT = "sent"
FAILED = "failed"
class JobCallbackService:
"""
Handles webhook callbacks for job and batch completion.
This service is responsible for:
- Sending callbacks when individual jobs complete or fail
- Sending callbacks when entire batches complete
- Retrying failed callbacks with exponential backoff
- Tracking callback status and attempts in the database
"""
def __init__(
self,
db,
max_retries: int = 3,
timeout: float = 10.0,
initial_retry_delay: float = 2.0
):
"""
Initialize the job callback service.
Args:
db: DatabaseManager instance
max_retries: Maximum number of delivery attempts per callback
timeout: HTTP request timeout in seconds
initial_retry_delay: Initial delay between retries (exponential backoff)
"""
self.db = db
self.webhook_manager = WebhookManager(
max_retries=max_retries,
timeout=timeout,
initial_retry_delay=initial_retry_delay
)
self.max_retries = max_retries
self.timeout = timeout
self.initial_retry_delay = initial_retry_delay
async def send_job_callback(self, job_id: UUID) -> bool:
"""
Send callback for completed/failed job.
This method:
- Fetches the job from the database
- Builds the appropriate payload based on job status
- POSTs to the callback_url
- Updates callback_status in the database
Args:
job_id: UUID of the job
Returns:
True if callback was sent successfully, False otherwise
"""
# Fetch job from database
job = await self.db.get_job(job_id)
if not job:
log.error(f"Job {job_id} not found for callback")
return False
callback_url = job.get('callback_url')
if not callback_url:
log.debug(f"Job {job_id} has no callback_url configured")
return True # No callback needed, consider success
status = job.get('status')
if status not in ('completed', 'failed', 'partial'):
log.warning(f"Job {job_id} has status '{status}', not sending callback")
return False
# Build payload based on status
payload = self._build_job_payload(job)
# Get webhook secret if available (reuse webhook_secret for callbacks)
secret = job.get('webhook_secret')
# Send the callback
log.info(f"Sending job callback to {callback_url} for job {job_id} (status: {status})")
success = await self._send_callback(
url=callback_url,
payload=payload,
secret=secret,
job_id=job_id
)
# Update callback status in database
await self._update_callback_status(job_id, success)
if success:
log.info(f"Job callback sent successfully for job {job_id}")
else:
log.error(f"Job callback failed for job {job_id}")
return success
async def send_batch_callback(self, batch_id: UUID) -> bool:
"""
Send callback when batch completes.
This method:
- Fetches all jobs in the batch from the database
- Checks if all jobs have completed (success or failure)
- Builds a summary payload
- POSTs to the batch callback_url
- Updates callback_status
Args:
batch_id: UUID of the batch
Returns:
True if callback was sent successfully, False otherwise
"""
# Get batch info (from first job with this batch_id)
batch_info = await self._get_batch_info(batch_id)
if not batch_info:
log.error(f"Batch {batch_id} not found or has no jobs")
return False
callback_url = batch_info.get('callback_url')
if not callback_url:
log.debug(f"Batch {batch_id} has no callback_url configured")
return True # No callback needed
# Check if batch is complete
if not batch_info.get('is_complete'):
log.debug(f"Batch {batch_id} is not yet complete")
return False
# Build batch payload
payload = self._build_batch_payload(batch_info)
# Get webhook secret (from first job's webhook_secret)
secret = batch_info.get('webhook_secret')
# Send the callback
log.info(f"Sending batch callback to {callback_url} for batch {batch_id}")
success = await self._send_callback(
url=callback_url,
payload=payload,
secret=secret,
job_id=None # Batch callback, no single job_id
)
# Update batch callback status (on all jobs in the batch)
await self._update_batch_callback_status(batch_id, success)
if success:
log.info(f"Batch callback sent successfully for batch {batch_id}")
else:
log.error(f"Batch callback failed for batch {batch_id}")
return success
async def retry_failed_callbacks(self, max_attempts: int = 5) -> Dict[str, int]:
"""
Find jobs with callback_status='failed' and attempts < max.
Retry sending callbacks with exponential backoff.
Args:
max_attempts: Maximum number of total attempts before giving up
Returns:
Dict with counts: {'retried': n, 'succeeded': n, 'failed': n}
"""
# Get jobs with failed callbacks that haven't exceeded max attempts
jobs = await self._get_failed_callbacks(max_attempts)
results = {
'retried': 0,
'succeeded': 0,
'failed': 0
}
if not jobs:
log.debug("No failed callbacks to retry")
return results
log.info(f"Retrying {len(jobs)} failed callbacks")
for job in jobs:
job_id = job['job_id']
attempts = job.get('callback_attempts', 0)
# Calculate delay based on attempt number (exponential backoff)
delay = self.initial_retry_delay * (2 ** attempts)
log.info(f"Retrying callback for job {job_id} (attempt {attempts + 1}), delay: {delay:.1f}s")
# Wait with backoff
await asyncio.sleep(delay)
# Retry the callback
success = await self.send_job_callback(job_id)
results['retried'] += 1
if success:
results['succeeded'] += 1
else:
results['failed'] += 1
log.info(f"Callback retry complete: {results}")
return results
async def check_and_send_batch_callbacks(self) -> Dict[str, int]:
"""
Check for completed batches and send their callbacks.
This should be called periodically to detect when batches complete.
Returns:
Dict with counts: {'checked': n, 'sent': n, 'failed': n}
"""
# Get distinct batch_ids that might be complete
batch_ids = await self._get_potentially_complete_batches()
results = {
'checked': 0,
'sent': 0,
'failed': 0
}
for batch_id in batch_ids:
results['checked'] += 1
success = await self.send_batch_callback(batch_id)
if success:
results['sent'] += 1
else:
results['failed'] += 1
return results
def _build_job_payload(self, job: Dict[str, Any]) -> Dict[str, Any]:
"""
Build webhook payload for job completion/failure.
Args:
job: Job dictionary from database
Returns:
Webhook payload dictionary
"""
status = job.get('status')
job_id = str(job.get('job_id'))
job_type = job.get('job_type', 'google_reviews')
# Base payload
payload = {
"job_id": job_id,
"job_type": job_type,
"status": status,
"url": job.get('url'),
"scraper_version": job.get('scraper_version') or SCRAPER_VERSION,
}
if status == 'completed':
# Completed job payload
payload["event"] = "job.completed"
# Calculate result summary
reviews_count = job.get('reviews_count') or 0
# Try to extract primary metric (average rating) from reviews_data
primary_metric = None
reviews_data = job.get('reviews_data')
if reviews_data:
if isinstance(reviews_data, str):
try:
reviews_data = json.loads(reviews_data)
except json.JSONDecodeError:
reviews_data = []
if reviews_data:
ratings = [r.get('rating', 0) for r in reviews_data if r.get('rating')]
if ratings:
primary_metric = round(sum(ratings) / len(ratings), 2)
payload["result_summary"] = {
"item_count": reviews_count,
"primary_metric": primary_metric
}
# Duration
started_at = job.get('started_at')
completed_at = job.get('completed_at')
if started_at and completed_at:
if isinstance(started_at, str):
started_at = datetime.fromisoformat(started_at.replace('Z', '+00:00'))
if isinstance(completed_at, str):
completed_at = datetime.fromisoformat(completed_at.replace('Z', '+00:00'))
duration = (completed_at - started_at).total_seconds()
payload["duration_seconds"] = round(duration, 2)
elif job.get('scrape_time'):
payload["duration_seconds"] = round(job.get('scrape_time'), 2)
# Completed timestamp
if completed_at:
if isinstance(completed_at, datetime):
payload["completed_at"] = completed_at.isoformat() + 'Z'
else:
payload["completed_at"] = completed_at
elif status in ('failed', 'partial'):
# Failed job payload
payload["event"] = "job.failed"
error_message = job.get('error_message', 'Unknown error')
# Determine error type from message
error_type = self._classify_error(error_message)
payload["error"] = {
"type": error_type,
"message": error_message
}
# Include partial results info if applicable
if status == 'partial':
payload["status"] = "partial"
payload["result_summary"] = {
"item_count": job.get('reviews_count') or 0,
"primary_metric": None
}
# Failed timestamp
completed_at = job.get('completed_at')
if completed_at:
if isinstance(completed_at, datetime):
payload["completed_at"] = completed_at.isoformat() + 'Z'
else:
payload["completed_at"] = completed_at
return payload
def _build_batch_payload(self, batch_info: Dict[str, Any]) -> Dict[str, Any]:
"""
Build webhook payload for batch completion.
Args:
batch_info: Batch info dictionary with job summaries
Returns:
Webhook payload dictionary
"""
batch_id = str(batch_info.get('batch_id'))
jobs = batch_info.get('jobs', [])
# Count successes and failures
succeeded = sum(1 for j in jobs if j.get('status') == 'completed')
failed = sum(1 for j in jobs if j.get('status') in ('failed', 'partial'))
failed_job_ids = [str(j.get('job_id')) for j in jobs if j.get('status') in ('failed', 'partial')]
# Find latest completed_at
completed_times = [
j.get('completed_at') for j in jobs
if j.get('completed_at')
]
latest_completed = max(completed_times) if completed_times else datetime.utcnow()
if isinstance(latest_completed, datetime):
latest_completed = latest_completed.isoformat() + 'Z'
payload = {
"event": "batch.completed",
"batch_id": batch_id,
"name": batch_info.get('name', f'Batch {batch_id[:8]}'),
"total_jobs": len(jobs),
"succeeded": succeeded,
"failed": failed,
"completed_at": latest_completed,
"failed_job_ids": failed_job_ids
}
return payload
def _classify_error(self, error_message: str) -> str:
"""
Classify error message into a type category.
Args:
error_message: Error message string
Returns:
Error type string
"""
if not error_message:
return "unknown"
error_lower = error_message.lower()
if 'rate' in error_lower and 'limit' in error_lower:
return "rate_limited"
elif 'timeout' in error_lower:
return "timeout"
elif 'captcha' in error_lower or 'robot' in error_lower:
return "captcha_detected"
elif 'blocked' in error_lower or 'denied' in error_lower:
return "blocked"
elif 'network' in error_lower or 'connection' in error_lower:
return "network_error"
elif 'not found' in error_lower or '404' in error_lower:
return "not_found"
elif 'invalid' in error_lower:
return "invalid_input"
elif 'element' in error_lower or 'selector' in error_lower:
return "scrape_error"
else:
return "unknown"
async def _send_callback(
self,
url: str,
payload: Dict[str, Any],
secret: Optional[str] = None,
job_id: Optional[UUID] = None
) -> bool:
"""
Send a callback to the specified URL.
Uses the WebhookManager for retry logic and HMAC signing.
Args:
url: Callback URL
payload: Payload dictionary
secret: Optional secret for HMAC signature
job_id: Optional job ID for logging
Returns:
True if sent successfully
"""
return await self.webhook_manager.send_webhook(
webhook_url=url,
payload=payload,
secret=secret,
job_id=job_id,
db=self.db
)
async def _update_callback_status(self, job_id: UUID, success: bool):
"""
Update the callback_status and callback_attempts for a job.
Args:
job_id: Job UUID
success: Whether the callback was sent successfully
"""
async with self.db.pool.acquire() as conn:
if success:
await conn.execute("""
UPDATE jobs
SET callback_status = 'sent',
callback_attempts = COALESCE(callback_attempts, 0) + 1
WHERE job_id = $1
""", job_id)
else:
await conn.execute("""
UPDATE jobs
SET callback_status = 'failed',
callback_attempts = COALESCE(callback_attempts, 0) + 1
WHERE job_id = $1
""", job_id)
async def _update_batch_callback_status(self, batch_id: UUID, success: bool):
"""
Update callback status for all jobs in a batch.
Args:
batch_id: Batch UUID
success: Whether the callback was sent successfully
"""
status = 'sent' if success else 'failed'
async with self.db.pool.acquire() as conn:
await conn.execute("""
UPDATE jobs
SET callback_status = $2,
callback_attempts = COALESCE(callback_attempts, 0) + 1
WHERE batch_id = $1
""", batch_id, status)
async def _get_failed_callbacks(self, max_attempts: int) -> List[Dict[str, Any]]:
"""
Get jobs with failed callbacks that can be retried.
Args:
max_attempts: Maximum attempts before giving up
Returns:
List of job dictionaries
"""
async with self.db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
job_id,
status,
callback_url,
callback_status,
callback_attempts,
webhook_secret
FROM jobs
WHERE callback_url IS NOT NULL
AND callback_status = 'failed'
AND COALESCE(callback_attempts, 0) < $1
AND status IN ('completed', 'failed', 'partial')
ORDER BY completed_at ASC
LIMIT 100
""", max_attempts)
return [dict(row) for row in rows]
async def _get_batch_info(self, batch_id: UUID) -> Optional[Dict[str, Any]]:
"""
Get batch information including all jobs.
Args:
batch_id: Batch UUID
Returns:
Batch info dictionary with jobs list, or None if not found
"""
async with self.db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
job_id,
status,
batch_index,
callback_url,
callback_status,
webhook_secret,
completed_at,
reviews_count,
error_message,
metadata
FROM jobs
WHERE batch_id = $1
ORDER BY batch_index ASC
""", batch_id)
if not rows:
return None
jobs = [dict(row) for row in rows]
# Determine if batch is complete (all jobs finished)
pending_statuses = ('pending', 'running')
is_complete = all(
j.get('status') not in pending_statuses
for j in jobs
)
# Get batch name from first job's metadata if available
batch_name = None
first_metadata = jobs[0].get('metadata')
if first_metadata:
if isinstance(first_metadata, str):
try:
first_metadata = json.loads(first_metadata)
except json.JSONDecodeError:
first_metadata = {}
batch_name = first_metadata.get('batch_name')
return {
'batch_id': batch_id,
'name': batch_name,
'jobs': jobs,
'is_complete': is_complete,
'callback_url': jobs[0].get('callback_url'),
'webhook_secret': jobs[0].get('webhook_secret')
}
async def _get_potentially_complete_batches(self) -> List[UUID]:
"""
Get batch IDs that might have recently completed.
Returns:
List of batch UUIDs to check
"""
async with self.db.pool.acquire() as conn:
# Find batches where:
# 1. At least one job has callback_url set
# 2. callback_status is null or pending (not yet sent)
# 3. No jobs are still running
rows = await conn.fetch("""
SELECT DISTINCT batch_id
FROM jobs
WHERE batch_id IS NOT NULL
AND callback_url IS NOT NULL
AND COALESCE(callback_status, 'pending') = 'pending'
AND batch_id NOT IN (
SELECT DISTINCT batch_id
FROM jobs
WHERE batch_id IS NOT NULL
AND status IN ('pending', 'running')
)
LIMIT 100
""")
return [row['batch_id'] for row in rows]
class JobCallbackDispatcher:
"""
Background dispatcher that monitors for jobs needing callbacks.
Runs in background and processes callbacks for completed jobs.
"""
def __init__(
self,
db,
interval_seconds: int = 30,
retry_interval_seconds: int = 300
):
"""
Initialize the callback dispatcher.
Args:
db: DatabaseManager instance
interval_seconds: How often to check for pending callbacks
retry_interval_seconds: How often to retry failed callbacks
"""
self.db = db
self.interval = interval_seconds
self.retry_interval = retry_interval_seconds
self.callback_service = JobCallbackService(db)
self.running = False
self._last_retry = datetime.utcnow()
async def start(self):
"""Start the background callback dispatcher"""
self.running = True
log.info("Job callback dispatcher started")
while self.running:
try:
# Process pending job callbacks
await self._process_pending_callbacks()
# Check for completed batches
await self.callback_service.check_and_send_batch_callbacks()
# Periodically retry failed callbacks
now = datetime.utcnow()
if (now - self._last_retry).total_seconds() >= self.retry_interval:
await self.callback_service.retry_failed_callbacks(max_attempts=5)
self._last_retry = now
except Exception as e:
log.error(f"Error in callback dispatcher: {e}")
await asyncio.sleep(self.interval)
def stop(self):
"""Stop the background callback dispatcher"""
self.running = False
log.info("Job callback dispatcher stopped")
async def _process_pending_callbacks(self):
"""
Process all pending callbacks.
Fetches jobs with callback_url set and callback_status null/pending.
"""
async with self.db.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT job_id
FROM jobs
WHERE callback_url IS NOT NULL
AND COALESCE(callback_status, 'pending') = 'pending'
AND status IN ('completed', 'failed', 'partial')
ORDER BY completed_at ASC
LIMIT 100
""")
if not rows:
return
log.info(f"Processing {len(rows)} pending job callbacks")
for row in rows:
job_id = row['job_id']
try:
await self.callback_service.send_job_callback(job_id)
except Exception as e:
log.error(f"Error sending callback for job {job_id}: {e}")
log.info(f"Processed {len(rows)} callbacks")