Phases 2-4: Requester support, batches, webhooks, scraper registry
Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
  - POST /api/scrape/google-reviews/batch (submit batch)
  - GET /api/batches (list batches)
  - GET /api/batches/{id} (batch detail)
  - DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews
Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
  - JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
  - JobCallbackDispatcher: Background worker for callback monitoring
  - Payload formats per spec (job.completed, job.failed, batch.completed)
  - Exponential backoff for retries
  - Error classification for failure payloads
Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
  - get_scraper(): Version, variant, and A/B routing
  - _get_weighted_scraper(): Traffic-weighted random selection
  - 60-second TTL cache for performance
  - register_scraper, deprecate_scraper, update_traffic_allocation
  - LegacyScraperRegistry preserved for backwards compatibility
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
391
core/database.py
391
core/database.py
@@ -189,7 +189,22 @@ class DatabaseManager:
|
||||
url: str,
|
||||
webhook_url: Optional[str] = None,
|
||||
webhook_secret: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
# Phase 2: New fields for requester tracking
|
||||
requester_client_id: Optional[str] = None,
|
||||
requester_source: Optional[str] = None,
|
||||
scrape_purpose: Optional[str] = None,
|
||||
requester_metadata: Optional[Dict[str, Any]] = None,
|
||||
# Phase 2: Batch support
|
||||
batch_id: Optional[UUID] = None,
|
||||
batch_index: Optional[int] = None,
|
||||
# Phase 2: Job configuration
|
||||
job_type: str = 'google_reviews',
|
||||
priority: int = 0,
|
||||
callback_url: Optional[str] = None,
|
||||
# Phase 2: Scraper versioning
|
||||
scraper_version: Optional[str] = None,
|
||||
scraper_variant: Optional[str] = None
|
||||
) -> UUID:
|
||||
"""
|
||||
Create a new scraping job.
|
||||
@@ -199,16 +214,41 @@ class DatabaseManager:
|
||||
webhook_url: Optional webhook URL for notifications
|
||||
webhook_secret: Optional secret for webhook signature
|
||||
metadata: Optional additional metadata
|
||||
requester_client_id: Client ID of the requester (for tracking)
|
||||
requester_source: Source of the request (e.g., 'api', 'web', 'batch')
|
||||
scrape_purpose: Purpose of the scrape (e.g., 'competitor_analysis')
|
||||
requester_metadata: Additional requester-specific metadata
|
||||
batch_id: ID of the batch this job belongs to
|
||||
batch_index: Index of this job within the batch
|
||||
job_type: Type of job (default: 'google_reviews')
|
||||
priority: Job priority (higher = more urgent, default: 0)
|
||||
callback_url: URL to call when job completes
|
||||
scraper_version: Version of the scraper to use
|
||||
scraper_variant: Variant of the scraper (e.g., 'stealth', 'fast')
|
||||
|
||||
Returns:
|
||||
UUID of created job
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
job_id = await conn.fetchval("""
|
||||
INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
INSERT INTO jobs (
|
||||
url, webhook_url, webhook_secret, metadata,
|
||||
requester_client_id, requester_source, scrape_purpose, requester_metadata,
|
||||
batch_id, batch_index,
|
||||
job_type, priority, callback_url,
|
||||
scraper_version, scraper_variant
|
||||
)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
|
||||
RETURNING job_id
|
||||
""", url, webhook_url, webhook_secret, json.dumps(metadata) if metadata else None)
|
||||
""",
|
||||
url, webhook_url, webhook_secret,
|
||||
json.dumps(metadata) if metadata else None,
|
||||
requester_client_id, requester_source, scrape_purpose,
|
||||
json.dumps(requester_metadata) if requester_metadata else None,
|
||||
batch_id, batch_index,
|
||||
job_type, priority, callback_url,
|
||||
scraper_version, scraper_variant
|
||||
)
|
||||
|
||||
log.info(f"Created job {job_id} for URL: {url[:80]}...")
|
||||
return job_id
|
||||
@@ -241,7 +281,20 @@ class DatabaseManager:
|
||||
error_message,
|
||||
metadata,
|
||||
scrape_logs,
|
||||
review_topics
|
||||
review_topics,
|
||||
requester_client_id,
|
||||
requester_source,
|
||||
scrape_purpose,
|
||||
requester_metadata,
|
||||
batch_id,
|
||||
batch_index,
|
||||
job_type,
|
||||
priority,
|
||||
callback_url,
|
||||
callback_status,
|
||||
callback_attempts,
|
||||
scraper_version,
|
||||
scraper_variant
|
||||
FROM jobs
|
||||
WHERE job_id = $1
|
||||
""", job_id)
|
||||
@@ -493,7 +546,9 @@ class DatabaseManager:
|
||||
self,
|
||||
status: Optional[JobStatus] = None,
|
||||
limit: int = 100,
|
||||
offset: int = 0
|
||||
offset: int = 0,
|
||||
requester_client_id: Optional[str] = None,
|
||||
batch_id: Optional[UUID] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
List jobs with optional filtering.
|
||||
@@ -502,49 +557,299 @@ class DatabaseManager:
|
||||
status: Optional status filter
|
||||
limit: Maximum number of jobs to return
|
||||
offset: Number of jobs to skip
|
||||
requester_client_id: Optional filter by requester client ID
|
||||
batch_id: Optional filter by batch ID
|
||||
|
||||
Returns:
|
||||
List of job dictionaries
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
if status:
|
||||
rows = await conn.fetch("""
|
||||
SELECT
|
||||
job_id,
|
||||
status,
|
||||
url,
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata,
|
||||
review_topics
|
||||
FROM jobs
|
||||
WHERE status = $1
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $2 OFFSET $3
|
||||
""", status.value, limit, offset)
|
||||
else:
|
||||
rows = await conn.fetch("""
|
||||
SELECT
|
||||
job_id,
|
||||
status,
|
||||
url,
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata,
|
||||
review_topics
|
||||
FROM jobs
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
""", limit, offset)
|
||||
# Build dynamic WHERE clause
|
||||
conditions = []
|
||||
params = []
|
||||
param_idx = 1
|
||||
|
||||
if status:
|
||||
conditions.append(f"status = ${param_idx}")
|
||||
params.append(status.value)
|
||||
param_idx += 1
|
||||
|
||||
if requester_client_id:
|
||||
conditions.append(f"requester_client_id = ${param_idx}")
|
||||
params.append(requester_client_id)
|
||||
param_idx += 1
|
||||
|
||||
if batch_id:
|
||||
conditions.append(f"batch_id = ${param_idx}")
|
||||
params.append(batch_id)
|
||||
param_idx += 1
|
||||
|
||||
where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
|
||||
|
||||
# Add limit and offset params
|
||||
params.extend([limit, offset])
|
||||
|
||||
query = f"""
|
||||
SELECT
|
||||
job_id,
|
||||
status,
|
||||
url,
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata,
|
||||
review_topics,
|
||||
requester_client_id,
|
||||
requester_source,
|
||||
scrape_purpose,
|
||||
requester_metadata,
|
||||
batch_id,
|
||||
batch_index,
|
||||
job_type,
|
||||
priority,
|
||||
callback_url,
|
||||
callback_status,
|
||||
callback_attempts,
|
||||
scraper_version,
|
||||
scraper_variant
|
||||
FROM jobs
|
||||
{where_clause}
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ${param_idx} OFFSET ${param_idx + 1}
|
||||
"""
|
||||
|
||||
rows = await conn.fetch(query, *params)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def update_job_callback(
    self,
    job_id: UUID,
    status: str,
    attempts: Optional[int] = None
):
    """
    Record the outcome of a callback delivery on a job row.

    Args:
        job_id: Job UUID
        status: Callback status ('pending', 'success', 'failed', 'skipped')
        attempts: Explicit attempt count to store. When omitted, the stored
            counter is incremented by 1 (a NULL counter is treated as 0).
    """
    if attempts is None:
        # No explicit count supplied: bump whatever is stored.
        statement = """
            UPDATE jobs
            SET callback_status = $2,
            callback_attempts = COALESCE(callback_attempts, 0) + 1,
            updated_at = NOW()
            WHERE job_id = $1
        """
        bind_values = (job_id, status)
    else:
        # Caller dictates the attempt count; overwrite it.
        statement = """
            UPDATE jobs
            SET callback_status = $2, callback_attempts = $3, updated_at = NOW()
            WHERE job_id = $1
        """
        bind_values = (job_id, status, attempts)

    async with self.pool.acquire() as conn:
        await conn.execute(statement, *bind_values)

        log.debug(f"Updated callback status for job {job_id}: {status}")
|
||||
|
||||
# ==================== Batch Operations ====================
|
||||
|
||||
async def create_batch(
    self,
    name: str,
    requester_client_id: Optional[str] = None,
    requester_source: Optional[str] = None,
    scrape_purpose: Optional[str] = None,
    total_jobs: int = 0,
    callback_url: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None
) -> UUID:
    """
    Insert a row into the batches table and return its generated ID.

    Args:
        name: Batch name/description
        requester_client_id: Client ID of the requester
        requester_source: Source of the batch request
        scrape_purpose: Purpose of the scrape
        total_jobs: Expected total number of jobs in batch
        callback_url: URL to call when batch completes
        metadata: Additional batch metadata; serialized to a JSON string.
            A falsy value (None or an empty dict) is stored as NULL.

    Returns:
        UUID of created batch
    """
    insert_sql = """
        INSERT INTO batches (
        name, requester_client_id, requester_source, scrape_purpose,
        total_jobs, callback_url, metadata
        )
        VALUES ($1, $2, $3, $4, $5, $6, $7)
        RETURNING batch_id
    """

    async with self.pool.acquire() as conn:
        metadata_json = json.dumps(metadata) if metadata else None
        batch_id = await conn.fetchval(
            insert_sql,
            name,
            requester_client_id,
            requester_source,
            scrape_purpose,
            total_jobs,
            callback_url,
            metadata_json,
        )

        log.info(f"Created batch {batch_id}: {name} ({total_jobs} jobs)")
        return batch_id
|
||||
|
||||
async def get_batch(self, batch_id: UUID) -> Optional[Dict[str, Any]]:
    """
    Fetch a single batch together with live job counts.

    The stored batch columns are joined against the jobs table to add three
    computed fields: pending_jobs, running_jobs and actual_total_jobs.

    Args:
        batch_id: Batch UUID

    Returns:
        Batch dictionary with job counts or None if not found
    """
    select_sql = """
        SELECT
        b.batch_id,
        b.name,
        b.status,
        b.requester_client_id,
        b.requester_source,
        b.scrape_purpose,
        b.total_jobs,
        b.completed_jobs,
        b.failed_jobs,
        b.callback_url,
        b.callback_status,
        b.metadata,
        b.created_at,
        b.updated_at,
        b.completed_at,
        COUNT(j.job_id) FILTER (WHERE j.status = 'pending') as pending_jobs,
        COUNT(j.job_id) FILTER (WHERE j.status = 'running') as running_jobs,
        COUNT(j.job_id) as actual_total_jobs
        FROM batches b
        LEFT JOIN jobs j ON j.batch_id = b.batch_id
        WHERE b.batch_id = $1
        GROUP BY b.batch_id
    """

    async with self.pool.acquire() as conn:
        record = await conn.fetchrow(select_sql, batch_id)

        # No matching batch row yields no record at all.
        return dict(record) if record else None
|
||||
|
||||
async def update_batch_progress(self, batch_id: UUID) -> None:
    """
    Recalculate and update batch progress from jobs table.

    Counts the batch's jobs by status, derives the batch status from those
    counts, and writes completed_jobs, failed_jobs, status and the
    timestamps back to the batches row.

    Status derivation:
        - no jobs at all                      -> 'pending'
        - completed + failed + partial >= all -> 'completed'
        - at least one of those is non-zero   -> 'running'
        - otherwise                           -> 'pending'

    Args:
        batch_id: Batch UUID
    """
    async with self.pool.acquire() as conn:
        # Calculate job counts
        counts = await conn.fetchrow("""
            SELECT
            COUNT(*) FILTER (WHERE status = 'completed') as completed,
            COUNT(*) FILTER (WHERE status = 'failed') as failed,
            COUNT(*) FILTER (WHERE status = 'partial') as partial,
            COUNT(*) as total
            FROM jobs
            WHERE batch_id = $1
        """, batch_id)

        completed = counts['completed'] or 0
        failed = counts['failed'] or 0
        partial = counts['partial'] or 0
        total = counts['total'] or 0

        # Jobs in any of these three states are treated as finished.
        # 'partial' jobs count toward completion but are not written to a
        # counter column below.
        finished = completed + failed + partial

        # Determine batch status
        if total == 0:
            status = 'pending'
        elif finished >= total:
            status = 'completed'
        elif finished > 0:
            status = 'running'
        else:
            status = 'pending'

        # Update batch.
        # The completion flag is bound as its own boolean parameter instead
        # of comparing the status placeholder to a literal inside the SQL:
        # reusing one placeholder both as the value assigned to the status
        # column and as an operand of "= 'completed'" makes the server infer
        # its type twice, which fails with "inconsistent types deduced for
        # parameter" when the column is not plain text.
        # NOTE(review): completed_at is set to NOW() on every call that finds
        # the batch completed, so a later recalculation moves the timestamp
        # -- confirm that is intended.
        await conn.execute("""
            UPDATE batches
            SET
            completed_jobs = $2,
            failed_jobs = $3,
            status = $4,
            updated_at = NOW(),
            completed_at = CASE WHEN $5::boolean THEN NOW() ELSE completed_at END
            WHERE batch_id = $1
        """, batch_id, completed, failed, status, status == 'completed')

        log.debug(f"Updated batch {batch_id} progress: {completed}/{total} completed, {failed} failed")
|
||||
|
||||
async def get_batches(
    self,
    requester_client_id: Optional[str] = None,
    status: Optional[str] = None,
    limit: int = 50
) -> List[Dict[str, Any]]:
    """
    Return the most recently created batches, newest first.

    Args:
        requester_client_id: Optional filter by requester client ID
        status: Optional filter by batch status
        limit: Maximum number of batches to return

    Returns:
        List of batch dictionaries
    """
    # Only filters with a truthy value take part in the WHERE clause; each
    # one gets the next positional placeholder in declaration order.
    candidate_filters = (
        ("requester_client_id", requester_client_id),
        ("status", status),
    )
    active_filters = [(column, value) for column, value in candidate_filters if value]

    conditions = [
        f"{column} = ${position}"
        for position, (column, _) in enumerate(active_filters, start=1)
    ]
    params = [value for _, value in active_filters]

    where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""

    # LIMIT always takes the placeholder right after the filter values.
    params.append(limit)
    limit_position = len(params)

    query = f"""
        SELECT
        batch_id,
        name,
        status,
        requester_client_id,
        requester_source,
        scrape_purpose,
        total_jobs,
        completed_jobs,
        failed_jobs,
        callback_url,
        callback_status,
        metadata,
        created_at,
        updated_at,
        completed_at
        FROM batches
        {where_clause}
        ORDER BY created_at DESC
        LIMIT ${limit_position}
    """

    async with self.pool.acquire() as conn:
        records = await conn.fetch(query, *params)
        return [dict(record) for record in records]
|
||||
|
||||
async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:
|
||||
|
||||
Reference in New Issue
Block a user