Phases 2-4: Requester support, batches, webhooks, scraper registry

Phase 2 - Requester & Batch Support:
- core/database.py: Added create_job params (requester_*, batch_*, priority, callback_*)
- core/database.py: Added batch methods (create_batch, get_batch, update_batch_progress, get_batches)
- core/database.py: Added update_job_callback for tracking webhook delivery
- api/routes/batches.py: New endpoints:
  - POST /api/scrape/google-reviews/batch (submit batch)
  - GET /api/batches (list batches)
  - GET /api/batches/{id} (batch detail)
  - DELETE /api/batches/{id} (cancel batch)
- api_server_production.py: Updated /api/scrape with requester, priority, callback fields
- api_server_production.py: New primary endpoint POST /api/scrape/google-reviews

Phase 3 - Webhooks:
- services/job_callback_service.py: New service with:
  - JobCallbackService: send_job_callback, send_batch_callback, retry_failed_callbacks
  - JobCallbackDispatcher: Background worker for callback monitoring
  - Payload formats per spec (job.completed, job.failed, batch.completed)
  - Exponential backoff for retries
  - Error classification for failure payloads

Phase 4 - Scraper Registry:
- scrapers/registry.py: Database-backed version routing:
  - get_scraper(): Version/variant/A/B routing
  - _get_weighted_scraper(): Traffic-weighted random selection
  - 60-second TTL cache for performance
  - register_scraper, deprecate_scraper, update_traffic_allocation
  - LegacyScraperRegistry preserved for backwards compatibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 15:35:58 +00:00
parent 2412996c54
commit 788ef84756
8 changed files with 2503 additions and 98 deletions
--- a/core/database.py
+++ b/core/database.py
@@ -189,7 +189,22 @@ class DatabaseManager:
        url: str,
        webhook_url: Optional[str] = None,
        webhook_secret: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None
+        metadata: Optional[Dict[str, Any]] = None,
+        # Phase 2: New fields for requester tracking
+        requester_client_id: Optional[str] = None,
+        requester_source: Optional[str] = None,
+        scrape_purpose: Optional[str] = None,
+        requester_metadata: Optional[Dict[str, Any]] = None,
+        # Phase 2: Batch support
+        batch_id: Optional[UUID] = None,
+        batch_index: Optional[int] = None,
+        # Phase 2: Job configuration
+        job_type: str = 'google_reviews',
+        priority: int = 0,
+        callback_url: Optional[str] = None,
+        # Phase 2: Scraper versioning
+        scraper_version: Optional[str] = None,
+        scraper_variant: Optional[str] = None
    ) -> UUID:
        """
        Create a new scraping job.
@@ -199,16 +214,41 @@ class DatabaseManager:
            webhook_url: Optional webhook URL for notifications
            webhook_secret: Optional secret for webhook signature
            metadata: Optional additional metadata
+            requester_client_id: Client ID of the requester (for tracking)
+            requester_source: Source of the request (e.g., 'api', 'web', 'batch')
+            scrape_purpose: Purpose of the scrape (e.g., 'competitor_analysis')
+            requester_metadata: Additional requester-specific metadata
+            batch_id: ID of the batch this job belongs to
+            batch_index: Index of this job within the batch
+            job_type: Type of job (default: 'google_reviews')
+            priority: Job priority (higher = more urgent, default: 0)
+            callback_url: URL to call when job completes
+            scraper_version: Version of the scraper to use
+            scraper_variant: Variant of the scraper (e.g., 'stealth', 'fast')

        Returns:
            UUID of created job
        """
        async with self.pool.acquire() as conn:
            job_id = await conn.fetchval("""
-                INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
-                VALUES ($1, $2, $3, $4)
+                INSERT INTO jobs (
+                    url, webhook_url, webhook_secret, metadata,
+                    requester_client_id, requester_source, scrape_purpose, requester_metadata,
+                    batch_id, batch_index,
+                    job_type, priority, callback_url,
+                    scraper_version, scraper_variant
+                )
+                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
                RETURNING job_id
-            """, url, webhook_url, webhook_secret, json.dumps(metadata) if metadata else None)
+            """,
+                url, webhook_url, webhook_secret,
+                json.dumps(metadata) if metadata else None,
+                requester_client_id, requester_source, scrape_purpose,
+                json.dumps(requester_metadata) if requester_metadata else None,
+                batch_id, batch_index,
+                job_type, priority, callback_url,
+                scraper_version, scraper_variant
+            )

            log.info(f"Created job {job_id} for URL: {url[:80]}...")
            return job_id
@@ -241,7 +281,20 @@ class DatabaseManager:
                    error_message,
                    metadata,
                    scrape_logs,
-                    review_topics
+                    review_topics,
+                    requester_client_id,
+                    requester_source,
+                    scrape_purpose,
+                    requester_metadata,
+                    batch_id,
+                    batch_index,
+                    job_type,
+                    priority,
+                    callback_url,
+                    callback_status,
+                    callback_attempts,
+                    scraper_version,
+                    scraper_variant
                FROM jobs
                WHERE job_id = $1
            """, job_id)
@@ -493,7 +546,9 @@ class DatabaseManager:
        self,
        status: Optional[JobStatus] = None,
        limit: int = 100,
-        offset: int = 0
+        offset: int = 0,
+        requester_client_id: Optional[str] = None,
+        batch_id: Optional[UUID] = None
    ) -> List[Dict[str, Any]]:
        """
        List jobs with optional filtering.
@@ -502,49 +557,299 @@ class DatabaseManager:
            status: Optional status filter
            limit: Maximum number of jobs to return
            offset: Number of jobs to skip
+            requester_client_id: Optional filter by requester client ID
+            batch_id: Optional filter by batch ID

        Returns:
            List of job dictionaries
        """
        async with self.pool.acquire() as conn:
-            if status:
-                rows = await conn.fetch("""
-                    SELECT
-                        job_id,
-                        status,
-                        url,
-                        created_at,
-                        completed_at,
-                        reviews_count,
-                        total_reviews,
-                        scrape_time,
-                        error_message,
-                        metadata,
-                        review_topics
-                    FROM jobs
-                    WHERE status = $1
-                    ORDER BY created_at DESC
-                    LIMIT $2 OFFSET $3
-                """, status.value, limit, offset)
-            else:
-                rows = await conn.fetch("""
-                    SELECT
-                        job_id,
-                        status,
-                        url,
-                        created_at,
-                        completed_at,
-                        reviews_count,
-                        total_reviews,
-                        scrape_time,
-                        error_message,
-                        metadata,
-                        review_topics
-                    FROM jobs
-                    ORDER BY created_at DESC
-                    LIMIT $1 OFFSET $2
-                """, limit, offset)
+            # Build dynamic WHERE clause
+            conditions = []
+            params = []
+            param_idx = 1

+            if status:
+                conditions.append(f"status = ${param_idx}")
+                params.append(status.value)
+                param_idx += 1
+
+            if requester_client_id:
+                conditions.append(f"requester_client_id = ${param_idx}")
+                params.append(requester_client_id)
+                param_idx += 1
+
+            if batch_id:
+                conditions.append(f"batch_id = ${param_idx}")
+                params.append(batch_id)
+                param_idx += 1
+
+            where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
+
+            # Add limit and offset params
+            params.extend([limit, offset])
+
+            query = f"""
+                SELECT
+                    job_id,
+                    status,
+                    url,
+                    created_at,
+                    completed_at,
+                    reviews_count,
+                    total_reviews,
+                    scrape_time,
+                    error_message,
+                    metadata,
+                    review_topics,
+                    requester_client_id,
+                    requester_source,
+                    scrape_purpose,
+                    requester_metadata,
+                    batch_id,
+                    batch_index,
+                    job_type,
+                    priority,
+                    callback_url,
+                    callback_status,
+                    callback_attempts,
+                    scraper_version,
+                    scraper_variant
+                FROM jobs
+                {where_clause}
+                ORDER BY created_at DESC
+                LIMIT ${param_idx} OFFSET ${param_idx + 1}
+            """
+
+            rows = await conn.fetch(query, *params)
+            return [dict(row) for row in rows]
+
+    async def update_job_callback(
+        self,
+        job_id: UUID,
+        status: str,
+        attempts: Optional[int] = None
+    ):
+        """
+        Update callback status for a job.
+
+        Args:
+            job_id: Job UUID
+            status: Callback status ('pending', 'success', 'failed', 'skipped')
+            attempts: Number of callback attempts (if not provided, increments by 1)
+        """
+        async with self.pool.acquire() as conn:
+            if attempts is not None:
+                await conn.execute("""
+                    UPDATE jobs
+                    SET callback_status = $2, callback_attempts = $3, updated_at = NOW()
+                    WHERE job_id = $1
+                """, job_id, status, attempts)
+            else:
+                await conn.execute("""
+                    UPDATE jobs
+                    SET callback_status = $2,
+                        callback_attempts = COALESCE(callback_attempts, 0) + 1,
+                        updated_at = NOW()
+                    WHERE job_id = $1
+                """, job_id, status)
+
+            log.debug(f"Updated callback status for job {job_id}: {status}")
+
+    # ==================== Batch Operations ====================
+
+    async def create_batch(
+        self,
+        name: str,
+        requester_client_id: Optional[str] = None,
+        requester_source: Optional[str] = None,
+        scrape_purpose: Optional[str] = None,
+        total_jobs: int = 0,
+        callback_url: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> UUID:
+        """
+        Create a new batch for grouping jobs.
+
+        Args:
+            name: Batch name/description
+            requester_client_id: Client ID of the requester
+            requester_source: Source of the batch request
+            scrape_purpose: Purpose of the scrape
+            total_jobs: Expected total number of jobs in batch
+            callback_url: URL to call when batch completes
+            metadata: Additional batch metadata
+
+        Returns:
+            UUID of created batch
+        """
+        async with self.pool.acquire() as conn:
+            batch_id = await conn.fetchval("""
+                INSERT INTO batches (
+                    name, requester_client_id, requester_source, scrape_purpose,
+                    total_jobs, callback_url, metadata
+                )
+                VALUES ($1, $2, $3, $4, $5, $6, $7)
+                RETURNING batch_id
+            """,
+                name, requester_client_id, requester_source, scrape_purpose,
+                total_jobs, callback_url,
+                json.dumps(metadata) if metadata else None
+            )
+
+            log.info(f"Created batch {batch_id}: {name} ({total_jobs} jobs)")
+            return batch_id
+
+    async def get_batch(self, batch_id: UUID) -> Optional[Dict[str, Any]]:
+        """
+        Get batch by ID with job counts.
+
+        Args:
+            batch_id: Batch UUID
+
+        Returns:
+            Batch dictionary with job counts or None if not found
+        """
+        async with self.pool.acquire() as conn:
+            row = await conn.fetchrow("""
+                SELECT
+                    b.batch_id,
+                    b.name,
+                    b.status,
+                    b.requester_client_id,
+                    b.requester_source,
+                    b.scrape_purpose,
+                    b.total_jobs,
+                    b.completed_jobs,
+                    b.failed_jobs,
+                    b.callback_url,
+                    b.callback_status,
+                    b.metadata,
+                    b.created_at,
+                    b.updated_at,
+                    b.completed_at,
+                    COUNT(j.job_id) FILTER (WHERE j.status = 'pending') as pending_jobs,
+                    COUNT(j.job_id) FILTER (WHERE j.status = 'running') as running_jobs,
+                    COUNT(j.job_id) as actual_total_jobs
+                FROM batches b
+                LEFT JOIN jobs j ON j.batch_id = b.batch_id
+                WHERE b.batch_id = $1
+                GROUP BY b.batch_id
+            """, batch_id)
+
+            if not row:
+                return None
+
+            return dict(row)
+
+    async def update_batch_progress(self, batch_id: UUID):
+        """
+        Recalculate and update batch progress from jobs table.
+
+        Args:
+            batch_id: Batch UUID
+        """
+        async with self.pool.acquire() as conn:
+            # Calculate job counts
+            counts = await conn.fetchrow("""
+                SELECT
+                    COUNT(*) FILTER (WHERE status = 'completed') as completed,
+                    COUNT(*) FILTER (WHERE status = 'failed') as failed,
+                    COUNT(*) FILTER (WHERE status = 'partial') as partial,
+                    COUNT(*) as total
+                FROM jobs
+                WHERE batch_id = $1
+            """, batch_id)
+
+            completed = counts['completed'] or 0
+            failed = counts['failed'] or 0
+            partial = counts['partial'] or 0
+            total = counts['total'] or 0
+
+            # Determine batch status
+            if total == 0:
+                status = 'pending'
+            elif completed + failed + partial >= total:
+                status = 'completed'
+            elif completed > 0 or failed > 0 or partial > 0:
+                status = 'running'
+            else:
+                status = 'pending'
+
+            # Update batch
+            await conn.execute("""
+                UPDATE batches
+                SET
+                    completed_jobs = $2,
+                    failed_jobs = $3,
+                    status = $4,
+                    updated_at = NOW(),
+                    completed_at = CASE WHEN $4 = 'completed' THEN NOW() ELSE completed_at END
+                WHERE batch_id = $1
+            """, batch_id, completed, failed, status)
+
+            log.debug(f"Updated batch {batch_id} progress: {completed}/{total} completed, {failed} failed")
+
+    async def get_batches(
+        self,
+        requester_client_id: Optional[str] = None,
+        status: Optional[str] = None,
+        limit: int = 50
+    ) -> List[Dict[str, Any]]:
+        """
+        List batches with optional filtering.
+
+        Args:
+            requester_client_id: Optional filter by requester client ID
+            status: Optional filter by batch status
+            limit: Maximum number of batches to return
+
+        Returns:
+            List of batch dictionaries
+        """
+        async with self.pool.acquire() as conn:
+            # Build dynamic WHERE clause
+            conditions = []
+            params = []
+            param_idx = 1
+
+            if requester_client_id:
+                conditions.append(f"requester_client_id = ${param_idx}")
+                params.append(requester_client_id)
+                param_idx += 1
+
+            if status:
+                conditions.append(f"status = ${param_idx}")
+                params.append(status)
+                param_idx += 1
+
+            where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
+            params.append(limit)
+
+            query = f"""
+                SELECT
+                    batch_id,
+                    name,
+                    status,
+                    requester_client_id,
+                    requester_source,
+                    scrape_purpose,
+                    total_jobs,
+                    completed_jobs,
+                    failed_jobs,
+                    callback_url,
+                    callback_status,
+                    metadata,
+                    created_at,
+                    updated_at,
+                    completed_at
+                FROM batches
+                {where_clause}
+                ORDER BY created_at DESC
+                LIMIT ${param_idx}
+            """
+
+            rows = await conn.fetch(query, *params)
            return [dict(row) for row in rows]

    async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]: