whyrating-engine-legacy/core/database.py

#!/usr/bin/env python3
"""
PostgreSQL database module for production microservice.
Stores job metadata and reviews as JSONB.
"""
import asyncpg
import json
from datetime import datetime
from typing import Optional, List, Dict, Any
from uuid import UUID, uuid4
import logging

from core.enums import JobStatus

log = logging.getLogger(__name__)


class DatabaseManager:
    """PostgreSQL database manager with connection pooling"""

    def __init__(self, database_url: str):
        """
        Initialize database manager.

        Args:
            database_url: PostgreSQL connection URL
                         Format: postgresql://user:password@host:port/database
        """
        self.database_url = database_url
        self.pool: Optional[asyncpg.Pool] = None

    async def connect(self):
        """Create the asyncpg connection pool and store it on ``self.pool``."""
        log.info("Connecting to PostgreSQL database...")

        # Fixed pool sizing and per-command timeout passed to asyncpg.
        pool_options = {
            "min_size": 5,
            "max_size": 20,
            "command_timeout": 60,
        }
        self.pool = await asyncpg.create_pool(self.database_url, **pool_options)

        log.info("Database connection pool created")

    async def disconnect(self):
        """Close connection pool"""
        if self.pool:
            await self.pool.close()
            log.info("Database connection pool closed")

    async def initialize_schema(self):
        """
        Create the database schema if it doesn't exist and upgrade older ones.

        The statements below are executed in order and the sequence is
        re-runnable: tables, columns and indexes use IF NOT EXISTS, and the
        ``valid_status`` constraint is dropped (IF EXISTS) before being
        re-added.

        All statements run inside ONE transaction. This matters for the
        constraint swap: executed as separate autocommit statements, a failure
        of the ADD (or a second instance starting concurrently) could leave
        ``jobs`` without its status check or raise "constraint already
        exists". Inside a transaction the schema step is applied completely or
        not at all.

        Raises:
            Any exception raised by asyncpg while executing a statement; the
            transaction is rolled back before it propagates.
        """
        schema_statements = (
            # Jobs table
            """
            CREATE TABLE IF NOT EXISTS jobs (
                job_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                status VARCHAR(20) NOT NULL DEFAULT 'pending',
                url TEXT NOT NULL,
                webhook_url TEXT,
                webhook_secret TEXT,

                created_at TIMESTAMP NOT NULL DEFAULT NOW(),
                started_at TIMESTAMP,
                completed_at TIMESTAMP,
                updated_at TIMESTAMP,

                reviews_count INTEGER,
                total_reviews INTEGER,
                reviews_data JSONB,
                scrape_time REAL,

                error_message TEXT,
                metadata JSONB,
                scrape_logs JSONB,

                CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
            );
            """,
            # scrape_logs column for databases created before it existed
            """
            ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
            """,
            # updated_at column (incremental progress tracking) for existing databases
            """
            ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
            """,
            # review_topics column (extracted topic filters with mention counts)
            """
            ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
            """,
            # Replace the status constraint so existing databases accept 'partial'.
            # Drop + add must stay adjacent and in this order.
            """
            ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
            """,
            """
            ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
            """,
            # Indexes on jobs
            """
            CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at DESC);
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_jobs_webhook ON jobs(webhook_url) WHERE webhook_url IS NOT NULL;
            """,
            # Canary results table
            """
            CREATE TABLE IF NOT EXISTS canary_results (
                id SERIAL PRIMARY KEY,
                timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
                success BOOLEAN NOT NULL,
                reviews_count INTEGER,
                scrape_time REAL,
                error_message TEXT,
                metadata JSONB
            );
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_canary_timestamp ON canary_results(timestamp DESC);
            """,
            # Webhook attempts table (for retry tracking)
            """
            CREATE TABLE IF NOT EXISTS webhook_attempts (
                id SERIAL PRIMARY KEY,
                job_id UUID NOT NULL REFERENCES jobs(job_id) ON DELETE CASCADE,
                attempt_number INTEGER NOT NULL,
                timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
                success BOOLEAN NOT NULL,
                status_code INTEGER,
                error_message TEXT,
                response_time_ms REAL
            );
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
            """,
            # session_fingerprint and metrics_history columns on jobs
            """
            ALTER TABLE jobs ADD COLUMN IF NOT EXISTS session_fingerprint JSONB;
            """,
            """
            ALTER TABLE jobs ADD COLUMN IF NOT EXISTS metrics_history JSONB;
            """,
            # Crash reports table
            """
            CREATE TABLE IF NOT EXISTS crash_reports (
                crash_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                job_id UUID REFERENCES jobs(job_id) ON DELETE CASCADE,
                created_at TIMESTAMP NOT NULL DEFAULT NOW(),
                crash_type VARCHAR(50) NOT NULL,
                error_message TEXT,
                state JSONB NOT NULL,
                metrics_history JSONB,
                logs_before_crash JSONB,
                analysis JSONB,
                screenshot_url TEXT,
                dom_snapshot_id UUID
            );
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_crash_reports_job ON crash_reports(job_id);
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_crash_reports_type ON crash_reports(crash_type);
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_crash_reports_created ON crash_reports(created_at DESC);
            """,
        )

        async with self.pool.acquire() as conn:
            # One transaction for the whole step: all-or-nothing, and the
            # constraint drop/add pair can never be observed half-done.
            async with conn.transaction():
                for statement in schema_statements:
                    await conn.execute(statement)

        log.info("Database schema initialized")

    async def run_migrations(self, migrations_dir: str = "migrations/versions") -> int:
        """
        Run versioned migrations from SQL files.

        Applied filenames are recorded in the ``_migrations`` table and files
        whose name is already recorded are skipped. Each pending file is
        executed in its own transaction together with its bookkeeping INSERT,
        so a failing migration is rolled back and not marked as applied.
        Migrations that completed earlier in the same call stay committed.

        Args:
            migrations_dir: Path to directory containing .sql migration files.
                           Files are run in sorted order.

        Returns:
            Number of migrations applied (0 when the directory does not exist).

        Raises:
            Exception: Whatever the failing migration (or its commit) raised,
                re-raised after being logged.
        """
        from pathlib import Path

        migrations_path = Path(migrations_dir)
        if not migrations_path.exists():
            log.warning(f"Migrations directory not found: {migrations_dir}")
            return 0

        async with self.pool.acquire() as conn:
            # Create migrations tracking table
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS _migrations (
                    id SERIAL PRIMARY KEY,
                    filename VARCHAR(255) UNIQUE NOT NULL,
                    applied_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
                )
            """)

            # Get already applied migrations
            applied = await conn.fetch("SELECT filename FROM _migrations")
            applied_set = {r["filename"] for r in applied}

            migrations_run = 0

            # Find and run pending migrations in sorted path order
            for migration_file in sorted(migrations_path.glob("*.sql")):
                filename = migration_file.name
                if filename in applied_set:
                    continue

                log.info(f"Running migration: {filename}")

                try:
                    async with conn.transaction():
                        # Explicit UTF-8 so the SQL is read identically
                        # regardless of the host's locale encoding.
                        sql = migration_file.read_text(encoding="utf-8")
                        await conn.execute(sql)
                        await conn.execute(
                            "INSERT INTO _migrations (filename) VALUES ($1)",
                            filename,
                        )
                except Exception as e:
                    log.error(f"Migration {filename} failed: {e}")
                    raise

                # Only count and report success once the transaction has
                # actually committed.
                migrations_run += 1
                log.info(f"Migration {filename} applied successfully")

            log.info(f"Ran {migrations_run} migrations")
            return migrations_run

    # ==================== Job Operations ====================

    async def create_job(
        self,
        url: str,
        webhook_url: Optional[str] = None,
        webhook_secret: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        # Phase 2: New fields for requester tracking
        requester_client_id: Optional[str] = None,
        requester_source: Optional[str] = None,
        scrape_purpose: Optional[str] = None,
        requester_metadata: Optional[Dict[str, Any]] = None,
        # Phase 2: Batch support
        batch_id: Optional[UUID] = None,
        batch_index: Optional[int] = None,
        # Phase 2: Job configuration
        job_type: str = 'google_reviews',
        priority: int = 0,
        callback_url: Optional[str] = None,
        # Phase 2: Scraper versioning
        scraper_version: Optional[str] = None,
        scraper_variant: Optional[str] = None
    ) -> UUID:
        """
        Insert a new row into ``jobs`` and return its generated ID.

        Args:
            url: Google Maps URL to scrape
            webhook_url: Optional webhook URL for notifications
            webhook_secret: Optional secret for webhook signature
            metadata: Optional additional metadata (stored as JSON; a falsy
                value such as None or {} is stored as NULL)
            requester_client_id: Client ID of the requester (for tracking)
            requester_source: Source of the request (e.g., 'api', 'web', 'batch')
            scrape_purpose: Purpose of the scrape (e.g., 'competitor_analysis')
            requester_metadata: Additional requester-specific metadata (stored
                as JSON; falsy values are stored as NULL)
            batch_id: ID of the batch this job belongs to
            batch_index: Index of this job within the batch
            job_type: Type of job (default: 'google_reviews')
            priority: Job priority (higher = more urgent, default: 0)
            callback_url: URL to call when job completes
            scraper_version: Version of the scraper to use
            scraper_variant: Variant of the scraper (e.g., 'stealth', 'fast')

        Returns:
            UUID of created job
        """
        # Dict arguments are serialized to JSON text for the database.
        metadata_json = json.dumps(metadata) if metadata else None
        requester_metadata_json = json.dumps(requester_metadata) if requester_metadata else None

        insert_sql = """
                INSERT INTO jobs (
                    url, webhook_url, webhook_secret, metadata,
                    requester_client_id, requester_source, scrape_purpose, requester_metadata,
                    batch_id, batch_index,
                    job_type, priority, callback_url,
                    scraper_version, scraper_variant
                )
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
                RETURNING job_id
            """

        # Positional values, in the same order as the column list above.
        values = (
            url,
            webhook_url,
            webhook_secret,
            metadata_json,
            requester_client_id,
            requester_source,
            scrape_purpose,
            requester_metadata_json,
            batch_id,
            batch_index,
            job_type,
            priority,
            callback_url,
            scraper_version,
            scraper_variant,
        )

        async with self.pool.acquire() as conn:
            job_id = await conn.fetchval(insert_sql, *values)

            log.info(f"Created job {job_id} for URL: {url[:80]}...")
            return job_id

    async def get_job(self, job_id: UUID) -> Optional[Dict[str, Any]]:
        """
        Look up one job row by its ID.

        The select list is explicit and does not include ``webhook_secret``.

        Args:
            job_id: Job UUID

        Returns:
            The row converted to a dictionary, or None if no job matches
        """
        select_sql = """
                SELECT
                    job_id,
                    status,
                    url,
                    webhook_url,
                    created_at,
                    started_at,
                    completed_at,
                    updated_at,
                    reviews_count,
                    total_reviews,
                    reviews_data,
                    scrape_time,
                    error_message,
                    metadata,
                    scrape_logs,
                    review_topics,
                    requester_client_id,
                    requester_source,
                    scrape_purpose,
                    requester_metadata,
                    batch_id,
                    batch_index,
                    job_type,
                    priority,
                    callback_url,
                    callback_status,
                    callback_attempts,
                    scraper_version,
                    scraper_variant,
                    business_name,
                    business_category,
                    business_address,
                    business_rating
                FROM jobs
                WHERE job_id = $1
            """

        async with self.pool.acquire() as conn:
            record = await conn.fetchrow(select_sql, job_id)
            return dict(record) if record else None

    async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
        """
        Fetch the stored reviews of one job.

        Args:
            job_id: Job UUID
            include_partial: If True, reviews are returned for jobs in status
                'completed', 'running' or 'partial'; if False, only for
                'completed' jobs

        Returns:
            List of reviews, or None when no row matches or the stored value
            is empty
        """
        if include_partial:
            # Completed, running, or partial jobs
            select_sql = """
                    SELECT reviews_data
                    FROM jobs
                    WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
                """
        else:
            # Completed jobs only
            select_sql = """
                    SELECT reviews_data
                    FROM jobs
                    WHERE job_id = $1 AND status = 'completed'
                """

        async with self.pool.acquire() as conn:
            payload = await conn.fetchval(select_sql, job_id)

            if not payload:
                return None

            # The JSONB value may arrive as a JSON string; decode it in that
            # case, otherwise hand back whatever the driver produced.
            return json.loads(payload) if isinstance(payload, str) else payload

    async def update_job_status(
        self,
        job_id: UUID,
        status: JobStatus,
        **kwargs
    ):
        """
        Update job status and optional fields.

        ``started_at`` is filled in automatically when the status becomes
        RUNNING, and ``completed_at`` when it becomes COMPLETED, FAILED or
        CANCELLED, unless the caller supplies the value explicitly.

        Args:
            job_id: Job UUID
            status: New status
            **kwargs: Additional columns to update, keyed by column name
                (started_at, completed_at, error_message, etc.). Values for
                the JSONB columns of ``jobs`` may be Python objects (they are
                serialized with json.dumps) or already-serialized JSON strings.

        Raises:
            ValueError: If a keyword is not a plain ASCII identifier. Column
                names cannot be bound as query parameters, so they are
                validated before being placed into the SQL text.
        """
        # Columns declared as JSONB on the jobs table in initialize_schema().
        jsonb_columns = {
            'scrape_logs',
            'reviews_data',
            'metadata',
            'review_topics',
            'session_fingerprint',
            'metrics_history',
        }

        # NOTE(review): these are naive local-time timestamps, whereas other
        # methods let the database stamp NOW() - confirm both clocks agree.
        if status == JobStatus.RUNNING and 'started_at' not in kwargs:
            kwargs['started_at'] = datetime.now()
        elif status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED] and 'completed_at' not in kwargs:
            kwargs['completed_at'] = datetime.now()

        # Build dynamic UPDATE query; $1 is the job id, $2 the status, extra
        # columns follow from $3.
        set_clauses = ["status = $2"]
        params = [job_id, status.value]

        for param_idx, (key, value) in enumerate(kwargs.items(), start=3):
            # Keys end up in the SQL text itself, so refuse anything that is
            # not a simple identifier (guards against injected SQL fragments).
            if not (key.isascii() and key.isidentifier()):
                raise ValueError(f"Invalid column name for job update: {key!r}")

            if key in jsonb_columns and value is not None:
                # JSONB columns need JSON text plus an explicit cast.
                set_clauses.append(f"{key} = ${param_idx}::jsonb")
                params.append(value if isinstance(value, str) else json.dumps(value))
            else:
                set_clauses.append(f"{key} = ${param_idx}")
                params.append(value)

        query = f"""
            UPDATE jobs
            SET {', '.join(set_clauses)}
            WHERE job_id = $1
        """

        async with self.pool.acquire() as conn:
            await conn.execute(query, *params)

    async def save_job_result(
        self,
        job_id: UUID,
        reviews: List[Dict[str, Any]],
        scrape_time: float,
        total_reviews: Optional[int] = None,
        scrape_logs: Optional[List[Dict[str, Any]]] = None,
        review_topics: Optional[List[Dict[str, Any]]] = None
    ):
        """
        Save scraping results to database and mark the job 'completed'.

        Two paths:
          * ``reviews`` is empty but the row already has a positive
            ``reviews_count`` (from incremental saves): the stored
            ``reviews_data`` / ``reviews_count`` are left untouched and only
            the completion fields are written.
          * otherwise: ``reviews`` replaces the stored reviews and count.

        In both paths a ``total_reviews`` of None keeps the value already
        stored on the row instead of overwriting it with NULL.

        Args:
            job_id: Job UUID
            reviews: List of review dictionaries
            scrape_time: Time taken to scrape in seconds
            total_reviews: Total reviews available (from page counter)
            scrape_logs: List of log entries from the scraper (falsy -> NULL)
            review_topics: List of topic filter dictionaries with topic and
                count (falsy -> NULL)
        """
        # Serialize once; both UPDATE variants use the same values.
        logs_json = json.dumps(scrape_logs) if scrape_logs else None
        topics_json = json.dumps(review_topics) if review_topics else None

        async with self.pool.acquire() as conn:
            # If reviews list is empty, check if job already has reviews from incremental saves
            # This happens when flush_callback was used during scraping
            if not reviews:
                existing = await conn.fetchval(
                    "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
                )
                if existing and existing > 0:
                    # Job has reviews from incremental saves, don't overwrite reviews_data
                    await conn.execute("""
                        UPDATE jobs
                        SET
                            status = 'completed',
                            completed_at = NOW(),
                            total_reviews = COALESCE($2, total_reviews),
                            scrape_time = $3,
                            scrape_logs = $4::jsonb,
                            review_topics = $5::jsonb
                        WHERE job_id = $1
                    """, job_id, total_reviews, scrape_time, logs_json, topics_json)
                    log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
                    return

            # COALESCE keeps a total recorded by earlier incremental saves
            # when the caller has no total to report, matching the branch
            # above and save_reviews_incremental().
            await conn.execute("""
                UPDATE jobs
                SET
                    status = 'completed',
                    completed_at = NOW(),
                    reviews_count = $2,
                    total_reviews = COALESCE($3, total_reviews),
                    reviews_data = $4::jsonb,
                    scrape_time = $5,
                    scrape_logs = $6::jsonb,
                    review_topics = $7::jsonb
                WHERE job_id = $1
            """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
                logs_json, topics_json)

            log.info(f"Saved {len(reviews)} reviews for job {job_id}")

    async def save_reviews_incremental(
        self,
        job_id: UUID,
        reviews: List[Dict[str, Any]],
        total_reviews: Optional[int] = None
    ):
        """
        Persist the reviews gathered so far while a scrape is in progress.

        Intended to be called on each flush so progress survives a crash.
        The UPDATE only matches rows whose status is 'running'; a None
        ``total_reviews`` leaves the stored total unchanged.

        Args:
            job_id: Job UUID
            reviews: ALL reviews collected so far (not just new ones)
            total_reviews: Total reviews available (from page counter)
        """
        review_count = len(reviews)
        reviews_json = json.dumps(reviews)

        update_sql = """
                UPDATE jobs
                SET
                    reviews_count = $2,
                    total_reviews = COALESCE($3, total_reviews),
                    reviews_data = $4::jsonb,
                    updated_at = NOW()
                WHERE job_id = $1 AND status = 'running'
            """

        async with self.pool.acquire() as conn:
            await conn.execute(update_sql, job_id, review_count, total_reviews, reviews_json)

            log.debug(f"Incremental save: {review_count} reviews for job {job_id}")

    async def update_session_fingerprint(
        self,
        job_id: UUID,
        session_fingerprint: Dict[str, Any]
    ):
        """
        Store the browser session fingerprint on a job row.

        The dictionary is serialized to JSON and replaces any fingerprint
        already stored; ``updated_at`` is refreshed as well. Meant to be
        called early in a scrape, once the browser fingerprint has been
        captured, so browser characteristics are available for bot detection
        analysis.

        Args:
            job_id: Job UUID
            session_fingerprint: Dictionary containing browser fingerprint data:
                - user_agent: Browser user agent string
                - platform: OS platform
                - language: Primary language
                - languages: List of accepted languages
                - timezone: Timezone string
                - screen: {width, height, colorDepth}
                - viewport: {width, height}
                - webgl_vendor: WebGL vendor string
                - webgl_renderer: WebGL renderer string
                - canvas_fingerprint: Canvas fingerprint hash
                - hardware_concurrency: Number of CPU cores
                - device_memory: Device memory in GB
                - bot_detection_tests: {webdriver_hidden, chrome_runtime, permissions_query}
                - captured_at: ISO timestamp when fingerprint was captured
        """
        fingerprint_json = json.dumps(session_fingerprint)

        update_sql = """
                UPDATE jobs
                SET
                    session_fingerprint = $2::jsonb,
                    updated_at = NOW()
                WHERE job_id = $1
            """

        async with self.pool.acquire() as conn:
            await conn.execute(update_sql, job_id, fingerprint_json)

            log.debug(f"Updated session fingerprint for job {job_id}")

    async def update_job_metadata(
        self,
        job_id: UUID,
        metadata_updates: Dict[str, Any]
    ):
        """
        Merge new keys into a job's metadata, keeping the keys already there.

        The merge uses JSONB concatenation (``||``) at the top level, with a
        NULL ``metadata`` treated as an empty object; ``updated_at`` is
        refreshed.

        Args:
            job_id: Job UUID
            metadata_updates: Dictionary of metadata fields to update/add
                - bot_detected: True if sort button was hidden (bot detection)
                - initial_sort_used: Sort order used for scraping
                - sort_orders_attempted: List of all sort orders tried
                - multi_sort: Multi-sort completion info
        """
        updates_json = json.dumps(metadata_updates)

        # Existing metadata on the left, updates on the right of the
        # concatenation.
        merge_sql = """
                UPDATE jobs
                SET
                    metadata = COALESCE(metadata, '{}'::jsonb) || $2::jsonb,
                    updated_at = NOW()
                WHERE job_id = $1
            """

        async with self.pool.acquire() as conn:
            await conn.execute(merge_sql, job_id, updates_json)

            log.debug(f"Updated job metadata for job {job_id}: {list(metadata_updates.keys())}")

    async def update_business_info(
        self,
        job_id: UUID,
        business_name: Optional[str] = None,
        business_category: Optional[str] = None,
        business_address: Optional[str] = None,
        business_rating: Optional[float] = None
    ):
        """
        Write business details into the dedicated columns of a job row.

        These are plain (non-JSONB) columns so the business data captured
        from the Google Maps page stays queryable. Each column is wrapped in
        COALESCE, so an argument left as None keeps the value already stored.
        ``updated_at`` is refreshed.

        Args:
            job_id: Job UUID
            business_name: Business name from Google Maps
            business_category: Business category (e.g., "Restaurant", "Toy store")
            business_address: Full address from Google Maps
            business_rating: Aggregate rating at time of scrape (e.g., 4.5)
        """
        update_sql = """
                UPDATE jobs
                SET
                    business_name = COALESCE($2, business_name),
                    business_category = COALESCE($3, business_category),
                    business_address = COALESCE($4, business_address),
                    business_rating = COALESCE($5, business_rating),
                    updated_at = NOW()
                WHERE job_id = $1
            """
        new_values = (business_name, business_category, business_address, business_rating)

        async with self.pool.acquire() as conn:
            await conn.execute(update_sql, job_id, *new_values)

            log.debug(f"Updated business info for job {job_id}: name={business_name}, category={business_category}")

    async def mark_job_partial(
        self,
        job_id: UUID,
        error_message: str,
        scrape_logs: Optional[List[Dict[str, Any]]] = None
    ):
        """
        Flag a job as 'partial' (crashed but has some reviews saved).

        Sets the status, stamps ``completed_at`` and stores the error. The
        ``scrape_logs`` column is always written: a falsy argument (None or
        an empty list) stores NULL.

        Args:
            job_id: Job UUID
            error_message: Error that caused the crash
            scrape_logs: Log entries from the scraper
        """
        logs_json = json.dumps(scrape_logs) if scrape_logs else None

        update_sql = """
                UPDATE jobs
                SET
                    status = 'partial',
                    completed_at = NOW(),
                    error_message = $2,
                    scrape_logs = $3::jsonb
                WHERE job_id = $1
            """

        async with self.pool.acquire() as conn:
            await conn.execute(update_sql, job_id, error_message, logs_json)

            log.info(f"Marked job {job_id} as partial due to: {error_message}")

    async def list_jobs(
        self,
        status: Optional[JobStatus] = None,
        limit: int = 100,
        offset: int = 0,
        requester_client_id: Optional[str] = None,
        batch_id: Optional[UUID] = None
    ) -> List[Dict[str, Any]]:
        """
        Return a page of jobs, newest first, optionally filtered.

        Filters that are falsy (None, empty) are not applied. The select list
        leaves out the large ``reviews_data`` and ``scrape_logs`` columns.

        Args:
            status: Optional status filter
            limit: Maximum number of jobs to return
            offset: Number of jobs to skip
            requester_client_id: Optional filter by requester client ID
            batch_id: Optional filter by batch ID

        Returns:
            List of job dictionaries
        """
        # Collect the active filters as (column, value) pairs; their order
        # determines the placeholder numbers.
        filters = []
        if status:
            filters.append(("status", status.value))
        if requester_client_id:
            filters.append(("requester_client_id", requester_client_id))
        if batch_id:
            filters.append(("batch_id", batch_id))

        conditions = [
            f"{column} = ${position}"
            for position, (column, _) in enumerate(filters, start=1)
        ]
        where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""

        # LIMIT / OFFSET take the two placeholders after the filter values.
        limit_idx = len(filters) + 1
        params = [value for _, value in filters] + [limit, offset]

        query = f"""
                SELECT
                    job_id,
                    status,
                    url,
                    created_at,
                    completed_at,
                    reviews_count,
                    total_reviews,
                    scrape_time,
                    error_message,
                    metadata,
                    review_topics,
                    requester_client_id,
                    requester_source,
                    scrape_purpose,
                    requester_metadata,
                    batch_id,
                    batch_index,
                    job_type,
                    priority,
                    callback_url,
                    callback_status,
                    callback_attempts,
                    scraper_version,
                    scraper_variant,
                    business_name,
                    business_category,
                    business_address,
                    business_rating
                FROM jobs
                {where_clause}
                ORDER BY created_at DESC
                LIMIT ${limit_idx} OFFSET ${limit_idx + 1}
            """

        async with self.pool.acquire() as conn:
            records = await conn.fetch(query, *params)
            return [dict(record) for record in records]

    async def update_job_callback(
        self,
        job_id: UUID,
        status: str,
        attempts: Optional[int] = None
    ):
        """
        Record the callback delivery state of a job.

        Args:
            job_id: Job UUID
            status: Callback status ('pending', 'success', 'failed', 'skipped')
            attempts: Number of callback attempts to store; when None the
                stored counter is incremented by 1 (a NULL counter counts
                as 0)
        """
        if attempts is None:
            # No explicit count: bump the stored counter.
            update_sql = """
                    UPDATE jobs
                    SET callback_status = $2,
                        callback_attempts = COALESCE(callback_attempts, 0) + 1,
                        updated_at = NOW()
                    WHERE job_id = $1
                """
            args = (job_id, status)
        else:
            # Explicit count supplied by the caller.
            update_sql = """
                    UPDATE jobs
                    SET callback_status = $2, callback_attempts = $3, updated_at = NOW()
                    WHERE job_id = $1
                """
            args = (job_id, status, attempts)

        async with self.pool.acquire() as conn:
            await conn.execute(update_sql, *args)

            log.debug(f"Updated callback status for job {job_id}: {status}")

    # ==================== Batch Operations ====================

    async def create_batch(
        self,
        name: str,
        requester_client_id: Optional[str] = None,
        requester_source: Optional[str] = None,
        scrape_purpose: Optional[str] = None,
        total_jobs: int = 0,
        callback_url: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> UUID:
        """
        Insert a new row into ``batches`` and return its generated ID.

        Args:
            name: Batch name/description
            requester_client_id: Client ID of the requester
            requester_source: Source of the batch request
            scrape_purpose: Purpose of the scrape
            total_jobs: Expected total number of jobs in batch
            callback_url: URL to call when batch completes
            metadata: Additional batch metadata (stored as JSON; a falsy
                value such as None or {} is stored as NULL)

        Returns:
            UUID of created batch
        """
        metadata_json = json.dumps(metadata) if metadata else None

        insert_sql = """
                INSERT INTO batches (
                    name, requester_client_id, requester_source, scrape_purpose,
                    total_jobs, callback_url, metadata
                )
                VALUES ($1, $2, $3, $4, $5, $6, $7)
                RETURNING batch_id
            """

        # Positional values, in the same order as the column list above.
        values = (
            name,
            requester_client_id,
            requester_source,
            scrape_purpose,
            total_jobs,
            callback_url,
            metadata_json,
        )

        async with self.pool.acquire() as conn:
            batch_id = await conn.fetchval(insert_sql, *values)

            log.info(f"Created batch {batch_id}: {name} ({total_jobs} jobs)")
            return batch_id

    async def get_batch(self, batch_id: UUID) -> Optional[Dict[str, Any]]:
        """
        Look up one batch together with live counts of its jobs.

        Besides the stored batch columns, the result carries three values
        computed from the ``jobs`` table at query time: ``pending_jobs``,
        ``running_jobs`` and ``actual_total_jobs``. The LEFT JOIN means a
        batch without any jobs is still returned, with zero counts.

        Args:
            batch_id: Batch UUID

        Returns:
            Batch dictionary with job counts or None if not found
        """
        select_sql = """
                SELECT
                    b.batch_id,
                    b.name,
                    b.status,
                    b.requester_client_id,
                    b.requester_source,
                    b.scrape_purpose,
                    b.total_jobs,
                    b.completed_jobs,
                    b.failed_jobs,
                    b.callback_url,
                    b.callback_status,
                    b.metadata,
                    b.created_at,
                    b.updated_at,
                    b.completed_at,
                    COUNT(j.job_id) FILTER (WHERE j.status = 'pending') as pending_jobs,
                    COUNT(j.job_id) FILTER (WHERE j.status = 'running') as running_jobs,
                    COUNT(j.job_id) as actual_total_jobs
                FROM batches b
                LEFT JOIN jobs j ON j.batch_id = b.batch_id
                WHERE b.batch_id = $1
                GROUP BY b.batch_id
            """

        async with self.pool.acquire() as conn:
            record = await conn.fetchrow(select_sql, batch_id)
            return dict(record) if record else None

    async def update_batch_progress(self, batch_id: UUID):
        """
        Recalculate and update batch progress from jobs table.

        Args:
            batch_id: Batch UUID
        """
        async with self.pool.acquire() as conn:
            # Calculate job counts
            counts = await conn.fetchrow("""
                SELECT
                    COUNT(*) FILTER (WHERE status = 'completed') as completed,
                    COUNT(*) FILTER (WHERE status = 'failed') as failed,
                    COUNT(*) FILTER (WHERE status = 'partial') as partial,
                    COUNT(*) as total
                FROM jobs
                WHERE batch_id = $1
            """, batch_id)

            completed = counts['completed'] or 0
            failed = counts['failed'] or 0
            partial = counts['partial'] or 0
            total = counts['total'] or 0

            # Determine batch status
            if total == 0:
                status = 'pending'
            elif completed + failed + partial >= total:
                status = 'completed'
            elif completed > 0 or failed > 0 or partial > 0:
                status = 'running'
            else:
                status = 'pending'

            # Update batch
            await conn.execute("""
                UPDATE batches
                SET
                    completed_jobs = $2,
                    failed_jobs = $3,
                    status = $4,
                    updated_at = NOW(),
                    completed_at = CASE WHEN $4 = 'completed' THEN NOW() ELSE completed_at END
                WHERE batch_id = $1
            """, batch_id, completed, failed, status)

            log.debug(f"Updated batch {batch_id} progress: {completed}/{total} completed, {failed} failed")

    async def get_batches(
        self,
        requester_client_id: Optional[str] = None,
        status: Optional[str] = None,
        limit: int = 50
    ) -> List[Dict[str, Any]]:
        """
        List batches with optional filtering.

        Args:
            requester_client_id: Optional filter by requester client ID
            status: Optional filter by batch status
            limit: Maximum number of batches to return

        Returns:
            List of batch dictionaries
        """
        async with self.pool.acquire() as conn:
            # Build dynamic WHERE clause
            conditions = []
            params = []
            param_idx = 1

            if requester_client_id:
                conditions.append(f"requester_client_id = ${param_idx}")
                params.append(requester_client_id)
                param_idx += 1

            if status:
                conditions.append(f"status = ${param_idx}")
                params.append(status)
                param_idx += 1

            where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
            params.append(limit)

            query = f"""
                SELECT
                    batch_id,
                    name,
                    status,
                    requester_client_id,
                    requester_source,
                    scrape_purpose,
                    total_jobs,
                    completed_jobs,
                    failed_jobs,
                    callback_url,
                    callback_status,
                    metadata,
                    created_at,
                    updated_at,
                    completed_at
                FROM batches
                {where_clause}
                ORDER BY created_at DESC
                LIMIT ${param_idx}
            """

            rows = await conn.fetch(query, *params)
            return [dict(row) for row in rows]

    async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:
        """
        Get completed jobs that have webhooks pending delivery.

        Args:
            limit: Maximum number of jobs to return

        Returns:
            List of job dictionaries with webhook info
        """
        async with self.pool.acquire() as conn:
            rows = await conn.fetch("""
                SELECT
                    job_id,
                    status,
                    url,
                    webhook_url,
                    webhook_secret,
                    reviews_count,
                    scrape_time,
                    error_message,
                    completed_at
                FROM jobs
                WHERE webhook_url IS NOT NULL
                  AND status IN ('completed', 'failed')
                  AND job_id NOT IN (
                      SELECT job_id
                      FROM webhook_attempts
                      WHERE success = true
                  )
                ORDER BY completed_at ASC
                LIMIT $1
            """, limit)

            return [dict(row) for row in rows]

    async def delete_job(self, job_id: UUID) -> bool:
        """
        Delete a job from the database.

        Args:
            job_id: Job UUID

        Returns:
            True if deleted, False if not found
        """
        async with self.pool.acquire() as conn:
            result = await conn.execute("""
                DELETE FROM jobs WHERE job_id = $1
            """, job_id)

            deleted = result.split()[-1] == "1"
            if deleted:
                log.info(f"Deleted job {job_id}")
            return deleted

    async def cleanup_old_jobs(self, max_age_days: int = 30):
        """
        Delete old completed/failed jobs.

        Args:
            max_age_days: Maximum age in days before deletion
        """
        async with self.pool.acquire() as conn:
            result = await conn.execute("""
                DELETE FROM jobs
                WHERE status IN ('completed', 'failed', 'cancelled')
                  AND completed_at < NOW() - INTERVAL '%s days'
            """, max_age_days)

            deleted_count = int(result.split()[-1])
            if deleted_count > 0:
                log.info(f"Cleaned up {deleted_count} old jobs")

    # ==================== Statistics ====================

    async def get_stats(self) -> Dict[str, Any]:
        """
        Get job statistics.

        Returns:
            Statistics dictionary
        """
        async with self.pool.acquire() as conn:
            stats = await conn.fetchrow("""
                SELECT
                    COUNT(*) as total_jobs,
                    COUNT(*) FILTER (WHERE status = 'pending') as pending,
                    COUNT(*) FILTER (WHERE status = 'running') as running,
                    COUNT(*) FILTER (WHERE status = 'completed') as completed,
                    COUNT(*) FILTER (WHERE status = 'failed') as failed,
                    COUNT(*) FILTER (WHERE status = 'cancelled') as cancelled,
                    AVG(scrape_time) FILTER (WHERE status = 'completed') as avg_scrape_time,
                    SUM(reviews_count) FILTER (WHERE status = 'completed') as total_reviews
                FROM jobs
            """)

            return dict(stats)

    # ==================== Canary Operations ====================

    async def save_canary_result(
        self,
        success: bool,
        reviews_count: Optional[int] = None,
        scrape_time: Optional[float] = None,
        error_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """
        Save canary test result.

        Args:
            success: Whether canary test succeeded
            reviews_count: Number of reviews scraped
            scrape_time: Time taken in seconds
            error_message: Error message if failed
            metadata: Additional metadata
        """
        async with self.pool.acquire() as conn:
            await conn.execute("""
                INSERT INTO canary_results (success, reviews_count, scrape_time, error_message, metadata)
                VALUES ($1, $2, $3, $4, $5)
            """, success, reviews_count, scrape_time, error_message, json.dumps(metadata) if metadata else None)

    async def get_canary_history(self, limit: int = 100) -> List[Dict[str, Any]]:
        """
        Get canary test history.

        Args:
            limit: Maximum number of results to return

        Returns:
            List of canary result dictionaries
        """
        async with self.pool.acquire() as conn:
            rows = await conn.fetch("""
                SELECT
                    timestamp,
                    success,
                    reviews_count,
                    scrape_time,
                    error_message
                FROM canary_results
                ORDER BY timestamp DESC
                LIMIT $1
            """, limit)

            return [dict(row) for row in rows]

    # ==================== Webhook Attempts ====================

    async def log_webhook_attempt(
        self,
        job_id: UUID,
        attempt_number: int,
        success: bool,
        status_code: Optional[int] = None,
        error_message: Optional[str] = None,
        response_time_ms: Optional[float] = None
    ):
        """
        Log a webhook delivery attempt.

        Args:
            job_id: Job UUID
            attempt_number: Attempt number (1, 2, 3...)
            success: Whether delivery succeeded
            status_code: HTTP status code
            error_message: Error message if failed
            response_time_ms: Response time in milliseconds
        """
        async with self.pool.acquire() as conn:
            await conn.execute("""
                INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
                VALUES ($1, $2, $3, $4, $5, $6)
            """, job_id, attempt_number, success, status_code, error_message, response_time_ms)

    # ==================== Crash Reports ====================

    async def save_crash_report(self, job_id: str, crash_data: dict) -> str:
        """
        Save a crash report and return the crash_id.

        Args:
            job_id: Job UUID as string
            crash_data: Dictionary containing crash report data:
                - crash_type: Type of crash (required)
                - error_message: Error message (optional)
                - state: Current state at crash time (required)
                - metrics_history: Historical metrics (optional)
                - logs_before_crash: Log entries before crash (optional)
                - analysis: Crash analysis data (optional)
                - screenshot_url: URL to screenshot (optional)
                - dom_snapshot_id: UUID of DOM snapshot (optional)

        Returns:
            UUID of created crash report as string
        """
        async with self.pool.acquire() as conn:
            # Convert job_id string to UUID
            job_uuid = UUID(job_id) if isinstance(job_id, str) else job_id

            crash_id = await conn.fetchval("""
                INSERT INTO crash_reports (
                    job_id,
                    crash_type,
                    error_message,
                    state,
                    metrics_history,
                    logs_before_crash,
                    analysis,
                    screenshot_url,
                    dom_snapshot_id
                )
                VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb, $8, $9)
                RETURNING crash_id
            """,
                job_uuid,
                crash_data.get('crash_type'),
                crash_data.get('error_message'),
                json.dumps(crash_data.get('state', {})),
                json.dumps(crash_data.get('metrics_history')) if crash_data.get('metrics_history') else None,
                json.dumps(crash_data.get('logs_before_crash')) if crash_data.get('logs_before_crash') else None,
                json.dumps(crash_data.get('analysis')) if crash_data.get('analysis') else None,
                crash_data.get('screenshot_url'),
                UUID(crash_data['dom_snapshot_id']) if crash_data.get('dom_snapshot_id') else None
            )

            log.info(f"Saved crash report {crash_id} for job {job_id}, type: {crash_data.get('crash_type')}")
            return str(crash_id)

    async def get_crash_report(self, job_id: str) -> Optional[dict]:
        """
        Get crash report for a job, if any.

        Args:
            job_id: Job UUID as string

        Returns:
            Crash report dictionary or None if not found
        """
        async with self.pool.acquire() as conn:
            job_uuid = UUID(job_id) if isinstance(job_id, str) else job_id

            row = await conn.fetchrow("""
                SELECT
                    crash_id,
                    job_id,
                    created_at,
                    crash_type,
                    error_message,
                    state,
                    metrics_history,
                    logs_before_crash,
                    analysis,
                    screenshot_url,
                    dom_snapshot_id
                FROM crash_reports
                WHERE job_id = $1
                ORDER BY created_at DESC
                LIMIT 1
            """, job_uuid)

            if not row:
                return None

            result = dict(row)
            # Convert UUIDs to strings for JSON serialization
            result['crash_id'] = str(result['crash_id'])
            result['job_id'] = str(result['job_id'])
            if result.get('dom_snapshot_id'):
                result['dom_snapshot_id'] = str(result['dom_snapshot_id'])

            return result

    async def get_crash_stats(self, days: int = 7) -> dict:
        """
        Get crash statistics for the last N days.

        Args:
            days: Number of days to look back (default: 7)

        Returns:
            Dictionary with:
                - total: Total number of crashes
                - by_type: Dict mapping crash type to count
                - by_day: List of dicts with date and count
        """
        async with self.pool.acquire() as conn:
            # Get total count
            total = await conn.fetchval("""
                SELECT COUNT(*)
                FROM crash_reports
                WHERE created_at >= NOW() - INTERVAL '%s days'
            """, days)

            # Get counts by type
            type_rows = await conn.fetch("""
                SELECT crash_type, COUNT(*) as count
                FROM crash_reports
                WHERE created_at >= NOW() - INTERVAL '%s days'
                GROUP BY crash_type
                ORDER BY count DESC
            """, days)

            by_type = {row['crash_type']: row['count'] for row in type_rows}

            # Get counts by day
            day_rows = await conn.fetch("""
                SELECT DATE(created_at) as date, COUNT(*) as count
                FROM crash_reports
                WHERE created_at >= NOW() - INTERVAL '%s days'
                GROUP BY DATE(created_at)
                ORDER BY date DESC
            """, days)

            by_day = [{'date': str(row['date']), 'count': row['count']} for row in day_rows]

            return {
                'total': total or 0,
                'by_type': by_type,
                'by_day': by_day
            }

    # ==================== API Key Operations ====================

    async def initialize_api_keys_schema(self):
        """Create api_keys table if it doesn't exist."""
        async with self.pool.acquire() as conn:
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS api_keys (
                    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                    key_hash VARCHAR(64) NOT NULL UNIQUE,
                    key_prefix VARCHAR(8) NOT NULL,
                    name VARCHAR(255) NOT NULL,
                    client_id VARCHAR(255) NOT NULL,
                    scopes TEXT[] DEFAULT '{}',
                    rate_limit_rpm INTEGER DEFAULT 60,
                    is_active BOOLEAN DEFAULT true,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    last_used_at TIMESTAMP,
                    expires_at TIMESTAMP,
                    metadata JSONB
                );
            """)

            # Create indexes
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_api_keys_key_hash ON api_keys (key_hash);
            """)
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_api_keys_client_id ON api_keys (client_id);
            """)
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_api_keys_active ON api_keys (is_active) WHERE is_active = true;
            """)

            log.info("API keys schema initialized")

    async def get_api_key_by_hash(self, key_hash: str) -> Optional[Dict[str, Any]]:
        """
        Look up API key by its SHA-256 hash.

        This is the primary authentication lookup method. The hash is computed
        from the API key provided in the request header.

        Args:
            key_hash: SHA-256 hash of the API key (64 hex characters)

        Returns:
            API key record dictionary or None if not found:
            {
                "id": UUID,
                "key_prefix": "riq_a1b2",
                "name": "Production Key",
                "client_id": "veritas_123",
                "scopes": ["jobs:read", "jobs:write"],
                "rate_limit_rpm": 60,
                "is_active": True,
                "created_at": datetime,
                "last_used_at": datetime or None,
                "expires_at": datetime or None,
                "metadata": dict or None
            }
        """
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow("""
                SELECT
                    id,
                    key_prefix,
                    name,
                    client_id,
                    scopes,
                    rate_limit_rpm,
                    is_active,
                    created_at,
                    last_used_at,
                    expires_at,
                    metadata
                FROM api_keys
                WHERE key_hash = $1
            """, key_hash)

            if not row:
                return None

            result = dict(row)
            # Convert scopes from PostgreSQL array to Python list
            result['scopes'] = list(result['scopes']) if result['scopes'] else []
            return result

    async def create_api_key(
        self,
        client_id: str,
        name: str,
        scopes: List[str],
        rate_limit_rpm: int = 60,
        expires_at: Optional[datetime] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> tuple:
        """
        Generate a new API key and store its hash for a client.

        IMPORTANT: the plain text key is returned by this call and nowhere
        else. Only its hash and prefix are written to the database, so the key
        cannot be recovered later. Hand it to the user right away.

        Args:
            client_id: External client identifier (e.g., "veritas_client_123")
            name: Human-readable name for the key (e.g., "Production API Key")
            scopes: List of permission scopes (e.g., ["jobs:read", "jobs:write"])
            rate_limit_rpm: Maximum requests per minute (default: 60)
            expires_at: Optional expiration datetime (None = never expires)
            metadata: Optional additional metadata dict; stored JSON-encoded,
                or as NULL when missing or empty

        Returns:
            Tuple of (plain_api_key, key_id):
                - plain_api_key: The full API key to give to the user (str)
                - key_id: UUID of the created key record

        Security Note:
            Never log or persist the plain key; only the prefix is logged here.
        """
        # Imported lazily because a module-level import would be circular.
        from api.middleware.auth import generate_api_key, APIKeyAuth

        plain_api_key = generate_api_key()
        hashed_key = APIKeyAuth.hash_api_key(plain_api_key)
        prefix = APIKeyAuth.get_key_prefix(plain_api_key)

        async with self.pool.acquire() as conn:
            insert_sql = """
                INSERT INTO api_keys (
                    key_hash,
                    key_prefix,
                    name,
                    client_id,
                    scopes,
                    rate_limit_rpm,
                    expires_at,
                    metadata
                )
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                RETURNING id
            """
            encoded_metadata = json.dumps(metadata) if metadata else None

            key_id = await conn.fetchval(
                insert_sql,
                hashed_key,
                prefix,
                name,
                client_id,
                scopes,
                rate_limit_rpm,
                expires_at,
                encoded_metadata,
            )

            # Only the prefix is logged, never the full key.
            log.info(
                f"Created API key {prefix}... for client {client_id} "
                f"with scopes {scopes}"
            )

            return (plain_api_key, key_id)

    async def update_api_key_last_used(self, key_id: UUID):
        """
        Update the last_used_at timestamp for an API key.

        Called after each successful authentication to track key usage.
        This is non-blocking and failures are logged but not raised.

        Args:
            key_id: UUID of the API key record
        """
        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE api_keys
                SET last_used_at = NOW()
                WHERE id = $1
            """, key_id)

    async def revoke_api_key(self, key_id: UUID) -> bool:
        """
        Revoke an API key by setting is_active to false.

        This is preferred over deletion as it preserves audit history.
        The key will immediately become invalid for authentication.

        Args:
            key_id: UUID of the API key to revoke

        Returns:
            True if key was found and revoked, False if not found
        """
        async with self.pool.acquire() as conn:
            result = await conn.execute("""
                UPDATE api_keys
                SET is_active = false
                WHERE id = $1 AND is_active = true
            """, key_id)

            revoked = result.split()[-1] == "1"
            if revoked:
                log.info(f"Revoked API key {key_id}")
            return revoked

    async def delete_api_key(self, key_id: UUID) -> bool:
        """
        Permanently delete an API key.

        Use revoke_api_key instead if you want to preserve audit history.
        Deletion is permanent and cannot be undone.

        Args:
            key_id: UUID of the API key to delete

        Returns:
            True if deleted, False if not found
        """
        async with self.pool.acquire() as conn:
            result = await conn.execute("""
                DELETE FROM api_keys WHERE id = $1
            """, key_id)

            deleted = result.split()[-1] == "1"
            if deleted:
                log.info(f"Deleted API key {key_id}")
            return deleted

    async def list_api_keys_for_client(
        self,
        client_id: str,
        include_inactive: bool = False
    ) -> List[Dict[str, Any]]:
        """
        List all API keys for a specific client.

        Note: This returns key metadata only, never the actual keys
        (since we only store hashes).

        Args:
            client_id: Client identifier to filter by
            include_inactive: Whether to include revoked keys (default: False)

        Returns:
            List of API key records (without key_hash for security)
        """
        async with self.pool.acquire() as conn:
            if include_inactive:
                rows = await conn.fetch("""
                    SELECT
                        id,
                        key_prefix,
                        name,
                        client_id,
                        scopes,
                        rate_limit_rpm,
                        is_active,
                        created_at,
                        last_used_at,
                        expires_at
                    FROM api_keys
                    WHERE client_id = $1
                    ORDER BY created_at DESC
                """, client_id)
            else:
                rows = await conn.fetch("""
                    SELECT
                        id,
                        key_prefix,
                        name,
                        client_id,
                        scopes,
                        rate_limit_rpm,
                        is_active,
                        created_at,
                        last_used_at,
                        expires_at
                    FROM api_keys
                    WHERE client_id = $1 AND is_active = true
                    ORDER BY created_at DESC
                """, client_id)

            results = []
            for row in rows:
                record = dict(row)
                record['scopes'] = list(record['scopes']) if record['scopes'] else []
                results.append(record)

            return results

    async def get_api_key_by_id(self, key_id: UUID) -> Optional[Dict[str, Any]]:
        """
        Get API key metadata by its ID.

        Note: This returns key metadata only, never the actual key
        (since we only store hashes).

        Args:
            key_id: UUID of the API key

        Returns:
            API key record dictionary or None if not found
        """
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow("""
                SELECT
                    id,
                    key_prefix,
                    name,
                    client_id,
                    scopes,
                    rate_limit_rpm,
                    is_active,
                    created_at,
                    last_used_at,
                    expires_at,
                    metadata
                FROM api_keys
                WHERE id = $1
            """, key_id)

            if not row:
                return None

            result = dict(row)
            result['scopes'] = list(result['scopes']) if result['scopes'] else []
            return result

    async def update_api_key_scopes(
        self,
        key_id: UUID,
        scopes: List[str]
    ) -> bool:
        """
        Update the scopes for an API key.

        Args:
            key_id: UUID of the API key
            scopes: New list of permission scopes

        Returns:
            True if updated, False if key not found
        """
        async with self.pool.acquire() as conn:
            result = await conn.execute("""
                UPDATE api_keys
                SET scopes = $2
                WHERE id = $1
            """, key_id, scopes)

            updated = result.split()[-1] == "1"
            if updated:
                log.info(f"Updated scopes for API key {key_id}: {scopes}")
            return updated