-- Migration: 003_create_scraper_registry
-- Description: Creates the scraper_registry table for dynamic scraper routing
-- Date: 2026-01-24
-- Phase: Phase 1 - ReviewIQ Platform

-- ============================================================================
-- SCRAPER REGISTRY TABLE
-- ============================================================================
-- This table enables dynamic scraper selection and A/B testing capabilities.
--
-- ROUTING LOGIC:
-- 1. For a given job_type, the system first filters by variant (stable/beta/canary)
-- 2. Among matching scrapers, traffic_pct determines probability of selection
-- 3. is_default=true marks the fallback scraper when no traffic routing applies
-- 4. min_priority allows reserving certain scrapers for high-priority jobs
--
-- A/B TESTING:
-- - Set traffic_pct on multiple versions (values must sum to 100 per job_type/variant;
--   this migration does not enforce the sum, so whoever writes rows must maintain it)
-- - Example: v1.0.0 at 90%, v1.1.0-beta at 10% for gradual rollout
-- - Monitor performance metrics per version to make data-driven decisions
--
-- DEPRECATION:
-- - Set deprecated_at to soft-deprecate a scraper version
-- - Deprecated scrapers are excluded from routing but kept for audit history
-- ============================================================================

-- One row per registered scraper implementation (job_type + version).
-- Routing columns are NOT NULL: a NULL traffic_pct would pass the range CHECK
-- (CHECK accepts NULL results) and fall outside the "traffic_pct > 0" partial
-- index, and a NULL is_default is neither true nor false to a routing query.
CREATE TABLE scraper_registry (
    id                   UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Scraper identification
    job_type             VARCHAR(50) NOT NULL,     -- e.g., "google_reviews", "yelp_reviews"
    version              VARCHAR(50) NOT NULL,     -- semver format: "1.0.0", "1.1.0-beta"
    variant              VARCHAR(20) NOT NULL,     -- "stable", "beta", "canary" (see valid_variant)

    -- Implementation reference
    module_path          VARCHAR(255) NOT NULL,    -- Python module path: "scrapers.google_reviews.v1_0_0"
    function_name        VARCHAR(100) NOT NULL,    -- Entry function: "fast_scrape_reviews"

    -- Routing configuration (always populated; omitted values take the default)
    is_default           BOOLEAN NOT NULL DEFAULT false,   -- Fallback when no traffic routing matches
    traffic_pct          INTEGER NOT NULL DEFAULT 0,       -- 0-100: percentage of traffic for A/B testing
    min_priority         INTEGER NOT NULL DEFAULT 0,       -- Only route jobs with priority >= this value

    -- Lifecycle timestamps
    -- NOTE(review): TIMESTAMP carries no time zone; consider TIMESTAMPTZ if writers
    -- can run with different session time zones -- confirm with consumers first.
    created_at           TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deprecated_at        TIMESTAMP,                -- NULL if active; set to deprecate without deletion

    -- Version-specific configuration
    config               JSONB,                    -- Flexible settings: rate limits, retry policies, etc.

    -- A given version may be registered only once per job type.
    -- Named explicitly, using the name PostgreSQL would otherwise generate,
    -- so the identifier is visible in the DDL and stays unchanged.
    CONSTRAINT scraper_registry_job_type_version_key UNIQUE (job_type, version),

    -- Ensure traffic_pct is within valid range
    CONSTRAINT valid_traffic_pct CHECK (traffic_pct >= 0 AND traffic_pct <= 100),

    -- Ensure variant is one of the allowed values
    CONSTRAINT valid_variant CHECK (variant IN ('stable', 'beta', 'canary'))
);

-- ============================================================================
-- INDEXES
-- ============================================================================

-- Primary lookup index: find active scrapers for a job type
-- Used by: scraper routing logic to find candidate scrapers
-- Partial: only rows with deprecated_at IS NULL are indexed.
CREATE INDEX idx_scraper_registry_job_type_lookup
    ON scraper_registry (job_type, variant, is_default)
    WHERE deprecated_at IS NULL;

-- Traffic routing index: quickly find scrapers participating in A/B tests
-- Used by: traffic splitting logic
-- Partial: only non-deprecated rows that actually receive traffic are indexed.
CREATE INDEX idx_scraper_registry_traffic_routing
    ON scraper_registry (job_type, traffic_pct)
    WHERE deprecated_at IS NULL AND traffic_pct > 0;

-- Version lookup (admin tools, debugging, forced version routing) needs no
-- index of its own: the UNIQUE (job_type, version) constraint declared in
-- CREATE TABLE is already backed by a unique index on exactly these columns.
-- A second index on (job_type, version) would only duplicate it and add
-- write overhead, so none is created here.

-- ============================================================================
-- COMMENTS
-- ============================================================================

-- Catalog comments: visible via \d+ and pg_description, so the schema is
-- self-describing without this migration file. Every column is covered.

COMMENT ON TABLE scraper_registry IS
    'Registry of available scraper implementations with A/B testing and routing support';

COMMENT ON COLUMN scraper_registry.id IS
    'Surrogate primary key (generated UUID)';

COMMENT ON COLUMN scraper_registry.job_type IS
    'Type of scraping job (e.g., google_reviews, yelp_reviews)';

COMMENT ON COLUMN scraper_registry.version IS
    'Semantic version of the scraper implementation';

COMMENT ON COLUMN scraper_registry.variant IS
    'Release channel: stable (production), beta (pre-release testing), canary (experimental)';

COMMENT ON COLUMN scraper_registry.module_path IS
    'Python module path of the scraper implementation (e.g., scrapers.google_reviews.v1_0_0)';

COMMENT ON COLUMN scraper_registry.function_name IS
    'Entry function to call in module_path (e.g., fast_scrape_reviews)';

COMMENT ON COLUMN scraper_registry.is_default IS
    'Marks the fallback scraper used when no traffic routing applies';

COMMENT ON COLUMN scraper_registry.traffic_pct IS
    'Percentage of traffic to route to this version (0-100). Sum should equal 100 per job_type/variant.';

COMMENT ON COLUMN scraper_registry.min_priority IS
    'Minimum job priority required to use this scraper. Allows reserving fast scrapers for urgent jobs.';

COMMENT ON COLUMN scraper_registry.created_at IS
    'Time the registry row was created (defaults to insertion time)';

COMMENT ON COLUMN scraper_registry.deprecated_at IS
    'NULL while active; set to soft-deprecate this version without deleting the row';

COMMENT ON COLUMN scraper_registry.config IS
    'JSONB configuration specific to this scraper version (rate limits, timeouts, feature flags)';

-- ============================================================================
-- SEED DATA: Register current Google Reviews scraper
-- ============================================================================

-- Seed row: version 1.0.0 of the Google Reviews scraper, registered on the
-- stable channel as the default and taking all traffic at every priority level.
INSERT INTO scraper_registry
    (job_type, version, variant,
     module_path, function_name,
     is_default, traffic_pct, min_priority,
     config)
VALUES
    (
        'google_reviews',                    -- job_type
        '1.0.0',                             -- version
        'stable',                            -- variant
        'scrapers.google_reviews.v1_0_0',    -- module_path
        'fast_scrape_reviews',               -- function_name
        TRUE,                                -- is_default: the fallback scraper
        100,                                 -- traffic_pct: receives all traffic
        0,                                   -- min_priority: open to every priority level
        -- config: per-version runtime settings, stored as JSONB
        CAST('{
        "max_concurrent_requests": 5,
        "request_timeout_seconds": 30,
        "retry_attempts": 3,
        "retry_delay_seconds": 2,
        "rate_limit_per_minute": 60
    }' AS JSONB)
    );