Phase 1: Database migrations for platform features

Migrations created: - 001_add_job_platform_fields.sql: Add 15 new columns to jobs table - Requester tracking (client_id, source, purpose, metadata) - Batch support (batch_id, batch_index) - Execution tracking (job_type, scraper_version, variant, priority) - Webhook callbacks (url, status, sent_at, attempts) - Result summary (JSONB for cross-type dashboard) - 7 indexes for query performance - 5 CHECK constraints for data validation - 002_create_batches_table.sql: Batch job grouping - Tracks batch progress (total/completed/failed) - Batch-level callbacks - Requester association - 003_create_scraper_registry.sql: Scraper version management - Version routing (stable/beta/canary variants) - A/B traffic splitting (traffic_pct) - Priority-based routing - Seeds google_reviews v1.0.0 as stable default - 004_create_api_keys.sql: API authentication - Secure key storage (SHA-256 hashes, not plaintext) - Scopes-based permissions - Rate limiting support - Key lifecycle (expiry, active status) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 15:24:28 +00:00
parent 544e028c3f
commit 2412996c54
5 changed files with 554 additions and 1 deletions
--- a/migrations/versions/003_create_scraper_registry.sql
+++ b/migrations/versions/003_create_scraper_registry.sql
@@ -0,0 +1,137 @@
+-- Migration: 003_create_scraper_registry
+-- Description: Creates the scraper_registry table for dynamic scraper routing
+-- Date: 2026-01-24
+-- Phase: Phase 1 - ReviewIQ Platform
+
+-- ============================================================================
+-- SCRAPER REGISTRY TABLE
+-- ============================================================================
+-- This table enables dynamic scraper selection and A/B testing capabilities.
+--
+-- ROUTING LOGIC:
+-- 1. For a given job_type, the system first filters by variant (stable/beta/canary)
+-- 2. Among matching scrapers, traffic_pct determines probability of selection
+-- 3. is_default=true marks the fallback scraper when no traffic routing applies
+-- 4. min_priority allows reserving certain scrapers for high-priority jobs
+--
+-- A/B TESTING:
+-- - Set traffic_pct on multiple versions (should sum to 100 per job_type/variant; no constraint enforces this)
+-- - Example: v1.0.0 at 90%, v1.1.0-beta at 10% for gradual rollout
+-- - Monitor performance metrics per version to make data-driven decisions
+--
+-- DEPRECATION:
+-- - Set deprecated_at to soft-deprecate a scraper version
+-- - Deprecated scrapers are excluded from routing but kept for audit history
+-- ============================================================================
+
+-- Registry of scraper implementations: one row per (job_type, version).
+--
+-- Routing columns and created_at are NOT NULL. A CHECK constraint passes when
+-- its expression evaluates to NULL, so a nullable traffic_pct would let NULL
+-- bypass valid_traffic_pct; the defaults keep existing INSERTs working.
+--
+-- NOTE(review): gen_random_uuid() is built in from PostgreSQL 13; older
+-- servers need the pgcrypto extension -- confirm the target server version.
+CREATE TABLE scraper_registry (
+    id                   UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+
+    -- Scraper identification
+    job_type             VARCHAR(50) NOT NULL,     -- e.g., "google_reviews", "yelp_reviews"
+    version              VARCHAR(50) NOT NULL,     -- semver format: "1.0.0", "1.1.0-beta"
+    variant              VARCHAR(20) NOT NULL,     -- "stable", "beta", "canary"
+
+    -- Implementation reference
+    module_path          VARCHAR(255) NOT NULL,    -- Python module path: "scrapers.google_reviews.v1_0_0"
+    function_name        VARCHAR(100) NOT NULL,    -- Entry function: "fast_scrape_reviews"
+
+    -- Routing configuration (never NULL; omitted values fall back to the defaults)
+    is_default           BOOLEAN NOT NULL DEFAULT false,  -- Fallback when no traffic routing matches
+    traffic_pct          INTEGER NOT NULL DEFAULT 0,      -- 0-100: percentage of traffic for A/B testing
+    min_priority         INTEGER NOT NULL DEFAULT 0,      -- Only route jobs with priority >= this value
+
+    -- Lifecycle timestamps
+    -- NOTE(review): TIMESTAMP carries no time zone; consider TIMESTAMPTZ if the
+    -- rest of the schema uses it -- confirm before changing the type.
+    created_at           TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    deprecated_at        TIMESTAMP,                -- NULL if active; set to deprecate without deletion
+
+    -- Version-specific configuration
+    config               JSONB,                    -- Flexible settings: rate limits, retry policies, etc.
+
+    -- One registry row per version of a job type. The name is spelled out and
+    -- equals the name PostgreSQL would generate for the unnamed constraint.
+    CONSTRAINT scraper_registry_job_type_version_key UNIQUE (job_type, version),
+
+    -- Ensure traffic_pct is within valid range
+    CONSTRAINT valid_traffic_pct CHECK (traffic_pct >= 0 AND traffic_pct <= 100),
+
+    -- Ensure variant is one of the allowed values
+    CONSTRAINT valid_variant CHECK (variant IN ('stable', 'beta', 'canary'))
+);
+
+-- ============================================================================
+-- INDEXES
+-- ============================================================================
+
+-- Candidate lookup: partial B-tree over non-deprecated rows, keyed by job
+-- type, then release variant, then the default flag.
+CREATE INDEX idx_scraper_registry_job_type_lookup ON scraper_registry
+    USING btree (job_type, variant, is_default)
+    WHERE deprecated_at IS NULL;
+
+-- A/B traffic split: partial B-tree limited to non-deprecated rows that
+-- actually receive traffic (traffic_pct > 0), keyed by job type and share.
+CREATE INDEX idx_scraper_registry_traffic_routing ON scraper_registry
+    USING btree (job_type, traffic_pct)
+    WHERE deprecated_at IS NULL AND traffic_pct > 0;
+
+-- Version lookup by (job_type, version): no separate index is created here.
+-- The UNIQUE (job_type, version) constraint on scraper_registry already
+-- builds a unique B-tree index on exactly these columns, so a second index
+-- would only add write and storage overhead without speeding up any query.
+
+-- ============================================================================
+-- COMMENTS
+-- ============================================================================
+
+-- Catalog documentation for the table and each column that has routing or
+-- lifecycle meaning, so the descriptions are visible to anyone inspecting
+-- the schema (e.g., \d+ in psql) and not only to readers of this migration.
+COMMENT ON TABLE scraper_registry IS
+    'Registry of available scraper implementations with A/B testing and routing support';
+
+COMMENT ON COLUMN scraper_registry.job_type IS
+    'Type of scraping job (e.g., google_reviews, yelp_reviews)';
+
+COMMENT ON COLUMN scraper_registry.version IS
+    'Semantic version of the scraper implementation';
+
+COMMENT ON COLUMN scraper_registry.variant IS
+    'Release channel: stable (production), beta (pre-release testing), canary (experimental)';
+
+COMMENT ON COLUMN scraper_registry.module_path IS
+    'Python module path of the scraper implementation (e.g., scrapers.google_reviews.v1_0_0)';
+
+COMMENT ON COLUMN scraper_registry.function_name IS
+    'Name of the entry function within module_path (e.g., fast_scrape_reviews)';
+
+COMMENT ON COLUMN scraper_registry.is_default IS
+    'Marks the fallback scraper used when no traffic routing matches';
+
+COMMENT ON COLUMN scraper_registry.traffic_pct IS
+    'Percentage of traffic to route to this version (0-100). Sum should equal 100 per job_type/variant.';
+
+COMMENT ON COLUMN scraper_registry.min_priority IS
+    'Minimum job priority required to use this scraper. Allows reserving fast scrapers for urgent jobs.';
+
+COMMENT ON COLUMN scraper_registry.deprecated_at IS
+    'When this version was deprecated; NULL while active. Deprecated versions are kept for audit history.';
+
+COMMENT ON COLUMN scraper_registry.config IS
+    'JSONB configuration specific to this scraper version (rate limits, timeouts, feature flags)';
+
+-- ============================================================================
+-- SEED DATA: Register current Google Reviews scraper
+-- ============================================================================
+
+-- Seed the registry with the current Google Reviews scraper as the stable
+-- default receiving all traffic. ON CONFLICT makes the seed re-runnable: if a
+-- row for this (job_type, version) already exists it is left untouched
+-- instead of aborting the migration with a unique violation.
+INSERT INTO scraper_registry (
+    job_type,
+    version,
+    variant,
+    module_path,
+    function_name,
+    is_default,
+    traffic_pct,
+    min_priority,
+    config
+) VALUES (
+    'google_reviews',
+    '1.0.0',
+    'stable',
+    'scrapers.google_reviews.v1_0_0',
+    'fast_scrape_reviews',
+    true,           -- This is the default scraper
+    100,            -- Receives 100% of traffic
+    0,              -- Available for all priority levels
+    '{
+        "max_concurrent_requests": 5,
+        "request_timeout_seconds": 30,
+        "retry_attempts": 3,
+        "retry_delay_seconds": 2,
+        "rate_limit_per_minute": 60
+    }'::jsonb
+)
+ON CONFLICT (job_type, version) DO NOTHING;