-- Migration: 003_create_scraper_registry
-- Description: Creates the scraper_registry table for dynamic scraper routing
-- Date: 2026-01-24
-- Phase: Phase 1 - ReviewIQ Platform
--
-- This migration is written to be re-runnable: the table and indexes use
-- IF NOT EXISTS, COMMENT ON simply overwrites, and the seed row is inserted
-- with ON CONFLICT DO NOTHING.

-- ============================================================================
-- SCRAPER REGISTRY TABLE
-- ============================================================================
-- This table enables dynamic scraper selection and A/B testing capabilities.
--
-- ROUTING LOGIC:
-- 1. For a given job_type, the system first filters by variant (stable/beta/canary)
-- 2. Among matching scrapers, traffic_pct determines probability of selection
-- 3. is_default=true marks the fallback scraper when no traffic routing applies
-- 4. min_priority allows reserving certain scrapers for high-priority jobs
--
-- A/B TESTING:
-- - Set traffic_pct on multiple versions (must sum to 100 for a job_type/variant)
-- - Example: v1.0.0 at 90%, v1.1.0-beta at 10% for gradual rollout
-- - Monitor performance metrics per version to make data-driven decisions
-- - The "sum to 100" rule is NOT enforced by any constraint in this migration;
--   only the per-row 0-100 range is checked. Writers must maintain the sum.
--
-- DEPRECATION:
-- - Set deprecated_at to soft-deprecate a scraper version
-- - Deprecated scrapers are excluded from routing but kept for audit history
-- ============================================================================

CREATE TABLE IF NOT EXISTS scraper_registry (
    -- NOTE(review): gen_random_uuid() is built in from PostgreSQL 13; older
    -- servers need the pgcrypto extension -- confirm the target version.
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Scraper identification
    job_type VARCHAR(50) NOT NULL,        -- e.g., "google_reviews", "yelp_reviews"
    version VARCHAR(50) NOT NULL,         -- semver format: "1.0.0", "1.1.0-beta"
    variant VARCHAR(20) NOT NULL,         -- "stable", "beta", "canary"

    -- Implementation reference
    module_path VARCHAR(255) NOT NULL,    -- Python module path: "scrapers.google_reviews.v1_0_0"
    function_name VARCHAR(100) NOT NULL,  -- Entry function: "fast_scrape_reviews"

    -- Routing configuration
    -- These columns are NOT NULL so that an explicit NULL cannot slip past the
    -- range CHECK below (a CHECK is satisfied when it evaluates to NULL) or
    -- leave the routing flags in an undefined state.
    is_default BOOLEAN NOT NULL DEFAULT false,  -- Fallback when no traffic routing matches
    traffic_pct INTEGER NOT NULL DEFAULT 0,     -- 0-100: percentage of traffic for A/B testing
    min_priority INTEGER NOT NULL DEFAULT 0,    -- Only route jobs with priority >= this value

    -- Lifecycle timestamps
    -- NOTE(review): TIMESTAMP is time-zone-naive; consider TIMESTAMPTZ in a
    -- follow-up migration if readers can handle zone-aware values.
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deprecated_at TIMESTAMP,              -- NULL if active; set to deprecate without deletion

    -- Version-specific configuration
    config JSONB,                         -- Flexible settings: rate limits, retry policies, etc.

    -- Constraints
    -- One row per (job_type, version). This also creates a unique index on
    -- those two columns, which the seed INSERT below uses as its conflict target.
    UNIQUE (job_type, version),

    -- Ensure traffic_pct is within valid range
    CONSTRAINT valid_traffic_pct CHECK (traffic_pct >= 0 AND traffic_pct <= 100),

    -- Ensure variant is one of the allowed values
    CONSTRAINT valid_variant CHECK (variant IN ('stable', 'beta', 'canary'))
);

-- ============================================================================
-- INDEXES
-- ============================================================================

-- Primary lookup index: find active scrapers for a job type
-- Used by: scraper routing logic to find candidate scrapers
-- Partial: only rows that have not been deprecated are indexed.
CREATE INDEX IF NOT EXISTS idx_scraper_registry_job_type_lookup
    ON scraper_registry (job_type, variant, is_default)
    WHERE deprecated_at IS NULL;

-- Traffic routing index: quickly find scrapers participating in A/B tests
-- Used by: traffic splitting logic
-- Partial: only non-deprecated rows that actually receive traffic.
CREATE INDEX IF NOT EXISTS idx_scraper_registry_traffic_routing
    ON scraper_registry (job_type, traffic_pct)
    WHERE deprecated_at IS NULL AND traffic_pct > 0;

-- Version lookup index: find specific scraper version
-- Used by: admin tools, debugging, forced version routing
-- NOTE(review): this covers the same columns as the index created implicitly
-- by UNIQUE (job_type, version) above, so it looks redundant. It is kept here
-- because its name may be referenced elsewhere -- confirm, then drop it in a
-- follow-up migration.
CREATE INDEX IF NOT EXISTS idx_scraper_registry_version
    ON scraper_registry (job_type, version);

-- ============================================================================
-- COMMENTS
-- ============================================================================

COMMENT ON TABLE scraper_registry IS
    'Registry of available scraper implementations with A/B testing and routing support';

COMMENT ON COLUMN scraper_registry.job_type IS
    'Type of scraping job (e.g., google_reviews, yelp_reviews)';

COMMENT ON COLUMN scraper_registry.version IS
    'Semantic version of the scraper implementation';

COMMENT ON COLUMN scraper_registry.variant IS
    'Release channel: stable (production), beta (pre-release testing), canary (experimental)';

COMMENT ON COLUMN scraper_registry.traffic_pct IS
    'Percentage of traffic to route to this version (0-100). Sum should equal 100 per job_type/variant.';

COMMENT ON COLUMN scraper_registry.min_priority IS
    'Minimum job priority required to use this scraper. Allows reserving fast scrapers for urgent jobs.';

COMMENT ON COLUMN scraper_registry.config IS
    'JSONB configuration specific to this scraper version (rate limits, timeouts, feature flags)';

-- ============================================================================
-- SEED DATA: Register current Google Reviews scraper
-- ============================================================================
-- ON CONFLICT DO NOTHING keeps the seed re-runnable: if the (job_type, version)
-- row already exists it is left untouched rather than raising a unique violation.

INSERT INTO scraper_registry (
    job_type,
    version,
    variant,
    module_path,
    function_name,
    is_default,
    traffic_pct,
    min_priority,
    config
) VALUES (
    'google_reviews',
    '1.0.0',
    'stable',
    'scrapers.google_reviews.v1_0_0',
    'fast_scrape_reviews',
    true,  -- This is the default scraper
    100,   -- Receives 100% of traffic
    0,     -- Available for all priority levels
    '{ "max_concurrent_requests": 5, "request_timeout_seconds": 30, "retry_attempts": 3, "retry_delay_seconds": 2, "rate_limit_per_minute": 60 }'::jsonb
)
ON CONFLICT (job_type, version) DO NOTHING;