Phase 1: Database migrations for platform features
Migrations created:

- 001_add_job_platform_fields.sql: Add 15 new columns to jobs table
  - Requester tracking (client_id, source, purpose, metadata)
  - Batch support (batch_id, batch_index)
  - Execution tracking (job_type, scraper_version, variant, priority)
  - Webhook callbacks (url, status, sent_at, attempts)
  - Result summary (JSONB for cross-type dashboard)
  - 7 indexes for query performance
  - 5 CHECK constraints for data validation
- 002_create_batches_table.sql: Batch job grouping
  - Tracks batch progress (total/completed/failed)
  - Batch-level callbacks
  - Requester association
- 003_create_scraper_registry.sql: Scraper version management
  - Version routing (stable/beta/canary variants)
  - A/B traffic splitting (traffic_pct)
  - Priority-based routing
  - Seeds google_reviews v1.0.0 as stable default
- 004_create_api_keys.sql: API authentication
  - Secure key storage (SHA-256 hashes, not plaintext)
  - Scopes-based permissions
  - Rate limiting support
  - Key lifecycle (expiry, active status)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
137
migrations/versions/003_create_scraper_registry.sql
Normal file
137
migrations/versions/003_create_scraper_registry.sql
Normal file
@@ -0,0 +1,137 @@
|
||||
-- Migration: 003_create_scraper_registry
|
||||
-- Description: Creates the scraper_registry table for dynamic scraper routing
|
||||
-- Date: 2026-01-24
|
||||
-- Phase: Phase 1 - ReviewIQ Platform
|
||||
|
||||
-- ============================================================================
|
||||
-- SCRAPER REGISTRY TABLE
|
||||
-- ============================================================================
|
||||
-- This table enables dynamic scraper selection and A/B testing capabilities.
|
||||
--
|
||||
-- ROUTING LOGIC:
|
||||
-- 1. For a given job_type, the system first filters by variant (stable/beta/canary)
|
||||
-- 2. Among matching scrapers, traffic_pct determines probability of selection
|
||||
-- 3. is_default=true marks the fallback scraper when no traffic routing applies
|
||||
-- 4. min_priority allows reserving certain scrapers for high-priority jobs
|
||||
--
|
||||
-- A/B TESTING:
|
||||
-- - Set traffic_pct on multiple versions; the values should sum to 100 per job_type/variant (no constraint in this table definition enforces the sum, so the writer must ensure it)
|
||||
-- - Example: v1.0.0 at 90%, v1.1.0-beta at 10% for gradual rollout
|
||||
-- - Monitor performance metrics per version to make data-driven decisions
|
||||
--
|
||||
-- DEPRECATION:
|
||||
-- - Set deprecated_at to soft-deprecate a scraper version
|
||||
-- - Deprecated scrapers are excluded from routing but kept for audit history
|
||||
-- ============================================================================
|
||||
|
||||
-- One row per scraper implementation, identified by (job_type, version).
-- The routing columns (is_default, traffic_pct, min_priority) are NOT NULL:
-- a CHECK constraint accepts NULL (the test evaluates to UNKNOWN, which
-- passes), so without NOT NULL an explicit NULL traffic_pct would bypass the
-- 0-100 range check, and a NULL is_default would be neither true nor false.
CREATE TABLE scraper_registry (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Scraper identification
    job_type VARCHAR(50) NOT NULL,        -- e.g., "google_reviews", "yelp_reviews"
    version  VARCHAR(50) NOT NULL,        -- semver format: "1.0.0", "1.1.0-beta"
    variant  VARCHAR(20) NOT NULL,        -- "stable", "beta", "canary"

    -- Implementation reference
    module_path   VARCHAR(255) NOT NULL,  -- Python module path: "scrapers.google_reviews.v1_0_0"
    function_name VARCHAR(100) NOT NULL,  -- Entry function: "fast_scrape_reviews"

    -- Routing configuration
    is_default   BOOLEAN NOT NULL DEFAULT false,  -- Fallback when no traffic routing matches
    traffic_pct  INTEGER NOT NULL DEFAULT 0,      -- 0-100: percentage of traffic for A/B testing
    min_priority INTEGER NOT NULL DEFAULT 0,      -- Only route jobs with priority >= this value

    -- Lifecycle timestamps
    -- NOTE(review): TIMESTAMP is stored without a time zone; presumably the
    -- writers use UTC -- confirm before relying on cross-zone comparisons.
    created_at    TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deprecated_at TIMESTAMP,              -- NULL if active; set to deprecate without deletion

    -- Version-specific configuration
    config JSONB,                         -- Flexible settings: rate limits, retry policies, etc.

    -- Constraints
    -- A version string may be registered only once per job type. The name is
    -- the one PostgreSQL would generate for an unnamed constraint, spelled out
    -- so that it is greppable and stable.
    CONSTRAINT scraper_registry_job_type_version_key UNIQUE (job_type, version),

    -- Ensure traffic_pct is within valid range
    CONSTRAINT valid_traffic_pct CHECK (traffic_pct >= 0 AND traffic_pct <= 100),

    -- Ensure variant is one of the allowed values
    CONSTRAINT valid_variant CHECK (variant IN ('stable', 'beta', 'canary'))
);
|
||||
|
||||
-- ============================================================================
|
||||
-- INDEXES
|
||||
-- ============================================================================
|
||||
|
||||
-- Candidate lookup for the routing logic: covers (job_type, variant,
-- is_default) for rows that have not been deprecated. The index is partial,
-- so rows with deprecated_at set are not stored in it.
CREATE INDEX idx_scraper_registry_job_type_lookup
    ON scraper_registry USING btree (
        job_type,
        variant,
        is_default
    )
    WHERE deprecated_at IS NULL;
|
||||
|
||||
-- A/B participation lookup for the traffic splitting logic: covers
-- (job_type, traffic_pct) for non-deprecated rows that receive a non-zero
-- share of traffic. Rows with traffic_pct = 0 are left out of the index.
CREATE INDEX idx_scraper_registry_traffic_routing
    ON scraper_registry USING btree (
        job_type,
        traffic_pct
    )
    WHERE deprecated_at IS NULL
      AND traffic_pct > 0;
|
||||
|
||||
-- Exact-version lookup: resolves a single (job_type, version) pair.
-- Used by: admin tools, debugging, forced version routing.
-- NOTE(review): the table's UNIQUE (job_type, version) constraint is already
-- backed by an index on the same columns, so this index looks redundant.
-- Confirm nothing references it by name before dropping it.
CREATE INDEX idx_scraper_registry_version
    ON scraper_registry USING btree (
        job_type,
        version
    );
|
||||
|
||||
-- ============================================================================
|
||||
-- COMMENTS
|
||||
-- ============================================================================
|
||||
|
||||
-- Catalog documentation for scraper_registry. Every column carries a comment
-- so that schema browsers show the same descriptions as this migration.

COMMENT ON TABLE scraper_registry IS
    'Registry of available scraper implementations with A/B testing and routing support';

COMMENT ON COLUMN scraper_registry.id IS
    'Surrogate primary key (UUID, defaults to gen_random_uuid())';

-- Scraper identification
COMMENT ON COLUMN scraper_registry.job_type IS
    'Type of scraping job (e.g., google_reviews, yelp_reviews)';

COMMENT ON COLUMN scraper_registry.version IS
    'Semantic version of the scraper implementation';

COMMENT ON COLUMN scraper_registry.variant IS
    'Release channel: stable (production), beta (pre-release testing), canary (experimental)';

-- Implementation reference
COMMENT ON COLUMN scraper_registry.module_path IS
    'Python module path of the implementation (e.g., scrapers.google_reviews.v1_0_0)';

COMMENT ON COLUMN scraper_registry.function_name IS
    'Entry function within module_path (e.g., fast_scrape_reviews)';

-- Routing configuration
COMMENT ON COLUMN scraper_registry.is_default IS
    'Fallback scraper used when no traffic routing matches';

COMMENT ON COLUMN scraper_registry.traffic_pct IS
    'Percentage of traffic to route to this version (0-100). Sum should equal 100 per job_type/variant.';

COMMENT ON COLUMN scraper_registry.min_priority IS
    'Minimum job priority required to use this scraper. Allows reserving fast scrapers for urgent jobs.';

-- Lifecycle timestamps
COMMENT ON COLUMN scraper_registry.created_at IS
    'When this registry row was created (defaults to CURRENT_TIMESTAMP)';

COMMENT ON COLUMN scraper_registry.deprecated_at IS
    'NULL while active; set to soft-deprecate the version without deleting it';

-- Version-specific configuration
COMMENT ON COLUMN scraper_registry.config IS
    'JSONB configuration specific to this scraper version (rate limits, timeouts, feature flags)';
|
||||
|
||||
-- ============================================================================
|
||||
-- SEED DATA: Register current Google Reviews scraper
|
||||
-- ============================================================================
|
||||
|
||||
-- Seed row: registers google_reviews 1.0.0 on the stable channel as the
-- default scraper, taking all traffic and open to every priority level.
INSERT INTO scraper_registry
    (job_type, version, variant,
     module_path, function_name,
     is_default, traffic_pct, min_priority,
     config)
VALUES
    ('google_reviews', '1.0.0', 'stable',
     'scrapers.google_reviews.v1_0_0', 'fast_scrape_reviews',
     TRUE,  -- is_default: this is the default scraper
     100,   -- traffic_pct: receives 100% of traffic
     0,     -- min_priority: available for all priority levels
     CAST('{
        "max_concurrent_requests": 5,
        "request_timeout_seconds": 30,
        "retry_attempts": 3,
        "retry_delay_seconds": 2,
        "rate_limit_per_minute": 60
     }' AS jsonb));
|
||||
Reference in New Issue
Block a user