Phase 1: Database migrations for platform features
Migrations created: - 001_add_job_platform_fields.sql: Add 15 new columns to jobs table - Requester tracking (client_id, source, purpose, metadata) - Batch support (batch_id, batch_index) - Execution tracking (job_type, scraper_version, variant, priority) - Webhook callbacks (url, status, sent_at, attempts) - Result summary (JSONB for cross-type dashboard) - 7 indexes for query performance - 5 CHECK constraints for data validation - 002_create_batches_table.sql: Batch job grouping - Tracks batch progress (total/completed/failed) - Batch-level callbacks - Requester association - 003_create_scraper_registry.sql: Scraper version management - Version routing (stable/beta/canary variants) - A/B traffic splitting (traffic_pct) - Priority-based routing - Seeds google_reviews v1.0.0 as stable default - 004_create_api_keys.sql: API authentication - Secure key storage (SHA-256 hashes, not plaintext) - Scopes-based permissions - Rate limiting support - Key lifecycle (expiry, active status) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -103,7 +103,7 @@ reviewiq/ # Will rename from google-reviews-scraper-pro
|
|||||||
|
|
||||||
| Phase | Description | Status |
|
| Phase | Description | Status |
|
||||||
|-------|-------------|--------|
|
|-------|-------------|--------|
|
||||||
| 0 | Project restructure (move files to new locations) | Not started |
|
| 0 | Project restructure (move files to new locations) | ✅ COMPLETE |
|
||||||
| 1 | Database migrations (new fields + tables) | Not started |
|
| 1 | Database migrations (new fields + tables) | Not started |
|
||||||
| 2 | Requester & batch support | Not started |
|
| 2 | Requester & batch support | Not started |
|
||||||
| 3 | Webhooks | Not started |
|
| 3 | Webhooks | Not started |
|
||||||
|
|||||||
243
migrations/versions/001_add_job_platform_fields.sql
Normal file
243
migrations/versions/001_add_job_platform_fields.sql
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
-- =============================================================================
|
||||||
|
-- Migration: 001_add_job_platform_fields.sql
|
||||||
|
-- ReviewIQ Platform - Phase 1
|
||||||
|
-- =============================================================================
|
||||||
|
--
|
||||||
|
-- Adds multi-platform support fields to the jobs table for ReviewIQ integration.
|
||||||
|
-- Enables tracking of job origin, batch processing, execution variants, and
|
||||||
|
-- webhook callbacks for cross-platform orchestration.
|
||||||
|
--
|
||||||
|
-- Prerequisite: jobs table must already exist (created by core/database.py)
|
||||||
|
--
|
||||||
|
-- Date: 2026-01-24
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 1: REQUESTER FIELDS
|
||||||
|
-- Track which client/platform submitted the job and why
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Requester tracking columns, added together in one ALTER TABLE.
--   requester_client_id : client identifier from the requesting platform
--                         (e.g., "veritas_client_123"); intended for per-client
--                         analytics, rate limiting, and billing
--   requester_source    : platform that submitted the job (e.g., "veritasreview.com")
--   scrape_purpose      : "client_report" | "prospect_screening" | "market_research"
--   requester_metadata  : free-form JSONB passed through from the requester so
--                         platforms can attach custom data without schema changes
-- All four are nullable and each ADD COLUMN is guarded by IF NOT EXISTS.
ALTER TABLE jobs
    ADD COLUMN IF NOT EXISTS requester_client_id VARCHAR(255),
    ADD COLUMN IF NOT EXISTS requester_source    VARCHAR(100),
    ADD COLUMN IF NOT EXISTS scrape_purpose      VARCHAR(50),
    ADD COLUMN IF NOT EXISTS requester_metadata  JSONB;

COMMENT ON COLUMN jobs.requester_client_id IS 'Client identifier from requesting platform (e.g., "veritas_client_123")';
COMMENT ON COLUMN jobs.requester_source IS 'Source platform that submitted the job (e.g., "veritasreview.com")';
COMMENT ON COLUMN jobs.scrape_purpose IS 'Purpose of scrape: "client_report", "prospect_screening", "market_research"';
COMMENT ON COLUMN jobs.requester_metadata IS 'Flexible JSONB for requester-specific metadata (pass-through data)';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 2: BATCH FIELDS
|
||||||
|
-- Support for grouped job submissions (e.g., "scrape these 50 locations")
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Batch membership columns (both NULL for a standalone job).
--   batch_id    : UUID of the owning batch record; added here as a plain UUID
--                 column with no FOREIGN KEY declared
--   batch_index : 1-indexed position of the job inside its batch (1, 2, 3...),
--                 for ordered processing and progress tracking
ALTER TABLE jobs
    ADD COLUMN IF NOT EXISTS batch_id    UUID,
    ADD COLUMN IF NOT EXISTS batch_index INTEGER;

COMMENT ON COLUMN jobs.batch_id IS 'UUID linking to batches table (NULL for standalone jobs)';
COMMENT ON COLUMN jobs.batch_index IS 'Position in batch (1-indexed), NULL for standalone jobs';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 3: EXECUTION FIELDS
|
||||||
|
-- Control how the job is processed (type, version, priority)
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Execution-control columns.
--   job_type        : which scraper family handles the job; defaults to
--                     'google_reviews' so existing rows and writers keep working
--   scraper_version : version that processed the job (e.g., "1.0.0", "2.1.3"),
--                     for debugging, regression analysis, and A/B testing
--   scraper_variant : rollout channel: "stable" | "beta" | "canary"
--   priority        : queue ordering, 0=normal (default), 1=high, 2=urgent;
--                     higher values are meant to be processed first
-- NOTE(review): priority and job_type are nullable, so an explicit NULL bypasses
-- the default — confirm writers never insert NULL for them.
ALTER TABLE jobs
    ADD COLUMN IF NOT EXISTS job_type        VARCHAR(50) DEFAULT 'google_reviews',
    ADD COLUMN IF NOT EXISTS scraper_version VARCHAR(50),
    ADD COLUMN IF NOT EXISTS scraper_variant VARCHAR(20),
    ADD COLUMN IF NOT EXISTS priority        INTEGER DEFAULT 0;

COMMENT ON COLUMN jobs.job_type IS 'Job type for multi-scraper support (default: "google_reviews")';
COMMENT ON COLUMN jobs.scraper_version IS 'Scraper version that processed this job (e.g., "1.0.0")';
COMMENT ON COLUMN jobs.scraper_variant IS 'Deployment variant: "stable", "beta", or "canary"';
COMMENT ON COLUMN jobs.priority IS 'Queue priority: 0=normal (default), 1=high, 2=urgent';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 4: CALLBACK FIELDS
|
||||||
|
-- Webhook notification management (enhanced from existing webhook_url)
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Webhook callback bookkeeping.
--   callback_url      : where to POST the job-completion notification; kept
--                       separate from the existing webhook_url column
--   callback_status   : delivery state: "pending" | "sent" | "failed"
--   callback_sent_at  : when the callback was successfully delivered
--   callback_attempts : delivery attempts so far (starts at 0), for retry logic
ALTER TABLE jobs
    ADD COLUMN IF NOT EXISTS callback_url      TEXT,
    ADD COLUMN IF NOT EXISTS callback_status   VARCHAR(20),
    ADD COLUMN IF NOT EXISTS callback_sent_at  TIMESTAMP,
    ADD COLUMN IF NOT EXISTS callback_attempts INTEGER DEFAULT 0;

COMMENT ON COLUMN jobs.callback_url IS 'Webhook URL for job completion callbacks';
COMMENT ON COLUMN jobs.callback_status IS 'Callback delivery status: "pending", "sent", "failed"';
COMMENT ON COLUMN jobs.callback_sent_at IS 'Timestamp when callback was successfully delivered';
COMMENT ON COLUMN jobs.callback_attempts IS 'Number of callback delivery attempts (for retry tracking)';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 5: RESULT SUMMARY
|
||||||
|
-- Normalized summary for cross-platform dashboards
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Pre-computed result metrics, so dashboards can query a job without loading
-- the full reviews_data payload. Example document:
--   {"total_reviews": 150, "avg_rating": 4.2,
--    "sentiment": {"positive": 80, "negative": 20}}
ALTER TABLE jobs
    ADD COLUMN IF NOT EXISTS result_summary JSONB;

COMMENT ON COLUMN jobs.result_summary IS 'JSONB summary for dashboards: review counts, ratings, sentiment breakdown';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 6: INDEXES
|
||||||
|
-- Optimized for common query patterns
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Per-client job history and analytics. Partial: rows with no client id are
-- left out of the index.
CREATE INDEX IF NOT EXISTS idx_jobs_requester_client_id ON jobs USING btree (requester_client_id)
    WHERE requester_client_id IS NOT NULL;

-- Fetch every job belonging to one batch. Partial: standalone jobs
-- (batch_id IS NULL) are left out of the index.
CREATE INDEX IF NOT EXISTS idx_jobs_batch_id ON jobs USING btree (batch_id)
    WHERE batch_id IS NOT NULL;

-- Filter jobs by scraper type once more than one scraper exists.
CREATE INDEX IF NOT EXISTS idx_jobs_job_type ON jobs USING btree (job_type);
|
||||||
|
|
||||||
|
-- Queue index for "get next job": pending jobs ordered by highest priority
-- first, then oldest first.
-- The WHERE clause already restricts the index to status = 'pending', so status
-- is constant for every indexed row; it is therefore not repeated as a key
-- column (it would only widen each index entry without narrowing any lookup).
-- NOTE: because of IF NOT EXISTS, a database that already has an index with
-- this name keeps its existing definition.
CREATE INDEX IF NOT EXISTS idx_jobs_priority_status
    ON jobs (priority DESC, created_at ASC)
    WHERE status = 'pending';
|
||||||
|
|
||||||
|
-- Analytics grouped by requesting platform. Partial: rows with no source are
-- left out of the index.
CREATE INDEX IF NOT EXISTS idx_jobs_requester_source ON jobs USING btree (requester_source)
    WHERE requester_source IS NOT NULL;

-- Callback retry processing: only rows whose callback is still pending or has
-- failed are indexed.
CREATE INDEX IF NOT EXISTS idx_jobs_callback_pending ON jobs USING btree (callback_status, callback_attempts)
    WHERE callback_status IN ('pending', 'failed');

-- Scraper version / variant analytics. Partial: jobs with no recorded
-- scraper_version are left out of the index.
CREATE INDEX IF NOT EXISTS idx_jobs_scraper_version ON jobs USING btree (scraper_version, scraper_variant)
    WHERE scraper_version IS NOT NULL;
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 7: CONSTRAINTS
|
||||||
|
-- Data integrity for new fields
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Each CHECK below is replaced in a single ALTER TABLE: the old definition (if
-- any) is dropped and the current one added, so this section can be re-run.
-- Every check accepts NULL, because all of these columns are optional.

-- scrape_purpose: one of the three known purposes.
ALTER TABLE jobs
    DROP CONSTRAINT IF EXISTS valid_scrape_purpose,
    ADD CONSTRAINT valid_scrape_purpose CHECK (
        scrape_purpose IS NULL
        OR scrape_purpose IN ('client_report', 'prospect_screening', 'market_research')
    );

-- scraper_variant: one of the three rollout channels.
ALTER TABLE jobs
    DROP CONSTRAINT IF EXISTS valid_scraper_variant,
    ADD CONSTRAINT valid_scraper_variant CHECK (
        scraper_variant IS NULL
        OR scraper_variant IN ('stable', 'beta', 'canary')
    );

-- callback_status: one of the three delivery states.
ALTER TABLE jobs
    DROP CONSTRAINT IF EXISTS valid_callback_status,
    ADD CONSTRAINT valid_callback_status CHECK (
        callback_status IS NULL
        OR callback_status IN ('pending', 'sent', 'failed')
    );

-- priority: 0 (normal) through 2 (urgent). A NULL priority also passes, since
-- a CHECK only rejects rows where the expression is false.
ALTER TABLE jobs
    DROP CONSTRAINT IF EXISTS valid_priority,
    ADD CONSTRAINT valid_priority CHECK (priority BETWEEN 0 AND 2);

-- batch_index: strictly positive (1-indexed) whenever it is set.
ALTER TABLE jobs
    DROP CONSTRAINT IF EXISTS valid_batch_index,
    ADD CONSTRAINT valid_batch_index CHECK (
        batch_index IS NULL
        OR batch_index > 0
    );
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- END OF MIGRATION
|
||||||
|
-- =============================================================================
|
||||||
|
--
|
||||||
|
-- Rollback commands (if needed):
|
||||||
|
--
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS requester_client_id;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS requester_source;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS scrape_purpose;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS requester_metadata;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS batch_id;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS batch_index;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS job_type;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS scraper_version;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS scraper_variant;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS priority;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS callback_url;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS callback_status;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS callback_sent_at;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS callback_attempts;
|
||||||
|
-- ALTER TABLE jobs DROP COLUMN IF EXISTS result_summary;
|
||||||
|
--
|
||||||
|
-- DROP INDEX IF EXISTS idx_jobs_requester_client_id;
|
||||||
|
-- DROP INDEX IF EXISTS idx_jobs_batch_id;
|
||||||
|
-- DROP INDEX IF EXISTS idx_jobs_job_type;
|
||||||
|
-- DROP INDEX IF EXISTS idx_jobs_priority_status;
|
||||||
|
-- DROP INDEX IF EXISTS idx_jobs_requester_source;
|
||||||
|
-- DROP INDEX IF EXISTS idx_jobs_callback_pending;
|
||||||
|
-- DROP INDEX IF EXISTS idx_jobs_scraper_version;
|
||||||
|
--
|
||||||
|
-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_scrape_purpose;
|
||||||
|
-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_scraper_variant;
|
||||||
|
-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_callback_status;
|
||||||
|
-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_priority;
|
||||||
|
-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_batch_index;
|
||||||
|
--
|
||||||
|
-- =============================================================================
|
||||||
94
migrations/versions/002_create_batches_table.sql
Normal file
94
migrations/versions/002_create_batches_table.sql
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
-- Migration: 002_create_batches_table.sql
|
||||||
|
-- Description: Creates the batches table for grouping multiple scrape jobs together
|
||||||
|
-- Author: ReviewIQ Platform
|
||||||
|
-- Date: 2026-01-24
|
||||||
|
--
|
||||||
|
-- The batches table allows clients to submit multiple places/jobs in a single request,
|
||||||
|
-- track aggregate progress, and receive a single callback when all jobs complete.
|
||||||
|
-- This is useful for bulk operations like "screen all 50 prospects" or "refresh all locations".
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- CREATE TABLE: batches
|
||||||
|
-- =============================================================================
|
||||||
|
-- A batch represents a collection of scrape jobs submitted together.
|
||||||
|
-- It tracks overall progress and handles consolidated callbacks.
|
||||||
|
--
|
||||||
|
-- Foreign Key Reference:
|
||||||
|
--   jobs.batch_id -> batches.id (logical link only: migration 001 adds jobs.batch_id as a plain UUID column and declares no FOREIGN KEY constraint)
|
||||||
|
-- When a batch is created, individual jobs reference it via batch_id.
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- One row per batch of scrape jobs submitted together; holds aggregate progress
-- counters and the batch-level callback configuration.
-- IF NOT EXISTS makes the statement safe to re-run, in line with the
-- ADD COLUMN IF NOT EXISTS / CREATE INDEX IF NOT EXISTS style of migration 001.
-- NOTE(review): gen_random_uuid() is built in from PostgreSQL 13 onward; older
-- servers need the pgcrypto extension — confirm the target server version.
CREATE TABLE IF NOT EXISTS batches (
    -- Primary key
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Requester identification: which client/system submitted the batch and why
    requester_client_id VARCHAR(255),               -- Client identifier (e.g., "acme-corp", "internal-audit")
    requester_source    VARCHAR(100),               -- Source system (e.g., "salesforce", "hubspot", "api")
    -- NOTE(review): jobs.scrape_purpose is CHECK-limited to client_report /
    -- prospect_screening / market_research; no such constraint exists here.
    -- Confirm whether batches should share that vocabulary.
    scrape_purpose      VARCHAR(50),                -- Purpose code (e.g., "screening", "monitoring", "audit")

    -- Batch metadata and progress counters
    name           VARCHAR(255),                    -- Human-readable name (e.g., "Q1 Prospect Screening")
    total_jobs     INTEGER NOT NULL DEFAULT 0,      -- Total number of jobs in this batch
    completed_jobs INTEGER DEFAULT 0,               -- Count of successfully completed jobs
    failed_jobs    INTEGER DEFAULT 0,               -- Count of failed jobs
    status         VARCHAR(20) DEFAULT 'pending',   -- Batch status: pending, running, completed

    -- Callback configuration: optional webhook notified when all jobs complete
    callback_url     TEXT,                          -- Webhook URL to call on batch completion
    -- NOTE(review): jobs.callback_status uses "sent" where this column documents
    -- "success" — confirm which value batch writers actually store.
    callback_status  VARCHAR(20),                   -- Callback result: pending, success, failed
    callback_sent_at TIMESTAMP,                     -- When the callback was sent

    -- Timestamps
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,  -- When batch was created
    completed_at TIMESTAMP,                            -- When batch finished (all jobs done)

    -- Flexible metadata storage for the client's own tracking needs
    metadata JSONB                                  -- Custom client data (e.g., {"campaign_id": "123"})
);
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- INDEXES
|
||||||
|
-- =============================================================================
|
||||||
|
-- These indexes optimize common query patterns:
|
||||||
|
|
||||||
|
-- All indexes use IF NOT EXISTS so the migration can be re-run without error,
-- in line with the index statements in migration 001.

-- Look up batches by client ("show me all my batches").
-- NOTE(review): idx_batches_client_status_created below also leads with
-- requester_client_id and can serve the same lookups; consider whether this
-- single-column index is still needed.
CREATE INDEX IF NOT EXISTS idx_batches_requester_client_id
    ON batches (requester_client_id);

-- Filter by status, for dashboards showing pending/running/completed batches.
CREATE INDEX IF NOT EXISTS idx_batches_status
    ON batches (status);

-- Time-based queries: "recent batches", cleanup jobs, and analytics.
CREATE INDEX IF NOT EXISTS idx_batches_created_at
    ON batches (created_at);

-- Composite index for the common dashboard query
-- "pending batches for client X ordered by creation time, newest first".
CREATE INDEX IF NOT EXISTS idx_batches_client_status_created
    ON batches (requester_client_id, status, created_at DESC);
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- COMMENTS
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Catalog descriptions for the batches table and each of its columns.
COMMENT ON TABLE batches IS
    'Groups multiple scrape jobs for batch processing with aggregate tracking and callbacks';

-- Identity and requester
COMMENT ON COLUMN batches.id IS
    'Unique identifier for the batch (UUID)';
COMMENT ON COLUMN batches.requester_client_id IS
    'Identifier of the client who submitted this batch';
COMMENT ON COLUMN batches.requester_source IS
    'Source system that originated the request (e.g., salesforce, api)';
COMMENT ON COLUMN batches.scrape_purpose IS
    'Purpose of the scrape (screening, monitoring, audit, etc.)';

-- Batch metadata and progress
COMMENT ON COLUMN batches.name IS
    'Human-readable batch name for display purposes';
COMMENT ON COLUMN batches.total_jobs IS
    'Total number of jobs in this batch';
COMMENT ON COLUMN batches.completed_jobs IS
    'Number of jobs that completed successfully';
COMMENT ON COLUMN batches.failed_jobs IS
    'Number of jobs that failed';
COMMENT ON COLUMN batches.status IS
    'Current batch status: pending, running, or completed';

-- Callback
COMMENT ON COLUMN batches.callback_url IS
    'Webhook URL to notify when batch completes';
COMMENT ON COLUMN batches.callback_status IS
    'Result of callback attempt: pending, success, or failed';
COMMENT ON COLUMN batches.callback_sent_at IS
    'Timestamp when callback was sent';

-- Timestamps and metadata
COMMENT ON COLUMN batches.created_at IS
    'When the batch was created';
COMMENT ON COLUMN batches.completed_at IS
    'When the batch finished processing';
COMMENT ON COLUMN batches.metadata IS
    'Arbitrary JSON metadata for client-specific needs';
|
||||||
137
migrations/versions/003_create_scraper_registry.sql
Normal file
137
migrations/versions/003_create_scraper_registry.sql
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
-- Migration: 003_create_scraper_registry
|
||||||
|
-- Description: Creates the scraper_registry table for dynamic scraper routing
|
||||||
|
-- Date: 2026-01-24
|
||||||
|
-- Phase: Phase 1 - ReviewIQ Platform
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- SCRAPER REGISTRY TABLE
|
||||||
|
-- ============================================================================
|
||||||
|
-- This table enables dynamic scraper selection and A/B testing capabilities.
|
||||||
|
--
|
||||||
|
-- ROUTING LOGIC:
|
||||||
|
-- 1. For a given job_type, the system first filters by variant (stable/beta/canary)
|
||||||
|
-- 2. Among matching scrapers, traffic_pct determines probability of selection
|
||||||
|
-- 3. is_default=true marks the fallback scraper when no traffic routing applies
|
||||||
|
-- 4. min_priority allows reserving certain scrapers for high-priority jobs
|
||||||
|
--
|
||||||
|
-- A/B TESTING:
|
||||||
|
-- - Set traffic_pct on multiple versions (must sum to 100 for a job_type/variant)
|
||||||
|
-- - Example: v1.0.0 at 90%, v1.1.0-beta at 10% for gradual rollout
|
||||||
|
-- - Monitor performance metrics per version to make data-driven decisions
|
||||||
|
--
|
||||||
|
-- DEPRECATION:
|
||||||
|
-- - Set deprecated_at to soft-deprecate a scraper version
|
||||||
|
-- - Deprecated scrapers are excluded from routing but kept for audit history
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Registry of scraper implementations: one row per (job_type, version), with
-- the routing settings (variant, default flag, traffic share, minimum priority)
-- described in the header of this migration.
-- IF NOT EXISTS makes the statement safe to re-run, in line with the idempotent
-- style of migration 001.
-- NOTE(review): gen_random_uuid() is built in from PostgreSQL 13 onward; older
-- servers need the pgcrypto extension — confirm the target server version.
CREATE TABLE IF NOT EXISTS scraper_registry (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Scraper identification
    job_type VARCHAR(50) NOT NULL,          -- e.g., "google_reviews", "yelp_reviews"
    version  VARCHAR(50) NOT NULL,          -- semver format: "1.0.0", "1.1.0-beta"
    variant  VARCHAR(20) NOT NULL,          -- "stable", "beta", "canary"

    -- Implementation reference
    module_path   VARCHAR(255) NOT NULL,    -- Python module path: "scrapers.google_reviews.v1_0_0"
    function_name VARCHAR(100) NOT NULL,    -- Entry function: "fast_scrape_reviews"

    -- Routing configuration
    is_default   BOOLEAN DEFAULT false,     -- Fallback when no traffic routing matches
    traffic_pct  INTEGER DEFAULT 0,         -- 0-100: percentage of traffic for A/B testing
    min_priority INTEGER DEFAULT 0,         -- Only route jobs with priority >= this value

    -- Lifecycle timestamps
    created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    deprecated_at TIMESTAMP,                -- NULL if active; set to deprecate without deletion

    -- Version-specific configuration
    config JSONB,                           -- Flexible settings: rate limits, retry policies, etc.

    -- A given version may be registered only once per job type. This constraint
    -- is backed by an automatically created unique index on (job_type, version).
    UNIQUE (job_type, version),

    -- traffic_pct must be a percentage. The "sums to 100 per job_type/variant"
    -- rule from the header is NOT enforced here.
    CONSTRAINT valid_traffic_pct CHECK (traffic_pct >= 0 AND traffic_pct <= 100),

    -- variant must be one of the allowed release channels.
    CONSTRAINT valid_variant CHECK (variant IN ('stable', 'beta', 'canary'))
);
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- INDEXES
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Both indexes use IF NOT EXISTS so the migration can be re-run without error.

-- Primary lookup index: find active (non-deprecated) scrapers for a job type.
-- Used by: scraper routing logic to find candidate scrapers.
CREATE INDEX IF NOT EXISTS idx_scraper_registry_job_type_lookup
    ON scraper_registry (job_type, variant, is_default)
    WHERE deprecated_at IS NULL;

-- Traffic routing index: active scrapers that take part in A/B tests
-- (traffic_pct > 0). Used by: traffic splitting logic.
CREATE INDEX IF NOT EXISTS idx_scraper_registry_traffic_routing
    ON scraper_registry (job_type, traffic_pct)
    WHERE deprecated_at IS NULL AND traffic_pct > 0;

-- Version lookup (admin tools, debugging, forced version routing) needs no
-- index of its own: the UNIQUE (job_type, version) constraint on the table is
-- already backed by a unique index on exactly these columns. A separate
-- idx_scraper_registry_version would duplicate it and slow down writes.
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- COMMENTS
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Catalog descriptions for scraper_registry and its routing-related columns.
COMMENT ON TABLE scraper_registry IS 'Registry of available scraper implementations with A/B testing and routing support';

-- Identification
COMMENT ON COLUMN scraper_registry.job_type IS 'Type of scraping job (e.g., google_reviews, yelp_reviews)';
COMMENT ON COLUMN scraper_registry.version IS 'Semantic version of the scraper implementation';
COMMENT ON COLUMN scraper_registry.variant IS 'Release channel: stable (production), beta (pre-release testing), canary (experimental)';

-- Routing
COMMENT ON COLUMN scraper_registry.traffic_pct IS 'Percentage of traffic to route to this version (0-100). Sum should equal 100 per job_type/variant.';
COMMENT ON COLUMN scraper_registry.min_priority IS 'Minimum job priority required to use this scraper. Allows reserving fast scrapers for urgent jobs.';

-- Configuration
COMMENT ON COLUMN scraper_registry.config IS 'JSONB configuration specific to this scraper version (rate limits, timeouts, feature flags)';
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- SEED DATA: Register current Google Reviews scraper
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Seed row: registers google_reviews 1.0.0 as the stable default scraper that
-- receives 100% of traffic at every priority level.
-- ON CONFLICT ... DO NOTHING keeps the seed idempotent: if a row for
-- ('google_reviews', '1.0.0') already exists (UNIQUE (job_type, version)), the
-- insert is skipped and the existing row is left untouched, instead of the
-- migration failing with a unique violation on re-run.
INSERT INTO scraper_registry (
    job_type,
    version,
    variant,
    module_path,
    function_name,
    is_default,
    traffic_pct,
    min_priority,
    config
) VALUES (
    'google_reviews',
    '1.0.0',
    'stable',
    'scrapers.google_reviews.v1_0_0',
    'fast_scrape_reviews',
    true,   -- This is the default scraper
    100,    -- Receives 100% of traffic
    0,      -- Available for all priority levels
    '{
        "max_concurrent_requests": 5,
        "request_timeout_seconds": 30,
        "retry_attempts": 3,
        "retry_delay_seconds": 2,
        "rate_limit_per_minute": 60
    }'::jsonb
)
ON CONFLICT (job_type, version) DO NOTHING;
|
||||||
79
migrations/versions/004_create_api_keys.sql
Normal file
79
migrations/versions/004_create_api_keys.sql
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
-- Migration: 004_create_api_keys.sql
|
||||||
|
-- Description: Create api_keys table for API authentication
|
||||||
|
-- Created: 2026-01-24
|
||||||
|
--
|
||||||
|
-- Security Model:
|
||||||
|
-- ================
|
||||||
|
-- API keys are NEVER stored in plain text. When a new API key is generated:
|
||||||
|
-- 1. A random key is generated (e.g., "riq_abc123xyz...")
|
||||||
|
-- 2. The full key is returned to the user ONCE and never stored
|
||||||
|
-- 3. We store only the SHA-256 hash of the key (key_hash)
|
||||||
|
-- 4. We store the first 8 characters (key_prefix) for identification in logs/UI
|
||||||
|
--
|
||||||
|
-- Authentication Flow:
|
||||||
|
-- 1. Client sends API key in Authorization header
|
||||||
|
-- 2. Server hashes the received key with SHA-256
|
||||||
|
-- 3. Server looks up the hash in this table
|
||||||
|
-- 4. If found and is_active=true and not expired, request is authenticated
|
||||||
|
--
|
||||||
|
-- This approach ensures that even if the database is compromised,
|
||||||
|
-- attackers cannot recover the actual API keys.
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- CREATE TABLE
|
||||||
|
-- ============================================================================
|
||||||
|
-- API credentials for external clients. Only the SHA-256 hash of each key is
-- stored (never the key itself), plus a short prefix for identification.
-- IF NOT EXISTS makes the statement safe to re-run, in line with the idempotent
-- style of migration 001.
-- NOTE(review): gen_random_uuid() is built in from PostgreSQL 13 onward; older
-- servers need the pgcrypto extension — confirm the target server version.
CREATE TABLE IF NOT EXISTS api_keys (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Key identification (security: we store the hash, not the actual key).
    -- The UNIQUE constraint on key_hash is backed by an automatically created
    -- unique index, which serves the per-request hash lookup.
    key_hash   VARCHAR(64) NOT NULL UNIQUE,     -- SHA-256 hash of API key (64 hex chars)
    key_prefix VARCHAR(8) NOT NULL,             -- First 8 chars for identification in UI/logs
    name       VARCHAR(255) NOT NULL,           -- Human-readable name, e.g., "Veritas Production Key"

    -- Client association
    client_id VARCHAR(255) NOT NULL,            -- External client identifier, e.g., "veritas_client_123"

    -- Permissions (PostgreSQL array of allowed scopes)
    scopes TEXT[] DEFAULT '{}',                 -- e.g., {"jobs:read", "jobs:write", "admin"}

    -- Rate limiting
    rate_limit_rpm INTEGER DEFAULT 60,          -- Maximum requests per minute for this key

    -- Status
    is_active BOOLEAN DEFAULT true,             -- Set to false to revoke without deleting

    -- Timestamps
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    last_used_at TIMESTAMP,                     -- Updated on each successful authentication
    expires_at   TIMESTAMP,                     -- NULL means the key never expires

    -- Extensible metadata (for future use: IP allowlists, custom limits, etc.)
    metadata JSONB
);
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- INDEXES
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Both indexes use IF NOT EXISTS so the migration can be re-run without error.

-- Primary lookup (the critical path for every API request: hash the provided
-- key and look it up) needs no index of its own. key_hash is declared UNIQUE
-- on the table, and that constraint is already backed by a unique index on
-- key_hash. A separate idx_api_keys_key_hash would duplicate it and add write
-- overhead without speeding up any query.

-- Client lookup index: for admin operations like "list all keys for client X"
-- or "revoke all keys for client X".
CREATE INDEX IF NOT EXISTS idx_api_keys_client_id
    ON api_keys (client_id);

-- Active keys index: partial index containing only rows with is_active = true.
-- NOTE(review): the indexed column is constant within the index, so it mainly
-- helps queries that just need "all active keys" — confirm it is used.
CREATE INDEX IF NOT EXISTS idx_api_keys_active
    ON api_keys (is_active)
    WHERE is_active = true;
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- COMMENTS
|
||||||
|
-- ============================================================================
|
||||||
|
-- Catalog descriptions for api_keys and its security-relevant columns.
COMMENT ON TABLE api_keys IS
    'API keys for authenticating external clients. Keys are stored as SHA-256 hashes for security.';

COMMENT ON COLUMN api_keys.key_hash IS
    'SHA-256 hash of the API key. The actual key is never stored.';

COMMENT ON COLUMN api_keys.key_prefix IS
    'First 8 characters of the key for identification in UI and logs.';

COMMENT ON COLUMN api_keys.scopes IS
    'Array of permission scopes: jobs:read, jobs:write, admin, etc.';

COMMENT ON COLUMN api_keys.rate_limit_rpm IS
    'Rate limit in requests per minute. NULL uses system default.';

COMMENT ON COLUMN api_keys.metadata IS
    'Extensible JSON metadata: IP allowlists, usage notes, etc.';
|
||||||
Reference in New Issue
Block a user