From 2412996c54fe16cde7b03fbc49de228e43057300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Sat, 24 Jan 2026 15:24:28 +0000 Subject: [PATCH] Phase 1: Database migrations for platform features Migrations created: - 001_add_job_platform_fields.sql: Add 15 new columns to jobs table - Requester tracking (client_id, source, purpose, metadata) - Batch support (batch_id, batch_index) - Execution tracking (job_type, scraper_version, variant, priority) - Webhook callbacks (url, status, sent_at, attempts) - Result summary (JSONB for cross-type dashboard) - 7 indexes for query performance - 5 CHECK constraints for data validation - 002_create_batches_table.sql: Batch job grouping - Tracks batch progress (total/completed/failed) - Batch-level callbacks - Requester association - 003_create_scraper_registry.sql: Scraper version management - Version routing (stable/beta/canary variants) - A/B traffic splitting (traffic_pct) - Priority-based routing - Seeds google_reviews v1.0.0 as stable default - 004_create_api_keys.sql: API authentication - Secure key storage (SHA-256 hashes, not plaintext) - Scopes-based permissions - Rate limiting support - Key lifecycle (expiry, active status) Co-Authored-By: Claude Opus 4.5 --- .artifacts/CONTEXT-KEEPER.md | 2 +- .../versions/001_add_job_platform_fields.sql | 243 ++++++++++++++++++ .../versions/002_create_batches_table.sql | 94 +++++++ .../versions/003_create_scraper_registry.sql | 137 ++++++++++ migrations/versions/004_create_api_keys.sql | 79 ++++++ 5 files changed, 554 insertions(+), 1 deletion(-) create mode 100644 migrations/versions/001_add_job_platform_fields.sql create mode 100644 migrations/versions/002_create_batches_table.sql create mode 100644 migrations/versions/003_create_scraper_registry.sql create mode 100644 migrations/versions/004_create_api_keys.sql diff --git a/.artifacts/CONTEXT-KEEPER.md b/.artifacts/CONTEXT-KEEPER.md index d9a568c..4caad8f 
100644 --- a/.artifacts/CONTEXT-KEEPER.md +++ b/.artifacts/CONTEXT-KEEPER.md @@ -103,7 +103,7 @@ reviewiq/ # Will rename from google-reviews-scraper-pro | Phase | Description | Status | |-------|-------------|--------| -| 0 | Project restructure (move files to new locations) | Not started | +| 0 | Project restructure (move files to new locations) | ✅ COMPLETE | | 1 | Database migrations (new fields + tables) | Not started | | 2 | Requester & batch support | Not started | | 3 | Webhooks | Not started | diff --git a/migrations/versions/001_add_job_platform_fields.sql b/migrations/versions/001_add_job_platform_fields.sql new file mode 100644 index 0000000..81bccbe --- /dev/null +++ b/migrations/versions/001_add_job_platform_fields.sql @@ -0,0 +1,243 @@ +-- ============================================================================= +-- Migration: 001_add_job_platform_fields.sql +-- ReviewIQ Platform - Phase 1 +-- ============================================================================= +-- +-- Adds multi-platform support fields to the jobs table for ReviewIQ integration. +-- Enables tracking of job origin, batch processing, execution variants, and +-- webhook callbacks for cross-platform orchestration. 
+--
+-- Prerequisite: jobs table must already exist (created by core/database.py)
+--
+-- Date: 2026-01-24
+-- =============================================================================
+
+-- =============================================================================
+-- SECTION 1: REQUESTER FIELDS
+-- Track which client/platform submitted the job and why
+-- =============================================================================
+
+-- The four requester columns are added by a single ALTER TABLE. Every clause
+-- carries its own IF NOT EXISTS, so a column that is already present is
+-- skipped and the rest are still added. All four are nullable with no default.
+ALTER TABLE jobs
+    -- Client identifier from the requesting platform (e.g., "veritas_client_123");
+    -- used for per-client analytics, rate limiting, and billing
+    ADD COLUMN IF NOT EXISTS requester_client_id VARCHAR(255),
+    -- Source platform that submitted the job (e.g., "veritasreview.com");
+    -- enables multi-tenant tracking and source-specific behavior
+    ADD COLUMN IF NOT EXISTS requester_source VARCHAR(100),
+    -- Why the scrape was requested:
+    -- "client_report" | "prospect_screening" | "market_research"
+    ADD COLUMN IF NOT EXISTS scrape_purpose VARCHAR(50),
+    -- Pass-through JSONB so platforms can attach custom data without a
+    -- schema change
+    ADD COLUMN IF NOT EXISTS requester_metadata JSONB;
+
+-- Catalog descriptions for the columns above
+COMMENT ON COLUMN jobs.requester_client_id IS 'Client identifier from requesting platform (e.g., "veritas_client_123")';
+COMMENT ON COLUMN jobs.requester_source IS 'Source platform that submitted the job (e.g., "veritasreview.com")';
+COMMENT ON COLUMN jobs.scrape_purpose IS 'Purpose of scrape: "client_report", "prospect_screening", "market_research"';
+COMMENT ON COLUMN jobs.requester_metadata IS 'Flexible JSONB for requester-specific metadata (pass-through data)';
+
+
+-- =============================================================================
+-- SECTION 2: BATCH FIELDS
+-- Support for grouped job submissions (e.g., "scrape these 50 locations")
+-- 
=============================================================================
+
+-- Links job to a batch record. The batches table is created by
+-- 002_create_batches_table.sql; batch_id is added here as a plain UUID column
+-- (no FOREIGN KEY is declared), so this migration does not need that table.
+-- NULL indicates a standalone job, not part of a batch
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS batch_id UUID;
+COMMENT ON COLUMN jobs.batch_id IS
+    'UUID linking to batches table (NULL for standalone jobs)';
+
+-- Position within the batch (1-indexed: 1, 2, 3...)
+-- Used for ordered processing and progress tracking
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS batch_index INTEGER;
+COMMENT ON COLUMN jobs.batch_index IS
+    'Position in batch (1-indexed), NULL for standalone jobs';
+
+
+-- =============================================================================
+-- SECTION 3: EXECUTION FIELDS
+-- Control how the job is processed (type, version, priority)
+-- =============================================================================
+
+-- Type of scraping job (extensible for future scrapers)
+-- Default "google_reviews" maintains backward compatibility
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS job_type VARCHAR(50) DEFAULT 'google_reviews';
+COMMENT ON COLUMN jobs.job_type IS
+    'Job type for multi-scraper support (default: "google_reviews")';
+
+-- Scraper version that processed the job (e.g., "1.0.0", "2.1.3")
+-- Essential for debugging, regression analysis, and A/B testing
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scraper_version VARCHAR(50);
+COMMENT ON COLUMN jobs.scraper_version IS
+    'Scraper version that processed this job (e.g., "1.0.0")';
+
+-- Deployment variant used for canary/staged rollouts
+-- Values: "stable" | "beta" | "canary"
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scraper_variant VARCHAR(20);
+COMMENT ON COLUMN jobs.scraper_variant IS
+    'Deployment variant: "stable", "beta", or "canary"';
+
+-- Job priority for queue ordering
+-- 0=normal (default), 1=high, 2=urgent
+-- Higher priority jobs are processed first
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS priority INTEGER DEFAULT 0;
+COMMENT ON COLUMN jobs.priority IS
+    'Queue priority: 0=normal (default), 1=high, 2=urgent';
+
+
+-- =============================================================================
+-- SECTION 4: CALLBACK FIELDS
+-- Webhook notification management (enhanced from existing webhook_url)
+-- =============================================================================
+
+-- Primary callback URL for job completion notifications
+-- Separate from existing webhook_url to allow different callback patterns
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS callback_url TEXT;
+COMMENT ON COLUMN jobs.callback_url IS
+    'Webhook URL for job completion callbacks';
+
+-- Current status of callback delivery
+-- Values: "pending" | "sent" | "failed"
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS callback_status VARCHAR(20);
+COMMENT ON COLUMN jobs.callback_status IS
+    'Callback delivery status: "pending", "sent", "failed"';
+
+-- Timestamp when callback was successfully sent
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS callback_sent_at TIMESTAMP;
+COMMENT ON COLUMN jobs.callback_sent_at IS
+    'Timestamp when callback was successfully delivered';
+
+-- Number of callback delivery attempts (for retry logic)
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS callback_attempts INTEGER DEFAULT 0;
+COMMENT ON COLUMN jobs.callback_attempts IS
+    'Number of callback delivery attempts (for retry tracking)';
+
+
+-- =============================================================================
+-- SECTION 5: RESULT SUMMARY
+-- Normalized summary for cross-platform dashboards
+-- =============================================================================
+
+-- JSONB summary of results for quick dashboard queries
+-- Contains pre-computed metrics without loading full reviews_data
+-- Example: {"total_reviews": 150, "avg_rating": 4.2, "sentiment": {"positive": 80, "negative": 20}}
+ALTER TABLE jobs ADD COLUMN IF NOT EXISTS result_summary JSONB;
+COMMENT ON COLUMN jobs.result_summary IS
+    'JSONB summary for dashboards: review counts, ratings, sentiment breakdown';
+
+
+-- =============================================================================
+-- SECTION 6: INDEXES
+-- Optimized for common query patterns. Indexes on sparse columns are partial
+-- (WHERE ... IS NOT NULL) so rows without a value are not indexed.
+-- =============================================================================
+
+-- Index for client-based queries (per-client job history, analytics)
+CREATE INDEX IF NOT EXISTS idx_jobs_requester_client_id
+    ON jobs(requester_client_id)
+    WHERE requester_client_id IS NOT NULL;
+
+-- Index for batch operations (get all jobs in a batch)
+CREATE INDEX IF NOT EXISTS idx_jobs_batch_id
+    ON jobs(batch_id)
+    WHERE batch_id IS NOT NULL;
+
+-- Index for job type filtering (when multiple scrapers exist)
+CREATE INDEX IF NOT EXISTS idx_jobs_job_type
+    ON jobs(job_type);
+
+-- Index for priority queue ordering (high priority jobs first).
+-- The index is partial on status = 'pending', so status has the same value in
+-- every indexed row and is deliberately NOT a key column; the keys are exactly
+-- the "get next job" sort order: highest priority first, then oldest first.
+-- NOTE(review): priority is nullable and PostgreSQL sorts NULLs first under
+-- DESC, so a row stored with an explicit NULL priority would order ahead of
+-- urgent jobs -- confirm writers never store NULL here.
+CREATE INDEX IF NOT EXISTS idx_jobs_priority_status
+    ON jobs(priority DESC, created_at ASC)
+    WHERE status = 'pending';
+
+-- Index for requester source analytics
+CREATE INDEX IF NOT EXISTS idx_jobs_requester_source
+    ON jobs(requester_source)
+    WHERE requester_source IS NOT NULL;
+
+-- Index for callback retry processing
+-- Only rows still awaiting delivery ('pending') or eligible for retry ('failed')
+-- are indexed; 'sent' rows drop out of the index.
+CREATE INDEX IF NOT EXISTS idx_jobs_callback_pending
+    ON jobs(callback_status, callback_attempts)
+    WHERE callback_status IN ('pending', 'failed');
+
+-- Composite index for scraper version analytics
+CREATE INDEX IF NOT EXISTS idx_jobs_scraper_version
+    ON jobs(scraper_version, scraper_variant)
+    WHERE scraper_version IS NOT NULL;
+
+
+-- =============================================================================
+-- SECTION 7: CONSTRAINTS
+-- Data integrity for new fields.
+-- Each constraint is dropped (IF EXISTS) before being added so the migration
+-- can be re-run. A CHECK passes when its expression is NULL, so every
+-- constraint below still allows the column to be NULL.
+-- =============================================================================
+
+-- Ensure valid scrape_purpose values
+ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_scrape_purpose;
+ALTER TABLE jobs ADD CONSTRAINT valid_scrape_purpose
+    CHECK (scrape_purpose IS NULL OR scrape_purpose IN ('client_report', 'prospect_screening', 'market_research'));
+
+-- Ensure valid scraper_variant values
+ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_scraper_variant;
+ALTER TABLE jobs ADD CONSTRAINT valid_scraper_variant
+    CHECK (scraper_variant IS NULL OR scraper_variant IN ('stable', 'beta', 'canary'));
+
+-- Ensure valid callback_status values
+ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_callback_status;
+ALTER TABLE jobs ADD CONSTRAINT valid_callback_status
+    CHECK (callback_status IS NULL OR callback_status IN ('pending', 'sent', 'failed'));
+
+-- Ensure valid priority range
+ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_priority;
+ALTER TABLE jobs ADD CONSTRAINT valid_priority
+    CHECK (priority >= 0 AND priority <= 2);
+
+-- Ensure batch_index is positive when set
+ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_batch_index;
+ALTER TABLE jobs ADD CONSTRAINT valid_batch_index
+    CHECK (batch_index IS NULL OR batch_index > 0);
+
+
+-- =============================================================================
+-- END OF MIGRATION
+-- =============================================================================
+--
+-- Rollback commands (if needed):
+--
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS requester_client_id;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS requester_source;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS scrape_purpose;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS requester_metadata;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS batch_id;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS batch_index;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS job_type;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS scraper_version;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS scraper_variant;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS priority;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS callback_url;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS callback_status;
+-- ALTER TABLE jobs DROP COLUMN IF EXISTS callback_sent_at;
+-- ALTER TABLE jobs 
DROP COLUMN IF EXISTS callback_attempts; +-- ALTER TABLE jobs DROP COLUMN IF EXISTS result_summary; +-- +-- DROP INDEX IF EXISTS idx_jobs_requester_client_id; +-- DROP INDEX IF EXISTS idx_jobs_batch_id; +-- DROP INDEX IF EXISTS idx_jobs_job_type; +-- DROP INDEX IF EXISTS idx_jobs_priority_status; +-- DROP INDEX IF EXISTS idx_jobs_requester_source; +-- DROP INDEX IF EXISTS idx_jobs_callback_pending; +-- DROP INDEX IF EXISTS idx_jobs_scraper_version; +-- +-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_scrape_purpose; +-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_scraper_variant; +-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_callback_status; +-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_priority; +-- ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_batch_index; +-- +-- ============================================================================= diff --git a/migrations/versions/002_create_batches_table.sql b/migrations/versions/002_create_batches_table.sql new file mode 100644 index 0000000..298d15b --- /dev/null +++ b/migrations/versions/002_create_batches_table.sql @@ -0,0 +1,94 @@ +-- Migration: 002_create_batches_table.sql +-- Description: Creates the batches table for grouping multiple scrape jobs together +-- Author: ReviewIQ Platform +-- Date: 2026-01-24 +-- +-- The batches table allows clients to submit multiple places/jobs in a single request, +-- track aggregate progress, and receive a single callback when all jobs complete. +-- This is useful for bulk operations like "screen all 50 prospects" or "refresh all locations". + +-- ============================================================================= +-- CREATE TABLE: batches +-- ============================================================================= +-- A batch represents a collection of scrape jobs submitted together. +-- It tracks overall progress and handles consolidated callbacks. 
+--
+-- Relationship to jobs:
+--   jobs.batch_id (added by 001_add_job_platform_fields.sql) holds batches.id.
+--   001 adds batch_id as a plain UUID column; no FOREIGN KEY constraint is
+--   declared there or in this file, so the database does not enforce the link.
+--   When a batch is created, individual jobs reference it via batch_id.
+--
+-- NOTE(review): 001 CHECK-limits jobs.scrape_purpose to 'client_report',
+--   'prospect_screening', 'market_research' and jobs.callback_status to
+--   'pending', 'sent', 'failed'. The example values documented on this table
+--   differ and no CHECK is declared here -- confirm which vocabulary batches
+--   are meant to use.
+-- =============================================================================
+
+-- IF NOT EXISTS keeps this file re-runnable, matching 001's idempotent style.
+-- gen_random_uuid() is built in from PostgreSQL 13; older servers need the
+-- pgcrypto extension for it.
+CREATE TABLE IF NOT EXISTS batches (
+    -- Primary key
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+
+    -- Requester identification
+    -- Tracks which client/system submitted this batch and why
+    requester_client_id VARCHAR(255),              -- Client identifier (e.g., "acme-corp", "internal-audit")
+    requester_source VARCHAR(100),                 -- Source system (e.g., "salesforce", "hubspot", "api")
+    scrape_purpose VARCHAR(50),                    -- Purpose code (e.g., "screening", "monitoring", "audit")
+
+    -- Batch metadata
+    -- The three counters are all NOT NULL DEFAULT 0 so progress arithmetic
+    -- (e.g., completed_jobs + failed_jobs vs. total_jobs) can never yield NULL.
+    name VARCHAR(255),                             -- Human-readable name (e.g., "Q1 Prospect Screening")
+    total_jobs INTEGER NOT NULL DEFAULT 0,         -- Total number of jobs in this batch
+    completed_jobs INTEGER NOT NULL DEFAULT 0,     -- Count of successfully completed jobs
+    failed_jobs INTEGER NOT NULL DEFAULT 0,        -- Count of failed jobs
+    status VARCHAR(20) DEFAULT 'pending',          -- Batch status: pending, running, completed
+
+    -- Callback configuration
+    -- When all jobs complete, optionally notify a webhook endpoint
+    callback_url TEXT,                             -- Webhook URL to call on batch completion
+    callback_status VARCHAR(20),                   -- Callback result: pending, success, failed
+    callback_sent_at TIMESTAMP,                    -- When the callback was sent
+
+    -- Timestamps
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, -- When batch was created
+    completed_at TIMESTAMP,                        -- When batch finished (all jobs done)
+
+    -- Flexible metadata storage
+    -- Allows clients to attach arbitrary data for their own tracking needs
+    metadata JSONB                                 -- Custom client data (e.g., {"campaign_id": "123"})
+);
+
+-- =============================================================================
+-- INDEXES
+-- =============================================================================
+-- These indexes optimize common query patterns:
+
+-- Index for looking up 
batches by client
+-- Used when clients query "show me all my batches"
+-- IF NOT EXISTS (here and on every index below) keeps the file re-runnable,
+-- matching the idempotent CREATE INDEX style used in 001.
+-- NOTE(review): idx_batches_client_status_created below also leads with
+-- requester_client_id, so it can serve client-only lookups too -- confirm this
+-- single-column index is still wanted.
+CREATE INDEX IF NOT EXISTS idx_batches_requester_client_id ON batches(requester_client_id);
+
+-- Index for filtering by status
+-- Used for dashboards showing pending/running/completed batches
+CREATE INDEX IF NOT EXISTS idx_batches_status ON batches(status);
+
+-- Index for time-based queries
+-- Used for "recent batches", cleanup jobs, and analytics
+CREATE INDEX IF NOT EXISTS idx_batches_created_at ON batches(created_at);
+
+-- Composite index for common dashboard query pattern
+-- Optimizes: "show me pending batches for client X ordered by creation time"
+CREATE INDEX IF NOT EXISTS idx_batches_client_status_created ON batches(requester_client_id, status, created_at DESC);
+
+-- =============================================================================
+-- COMMENTS
+-- Catalog descriptions; COMMENT ON replaces any existing text, so these are
+-- safe to re-run.
+-- =============================================================================
+
+COMMENT ON TABLE batches IS 'Groups multiple scrape jobs for batch processing with aggregate tracking and callbacks';
+
+COMMENT ON COLUMN batches.id IS 'Unique identifier for the batch (UUID)';
+COMMENT ON COLUMN batches.requester_client_id IS 'Identifier of the client who submitted this batch';
+COMMENT ON COLUMN batches.requester_source IS 'Source system that originated the request (e.g., salesforce, api)';
+COMMENT ON COLUMN batches.scrape_purpose IS 'Purpose of the scrape (screening, monitoring, audit, etc.)';
+COMMENT ON COLUMN batches.name IS 'Human-readable batch name for display purposes';
+COMMENT ON COLUMN batches.total_jobs IS 'Total number of jobs in this batch';
+COMMENT ON COLUMN batches.completed_jobs IS 'Number of jobs that completed successfully';
+COMMENT ON COLUMN batches.failed_jobs IS 'Number of jobs that failed';
+COMMENT ON COLUMN batches.status IS 'Current batch status: pending, running, or completed';
+COMMENT ON COLUMN batches.callback_url IS 'Webhook URL to notify when batch completes';
+COMMENT ON COLUMN batches.callback_status IS 'Result of callback attempt: pending, success, or failed';
+COMMENT ON COLUMN batches.callback_sent_at IS 'Timestamp when callback was sent';
+COMMENT ON COLUMN batches.created_at IS 'When the batch was created';
+COMMENT ON COLUMN batches.completed_at IS 'When the batch finished processing';
+COMMENT ON COLUMN batches.metadata IS 'Arbitrary JSON metadata for client-specific needs';
diff --git a/migrations/versions/003_create_scraper_registry.sql b/migrations/versions/003_create_scraper_registry.sql
new file mode 100644
index 0000000..aaf565a
--- /dev/null
+++ b/migrations/versions/003_create_scraper_registry.sql
@@ -0,0 +1,137 @@
+-- Migration: 003_create_scraper_registry
+-- Description: Creates the scraper_registry table for dynamic scraper routing
+-- Date: 2026-01-24
+-- Phase: Phase 1 - ReviewIQ Platform
+
+-- ============================================================================
+-- SCRAPER REGISTRY TABLE
+-- ============================================================================
+-- This table enables dynamic scraper selection and A/B testing capabilities.
+--
+-- ROUTING LOGIC:
+-- 1. For a given job_type, the system first filters by variant (stable/beta/canary)
+-- 2. Among matching scrapers, traffic_pct determines probability of selection
+-- 3. is_default=true marks the fallback scraper when no traffic routing applies
+-- 4. 
min_priority allows reserving certain scrapers for high-priority jobs
+--
+-- A/B TESTING:
+-- - Set traffic_pct on multiple versions (must sum to 100 for a job_type/variant)
+-- - Example: v1.0.0 at 90%, v1.1.0-beta at 10% for gradual rollout
+-- - Monitor performance metrics per version to make data-driven decisions
+--
+-- DEPRECATION:
+-- - Set deprecated_at to soft-deprecate a scraper version
+-- - Deprecated scrapers are excluded from routing but kept for audit history
+--
+-- NOTE: the "sum to 100" rule above is a convention for whoever writes rows;
+-- the only traffic_pct rule the table itself enforces is the 0-100 range CHECK.
+-- ============================================================================
+
+-- IF NOT EXISTS keeps this file re-runnable, matching 001's idempotent style.
+-- gen_random_uuid() is built in from PostgreSQL 13; older servers need the
+-- pgcrypto extension for it.
+CREATE TABLE IF NOT EXISTS scraper_registry (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+
+    -- Scraper identification
+    job_type VARCHAR(50) NOT NULL,           -- e.g., "google_reviews", "yelp_reviews"
+    version VARCHAR(50) NOT NULL,            -- semver format: "1.0.0", "1.1.0-beta"
+    variant VARCHAR(20) NOT NULL,            -- "stable", "beta", "canary"
+
+    -- Implementation reference
+    module_path VARCHAR(255) NOT NULL,       -- Python module path: "scrapers.google_reviews.v1_0_0"
+    function_name VARCHAR(100) NOT NULL,     -- Entry function: "fast_scrape_reviews"
+
+    -- Routing configuration
+    is_default BOOLEAN DEFAULT false,        -- Fallback when no traffic routing matches
+    traffic_pct INTEGER DEFAULT 0,           -- 0-100: percentage of traffic for A/B testing
+    min_priority INTEGER DEFAULT 0,          -- Only route jobs with priority >= this value
+
+    -- Lifecycle timestamps
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    deprecated_at TIMESTAMP,                 -- NULL if active; set to deprecate without deletion
+
+    -- Version-specific configuration
+    config JSONB,                            -- Flexible settings: rate limits, retry policies, etc.
+
+    -- Constraints
+    -- One row per (job_type, version). This UNIQUE constraint also creates the
+    -- index used for exact-version lookups and is the conflict target of the
+    -- seed INSERT at the end of this file.
+    UNIQUE(job_type, version),
+
+    -- Ensure traffic_pct is within valid range
+    CONSTRAINT valid_traffic_pct CHECK (traffic_pct >= 0 AND traffic_pct <= 100),
+
+    -- Ensure variant is one of the allowed values
+    CONSTRAINT valid_variant CHECK (variant IN ('stable', 'beta', 'canary'))
+);
+
+-- ============================================================================
+-- INDEXES
+-- ============================================================================
+
+-- Primary lookup index: find active scrapers for a job type
+-- Used by: scraper routing logic to find candidate scrapers
+-- Partial: deprecated rows are left out of the index.
+CREATE INDEX IF NOT EXISTS idx_scraper_registry_job_type_lookup
+    ON scraper_registry (job_type, variant, is_default)
+    WHERE deprecated_at IS NULL;
+
+-- Traffic routing index: quickly find scrapers participating in A/B tests
+-- Used by: traffic splitting logic
+-- Partial: only non-deprecated rows that actually receive traffic.
+CREATE INDEX IF NOT EXISTS idx_scraper_registry_traffic_routing
+    ON scraper_registry (job_type, traffic_pct)
+    WHERE deprecated_at IS NULL AND traffic_pct > 0;
+
+-- Version lookup (admin tools, debugging, forced version routing):
+-- no separate index is declared. UNIQUE(job_type, version) on the table already
+-- creates a unique index on exactly these columns; a second, identical index
+-- would only add write and storage cost.
+
+-- ============================================================================
+-- COMMENTS
+-- ============================================================================
+
+COMMENT ON TABLE scraper_registry IS
+    'Registry of available scraper implementations with A/B testing and routing support';
+
+COMMENT ON COLUMN scraper_registry.job_type IS
+    'Type of scraping job (e.g., google_reviews, yelp_reviews)';
+
+COMMENT ON COLUMN scraper_registry.version IS
+    'Semantic version of the scraper implementation';
+
+COMMENT ON COLUMN scraper_registry.variant IS
+    'Release channel: stable (production), beta (pre-release testing), canary (experimental)';
+
+COMMENT ON COLUMN scraper_registry.traffic_pct IS
+    'Percentage of traffic to route to this version (0-100). Sum should equal 100 per job_type/variant.';
+
+COMMENT ON COLUMN scraper_registry.min_priority IS
+    'Minimum job priority required to use this scraper. Allows reserving fast scrapers for urgent jobs.';
+
+COMMENT ON COLUMN scraper_registry.config IS
+    'JSONB configuration specific to this scraper version (rate limits, timeouts, feature flags)';
+
+-- ============================================================================
+-- SEED DATA: Register current Google Reviews scraper
+-- ON CONFLICT ... DO NOTHING makes the seed re-runnable: if the
+-- (google_reviews, 1.0.0) row already exists it is left untouched, including
+-- any later edits to its routing or config values.
+-- ============================================================================
+
+INSERT INTO scraper_registry (
+    job_type,
+    version,
+    variant,
+    module_path,
+    function_name,
+    is_default,
+    traffic_pct,
+    min_priority,
+    config
+) VALUES (
+    'google_reviews',
+    '1.0.0',
+    'stable',
+    'scrapers.google_reviews.v1_0_0',
+    'fast_scrape_reviews',
+    true,   -- This is the default scraper
+    100,    -- Receives 100% of traffic
+    0,      -- Available for all priority levels
+    '{
+        "max_concurrent_requests": 5,
+        "request_timeout_seconds": 30,
+        "retry_attempts": 3,
+        "retry_delay_seconds": 2,
+        "rate_limit_per_minute": 60
+    }'::jsonb
+)
+ON CONFLICT (job_type, version) DO NOTHING;
diff --git a/migrations/versions/004_create_api_keys.sql b/migrations/versions/004_create_api_keys.sql
new file mode 100644
index 0000000..9bb1dbb
--- /dev/null
+++ b/migrations/versions/004_create_api_keys.sql
@@ -0,0 +1,79 @@
+-- Migration: 004_create_api_keys.sql
+-- Description: Create api_keys table for API authentication
+-- Created: 2026-01-24
+--
+-- Security Model:
+-- ================
+-- API keys are NEVER stored in plain text. When a new API key is generated:
+-- 1. A random key is generated (e.g., "riq_abc123xyz...")
+-- 2. The full key is returned to the user ONCE and never stored
+-- 3. We store only the SHA-256 hash of the key (key_hash)
+-- 4. We store the first 8 characters (key_prefix) for identification in logs/UI
+--
+-- Authentication Flow:
+-- 1. Client sends API key in Authorization header
+-- 2. 
Server hashes the received key with SHA-256
+-- 3. Server looks up the hash in this table
+-- 4. If found and is_active=true and not expired, request is authenticated
+--
+-- This approach ensures that even if the database is compromised,
+-- attackers cannot recover the actual API keys.
+
+-- ============================================================================
+-- CREATE TABLE
+-- IF NOT EXISTS keeps this file re-runnable, matching 001's idempotent style.
+-- gen_random_uuid() is built in from PostgreSQL 13; older servers need the
+-- pgcrypto extension for it.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS api_keys (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+
+    -- Key identification (security: we store hash, not the actual key)
+    -- The UNIQUE constraint on key_hash also creates the index used by the
+    -- per-request authentication lookup.
+    key_hash VARCHAR(64) NOT NULL UNIQUE,    -- SHA-256 hash of API key (64 hex chars)
+    key_prefix VARCHAR(8) NOT NULL,          -- First 8 chars for identification in UI/logs
+    name VARCHAR(255) NOT NULL,              -- Human-readable name, e.g., "Veritas Production Key"
+
+    -- Client association
+    client_id VARCHAR(255) NOT NULL,         -- External client identifier, e.g., "veritas_client_123"
+
+    -- Permissions (PostgreSQL array of allowed scopes)
+    -- NOT NULL: "no permissions" is always the empty array, never NULL, so
+    -- membership tests against scopes never evaluate to NULL.
+    scopes TEXT[] NOT NULL DEFAULT '{}',     -- e.g., {"jobs:read", "jobs:write", "admin"}
+
+    -- Rate limiting
+    -- Left nullable on purpose: the column comment defines NULL as
+    -- "use the system default".
+    rate_limit_rpm INTEGER DEFAULT 60,       -- Maximum requests per minute for this key
+
+    -- Status
+    -- NOT NULL: a key is either active or revoked. A NULL here would match
+    -- neither is_active = true nor is_active = false, hiding the key from both
+    -- authentication and revocation queries.
+    is_active BOOLEAN NOT NULL DEFAULT true, -- Set to false to revoke without deleting
+
+    -- Timestamps
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    last_used_at TIMESTAMP,                  -- Updated on each successful authentication
+    expires_at TIMESTAMP,                    -- NULL means the key never expires
+
+    -- Extensible metadata (for future use: IP allowlists, custom limits, etc.)
+    metadata JSONB
+);
+
+-- ============================================================================
+-- INDEXES
+-- ============================================================================
+
+-- Primary lookup index: This is the critical path for every API request.
+-- When authenticating, we hash the provided key and look it up here.
+-- The UNIQUE constraint on key_hash already creates a unique index on that
+-- column, and that index serves this lookup. No second index is declared: a
+-- duplicate would add write and storage cost without speeding anything up.
+
+-- Client lookup index: For admin operations like "list all keys for client X"
+-- or "revoke all keys for client X"
+-- IF NOT EXISTS keeps the file re-runnable, matching 001's CREATE INDEX style.
+CREATE INDEX IF NOT EXISTS idx_api_keys_client_id ON api_keys (client_id);
+
+-- Active keys index: partial index containing only rows where is_active = true.
+-- It can only help queries that filter on is_active = true; revoked
+-- (is_active = false) keys are not in it.
+CREATE INDEX IF NOT EXISTS idx_api_keys_active ON api_keys (is_active) WHERE is_active = true;
+
+-- ============================================================================
+-- COMMENTS
+-- Catalog descriptions; COMMENT ON replaces any existing text, so these are
+-- safe to re-run.
+-- ============================================================================
+COMMENT ON TABLE api_keys IS 'API keys for authenticating external clients. Keys are stored as SHA-256 hashes for security.';
+COMMENT ON COLUMN api_keys.key_hash IS 'SHA-256 hash of the API key. The actual key is never stored.';
+COMMENT ON COLUMN api_keys.key_prefix IS 'First 8 characters of the key for identification in UI and logs.';
+COMMENT ON COLUMN api_keys.scopes IS 'Array of permission scopes: jobs:read, jobs:write, admin, etc.';
+COMMENT ON COLUMN api_keys.rate_limit_rpm IS 'Rate limit in requests per minute. NULL uses system default.';
+COMMENT ON COLUMN api_keys.metadata IS 'Extensible JSON metadata: IP allowlists, usage notes, etc.';