feat: Add decoupled pipeline schema with separate PostgreSQL namespace

- Create consolidated migration (005_create_pipeline_schema.sql) with 'pipeline' schema for all classification tables
- Update pipeline repositories to use schema prefix (pipeline.*)
- Add run_migrations() method to DatabaseManager
- Add CLI tool for running versioned migrations

Tables created in pipeline schema:
- reviews_raw, reviews_enriched (Stage 1)
- review_spans (Stage 2)
- issues, issue_spans, issue_events (Stage 3)
- fact_timeseries (Stage 4)
- urt_domains, urt_categories (taxonomy lookup)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:17:20 +00:00
parent 7d720f5378
commit 03ed7029e2
4 changed files with 710 additions and 23 deletions
--- a/core/database.py
+++ b/core/database.py
@@ -182,6 +182,66 @@ class DatabaseManager:

            log.info("Database schema initialized")

+    async def run_migrations(self, migrations_dir: str = "migrations/versions"):
+        """
+        Run versioned migrations from SQL files.
+
+        Args:
+            migrations_dir: Path to directory containing .sql migration files.
+                           Files are run in sorted order.
+
+        Returns:
+            Number of migrations applied.
+        """
+        from pathlib import Path
+
+        migrations_path = Path(migrations_dir)
+        if not migrations_path.exists():
+            log.warning(f"Migrations directory not found: {migrations_dir}")
+            return 0
+
+        async with self.pool.acquire() as conn:
+            # Create migrations tracking table
+            await conn.execute("""
+                CREATE TABLE IF NOT EXISTS _migrations (
+                    id SERIAL PRIMARY KEY,
+                    filename VARCHAR(255) UNIQUE NOT NULL,
+                    applied_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
+                )
+            """)
+
+            # Get already applied migrations
+            applied = await conn.fetch("SELECT filename FROM _migrations")
+            applied_set = {r["filename"] for r in applied}
+
+            # Find and run pending migrations
+            migration_files = sorted(migrations_path.glob("*.sql"))
+            migrations_run = 0
+
+            for migration_file in migration_files:
+                filename = migration_file.name
+                if filename in applied_set:
+                    continue
+
+                log.info(f"Running migration: {filename}")
+
+                async with conn.transaction():
+                    try:
+                        sql = migration_file.read_text()
+                        await conn.execute(sql)
+                        await conn.execute(
+                            "INSERT INTO _migrations (filename) VALUES ($1)",
+                            filename,
+                        )
+                        migrations_run += 1
+                        log.info(f"Migration {filename} applied successfully")
+                    except Exception as e:
+                        log.error(f"Migration {filename} failed: {e}")
+                        raise
+
+            log.info(f"Ran {migrations_run} migrations")
+            return migrations_run
+
    # ==================== Job Operations ====================

    async def create_job(
--- a/migrations/versions/005_create_pipeline_schema.sql
+++ b/migrations/versions/005_create_pipeline_schema.sql
@@ -0,0 +1,544 @@
+-- =============================================================================
+-- Migration: 005_create_pipeline_schema.sql
+-- ReviewIQ Pipeline - Decoupled Schema
+-- =============================================================================
+--
+-- Creates a separate 'pipeline' schema for all review classification tables.
+-- This keeps the pipeline tables decoupled from the main scraper schema while
+-- sharing the same database.
+--
+-- Tables created:
+--   pipeline.reviews_raw       - Immutable audit log of scraped reviews
+--   pipeline.reviews_enriched  - Normalized/classified reviews
+--   pipeline.review_spans      - Extracted semantic spans
+--   pipeline.issues            - Aggregated issues from negative spans
+--   pipeline.issue_spans       - Issue-to-span linking
+--   pipeline.issue_events      - Audit log for issue changes
+--   pipeline.fact_timeseries   - Pre-aggregated metrics for dashboards
+--   pipeline.urt_domains       - URT taxonomy domains
+--   pipeline.urt_categories    - URT taxonomy categories
+--
+-- Soft FK: pipeline.reviews_raw.job_id -> public.jobs.job_id (optional)
+--
+-- Date: 2026-01-24
+-- =============================================================================
+
+-- Create the pipeline schema
+CREATE SCHEMA IF NOT EXISTS pipeline;
+
+COMMENT ON SCHEMA pipeline IS 'ReviewIQ Pipeline - LLM-powered review classification and aggregation';
+
+
+-- =============================================================================
+-- SECTION 1: ENUM TYPES (in pipeline schema)
+-- =============================================================================
+
+-- Valence enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.valence_type AS ENUM ('V+', 'V-', 'V0', 'V±');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Intensity enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.intensity_type AS ENUM ('I1', 'I2', 'I3');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Specificity enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.specificity_type AS ENUM ('S1', 'S2', 'S3');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Actionability enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.actionability_type AS ENUM ('A1', 'A2', 'A3');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Temporal enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.temporal_type AS ENUM ('TC', 'TR', 'TH', 'TF');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Evidence enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.evidence_type AS ENUM ('ES', 'EI', 'EC');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Comparative enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.comparative_type AS ENUM ('CR-N', 'CR-B', 'CR-W', 'CR-S');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Issue state enum
+DO $$ BEGIN
+    CREATE TYPE pipeline.issue_state AS ENUM ('open', 'resolved', 'ignored', 'merged');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Subject type enum (for facts)
+DO $$ BEGIN
+    CREATE TYPE pipeline.subject_type AS ENUM ('overall', 'urt_code', 'domain', 'issue');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Bucket type enum (for facts)
+DO $$ BEGIN
+    CREATE TYPE pipeline.bucket_type AS ENUM ('day', 'week', 'month');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+
+-- =============================================================================
+-- SECTION 2: URT TAXONOMY LOOKUP TABLES
+-- =============================================================================
+
+-- URT Domain lookup table
+CREATE TABLE IF NOT EXISTS pipeline.urt_domains (
+    code CHAR(1) PRIMARY KEY,
+    name VARCHAR(50) NOT NULL,
+    description TEXT
+);
+
+INSERT INTO pipeline.urt_domains (code, name, description) VALUES
+    ('O', 'Offering', 'Product/service quality, features, variety'),
+    ('P', 'Price', 'Value, pricing, promotions, payment'),
+    ('J', 'Journey', 'Timing, process, convenience, accessibility'),
+    ('E', 'Environment', 'Physical space, ambiance, cleanliness, digital UX'),
+    ('A', 'Attitude', 'Staff behavior, helpfulness, professionalism'),
+    ('V', 'Voice', 'Brand, communication, marketing, transparency'),
+    ('R', 'Relationship', 'Loyalty, trust, consistency, personalization')
+ON CONFLICT (code) DO NOTHING;
+
+-- URT Tier-2 categories lookup table
+CREATE TABLE IF NOT EXISTS pipeline.urt_categories (
+    code VARCHAR(5) PRIMARY KEY,
+    domain_code CHAR(1) NOT NULL REFERENCES pipeline.urt_domains(code),
+    name VARCHAR(100) NOT NULL,
+    description TEXT
+);
+
+INSERT INTO pipeline.urt_categories (code, domain_code, name) VALUES
+    ('O1', 'O', 'Core Product/Service'),
+    ('O2', 'O', 'Product Features'),
+    ('O3', 'O', 'Variety & Selection'),
+    ('O4', 'O', 'Customization'),
+    ('P1', 'P', 'Value Perception'),
+    ('P2', 'P', 'Pricing Structure'),
+    ('P3', 'P', 'Promotions & Deals'),
+    ('P4', 'P', 'Payment Process'),
+    ('J1', 'J', 'Wait Times'),
+    ('J2', 'J', 'Booking & Reservations'),
+    ('J3', 'J', 'Navigation & Convenience'),
+    ('J4', 'J', 'Accessibility'),
+    ('E1', 'E', 'Physical Environment'),
+    ('E2', 'E', 'Ambiance & Atmosphere'),
+    ('E3', 'E', 'Cleanliness'),
+    ('E4', 'E', 'Digital Experience'),
+    ('A1', 'A', 'Friendliness'),
+    ('A2', 'A', 'Helpfulness'),
+    ('A3', 'A', 'Professionalism'),
+    ('A4', 'A', 'Knowledge & Expertise'),
+    ('V1', 'V', 'Brand Identity'),
+    ('V2', 'V', 'Communication'),
+    ('V3', 'V', 'Marketing'),
+    ('V4', 'V', 'Transparency'),
+    ('R1', 'R', 'Loyalty'),
+    ('R2', 'R', 'Trust'),
+    ('R3', 'R', 'Consistency'),
+    ('R4', 'R', 'Personalization')
+ON CONFLICT (code) DO NOTHING;
+
+COMMENT ON TABLE pipeline.urt_domains IS 'URT v5.1 top-level domains';
+COMMENT ON TABLE pipeline.urt_categories IS 'URT v5.1 Tier-2 categories';
+
+
+-- =============================================================================
+-- SECTION 3: STAGE 1 - RAW & ENRICHED REVIEWS
+-- =============================================================================
+
+-- Raw reviews table (immutable audit log)
+CREATE TABLE IF NOT EXISTS pipeline.reviews_raw (
+    id BIGSERIAL PRIMARY KEY,
+
+    -- Link to scraper job (soft FK to public.jobs)
+    job_id UUID,
+
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+    raw_payload JSONB NOT NULL DEFAULT '{}',
+    review_text TEXT,
+    rating SMALLINT NOT NULL CHECK (rating BETWEEN 1 AND 5),
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+    reviewer_name VARCHAR(255) NOT NULL,
+    reviewer_id VARCHAR(255),
+    review_version INTEGER NOT NULL DEFAULT 1,
+    pulled_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    CONSTRAINT reviews_raw_unique UNIQUE (source, review_id, review_version)
+);
+
+-- Indexes for reviews_raw
+CREATE INDEX IF NOT EXISTS idx_reviews_raw_job_id ON pipeline.reviews_raw(job_id) WHERE job_id IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_reviews_raw_place_id ON pipeline.reviews_raw(place_id);
+CREATE INDEX IF NOT EXISTS idx_reviews_raw_review_time ON pipeline.reviews_raw(review_time);
+CREATE INDEX IF NOT EXISTS idx_reviews_raw_pulled_at ON pipeline.reviews_raw(pulled_at);
+
+COMMENT ON TABLE pipeline.reviews_raw IS 'Immutable raw review data as scraped from source';
+COMMENT ON COLUMN pipeline.reviews_raw.job_id IS 'Optional link to public.jobs.job_id for traceability';
+
+
+-- Enriched reviews table (mutable, updated by classification)
+CREATE TABLE IF NOT EXISTS pipeline.reviews_enriched (
+    id BIGSERIAL PRIMARY KEY,
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    review_version INTEGER NOT NULL DEFAULT 1,
+    is_latest BOOLEAN NOT NULL DEFAULT TRUE,
+    raw_id BIGINT REFERENCES pipeline.reviews_raw(id),
+
+    -- Tenant context
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+
+    -- Content
+    text TEXT NOT NULL,
+    text_normalized TEXT NOT NULL,
+    rating SMALLINT NOT NULL CHECK (rating BETWEEN 1 AND 5),
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+
+    -- Normalization fields
+    language VARCHAR(10) NOT NULL DEFAULT 'en',
+    taxonomy_version VARCHAR(20) NOT NULL DEFAULT 'v5.1',
+
+    -- Classification fields (NULL until Stage 2)
+    urt_primary VARCHAR(10),
+    urt_secondary VARCHAR(10)[] DEFAULT '{}',
+    valence VARCHAR(5),
+    intensity VARCHAR(5),
+    comparative VARCHAR(10),
+    staff_mentions VARCHAR(255)[] DEFAULT '{}',
+    quotes JSONB DEFAULT '{}',
+    embedding REAL[] DEFAULT '{}',
+    trust_score REAL,
+    classification_model VARCHAR(100),
+    classification_confidence JSONB DEFAULT '{}',
+    processed_at TIMESTAMP WITH TIME ZONE,
+
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    CONSTRAINT reviews_enriched_unique UNIQUE (source, review_id, review_version)
+);
+
+-- Indexes for reviews_enriched
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_business_id ON pipeline.reviews_enriched(business_id);
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_place_id ON pipeline.reviews_enriched(place_id);
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_review_time ON pipeline.reviews_enriched(review_time);
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_urt_primary ON pipeline.reviews_enriched(urt_primary) WHERE urt_primary IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_unclassified ON pipeline.reviews_enriched(review_time DESC) WHERE urt_primary IS NULL AND is_latest = TRUE;
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_valence ON pipeline.reviews_enriched(valence) WHERE valence IS NOT NULL;
+
+COMMENT ON TABLE pipeline.reviews_enriched IS 'Enriched reviews with normalization and classification';
+
+
+-- =============================================================================
+-- SECTION 4: STAGE 2 - REVIEW SPANS
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS pipeline.review_spans (
+    id BIGSERIAL PRIMARY KEY,
+    span_id VARCHAR(50) NOT NULL UNIQUE,
+
+    -- Context
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    review_version INTEGER NOT NULL DEFAULT 1,
+
+    -- Position
+    span_index INTEGER NOT NULL CHECK (span_index >= 0),
+    span_text TEXT NOT NULL,
+    span_start INTEGER NOT NULL CHECK (span_start >= 0),
+    span_end INTEGER NOT NULL CHECK (span_end > span_start),
+
+    -- Classification profile
+    profile VARCHAR(20) NOT NULL DEFAULT 'standard',
+
+    -- Core URT classification
+    urt_primary VARCHAR(10) NOT NULL,
+    urt_secondary VARCHAR(10)[] DEFAULT '{}',
+    valence VARCHAR(5) NOT NULL,
+    intensity VARCHAR(5) NOT NULL,
+    comparative VARCHAR(10) NOT NULL DEFAULT 'CR-N',
+
+    -- Extended classification (standard/full profile)
+    specificity VARCHAR(5),
+    actionability VARCHAR(5),
+    temporal VARCHAR(5),
+    evidence VARCHAR(5),
+
+    -- Entity extraction
+    entity VARCHAR(255),
+    entity_type VARCHAR(20),
+    entity_normalized VARCHAR(255),
+
+    -- Causal relations (full profile)
+    relation_type VARCHAR(20),
+    related_span_id VARCHAR(50),
+    causal_chain JSONB,
+
+    -- Flags
+    is_primary BOOLEAN NOT NULL DEFAULT FALSE,
+    is_active BOOLEAN NOT NULL DEFAULT TRUE,
+
+    -- Time reference
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+
+    -- Metadata
+    confidence VARCHAR(10) NOT NULL DEFAULT 'medium',
+    usn VARCHAR(100) NOT NULL,
+    taxonomy_version VARCHAR(20) NOT NULL,
+    model_version VARCHAR(100) NOT NULL,
+    ingest_batch_id VARCHAR(50) NOT NULL,
+
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    -- Foreign key to review
+    CONSTRAINT fk_review FOREIGN KEY (source, review_id, review_version)
+        REFERENCES pipeline.reviews_enriched(source, review_id, review_version)
+);
+
+-- Indexes for review_spans
+CREATE INDEX IF NOT EXISTS idx_spans_business_id ON pipeline.review_spans(business_id);
+CREATE INDEX IF NOT EXISTS idx_spans_place_id ON pipeline.review_spans(place_id);
+CREATE INDEX IF NOT EXISTS idx_spans_review_time ON pipeline.review_spans(review_time);
+CREATE INDEX IF NOT EXISTS idx_spans_urt_primary ON pipeline.review_spans(urt_primary);
+CREATE INDEX IF NOT EXISTS idx_spans_valence ON pipeline.review_spans(valence);
+CREATE INDEX IF NOT EXISTS idx_spans_intensity ON pipeline.review_spans(intensity);
+CREATE INDEX IF NOT EXISTS idx_spans_is_active ON pipeline.review_spans(is_active) WHERE is_active = TRUE;
+CREATE INDEX IF NOT EXISTS idx_spans_is_primary ON pipeline.review_spans(is_primary) WHERE is_primary = TRUE;
+CREATE INDEX IF NOT EXISTS idx_spans_entity_normalized ON pipeline.review_spans(entity_normalized) WHERE entity_normalized IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_spans_batch ON pipeline.review_spans(ingest_batch_id);
+
+-- Index for active negative/mixed spans (supports the Stage 3 unrouted-span query)
+CREATE INDEX IF NOT EXISTS idx_spans_unrouted_negative ON pipeline.review_spans(review_time DESC)
+    WHERE is_active = TRUE AND valence IN ('V-', 'V±');
+
+COMMENT ON TABLE pipeline.review_spans IS 'Extracted semantic spans with URT classification from reviews';
+
+
+-- =============================================================================
+-- SECTION 5: STAGE 3 - ISSUES
+-- =============================================================================
+
+-- Issues table
+CREATE TABLE IF NOT EXISTS pipeline.issues (
+    id BIGSERIAL PRIMARY KEY,
+    issue_id VARCHAR(50) NOT NULL UNIQUE,
+
+    -- Context
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+
+    -- Classification
+    primary_subcode VARCHAR(10) NOT NULL,
+    domain CHAR(1) NOT NULL,
+
+    -- State
+    state pipeline.issue_state NOT NULL DEFAULT 'open',
+    priority_score REAL NOT NULL DEFAULT 1.0,
+    confidence_score REAL NOT NULL DEFAULT 1.0,
+
+    -- Aggregates
+    span_count INTEGER NOT NULL DEFAULT 1,
+    max_intensity VARCHAR(5) NOT NULL DEFAULT 'I1',
+
+    -- Entity (optional - for entity-specific issues)
+    entity VARCHAR(255),
+    entity_normalized VARCHAR(255),
+
+    -- Metadata
+    taxonomy_version VARCHAR(20) NOT NULL,
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+    updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+);
+
+-- Issue-span links (many-to-one: each span routes to exactly one issue; an issue can have many spans)
+CREATE TABLE IF NOT EXISTS pipeline.issue_spans (
+    id BIGSERIAL PRIMARY KEY,
+    issue_id VARCHAR(50) NOT NULL REFERENCES pipeline.issues(issue_id),
+    span_id VARCHAR(50) NOT NULL UNIQUE,
+
+    -- Review reference
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    review_version INTEGER NOT NULL DEFAULT 1,
+
+    -- Match info
+    is_primary_match BOOLEAN NOT NULL DEFAULT TRUE,
+    intensity VARCHAR(5) NOT NULL,
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+);
+
+-- Issue events (audit log)
+CREATE TABLE IF NOT EXISTS pipeline.issue_events (
+    id BIGSERIAL PRIMARY KEY,
+    issue_id VARCHAR(50) NOT NULL REFERENCES pipeline.issues(issue_id),
+    event_type VARCHAR(50) NOT NULL,
+    span_id VARCHAR(50),
+    old_value TEXT,
+    new_value TEXT,
+    metadata JSONB,
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+);
+
+-- Indexes for issues
+CREATE INDEX IF NOT EXISTS idx_issues_business_id ON pipeline.issues(business_id);
+CREATE INDEX IF NOT EXISTS idx_issues_place_id ON pipeline.issues(place_id);
+CREATE INDEX IF NOT EXISTS idx_issues_state ON pipeline.issues(state);
+CREATE INDEX IF NOT EXISTS idx_issues_primary_subcode ON pipeline.issues(primary_subcode);
+CREATE INDEX IF NOT EXISTS idx_issues_domain ON pipeline.issues(domain);
+CREATE INDEX IF NOT EXISTS idx_issues_entity_normalized ON pipeline.issues(entity_normalized) WHERE entity_normalized IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_issues_priority ON pipeline.issues(priority_score DESC) WHERE state = 'open';
+CREATE INDEX IF NOT EXISTS idx_issues_created ON pipeline.issues(created_at);
+CREATE INDEX IF NOT EXISTS idx_issues_updated ON pipeline.issues(updated_at);
+
+-- Indexes for issue_spans
+CREATE INDEX IF NOT EXISTS idx_issue_spans_issue_id ON pipeline.issue_spans(issue_id);
+CREATE INDEX IF NOT EXISTS idx_issue_spans_review_time ON pipeline.issue_spans(review_time);
+
+-- Indexes for issue_events
+CREATE INDEX IF NOT EXISTS idx_issue_events_issue_id ON pipeline.issue_events(issue_id);
+CREATE INDEX IF NOT EXISTS idx_issue_events_created ON pipeline.issue_events(created_at);
+CREATE INDEX IF NOT EXISTS idx_issue_events_type ON pipeline.issue_events(event_type);
+
+COMMENT ON TABLE pipeline.issues IS 'Aggregated issues derived from negative/mixed spans';
+COMMENT ON TABLE pipeline.issue_spans IS 'Links between issues and their source spans';
+COMMENT ON TABLE pipeline.issue_events IS 'Audit log for issue state changes';
+
+
+-- =============================================================================
+-- SECTION 6: STAGE 4 - FACT TIMESERIES
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS pipeline.fact_timeseries (
+    id BIGSERIAL PRIMARY KEY,
+
+    -- Dimension keys
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,  -- Or 'ALL' for rollup
+    period_date DATE NOT NULL,
+    bucket_type pipeline.bucket_type NOT NULL DEFAULT 'day',
+    subject_type pipeline.subject_type NOT NULL DEFAULT 'urt_code',
+    subject_id VARCHAR(50) NOT NULL,  -- URT code, domain letter, or issue_id
+    taxonomy_version VARCHAR(20) NOT NULL,
+
+    -- Core counts
+    review_count INTEGER NOT NULL DEFAULT 0,
+    span_count INTEGER NOT NULL DEFAULT 0,
+
+    -- Valence counts
+    negative_count INTEGER NOT NULL DEFAULT 0,
+    positive_count INTEGER NOT NULL DEFAULT 0,
+    neutral_count INTEGER NOT NULL DEFAULT 0,
+    mixed_count INTEGER NOT NULL DEFAULT 0,
+
+    -- Strength scores
+    strength_score REAL NOT NULL DEFAULT 0.0,
+    negative_strength REAL NOT NULL DEFAULT 0.0,
+    positive_strength REAL NOT NULL DEFAULT 0.0,
+
+    -- Rating
+    avg_rating REAL,
+
+    -- Intensity counts
+    i1_count INTEGER NOT NULL DEFAULT 0,
+    i2_count INTEGER NOT NULL DEFAULT 0,
+    i3_count INTEGER NOT NULL DEFAULT 0,
+
+    -- Comparative counts
+    cr_better INTEGER NOT NULL DEFAULT 0,
+    cr_worse INTEGER NOT NULL DEFAULT 0,
+    cr_same INTEGER NOT NULL DEFAULT 0,
+
+    -- Trust-weighted metrics
+    trust_weighted_strength REAL NOT NULL DEFAULT 0.0,
+    trust_weighted_negative REAL NOT NULL DEFAULT 0.0,
+
+    -- Metadata
+    computed_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    -- Unique constraint for upsert
+    CONSTRAINT fact_timeseries_unique UNIQUE (
+        business_id, place_id, period_date, bucket_type,
+        subject_type, subject_id, taxonomy_version
+    )
+);
+
+-- Indexes for fact_timeseries
+CREATE INDEX IF NOT EXISTS idx_facts_business_id ON pipeline.fact_timeseries(business_id);
+CREATE INDEX IF NOT EXISTS idx_facts_place_id ON pipeline.fact_timeseries(place_id);
+CREATE INDEX IF NOT EXISTS idx_facts_period ON pipeline.fact_timeseries(period_date);
+CREATE INDEX IF NOT EXISTS idx_facts_bucket ON pipeline.fact_timeseries(bucket_type);
+CREATE INDEX IF NOT EXISTS idx_facts_subject_type ON pipeline.fact_timeseries(subject_type);
+CREATE INDEX IF NOT EXISTS idx_facts_subject_id ON pipeline.fact_timeseries(subject_id);
+
+-- Composite index for common dashboard queries
+CREATE INDEX IF NOT EXISTS idx_facts_dashboard ON pipeline.fact_timeseries(
+    business_id, place_id, bucket_type, period_date DESC
+);
+
+-- Index for specific code trends
+CREATE INDEX IF NOT EXISTS idx_facts_code_trend ON pipeline.fact_timeseries(
+    business_id, subject_id, bucket_type, period_date DESC
+) WHERE subject_type = 'urt_code';
+
+-- Index for domain aggregates
+CREATE INDEX IF NOT EXISTS idx_facts_domain ON pipeline.fact_timeseries(
+    business_id, subject_id, bucket_type, period_date DESC
+) WHERE subject_type = 'domain';
+
+COMMENT ON TABLE pipeline.fact_timeseries IS 'Pre-aggregated time series facts for dashboard queries';
+
+
+-- =============================================================================
+-- SECTION 7: HELPER VIEWS
+-- =============================================================================
+
+-- View for latest enriched reviews only
+CREATE OR REPLACE VIEW pipeline.reviews_latest AS
+SELECT * FROM pipeline.reviews_enriched WHERE is_latest = TRUE;
+
+-- View for open issues with span counts
+CREATE OR REPLACE VIEW pipeline.issues_open AS
+SELECT
+    i.*,
+    COUNT(s.id) as total_spans
+FROM pipeline.issues i
+LEFT JOIN pipeline.issue_spans s ON i.issue_id = s.issue_id
+WHERE i.state = 'open'
+GROUP BY i.id;
+
+COMMENT ON VIEW pipeline.reviews_latest IS 'Latest version of each review';
+COMMENT ON VIEW pipeline.issues_open IS 'Open issues with total span counts';
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/repositories.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/repositories.py
@@ -1,4 +1,8 @@
-"""Data access layer for pipeline operations."""
+"""Data access layer for pipeline operations.
+
+All tables live in the 'pipeline' schema, keeping them decoupled from the
+main scraper schema while sharing the same database.
+"""

 from __future__ import annotations

@@ -20,6 +24,9 @@ if TYPE_CHECKING:

 logger = logging.getLogger(__name__)

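+# Schema name for all pipeline tables. NOTE(review): queries below appear to hard-code "pipeline." rather than use this constant — keep them in sync.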
+SCHEMA = "pipeline"
+

 class ReviewRepository:
    """Repository for review data operations."""
@@ -35,7 +42,7 @@ class ReviewRepository:
    ) -> int:
        """Insert a raw review and return its ID."""
        query = """
-            INSERT INTO reviews_raw (
+            INSERT INTO pipeline.reviews_raw (
                source, review_id, place_id, raw_payload,
                review_text, rating, review_time, reviewer_name, reviewer_id,
                review_version, pulled_at
@@ -66,7 +73,7 @@ class ReviewRepository:
    ) -> int:
        """Insert an enriched review stub (pre-classification)."""
        query = """
-            INSERT INTO reviews_enriched (
+            INSERT INTO pipeline.reviews_enriched (
                source, review_id, review_version, is_latest, raw_id,
                business_id, place_id, text, text_normalized, rating, review_time,
                language, taxonomy_version
@@ -101,7 +108,7 @@ class ReviewRepository:
    ) -> None:
        """Update an enriched review with classification results."""
        query = """
-            UPDATE reviews_enriched SET
+            UPDATE pipeline.reviews_enriched SET
                urt_primary = $1,
                urt_secondary = $2,
                valence = $3,
@@ -147,7 +154,7 @@ class ReviewRepository:
            SELECT
                source, review_id, review_version, business_id, place_id,
                text, text_normalized, rating, review_time
-            FROM reviews_enriched
+            FROM pipeline.reviews_enriched
            WHERE urt_primary IS NULL
              AND is_latest = TRUE
            ORDER BY review_time DESC
@@ -164,7 +171,7 @@ class ReviewRepository:
    ) -> dict[str, Any] | None:
        """Get a specific review by its composite key."""
        query = """
-            SELECT * FROM reviews_enriched
+            SELECT * FROM pipeline.reviews_enriched
            WHERE source = $1 AND review_id = $2 AND review_version = $3
        """
        row = await self.db.fetchrow(query, source, review_id, review_version)
@@ -179,7 +186,7 @@ class ReviewRepository:
        # For now, we check by querying the first occurrence
        # A proper dedup table would be better for production
        query = """
-            SELECT review_id FROM reviews_enriched
+            SELECT review_id FROM pipeline.reviews_enriched
            WHERE business_id = $1
              AND text_normalized IS NOT NULL
            LIMIT 1
@@ -209,7 +216,7 @@ class SpanRepository:
    ) -> None:
        """Insert a span into the database."""
        query = """
-            INSERT INTO review_spans (
+            INSERT INTO pipeline.review_spans (
                span_id, business_id, place_id, source, review_id, review_version,
                span_index, span_text, span_start, span_end,
                profile, urt_primary, urt_secondary, valence, intensity, comparative,
@@ -282,8 +289,8 @@ class SpanRepository:
                rs.urt_primary, rs.valence, rs.intensity,
                rs.entity_normalized, rs.review_time, rs.confidence,
                re.trust_score
-            FROM review_spans rs
-            JOIN reviews_enriched re ON (
+            FROM pipeline.review_spans rs
+            JOIN pipeline.reviews_enriched re ON (
                re.source = rs.source
                AND re.review_id = rs.review_id
                AND re.review_version = rs.review_version
@@ -291,7 +298,7 @@ class SpanRepository:
            WHERE rs.is_active = TRUE
              AND rs.valence IN ('V-', 'V±')
              AND NOT EXISTS (
-                SELECT 1 FROM issue_spans iss WHERE iss.span_id = rs.span_id
+                SELECT 1 FROM pipeline.issue_spans iss WHERE iss.span_id = rs.span_id
              )
            ORDER BY rs.review_time DESC
            LIMIT $1
@@ -301,7 +308,7 @@ class SpanRepository:

    async def get_span_by_id(self, span_id: str) -> dict[str, Any] | None:
        """Get a span by its ID."""
-        query = "SELECT * FROM review_spans WHERE span_id = $1"
+        query = "SELECT * FROM pipeline.review_spans WHERE span_id = $1"
        row = await self.db.fetchrow(query, span_id)
        return dict(row) if row else None

@@ -326,7 +333,7 @@ class IssueRepository:
        """Create or update an issue. Returns True if newly created."""
        # First check if exists
        existing = await self.db.fetchval(
-            "SELECT 1 FROM issues WHERE issue_id = $1",
+            "SELECT 1 FROM pipeline.issues WHERE issue_id = $1",
            issue_id,
        )

@@ -334,7 +341,7 @@ class IssueRepository:
            # Update
            await self.db.execute(
                """
-                UPDATE issues SET
+                UPDATE pipeline.issues SET
                    span_count = span_count + 1,
                    max_intensity = CASE
                        WHEN $1 = 'I3' THEN 'I3'
@@ -353,7 +360,7 @@ class IssueRepository:
            domain = primary_subcode[0] if primary_subcode else "O"
            await self.db.execute(
                """
-                INSERT INTO issues (
+                INSERT INTO pipeline.issues (
                    issue_id, business_id, place_id, primary_subcode, domain,
                    state, priority_score, confidence_score, span_count, max_intensity,
                    entity, entity_normalized, taxonomy_version
@@ -388,7 +395,7 @@ class IssueRepository:
        """Link a span to an issue."""
        await self.db.execute(
            """
-            INSERT INTO issue_spans (
+            INSERT INTO pipeline.issue_spans (
                issue_id, span_id, source, review_id, review_version,
                is_primary_match, intensity, review_time
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
@@ -416,7 +423,7 @@ class IssueRepository:
        """Log an issue event for audit trail."""
        await self.db.execute(
            """
-            INSERT INTO issue_events (
+            INSERT INTO pipeline.issue_events (
                issue_id, event_type, span_id, old_value, new_value, metadata
            ) VALUES ($1, $2, $3, $4, $5, $6)
            """,
@@ -430,14 +437,14 @@ class IssueRepository:

    async def get_issue_by_id(self, issue_id: str) -> dict[str, Any] | None:
        """Get an issue by its ID."""
-        query = "SELECT * FROM issues WHERE issue_id = $1"
+        query = "SELECT * FROM pipeline.issues WHERE issue_id = $1"
        row = await self.db.fetchrow(query, issue_id)
        return dict(row) if row else None

    async def check_span_already_linked(self, span_id: str) -> str | None:
        """Check if a span is already linked to an issue."""
        return await self.db.fetchval(
-            "SELECT issue_id FROM issue_spans WHERE span_id = $1",
+            "SELECT issue_id FROM pipeline.issue_spans WHERE span_id = $1",
            span_id,
        )

@@ -452,7 +459,7 @@ class FactRepository:
        """Insert or update a fact record."""
        await self.db.execute(
            """
-            INSERT INTO fact_timeseries (
+            INSERT INTO pipeline.fact_timeseries (
                business_id, place_id, period_date, bucket_type,
                subject_type, subject_id, taxonomy_version,
                review_count, span_count, negative_count, positive_count,
@@ -534,8 +541,8 @@ class FactRepository:
                rs.comparative,
                re.trust_score,
                re.rating
-            FROM review_spans rs
-            JOIN reviews_enriched re ON (
+            FROM pipeline.review_spans rs
+            JOIN pipeline.reviews_enriched re ON (
                re.source = rs.source
                AND re.review_id = rs.review_id
                AND re.review_version = rs.review_version
@@ -554,7 +561,7 @@ class FactRepository:
        """Get all place IDs for a business."""
        rows = await self.db.fetch(
            """
-            SELECT DISTINCT place_id FROM reviews_enriched
+            SELECT DISTINCT place_id FROM pipeline.reviews_enriched
            WHERE business_id = $1
            """,
            business_id,
--- a/tools/run_migrations.py
+++ b/tools/run_migrations.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+CLI tool to run database migrations.
+
+Usage:
+    python tools/run_migrations.py --database-url $DATABASE_URL
+
+    # Or with environment variable
+    export DATABASE_URL=postgresql://user:pass@localhost/db
+    python tools/run_migrations.py
+"""
+
+import asyncio
+import os
+import sys
+import argparse
+import logging
+
+# Add project root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from core.database import DatabaseManager
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+
+
+async def main(database_url: str, migrations_dir: str = "migrations/versions"):
+    """Run migrations against the database."""
+    db = DatabaseManager(database_url)
+
+    try:
+        await db.connect()
+
+        # First initialize base schema (jobs table, etc.)
+        print("Initializing base schema...")
+        await db.initialize_schema()
+
+        # Then run versioned migrations
+        print(f"\nRunning migrations from {migrations_dir}...")
+        count = await db.run_migrations(migrations_dir)
+
+        if count > 0:
+            print(f"\n✓ Applied {count} migration(s)")
+        else:
+            print("\n✓ No pending migrations")
+
+    except Exception as e:
+        print(f"\n✗ Migration failed: {e}", file=sys.stderr)
+        sys.exit(1)
+    finally:
+        await db.disconnect()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run database migrations")
+    parser.add_argument(
+        "--database-url",
+        default=os.environ.get("DATABASE_URL"),
+        help="PostgreSQL connection string (default: $DATABASE_URL)",
+    )
+    parser.add_argument(
+        "--migrations-dir",
+        default="migrations/versions",
+        help="Directory containing .sql migration files",
+    )
+
+    args = parser.parse_args()
+
+    if not args.database_url:
+        print("Error: --database-url required or set DATABASE_URL environment variable", file=sys.stderr)
+        sys.exit(1)
+
+    asyncio.run(main(args.database_url, args.migrations_dir))