feat: Add reviewiq-pipeline package for LLM-powered review classification

Implement a standalone Python package for processing customer reviews through a 4-stage pipeline using URT (Universal Review Taxonomy) v5.1:

- Stage 1: Normalization (text cleaning, language detection, deduplication)
- Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes)
- Stage 3: Issue Routing (deterministic issue ID generation, span linking)
- Stage 4: Fact Aggregation (time series metrics for dashboards)

Package includes:

- TypedDict contracts matching Pipeline-Contracts-v1.md
- Async database layer with asyncpg and 5 SQL migrations
- LLM client abstraction supporting both OpenAI and Anthropic
- Sentence-transformers integration for embeddings
- Validation rules V1.x through V4.x
- CLI commands: migrate, run, validate, check, version
- 55 unit and integration tests (all passing)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:07:11 +00:00
parent b780a23b66
commit 7d720f5378
34 changed files with 7222 additions and 0 deletions
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/init.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/init.py
@@ -0,0 +1,56 @@
+"""
+ReviewIQ Pipeline - LLM-powered review classification and analysis.
+
+This package provides a complete pipeline for processing customer reviews:
+- Stage 1: Normalization (text cleaning, language detection, deduplication)
+- Stage 2: LLM Classification (span extraction with URT codes)
+- Stage 3: Issue Routing (route negative spans to issues)
+- Stage 4: Fact Aggregation (pre-aggregate metrics for dashboards)
+"""
+
+from reviewiq_pipeline.config import Config
+from reviewiq_pipeline.contracts import (
+    ClassifiedReview,
+    ExtractedSpan,
+    FactRecord,
+    NormalizedReview,
+    RawReview,
+    RoutedSpan,
+    ScraperOutput,
+    Stage1Input,
+    Stage1Output,
+    Stage2Input,
+    Stage2Output,
+    Stage3Input,
+    Stage3Output,
+    Stage4Input,
+    Stage4Output,
+    ValidationError,
+    ValidationResult,
+)
+from reviewiq_pipeline.pipeline import Pipeline
+
+# Package version string; cli.py imports it for ``--version`` and for the
+# ``version`` command.
+__version__ = "0.1.0"
+# Names re-exported as the package's public API. Each entry must also be
+# imported above, otherwise ``from reviewiq_pipeline import *`` raises.
+__all__ = [
+    # Main API
+    "Pipeline",
+    "Config",
+    # Contracts
+    "ScraperOutput",
+    "RawReview",
+    "Stage1Input",
+    "Stage1Output",
+    "NormalizedReview",
+    "Stage2Input",
+    "Stage2Output",
+    "ClassifiedReview",
+    "ExtractedSpan",
+    "Stage3Input",
+    "Stage3Output",
+    "RoutedSpan",
+    "Stage4Input",
+    "Stage4Output",
+    "FactRecord",
+    "ValidationResult",
+    "ValidationError",
+]
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py
@@ -0,0 +1,322 @@
+"""
+CLI for the ReviewIQ pipeline.
+
+Usage:
+    reviewiq-pipeline migrate --database-url $DATABASE_URL
+    reviewiq-pipeline run --job-id <UUID> --stages 1,2,3,4
+    reviewiq-pipeline validate --job-id <UUID>
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import sys
+from typing import Any
+
+import click
+
+from reviewiq_pipeline import __version__
+
+# Configure logging
+# NOTE: this runs when the CLI module is imported, so the root logger is set
+# to INFO with a timestamped format before any command executes.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+# Used by the ``run`` command to record the traceback of a failed run.
+logger = logging.getLogger("reviewiq_pipeline")
+
+
+def get_config(**overrides: Any):
+    """Build a ``Config`` from the overrides that were actually supplied.
+
+    Keyword arguments whose value is ``None`` are dropped before ``Config`` is
+    constructed, so an option the user did not pass never replaces the value
+    ``Config`` would otherwise use for that field.
+    """
+    # Imported lazily, as in the rest of this module's commands.
+    from reviewiq_pipeline.config import Config
+
+    supplied = {}
+    for name, value in overrides.items():
+        if value is not None:
+            supplied[name] = value
+    return Config(**supplied)
+
+
+@click.group()
+@click.version_option(version=__version__)
+def main():
+    """ReviewIQ Pipeline - LLM-powered review classification."""
+    # Intentionally empty: this function only anchors the sub-commands
+    # registered with ``@main.command()``.
+
+
+@main.command()
+@click.option(
+    "--database-url",
+    envvar="DATABASE_URL",
+    required=True,
+    help="PostgreSQL connection string",
+)
+def migrate(database_url: str):
+    """Run database migrations."""
+
+    async def _apply_migrations():
+        # Open a pool, run the migrations, report the count, always close.
+        from reviewiq_pipeline.db.connection import DatabasePool
+
+        pool = DatabasePool(get_config(database_url=database_url))
+
+        try:
+            await pool.initialize()
+            applied = await pool.run_migrations()
+            click.echo(f"Successfully ran {applied} migrations")
+        except Exception as exc:
+            # Any failure becomes a one-line message on stderr plus exit code 1.
+            click.echo(f"Migration failed: {exc}", err=True)
+            sys.exit(1)
+        finally:
+            await pool.close()
+
+    asyncio.run(_apply_migrations())
+
+
+def _parse_stages(stages: str) -> list[int]:
+    """Convert the ``--stages`` option value into a list of stage numbers.
+
+    Args:
+        stages: Comma-separated stage numbers, e.g. ``"1,2,3,4"``. Whitespace
+            around entries and empty entries are ignored.
+
+    Returns:
+        The stage numbers, in the order they were given.
+
+    Raises:
+        click.BadParameter: If an entry is not an integer, or is not one of
+            the pipeline stages 1-4 (the same set ``validate --stage`` accepts).
+    """
+    parsed: list[int] = []
+    for token in stages.split(","):
+        token = token.strip()
+        if not token:
+            continue
+        try:
+            number = int(token)
+        except ValueError:
+            raise click.BadParameter(
+                f"invalid stage {token!r}; expected comma-separated integers",
+                param_hint="--stages",
+            ) from None
+        if number not in (1, 2, 3, 4):
+            raise click.BadParameter(
+                f"unknown stage {number}; valid stages are 1, 2, 3, 4",
+                param_hint="--stages",
+            )
+        parsed.append(number)
+    return parsed
+
+
+@main.command()
+@click.option(
+    "--job-id",
+    required=True,
+    help="Job ID to process",
+)
+@click.option(
+    "--stages",
+    default="1,2,3,4",
+    help="Comma-separated list of stages to run (default: 1,2,3,4)",
+)
+@click.option(
+    "--database-url",
+    envvar="DATABASE_URL",
+    required=True,
+    help="PostgreSQL connection string",
+)
+@click.option(
+    "--llm-provider",
+    envvar="LLM_PROVIDER",
+    type=click.Choice(["openai", "anthropic"]),
+    default="openai",
+    help="LLM provider",
+)
+@click.option(
+    "--llm-model",
+    envvar="LLM_MODEL",
+    default="gpt-4o-mini",
+    help="LLM model to use",
+)
+@click.option(
+    "--openai-api-key",
+    envvar="OPENAI_API_KEY",
+    help="OpenAI API key",
+)
+@click.option(
+    "--anthropic-api-key",
+    envvar="ANTHROPIC_API_KEY",
+    help="Anthropic API key",
+)
+@click.option(
+    "--validate/--no-validate",
+    default=True,
+    help="Validate output after each stage",
+)
+@click.option(
+    "--output",
+    type=click.Path(),
+    help="Output file for results (JSON)",
+)
+def run(
+    job_id: str,
+    stages: str,
+    database_url: str,
+    llm_provider: str,
+    llm_model: str,
+    openai_api_key: str | None,
+    anthropic_api_key: str | None,
+    validate: bool,
+    output: str | None,
+):
+    """Run pipeline stages for a job."""
+
+    # Parse before starting the event loop so a malformed --stages value is
+    # reported by click as a usage error instead of surfacing as a traceback.
+    stage_list = _parse_stages(stages)
+
+    async def _run():
+        from reviewiq_pipeline import Pipeline
+
+        config = get_config(
+            database_url=database_url,
+            llm_provider=llm_provider,
+            llm_model=llm_model,
+            openai_api_key=openai_api_key,
+            anthropic_api_key=anthropic_api_key,
+        )
+
+        pipeline = Pipeline(config)
+
+        try:
+            await pipeline.initialize()
+
+            # Fetch job from database
+            # NOTE(review): this reaches into the pipeline's private ``_db``
+            # attribute; a public accessor on Pipeline would be cleaner.
+            job_data = await pipeline._db.fetchrow(
+                "SELECT * FROM jobs WHERE job_id = $1",
+                job_id,
+            )
+
+            if not job_data:
+                click.echo(f"Job {job_id} not found", err=True)
+                sys.exit(1)
+
+            # Build scraper output from job data
+            # NOTE(review): assumes ``reviews_data`` is already decoded to a
+            # mapping by the database layer — confirm a JSON codec is set.
+            reviews_data = job_data.get("reviews_data") or {}
+            reviews = reviews_data.get("reviews", [])
+            scraper_output = {
+                "job_id": job_id,
+                "status": job_data.get("status", "completed"),
+                # Fall back to the job id when the payload has no business id.
+                "business_id": reviews_data.get("business_id", job_id),
+                "place_id": reviews_data.get("place_id", ""),
+                "business_info": reviews_data.get("business_info", {}),
+                "reviews": reviews,
+                "scrape_time_ms": 0,
+                "reviews_scraped": len(reviews),
+                "scraper_version": "v1.0.0",
+            }
+
+            # Run pipeline
+            result = await pipeline.process(
+                scraper_output,
+                stages=stage_list,
+                validate=validate,
+            )
+
+            # Output results
+            if result.success:
+                click.echo(click.style("Pipeline completed successfully!", fg="green"))
+            else:
+                click.echo(click.style("Pipeline completed with validation errors", fg="yellow"))
+
+            # Print summary (only for the stages that produced a result)
+            if result.stage1:
+                click.echo(f"  Stage 1: {result.stage1['stats']['output_count']} reviews normalized")
+            if result.stage2:
+                click.echo(f"  Stage 2: {result.stage2['stats']['success_count']} reviews classified")
+            if result.stage3:
+                click.echo(f"  Stage 3: {result.stage3['stats']['spans_routed']} spans routed")
+            if result.stage4:
+                click.echo(f"  Stage 4: {result.stage4['stats']['facts_upserted']} facts written")
+
+            # Validation summary
+            for stage, validation in result.validation.items():
+                status = "PASS" if validation["passed"] else f"FAIL ({validation['error_count']} errors)"
+                click.echo(f"  {stage} validation: {status}")
+
+            # Write output file
+            if output:
+                # Explicit encoding so the JSON is identical on every platform.
+                with open(output, "w", encoding="utf-8") as f:
+                    json.dump(result.to_dict(), f, indent=2, default=str)
+                click.echo(f"Results written to {output}")
+
+            if not result.success:
+                sys.exit(1)
+
+        except Exception as e:
+            # sys.exit() above raises SystemExit, which is not an Exception
+            # subclass, so the intended exit codes pass through untouched.
+            click.echo(f"Pipeline failed: {e}", err=True)
+            logger.exception("Pipeline error")
+            sys.exit(1)
+        finally:
+            await pipeline.close()
+
+    asyncio.run(_run())
+
+
+@main.command()
+@click.option(
+    "--job-id",
+    required=True,
+    help="Job ID to validate",
+)
+@click.option(
+    "--database-url",
+    envvar="DATABASE_URL",
+    required=True,
+    help="PostgreSQL connection string",
+)
+@click.option(
+    "--stage",
+    type=click.Choice(["1", "2", "3", "4", "all"]),
+    default="all",
+    help="Stage to validate (default: all)",
+)
+def validate(job_id: str, database_url: str, stage: str):
+    """Validate pipeline output for a job."""
+
+    async def _validate():
+        from reviewiq_pipeline import Pipeline
+
+        pipeline = Pipeline(get_config(database_url=database_url))
+        # ``None`` means "report every stage"; otherwise only the result whose
+        # key is ``stage<N>`` is reported.
+        selected = None if stage == "all" else f"stage{stage}"
+
+        try:
+            await pipeline.initialize()
+
+            results = await pipeline.validate(job_id)
+
+            any_failed = False
+            for stage_name, validation in results.items():
+                if selected is not None and stage_name != selected:
+                    continue
+
+                if validation["passed"]:
+                    click.echo(click.style(f"{stage_name}: PASS", fg="green"))
+                    continue
+
+                click.echo(click.style(f"{stage_name}: FAIL", fg="red"))
+                any_failed = True
+                # Show at most ten errors per stage, then a count of the rest.
+                for error in validation["errors"][:10]:
+                    click.echo(f"  - [{error['rule']}] {error['identifier']}: {error['message']}")
+                if validation["error_count"] > 10:
+                    click.echo(f"  ... and {validation['error_count'] - 10} more errors")
+
+            if any_failed:
+                sys.exit(1)
+
+        except Exception as exc:
+            click.echo(f"Validation failed: {exc}", err=True)
+            sys.exit(1)
+        finally:
+            await pipeline.close()
+
+    asyncio.run(_validate())
+
+
+@main.command()
+@click.option(
+    "--database-url",
+    envvar="DATABASE_URL",
+    required=True,
+    help="PostgreSQL connection string",
+)
+def check(database_url: str):
+    """Check database connection."""
+
+    async def _check():
+        # Open a pool, probe it, and exit non-zero on any failure.
+        from reviewiq_pipeline.db.connection import DatabasePool
+
+        config = get_config(database_url=database_url)
+        db = DatabasePool(config)
+
+        try:
+            await db.initialize()
+            if await db.check_connection():
+                click.echo(click.style("Database connection OK", fg="green"))
+            else:
+                click.echo(click.style("Database connection failed", fg="red"))
+                sys.exit(1)
+        except Exception as e:
+            # An unreachable database typically fails inside initialize();
+            # report it like the other commands do instead of dumping a
+            # traceback. SystemExit from the branch above is not caught here.
+            click.echo(click.style(f"Database connection failed: {e}", fg="red"), err=True)
+            sys.exit(1)
+        finally:
+            await db.close()
+
+    asyncio.run(_check())
+
+
+@main.command()
+def version():
+    """Show version information."""
+    # Same version string that ``--version`` on the group reports.
+    banner = f"reviewiq-pipeline {__version__}"
+    click.echo(banner)
+
+
+# Allow running this module directly as a script in addition to any installed
+# entry point.
+if __name__ == "__main__":
+    main()
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py
@@ -0,0 +1,177 @@
+"""Configuration management for the ReviewIQ pipeline."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import Field, SecretStr, field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Config(BaseSettings):
+    """Pipeline configuration loaded from environment variables or passed directly.
+
+    Environment variables are read with the ``REVIEWIQ_`` prefix (for example
+    ``REVIEWIQ_DATABASE_URL``) and from a local ``.env`` file; unknown keys are
+    ignored. The CLI reads its own unprefixed variables (``DATABASE_URL``,
+    ``OPENAI_API_KEY``, ...) and passes them in as keyword arguments.
+    """
+
+    model_config = SettingsConfigDict(
+        env_prefix="REVIEWIQ_",
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+    )
+
+    # Database
+    database_url: str = Field(
+        default="postgresql://localhost:5432/reviewiq",
+        description="PostgreSQL connection string",
+    )
+    db_pool_min_size: int = Field(default=2, ge=1, le=50)
+    db_pool_max_size: int = Field(default=10, ge=1, le=100)
+
+    # LLM Provider
+    llm_provider: Literal["openai", "anthropic"] = Field(
+        default="openai",
+        description="LLM provider to use for classification",
+    )
+    # API keys are SecretStr so they are masked when the config is printed.
+    openai_api_key: SecretStr | None = Field(
+        default=None,
+        description="OpenAI API key",
+    )
+    anthropic_api_key: SecretStr | None = Field(
+        default=None,
+        description="Anthropic API key",
+    )
+
+    # Model settings
+    llm_model: str = Field(
+        default="gpt-4o-mini",
+        description="LLM model to use for classification",
+    )
+    llm_temperature: float = Field(default=0.0, ge=0.0, le=2.0)
+    llm_max_retries: int = Field(default=3, ge=1, le=10)
+    llm_timeout_seconds: int = Field(default=60, ge=10, le=300)
+
+    # Embedding settings
+    embedding_model: str = Field(
+        default="all-MiniLM-L6-v2",
+        description="Sentence transformer model for embeddings",
+    )
+    embedding_dimension: int = Field(
+        default=384,
+        description="Expected embedding dimension",
+    )
+
+    # Taxonomy
+    taxonomy_version: str = Field(
+        default="v5.1",
+        description="URT taxonomy version",
+    )
+
+    # Classification
+    classification_profile: Literal["lite", "core", "standard", "full"] = Field(
+        default="standard",
+        description="Classification profile to use",
+    )
+    max_spans_per_review: int = Field(default=10, ge=1, le=20)
+
+    # Processing
+    batch_size: int = Field(default=50, ge=1, le=500)
+    trust_score_floor: float = Field(default=0.2, ge=0.0, le=1.0)
+
+    # Migrations
+    migrations_path: str = Field(
+        default="",
+        description="Path to migrations directory (empty for default)",
+    )
+
+    @field_validator("llm_provider")
+    @classmethod
+    def validate_provider_api_key(cls, v: str) -> str:
+        """Validate that the provider name is supported.
+
+        Despite its name, this does not look at the API keys; whether the key
+        for the selected provider is present is checked lazily by
+        ``get_llm_api_key``. The ``Literal`` annotation already restricts the
+        value, so this is a defensive second check.
+        """
+        if v not in ("openai", "anthropic"):
+            raise ValueError(f"Unsupported LLM provider: {v}")
+        return v
+
+    @model_validator(mode="after")
+    def validate_pool_bounds(self) -> Config:
+        """Reject a pool whose minimum size exceeds its maximum size.
+
+        Failing here gives a clear configuration error instead of a later
+        failure when the connection pool is created.
+
+        Raises:
+            ValueError: If ``db_pool_min_size`` > ``db_pool_max_size``.
+        """
+        if self.db_pool_min_size > self.db_pool_max_size:
+            raise ValueError(
+                f"db_pool_min_size ({self.db_pool_min_size}) must not exceed "
+                f"db_pool_max_size ({self.db_pool_max_size})"
+            )
+        return self
+
+    def get_llm_api_key(self) -> str:
+        """Get the API key for the configured LLM provider.
+
+        Returns:
+            The plain-text key for ``llm_provider``.
+
+        Raises:
+            ValueError: If the key for the selected provider is not set, or the
+                provider is not one of the supported values.
+        """
+        if self.llm_provider == "openai":
+            if self.openai_api_key is None:
+                raise ValueError("OpenAI API key is required when llm_provider is 'openai'")
+            return self.openai_api_key.get_secret_value()
+        elif self.llm_provider == "anthropic":
+            if self.anthropic_api_key is None:
+                raise ValueError("Anthropic API key is required when llm_provider is 'anthropic'")
+            return self.anthropic_api_key.get_secret_value()
+        else:
+            raise ValueError(f"Unsupported LLM provider: {self.llm_provider}")
+
+    @property
+    def effective_migrations_path(self) -> str:
+        """Get the effective migrations path.
+
+        Returns ``migrations_path`` when it is non-empty, otherwise the
+        ``migrations`` directory inside the ``reviewiq_pipeline.db`` package.
+        """
+        if self.migrations_path:
+            return self.migrations_path
+        # Default to package's migrations directory
+        import importlib.resources
+
+        try:
+            # importlib.resources.files() is available from Python 3.9.
+            return str(importlib.resources.files("reviewiq_pipeline.db") / "migrations")
+        except (AttributeError, TypeError):
+            # Interpreters without files(): resolve relative to this module.
+            import os
+
+            return os.path.join(os.path.dirname(__file__), "db", "migrations")
+
+
+class ClassificationConfig:
+    """View of the ``Config`` fields that the LLM classification stage uses."""
+
+    # Keys emitted by ``to_dict``; the request-tuning attributes (temperature,
+    # retries, timeout) are deliberately not part of the contract dictionary.
+    _CONTRACT_KEYS = ("model", "taxonomy_version", "profile", "max_spans_per_review")
+
+    def __init__(self, config: Config):
+        """Copy the classification-related settings from ``config``."""
+        self.model = config.llm_model
+        self.taxonomy_version = config.taxonomy_version
+        self.profile = config.classification_profile
+        self.max_spans_per_review = config.max_spans_per_review
+        self.temperature = config.llm_temperature
+        self.max_retries = config.llm_max_retries
+        self.timeout_seconds = config.llm_timeout_seconds
+
+    def to_dict(self) -> dict:
+        """Return the contract-compatible subset of the settings as a dict."""
+        return {key: getattr(self, key) for key in self._CONTRACT_KEYS}
+
+
+class EmbeddingConfig:
+    """View of the ``Config`` fields that the embedding service uses."""
+
+    def __init__(self, config: Config):
+        """Copy the embedding model name and expected dimension from ``config``."""
+        self.model = config.embedding_model
+        self.dimension = config.embedding_dimension
+
+    def to_dict(self) -> dict:
+        """Return the embedding settings as a plain dictionary."""
+        return dict(model=self.model, dimension=self.dimension)
+
+
+class DatabaseConfig:
+    """View of the ``Config`` fields that the database layer uses."""
+
+    def __init__(self, config: Config):
+        """Copy the connection URL, pool bounds and migrations path from ``config``."""
+        self.url = config.database_url
+        self.pool_min_size = config.db_pool_min_size
+        self.pool_max_size = config.db_pool_max_size
+        # Resolved once here via the ``effective_migrations_path`` property.
+        self.migrations_path = config.effective_migrations_path
+
+    def to_dict(self) -> dict:
+        """Return the database settings as a plain dictionary."""
+        return dict(
+            url=self.url,
+            pool_min_size=self.pool_min_size,
+            pool_max_size=self.pool_max_size,
+            migrations_path=self.migrations_path,
+        )
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py
@@ -0,0 +1,648 @@
+"""
+TypedDict definitions for pipeline stage inputs and outputs.
+
+These contracts define the data structures passed between pipeline stages,
+enabling independent development and validation of each stage.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal, TypedDict
+
+
+# =============================================================================
+# Common Types
+# =============================================================================
+
+# Closed sets of string codes used by the contracts below, written as
+# ``Literal`` aliases so a type checker can reject unknown values.
+# NOTE(review): the meaning of the individual codes ("V+", "I3", "CR-B", ...)
+# is not defined in this module — presumably it comes from the URT taxonomy.
+ValenceType = Literal["V+", "V-", "V0", "V±"]
+IntensityType = Literal["I1", "I2", "I3"]
+SpecificityType = Literal["S1", "S2", "S3"]
+ActionabilityType = Literal["A1", "A2", "A3"]
+TemporalType = Literal["TC", "TR", "TH", "TF"]
+EvidenceType = Literal["ES", "EI", "EC"]
+ComparativeType = Literal["CR-N", "CR-B", "CR-W", "CR-S"]
+ConfidenceType = Literal["high", "medium", "low"]
+EntityTypeValue = Literal["location", "staff", "product", "process", "time", "other"]
+RelationType = Literal["cause_of", "effect_of", "contrast", "resolution"]
+# Same values as ``Config.classification_profile``.
+ProfileType = Literal["lite", "core", "standard", "full"]
+BucketType = Literal["day", "week", "month"]
+SubjectType = Literal["overall", "urt_code", "domain", "issue"]
+IssueState = Literal["open", "resolved", "ignored", "merged"]
+
+
+# =============================================================================
+# Validation Types
+# =============================================================================
+
+
+class ValidationError(TypedDict):
+    """A single validation error.
+
+    The CLI ``validate`` command prints each one as
+    ``[rule] identifier: message``.
+    """
+
+    # NOTE(review): this is a plain dict type, not an exception class; the name
+    # is easy to confuse with exception types of the same name in other
+    # libraries.
+    rule: str
+    identifier: str
+    message: str
+
+
+class ValidationResult(TypedDict):
+    """Result of validating a stage output.
+
+    The CLI reads ``passed``, ``error_count`` and ``errors`` to print its
+    PASS/FAIL summary.
+    """
+
+    stage: str
+    passed: bool
+    # The CLI treats this as the total and shows only the first ten ``errors``.
+    error_count: int
+    errors: list[ValidationError]
+
+
+# =============================================================================
+# Stage 0: Raw Ingestion (from Scraper)
+# =============================================================================
+
+
+class BusinessInfo(TypedDict):
+    """Business metadata from scraper.
+
+    All five keys are required by the type.
+    """
+
+    # NOTE(review): the CLI ``run`` command falls back to an empty dict for
+    # ``business_info``, which does not satisfy these required keys.
+    name: str
+    address: str
+    category: str
+    total_reviews: int
+    average_rating: float
+
+
+class RawReview(TypedDict, total=False):
+    """Raw review as scraped from Google Maps.
+
+    Declared with ``total=False``: every key may be absent, so consumers must
+    not assume any of them (including ``review_id``) is present.
+    """
+
+    review_id: str
+    author_name: str
+    author_id: str | None
+    rating: int
+    # May be present but ``None`` as well as missing.
+    text: str | None
+    review_time: str
+    response_text: str | None
+    response_time: str | None
+    photos: list[str] | None
+    raw_payload: dict[str, Any]
+
+
+class ScraperOutput(TypedDict):
+    """Output from the scraper (Stage 0), input to pipeline.
+
+    The CLI ``run`` command builds a dict with exactly these keys from the
+    job row before calling ``Pipeline.process``.
+    """
+
+    job_id: str
+    status: Literal["completed", "failed", "partial"]
+    business_id: str
+    place_id: str
+    business_info: BusinessInfo
+    reviews: list[RawReview]
+    scrape_time_ms: int
+    # The CLI sets this to ``len(reviews)``.
+    reviews_scraped: int
+    scraper_version: str
+
+
+# =============================================================================
+# Stage 1: Normalization
+# =============================================================================
+
+
+class Stage1Input(TypedDict):
+    """Input to Stage 1 normalization.
+
+    Carries the identifying keys and raw reviews that also appear on
+    ``ScraperOutput``.
+    """
+
+    job_id: str
+    business_id: str
+    place_id: str
+    reviews: list[RawReview]
+
+
+class NormalizedReview(TypedDict, total=False):
+    """A normalized review ready for classification.
+
+    Declared with ``total=False``: a type checker treats every key, including
+    the identity keys, as optional.
+    """
+
+    # Identity (composite key)
+    source: Literal["google"]
+    review_id: str
+    review_version: int
+
+    # Tenant context
+    business_id: str
+    place_id: str
+
+    # Content
+    text: str
+    text_normalized: str
+    text_language: str
+    text_length: int
+    word_count: int
+
+    # Metadata
+    rating: int
+    review_time: str
+    author_name: str
+    author_id: str | None
+
+    # Dedup
+    content_hash: str
+    dedup_group_id: str | None
+
+    # Reference
+    # presumably the ``id`` of the matching ``ReviewRaw`` row — TODO confirm
+    raw_id: int
+
+
+class Stage1Stats(TypedDict):
+    """Statistics from Stage 1 processing."""
+
+    input_count: int
+    # Printed by the CLI ``run`` summary as "reviews normalized".
+    output_count: int
+    skipped_empty: int
+    skipped_duplicate: int
+
+
+class Stage1Output(TypedDict):
+    """Output from Stage 1 normalization.
+
+    Repeats the job/business/place identifiers of the input and adds the
+    normalized reviews plus processing statistics.
+    """
+
+    job_id: str
+    business_id: str
+    place_id: str
+    reviews_normalized: list[NormalizedReview]
+    stats: Stage1Stats
+
+
+# =============================================================================
+# Stage 2: LLM Classification
+# =============================================================================
+
+
+class ReviewToClassify(TypedDict):
+    """A review to be classified by the LLM.
+
+    A required-keys subset of the fields on ``NormalizedReview``.
+    """
+
+    # Typed as plain ``str`` here, whereas ``NormalizedReview.source`` is
+    # ``Literal["google"]``.
+    source: str
+    review_id: str
+    review_version: int
+    business_id: str
+    place_id: str
+    text: str
+    text_normalized: str
+    rating: int
+    review_time: str
+
+
+class ClassificationConfig(TypedDict):
+    """Configuration for LLM classification.
+
+    The keys match the dictionary returned by
+    ``reviewiq_pipeline.config.ClassificationConfig.to_dict()``.
+    """
+
+    # NOTE(review): shares its name with the class in ``config.py``; import
+    # with care when both are needed in one module.
+    model: str
+    taxonomy_version: str
+    profile: ProfileType
+    max_spans_per_review: int
+
+
+class Stage2Input(TypedDict):
+    """Input to Stage 2 classification: the reviews plus the settings to classify them with."""
+
+    reviews: list[ReviewToClassify]
+    config: ClassificationConfig
+
+
+class CausalLink(TypedDict):
+    """A link in a causal chain."""
+
+    # presumably a URT code, as for ``urt_primary`` — TODO confirm
+    code: str
+    role: Literal["cause", "effect", "context", "outcome"]
+    # Position of this link within its chain.
+    order: int
+
+
+class ExtractedSpan(TypedDict, total=False):
+    """A span extracted from a review with URT classification.
+
+    Declared with ``total=False``: every key may be absent. The section
+    comments below say which profile is expected to populate which group.
+    """
+
+    # Identity
+    span_id: str
+    span_index: int
+
+    # Position (offsets into original text)
+    # NOTE(review): whether ``span_end`` is inclusive or exclusive is not
+    # stated here — confirm against the validation rules.
+    span_text: str
+    span_start: int
+    span_end: int
+
+    # Classification
+    profile: ProfileType
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    comparative: ComparativeType
+
+    # Extended (standard/full profile)
+    specificity: SpecificityType
+    actionability: ActionabilityType
+    temporal: TemporalType
+    evidence: EvidenceType
+
+    # Entity
+    entity: str | None
+    entity_type: EntityTypeValue | None
+    entity_normalized: str | None
+
+    # Causal (full profile)
+    relation_type: RelationType | None
+    # Refers to another span by ``span_index``; the stored ``ReviewSpan`` row
+    # uses ``related_span_id`` instead.
+    related_span_index: int | None
+    causal_chain: list[CausalLink] | None
+
+    # Metadata
+    confidence: ConfidenceType
+    usn: str
+
+    # Flags
+    is_primary: bool
+
+
+class ClassifiedReview(TypedDict, total=False):
+    """A review with LLM classification results.
+
+    Declared with ``total=False``: every key may be absent.
+    """
+
+    # Identity
+    source: str
+    review_id: str
+    review_version: int
+
+    # Review-level classification (from primary span)
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    comparative: ComparativeType
+
+    # Extracted entities
+    staff_mentions: list[str]
+    quotes: dict[str, str]
+
+    # Trust score
+    trust_score: float
+
+    # Embedding
+    # presumably ``Config.embedding_dimension`` floats long — TODO confirm
+    embedding: list[float]
+
+    # Spans
+    spans: list[ExtractedSpan]
+
+    # Processing metadata
+    classification_confidence: dict[str, float]
+    processing_time_ms: int
+
+
+class Stage2Stats(TypedDict):
+    """Statistics from Stage 2 processing."""
+
+    input_count: int
+    # Printed by the CLI ``run`` summary as "reviews classified".
+    success_count: int
+    error_count: int
+    total_spans: int
+    avg_spans_per_review: float
+    llm_tokens_used: int
+    llm_cost_usd: float
+
+
+class Stage2Output(TypedDict):
+    """Output from Stage 2 classification.
+
+    Besides the classified reviews, records the taxonomy, model and prompt
+    versions that produced the batch.
+    """
+
+    batch_id: str
+    taxonomy_version: str
+    model_version: str
+    prompt_version: str
+    reviews_classified: list[ClassifiedReview]
+    stats: Stage2Stats
+
+
+# =============================================================================
+# Stage 3: Issue Routing
+# =============================================================================
+
+
+class SpanToRoute(TypedDict):
+    """A span to be routed to an issue."""
+
+    span_id: str
+    business_id: str
+    place_id: str
+    urt_primary: str
+    # NOTE(review): ``valence``, ``intensity`` and ``confidence`` are plain
+    # ``str`` here but ``ValenceType`` / ``IntensityType`` / ``ConfidenceType``
+    # on ``ExtractedSpan``; consider aligning the annotations.
+    valence: str
+    intensity: str
+    entity_normalized: str | None
+    review_time: str
+    confidence: str
+    trust_score: float
+
+
+class Stage3Input(TypedDict):
+    """Input to Stage 3 issue routing: the spans to route."""
+
+    spans: list[SpanToRoute]
+
+
+class RoutedSpan(TypedDict):
+    """A span that has been routed to an issue."""
+
+    span_id: str
+    issue_id: str
+    routing_key: str
+    # True when routing this span created the issue rather than reusing one.
+    is_new_issue: bool
+
+
+class Stage3Stats(TypedDict):
+    """Statistics from Stage 3 processing."""
+
+    spans_processed: int
+    # Printed by the CLI ``run`` summary as "spans routed".
+    spans_routed: int
+    spans_skipped: int
+    issues_created: int
+    issues_updated: int
+
+
+class Stage3Output(TypedDict):
+    """Output from Stage 3 issue routing."""
+
+    routed_spans: list[RoutedSpan]
+    # Lists of identifiers here; the matching counts live in ``stats``.
+    # presumably issue ids — TODO confirm
+    issues_created: list[str]
+    issues_updated: list[str]
+    stats: Stage3Stats
+
+
+# =============================================================================
+# Stage 4: Fact Aggregation
+# =============================================================================
+
+
+class Stage4Input(TypedDict):
+    """Input to Stage 4 fact aggregation."""
+
+    business_id: str
+    # NOTE(review): string date; the expected format is not stated here.
+    date: str
+    # Which of the "day" / "week" / "month" buckets to aggregate.
+    bucket_types: list[BucketType]
+    taxonomy_version: str
+
+
+class FactRecord(TypedDict, total=False):
+    """An aggregated fact record for time series data.
+
+    Declared with ``total=False``: every key may be absent. The metric fields
+    mirror those of ``FactTimeseries``.
+    """
+
+    # Keys
+    business_id: str
+    place_id: str
+    period_date: str
+    # NOTE(review): plain ``str`` here but ``BucketType`` on
+    # ``FactTimeseries``; consider aligning the annotations.
+    bucket_type: str
+    subject_type: SubjectType
+    subject_id: str
+    taxonomy_version: str
+
+    # Metrics
+    review_count: int
+    span_count: int
+    negative_count: int
+    positive_count: int
+    neutral_count: int
+    mixed_count: int
+    strength_score: float
+    negative_strength: float
+    positive_strength: float
+    avg_rating: float | None
+    i1_count: int
+    i2_count: int
+    i3_count: int
+    cr_better: int
+    cr_worse: int
+    cr_same: int
+    trust_weighted_strength: float
+    trust_weighted_negative: float
+
+
+class Stage4Stats(TypedDict):
+    """Statistics from Stage 4 processing."""
+
+    business_id: str
+    date: str
+    locations_processed: int
+    codes_aggregated: int
+    # Printed by the CLI ``run`` summary as "facts written".
+    facts_upserted: int
+
+
+class Stage4Output(TypedDict):
+    """Output from Stage 4 fact aggregation: the fact records plus statistics."""
+
+    facts_written: list[FactRecord]
+    stats: Stage4Stats
+
+
+# =============================================================================
+# Database Entity Types
+# =============================================================================
+
+
+class ReviewRaw(TypedDict, total=False):
+    """A raw review record in the database.
+
+    Declared with ``total=False``: every key may be absent.
+    """
+
+    id: int
+    source: str
+    review_id: str
+    place_id: str
+    raw_payload: dict[str, Any]
+    # Named ``review_text`` / ``reviewer_name`` / ``reviewer_id`` here, versus
+    # ``text`` / ``author_name`` / ``author_id`` on ``RawReview``.
+    review_text: str | None
+    rating: int
+    review_time: str
+    reviewer_name: str
+    reviewer_id: str | None
+    review_version: int
+    pulled_at: str
+    created_at: str
+
+
+class ReviewEnriched(TypedDict, total=False):
+    """An enriched review record in the database.
+
+    Declared with ``total=False``: every key may be absent. The classification
+    fields are additionally nullable.
+    """
+
+    id: int
+    source: str
+    review_id: str
+    review_version: int
+    is_latest: bool
+    raw_id: int
+    business_id: str
+    place_id: str
+    text: str
+    text_normalized: str
+    rating: int
+    review_time: str
+    # Called ``text_language`` on ``NormalizedReview``.
+    language: str
+    taxonomy_version: str
+    # Classification results; nullable.
+    # presumably ``None`` until the review has been classified — TODO confirm
+    urt_primary: str | None
+    urt_secondary: list[str] | None
+    valence: ValenceType | None
+    intensity: IntensityType | None
+    comparative: ComparativeType | None
+    staff_mentions: list[str] | None
+    quotes: dict[str, str] | None
+    embedding: list[float] | None
+    trust_score: float | None
+    classification_model: str | None
+    classification_confidence: dict[str, float] | None
+    processed_at: str | None
+    created_at: str
+
+
+class ReviewSpan(TypedDict, total=False):
+    """A span record in the database.
+
+    Declared with ``total=False``: every key may be absent. Combines the
+    fields of ``ExtractedSpan`` with review identity and provenance columns.
+    """
+
+    id: int
+    span_id: str
+    business_id: str
+    place_id: str
+    source: str
+    review_id: str
+    review_version: int
+    span_index: int
+    span_text: str
+    span_start: int
+    span_end: int
+    profile: ProfileType
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    comparative: ComparativeType
+    # Nullable here, whereas ``ExtractedSpan`` declares these four without
+    # ``None``.
+    specificity: SpecificityType | None
+    actionability: ActionabilityType | None
+    temporal: TemporalType | None
+    evidence: EvidenceType | None
+    entity: str | None
+    entity_type: EntityTypeValue | None
+    entity_normalized: str | None
+    relation_type: RelationType | None
+    # References the related span by id; ``ExtractedSpan`` uses
+    # ``related_span_index`` instead.
+    related_span_id: str | None
+    causal_chain: list[CausalLink] | None
+    is_primary: bool
+    is_active: bool
+    review_time: str
+    confidence: ConfidenceType
+    usn: str
+    # Provenance of the classification that produced this span.
+    taxonomy_version: str
+    model_version: str
+    ingest_batch_id: str
+    created_at: str
+
+
+class Issue(TypedDict, total=False):
+    """An issue record in the database.
+
+    Declared with ``total=False``: every key may be absent.
+    """
+
+    id: int
+    issue_id: str
+    business_id: str
+    place_id: str
+    primary_subcode: str
+    domain: str
+    # One of "open", "resolved", "ignored", "merged".
+    state: IssueState
+    priority_score: float
+    confidence_score: float
+    span_count: int
+    max_intensity: IntensityType
+    entity: str | None
+    entity_normalized: str | None
+    taxonomy_version: str
+    created_at: str
+    updated_at: str
+
+
+class IssueSpan(TypedDict):
+    """A link between an issue and a span.
+
+    Unlike the other database entity types in this section, all keys are
+    required.
+    """
+
+    id: int
+    issue_id: str
+    span_id: str
+    # Review identity triple (source, review_id, review_version).
+    source: str
+    review_id: str
+    review_version: int
+    is_primary_match: bool
+    intensity: IntensityType
+    review_time: str
+    created_at: str
+
+
+class IssueEvent(TypedDict, total=False):
+    """An event in the issue audit log.
+
+    Declared with ``total=False``: every key may be absent.
+    """
+
+    id: int
+    issue_id: str
+    # NOTE(review): free-form ``str``; the set of event types is not defined
+    # in this module.
+    event_type: str
+    span_id: str | None
+    old_value: str | None
+    new_value: str | None
+    metadata: dict[str, Any] | None
+    created_at: str
+
+
+class FactTimeseries(TypedDict, total=False):
+    """A fact time series record in the database.
+
+    Declared with ``total=False``: every key may be absent. Same key and
+    metric fields as ``FactRecord``, plus ``id``, ``computed_at`` and
+    ``created_at``.
+    """
+
+    id: int
+    # Key columns
+    business_id: str
+    place_id: str
+    period_date: str
+    bucket_type: BucketType
+    subject_type: SubjectType
+    subject_id: str
+    taxonomy_version: str
+    # Metric columns
+    review_count: int
+    span_count: int
+    negative_count: int
+    positive_count: int
+    neutral_count: int
+    mixed_count: int
+    strength_score: float
+    negative_strength: float
+    positive_strength: float
+    avg_rating: float | None
+    i1_count: int
+    i2_count: int
+    i3_count: int
+    cr_better: int
+    cr_worse: int
+    cr_same: int
+    trust_weighted_strength: float
+    trust_weighted_negative: float
+    # Bookkeeping timestamps
+    computed_at: str
+    created_at: str
+
+
+# =============================================================================
+# LLM Response Types
+# =============================================================================
+
+
+class LLMSpanResponse(TypedDict, total=False):
+    """A span in the LLM response.
+
+    Declared with ``total=False``: every key may be absent. Compared with
+    ``ExtractedSpan`` this has no ``span_id``, ``profile``,
+    ``entity_normalized`` or ``causal_chain``.
+    """
+
+    span_index: int
+    span_text: str
+    span_start: int
+    span_end: int
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    specificity: SpecificityType
+    actionability: ActionabilityType
+    temporal: TemporalType
+    evidence: EvidenceType
+    comparative: ComparativeType
+    is_primary: bool
+    confidence: ConfidenceType
+    entity: str | None
+    entity_type: EntityTypeValue | None
+    relation_type: RelationType | None
+    related_span_index: int | None
+    usn: str
+
+
+class LLMReviewSummary(TypedDict):
+    """Review summary in the LLM response; all keys are required."""
+
+    dominant_valence: ValenceType
+    dominant_domain: str
+    span_count: int
+    has_comparative: bool
+    has_entity: bool
+
+
+class LLMClassificationResponse(TypedDict):
+    """The full LLM classification response: the spans plus a review-level summary."""
+
+    spans: list[LLMSpanResponse]
+    review_summary: LLMReviewSummary
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/init.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/init.py
@@ -0,0 +1,17 @@
+"""Database layer for pipeline operations."""
+
+from reviewiq_pipeline.db.connection import DatabasePool
+from reviewiq_pipeline.db.repositories import (
+    FactRepository,
+    IssueRepository,
+    ReviewRepository,
+    SpanRepository,
+)
+
+__all__ = [  # names imported above and re-exported by the db package
+    "DatabasePool",
+    "ReviewRepository",
+    "SpanRepository",
+    "IssueRepository",
+    "FactRepository",
+]
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/connection.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/connection.py
@@ -0,0 +1,157 @@
+"""Database connection management using asyncpg."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, AsyncGenerator
+
+import asyncpg
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class DatabasePool:
+    """Lazily created asyncpg connection pool with query and migration helpers."""
+
+    def __init__(self, config: Config):
+        """Store ``config``; no connection is opened until ``initialize()`` runs."""
+        self.config = config
+        self._pool: asyncpg.Pool | None = None
+        self._lock = asyncio.Lock()  # guards pool creation and teardown
+
+    async def initialize(self) -> None:
+        """Create the pool from the config settings; no-op if it already exists."""
+        async with self._lock:
+            if self._pool is not None:
+                return
+
+            logger.info("Creating database connection pool...")
+            self._pool = await asyncpg.create_pool(
+                self.config.database_url,
+                min_size=self.config.db_pool_min_size,
+                max_size=self.config.db_pool_max_size,
+                command_timeout=60,
+            )
+            logger.info("Database pool created successfully")
+
+    async def close(self) -> None:
+        """Close the pool if one is open; does nothing otherwise."""
+        async with self._lock:
+            if self._pool is not None:
+                await self._pool.close()
+                self._pool = None
+                logger.info("Database pool closed")
+
+    @property
+    def pool(self) -> asyncpg.Pool:
+        """Return the live pool; raise RuntimeError if it is not initialized."""
+        if self._pool is None:
+            raise RuntimeError("Database pool not initialized. Call initialize() first.")
+        return self._pool
+
+    @asynccontextmanager
+    async def acquire(self) -> AsyncGenerator[asyncpg.Connection, None]:
+        """Yield a connection borrowed from the pool for the ``async with`` body."""
+        async with self.pool.acquire() as conn:
+            yield conn
+
+    @asynccontextmanager
+    async def transaction(self) -> AsyncGenerator[asyncpg.Connection, None]:
+        """Yield a pooled connection with a transaction open around the body."""
+        async with self.pool.acquire() as conn:
+            async with conn.transaction():
+                yield conn
+
+    async def execute(self, query: str, *args: Any) -> str:
+        """Execute a query and return the status string."""
+        async with self.acquire() as conn:
+            return await conn.execute(query, *args)
+
+    async def executemany(self, query: str, args: list[tuple]) -> None:
+        """Execute a query once per argument tuple in ``args``."""
+        async with self.acquire() as conn:
+            await conn.executemany(query, args)
+
+    async def fetch(self, query: str, *args: Any) -> list[asyncpg.Record]:
+        """Run a query and return all result rows."""
+        async with self.acquire() as conn:
+            return await conn.fetch(query, *args)
+
+    async def fetchrow(self, query: str, *args: Any) -> asyncpg.Record | None:
+        """Run a query and return its first row, or None if there is none."""
+        async with self.acquire() as conn:
+            return await conn.fetchrow(query, *args)
+
+    async def fetchval(self, query: str, *args: Any) -> Any:
+        """Run a query and return a single value from its first row."""
+        async with self.acquire() as conn:
+            return await conn.fetchval(query, *args)
+
+    async def run_migrations(self, migrations_path: str | None = None) -> int:
+        """Apply every ``*.sql`` migration not yet recorded, in filename order.
+
+        All pending files run in one transaction, so a failing migration rolls
+        back the whole batch before its exception propagates to the caller.
+
+        Args:
+            migrations_path: Path to migrations directory. Uses config default if None.
+
+        Returns:
+            Number of migrations run (0 if the directory does not exist).
+        """
+        path = Path(migrations_path or self.config.effective_migrations_path)
+        if not path.exists():
+            logger.warning("Migrations path does not exist: %s", path)
+            return 0
+
+        async with self.transaction() as conn:
+            # Create migrations tracking table
+            await conn.execute("""
+                CREATE TABLE IF NOT EXISTS _migrations (
+                    id SERIAL PRIMARY KEY,
+                    filename VARCHAR(255) UNIQUE NOT NULL,
+                    applied_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
+                )
+            """)
+
+            # Filenames already recorded here are skipped below
+            applied = await conn.fetch("SELECT filename FROM _migrations")
+            applied_set = {r["filename"] for r in applied}
+            migrations_run = 0
+
+            for migration_file in sorted(path.glob("*.sql")):
+                filename = migration_file.name
+                if filename in applied_set:
+                    continue
+
+                logger.info("Running migration: %s", filename)
+                # Explicit UTF-8: migrations hold non-ASCII literals such as 'V±'
+                sql = migration_file.read_text(encoding="utf-8")
+
+                try:
+                    await conn.execute(sql)
+                    await conn.execute(
+                        "INSERT INTO _migrations (filename) VALUES ($1)", filename
+                    )
+                    migrations_run += 1
+                    logger.info("Migration %s applied successfully", filename)
+                except Exception as e:
+                    logger.error("Migration %s failed: %s", filename, e)
+                    raise
+
+            logger.info("Ran %s migrations", migrations_run)
+            return migrations_run
+
+    async def check_connection(self) -> bool:
+        """Return True if ``SELECT 1`` succeeds; log and return False on any error."""
+        try:
+            return await self.fetchval("SELECT 1") == 1
+        except Exception as e:
+            logger.error("Database connection check failed: %s", e)
+            return False
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/001_create_reviews_tables.sql
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/001_create_reviews_tables.sql
@@ -0,0 +1,80 @@
+-- Migration: 001_create_reviews_tables.sql
+-- Purpose: Create the core reviews tables for Stage 1 normalization
+
+-- Raw reviews table (audit log of reviews as scraped; one row per source/review_id/review_version)
+CREATE TABLE IF NOT EXISTS reviews_raw (
+    id BIGSERIAL PRIMARY KEY,
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+    raw_payload JSONB NOT NULL DEFAULT '{}',
+    review_text TEXT,  -- nullable here; reviews_enriched.text below is NOT NULL
+    rating SMALLINT NOT NULL CHECK (rating BETWEEN 1 AND 5),
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+    reviewer_name VARCHAR(255) NOT NULL,
+    reviewer_id VARCHAR(255),
+    review_version INTEGER NOT NULL DEFAULT 1,
+    pulled_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    CONSTRAINT reviews_raw_unique UNIQUE (source, review_id, review_version)
+);
+
+-- Indexes for reviews_raw (single-column lookups by place and by time)
+CREATE INDEX IF NOT EXISTS idx_reviews_raw_place_id ON reviews_raw(place_id);
+CREATE INDEX IF NOT EXISTS idx_reviews_raw_review_time ON reviews_raw(review_time);
+CREATE INDEX IF NOT EXISTS idx_reviews_raw_pulled_at ON reviews_raw(pulled_at);
+
+-- Enriched reviews table (mutable, updated by classification)
+CREATE TABLE IF NOT EXISTS reviews_enriched (
+    id BIGSERIAL PRIMARY KEY,
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    review_version INTEGER NOT NULL DEFAULT 1,
+    is_latest BOOLEAN NOT NULL DEFAULT TRUE,
+    raw_id BIGINT REFERENCES reviews_raw(id),  -- nullable link back to the raw row
+
+    -- Tenant context
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+
+    -- Content
+    text TEXT NOT NULL,
+    text_normalized TEXT NOT NULL,
+    rating SMALLINT NOT NULL CHECK (rating BETWEEN 1 AND 5),
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+
+    -- Normalization fields
+    language VARCHAR(10) NOT NULL DEFAULT 'en',
+    taxonomy_version VARCHAR(20) NOT NULL DEFAULT 'v5.1',
+
+    -- Classification fields (NULL until Stage 2)
+    urt_primary VARCHAR(10),
+    urt_secondary VARCHAR(10)[] DEFAULT '{}',  -- '{}' is the empty-array literal
+    valence VARCHAR(5),
+    intensity VARCHAR(5),
+    comparative VARCHAR(10),
+    staff_mentions VARCHAR(255)[] DEFAULT '{}',
+    quotes JSONB DEFAULT '{}',  -- for JSONB, '{}' is an empty object
+    embedding REAL[] DEFAULT '{}',
+    trust_score REAL,
+    classification_model VARCHAR(100),
+    classification_confidence JSONB DEFAULT '{}',
+    processed_at TIMESTAMP WITH TIME ZONE,
+
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    CONSTRAINT reviews_enriched_unique UNIQUE (source, review_id, review_version)
+);
+
+-- Indexes for reviews_enriched (the partial "unclassified" index covers urt_primary IS NULL on latest rows)
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_business_id ON reviews_enriched(business_id);
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_place_id ON reviews_enriched(place_id);
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_review_time ON reviews_enriched(review_time);
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_urt_primary ON reviews_enriched(urt_primary) WHERE urt_primary IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_unclassified ON reviews_enriched(review_time DESC) WHERE urt_primary IS NULL AND is_latest = TRUE;
+CREATE INDEX IF NOT EXISTS idx_reviews_enriched_valence ON reviews_enriched(valence) WHERE valence IS NOT NULL;
+
+-- Table descriptions stored in the database catalog
+COMMENT ON TABLE reviews_raw IS 'Immutable raw review data as scraped from source';
+COMMENT ON TABLE reviews_enriched IS 'Enriched reviews with normalization and classification';
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/002_create_spans_table.sql
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/002_create_spans_table.sql
@@ -0,0 +1,84 @@
+-- Migration: 002_create_spans_table.sql
+-- Purpose: Create the review_spans table for Stage 2 classification output
+
+CREATE TABLE IF NOT EXISTS review_spans (
+    id BIGSERIAL PRIMARY KEY,
+    span_id VARCHAR(50) NOT NULL UNIQUE,
+
+    -- Context
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    review_version INTEGER NOT NULL DEFAULT 1,
+
+    -- Position
+    span_index INTEGER NOT NULL CHECK (span_index >= 0),
+    span_text TEXT NOT NULL,
+    span_start INTEGER NOT NULL CHECK (span_start >= 0),
+    span_end INTEGER NOT NULL CHECK (span_end > span_start),  -- strictly after start, so never an empty range
+
+    -- Classification profile
+    profile VARCHAR(20) NOT NULL DEFAULT 'standard',
+
+    -- Core URT classification
+    urt_primary VARCHAR(10) NOT NULL,
+    urt_secondary VARCHAR(10)[] DEFAULT '{}',
+    valence VARCHAR(5) NOT NULL,
+    intensity VARCHAR(5) NOT NULL,
+    comparative VARCHAR(10) NOT NULL DEFAULT 'CR-N',
+
+    -- Extended classification (standard/full profile)
+    specificity VARCHAR(5),
+    actionability VARCHAR(5),
+    temporal VARCHAR(5),
+    evidence VARCHAR(5),
+
+    -- Entity extraction
+    entity VARCHAR(255),
+    entity_type VARCHAR(20),
+    entity_normalized VARCHAR(255),
+
+    -- Causal relations (full profile)
+    relation_type VARCHAR(20),
+    related_span_id VARCHAR(50),  -- plain column: no foreign key back to span_id
+    causal_chain JSONB,
+
+    -- Flags
+    is_primary BOOLEAN NOT NULL DEFAULT FALSE,
+    is_active BOOLEAN NOT NULL DEFAULT TRUE,
+
+    -- Time reference
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+
+    -- Metadata
+    confidence VARCHAR(10) NOT NULL DEFAULT 'medium',
+    usn VARCHAR(100) NOT NULL,
+    taxonomy_version VARCHAR(20) NOT NULL,
+    model_version VARCHAR(100) NOT NULL,
+    ingest_batch_id VARCHAR(50) NOT NULL,
+
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    -- Composite foreign key to reviews_enriched (targets its reviews_enriched_unique columns)
+    CONSTRAINT fk_review FOREIGN KEY (source, review_id, review_version)
+        REFERENCES reviews_enriched(source, review_id, review_version)
+);
+
+-- Indexes for review_spans
+CREATE INDEX IF NOT EXISTS idx_spans_business_id ON review_spans(business_id);
+CREATE INDEX IF NOT EXISTS idx_spans_place_id ON review_spans(place_id);
+CREATE INDEX IF NOT EXISTS idx_spans_review_time ON review_spans(review_time);
+CREATE INDEX IF NOT EXISTS idx_spans_urt_primary ON review_spans(urt_primary);
+CREATE INDEX IF NOT EXISTS idx_spans_valence ON review_spans(valence);
+CREATE INDEX IF NOT EXISTS idx_spans_intensity ON review_spans(intensity);
+CREATE INDEX IF NOT EXISTS idx_spans_is_active ON review_spans(is_active) WHERE is_active = TRUE;
+CREATE INDEX IF NOT EXISTS idx_spans_is_primary ON review_spans(is_primary) WHERE is_primary = TRUE;
+CREATE INDEX IF NOT EXISTS idx_spans_entity_normalized ON review_spans(entity_normalized) WHERE entity_normalized IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_spans_batch ON review_spans(ingest_batch_id);
+
+-- Index for active negative/mixed spans (Stage 3 query); routing state is not part of the predicate
+CREATE INDEX IF NOT EXISTS idx_spans_unrouted_negative ON review_spans(review_time DESC)
+    WHERE is_active = TRUE AND valence IN ('V-', 'V±');
+
+COMMENT ON TABLE review_spans IS 'Extracted semantic spans with URT classification from reviews';
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/003_create_urt_enums.sql
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/003_create_urt_enums.sql
@@ -0,0 +1,111 @@
+-- Migration: 003_create_urt_enums.sql
+-- Purpose: Create enum types and lookup tables for URT taxonomy
+-- NOTE(review): migrations 001/002/004 store these codes in VARCHAR columns, not in these enum types
+-- Valence enum (each DO block swallows duplicate_object so the migration can be re-run)
+DO $$ BEGIN
+    CREATE TYPE valence_type AS ENUM ('V+', 'V-', 'V0', 'V±');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Intensity enum
+DO $$ BEGIN
+    CREATE TYPE intensity_type AS ENUM ('I1', 'I2', 'I3');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Specificity enum
+DO $$ BEGIN
+    CREATE TYPE specificity_type AS ENUM ('S1', 'S2', 'S3');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Actionability enum
+DO $$ BEGIN
+    CREATE TYPE actionability_type AS ENUM ('A1', 'A2', 'A3');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Temporal enum
+DO $$ BEGIN
+    CREATE TYPE temporal_type AS ENUM ('TC', 'TR', 'TH', 'TF');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Evidence enum
+DO $$ BEGIN
+    CREATE TYPE evidence_type AS ENUM ('ES', 'EI', 'EC');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Comparative enum
+DO $$ BEGIN
+    CREATE TYPE comparative_type AS ENUM ('CR-N', 'CR-B', 'CR-W', 'CR-S');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- URT Domain lookup table (seeded below; ON CONFLICT DO NOTHING keeps the seed re-runnable)
+CREATE TABLE IF NOT EXISTS urt_domains (
+    code CHAR(1) PRIMARY KEY,
+    name VARCHAR(50) NOT NULL,
+    description TEXT
+);
+
+INSERT INTO urt_domains (code, name, description) VALUES
+    ('O', 'Offering', 'Product/service quality, features, variety'),
+    ('P', 'Price', 'Value, pricing, promotions, payment'),
+    ('J', 'Journey', 'Timing, process, convenience, accessibility'),
+    ('E', 'Environment', 'Physical space, ambiance, cleanliness, digital UX'),
+    ('A', 'Attitude', 'Staff behavior, helpfulness, professionalism'),
+    ('V', 'Voice', 'Brand, communication, marketing, transparency'),
+    ('R', 'Relationship', 'Loyalty, trust, consistency, personalization')
+ON CONFLICT (code) DO NOTHING;
+
+-- URT Tier-2 categories lookup table (each category references its parent domain)
+CREATE TABLE IF NOT EXISTS urt_categories (
+    code VARCHAR(5) PRIMARY KEY,
+    domain_code CHAR(1) NOT NULL REFERENCES urt_domains(code),
+    name VARCHAR(100) NOT NULL,
+    description TEXT
+);
+
+-- Insert standard Tier-2 categories (four per domain; description is left NULL)
+INSERT INTO urt_categories (code, domain_code, name) VALUES
+    ('O1', 'O', 'Core Product/Service'),
+    ('O2', 'O', 'Product Features'),
+    ('O3', 'O', 'Variety & Selection'),
+    ('O4', 'O', 'Customization'),
+    ('P1', 'P', 'Value Perception'),
+    ('P2', 'P', 'Pricing Structure'),
+    ('P3', 'P', 'Promotions & Deals'),
+    ('P4', 'P', 'Payment Process'),
+    ('J1', 'J', 'Wait Times'),
+    ('J2', 'J', 'Booking & Reservations'),
+    ('J3', 'J', 'Navigation & Convenience'),
+    ('J4', 'J', 'Accessibility'),
+    ('E1', 'E', 'Physical Environment'),
+    ('E2', 'E', 'Ambiance & Atmosphere'),
+    ('E3', 'E', 'Cleanliness'),
+    ('E4', 'E', 'Digital Experience'),
+    ('A1', 'A', 'Friendliness'),
+    ('A2', 'A', 'Helpfulness'),
+    ('A3', 'A', 'Professionalism'),
+    ('A4', 'A', 'Knowledge & Expertise'),
+    ('V1', 'V', 'Brand Identity'),
+    ('V2', 'V', 'Communication'),
+    ('V3', 'V', 'Marketing'),
+    ('V4', 'V', 'Transparency'),
+    ('R1', 'R', 'Loyalty'),
+    ('R2', 'R', 'Trust'),
+    ('R3', 'R', 'Consistency'),
+    ('R4', 'R', 'Personalization')
+ON CONFLICT (code) DO NOTHING;
+
+COMMENT ON TABLE urt_domains IS 'URT v5.1 top-level domains';
+COMMENT ON TABLE urt_categories IS 'URT v5.1 Tier-2 categories';
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/004_create_issues_tables.sql
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/004_create_issues_tables.sql
@@ -0,0 +1,96 @@
+-- Migration: 004_create_issues_tables.sql
+-- Purpose: Create tables for Stage 3 issue routing
+
+-- Issue state enum (duplicate_object is swallowed so the migration can be re-run)
+DO $$ BEGIN
+    CREATE TYPE issue_state AS ENUM ('open', 'resolved', 'ignored', 'merged');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Issues table
+CREATE TABLE IF NOT EXISTS issues (
+    id BIGSERIAL PRIMARY KEY,
+    issue_id VARCHAR(50) NOT NULL UNIQUE,
+
+    -- Context
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,
+
+    -- Classification
+    primary_subcode VARCHAR(10) NOT NULL,
+    domain CHAR(1) NOT NULL,
+
+    -- State
+    state issue_state NOT NULL DEFAULT 'open',
+    priority_score REAL NOT NULL DEFAULT 1.0,
+    confidence_score REAL NOT NULL DEFAULT 1.0,
+
+    -- Aggregates
+    span_count INTEGER NOT NULL DEFAULT 1,
+    max_intensity VARCHAR(5) NOT NULL DEFAULT 'I1',
+
+    -- Entity (optional - for entity-specific issues)
+    entity VARCHAR(255),
+    entity_normalized VARCHAR(255),
+
+    -- Metadata
+    taxonomy_version VARCHAR(20) NOT NULL,
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+    updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+);
+
+-- Issue-span links (many-to-one: span_id is UNIQUE, so a span links to at most one issue; an issue may have many spans)
+CREATE TABLE IF NOT EXISTS issue_spans (
+    id BIGSERIAL PRIMARY KEY,
+    issue_id VARCHAR(50) NOT NULL REFERENCES issues(issue_id),
+    span_id VARCHAR(50) NOT NULL UNIQUE,  -- no foreign key to review_spans(span_id)
+
+    -- Review reference
+    source VARCHAR(20) NOT NULL DEFAULT 'google',
+    review_id VARCHAR(255) NOT NULL,
+    review_version INTEGER NOT NULL DEFAULT 1,
+
+    -- Match info
+    is_primary_match BOOLEAN NOT NULL DEFAULT TRUE,
+    intensity VARCHAR(5) NOT NULL,
+    review_time TIMESTAMP WITH TIME ZONE NOT NULL,
+
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+);
+
+-- Issue events (audit log)
+CREATE TABLE IF NOT EXISTS issue_events (
+    id BIGSERIAL PRIMARY KEY,
+    issue_id VARCHAR(50) NOT NULL REFERENCES issues(issue_id),
+    event_type VARCHAR(50) NOT NULL,
+    span_id VARCHAR(50),  -- optional; no foreign key
+    old_value TEXT,
+    new_value TEXT,
+    metadata JSONB,
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+);
+
+-- Indexes for issues (idx_issues_priority is partial: open issues only)
+CREATE INDEX IF NOT EXISTS idx_issues_business_id ON issues(business_id);
+CREATE INDEX IF NOT EXISTS idx_issues_place_id ON issues(place_id);
+CREATE INDEX IF NOT EXISTS idx_issues_state ON issues(state);
+CREATE INDEX IF NOT EXISTS idx_issues_primary_subcode ON issues(primary_subcode);
+CREATE INDEX IF NOT EXISTS idx_issues_domain ON issues(domain);
+CREATE INDEX IF NOT EXISTS idx_issues_entity_normalized ON issues(entity_normalized) WHERE entity_normalized IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_issues_priority ON issues(priority_score DESC) WHERE state = 'open';
+CREATE INDEX IF NOT EXISTS idx_issues_created ON issues(created_at);
+CREATE INDEX IF NOT EXISTS idx_issues_updated ON issues(updated_at);
+
+-- Indexes for issue_spans
+CREATE INDEX IF NOT EXISTS idx_issue_spans_issue_id ON issue_spans(issue_id);
+CREATE INDEX IF NOT EXISTS idx_issue_spans_review_time ON issue_spans(review_time);
+
+-- Indexes for issue_events
+CREATE INDEX IF NOT EXISTS idx_issue_events_issue_id ON issue_events(issue_id);
+CREATE INDEX IF NOT EXISTS idx_issue_events_created ON issue_events(created_at);
+CREATE INDEX IF NOT EXISTS idx_issue_events_type ON issue_events(event_type);
+
+COMMENT ON TABLE issues IS 'Aggregated issues derived from negative/mixed spans';
+COMMENT ON TABLE issue_spans IS 'Links between issues and their source spans';
+COMMENT ON TABLE issue_events IS 'Audit log for issue state changes';
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/005_create_facts_table.sql
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/005_create_facts_table.sql
@@ -0,0 +1,97 @@
+-- Migration: 005_create_facts_table.sql
+-- Purpose: Create the fact_timeseries table for Stage 4 aggregation
+
+-- Subject type enum (duplicate_object is swallowed so the migration can be re-run)
+DO $$ BEGIN
+    CREATE TYPE subject_type AS ENUM ('overall', 'urt_code', 'domain', 'issue');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Bucket type enum
+DO $$ BEGIN
+    CREATE TYPE bucket_type AS ENUM ('day', 'week', 'month');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END $$;
+
+-- Fact timeseries table
+CREATE TABLE IF NOT EXISTS fact_timeseries (
+    id BIGSERIAL PRIMARY KEY,
+
+    -- Dimension keys
+    business_id VARCHAR(255) NOT NULL,
+    place_id VARCHAR(255) NOT NULL,  -- Or 'ALL' for rollup
+    period_date DATE NOT NULL,
+    bucket_type bucket_type NOT NULL DEFAULT 'day',
+    subject_type subject_type NOT NULL DEFAULT 'urt_code',
+    subject_id VARCHAR(50) NOT NULL,  -- URT code, domain letter, or issue_id
+    taxonomy_version VARCHAR(20) NOT NULL,
+
+    -- Core counts
+    review_count INTEGER NOT NULL DEFAULT 0,
+    span_count INTEGER NOT NULL DEFAULT 0,
+
+    -- Valence counts
+    negative_count INTEGER NOT NULL DEFAULT 0,
+    positive_count INTEGER NOT NULL DEFAULT 0,
+    neutral_count INTEGER NOT NULL DEFAULT 0,
+    mixed_count INTEGER NOT NULL DEFAULT 0,
+
+    -- Strength scores
+    strength_score REAL NOT NULL DEFAULT 0.0,
+    negative_strength REAL NOT NULL DEFAULT 0.0,
+    positive_strength REAL NOT NULL DEFAULT 0.0,
+
+    -- Rating (the only nullable metric column)
+    avg_rating REAL,
+
+    -- Intensity counts
+    i1_count INTEGER NOT NULL DEFAULT 0,
+    i2_count INTEGER NOT NULL DEFAULT 0,
+    i3_count INTEGER NOT NULL DEFAULT 0,
+
+    -- Comparative counts
+    cr_better INTEGER NOT NULL DEFAULT 0,
+    cr_worse INTEGER NOT NULL DEFAULT 0,
+    cr_same INTEGER NOT NULL DEFAULT 0,
+
+    -- Trust-weighted metrics
+    trust_weighted_strength REAL NOT NULL DEFAULT 0.0,
+    trust_weighted_negative REAL NOT NULL DEFAULT 0.0,
+
+    -- Metadata
+    computed_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+
+    -- Unique constraint for upsert (one row per full dimension key)
+    CONSTRAINT fact_timeseries_unique UNIQUE (
+        business_id, place_id, period_date, bucket_type,
+        subject_type, subject_id, taxonomy_version
+    )
+);
+
+-- Indexes for fact_timeseries (one per dimension column)
+CREATE INDEX IF NOT EXISTS idx_facts_business_id ON fact_timeseries(business_id);
+CREATE INDEX IF NOT EXISTS idx_facts_place_id ON fact_timeseries(place_id);
+CREATE INDEX IF NOT EXISTS idx_facts_period ON fact_timeseries(period_date);
+CREATE INDEX IF NOT EXISTS idx_facts_bucket ON fact_timeseries(bucket_type);
+CREATE INDEX IF NOT EXISTS idx_facts_subject_type ON fact_timeseries(subject_type);
+CREATE INDEX IF NOT EXISTS idx_facts_subject_id ON fact_timeseries(subject_id);
+
+-- Composite index for common dashboard queries
+CREATE INDEX IF NOT EXISTS idx_facts_dashboard ON fact_timeseries(
+    business_id, place_id, bucket_type, period_date DESC
+);
+
+-- Index for specific code trends (partial: urt_code rows only)
+CREATE INDEX IF NOT EXISTS idx_facts_code_trend ON fact_timeseries(
+    business_id, subject_id, bucket_type, period_date DESC
+) WHERE subject_type = 'urt_code';
+
+-- Index for domain aggregates (partial: domain rows only)
+CREATE INDEX IF NOT EXISTS idx_facts_domain ON fact_timeseries(
+    business_id, subject_id, bucket_type, period_date DESC
+) WHERE subject_type = 'domain';
+
+COMMENT ON TABLE fact_timeseries IS 'Pre-aggregated time series facts for dashboard queries';
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/repositories.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/repositories.py
@@ -0,0 +1,562 @@
+"""Data access layer for pipeline operations."""
+
+from __future__ import annotations
+
+import json
+import logging
+from datetime import date, datetime
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.contracts import (
+        ClassifiedReview,
+        ExtractedSpan,
+        FactRecord,
+        NormalizedReview,
+        RawReview,
+        RoutedSpan,
+    )
+    from reviewiq_pipeline.db.connection import DatabasePool
+
+logger = logging.getLogger(__name__)
+
+
+class ReviewRepository:
+    """Data access for the reviews_raw and reviews_enriched tables."""
+
+    def __init__(self, db: DatabasePool):
+        self.db = db
+
+    async def insert_raw_review(
+        self,
+        review: RawReview,
+        place_id: str,
+        source: str = "google",
+    ) -> int:
+        """Upsert a raw review (always as version 1) and return its row ID.
+
+        If the (source, review_id, review_version) row already exists, only
+        its ``pulled_at`` timestamp is refreshed.
+        """
+        query = """
+            INSERT INTO reviews_raw (
+                source, review_id, place_id, raw_payload,
+                review_text, rating, review_time, reviewer_name, reviewer_id,
+                review_version, pulled_at
+            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, NOW())
+            ON CONFLICT (source, review_id, review_version) DO UPDATE SET
+                pulled_at = NOW()
+            RETURNING id
+        """
+        # NOTE(review): review_time feeds a TIMESTAMPTZ column; confirm callers
+        # pass a datetime, since asyncpg does not parse ISO strings itself.
+        return await self.db.fetchval(
+            query,
+            source,
+            review["review_id"],
+            place_id,
+            json.dumps(review.get("raw_payload", {})),
+            review.get("text"),
+            review["rating"],
+            review["review_time"],
+            review["author_name"],
+            review.get("author_id"),
+            1,  # Initial version
+        )
+
+    async def insert_enriched_review(self, review: NormalizedReview, raw_id: int) -> int:
+        """Upsert the pre-classification reviews_enriched row; return its row ID.
+
+        The row is written with ``is_latest = TRUE`` and taxonomy version
+        "v5.1"; on conflict only ``is_latest`` is overwritten.
+        """
+        query = """
+            INSERT INTO reviews_enriched (
+                source, review_id, review_version, is_latest, raw_id,
+                business_id, place_id, text, text_normalized, rating, review_time,
+                language, taxonomy_version
+            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
+            ON CONFLICT (source, review_id, review_version) DO UPDATE SET
+                is_latest = EXCLUDED.is_latest
+            RETURNING id
+        """
+        return await self.db.fetchval(
+            query,
+            review["source"],
+            review["review_id"],
+            review["review_version"],
+            True,  # is_latest
+            raw_id,
+            review["business_id"],
+            review["place_id"],
+            review["text"],
+            review["text_normalized"],
+            review["rating"],
+            review["review_time"],
+            review["text_language"],
+            "v5.1",  # taxonomy_version - will be updated by Stage 2
+        )
+
+    async def update_enriched_with_classification(
+        self,
+        classified: ClassifiedReview,
+        model_version: str,
+        taxonomy_version: str,
+    ) -> None:
+        """Write classification results onto the matching reviews_enriched row.
+
+        The row is located by (source, review_id, review_version); optional keys
+        missing from ``classified`` fall back to the defaults passed below.
+        """
+        query = """
+            UPDATE reviews_enriched SET
+                urt_primary = $1,
+                urt_secondary = $2,
+                valence = $3,
+                intensity = $4,
+                comparative = $5,
+                staff_mentions = $6,
+                quotes = $7,
+                embedding = $8,
+                trust_score = $9,
+                classification_model = $10,
+                classification_confidence = $11,
+                taxonomy_version = $12,
+                processed_at = NOW()
+            WHERE source = $13
+              AND review_id = $14
+              AND review_version = $15
+        """
+        await self.db.execute(
+            query,
+            classified["urt_primary"],
+            classified.get("urt_secondary", []),
+            classified["valence"],
+            classified["intensity"],
+            classified.get("comparative", "CR-N"),
+            classified.get("staff_mentions", []),
+            json.dumps(classified.get("quotes", {})),
+            classified.get("embedding", []),
+            classified.get("trust_score", 0.5),
+            model_version,
+            json.dumps(classified.get("classification_confidence", {})),
+            taxonomy_version,
+            classified["source"],
+            classified["review_id"],
+            classified["review_version"],
+        )
+
+    async def get_unclassified_reviews(self, limit: int = 100) -> list[dict[str, Any]]:
+        """Return up to ``limit`` unclassified reviews as dicts, newest first.
+
+        "Unclassified" means ``urt_primary IS NULL`` on an ``is_latest`` row.
+        """
+        query = """
+            SELECT
+                source, review_id, review_version, business_id, place_id,
+                text, text_normalized, rating, review_time
+            FROM reviews_enriched
+            WHERE urt_primary IS NULL
+              AND is_latest = TRUE
+            ORDER BY review_time DESC
+            LIMIT $1
+        """
+        rows = await self.db.fetch(query, limit)
+        return [dict(r) for r in rows]
+
+    async def get_review_by_id(
+        self,
+        source: str,
+        review_id: str,
+        review_version: int,
+    ) -> dict[str, Any] | None:
+        """Return the reviews_enriched row for the composite key, or None if absent."""
+        query = """
+            SELECT * FROM reviews_enriched
+            WHERE source = $1 AND review_id = $2 AND review_version = $3
+        """
+        row = await self.db.fetchrow(query, source, review_id, review_version)
+        return dict(row) if row else None
+
+    async def check_duplicate(self, content_hash: str, business_id: str) -> str | None:
+        """Placeholder duplicate check: runs no query and always returns None.
+
+        Intended to return the dedup_group_id of an existing review with the
+        same ``content_hash``; both arguments are currently unused.
+        """
+        # TODO: back this with a dedicated dedup table indexed on content_hash.
+        return None
+
+
+class SpanRepository:
+    """Repository for span data operations."""
+
+    def __init__(self, db: DatabasePool):
+        self.db = db
+
+    async def insert_span(
+        self,
+        span: ExtractedSpan,
+        business_id: str,
+        place_id: str,
+        source: str,
+        review_id: str,
+        review_version: int,
+        review_time: str,
+        batch_id: str,
+        model_version: str,
+        taxonomy_version: str,
+    ) -> None:
+        """Insert a span into the database."""
+        query = """
+            INSERT INTO review_spans (
+                span_id, business_id, place_id, source, review_id, review_version,
+                span_index, span_text, span_start, span_end,
+                profile, urt_primary, urt_secondary, valence, intensity, comparative,
+                specificity, actionability, temporal, evidence,
+                entity, entity_type, entity_normalized,
+                relation_type, related_span_id, causal_chain,
+                is_primary, is_active, review_time,
+                confidence, usn, taxonomy_version, model_version, ingest_batch_id
+            ) VALUES (
+                $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
+                $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
+                $21, $22, $23, $24, $25, $26, $27, $28, $29, $30,
+                $31, $32, $33, $34
+            )
+            ON CONFLICT (span_id) DO UPDATE SET
+                is_active = EXCLUDED.is_active
+        """
+        # Build related_span_id from index if needed
+        related_span_id = None
+        if span.get("related_span_index") is not None:
+            # Build the related span_id (would need the actual span_id mapping)
+            pass
+
+        await self.db.execute(
+            query,
+            span["span_id"],
+            business_id,
+            place_id,
+            source,
+            review_id,
+            review_version,
+            span["span_index"],
+            span["span_text"],
+            span["span_start"],
+            span["span_end"],
+            span.get("profile", "standard"),
+            span["urt_primary"],
+            span.get("urt_secondary", []),
+            span["valence"],
+            span["intensity"],
+            span.get("comparative", "CR-N"),
+            span.get("specificity"),
+            span.get("actionability"),
+            span.get("temporal"),
+            span.get("evidence"),
+            span.get("entity"),
+            span.get("entity_type"),
+            span.get("entity_normalized"),
+            span.get("relation_type"),
+            related_span_id,
+            json.dumps(span.get("causal_chain")) if span.get("causal_chain") else None,
+            span.get("is_primary", False),
+            True,  # is_active
+            review_time,
+            span.get("confidence", "medium"),
+            span["usn"],
+            taxonomy_version,
+            model_version,
+            batch_id,
+        )
+
+    async def get_unrouted_negative_spans(
+        self,
+        limit: int = 100,
+    ) -> list[dict[str, Any]]:
+        """Get negative spans that haven't been routed to issues yet."""
+        query = """
+            SELECT
+                rs.span_id, rs.business_id, rs.place_id,
+                rs.urt_primary, rs.valence, rs.intensity,
+                rs.entity_normalized, rs.review_time, rs.confidence,
+                re.trust_score
+            FROM review_spans rs
+            JOIN reviews_enriched re ON (
+                re.source = rs.source
+                AND re.review_id = rs.review_id
+                AND re.review_version = rs.review_version
+            )
+            WHERE rs.is_active = TRUE
+              AND rs.valence IN ('V-', 'V±')
+              AND NOT EXISTS (
+                SELECT 1 FROM issue_spans iss WHERE iss.span_id = rs.span_id
+              )
+            ORDER BY rs.review_time DESC
+            LIMIT $1
+        """
+        rows = await self.db.fetch(query, limit)
+        return [dict(r) for r in rows]
+
+    async def get_span_by_id(self, span_id: str) -> dict[str, Any] | None:
+        """Get a span by its ID."""
+        query = "SELECT * FROM review_spans WHERE span_id = $1"
+        row = await self.db.fetchrow(query, span_id)
+        return dict(row) if row else None
+
+
+class IssueRepository:
+    """Repository for issue data operations."""
+
+    def __init__(self, db: DatabasePool):
+        self.db = db
+
+    async def upsert_issue(
+        self,
+        issue_id: str,
+        business_id: str,
+        place_id: str,
+        primary_subcode: str,
+        intensity: str,
+        entity: str | None,
+        entity_normalized: str | None,
+        taxonomy_version: str,
+    ) -> bool:
+        """Create or update an issue. Returns True if newly created."""
+        # First check if exists
+        existing = await self.db.fetchval(
+            "SELECT 1 FROM issues WHERE issue_id = $1",
+            issue_id,
+        )
+
+        if existing:
+            # Update
+            await self.db.execute(
+                """
+                UPDATE issues SET
+                    span_count = span_count + 1,
+                    max_intensity = CASE
+                        WHEN $1 = 'I3' THEN 'I3'
+                        WHEN $1 = 'I2' AND max_intensity != 'I3' THEN 'I2'
+                        ELSE max_intensity
+                    END,
+                    updated_at = NOW()
+                WHERE issue_id = $2
+                """,
+                intensity,
+                issue_id,
+            )
+            return False
+        else:
+            # Insert
+            domain = primary_subcode[0] if primary_subcode else "O"
+            await self.db.execute(
+                """
+                INSERT INTO issues (
+                    issue_id, business_id, place_id, primary_subcode, domain,
+                    state, priority_score, confidence_score, span_count, max_intensity,
+                    entity, entity_normalized, taxonomy_version
+                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
+                """,
+                issue_id,
+                business_id,
+                place_id,
+                primary_subcode,
+                domain,
+                "open",
+                1.0,  # Initial priority
+                1.0,  # Initial confidence
+                1,  # Initial span count
+                intensity,
+                entity,
+                entity_normalized,
+                taxonomy_version,
+            )
+            return True
+
+    async def link_span_to_issue(
+        self,
+        routed: RoutedSpan,
+        source: str,
+        review_id: str,
+        review_version: int,
+        intensity: str,
+        review_time: str,
+        is_primary_match: bool = True,
+    ) -> None:
+        """Link a span to an issue."""
+        await self.db.execute(
+            """
+            INSERT INTO issue_spans (
+                issue_id, span_id, source, review_id, review_version,
+                is_primary_match, intensity, review_time
+            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+            ON CONFLICT (span_id) DO NOTHING
+            """,
+            routed["issue_id"],
+            routed["span_id"],
+            source,
+            review_id,
+            review_version,
+            is_primary_match,
+            intensity,
+            review_time,
+        )
+
+    async def log_event(
+        self,
+        issue_id: str,
+        event_type: str,
+        span_id: str | None = None,
+        old_value: str | None = None,
+        new_value: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Log an issue event for audit trail."""
+        await self.db.execute(
+            """
+            INSERT INTO issue_events (
+                issue_id, event_type, span_id, old_value, new_value, metadata
+            ) VALUES ($1, $2, $3, $4, $5, $6)
+            """,
+            issue_id,
+            event_type,
+            span_id,
+            old_value,
+            new_value,
+            json.dumps(metadata) if metadata else None,
+        )
+
+    async def get_issue_by_id(self, issue_id: str) -> dict[str, Any] | None:
+        """Get an issue by its ID."""
+        query = "SELECT * FROM issues WHERE issue_id = $1"
+        row = await self.db.fetchrow(query, issue_id)
+        return dict(row) if row else None
+
+    async def check_span_already_linked(self, span_id: str) -> str | None:
+        """Check if a span is already linked to an issue."""
+        return await self.db.fetchval(
+            "SELECT issue_id FROM issue_spans WHERE span_id = $1",
+            span_id,
+        )
+
+
+class FactRepository:
+    """Repository for fact time series operations."""
+
+    def __init__(self, db: DatabasePool):
+        self.db = db
+
+    async def upsert_fact(self, fact: FactRecord) -> None:
+        """Insert or update a fact record."""
+        await self.db.execute(
+            """
+            INSERT INTO fact_timeseries (
+                business_id, place_id, period_date, bucket_type,
+                subject_type, subject_id, taxonomy_version,
+                review_count, span_count, negative_count, positive_count,
+                neutral_count, mixed_count, strength_score, negative_strength,
+                positive_strength, avg_rating, i1_count, i2_count, i3_count,
+                cr_better, cr_worse, cr_same,
+                trust_weighted_strength, trust_weighted_negative,
+                computed_at
+            ) VALUES (
+                $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
+                $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, NOW()
+            )
+            ON CONFLICT (
+                business_id, place_id, period_date, bucket_type,
+                subject_type, subject_id, taxonomy_version
+            ) DO UPDATE SET
+                review_count = EXCLUDED.review_count,
+                span_count = EXCLUDED.span_count,
+                negative_count = EXCLUDED.negative_count,
+                positive_count = EXCLUDED.positive_count,
+                neutral_count = EXCLUDED.neutral_count,
+                mixed_count = EXCLUDED.mixed_count,
+                strength_score = EXCLUDED.strength_score,
+                negative_strength = EXCLUDED.negative_strength,
+                positive_strength = EXCLUDED.positive_strength,
+                avg_rating = EXCLUDED.avg_rating,
+                i1_count = EXCLUDED.i1_count,
+                i2_count = EXCLUDED.i2_count,
+                i3_count = EXCLUDED.i3_count,
+                cr_better = EXCLUDED.cr_better,
+                cr_worse = EXCLUDED.cr_worse,
+                cr_same = EXCLUDED.cr_same,
+                trust_weighted_strength = EXCLUDED.trust_weighted_strength,
+                trust_weighted_negative = EXCLUDED.trust_weighted_negative,
+                computed_at = NOW()
+            """,
+            fact["business_id"],
+            fact["place_id"],
+            fact["period_date"],
+            fact["bucket_type"],
+            fact["subject_type"],
+            fact["subject_id"],
+            fact["taxonomy_version"],
+            fact["review_count"],
+            fact["span_count"],
+            fact["negative_count"],
+            fact["positive_count"],
+            fact["neutral_count"],
+            fact["mixed_count"],
+            fact["strength_score"],
+            fact["negative_strength"],
+            fact["positive_strength"],
+            fact.get("avg_rating"),
+            fact["i1_count"],
+            fact["i2_count"],
+            fact["i3_count"],
+            fact["cr_better"],
+            fact["cr_worse"],
+            fact["cr_same"],
+            fact["trust_weighted_strength"],
+            fact["trust_weighted_negative"],
+        )
+
+    async def get_aggregation_data(
+        self,
+        business_id: str,
+        start_date: date,
+        end_date: date,
+    ) -> list[dict[str, Any]]:
+        """Get span data for aggregation within a date range."""
+        query = """
+            SELECT
+                rs.business_id,
+                rs.place_id,
+                DATE(rs.review_time) as review_date,
+                rs.urt_primary,
+                rs.valence,
+                rs.intensity,
+                rs.comparative,
+                re.trust_score,
+                re.rating
+            FROM review_spans rs
+            JOIN reviews_enriched re ON (
+                re.source = rs.source
+                AND re.review_id = rs.review_id
+                AND re.review_version = rs.review_version
+            )
+            WHERE rs.business_id = $1
+              AND rs.is_active = TRUE
+              AND DATE(rs.review_time) BETWEEN $2 AND $3
+        """
+        rows = await self.db.fetch(query, business_id, start_date, end_date)
+        return [dict(r) for r in rows]
+
+    async def get_place_ids_for_business(
+        self,
+        business_id: str,
+    ) -> list[str]:
+        """Get all place IDs for a business."""
+        rows = await self.db.fetch(
+            """
+            SELECT DISTINCT place_id FROM reviews_enriched
+            WHERE business_id = $1
+            """,
+            business_id,
+        )
+        return [r["place_id"] for r in rows]
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
@@ -0,0 +1,402 @@
+"""
+Pipeline class - main public API for the ReviewIQ pipeline.
+
+Provides a unified interface for running pipeline stages.
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import date
+from typing import TYPE_CHECKING, Any
+
+from reviewiq_pipeline.config import Config
+from reviewiq_pipeline.contracts import (
+    ClassificationConfig,
+    NormalizedReview,
+    ReviewToClassify,
+    ScraperOutput,
+    SpanToRoute,
+    Stage1Input,
+    Stage1Output,
+    Stage2Input,
+    Stage2Output,
+    Stage3Input,
+    Stage3Output,
+    Stage4Input,
+    Stage4Output,
+    ValidationResult,
+)
+from reviewiq_pipeline.db.connection import DatabasePool
+from reviewiq_pipeline.db.repositories import (
+    FactRepository,
+    IssueRepository,
+    ReviewRepository,
+    SpanRepository,
+)
+from reviewiq_pipeline.services.embeddings import EmbeddingService
+from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
+from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
+from reviewiq_pipeline.stages.stage3_route import Stage3Router
+from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
+from reviewiq_pipeline.validation.validators import (
+    validate_stage1_output,
+    validate_stage2_output,
+    validate_stage3_output,
+    validate_stage4_output,
+)
+
+# No type-checking-only imports are needed yet; this guard is an empty
+# placeholder and has no runtime effect.
+if TYPE_CHECKING:
+    pass
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+class PipelineResult:
+    """Result from running the full pipeline."""
+
+    def __init__(
+        self,
+        stage1: Stage1Output | None = None,
+        stage2: Stage2Output | None = None,
+        stage3: Stage3Output | None = None,
+        stage4: Stage4Output | None = None,
+        validation: dict[str, ValidationResult] | None = None,
+    ):
+        self.stage1 = stage1
+        self.stage2 = stage2
+        self.stage3 = stage3
+        self.stage4 = stage4
+        self.validation = validation or {}
+
+    @property
+    def success(self) -> bool:
+        """Check if all ran stages passed validation."""
+        return all(v["passed"] for v in self.validation.values())
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "stage1": self.stage1,
+            "stage2": self.stage2,
+            "stage3": self.stage3,
+            "stage4": self.stage4,
+            "validation": self.validation,
+            "success": self.success,
+        }
+
+
+class Pipeline:
+    """
+    Main pipeline class for processing reviews.
+
+    Usage:
+        config = Config(database_url="...", llm_provider="openai", ...)
+        pipeline = Pipeline(config)
+
+        # Run full pipeline
+        result = await pipeline.process(scraper_output)
+
+        # Or run individual stages
+        stage1_result = await pipeline.normalize(scraper_output)
+        stage2_result = await pipeline.classify(stage1_result)
+    """
+
+    def __init__(self, config: Config):
+        """
+        Initialize the pipeline.
+
+        Args:
+            config: Pipeline configuration
+        """
+        self.config = config
+        self._db: DatabasePool | None = None
+        self._review_repo: ReviewRepository | None = None
+        self._span_repo: SpanRepository | None = None
+        self._issue_repo: IssueRepository | None = None
+        self._fact_repo: FactRepository | None = None
+        self._embedding_service: EmbeddingService | None = None
+        self._initialized = False
+
+    async def initialize(self) -> None:
+        """Initialize database connections and services."""
+        if self._initialized:
+            return
+
+        logger.info("Initializing pipeline...")
+
+        # Initialize database
+        self._db = DatabasePool(self.config)
+        await self._db.initialize()
+
+        # Initialize repositories
+        self._review_repo = ReviewRepository(self._db)
+        self._span_repo = SpanRepository(self._db)
+        self._issue_repo = IssueRepository(self._db)
+        self._fact_repo = FactRepository(self._db)
+
+        # Initialize embedding service
+        self._embedding_service = EmbeddingService(self.config)
+
+        self._initialized = True
+        logger.info("Pipeline initialized")
+
+    async def close(self) -> None:
+        """Close all connections and cleanup resources."""
+        if self._db:
+            await self._db.close()
+            self._db = None
+
+        self._initialized = False
+        logger.info("Pipeline closed")
+
+    async def migrate(self) -> int:
+        """
+        Run database migrations.
+
+        Returns:
+            Number of migrations run
+        """
+        if not self._db:
+            self._db = DatabasePool(self.config)
+            await self._db.initialize()
+
+        return await self._db.run_migrations()
+
+    async def process(
+        self,
+        scraper_output: ScraperOutput,
+        stages: list[int] | None = None,
+        validate: bool = True,
+    ) -> PipelineResult:
+        """
+        Run the full pipeline on scraper output.
+
+        Args:
+            scraper_output: Output from the scraper (Stage 0)
+            stages: List of stages to run (default: all [1, 2, 3, 4])
+            validate: Whether to validate each stage output
+
+        Returns:
+            PipelineResult with all stage outputs and validation results
+        """
+        await self.initialize()
+
+        stages = stages or [1, 2, 3, 4]
+        result = PipelineResult()
+        validation_results: dict[str, ValidationResult] = {}
+
+        # Stage 1: Normalize
+        if 1 in stages:
+            logger.info("Running Stage 1: Normalization")
+            result.stage1 = await self.normalize(scraper_output)
+
+            if validate:
+                validation_results["stage1"] = validate_stage1_output(result.stage1)
+
+        # Stage 2: Classify
+        if 2 in stages and result.stage1:
+            logger.info("Running Stage 2: Classification")
+            result.stage2 = await self.classify(result.stage1)
+
+            if validate:
+                # Build input reviews map for validation
+                input_reviews = {
+                    (r["source"], r["review_id"], r["review_version"]): r
+                    for r in result.stage1["reviews_normalized"]
+                }
+                validation_results["stage2"] = validate_stage2_output(
+                    result.stage2, input_reviews
+                )
+
+        # Stage 3: Route
+        if 3 in stages and result.stage2:
+            logger.info("Running Stage 3: Issue Routing")
+            result.stage3 = await self.route(result.stage2)
+
+            if validate:
+                validation_results["stage3"] = await validate_stage3_output(
+                    result.stage3, self._db
+                )
+
+        # Stage 4: Aggregate
+        if 4 in stages:
+            logger.info("Running Stage 4: Aggregation")
+            result.stage4 = await self.aggregate(
+                scraper_output["business_id"],
+                date.today().isoformat(),
+            )
+
+            if validate:
+                validation_results["stage4"] = validate_stage4_output(result.stage4)
+
+        result.validation = validation_results
+        return result
+
+    async def normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
+        """
+        Run Stage 1: Normalization.
+
+        Args:
+            scraper_output: Raw scraper output
+
+        Returns:
+            Stage1Output with normalized reviews
+        """
+        await self.initialize()
+
+        stage1 = Stage1Normalizer(
+            self.config,
+            self._db,
+            self._review_repo,
+        )
+
+        input_data = Stage1Input(
+            job_id=scraper_output["job_id"],
+            business_id=scraper_output["business_id"],
+            place_id=scraper_output["place_id"],
+            reviews=scraper_output["reviews"],
+        )
+
+        return await stage1.process(input_data)
+
+    async def classify(self, stage1_output: Stage1Output) -> Stage2Output:
+        """
+        Run Stage 2: Classification.
+
+        Args:
+            stage1_output: Output from Stage 1
+
+        Returns:
+            Stage2Output with classified reviews
+        """
+        await self.initialize()
+
+        stage2 = Stage2Classifier(
+            self.config,
+            self._db,
+            self._review_repo,
+            self._span_repo,
+            self._embedding_service,
+        )
+
+        # Convert normalized reviews to classification input
+        reviews_to_classify = [
+            ReviewToClassify(
+                source=r["source"],
+                review_id=r["review_id"],
+                review_version=r["review_version"],
+                business_id=r["business_id"],
+                place_id=r["place_id"],
+                text=r["text"],
+                text_normalized=r["text_normalized"],
+                rating=r["rating"],
+                review_time=r["review_time"],
+            )
+            for r in stage1_output["reviews_normalized"]
+        ]
+
+        input_data = Stage2Input(
+            reviews=reviews_to_classify,
+            config=ClassificationConfig(
+                model=self.config.llm_model,
+                taxonomy_version=self.config.taxonomy_version,
+                profile=self.config.classification_profile,
+                max_spans_per_review=self.config.max_spans_per_review,
+            ),
+        )
+
+        try:
+            return await stage2.process(input_data)
+        finally:
+            await stage2.close()
+
+    async def route(self, stage2_output: Stage2Output) -> Stage3Output:
+        """
+        Run Stage 3: Issue Routing.
+
+        Args:
+            stage2_output: Output from Stage 2
+
+        Returns:
+            Stage3Output with routing results
+        """
+        await self.initialize()
+
+        stage3 = Stage3Router(
+            self.config,
+            self._db,
+            self._span_repo,
+            self._issue_repo,
+        )
+
+        # Extract negative/mixed spans for routing
+        spans_to_route = []
+        for review in stage2_output["reviews_classified"]:
+            for span in review.get("spans", []):
+                if span["valence"] in ("V-", "V±"):
+                    spans_to_route.append(
+                        SpanToRoute(
+                            span_id=span["span_id"],
+                            business_id=review.get("business_id", ""),
+                            place_id=review.get("place_id", ""),
+                            urt_primary=span["urt_primary"],
+                            valence=span["valence"],
+                            intensity=span["intensity"],
+                            entity_normalized=span.get("entity_normalized"),
+                            review_time=review.get("review_time", ""),
+                            confidence=span.get("confidence", "medium"),
+                            trust_score=review.get("trust_score", 0.5),
+                        )
+                    )
+
+        return await stage3.process(Stage3Input(spans=spans_to_route))
+
+    async def aggregate(
+        self,
+        business_id: str,
+        date_str: str,
+        bucket_types: list[str] | None = None,
+    ) -> Stage4Output:
+        """
+        Run Stage 4: Fact Aggregation.
+
+        Args:
+            business_id: Business identifier
+            date_str: Date string (YYYY-MM-DD)
+            bucket_types: List of bucket types (default: ['day'])
+
+        Returns:
+            Stage4Output with aggregated facts
+        """
+        await self.initialize()
+
+        stage4 = Stage4Aggregator(
+            self.config,
+            self._db,
+            self._fact_repo,
+        )
+
+        input_data = Stage4Input(
+            business_id=business_id,
+            date=date_str,
+            bucket_types=bucket_types or ["day"],  # type: ignore
+            taxonomy_version=self.config.taxonomy_version,
+        )
+
+        return await stage4.process(input_data)
+
+    async def validate(self, job_id: str) -> dict[str, ValidationResult]:
+        """
+        Validate pipeline output for a job.
+
+        Args:
+            job_id: Job identifier
+
+        Returns:
+            Dictionary of validation results by stage
+        """
+        # This would query the database for the job's output and validate
+        # For now, return empty results
+        logger.warning(f"validate() for job {job_id} not fully implemented")
+        return {}
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/init.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/init.py
@@ -0,0 +1,11 @@
+"""Services for pipeline operations."""
+
+from reviewiq_pipeline.services.embeddings import EmbeddingService
+from reviewiq_pipeline.services.llm_client import LLMClient
+from reviewiq_pipeline.services.text_processor import TextProcessor
+
+# Names exported by `from reviewiq_pipeline.services import *`.
+__all__ = [
+    "LLMClient",
+    "EmbeddingService",
+    "TextProcessor",
+]
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/embeddings.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/embeddings.py
@@ -0,0 +1,225 @@
+"""
+Embedding service for generating text embeddings.
+
+Uses sentence-transformers with the all-MiniLM-L6-v2 model (384 dimensions).
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingService:
+    """
+    Service for generating text embeddings using sentence-transformers.
+
+    Uses the all-MiniLM-L6-v2 model by default, which produces 384-dimensional
+    embeddings suitable for semantic similarity and clustering.
+    """
+
+    def __init__(self, config: Config):
+        """
+        Initialize the embedding service.
+
+        Args:
+            config: Pipeline configuration with embedding model settings
+        """
+        self.config = config
+        self.model_name = config.embedding_model
+        self.dimension = config.embedding_dimension
+        self._model = None
+        self._initialized = False
+
+    def _ensure_initialized(self) -> None:
+        """Lazy initialization of the sentence-transformers model."""
+        if self._initialized:
+            return
+
+        try:
+            from sentence_transformers import SentenceTransformer
+
+            logger.info(f"Loading embedding model: {self.model_name}")
+            self._model = SentenceTransformer(self.model_name)
+            self._initialized = True
+            logger.info(f"Embedding model loaded. Dimension: {self._model.get_sentence_embedding_dimension()}")
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for embeddings. "
+                "Install with: pip install sentence-transformers"
+            )
+
+    def embed(self, text: str) -> list[float]:
+        """
+        Generate embedding for a single text.
+
+        Args:
+            text: Text to embed
+
+        Returns:
+            List of floats representing the embedding vector
+        """
+        self._ensure_initialized()
+
+        if not text or not text.strip():
+            # Return zero vector for empty text
+            return [0.0] * self.dimension
+
+        embedding = self._model.encode(text, convert_to_numpy=True)
+        return embedding.tolist()
+
+    def embed_batch(self, texts: list[str]) -> list[list[float]]:
+        """
+        Generate embeddings for multiple texts.
+
+        More efficient than calling embed() repeatedly.
+
+        Args:
+            texts: List of texts to embed
+
+        Returns:
+            List of embedding vectors
+        """
+        self._ensure_initialized()
+
+        if not texts:
+            return []
+
+        # Handle empty strings
+        non_empty_indices = [i for i, t in enumerate(texts) if t and t.strip()]
+        non_empty_texts = [texts[i] for i in non_empty_indices]
+
+        if not non_empty_texts:
+            return [[0.0] * self.dimension for _ in texts]
+
+        # Batch encode
+        embeddings = self._model.encode(non_empty_texts, convert_to_numpy=True)
+
+        # Build result with zero vectors for empty strings
+        result = [[0.0] * self.dimension for _ in texts]
+        for idx, emb in zip(non_empty_indices, embeddings):
+            result[idx] = emb.tolist()
+
+        return result
+
+    def similarity(self, embedding1: list[float], embedding2: list[float]) -> float:
+        """
+        Calculate cosine similarity between two embeddings.
+
+        Args:
+            embedding1: First embedding vector
+            embedding2: Second embedding vector
+
+        Returns:
+            Cosine similarity score between -1 and 1
+        """
+        vec1 = np.array(embedding1)
+        vec2 = np.array(embedding2)
+
+        # Handle zero vectors
+        norm1 = np.linalg.norm(vec1)
+        norm2 = np.linalg.norm(vec2)
+
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+
+        return float(np.dot(vec1, vec2) / (norm1 * norm2))
+
+    def find_similar(
+        self,
+        query_embedding: list[float],
+        candidate_embeddings: list[list[float]],
+        top_k: int = 5,
+        threshold: float = 0.0,
+    ) -> list[tuple[int, float]]:
+        """
+        Find most similar embeddings to a query.
+
+        Args:
+            query_embedding: Query embedding vector
+            candidate_embeddings: List of candidate embeddings
+            top_k: Number of top results to return
+            threshold: Minimum similarity threshold
+
+        Returns:
+            List of (index, similarity) tuples, sorted by similarity descending
+        """
+        if not candidate_embeddings:
+            return []
+
+        query = np.array(query_embedding)
+        candidates = np.array(candidate_embeddings)
+
+        # Compute all similarities at once
+        query_norm = np.linalg.norm(query)
+        if query_norm == 0:
+            return []
+
+        candidate_norms = np.linalg.norm(candidates, axis=1)
+
+        # Avoid division by zero
+        valid_mask = candidate_norms > 0
+        similarities = np.zeros(len(candidates))
+        similarities[valid_mask] = (
+            np.dot(candidates[valid_mask], query)
+            / (candidate_norms[valid_mask] * query_norm)
+        )
+
+        # Filter by threshold and get top k
+        results = [
+            (i, float(sim))
+            for i, sim in enumerate(similarities)
+            if sim >= threshold
+        ]
+        results.sort(key=lambda x: x[1], reverse=True)
+
+        return results[:top_k]
+
+    @property
+    def model(self):
+        """Get the underlying sentence-transformers model."""
+        self._ensure_initialized()
+        return self._model
+
+
+def normalize_embedding(embedding: list[float]) -> list[float]:
+    """
+    Normalize an embedding to unit length.
+
+    Args:
+        embedding: Embedding vector
+
+    Returns:
+        Unit-normalized embedding
+    """
+    vec = np.array(embedding)
+    norm = np.linalg.norm(vec)
+    if norm == 0:
+        return embedding
+    return (vec / norm).tolist()
+
+
+def average_embeddings(embeddings: list[list[float]]) -> list[float]:
+    """
+    Compute the average of multiple embeddings.
+
+    Useful for creating centroid vectors for clustering.
+
+    Args:
+        embeddings: List of embedding vectors
+
+    Returns:
+        Averaged embedding vector
+    """
+    if not embeddings:
+        raise ValueError("Cannot average empty embedding list")
+
+    arr = np.array(embeddings)
+    return arr.mean(axis=0).tolist()
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/llm_client.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/llm_client.py
@@ -0,0 +1,432 @@
+"""
+LLM client abstraction supporting OpenAI and Anthropic.
+
+Provides a unified interface for classification requests with:
+- Provider abstraction (OpenAI/Anthropic)
+- Structured output (JSON mode)
+- Retry handling
+- Cost tracking
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+    from reviewiq_pipeline.contracts import LLMClassificationResponse
+
+logger = logging.getLogger(__name__)
+
+# System prompt for URT classification
+SYSTEM_PROMPT = """You are a review classification system using URT (Universal Review Taxonomy) v5.1.
+
+Your task is to extract semantic spans from customer reviews and classify each span independently.
+
+## SPAN EXTRACTION RULES
+
+1. **Split on contrasting conjunctions**: but, however, although, despite, yet, though
+2. **Split on topic/target change**: food → service → bathroom = 3 spans
+3. **Split on valence change**: positive → negative = split
+4. **Split on domain change**: O (Offering) → J (Journey) → E (Environment) = split
+5. **Keep together**: cause→effect within same feedback unit ("X because Y" = 1 span)
+
+**Guardrails**:
+- Max 3 spans per sentence (if 4+, re-check for over-splitting)
+- Min 1 span per review (even single-word reviews)
+- Spans must be non-overlapping and cover meaningful content
+
+## URT DOMAINS (Tier-3 codes: X#.##)
+
+| Domain | Code | Description |
+|--------|------|-------------|
+| Offering | O1-O4 | Product/service quality, features, variety |
+| Price | P1-P4 | Value, pricing, promotions, payment |
+| Journey | J1-J4 | Timing, process, convenience, accessibility |
+| Environment | E1-E4 | Physical space, ambiance, cleanliness, digital UX |
+| Attitude | A1-A4 | Staff behavior, helpfulness, professionalism |
+| Voice | V1-V4 | Brand, communication, marketing, transparency |
+| Relationship | R1-R4 | Loyalty, trust, consistency, personalization |
+
+## DIMENSION CODES
+
+### Valence
+- V+ : Positive sentiment
+- V- : Negative sentiment
+- V0 : Neutral/factual
+- V± : Mixed within the span
+
+### Intensity
+- I1 : Low ("okay", "fine", "decent")
+- I2 : Moderate ("good", "bad", "slow")
+- I3 : High ("amazing", "terrible", "unacceptable")
+
+### Specificity
+- S1 : Vague ("it was bad")
+- S2 : Some detail ("the food was cold")
+- S3 : Precise ("waited 45 minutes for appetizers")
+
+### Actionability
+- A1 : No clear action possible
+- A2 : Possible actions, unclear which
+- A3 : Clear, specific action ("train staff on X", "fix Y")
+
+### Temporal
+- TC : Current visit (default when no markers)
+- TR : Recent pattern ("lately", "recently", "again")
+- TH : Historical ("for years", "always", "used to")
+- TF : Future ("won't return", "next time", "I expect")
+
+### Evidence
+- ES : Stated explicitly in text (default)
+- EI : Inferred logically (not stated, but entailed)
+- EC : Contextual (depends on surrounding text)
+
+### Comparative
+- CR-N : No comparison (default)
+- CR-B : Better than alternatives
+- CR-W : Worse than alternatives
+- CR-S : Same as alternatives
+
+## PRIMARY SPAN SELECTION
+
+Mark exactly ONE span as is_primary=true using this order:
+1. Highest intensity (I3 > I2 > I1)
+2. Tie-break: negative over positive (V- > V± > V0 > V+)
+3. Tie-break: earliest span_index
+
+## USN (URT String Notation)
+
+Generate a USN string for each span:
+```
+URT:S:{primary}[+{sec1}][+{sec2}]:{valence_sign}{intensity_num}:{S#}{A#}{temporal}.{evidence}.{CR_suffix}
+```
+
+Examples:
+- `URT:S:J1.03:-2:22TC.ES.N` (J1.03, V-, I2, S2, A2, TC, ES, CR-N)
+- `URT:S:P1.01+O2.03:+3:33TR.ES.B` (P1.01 primary, O2.03 secondary, V+, I3, S3, A3, TR, ES, CR-B)
+
+Valence encoding: + for V+, - for V-, 0 for V0, ± for V±
+CR suffix: N=CR-N, B=CR-B, W=CR-W, S=CR-S
+
+## OUTPUT FORMAT
+
+Return valid JSON matching this schema. No markdown, no explanations.
+
+{
+  "spans": [
+    {
+      "span_index": 0,
+      "span_text": "exact text from review",
+      "span_start": 0,
+      "span_end": 25,
+      "urt_primary": "O1.01",
+      "urt_secondary": [],
+      "valence": "V+",
+      "intensity": "I2",
+      "specificity": "S2",
+      "actionability": "A1",
+      "temporal": "TC",
+      "evidence": "ES",
+      "comparative": "CR-N",
+      "is_primary": true,
+      "confidence": "high",
+      "entity": null,
+      "entity_type": null,
+      "relation_type": null,
+      "related_span_index": null,
+      "usn": "URT:S:O1.01:+2:21TC.ES.N"
+    }
+  ],
+  "review_summary": {
+    "dominant_valence": "V+",
+    "dominant_domain": "O",
+    "span_count": 1,
+    "has_comparative": false,
+    "has_entity": false
+  }
+}"""
+
+
+class LLMClientBase(ABC):
+    """
+    Common interface shared by the provider-specific LLM clients.
+
+    Holds the pipeline configuration plus two running totals
+    (``total_tokens_used`` and ``total_cost_usd``) that start at zero.
+    """
+
+    def __init__(self, config: Config):
+        # Usage totals start at zero; the concrete clients add to them.
+        self.total_tokens_used, self.total_cost_usd = 0, 0.0
+        self.config = config
+
+    @abstractmethod
+    async def classify(
+        self,
+        review_text: str,
+        profile: str = "standard",
+    ) -> tuple[LLMClassificationResponse, dict[str, Any]]:
+        """
+        Extract spans from a review and classify them.
+
+        Args:
+            review_text: The review text to classify
+            profile: Classification profile (lite/core/standard/full)
+
+        Returns:
+            Tuple of (classification response, metadata dict with tokens/cost)
+        """
+
+    @abstractmethod
+    async def close(self) -> None:
+        """Release the client and any resources it holds."""
+
+
+class OpenAIClient(LLMClientBase):
+    """OpenAI LLM client implementation."""
+
+    # Pricing per 1M tokens (as of 2024)
+    PRICING = {
+        "gpt-4o": {"input": 5.0, "output": 15.0},
+        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
+        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
+        "gpt-3.5-turbo": {"input": 0.50, "output": 1.50},
+    }
+
+    # Rates assumed for models missing from PRICING (same as gpt-4o-mini)
+    DEFAULT_PRICING = {"input": 0.15, "output": 0.60}
+
+    def __init__(self, config: Config):
+        super().__init__(config)
+        # Imported here so the openai package is only needed when this
+        # client is actually constructed.
+        from openai import AsyncOpenAI
+
+        self.client = AsyncOpenAI(api_key=config.get_llm_api_key())
+        self.model = config.llm_model
+
+        if self.model not in self.PRICING:
+            # Make the estimate visible instead of silently mis-pricing.
+            logger.warning(
+                "No pricing entry for OpenAI model %r; "
+                "cost tracking will use default rates",
+                self.model,
+            )
+
+    async def classify(
+        self,
+        review_text: str,
+        profile: str = "standard",
+    ) -> tuple[LLMClassificationResponse, dict[str, Any]]:
+        """
+        Classify a review using OpenAI chat completions in JSON mode.
+
+        Args:
+            review_text: The review text to classify
+            profile: Classification profile; accepted for interface
+                compatibility but not used by this implementation
+
+        Returns:
+            Tuple of (parsed JSON object, metadata dict with model, token
+            counts, cost in USD and latency in milliseconds)
+
+        Raises:
+            ValueError: If the response is empty, is not valid JSON
+                (``json.JSONDecodeError`` is a ``ValueError``), or is not a
+                JSON object
+        """
+        # perf_counter is monotonic, so the latency cannot go negative or
+        # jump when the system clock is adjusted mid-request.
+        started = time.perf_counter()
+
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {
+                "role": "user",
+                "content": f'Classify this review:\n\n"{review_text}"',
+            },
+        ]
+
+        response = await self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.config.llm_temperature,
+            response_format={"type": "json_object"},
+            timeout=self.config.llm_timeout_seconds,
+        )
+
+        # Parse response
+        content = response.choices[0].message.content if response.choices else None
+        if not content:
+            raise ValueError("Empty response from OpenAI")
+
+        result = json.loads(content)
+        if not isinstance(result, dict):
+            raise ValueError(
+                f"OpenAI response is not a JSON object: {content[:200]}"
+            )
+
+        # Calculate costs (usage may be absent; count it as zero tokens)
+        usage = response.usage
+        input_tokens = usage.prompt_tokens if usage else 0
+        output_tokens = usage.completion_tokens if usage else 0
+        total_tokens = input_tokens + output_tokens
+
+        pricing = self.PRICING.get(self.model, self.DEFAULT_PRICING)
+        cost = (
+            input_tokens * pricing["input"] + output_tokens * pricing["output"]
+        ) / 1_000_000
+
+        self.total_tokens_used += total_tokens
+        self.total_cost_usd += cost
+
+        metadata = {
+            "model": self.model,
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "total_tokens": total_tokens,
+            "cost_usd": cost,
+            "latency_ms": int((time.perf_counter() - started) * 1000),
+        }
+
+        return result, metadata
+
+    async def close(self) -> None:
+        """Close the OpenAI client."""
+        await self.client.close()
+
+
+class AnthropicClient(LLMClientBase):
+    """Anthropic LLM client implementation."""
+
+    # Pricing per 1M tokens (as of 2024)
+    PRICING = {
+        "claude-3-opus-20240229": {"input": 15.0, "output": 75.0},
+        "claude-3-sonnet-20240229": {"input": 3.0, "output": 15.0},
+        "claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
+        "claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0},
+    }
+
+    # Rates assumed for models missing from PRICING (same as Claude 3 Sonnet)
+    DEFAULT_PRICING = {"input": 3.0, "output": 15.0}
+
+    def __init__(self, config: Config):
+        super().__init__(config)
+        # Imported here so the anthropic package is only needed when this
+        # client is actually constructed.
+        from anthropic import AsyncAnthropic
+
+        self.client = AsyncAnthropic(api_key=config.get_llm_api_key())
+        self.model = config.llm_model
+
+        if self.model not in self.PRICING:
+            # Make the estimate visible instead of silently mis-pricing.
+            logger.warning(
+                "No pricing entry for Anthropic model %r; "
+                "cost tracking will use default rates",
+                self.model,
+            )
+
+    async def classify(
+        self,
+        review_text: str,
+        profile: str = "standard",
+    ) -> tuple[LLMClassificationResponse, dict[str, Any]]:
+        """
+        Classify a review using the Anthropic Messages API.
+
+        Args:
+            review_text: The review text to classify
+            profile: Classification profile; accepted for interface
+                compatibility but not used by this implementation
+
+        Returns:
+            Tuple of (parsed JSON object, metadata dict with model, token
+            counts, cost in USD and latency in milliseconds)
+
+        Raises:
+            ValueError: If the response has no text or no JSON object can
+                be extracted from it
+        """
+        # perf_counter is monotonic, so the latency cannot go negative or
+        # jump when the system clock is adjusted mid-request.
+        started = time.perf_counter()
+
+        response = await self.client.messages.create(
+            model=self.model,
+            max_tokens=4096,
+            system=SYSTEM_PROMPT,
+            messages=[
+                {
+                    "role": "user",
+                    "content": f'Classify this review and return JSON only:\n\n"{review_text}"',
+                },
+            ],
+            temperature=self.config.llm_temperature,
+            # Same per-request timeout the OpenAI client applies.
+            timeout=self.config.llm_timeout_seconds,
+        )
+
+        # Parse response: join every text block; blocks without a ``text``
+        # attribute contribute nothing instead of raising.
+        content = "".join(
+            getattr(block, "text", None) or "" for block in (response.content or [])
+        )
+        if not content:
+            raise ValueError("Empty response from Anthropic")
+
+        # Try to extract JSON from response
+        result = self._extract_json(content)
+
+        # Calculate costs
+        input_tokens = response.usage.input_tokens
+        output_tokens = response.usage.output_tokens
+        total_tokens = input_tokens + output_tokens
+
+        pricing = self.PRICING.get(self.model, self.DEFAULT_PRICING)
+        cost = (
+            input_tokens * pricing["input"] + output_tokens * pricing["output"]
+        ) / 1_000_000
+
+        self.total_tokens_used += total_tokens
+        self.total_cost_usd += cost
+
+        metadata = {
+            "model": self.model,
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "total_tokens": total_tokens,
+            "cost_usd": cost,
+            "latency_ms": int((time.perf_counter() - started) * 1000),
+        }
+
+        return result, metadata
+
+    def _extract_json(self, content: str) -> dict[str, Any]:
+        """
+        Extract a JSON object from a model response.
+
+        Strategies are tried in order, and a failure of one falls through
+        to the next:
+
+        1. The whole (stripped) response
+        2. The body of each markdown code fence
+        3. The first ``{`` onward, decoded with ``raw_decode`` so trailing
+           commentary (even commentary containing braces) is ignored
+
+        Args:
+            content: Raw text returned by the model
+
+        Returns:
+            The decoded JSON object
+
+        Raises:
+            ValueError: If no strategy yields a JSON object
+        """
+        import re
+
+        content = content.strip()
+
+        candidates = [content]
+        candidates.extend(
+            match.group(1)
+            for match in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
+        )
+        for candidate in candidates:
+            try:
+                parsed = json.loads(candidate)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(parsed, dict):
+                return parsed
+
+        # Only the FIRST brace is tried: scanning on to later braces could
+        # silently return a nested fragment of a truncated response.
+        start = content.find("{")
+        if start != -1:
+            try:
+                parsed, _ = json.JSONDecoder().raw_decode(content, start)
+            except json.JSONDecodeError:
+                pass
+            else:
+                return parsed
+
+        raise ValueError(f"Could not extract JSON from response: {content[:200]}")
+
+    async def close(self) -> None:
+        """Close the Anthropic client."""
+        await self.client.close()
+
+
+class LLMClient:
+    """
+    Factory for provider-specific LLM clients.
+
+    Usage:
+        client = LLMClient.create(config)
+        result, metadata = await client.classify(review_text)
+        await client.close()
+    """
+
+    @staticmethod
+    def create(config: Config) -> LLMClientBase:
+        """
+        Build the LLM client selected by ``config.llm_provider``.
+
+        Args:
+            config: Pipeline configuration
+
+        Returns:
+            LLM client instance (OpenAI or Anthropic)
+
+        Raises:
+            ValueError: If the configured provider is not supported
+        """
+        provider = config.llm_provider
+
+        if provider == "openai":
+            return OpenAIClient(config)
+        if provider == "anthropic":
+            return AnthropicClient(config)
+
+        raise ValueError(f"Unsupported LLM provider: {provider}")
+
+
+def create_fallback_response(review_text: str) -> LLMClassificationResponse:
+    """
+    Build the placeholder classification used when the LLM call fails.
+
+    The whole review becomes a single low-confidence, neutral span coded
+    ``O1.01``, and the review summary mirrors that one span.
+
+    Args:
+        review_text: Original review text
+
+    Returns:
+        Minimal valid classification response
+    """
+    # One span covering the entire review text.
+    whole_review_span = {
+        "span_index": 0,
+        "span_text": review_text,
+        "span_start": 0,
+        "span_end": len(review_text),
+        "urt_primary": "O1.01",
+        "urt_secondary": [],
+        "valence": "V0",
+        "intensity": "I1",
+        "specificity": "S1",
+        "actionability": "A1",
+        "temporal": "TC",
+        "evidence": "ES",
+        "comparative": "CR-N",
+        "is_primary": True,
+        "confidence": "low",
+        "entity": None,
+        "entity_type": None,
+        "relation_type": None,
+        "related_span_index": None,
+        "usn": "URT:S:O1.01:01:11TC.ES.N",
+    }
+
+    summary = {
+        "dominant_valence": "V0",
+        "dominant_domain": "O",
+        "span_count": 1,
+        "has_comparative": False,
+        "has_entity": False,
+    }
+
+    return {"spans": [whole_review_span], "review_summary": summary}
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/text_processor.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/text_processor.py
@@ -0,0 +1,262 @@
+"""Text processing utilities for normalization."""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import unicodedata
+from typing import NamedTuple
+
+logger = logging.getLogger(__name__)
+
+
+class NormalizationResult(NamedTuple):
+    """
+    Result of text normalization.
+
+    Returned by ``TextProcessor.normalize``.
+    """
+
+    # Normalized text (empty string when the input was empty)
+    normalized: str
+    # Detected language code; "en" is the fallback value
+    language: str
+    # Number of whitespace-separated tokens in ``normalized``
+    word_count: int
+    # Length of ``normalized`` in characters
+    char_count: int
+
+
+class TextProcessor:
+    """Service for text normalization and processing."""
+
+    # Common emoji ranges. Each range is deliberately narrow: a single broad
+    # range such as U+24C2-U+1F251 would also swallow CJK, Hangul and most
+    # other non-Latin scripts.
+    EMOJI_PATTERN = re.compile(
+        "["
+        "\U0001F600-\U0001F64F"  # emoticons
+        "\U0001F300-\U0001F5FF"  # symbols & pictographs
+        "\U0001F680-\U0001F6FF"  # transport & map symbols
+        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
+        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
+        "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
+        "\U00002600-\U000026FF"  # miscellaneous symbols
+        "\U00002702-\U000027B0"  # dingbats
+        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
+        "\U000024C2"  # circled latin capital letter M
+        "]+",
+        flags=re.UNICODE,
+    )
+
+    # Control characters (except newlines and tabs we want to normalize)
+    CONTROL_CHAR_PATTERN = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
+
+    # Multiple whitespace
+    MULTI_WHITESPACE_PATTERN = re.compile(r"\s+")
+
+    # URL pattern
+    URL_PATTERN = re.compile(
+        r"https?://[^\s<>\"{}|\\^`\[\]]+|www\.[^\s<>\"{}|\\^`\[\]]+"
+    )
+
+    def __init__(self):
+        """Probe for langdetect and make its detection deterministic."""
+        self._langdetect_available = False
+        try:
+            from langdetect import DetectorFactory
+
+            # Make detection deterministic
+            DetectorFactory.seed = 0
+            self._langdetect_available = True
+        except ImportError:
+            logger.warning("langdetect not available, defaulting to 'en' for all text")
+
+    def normalize(self, text: str) -> NormalizationResult:
+        """
+        Normalize text for classification.
+
+        Steps:
+        1. Remove control characters
+        2. Normalize Unicode (NFC)
+        3. Lowercase
+        4. Normalize whitespace (collapse runs to one space, trim)
+        5. Detect language (on the original, un-normalized text)
+
+        Emoji are left in the text unchanged.
+
+        Args:
+            text: Original review text
+
+        Returns:
+            NormalizationResult with normalized text and metadata
+        """
+        if not text:
+            return NormalizationResult(
+                normalized="",
+                language="en",
+                word_count=0,
+                char_count=0,
+            )
+
+        # Step 1: Remove control characters
+        normalized = self.CONTROL_CHAR_PATTERN.sub("", text)
+
+        # Step 2: Unicode normalization (NFC - composed form)
+        normalized = unicodedata.normalize("NFC", normalized)
+
+        # Step 3: Lowercase
+        normalized = normalized.lower()
+
+        # Step 4: Normalize whitespace
+        normalized = self.MULTI_WHITESPACE_PATTERN.sub(" ", normalized)
+        normalized = normalized.strip()
+
+        # Step 5: Detect language on the original text (before lowercasing)
+        language = self.detect_language(text)
+
+        # Calculate metrics
+        word_count = len(normalized.split()) if normalized else 0
+        char_count = len(normalized)
+
+        return NormalizationResult(
+            normalized=normalized,
+            language=language,
+            word_count=word_count,
+            char_count=char_count,
+        )
+
+    def detect_language(self, text: str) -> str:
+        """
+        Detect the language of the text.
+
+        Only the first 1000 characters are analyzed. Regional variants
+        reported by langdetect (e.g. ``zh-cn``) are reduced to their primary
+        subtag (``zh``) so the result is always a bare two-letter code.
+
+        Args:
+            text: Text to analyze
+
+        Returns:
+            ISO 639-1 language code (e.g., 'en', 'es', 'fr'); 'en' when the
+            text is empty, langdetect is unavailable, or detection fails
+        """
+        if not text or not self._langdetect_available:
+            return "en"
+
+        try:
+            from langdetect import detect
+
+            code = detect(text[:1000])
+        except Exception as e:
+            # Deliberate best-effort: detection failures (e.g. text without
+            # any letters) must never abort normalization.
+            logger.debug("Language detection failed: %s", e)
+            return "en"
+
+        return code.split("-", 1)[0].lower()
+
+    def generate_content_hash(self, text_normalized: str) -> str:
+        """
+        Generate a SHA256 hash of normalized text for deduplication.
+
+        Args:
+            text_normalized: Normalized text
+
+        Returns:
+            64-character hex string
+        """
+        return hashlib.sha256(text_normalized.encode("utf-8")).hexdigest()
+
+    def has_control_characters(self, text: str) -> bool:
+        """Check if text contains control characters."""
+        return bool(self.CONTROL_CHAR_PATTERN.search(text))
+
+    def extract_urls(self, text: str) -> list[str]:
+        """Extract URLs from text."""
+        return self.URL_PATTERN.findall(text)
+
+    def count_emoji(self, text: str) -> int:
+        """
+        Count emoji code points in text.
+
+        Every matching code point is counted, so adjacent emoji are counted
+        individually. Emoji built from several code points (flags, joined
+        sequences) therefore count once per matching code point.
+        """
+        return sum(len(run) for run in self.EMOJI_PATTERN.findall(text))
+
+    def is_empty_or_trivial(self, text: str | None, min_chars: int = 3) -> bool:
+        """
+        Check if text is empty or trivially short.
+
+        Args:
+            text: Text to check
+            min_chars: Minimum meaningful character count
+
+        Returns:
+            True if text should be skipped
+        """
+        if not text:
+            return True
+        # An all-whitespace string strips to length 0 and is caught here too.
+        return len(text.strip()) < min_chars
+
+    def clean_for_llm(self, text: str) -> str:
+        """
+        Clean text for LLM input.
+
+        Similar to normalize but preserves case and some formatting
+        for better LLM understanding.
+
+        Args:
+            text: Original text
+
+        Returns:
+            Cleaned text suitable for LLM input
+        """
+        if not text:
+            return ""
+
+        # Remove control characters
+        cleaned = self.CONTROL_CHAR_PATTERN.sub("", text)
+
+        # Unicode normalization
+        cleaned = unicodedata.normalize("NFC", cleaned)
+
+        # Normalize whitespace but preserve newlines for paragraphs
+        cleaned = re.sub(r"[^\S\n]+", " ", cleaned)  # Collapse horizontal space
+        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)  # Max 2 consecutive newlines
+        cleaned = cleaned.strip()
+
+        return cleaned
+
+
+# Two-letter ISO 639-1 codes accepted by is_valid_iso639. Built once at import
+# time rather than on every call.
+_ISO_639_1_CODES = frozenset({
+    "aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av",
+    "ay", "az", "ba", "be", "bg", "bh", "bi", "bm", "bn", "bo",
+    "br", "bs", "ca", "ce", "ch", "co", "cr", "cs", "cu", "cv",
+    "cy", "da", "de", "dv", "dz", "ee", "el", "en", "eo", "es",
+    "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr", "fy", "ga",
+    "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr",
+    "ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik",
+    "io", "is", "it", "iu", "ja", "jv", "ka", "kg", "ki", "kj",
+    "kk", "kl", "km", "kn", "ko", "kr", "ks", "ku", "kv", "kw",
+    "ky", "la", "lb", "lg", "li", "ln", "lo", "lt", "lu", "lv",
+    "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my",
+    "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv",
+    "ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps",
+    "pt", "qu", "rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd",
+    "se", "sg", "si", "sk", "sl", "sm", "sn", "so", "sq", "sr",
+    "ss", "st", "su", "sv", "sw", "ta", "te", "tg", "th", "ti",
+    "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw", "ty", "ug",
+    "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi",
+    "yo", "za", "zh", "zu",
+})
+
+
+def is_valid_iso639(code: str) -> bool:
+    """
+    Check if a language code is a valid ISO 639-1 code.
+
+    The comparison is case-insensitive. Regional tags such as ``zh-cn``
+    are not two-letter codes and are rejected.
+
+    Args:
+        code: Language code to validate
+
+    Returns:
+        True if valid ISO 639-1 code; False otherwise, including for
+        non-string input such as None
+    """
+    if not isinstance(code, str):
+        # A validator should report bad input, not crash on it.
+        return False
+    return code.lower() in _ISO_639_1_CODES
+
+
+def is_valid_sha256(hash_str: str) -> bool:
+    """
+    Check if a string is a valid SHA256 hex hash.
+
+    Only the 64 characters themselves are examined. ``int(x, 16)`` is not
+    used because it also accepts a ``0x`` prefix, a sign, digit-separating
+    underscores, surrounding whitespace and non-ASCII digits.
+
+    Args:
+        hash_str: Hash string to validate
+
+    Returns:
+        True if valid 64-character hex string
+    """
+    if not isinstance(hash_str, str):
+        return False
+    return re.fullmatch(r"[0-9a-fA-F]{64}", hash_str) is not None
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/__init__.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/__init__.py
@@ -0,0 +1,13 @@
+"""Pipeline stages for review processing."""
+
+from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
+from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
+from reviewiq_pipeline.stages.stage3_route import Stage3Router
+from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
+
+__all__ = [
+    "Stage1Normalizer",
+    "Stage2Classifier",
+    "Stage3Router",
+    "Stage4Aggregator",
+]
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage1_normalize.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage1_normalize.py
@@ -0,0 +1,247 @@
+"""
+Stage 1: Normalization
+
+Transform raw scraped reviews into clean, versioned records ready for LLM classification.
+
+Responsibilities:
+- Read raw reviews from input
+- Text normalization (lowercase, whitespace, emoji)
+- Language detection
+- Content hash generation for deduplication
+- Write to reviews_raw + reviews_enriched stub
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+from reviewiq_pipeline.contracts import (
+    NormalizedReview,
+    RawReview,
+    Stage1Input,
+    Stage1Output,
+    Stage1Stats,
+)
+from reviewiq_pipeline.services.text_processor import TextProcessor
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+    from reviewiq_pipeline.db.connection import DatabasePool
+    from reviewiq_pipeline.db.repositories import ReviewRepository
+
+logger = logging.getLogger(__name__)
+
+
+class Stage1Normalizer:
+    """
+    Stage 1 of the pipeline: turn raw reviews into normalized records.
+
+    For every raw review this stage:
+    1. Skips reviews whose text is empty or trivially short
+    2. Normalizes the text (lowercase, whitespace, unicode)
+    3. Detects the language
+    4. Generates a content hash and drops repeats within the batch
+    5. Writes to reviews_raw and reviews_enriched when a repository is set
+    """
+
+    def __init__(
+        self,
+        config: Config,
+        db: DatabasePool | None = None,
+        review_repo: ReviewRepository | None = None,
+    ):
+        self.text_processor = TextProcessor()
+        self.config = config
+        self.db = db
+        # Without a repository the stage runs purely in memory.
+        self.review_repo = review_repo
+
+    async def process(self, input_data: Stage1Input) -> Stage1Output:
+        """
+        Run the normalization stage over a batch of raw reviews.
+
+        Any exception raised for a review is logged with its review id and
+        re-raised, which aborts the batch.
+
+        Args:
+            input_data: Stage 1 input containing raw reviews
+
+        Returns:
+            Stage1Output with normalized reviews and stats
+        """
+        reviews = input_data["reviews"]
+        logger.info(
+            f"Stage 1: Processing {len(reviews)} reviews "
+            f"for job {input_data['job_id']}"
+        )
+
+        stats = Stage1Stats(
+            input_count=len(reviews),
+            output_count=0,
+            skipped_empty=0,
+            skipped_duplicate=0,
+        )
+        kept: list[NormalizedReview] = []
+        hashes_in_batch: set[str] = set()
+
+        for raw_review in reviews:
+            try:
+                candidate = self._normalize_review(
+                    raw_review,
+                    input_data["business_id"],
+                    input_data["place_id"],
+                )
+
+                if candidate is None:
+                    stats["skipped_empty"] += 1
+                elif candidate["content_hash"] in hashes_in_batch:
+                    # Same normalized text already seen in this batch
+                    stats["skipped_duplicate"] += 1
+                else:
+                    hashes_in_batch.add(candidate["content_hash"])
+
+                    # Persist only when a repository is configured; the
+                    # duplicate check above covers this batch only.
+                    if self.review_repo:
+                        candidate["raw_id"] = await self._persist_review(
+                            raw_review, candidate, input_data
+                        )
+
+                    kept.append(candidate)
+                    stats["output_count"] += 1
+
+            except Exception as e:
+                logger.error(f"Error normalizing review {raw_review.get('review_id')}: {e}")
+                raise
+
+        logger.info(
+            f"Stage 1 complete: {stats['output_count']} normalized, "
+            f"{stats['skipped_empty']} empty, {stats['skipped_duplicate']} duplicate"
+        )
+
+        return Stage1Output(
+            job_id=input_data["job_id"],
+            business_id=input_data["business_id"],
+            place_id=input_data["place_id"],
+            reviews_normalized=kept,
+            stats=stats,
+        )
+
+    def _normalize_review(
+        self,
+        raw: RawReview,
+        business_id: str,
+        place_id: str,
+    ) -> NormalizedReview | None:
+        """
+        Build the normalized record for one raw review.
+
+        Args:
+            raw: Raw review from scraper
+            business_id: Business identifier
+            place_id: Google Place ID
+
+        Returns:
+            NormalizedReview, or None when the text is empty/trivial or
+            normalizes to an empty string
+        """
+        processor = self.text_processor
+        text = raw.get("text")
+
+        # Rating-only reviews carry no text worth classifying
+        if processor.is_empty_or_trivial(text):
+            logger.debug(f"Skipping empty review {raw['review_id']}")
+            return None
+
+        outcome = processor.normalize(text)  # type: ignore
+        normalized_text = outcome.normalized
+        if not normalized_text:
+            # Nothing left after cleaning
+            return None
+
+        # The hash of the normalized text is the deduplication key
+        digest = processor.generate_content_hash(normalized_text)
+
+        return NormalizedReview(
+            source="google",
+            review_id=raw["review_id"],
+            review_version=1,
+            business_id=business_id,
+            place_id=place_id,
+            text=text,  # type: ignore
+            text_normalized=normalized_text,
+            text_language=outcome.language,
+            text_length=outcome.char_count,
+            word_count=outcome.word_count,
+            rating=raw["rating"],
+            review_time=raw["review_time"],
+            author_name=raw["author_name"],
+            author_id=raw.get("author_id"),
+            content_hash=digest,
+            dedup_group_id=None,
+        )
+
+    async def _persist_review(
+        self,
+        raw: RawReview,
+        normalized: NormalizedReview,
+        input_data: Stage1Input,
+    ) -> int:
+        """
+        Store one review: the raw row first, then its enriched stub.
+
+        Args:
+            raw: Original raw review
+            normalized: Normalized review data
+            input_data: Stage 1 input for context
+
+        Returns:
+            The raw_id from reviews_raw table
+
+        Raises:
+            RuntimeError: If no ReviewRepository is configured
+        """
+        repo = self.review_repo
+        if not repo:
+            raise RuntimeError("ReviewRepository not configured")
+
+        # The enriched stub is inserted with the id returned for the raw row
+        raw_id = await repo.insert_raw_review(
+            raw,
+            input_data["place_id"],
+            source="google",
+        )
+        await repo.insert_enriched_review(normalized, raw_id)
+
+        return raw_id
+
+    def normalize_batch(
+        self,
+        reviews: list[RawReview],
+        business_id: str,
+        place_id: str,
+    ) -> list[NormalizedReview]:
+        """
+        Normalize reviews in memory, without touching the database.
+
+        Handy for tests or for callers that only need the normalized data.
+
+        Args:
+            reviews: List of raw reviews
+            business_id: Business identifier
+            place_id: Google Place ID
+
+        Returns:
+            List of normalized reviews (skipped reviews excluded)
+        """
+        # Insertion-ordered dict: the first review seen for each content
+        # hash wins, and the original ordering is preserved.
+        first_by_hash: dict[str, NormalizedReview] = {}
+
+        for raw in reviews:
+            normalized = self._normalize_review(raw, business_id, place_id)
+            if normalized is not None:
+                first_by_hash.setdefault(normalized["content_hash"], normalized)
+
+        return list(first_by_hash.values())
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage2_classify.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage2_classify.py
@@ -0,0 +1,539 @@
+"""
+Stage 2: LLM Classification
+
+Classify normalized reviews into URT codes with span-level extraction.
+
+Responsibilities:
+- Call LLM for span extraction and classification
+- Generate embeddings
+- Calculate trust scores
+- Select primary span
+- Write to reviews_enriched and review_spans tables
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import uuid
+from typing import TYPE_CHECKING, Any
+
+from reviewiq_pipeline.contracts import (
+    ClassifiedReview,
+    ExtractedSpan,
+    ReviewToClassify,
+    Stage2Input,
+    Stage2Output,
+    Stage2Stats,
+)
+from reviewiq_pipeline.services.llm_client import LLMClient, create_fallback_response
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+    from reviewiq_pipeline.contracts import LLMClassificationResponse, LLMSpanResponse
+    from reviewiq_pipeline.db.connection import DatabasePool
+    from reviewiq_pipeline.db.repositories import ReviewRepository, SpanRepository
+    from reviewiq_pipeline.services.embeddings import EmbeddingService
+    from reviewiq_pipeline.services.llm_client import LLMClientBase
+
+logger = logging.getLogger(__name__)
+
+# URT code validation pattern
+URT_CODE_PATTERN = re.compile(r"^[OPJEAVR][1-4]\.[0-9]{2}$")
+
+# Valence priority for primary span selection (lower = higher priority)
+VALENCE_PRIORITY = {"V-": 0, "V±": 1, "V0": 2, "V+": 3}
+
+# Intensity priority for primary span selection (lower = higher priority; I3 ranks first)
+INTENSITY_PRIORITY = {"I3": 0, "I2": 1, "I1": 2}
+
+
+class Stage2Classifier:
+    """
+    Stage 2: Classify reviews using LLM and extract spans.
+
+    This stage:
+    1. Calls LLM to extract and classify spans
+    2. Generates embeddings for each review
+    3. Calculates trust scores
+    4. Selects primary span
+    5. Writes classification results to database
+    """
+
+    def __init__(
+        self,
+        config: Config,
+        db: DatabasePool | None = None,
+        review_repo: ReviewRepository | None = None,
+        span_repo: SpanRepository | None = None,
+        embedding_service: EmbeddingService | None = None,
+    ):
+        """
+        Store the collaborators used by the classification stage.
+
+        Args:
+            config: Pipeline configuration
+            db: Optional database pool
+            review_repo: Optional repository for review rows
+            span_repo: Optional repository for span rows
+            embedding_service: Optional embedding generator
+        """
+        # The LLM client is built on first use by _get_llm_client().
+        self._llm_client: LLMClientBase | None = None
+
+        self.config, self.db = config, db
+        self.review_repo, self.span_repo = review_repo, span_repo
+        self.embedding_service = embedding_service
+
+    async def _get_llm_client(self) -> LLMClientBase:
+        """Return the cached LLM client, creating it from config on first call."""
+        client = self._llm_client
+        if client is None:
+            client = LLMClient.create(self.config)
+            self._llm_client = client
+        return client
+
+    async def close(self) -> None:
+        """Close the LLM client, if one was created, and drop the reference."""
+        client = self._llm_client
+        if not client:
+            return
+        await client.close()
+        self._llm_client = None
+
+    async def process(self, input_data: Stage2Input) -> Stage2Output:
+        """
+        Process reviews through classification stage.
+
+        Reviews are classified one at a time; a failure on one review is
+        logged and counted in ``error_count`` without aborting the batch.
+        When both repositories are configured, a review is reported as
+        classified only after its results were persisted, so a persistence
+        failure is counted once (as an error) instead of as both a success
+        and an error.
+
+        Args:
+            input_data: Stage 2 input with reviews and config
+
+        Returns:
+            Stage2Output with classified reviews and stats
+        """
+        batch_id = str(uuid.uuid4())[:8]
+        reviews = input_data["reviews"]
+        logger.info(
+            f"Stage 2: Classifying {len(reviews)} reviews "
+            f"(batch {batch_id})"
+        )
+
+        classified_reviews: list[ClassifiedReview] = []
+        total_tokens = 0
+        total_cost = 0.0
+        total_spans = 0
+        error_count = 0
+
+        llm_client = await self._get_llm_client()
+
+        for review in reviews:
+            try:
+                classified, metadata = await self._classify_review(
+                    review,
+                    input_data["config"]["profile"],
+                    llm_client,
+                    batch_id,
+                )
+
+                # The LLM call has been made at this point, so its usage is
+                # counted regardless of what happens to the result afterwards.
+                total_tokens += metadata.get("total_tokens", 0)
+                total_cost += metadata.get("cost_usd", 0.0)
+
+                if not classified:
+                    logger.error(
+                        f"No classification produced for review {review['review_id']}"
+                    )
+                    error_count += 1
+                    continue
+
+                # Persist before recording success: if persistence raises, the
+                # review must end up in error_count only.
+                if self.review_repo and self.span_repo:
+                    await self._persist_classification(
+                        classified,
+                        review,
+                        batch_id,
+                        input_data["config"],
+                    )
+
+                classified_reviews.append(classified)
+                total_spans += len(classified.get("spans", []))
+
+            except Exception as e:
+                # Best effort per review: log with traceback and keep going.
+                logger.error(
+                    f"Error classifying review {review['review_id']}: {e}",
+                    exc_info=True,
+                )
+                error_count += 1
+
+        avg_spans = total_spans / len(classified_reviews) if classified_reviews else 0
+
+        logger.info(
+            f"Stage 2 complete: {len(classified_reviews)} classified, "
+            f"{error_count} errors, {total_spans} spans total"
+        )
+
+        return Stage2Output(
+            batch_id=batch_id,
+            taxonomy_version=input_data["config"]["taxonomy_version"],
+            model_version=self.config.llm_model,
+            prompt_version="v1.0",
+            reviews_classified=classified_reviews,
+            stats=Stage2Stats(
+                input_count=len(reviews),
+                success_count=len(classified_reviews),
+                error_count=error_count,
+                total_spans=total_spans,
+                avg_spans_per_review=avg_spans,
+                llm_tokens_used=total_tokens,
+                llm_cost_usd=total_cost,
+            ),
+        )
+
+    async def _classify_review(
+        self,
+        review: ReviewToClassify,
+        profile: str,
+        llm_client: LLMClientBase,
+        batch_id: str,
+    ) -> tuple[ClassifiedReview | None, dict[str, Any]]:
+        """
+        Run one review through LLM classification and assemble the result.
+
+        If the LLM call raises, a fallback response is built from the review
+        text and ``metadata["fallback"]`` is set, which also lowers the
+        reported overall confidence from 0.8 to 0.3.
+
+        Args:
+            review: Review to classify
+            profile: Classification profile
+            llm_client: LLM client instance
+            batch_id: Batch identifier
+
+        Returns:
+            Tuple of (classified review, metadata collected from the LLM call)
+        """
+        metadata: dict[str, Any] = {}
+
+        # Ask the LLM; any failure degrades to the fallback response.
+        try:
+            llm_response, call_metadata = await llm_client.classify(
+                review["text"],
+                profile,
+            )
+            metadata.update(call_metadata)
+        except Exception as e:
+            logger.warning(
+                f"LLM classification failed for {review['review_id']}, "
+                f"using fallback: {e}"
+            )
+            llm_response = create_fallback_response(review["text"])
+            metadata["fallback"] = True
+
+        # Repair the raw response, convert it, and settle on one primary span.
+        llm_response = self._validate_and_fix_response(llm_response, review["text"])
+        converted = self._convert_spans(
+            llm_response["spans"],
+            review,
+            profile,
+            batch_id,
+        )
+        spans = self._ensure_primary_span(converted)
+
+        # The primary span drives the review-level fields; fall back to the
+        # first span when none is flagged.
+        primary_span = None
+        for candidate in spans:
+            if candidate.get("is_primary"):
+                primary_span = candidate
+                break
+        else:
+            if spans:
+                primary_span = spans[0]
+
+        # Embedding is optional and computed from the normalized text.
+        embedding: list[float] = []
+        if self.embedding_service:
+            embedding = self.embedding_service.embed(review["text_normalized"])
+
+        trust_score = self._calculate_trust_score(review, spans)
+        staff_mentions = self._extract_staff_mentions(spans)
+        quotes = self._extract_quotes(spans)
+
+        if primary_span:
+            urt_primary = primary_span["urt_primary"]
+            urt_secondary = primary_span.get("urt_secondary", [])
+            valence = primary_span["valence"]
+            intensity = primary_span["intensity"]
+            comparative = primary_span.get("comparative", "CR-N")
+        else:
+            # Neutral defaults when no span is available at all.
+            urt_primary, urt_secondary = "O1.01", []
+            valence, intensity, comparative = "V0", "I1", "CR-N"
+
+        overall_confidence = 0.3 if metadata.get("fallback") else 0.8
+
+        classified = ClassifiedReview(
+            source=review["source"],
+            review_id=review["review_id"],
+            review_version=review["review_version"],
+            urt_primary=urt_primary,
+            urt_secondary=urt_secondary,
+            valence=valence,
+            intensity=intensity,
+            comparative=comparative,
+            staff_mentions=staff_mentions,
+            quotes=quotes,
+            trust_score=trust_score,
+            embedding=embedding,
+            spans=spans,
+            classification_confidence={"overall": overall_confidence},
+            processing_time_ms=metadata.get("latency_ms", 0),
+        )
+        return classified, metadata
+
+    def _validate_and_fix_response(
+        self,
+        response: LLMClassificationResponse,
+        original_text: str,
+    ) -> LLMClassificationResponse:
+        """
+        Validate LLM response and fix common issues.
+
+        Spans are repaired in place:
+        - ``span_index`` is renumbered from 0
+        - offsets are coerced to integers and clamped to the review text;
+          an empty or inverted range is rebuilt from the span text length,
+          falling back to the whole review
+        - ``urt_primary`` is always set, defaulting to ``O1.01`` when it is
+          missing or malformed
+        - ``valence`` / ``intensity`` default to ``V0`` / ``I1`` when invalid
+
+        Args:
+            response: Raw LLM response
+            original_text: Original review text for offset validation
+
+        Returns:
+            Validated and fixed response (a fallback response when the LLM
+            returned no spans)
+        """
+        spans = response.get("spans", [])
+        if not spans:
+            # Create fallback if no spans
+            return create_fallback_response(original_text)
+
+        text_len = len(original_text)
+        fixed_spans = []
+        for i, span in enumerate(spans):
+            # Ensure required fields
+            span["span_index"] = i
+
+            # LLM output may omit offsets or send null / non-integer values.
+            start = span.get("span_start")
+            end = span.get("span_end")
+            if not isinstance(start, int):
+                start = 0
+            if not isinstance(end, int):
+                end = text_len
+
+            # Clamp both offsets into the review text.
+            start = min(max(start, 0), text_len)
+            end = min(end, text_len)
+
+            if end <= start:
+                # Rebuild the end from the span text; with no span text the
+                # span extends to the end of the review.
+                span_len = len(span.get("span_text") or "")
+                end = min(start + span_len, text_len) if span_len else text_len
+            if end <= start:
+                # start sits at the very end of the text: cover the whole review.
+                start, end = 0, text_len
+
+            span["span_start"] = start
+            span["span_end"] = end
+
+            # Validate URT code. fullmatch rejects a trailing newline, and the
+            # value is always written back so later stages can index it.
+            urt_primary = span.get("urt_primary")
+            if not (
+                isinstance(urt_primary, str)
+                and URT_CODE_PATTERN.fullmatch(urt_primary)
+            ):
+                logger.warning(f"Invalid URT code '{urt_primary}', defaulting to O1.01")
+                span["urt_primary"] = "O1.01"
+
+            # Ensure valid enums
+            if span.get("valence") not in ("V+", "V-", "V0", "V±"):
+                span["valence"] = "V0"
+            if span.get("intensity") not in ("I1", "I2", "I3"):
+                span["intensity"] = "I1"
+
+            fixed_spans.append(span)
+
+        response["spans"] = fixed_spans
+        return response
+
+    def _convert_spans(
+        self,
+        llm_spans: list[LLMSpanResponse],
+        review: ReviewToClassify,
+        profile: str,
+        batch_id: str,
+    ) -> list[ExtractedSpan]:
+        """
+        Convert LLM spans to our ExtractedSpan format.
+
+        Each span gets a deterministic ``span_id`` derived from the review id,
+        the span index and the first 50 characters of the span text. A USN is
+        generated only when the LLM did not supply a non-empty one.
+
+        Args:
+            llm_spans: Spans from LLM response
+            review: Source review
+            profile: Classification profile
+            batch_id: Batch identifier (not used by the conversion itself)
+
+        Returns:
+            List of ExtractedSpan objects
+        """
+        spans: list[ExtractedSpan] = []
+
+        for llm_span in llm_spans:
+            # A null span_text from the LLM is treated like an empty string so
+            # the slice below (and later quote extraction) cannot fail on None.
+            span_text = llm_span.get("span_text") or ""
+            entity = llm_span.get("entity")
+
+            # Generate deterministic span ID
+            span_key = f"{review['review_id']}:{llm_span['span_index']}:{span_text[:50]}"
+            span_hash = hashlib.sha256(span_key.encode()).hexdigest()[:16]
+            span_id = f"SPN-{span_hash}"
+
+            # Only build a USN when the LLM did not provide one.
+            usn = llm_span.get("usn") or self._generate_usn(llm_span)
+
+            span = ExtractedSpan(
+                span_id=span_id,
+                span_index=llm_span["span_index"],
+                span_text=span_text,
+                span_start=llm_span.get("span_start", 0),
+                span_end=llm_span.get("span_end", 0),
+                profile=profile,  # type: ignore
+                urt_primary=llm_span["urt_primary"],
+                urt_secondary=llm_span.get("urt_secondary", []),
+                valence=llm_span["valence"],
+                intensity=llm_span["intensity"],
+                comparative=llm_span.get("comparative", "CR-N"),
+                specificity=llm_span.get("specificity"),
+                actionability=llm_span.get("actionability"),
+                temporal=llm_span.get("temporal"),
+                evidence=llm_span.get("evidence"),
+                entity=entity,
+                entity_type=llm_span.get("entity_type"),
+                entity_normalized=entity.lower() if entity else None,
+                relation_type=llm_span.get("relation_type"),
+                related_span_index=llm_span.get("related_span_index"),
+                confidence=llm_span.get("confidence", "medium"),
+                usn=usn,
+                is_primary=llm_span.get("is_primary", False),
+            )
+            spans.append(span)
+
+        return spans
+
+    def _ensure_primary_span(self, spans: list[ExtractedSpan]) -> list[ExtractedSpan]:
+        """
+        Make sure exactly one span carries ``is_primary``.
+
+        When the spans already contain exactly one primary they are returned
+        untouched. Otherwise every flag is cleared and a single primary is
+        chosen by, in order:
+        1. Highest intensity (I3 > I2 > I1)
+        2. Valence, negative first (V- > V± > V0 > V+)
+        3. Earliest span_index
+
+        Args:
+            spans: List of spans (modified in place)
+
+        Returns:
+            The same list, with exactly one primary (or unchanged when empty)
+        """
+        if not spans:
+            return spans
+
+        already_flagged = [s for s in spans if s.get("is_primary")]
+        if len(already_flagged) == 1:
+            return spans
+
+        # Zero or several primaries: start from a clean slate.
+        for span in spans:
+            span["is_primary"] = False
+
+        def rank(candidate: ExtractedSpan) -> tuple[int, int, int]:
+            """Lower tuples win; unknown values rank last in their dimension."""
+            return (
+                INTENSITY_PRIORITY.get(candidate["intensity"], 2),
+                VALENCE_PRIORITY.get(candidate["valence"], 3),
+                candidate["span_index"],
+            )
+
+        # min() returns the first of equally ranked spans, matching what a
+        # stable sort followed by taking element 0 would pick.
+        min(spans, key=rank)["is_primary"] = True
+
+        return spans
+
+    def _calculate_trust_score(
+        self,
+        review: ReviewToClassify,
+        spans: list[ExtractedSpan],
+    ) -> float:
+        """
+        Compute a trust score for a review.
+
+        Starts from 0.5 and adds bonuses for:
+        - Text length: +0.2 above 200 chars, +0.1 above 100, +0.05 above 50
+        - Specificity: +0.1 if any span is S3, plus up to +0.1 for the S3 share
+        - Confidence: up to +0.1 for the share of "high" confidence spans
+
+        Args:
+            review: Source review
+            spans: Classified spans
+
+        Returns:
+            Score clamped to the range [config.trust_score_floor, 1.0]
+        """
+        score = 0.5  # Base score
+
+        # Length bonus: only the highest matching tier applies.
+        length = len(review["text"])
+        for threshold, bonus in ((200, 0.2), (100, 0.1), (50, 0.05)):
+            if length > threshold:
+                score += bonus
+                break
+
+        if spans:
+            span_count = len(spans)
+
+            # Specificity bonus (up to +0.2)
+            specific = sum(1 for s in spans if s.get("specificity") == "S3")
+            if specific > 0:
+                score += 0.1 + (0.1 * min(specific / span_count, 1.0))
+
+            # Confidence bonus (up to +0.1)
+            confident = sum(1 for s in spans if s.get("confidence") == "high")
+            score += 0.1 * (confident / span_count)
+
+        # Clamp between the configured floor and 1.0
+        return max(self.config.trust_score_floor, min(1.0, score))
+
+    def _extract_staff_mentions(self, spans: list[ExtractedSpan]) -> list[str]:
+        """
+        Extract distinct staff names from spans.
+
+        Only spans whose ``entity_type`` is ``"staff"`` and whose ``entity``
+        is non-empty contribute. Names are returned in first-seen order so
+        the result is the same on every run for the same input.
+
+        Args:
+            spans: Classified spans
+
+        Returns:
+            De-duplicated staff names in order of first appearance
+        """
+        names = (
+            span["entity"]
+            for span in spans
+            if span.get("entity_type") == "staff" and span.get("entity")
+        )
+        # dict.fromkeys de-duplicates while keeping insertion order, unlike set().
+        return list(dict.fromkeys(names))
+
+    def _extract_quotes(self, spans: list[ExtractedSpan]) -> dict[str, str]:
+        """
+        Pick one representative quote per URT code.
+
+        The first span seen for each ``urt_primary`` wins; its text is cut to
+        at most 100 characters.
+        """
+        by_code: dict[str, str] = {}
+        for span in spans:
+            urt_code = span["urt_primary"]
+            if urt_code in by_code:
+                continue
+            by_code[urt_code] = span["span_text"][:100]
+        return by_code
+
+    def _generate_usn(self, span: LLMSpanResponse) -> str:
+        """
+        Generate USN (URT String Notation) for a span.
+
+        Format: URT:S:{primary}[+{sec}]:{valence_sign}{intensity_num}:{S#}{A#}{temporal}.{evidence}.{CR_suffix}
+
+        Missing, null or empty fields all fall back to the same defaults, so
+        a null value from the LLM never ends up as the text "None" in the
+        notation. At most two secondary codes are included.
+
+        Args:
+            span: LLM span to describe
+
+        Returns:
+            The USN string
+        """
+        # "or" rather than a .get() default: the key may be present with None.
+        primary = span.get("urt_primary") or "O1.01"
+        secondary = span.get("urt_secondary") or []
+        valence = span.get("valence") or "V0"
+        intensity = span.get("intensity") or "I1"
+        specificity = span.get("specificity") or "S1"
+        actionability = span.get("actionability") or "A1"
+        temporal = span.get("temporal") or "TC"
+        evidence = span.get("evidence") or "ES"
+        comparative = span.get("comparative") or "CR-N"
+
+        def level_digit(value: Any, prefix: str) -> str:
+            """Return the character after *prefix*, or "1" for malformed values."""
+            if isinstance(value, str) and len(value) > 1 and value.startswith(prefix):
+                return value[1]
+            return "1"
+
+        # Build code portion
+        code_part = primary + "".join(f"+{sec}" for sec in secondary[:2])
+
+        # Valence encoding
+        valence_map = {"V+": "+", "V-": "-", "V0": "0", "V±": "±"}
+        valence_sign = valence_map.get(valence, "0")
+
+        # Intensity and dimension digits
+        intensity_num = level_digit(intensity, "I")
+        spec_num = level_digit(specificity, "S")
+        act_num = level_digit(actionability, "A")
+
+        # CR suffix
+        cr_map = {"CR-N": "N", "CR-B": "B", "CR-W": "W", "CR-S": "S"}
+        cr_suffix = cr_map.get(comparative, "N")
+
+        return f"URT:S:{code_part}:{valence_sign}{intensity_num}:{spec_num}{act_num}{temporal}.{evidence}.{cr_suffix}"
+
+    async def _persist_classification(
+        self,
+        classified: ClassifiedReview,
+        review: ReviewToClassify,
+        batch_id: str,
+        config: dict[str, Any],
+    ) -> None:
+        """
+        Write one review's classification and its spans to the database.
+
+        Does nothing unless both the review and span repositories are set.
+        The enriched review row is updated first, then each span is inserted
+        in order.
+        """
+        review_repo, span_repo = self.review_repo, self.span_repo
+        if not (review_repo and span_repo):
+            return
+
+        model_name = self.config.llm_model
+        taxonomy_version = config["taxonomy_version"]
+
+        # Review-level classification
+        await review_repo.update_enriched_with_classification(
+            classified,
+            model_name,
+            taxonomy_version,
+        )
+
+        # Span rows, each tagged with the review's provenance
+        for span in classified.get("spans", []):
+            await span_repo.insert_span(
+                span,
+                review["business_id"],
+                review["place_id"],
+                review["source"],
+                review["review_id"],
+                review["review_version"],
+                review["review_time"],
+                batch_id,
+                model_name,
+                taxonomy_version,
+            )
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage3_route.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage3_route.py
@@ -0,0 +1,274 @@
+"""
+Stage 3: Issue Routing
+
+Route classified spans to issues (create new or aggregate to existing).
+
+Responsibilities:
+- Query unrouted V-/V± spans
+- Generate deterministic issue IDs
+- Create/update issues with span counts
+- Insert issue_spans links
+- Log events for audit trail
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from typing import TYPE_CHECKING
+
+from reviewiq_pipeline.contracts import (
+    RoutedSpan,
+    SpanToRoute,
+    Stage3Input,
+    Stage3Output,
+    Stage3Stats,
+)
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+    from reviewiq_pipeline.db.connection import DatabasePool
+    from reviewiq_pipeline.db.repositories import IssueRepository, SpanRepository
+
+logger = logging.getLogger(__name__)
+
+
+class Stage3Router:
+    """
+    Stage 3: Route negative/mixed spans to issues.
+
+    This stage:
+    1. Queries unrouted spans with V- or V± valence
+    2. Generates deterministic issue IDs from routing keys
+    3. Creates new issues or updates existing ones
+    4. Links spans to issues (1:1 mapping)
+    5. Logs events for audit trail
+    """
+
+    def __init__(
+        self,
+        config: Config,
+        db: DatabasePool | None = None,
+        span_repo: SpanRepository | None = None,
+        issue_repo: IssueRepository | None = None,
+    ):
+        """
+        Store the collaborators used by the routing stage.
+
+        Args:
+            config: Pipeline configuration
+            db: Optional database pool
+            span_repo: Optional repository for span rows
+            issue_repo: Optional repository for issues; without it routing
+                only computes ids and performs no database work
+        """
+        self.config, self.db = config, db
+        self.span_repo, self.issue_repo = span_repo, issue_repo
+
+    async def process(self, input_data: Stage3Input) -> Stage3Output:
+        """
+        Process spans through routing stage.
+
+        Only negative (V-) and mixed (V±) spans are routed. Spans with any
+        other valence, and spans that ``_route_span`` declines (it returns
+        ``None``), are counted in ``spans_skipped``, so every processed span
+        is counted as either routed or skipped. Any error while routing a
+        span is logged and re-raised, aborting the batch.
+
+        Args:
+            input_data: Stage 3 input with spans to route
+
+        Returns:
+            Stage3Output with routing results and stats
+        """
+        spans = input_data["spans"]
+        logger.info(f"Stage 3: Routing {len(spans)} spans")
+
+        routed_spans: list[RoutedSpan] = []
+        issues_created: list[str] = []
+        # Dict used as an insertion-ordered set for O(1) de-duplication.
+        issues_updated: dict[str, None] = {}
+        spans_skipped = 0
+
+        for span in spans:
+            try:
+                # Skip spans that are neither negative nor mixed
+                if span["valence"] not in ("V-", "V±"):
+                    spans_skipped += 1
+                    continue
+
+                routed = await self._route_span(span)
+                if not routed:
+                    # Declined by _route_span (e.g. span already linked)
+                    spans_skipped += 1
+                    continue
+
+                routed_spans.append(routed)
+                if routed["is_new_issue"]:
+                    issues_created.append(routed["issue_id"])
+                else:
+                    issues_updated.setdefault(routed["issue_id"], None)
+
+            except Exception as e:
+                logger.error(f"Error routing span {span['span_id']}: {e}")
+                raise
+
+        logger.info(
+            f"Stage 3 complete: {len(routed_spans)} routed, "
+            f"{len(issues_created)} issues created, "
+            f"{len(issues_updated)} issues updated"
+        )
+
+        return Stage3Output(
+            routed_spans=routed_spans,
+            issues_created=issues_created,
+            issues_updated=list(issues_updated),
+            stats=Stage3Stats(
+                spans_processed=len(spans),
+                spans_routed=len(routed_spans),
+                spans_skipped=spans_skipped,
+                issues_created=len(issues_created),
+                issues_updated=len(issues_updated),
+            ),
+        )
+
+    async def _route_span(self, span: SpanToRoute) -> RoutedSpan | None:
+        """
+        Route one span to its issue.
+
+        The issue id is derived deterministically from the span's routing
+        key. With an issue repository configured, the issue is upserted, the
+        span is linked to it and an audit event is logged; without one, only
+        the computed routing info is returned.
+
+        Args:
+            span: Span to route
+
+        Returns:
+            RoutedSpan with routing info, or None if the span is already
+            linked to an issue
+        """
+        routing_key = self._generate_routing_key(span)
+        issue_id = self._generate_issue_id(routing_key)
+        repo = self.issue_repo
+
+        if not repo:
+            # No repository: nothing to look up or store, so the issue is
+            # reported as new.
+            return RoutedSpan(
+                span_id=span["span_id"],
+                issue_id=issue_id,
+                routing_key=routing_key,
+                is_new_issue=True,
+            )
+
+        # Defensive: a span should never arrive here already linked.
+        linked_issue = await repo.check_span_already_linked(span["span_id"])
+        if linked_issue:
+            logger.warning(
+                f"Span {span['span_id']} already linked to {linked_issue}"
+            )
+            return None
+
+        # Create the issue or bump the existing one.
+        is_new_issue = await repo.upsert_issue(
+            issue_id=issue_id,
+            business_id=span["business_id"],
+            place_id=span["place_id"],
+            primary_subcode=span["urt_primary"],
+            intensity=span["intensity"],
+            entity=span.get("entity_normalized"),
+            entity_normalized=span.get("entity_normalized"),
+            taxonomy_version=self.config.taxonomy_version,
+        )
+
+        routed = RoutedSpan(
+            span_id=span["span_id"],
+            issue_id=issue_id,
+            routing_key=routing_key,
+            is_new_issue=is_new_issue,
+        )
+
+        # Link span to issue
+        await repo.link_span_to_issue(
+            routed=routed,
+            source="google",  # Assuming Google source
+            review_id="",  # Would need to be passed from span metadata
+            review_version=1,
+            intensity=span["intensity"],
+            review_time=span["review_time"],
+            is_primary_match=True,
+        )
+
+        # Audit trail entry
+        await repo.log_event(
+            issue_id=issue_id,
+            event_type="issue_created" if is_new_issue else "span_added",
+            span_id=span["span_id"],
+            metadata={
+                "urt_primary": span["urt_primary"],
+                "valence": span["valence"],
+                "intensity": span["intensity"],
+            },
+        )
+
+        return routed
+
+    def _generate_routing_key(self, span: SpanToRoute) -> str:
+        """
+        Build the routing key that identifies a span's issue.
+
+        Format: business_id|place_id|urt_primary|entity_normalized
+
+        A missing or empty ``entity_normalized`` contributes an empty last
+        segment.
+
+        Args:
+            span: Span to generate key for
+
+        Returns:
+            Routing key string
+        """
+        entity = span.get("entity_normalized") or ""
+        segments = (
+            span["business_id"],
+            span["place_id"],
+            span["urt_primary"],
+            entity,
+        )
+        return "|".join(map(str, segments))
+
+    def _generate_issue_id(self, routing_key: str) -> str:
+        """
+        Derive the deterministic issue ID for a routing key.
+
+        The ID is ``ISS-`` followed by the first 16 hex characters of the
+        SHA-256 digest of the key.
+
+        Args:
+            routing_key: Routing key string
+
+        Returns:
+            Issue ID in format ISS-{hash16}
+        """
+        digest = hashlib.sha256(routing_key.encode()).hexdigest()
+        return "ISS-" + digest[:16]
+
+    async def process_from_db(self, limit: int = 100) -> Stage3Output:
+        """
+        Fetch unrouted spans from the database and route them.
+
+        Convenience wrapper: loads up to ``limit`` rows from the span
+        repository, converts them to SpanToRoute and hands them to
+        ``process``.
+
+        Args:
+            limit: Maximum number of spans to process
+
+        Returns:
+            Stage3Output with routing results
+
+        Raises:
+            RuntimeError: If no span repository is configured
+        """
+        repo = self.span_repo
+        if not repo:
+            raise RuntimeError("SpanRepository not configured")
+
+        def to_span(row: Any) -> SpanToRoute:
+            """Map one repository row onto the routing input shape."""
+            return SpanToRoute(
+                span_id=row["span_id"],
+                business_id=row["business_id"],
+                place_id=row["place_id"],
+                urt_primary=row["urt_primary"],
+                valence=row["valence"],
+                intensity=row["intensity"],
+                entity_normalized=row.get("entity_normalized"),
+                review_time=str(row["review_time"]),
+                confidence=row["confidence"],
+                trust_score=row.get("trust_score", 0.5),
+            )
+
+        rows = await repo.get_unrouted_negative_spans(limit)
+        spans = [to_span(row) for row in rows]
+
+        return await self.process(Stage3Input(spans=spans))
+
+    def route_span_sync(self, span: SpanToRoute) -> RoutedSpan:
+        """
+        Compute routing info for a span without any database work (for testing).
+
+        Args:
+            span: Span to route
+
+        Returns:
+            RoutedSpan with routing info; ``is_new_issue`` is always True
+            because existing issues cannot be looked up here
+
+        Raises:
+            ValueError: If the span's valence is neither V- nor V±
+        """
+        valence = span["valence"]
+        if valence not in ("V-", "V±"):
+            raise ValueError(f"Cannot route positive span (valence={valence})")
+
+        key = self._generate_routing_key(span)
+        return RoutedSpan(
+            span_id=span["span_id"],
+            issue_id=self._generate_issue_id(key),
+            routing_key=key,
+            is_new_issue=True,  # Can't know without DB
+        )
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_aggregate.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_aggregate.py
@@ -0,0 +1,485 @@
+"""
+Stage 4: Fact Aggregation
+
+Pre-aggregate span/review data into fact_timeseries for fast dashboard queries.
+
+Responsibilities:
+- Aggregate spans by URT code per time bucket
+- Calculate valence/intensity distributions
+- Compute strength scores (trust-weighted)
+- UPSERT into fact_timeseries table
+"""
+
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from datetime import date, datetime, timedelta
+from typing import TYPE_CHECKING, Any
+
+from reviewiq_pipeline.contracts import (
+    FactRecord,
+    Stage4Input,
+    Stage4Output,
+    Stage4Stats,
+)
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+    from reviewiq_pipeline.db.connection import DatabasePool
+    from reviewiq_pipeline.db.repositories import FactRepository
+
+logger = logging.getLogger(__name__)
+
+
+class Stage4Aggregator:
+    """
+    Stage 4: Aggregate span data into time series facts.
+
+    This stage:
+    1. Queries span data for a business/date range
+    2. Aggregates by URT code and time bucket
+    3. Calculates valence/intensity distributions
+    4. Computes trust-weighted strength scores
+    5. UPSERTs results into fact_timeseries table
+    """
+
+    def __init__(
+        self,
+        config: Config,
+        db: DatabasePool | None = None,
+        fact_repo: FactRepository | None = None,
+    ) -> None:
+        """
+        Store the aggregator's collaborators.
+
+        Args:
+            config: Pipeline configuration (stored as-is)
+            db: Optional database pool (stored as-is)
+            fact_repo: Optional fact repository; without it ``process`` has no
+                span data to read and therefore produces no facts
+        """
+        self.config = config
+        self.db = db
+        self.fact_repo = fact_repo
+
+    async def process(self, input_data: Stage4Input) -> Stage4Output:
+        """
+        Process aggregation for a business and date.
+
+        Args:
+            input_data: Stage 4 input with aggregation parameters
+
+        Returns:
+            Stage4Output with aggregated facts and stats
+        """
+        logger.info(
+            f"Stage 4: Aggregating for business {input_data['business_id']} "
+            f"on {input_data['date']}"
+        )
+
+        facts_written: list[FactRecord] = []
+        locations_processed = 0
+        codes_aggregated = set()
+
+        # Get date range based on bucket types
+        target_date = datetime.strptime(input_data["date"], "%Y-%m-%d").date()
+
+        for bucket_type in input_data["bucket_types"]:
+            start_date, end_date = self._get_bucket_range(target_date, bucket_type)
+            period_date = self._get_period_date(target_date, bucket_type)
+
+            # Get aggregation data from database
+            if self.fact_repo:
+                span_data = await self.fact_repo.get_aggregation_data(
+                    input_data["business_id"],
+                    start_date,
+                    end_date,
+                )
+                place_ids = await self.fact_repo.get_place_ids_for_business(
+                    input_data["business_id"]
+                )
+            else:
+                span_data = []
+                place_ids = []
+
+            # Aggregate by place_id and URT code
+            for place_id in place_ids + ["ALL"]:
+                place_data = [
+                    s for s in span_data
+                    if place_id == "ALL" or s["place_id"] == place_id
+                ]
+
+                if not place_data:
+                    continue
+
+                locations_processed += 1 if place_id != "ALL" else 0
+
+                # Aggregate by URT code
+                code_facts = self._aggregate_by_code(
+                    place_data,
+                    input_data["business_id"],
+                    place_id,
+                    period_date,
+                    bucket_type,
+                    input_data["taxonomy_version"],
+                )
+
+                for fact in code_facts:
+                    facts_written.append(fact)
+                    codes_aggregated.add(fact["subject_id"])
+
+                    if self.fact_repo:
+                        await self.fact_repo.upsert_fact(fact)
+
+                # Aggregate by domain
+                domain_facts = self._aggregate_by_domain(
+                    place_data,
+                    input_data["business_id"],
+                    place_id,
+                    period_date,
+                    bucket_type,
+                    input_data["taxonomy_version"],
+                )
+
+                for fact in domain_facts:
+                    facts_written.append(fact)
+
+                    if self.fact_repo:
+                        await self.fact_repo.upsert_fact(fact)
+
+                # Overall aggregation
+                overall_fact = self._aggregate_overall(
+                    place_data,
+                    input_data["business_id"],
+                    place_id,
+                    period_date,
+                    bucket_type,
+                    input_data["taxonomy_version"],
+                )
+
+                facts_written.append(overall_fact)
+
+                if self.fact_repo:
+                    await self.fact_repo.upsert_fact(overall_fact)
+
+        logger.info(
+            f"Stage 4 complete: {len(facts_written)} facts written, "
+            f"{len(codes_aggregated)} unique codes"
+        )
+
+        return Stage4Output(
+            facts_written=facts_written,
+            stats=Stage4Stats(
+                business_id=input_data["business_id"],
+                date=input_data["date"],
+                locations_processed=locations_processed,
+                codes_aggregated=len(codes_aggregated),
+                facts_upserted=len(facts_written),
+            ),
+        )
+
+    def _get_bucket_range(
+        self,
+        target_date: date,
+        bucket_type: str,
+    ) -> tuple[date, date]:
+        """Get start and end dates for a time bucket."""
+        if bucket_type == "day":
+            return target_date, target_date
+        elif bucket_type == "week":
+            # Week starts on Monday
+            start = target_date - timedelta(days=target_date.weekday())
+            end = start + timedelta(days=6)
+            return start, end
+        elif bucket_type == "month":
+            start = target_date.replace(day=1)
+            # Get last day of month
+            if target_date.month == 12:
+                end = target_date.replace(year=target_date.year + 1, month=1, day=1) - timedelta(days=1)
+            else:
+                end = target_date.replace(month=target_date.month + 1, day=1) - timedelta(days=1)
+            return start, end
+        else:
+            raise ValueError(f"Unknown bucket type: {bucket_type}")
+
+    def _get_period_date(self, target_date: date, bucket_type: str) -> str:
+        """Get the period date string for a bucket."""
+        if bucket_type == "day":
+            return target_date.isoformat()
+        elif bucket_type == "week":
+            # Week starts on Monday
+            start = target_date - timedelta(days=target_date.weekday())
+            return start.isoformat()
+        elif bucket_type == "month":
+            return target_date.replace(day=1).isoformat()
+        else:
+            return target_date.isoformat()
+
+    def _aggregate_by_code(
+        self,
+        span_data: list[dict[str, Any]],
+        business_id: str,
+        place_id: str,
+        period_date: str,
+        bucket_type: str,
+        taxonomy_version: str,
+    ) -> list[FactRecord]:
+        """Aggregate spans by URT code."""
+        code_groups: dict[str, list[dict]] = defaultdict(list)
+
+        for span in span_data:
+            code_groups[span["urt_primary"]].append(span)
+
+        facts = []
+        for code, spans in code_groups.items():
+            fact = self._compute_fact_metrics(
+                spans,
+                business_id,
+                place_id,
+                period_date,
+                bucket_type,
+                "urt_code",
+                code,
+                taxonomy_version,
+            )
+            facts.append(fact)
+
+        return facts
+
+    def _aggregate_by_domain(
+        self,
+        span_data: list[dict[str, Any]],
+        business_id: str,
+        place_id: str,
+        period_date: str,
+        bucket_type: str,
+        taxonomy_version: str,
+    ) -> list[FactRecord]:
+        """Aggregate spans by domain (first letter of URT code)."""
+        domain_groups: dict[str, list[dict]] = defaultdict(list)
+
+        for span in span_data:
+            domain = span["urt_primary"][0]  # First letter
+            domain_groups[domain].append(span)
+
+        facts = []
+        for domain, spans in domain_groups.items():
+            fact = self._compute_fact_metrics(
+                spans,
+                business_id,
+                place_id,
+                period_date,
+                bucket_type,
+                "domain",
+                domain,
+                taxonomy_version,
+            )
+            facts.append(fact)
+
+        return facts
+
+    def _aggregate_overall(
+        self,
+        span_data: list[dict[str, Any]],
+        business_id: str,
+        place_id: str,
+        period_date: str,
+        bucket_type: str,
+        taxonomy_version: str,
+    ) -> FactRecord:
+        """Aggregate all spans for overall metrics."""
+        return self._compute_fact_metrics(
+            span_data,
+            business_id,
+            place_id,
+            period_date,
+            bucket_type,
+            "overall",
+            "all",
+            taxonomy_version,
+        )
+
+    def _compute_fact_metrics(
+        self,
+        spans: list[dict[str, Any]],
+        business_id: str,
+        place_id: str,
+        period_date: str,
+        bucket_type: str,
+        subject_type: str,
+        subject_id: str,
+        taxonomy_version: str,
+    ) -> FactRecord:
+        """
+        Compute aggregated metrics for a group of spans.
+
+        Args:
+            spans: List of span data
+            business_id: Business identifier
+            place_id: Place ID or 'ALL'
+            period_date: Period date string
+            bucket_type: day/week/month
+            subject_type: overall/urt_code/domain/issue
+            subject_id: Subject identifier
+            taxonomy_version: Taxonomy version
+
+        Returns:
+            FactRecord with computed metrics
+        """
+        if not spans:
+            return self._empty_fact(
+                business_id, place_id, period_date, bucket_type,
+                subject_type, subject_id, taxonomy_version,
+            )
+
+        # Count unique reviews
+        review_ids = set()
+        for span in spans:
+            # Assuming span has review_id in metadata
+            review_id = span.get("review_id", span.get("span_id", ""))
+            review_ids.add(review_id)
+
+        span_count = len(spans)
+        review_count = len(review_ids) if review_ids else span_count
+
+        # Valence counts
+        negative_count = sum(1 for s in spans if s["valence"] == "V-")
+        positive_count = sum(1 for s in spans if s["valence"] == "V+")
+        neutral_count = sum(1 for s in spans if s["valence"] == "V0")
+        mixed_count = sum(1 for s in spans if s["valence"] == "V±")
+
+        # Intensity counts
+        i1_count = sum(1 for s in spans if s["intensity"] == "I1")
+        i2_count = sum(1 for s in spans if s["intensity"] == "I2")
+        i3_count = sum(1 for s in spans if s["intensity"] == "I3")
+
+        # Comparative counts
+        cr_better = sum(1 for s in spans if s.get("comparative") == "CR-B")
+        cr_worse = sum(1 for s in spans if s.get("comparative") == "CR-W")
+        cr_same = sum(1 for s in spans if s.get("comparative") == "CR-S")
+
+        # Calculate strength scores
+        strength_score = self._compute_strength_score(spans)
+        negative_strength = self._compute_strength_score(
+            [s for s in spans if s["valence"] in ("V-", "V±")]
+        )
+        positive_strength = self._compute_strength_score(
+            [s for s in spans if s["valence"] == "V+"]
+        )
+
+        # Trust-weighted scores
+        trust_weighted_strength = self._compute_trust_weighted_strength(spans)
+        trust_weighted_negative = self._compute_trust_weighted_strength(
+            [s for s in spans if s["valence"] in ("V-", "V±")]
+        )
+
+        # Average rating
+        ratings = [s["rating"] for s in spans if s.get("rating")]
+        avg_rating = sum(ratings) / len(ratings) if ratings else None
+
+        return FactRecord(
+            business_id=business_id,
+            place_id=place_id,
+            period_date=period_date,
+            bucket_type=bucket_type,
+            subject_type=subject_type,  # type: ignore
+            subject_id=subject_id,
+            taxonomy_version=taxonomy_version,
+            review_count=review_count,
+            span_count=span_count,
+            negative_count=negative_count,
+            positive_count=positive_count,
+            neutral_count=neutral_count,
+            mixed_count=mixed_count,
+            strength_score=strength_score,
+            negative_strength=negative_strength,
+            positive_strength=positive_strength,
+            avg_rating=avg_rating,
+            i1_count=i1_count,
+            i2_count=i2_count,
+            i3_count=i3_count,
+            cr_better=cr_better,
+            cr_worse=cr_worse,
+            cr_same=cr_same,
+            trust_weighted_strength=trust_weighted_strength,
+            trust_weighted_negative=trust_weighted_negative,
+        )
+
+    def _compute_strength_score(self, spans: list[dict[str, Any]]) -> float:
+        """
+        Compute strength score from intensity distribution.
+
+        Score: sum of (intensity_weight * valence_multiplier)
+        I1=1, I2=2, I3=4 (exponential)
+        V-=1, V±=0.5, V0=0, V+=1
+        """
+        if not spans:
+            return 0.0
+
+        intensity_weights = {"I1": 1, "I2": 2, "I3": 4}
+        valence_multipliers = {"V-": 1.0, "V±": 0.5, "V0": 0.0, "V+": 1.0}
+
+        total = 0.0
+        for span in spans:
+            intensity = span.get("intensity", "I1")
+            valence = span.get("valence", "V0")
+            weight = intensity_weights.get(intensity, 1)
+            multiplier = valence_multipliers.get(valence, 0)
+            total += weight * multiplier
+
+        return total
+
+    def _compute_trust_weighted_strength(self, spans: list[dict[str, Any]]) -> float:
+        """
+        Compute trust-weighted strength score.
+
+        Similar to strength score but weighted by trust_score.
+        """
+        if not spans:
+            return 0.0
+
+        intensity_weights = {"I1": 1, "I2": 2, "I3": 4}
+        valence_multipliers = {"V-": 1.0, "V±": 0.5, "V0": 0.0, "V+": 1.0}
+
+        total = 0.0
+        for span in spans:
+            intensity = span.get("intensity", "I1")
+            valence = span.get("valence", "V0")
+            trust = span.get("trust_score", 0.5)
+
+            weight = intensity_weights.get(intensity, 1)
+            multiplier = valence_multipliers.get(valence, 0)
+            total += weight * multiplier * trust
+
+        return total
+
+    def _empty_fact(
+        self,
+        business_id: str,
+        place_id: str,
+        period_date: str,
+        bucket_type: str,
+        subject_type: str,
+        subject_id: str,
+        taxonomy_version: str,
+    ) -> FactRecord:
+        """Create an empty fact record with zero counts."""
+        return FactRecord(
+            business_id=business_id,
+            place_id=place_id,
+            period_date=period_date,
+            bucket_type=bucket_type,
+            subject_type=subject_type,  # type: ignore
+            subject_id=subject_id,
+            taxonomy_version=taxonomy_version,
+            review_count=0,
+            span_count=0,
+            negative_count=0,
+            positive_count=0,
+            neutral_count=0,
+            mixed_count=0,
+            strength_score=0.0,
+            negative_strength=0.0,
+            positive_strength=0.0,
+            avg_rating=None,
+            i1_count=0,
+            i2_count=0,
+            i3_count=0,
+            cr_better=0,
+            cr_worse=0,
+            cr_same=0,
+            trust_weighted_strength=0.0,
+            trust_weighted_negative=0.0,
+        )
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/init.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/init.py
@@ -0,0 +1,23 @@
+"""Validation rules for pipeline stages."""
+
+from reviewiq_pipeline.validation.validators import (
+    Stage1Validator,
+    Stage2Validator,
+    Stage3Validator,
+    Stage4Validator,
+    validate_stage1_output,
+    validate_stage2_output,
+    validate_stage3_output,
+    validate_stage4_output,
+)
+
+__all__ = [
+    "Stage1Validator",
+    "Stage2Validator",
+    "Stage3Validator",
+    "Stage4Validator",
+    "validate_stage1_output",
+    "validate_stage2_output",
+    "validate_stage3_output",
+    "validate_stage4_output",
+]
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/validators.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/validators.py
@@ -0,0 +1,506 @@
+"""
+Validation rules for pipeline stages.
+
+Implements validation rules V1.x, V2.x, V3.x, V4.x from the contracts.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING, Any, Callable
+
+from reviewiq_pipeline.contracts import (
+    ValidationError,
+    ValidationResult,
+)
+from reviewiq_pipeline.services.text_processor import is_valid_iso639, is_valid_sha256
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.contracts import (
+        FactRecord,
+        NormalizedReview,
+        RoutedSpan,
+        Stage1Output,
+        Stage2Output,
+        Stage3Output,
+        Stage4Output,
+    )
+    from reviewiq_pipeline.db.connection import DatabasePool
+
+# URT code pattern
+URT_CODE_PATTERN = re.compile(r"^[OPJEAVR][1-4]\.[0-9]{2}$")
+
+# Issue ID pattern
+ISSUE_ID_PATTERN = re.compile(r"^ISS-[a-f0-9]{16}$")
+
+# Valid enum values
+VALID_VALENCES = {"V+", "V-", "V0", "V±"}
+VALID_INTENSITIES = {"I1", "I2", "I3"}
+VALID_SPECIFICITIES = {"S1", "S2", "S3"}
+VALID_ACTIONABILITIES = {"A1", "A2", "A3"}
+VALID_TEMPORALS = {"TC", "TR", "TH", "TF"}
+VALID_EVIDENCES = {"ES", "EI", "EC"}
+VALID_COMPARATIVES = {"CR-N", "CR-B", "CR-W", "CR-S"}
+
+
+def _has_control_chars(text: str) -> bool:
+    """Check if text contains control characters."""
+    return bool(re.search(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", text))
+
+
+class Stage1Validator:
+    """Validator for Stage 1 output."""
+
+    def validate(self, output: Stage1Output) -> ValidationResult:
+        """
+        Validate Stage 1 output.
+
+        Rules:
+        - V1.1: text is non-empty string
+        - V1.2: text_normalized contains no control chars
+        - V1.3: content_hash is 64-char hex
+        - V1.4: review_version >= 1
+        - V1.5: text_language is valid ISO 639-1
+        - V1.6: raw_id references valid reviews_raw row (requires DB)
+        """
+        errors: list[ValidationError] = []
+
+        for review in output["reviews_normalized"]:
+            review_id = review["review_id"]
+
+            # V1.1: Non-empty text
+            if not review.get("text") or not review["text"].strip():
+                errors.append(ValidationError(
+                    rule="V1.1",
+                    identifier=review_id,
+                    message="Empty text",
+                ))
+
+            # V1.2: No control characters in normalized text
+            if review.get("text_normalized") and _has_control_chars(review["text_normalized"]):
+                errors.append(ValidationError(
+                    rule="V1.2",
+                    identifier=review_id,
+                    message="Control chars in normalized text",
+                ))
+
+            # V1.3: Valid content hash
+            if not is_valid_sha256(review.get("content_hash", "")):
+                errors.append(ValidationError(
+                    rule="V1.3",
+                    identifier=review_id,
+                    message=f"Invalid content hash: {review.get('content_hash', '')[:20]}...",
+                ))
+
+            # V1.4: Version >= 1
+            if review.get("review_version", 0) < 1:
+                errors.append(ValidationError(
+                    rule="V1.4",
+                    identifier=review_id,
+                    message=f"Invalid version: {review.get('review_version')}",
+                ))
+
+            # V1.5: Valid language code
+            if not is_valid_iso639(review.get("text_language", "")):
+                errors.append(ValidationError(
+                    rule="V1.5",
+                    identifier=review_id,
+                    message=f"Invalid language: {review.get('text_language')}",
+                ))
+
+        return ValidationResult(
+            stage="stage1",
+            passed=len(errors) == 0,
+            error_count=len(errors),
+            errors=errors,
+        )
+
+
+class Stage2Validator:
+    """Validator for Stage 2 output."""
+
+    def validate(
+        self,
+        output: Stage2Output,
+        input_reviews: dict[tuple[str, str, int], dict[str, Any]] | None = None,
+    ) -> ValidationResult:
+        """
+        Validate Stage 2 output.
+
+        Rules:
+        - V2.1: urt_primary matches pattern
+        - V2.2: urt_secondary has max 2 elements
+        - V2.3: valence is valid enum
+        - V2.4: intensity is valid enum
+        - V2.5: span_end > span_start
+        - V2.6: span_text matches text[span_start:span_end]
+        - V2.7: spans do not overlap
+        - V2.8: exactly one is_primary per review
+        - V2.9: trust_score between 0.2 and 1.0
+        - V2.10: embedding is 384-dim array
+        - V2.11: usn matches profile-specific regex
+        - V2.12: related_span_index references valid span
+
+        Args:
+            output: Stage 2 output to validate
+            input_reviews: Optional dict mapping (source, review_id, version) -> review data
+        """
+        errors: list[ValidationError] = []
+
+        for review in output["reviews_classified"]:
+            review_id = review["review_id"]
+
+            # V2.1: Valid URT code
+            if not URT_CODE_PATTERN.match(review.get("urt_primary", "")):
+                errors.append(ValidationError(
+                    rule="V2.1",
+                    identifier=review_id,
+                    message=f"Invalid URT code: {review.get('urt_primary')}",
+                ))
+
+            # V2.2: Max 2 secondary codes
+            if len(review.get("urt_secondary", [])) > 2:
+                errors.append(ValidationError(
+                    rule="V2.2",
+                    identifier=review_id,
+                    message=f"Too many secondary codes: {len(review.get('urt_secondary', []))}",
+                ))
+
+            # V2.3: Valid valence
+            if review.get("valence") not in VALID_VALENCES:
+                errors.append(ValidationError(
+                    rule="V2.3",
+                    identifier=review_id,
+                    message=f"Invalid valence: {review.get('valence')}",
+                ))
+
+            # V2.4: Valid intensity
+            if review.get("intensity") not in VALID_INTENSITIES:
+                errors.append(ValidationError(
+                    rule="V2.4",
+                    identifier=review_id,
+                    message=f"Invalid intensity: {review.get('intensity')}",
+                ))
+
+            # V2.9: Trust score bounds
+            trust = review.get("trust_score", 0)
+            if not (0.2 <= trust <= 1.0):
+                errors.append(ValidationError(
+                    rule="V2.9",
+                    identifier=review_id,
+                    message=f"Trust score out of bounds: {trust}",
+                ))
+
+            # V2.10: Embedding dimension
+            embedding = review.get("embedding", [])
+            if embedding and len(embedding) != 384:
+                errors.append(ValidationError(
+                    rule="V2.10",
+                    identifier=review_id,
+                    message=f"Invalid embedding dimension: {len(embedding)}",
+                ))
+
+            # Validate spans
+            spans = review.get("spans", [])
+            primary_count = 0
+            span_ranges: list[tuple[int, int]] = []
+
+            # Get original text if available
+            original_text = ""
+            if input_reviews:
+                key = (review["source"], review["review_id"], review["review_version"])
+                original_text = input_reviews.get(key, {}).get("text", "")
+
+            for span in spans:
+                span_id = span.get("span_id", f"{review_id}:span")
+
+                # V2.5: Valid bounds
+                start = span.get("span_start", 0)
+                end = span.get("span_end", 0)
+                if end <= start:
+                    errors.append(ValidationError(
+                        rule="V2.5",
+                        identifier=span_id,
+                        message=f"Invalid bounds: {start}:{end}",
+                    ))
+
+                # V2.6: Text matches (if we have original)
+                if original_text and span.get("span_text"):
+                    expected = original_text[start:end]
+                    # Allow whitespace normalization
+                    expected_norm = " ".join(expected.split())
+                    actual_norm = " ".join(span["span_text"].split())
+                    if expected_norm != actual_norm:
+                        errors.append(ValidationError(
+                            rule="V2.6",
+                            identifier=span_id,
+                            message=f"Text mismatch at {start}:{end}",
+                        ))
+
+                # V2.7: Check overlap
+                for prev_start, prev_end in span_ranges:
+                    if not (end <= prev_start or start >= prev_end):
+                        errors.append(ValidationError(
+                            rule="V2.7",
+                            identifier=span_id,
+                            message="Overlapping span",
+                        ))
+                        break
+                span_ranges.append((start, end))
+
+                # V2.8: Count primaries
+                if span.get("is_primary"):
+                    primary_count += 1
+
+                # V2.12: Valid related_span_index
+                related_idx = span.get("related_span_index")
+                if related_idx is not None:
+                    if related_idx < 0 or related_idx >= len(spans):
+                        errors.append(ValidationError(
+                            rule="V2.12",
+                            identifier=span_id,
+                            message=f"Invalid related_span_index: {related_idx}",
+                        ))
+                    elif related_idx == span.get("span_index"):
+                        errors.append(ValidationError(
+                            rule="V2.12",
+                            identifier=span_id,
+                            message="Self-referencing span",
+                        ))
+
+            # V2.8: Exactly one primary
+            if primary_count != 1:
+                errors.append(ValidationError(
+                    rule="V2.8",
+                    identifier=review_id,
+                    message=f"Primary span count: {primary_count}",
+                ))
+
+        return ValidationResult(
+            stage="stage2",
+            passed=len(errors) == 0,
+            error_count=len(errors),
+            errors=errors,
+        )
+
+
+class Stage3Validator:
+    """Validator for Stage 3 output."""
+
+    def __init__(self, db: DatabasePool | None = None):
+        self.db = db
+
+    async def validate(self, output: Stage3Output) -> ValidationResult:
+        """
+        Validate Stage 3 output.
+
+        Rules:
+        - V3.1: issue_id matches pattern
+        - V3.2: routing_key is non-empty
+        - V3.3: span not already linked to different issue
+        - V3.4: issue exists in issues table
+        - V3.5: only V-/V± spans create issues
+        """
+        errors: list[ValidationError] = []
+
+        for routed in output["routed_spans"]:
+            span_id = routed["span_id"]
+
+            # V3.1: Valid issue ID format
+            if not ISSUE_ID_PATTERN.match(routed.get("issue_id", "")):
+                errors.append(ValidationError(
+                    rule="V3.1",
+                    identifier=span_id,
+                    message=f"Invalid issue_id: {routed.get('issue_id')}",
+                ))
+
+            # V3.2: Non-empty routing key
+            if not routed.get("routing_key"):
+                errors.append(ValidationError(
+                    rule="V3.2",
+                    identifier=span_id,
+                    message="Empty routing key",
+                ))
+
+            # V3.3, V3.4: Require database for these checks
+            if self.db:
+                # V3.3: Check no duplicate routing
+                existing = await self.db.fetchval(
+                    "SELECT issue_id FROM issue_spans WHERE span_id = $1",
+                    span_id,
+                )
+                if existing and existing != routed["issue_id"]:
+                    errors.append(ValidationError(
+                        rule="V3.3",
+                        identifier=span_id,
+                        message=f"Already routed to {existing}",
+                    ))
+
+                # V3.4: Issue exists
+                issue_exists = await self.db.fetchval(
+                    "SELECT 1 FROM issues WHERE issue_id = $1",
+                    routed["issue_id"],
+                )
+                if not issue_exists:
+                    errors.append(ValidationError(
+                        rule="V3.4",
+                        identifier=span_id,
+                        message=f"Issue not found: {routed['issue_id']}",
+                    ))
+
+        return ValidationResult(
+            stage="stage3",
+            passed=len(errors) == 0,
+            error_count=len(errors),
+            errors=errors,
+        )
+
+    def validate_sync(self, output: Stage3Output) -> ValidationResult:
+        """Synchronous validation without database checks."""
+        errors: list[ValidationError] = []
+
+        for routed in output["routed_spans"]:
+            span_id = routed["span_id"]
+
+            # V3.1: Valid issue ID format
+            if not ISSUE_ID_PATTERN.match(routed.get("issue_id", "")):
+                errors.append(ValidationError(
+                    rule="V3.1",
+                    identifier=span_id,
+                    message=f"Invalid issue_id: {routed.get('issue_id')}",
+                ))
+
+            # V3.2: Non-empty routing key
+            if not routed.get("routing_key"):
+                errors.append(ValidationError(
+                    rule="V3.2",
+                    identifier=span_id,
+                    message="Empty routing key",
+                ))
+
+        return ValidationResult(
+            stage="stage3",
+            passed=len(errors) == 0,
+            error_count=len(errors),
+            errors=errors,
+        )
+
+
+class Stage4Validator:
+    """Validator for Stage 4 output."""
+
+    def validate(self, output: Stage4Output) -> ValidationResult:
+        """
+        Validate Stage 4 output.
+
+        Rules:
+        - V4.1: place_id is valid or 'ALL'
+        - V4.2: period_date matches bucket
+        - V4.3: span_count >= review_count
+        - V4.4: valence counts sum to span_count
+        - V4.5: intensity counts sum to span_count
+        - V4.6: strength_score >= 0
+        - V4.7: avg_rating between 1.0 and 5.0 (or NULL)
+        """
+        errors: list[ValidationError] = []
+
+        for fact in output["facts_written"]:
+            fact_id = f"{fact['subject_type']}:{fact['subject_id']}"
+
+            # V4.1: Valid place_id
+            place_id = fact.get("place_id", "")
+            if not place_id:
+                errors.append(ValidationError(
+                    rule="V4.1",
+                    identifier=fact_id,
+                    message="Empty place_id",
+                ))
+
+            # V4.3: span_count >= review_count
+            if fact.get("span_count", 0) < fact.get("review_count", 0):
+                errors.append(ValidationError(
+                    rule="V4.3",
+                    identifier=fact_id,
+                    message=f"span_count ({fact.get('span_count')}) < review_count ({fact.get('review_count')})",
+                ))
+
+            # V4.4: Valence sum
+            valence_sum = (
+                fact.get("negative_count", 0) +
+                fact.get("positive_count", 0) +
+                fact.get("neutral_count", 0) +
+                fact.get("mixed_count", 0)
+            )
+            if valence_sum != fact.get("span_count", 0):
+                errors.append(ValidationError(
+                    rule="V4.4",
+                    identifier=fact_id,
+                    message=f"Valence sum {valence_sum} != span_count {fact.get('span_count')}",
+                ))
+
+            # V4.5: Intensity sum
+            intensity_sum = (
+                fact.get("i1_count", 0) +
+                fact.get("i2_count", 0) +
+                fact.get("i3_count", 0)
+            )
+            if intensity_sum != fact.get("span_count", 0):
+                errors.append(ValidationError(
+                    rule="V4.5",
+                    identifier=fact_id,
+                    message=f"Intensity sum {intensity_sum} != span_count {fact.get('span_count')}",
+                ))
+
+            # V4.6: Non-negative strength
+            if fact.get("strength_score", 0) < 0:
+                errors.append(ValidationError(
+                    rule="V4.6",
+                    identifier=fact_id,
+                    message=f"Negative strength_score: {fact.get('strength_score')}",
+                ))
+
+            # V4.7: Rating bounds
+            avg_rating = fact.get("avg_rating")
+            if avg_rating is not None and not (1.0 <= avg_rating <= 5.0):
+                errors.append(ValidationError(
+                    rule="V4.7",
+                    identifier=fact_id,
+                    message=f"Invalid avg_rating: {avg_rating}",
+                ))
+
+        return ValidationResult(
+            stage="stage4",
+            passed=len(errors) == 0,
+            error_count=len(errors),
+            errors=errors,
+        )
+
+
+# Convenience functions
+def validate_stage1_output(output: Stage1Output) -> ValidationResult:
+    """Validate Stage 1 output."""
+    return Stage1Validator().validate(output)
+
+
+def validate_stage2_output(
+    output: Stage2Output,
+    input_reviews: dict[tuple[str, str, int], dict[str, Any]] | None = None,
+) -> ValidationResult:
+    """Validate Stage 2 output."""
+    return Stage2Validator().validate(output, input_reviews)
+
+
+async def validate_stage3_output(
+    output: Stage3Output,
+    db: DatabasePool | None = None,
+) -> ValidationResult:
+    """Validate Stage 3 output."""
+    validator = Stage3Validator(db)
+    if db:
+        return await validator.validate(output)
+    return validator.validate_sync(output)
+
+
+def validate_stage4_output(output: Stage4Output) -> ValidationResult:
+    """Validate Stage 4 output."""
+    return Stage4Validator().validate(output)