From 7d720f5378f544bbc230334d6af761d5e6397ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Sat, 24 Jan 2026 18:07:11 +0000 Subject: [PATCH] feat: Add reviewiq-pipeline package for LLM-powered review classification Implement a standalone Python package for processing customer reviews through a 4-stage pipeline using URT (Universal Review Taxonomy) v5.1: - Stage 1: Normalization (text cleaning, language detection, deduplication) - Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes) - Stage 3: Issue Routing (deterministic issue ID generation, span linking) - Stage 4: Fact Aggregation (time series metrics for dashboards) Package includes: - TypedDict contracts matching Pipeline-Contracts-v1.md - Async database layer with asyncpg and 5 SQL migrations - LLM client abstraction supporting both OpenAI and Anthropic - Sentence-transformers integration for embeddings - Validation rules V1.x through V4.x - CLI commands: migrate, run, validate, check - 55 unit and integration tests (all passing) Co-Authored-By: Claude Opus 4.5 --- packages/reviewiq-pipeline/README.md | 97 +++ packages/reviewiq-pipeline/pyproject.toml | 75 ++ .../src/reviewiq_pipeline/__init__.py | 56 ++ .../src/reviewiq_pipeline/cli.py | 322 +++++++++ .../src/reviewiq_pipeline/config.py | 177 +++++ .../src/reviewiq_pipeline/contracts.py | 648 ++++++++++++++++++ .../src/reviewiq_pipeline/db/__init__.py | 17 + .../src/reviewiq_pipeline/db/connection.py | 157 +++++ .../migrations/001_create_reviews_tables.sql | 80 +++ .../db/migrations/002_create_spans_table.sql | 84 +++ .../db/migrations/003_create_urt_enums.sql | 111 +++ .../migrations/004_create_issues_tables.sql | 96 +++ .../db/migrations/005_create_facts_table.sql | 97 +++ .../src/reviewiq_pipeline/db/repositories.py | 562 +++++++++++++++ .../src/reviewiq_pipeline/pipeline.py | 402 +++++++++++ .../reviewiq_pipeline/services/__init__.py | 11 + 
.../reviewiq_pipeline/services/embeddings.py | 225 ++++++ .../reviewiq_pipeline/services/llm_client.py | 432 ++++++++++++ .../services/text_processor.py | 262 +++++++ .../src/reviewiq_pipeline/stages/__init__.py | 13 + .../stages/stage1_normalize.py | 247 +++++++ .../stages/stage2_classify.py | 539 +++++++++++++++ .../reviewiq_pipeline/stages/stage3_route.py | 274 ++++++++ .../stages/stage4_aggregate.py | 485 +++++++++++++ .../reviewiq_pipeline/validation/__init__.py | 23 + .../validation/validators.py | 506 ++++++++++++++ packages/reviewiq-pipeline/tests/__init__.py | 1 + packages/reviewiq-pipeline/tests/conftest.py | 269 ++++++++ .../tests/integration/__init__.py | 1 + .../tests/integration/test_e2e.py | 179 +++++ .../reviewiq-pipeline/tests/test_stage1.py | 218 ++++++ .../reviewiq-pipeline/tests/test_stage2.py | 193 ++++++ .../reviewiq-pipeline/tests/test_stage3.py | 162 +++++ .../reviewiq-pipeline/tests/test_stage4.py | 201 ++++++ 34 files changed, 7222 insertions(+) create mode 100644 packages/reviewiq-pipeline/README.md create mode 100644 packages/reviewiq-pipeline/pyproject.toml create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/__init__.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/db/__init__.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/db/connection.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/001_create_reviews_tables.sql create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/002_create_spans_table.sql create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/003_create_urt_enums.sql create mode 100644 
packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/004_create_issues_tables.sql create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/005_create_facts_table.sql create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/db/repositories.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/services/__init__.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/services/embeddings.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/services/llm_client.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/services/text_processor.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/__init__.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage1_normalize.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage2_classify.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage3_route.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_aggregate.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/__init__.py create mode 100644 packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/validators.py create mode 100644 packages/reviewiq-pipeline/tests/__init__.py create mode 100644 packages/reviewiq-pipeline/tests/conftest.py create mode 100644 packages/reviewiq-pipeline/tests/integration/__init__.py create mode 100644 packages/reviewiq-pipeline/tests/integration/test_e2e.py create mode 100644 packages/reviewiq-pipeline/tests/test_stage1.py create mode 100644 packages/reviewiq-pipeline/tests/test_stage2.py create mode 100644 packages/reviewiq-pipeline/tests/test_stage3.py create mode 100644 packages/reviewiq-pipeline/tests/test_stage4.py diff --git a/packages/reviewiq-pipeline/README.md 
b/packages/reviewiq-pipeline/README.md new file mode 100644 index 0000000..1cacf0f --- /dev/null +++ b/packages/reviewiq-pipeline/README.md @@ -0,0 +1,97 @@ +# ReviewIQ Pipeline + +LLM-powered review classification and analysis pipeline using URT (Universal Review Taxonomy) v5.1. + +## Features + +- **Stage 1: Normalization** - Text cleaning, language detection, deduplication +- **Stage 2: LLM Classification** - Span extraction with URT codes using OpenAI/Anthropic +- **Stage 3: Issue Routing** - Route negative spans to issues for tracking +- **Stage 4: Fact Aggregation** - Pre-aggregate metrics for dashboard queries + +## Installation + +```bash +pip install reviewiq-pipeline +``` + +Or install from source: + +```bash +pip install -e packages/reviewiq-pipeline +``` + +## Quick Start + +### Python API + +```python +from reviewiq_pipeline import Pipeline, Config + +# Initialize +config = Config( + database_url="postgresql://...", + llm_provider="openai", + openai_api_key="sk-...", + taxonomy_version="v5.1" +) +pipeline = Pipeline(config) + +# Run full pipeline +result = await pipeline.process(scraper_output) + +# Or run individual stages +stage1_result = await pipeline.normalize(scraper_output) +stage2_result = await pipeline.classify(stage1_result) +stage3_result = await pipeline.route(stage2_result) +stage4_result = await pipeline.aggregate(business_id, date) + +# Validate +validation = await pipeline.validate(job_id) +``` + +### CLI + +```bash +# Run migrations +reviewiq-pipeline migrate --database-url $DATABASE_URL + +# Process a job +reviewiq-pipeline run --job-id JOB_ID --stages 1,2,3,4 + +# Validate pipeline output +reviewiq-pipeline validate --job-id JOB_ID +``` + +## Configuration + +Environment variables: + +- `DATABASE_URL` - PostgreSQL connection string +- `LLM_PROVIDER` - `openai` or `anthropic` +- `OPENAI_API_KEY` - OpenAI API key (if using OpenAI) +- `ANTHROPIC_API_KEY` - Anthropic API key (if using Anthropic) +- `REVIEWIQ_TAXONOMY_VERSION` - URT taxonomy version (default: 
`v5.1`) + +## Development + +```bash +# Install with dev dependencies +pip install -e "packages/reviewiq-pipeline[dev]" + +# Run tests +pytest + +# Run with coverage +pytest --cov=reviewiq_pipeline + +# Type checking +mypy src/reviewiq_pipeline + +# Linting +ruff check src/reviewiq_pipeline +``` + +## License + +MIT diff --git a/packages/reviewiq-pipeline/pyproject.toml b/packages/reviewiq-pipeline/pyproject.toml new file mode 100644 index 0000000..ff531e8 --- /dev/null +++ b/packages/reviewiq-pipeline/pyproject.toml @@ -0,0 +1,75 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "reviewiq-pipeline" +version = "0.1.0" +description = "ReviewIQ Pipeline - LLM-powered review classification and analysis" +readme = "README.md" +license = "MIT" +requires-python = ">=3.11" +authors = [ + { name = "ReviewIQ Team" } +] +keywords = ["reviews", "nlp", "classification", "llm", "pipeline"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + "asyncpg>=0.28.0", + "pydantic>=2.0", + "pydantic-settings>=2.0", + "langdetect>=1.0.9", + "sentence-transformers>=2.2.0", + "openai>=1.0.0", + "anthropic>=0.18.0", + "click>=8.0.0", + "python-dotenv>=1.0.0", + "aiofiles>=23.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "pytest-asyncio>=0.21.0", + "pytest-cov>=4.0", + "ruff>=0.1.0", + "mypy>=1.0", +] + +[project.scripts] +reviewiq-pipeline = "reviewiq_pipeline.cli:main" + +[project.urls] +Homepage = "https://github.com/reviewiq/reviewiq-pipeline" +Documentation = "https://github.com/reviewiq/reviewiq-pipeline#readme" +Repository = "https://github.com/reviewiq/reviewiq-pipeline" + +[tool.hatch.build.targets.wheel] +packages = ["src/reviewiq_pipeline"] + +[tool.pytest.ini_options] 
+asyncio_mode = "auto" +testpaths = ["tests"] +pythonpath = ["src"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP"] +ignore = ["E501"] + +[tool.mypy] +python_version = "3.11" +strict = true +warn_return_any = true +warn_unused_ignores = true diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/__init__.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/__init__.py new file mode 100644 index 0000000..f5ac699 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/__init__.py @@ -0,0 +1,56 @@ +""" +ReviewIQ Pipeline - LLM-powered review classification and analysis. + +This package provides a complete pipeline for processing customer reviews: +- Stage 1: Normalization (text cleaning, language detection, deduplication) +- Stage 2: LLM Classification (span extraction with URT codes) +- Stage 3: Issue Routing (route negative spans to issues) +- Stage 4: Fact Aggregation (pre-aggregate metrics for dashboards) +""" + +from reviewiq_pipeline.config import Config +from reviewiq_pipeline.contracts import ( + ClassifiedReview, + ExtractedSpan, + FactRecord, + NormalizedReview, + RawReview, + RoutedSpan, + ScraperOutput, + Stage1Input, + Stage1Output, + Stage2Input, + Stage2Output, + Stage3Input, + Stage3Output, + Stage4Input, + Stage4Output, + ValidationError, + ValidationResult, +) +from reviewiq_pipeline.pipeline import Pipeline + +__version__ = "0.1.0" +__all__ = [ + # Main API + "Pipeline", + "Config", + # Contracts + "ScraperOutput", + "RawReview", + "Stage1Input", + "Stage1Output", + "NormalizedReview", + "Stage2Input", + "Stage2Output", + "ClassifiedReview", + "ExtractedSpan", + "Stage3Input", + "Stage3Output", + "RoutedSpan", + "Stage4Input", + "Stage4Output", + "FactRecord", + "ValidationResult", + "ValidationError", +] diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py new file mode 100644 index 
0000000..e352b06 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py @@ -0,0 +1,322 @@ +""" +CLI for the ReviewIQ pipeline. + +Usage: + reviewiq-pipeline migrate --database-url $DATABASE_URL + reviewiq-pipeline run --job-id --stages 1,2,3,4 + reviewiq-pipeline validate --job-id +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import sys +from typing import Any + +import click + +from reviewiq_pipeline import __version__ + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger("reviewiq_pipeline") + + +def get_config(**overrides: Any): + """Get configuration with optional overrides.""" + from reviewiq_pipeline.config import Config + + return Config(**{k: v for k, v in overrides.items() if v is not None}) + + +@click.group() +@click.version_option(version=__version__) +def main(): + """ReviewIQ Pipeline - LLM-powered review classification.""" + pass + + +@main.command() +@click.option( + "--database-url", + envvar="DATABASE_URL", + required=True, + help="PostgreSQL connection string", +) +def migrate(database_url: str): + """Run database migrations.""" + + async def _migrate(): + from reviewiq_pipeline.db.connection import DatabasePool + + config = get_config(database_url=database_url) + db = DatabasePool(config) + + try: + await db.initialize() + count = await db.run_migrations() + click.echo(f"Successfully ran {count} migrations") + except Exception as e: + click.echo(f"Migration failed: {e}", err=True) + sys.exit(1) + finally: + await db.close() + + asyncio.run(_migrate()) + + +@main.command() +@click.option( + "--job-id", + required=True, + help="Job ID to process", +) +@click.option( + "--stages", + default="1,2,3,4", + help="Comma-separated list of stages to run (default: 1,2,3,4)", +) +@click.option( + "--database-url", + envvar="DATABASE_URL", + required=True, + help="PostgreSQL connection 
string", +) +@click.option( + "--llm-provider", + envvar="LLM_PROVIDER", + type=click.Choice(["openai", "anthropic"]), + default="openai", + help="LLM provider", +) +@click.option( + "--llm-model", + envvar="LLM_MODEL", + default="gpt-4o-mini", + help="LLM model to use", +) +@click.option( + "--openai-api-key", + envvar="OPENAI_API_KEY", + help="OpenAI API key", +) +@click.option( + "--anthropic-api-key", + envvar="ANTHROPIC_API_KEY", + help="Anthropic API key", +) +@click.option( + "--validate/--no-validate", + default=True, + help="Validate output after each stage", +) +@click.option( + "--output", + type=click.Path(), + help="Output file for results (JSON)", +) +def run( + job_id: str, + stages: str, + database_url: str, + llm_provider: str, + llm_model: str, + openai_api_key: str | None, + anthropic_api_key: str | None, + validate: bool, + output: str | None, +): + """Run pipeline stages for a job.""" + + async def _run(): + from reviewiq_pipeline import Pipeline + + # Parse stages + stage_list = [int(s.strip()) for s in stages.split(",") if s.strip()] + + config = get_config( + database_url=database_url, + llm_provider=llm_provider, + llm_model=llm_model, + openai_api_key=openai_api_key, + anthropic_api_key=anthropic_api_key, + ) + + pipeline = Pipeline(config) + + try: + await pipeline.initialize() + + # Fetch job from database + job_data = await pipeline._db.fetchrow( + "SELECT * FROM jobs WHERE job_id = $1", + job_id, + ) + + if not job_data: + click.echo(f"Job {job_id} not found", err=True) + sys.exit(1) + + # Build scraper output from job data + reviews_data = job_data.get("reviews_data") or {} + scraper_output = { + "job_id": job_id, + "status": job_data.get("status", "completed"), + "business_id": reviews_data.get("business_id", job_id), + "place_id": reviews_data.get("place_id", ""), + "business_info": reviews_data.get("business_info", {}), + "reviews": reviews_data.get("reviews", []), + "scrape_time_ms": 0, + "reviews_scraped": 
len(reviews_data.get("reviews", [])), + "scraper_version": "v1.0.0", + } + + # Run pipeline + result = await pipeline.process( + scraper_output, + stages=stage_list, + validate=validate, + ) + + # Output results + if result.success: + click.echo(click.style("Pipeline completed successfully!", fg="green")) + else: + click.echo(click.style("Pipeline completed with validation errors", fg="yellow")) + + # Print summary + if result.stage1: + click.echo(f" Stage 1: {result.stage1['stats']['output_count']} reviews normalized") + if result.stage2: + click.echo(f" Stage 2: {result.stage2['stats']['success_count']} reviews classified") + if result.stage3: + click.echo(f" Stage 3: {result.stage3['stats']['spans_routed']} spans routed") + if result.stage4: + click.echo(f" Stage 4: {result.stage4['stats']['facts_upserted']} facts written") + + # Validation summary + for stage, validation in result.validation.items(): + status = "PASS" if validation["passed"] else f"FAIL ({validation['error_count']} errors)" + click.echo(f" {stage} validation: {status}") + + # Write output file + if output: + with open(output, "w") as f: + json.dump(result.to_dict(), f, indent=2, default=str) + click.echo(f"Results written to {output}") + + if not result.success: + sys.exit(1) + + except Exception as e: + click.echo(f"Pipeline failed: {e}", err=True) + logger.exception("Pipeline error") + sys.exit(1) + finally: + await pipeline.close() + + asyncio.run(_run()) + + +@main.command() +@click.option( + "--job-id", + required=True, + help="Job ID to validate", +) +@click.option( + "--database-url", + envvar="DATABASE_URL", + required=True, + help="PostgreSQL connection string", +) +@click.option( + "--stage", + type=click.Choice(["1", "2", "3", "4", "all"]), + default="all", + help="Stage to validate (default: all)", +) +def validate(job_id: str, database_url: str, stage: str): + """Validate pipeline output for a job.""" + + async def _validate(): + from reviewiq_pipeline import Pipeline + + config = 
get_config(database_url=database_url) + pipeline = Pipeline(config) + + try: + await pipeline.initialize() + + results = await pipeline.validate(job_id) + + all_passed = True + for stage_name, validation in results.items(): + if stage != "all" and f"stage{stage}" != stage_name: + continue + + status = "PASS" if validation["passed"] else "FAIL" + color = "green" if validation["passed"] else "red" + click.echo(click.style(f"{stage_name}: {status}", fg=color)) + + if not validation["passed"]: + all_passed = False + for error in validation["errors"][:10]: + click.echo(f" - [{error['rule']}] {error['identifier']}: {error['message']}") + if validation["error_count"] > 10: + click.echo(f" ... and {validation['error_count'] - 10} more errors") + + if not all_passed: + sys.exit(1) + + except Exception as e: + click.echo(f"Validation failed: {e}", err=True) + sys.exit(1) + finally: + await pipeline.close() + + asyncio.run(_validate()) + + +@main.command() +@click.option( + "--database-url", + envvar="DATABASE_URL", + required=True, + help="PostgreSQL connection string", +) +def check(database_url: str): + """Check database connection.""" + + async def _check(): + from reviewiq_pipeline.db.connection import DatabasePool + + config = get_config(database_url=database_url) + db = DatabasePool(config) + + try: + await db.initialize() + if await db.check_connection(): + click.echo(click.style("Database connection OK", fg="green")) + else: + click.echo(click.style("Database connection failed", fg="red")) + sys.exit(1) + finally: + await db.close() + + asyncio.run(_check()) + + +@main.command() +def version(): + """Show version information.""" + click.echo(f"reviewiq-pipeline {__version__}") + + +if __name__ == "__main__": + main() diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py new file mode 100644 index 0000000..800c1b3 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py 
@@ -0,0 +1,177 @@ +"""Configuration management for the ReviewIQ pipeline.""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import Field, SecretStr, field_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Config(BaseSettings): + """Pipeline configuration loaded from environment variables or passed directly.""" + + model_config = SettingsConfigDict( + env_prefix="REVIEWIQ_", + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", + ) + + # Database + database_url: str = Field( + default="postgresql://localhost:5432/reviewiq", + description="PostgreSQL connection string", + ) + db_pool_min_size: int = Field(default=2, ge=1, le=50) + db_pool_max_size: int = Field(default=10, ge=1, le=100) + + # LLM Provider + llm_provider: Literal["openai", "anthropic"] = Field( + default="openai", + description="LLM provider to use for classification", + ) + openai_api_key: SecretStr | None = Field( + default=None, + description="OpenAI API key", + ) + anthropic_api_key: SecretStr | None = Field( + default=None, + description="Anthropic API key", + ) + + # Model settings + llm_model: str = Field( + default="gpt-4o-mini", + description="LLM model to use for classification", + ) + llm_temperature: float = Field(default=0.0, ge=0.0, le=2.0) + llm_max_retries: int = Field(default=3, ge=1, le=10) + llm_timeout_seconds: int = Field(default=60, ge=10, le=300) + + # Embedding settings + embedding_model: str = Field( + default="all-MiniLM-L6-v2", + description="Sentence transformer model for embeddings", + ) + embedding_dimension: int = Field( + default=384, + description="Expected embedding dimension", + ) + + # Taxonomy + taxonomy_version: str = Field( + default="v5.1", + description="URT taxonomy version", + ) + + # Classification + classification_profile: Literal["lite", "core", "standard", "full"] = Field( + default="standard", + description="Classification profile to use", + ) + max_spans_per_review: int 
= Field(default=10, ge=1, le=20) + + # Processing + batch_size: int = Field(default=50, ge=1, le=500) + trust_score_floor: float = Field(default=0.2, ge=0.0, le=1.0) + + # Migrations + migrations_path: str = Field( + default="", + description="Path to migrations directory (empty for default)", + ) + + @field_validator("llm_provider") + @classmethod + def validate_provider_api_key(cls, v: str) -> str: + """Validate that provider is supported.""" + if v not in ("openai", "anthropic"): + raise ValueError(f"Unsupported LLM provider: {v}") + return v + + def get_llm_api_key(self) -> str: + """Get the API key for the configured LLM provider.""" + if self.llm_provider == "openai": + if self.openai_api_key is None: + raise ValueError("OpenAI API key is required when llm_provider is 'openai'") + return self.openai_api_key.get_secret_value() + elif self.llm_provider == "anthropic": + if self.anthropic_api_key is None: + raise ValueError("Anthropic API key is required when llm_provider is 'anthropic'") + return self.anthropic_api_key.get_secret_value() + else: + raise ValueError(f"Unsupported LLM provider: {self.llm_provider}") + + @property + def effective_migrations_path(self) -> str: + """Get the effective migrations path.""" + if self.migrations_path: + return self.migrations_path + # Default to package's migrations directory + import importlib.resources + + try: + # Python 3.11+ + return str(importlib.resources.files("reviewiq_pipeline.db") / "migrations") + except (AttributeError, TypeError): + # Fallback for older Python + import os + + return os.path.join(os.path.dirname(__file__), "db", "migrations") + + +class ClassificationConfig: + """Configuration specifically for the LLM classification stage.""" + + def __init__(self, config: Config): + self.model = config.llm_model + self.taxonomy_version = config.taxonomy_version + self.profile = config.classification_profile + self.max_spans_per_review = config.max_spans_per_review + self.temperature = config.llm_temperature 
+ self.max_retries = config.llm_max_retries + self.timeout_seconds = config.llm_timeout_seconds + + def to_dict(self) -> dict: + """Convert to dictionary for contract compatibility.""" + return { + "model": self.model, + "taxonomy_version": self.taxonomy_version, + "profile": self.profile, + "max_spans_per_review": self.max_spans_per_review, + } + + +class EmbeddingConfig: + """Configuration for the embedding service.""" + + def __init__(self, config: Config): + self.model = config.embedding_model + self.dimension = config.embedding_dimension + + def to_dict(self) -> dict: + """Convert to dictionary.""" + return { + "model": self.model, + "dimension": self.dimension, + } + + +class DatabaseConfig: + """Configuration for database connections.""" + + def __init__(self, config: Config): + self.url = config.database_url + self.pool_min_size = config.db_pool_min_size + self.pool_max_size = config.db_pool_max_size + self.migrations_path = config.effective_migrations_path + + def to_dict(self) -> dict: + """Convert to dictionary.""" + return { + "url": self.url, + "pool_min_size": self.pool_min_size, + "pool_max_size": self.pool_max_size, + "migrations_path": self.migrations_path, + } diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py new file mode 100644 index 0000000..4a04396 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py @@ -0,0 +1,648 @@ +""" +TypedDict definitions for pipeline stage inputs and outputs. + +These contracts define the data structures passed between pipeline stages, +enabling independent development and validation of each stage. 
+""" + +from __future__ import annotations + +from typing import Any, Literal, TypedDict + + +# ============================================================================= +# Common Types +# ============================================================================= + +ValenceType = Literal["V+", "V-", "V0", "V±"] +IntensityType = Literal["I1", "I2", "I3"] +SpecificityType = Literal["S1", "S2", "S3"] +ActionabilityType = Literal["A1", "A2", "A3"] +TemporalType = Literal["TC", "TR", "TH", "TF"] +EvidenceType = Literal["ES", "EI", "EC"] +ComparativeType = Literal["CR-N", "CR-B", "CR-W", "CR-S"] +ConfidenceType = Literal["high", "medium", "low"] +EntityTypeValue = Literal["location", "staff", "product", "process", "time", "other"] +RelationType = Literal["cause_of", "effect_of", "contrast", "resolution"] +ProfileType = Literal["lite", "core", "standard", "full"] +BucketType = Literal["day", "week", "month"] +SubjectType = Literal["overall", "urt_code", "domain", "issue"] +IssueState = Literal["open", "resolved", "ignored", "merged"] + + +# ============================================================================= +# Validation Types +# ============================================================================= + + +class ValidationError(TypedDict): + """A single validation error.""" + + rule: str + identifier: str + message: str + + +class ValidationResult(TypedDict): + """Result of validating a stage output.""" + + stage: str + passed: bool + error_count: int + errors: list[ValidationError] + + +# ============================================================================= +# Stage 0: Raw Ingestion (from Scraper) +# ============================================================================= + + +class BusinessInfo(TypedDict): + """Business metadata from scraper.""" + + name: str + address: str + category: str + total_reviews: int + average_rating: float + + +class RawReview(TypedDict, total=False): + """Raw review as scraped from Google Maps.""" + + 
review_id: str + author_name: str + author_id: str | None + rating: int + text: str | None + review_time: str + response_text: str | None + response_time: str | None + photos: list[str] | None + raw_payload: dict[str, Any] + + +class ScraperOutput(TypedDict): + """Output from the scraper (Stage 0), input to pipeline.""" + + job_id: str + status: Literal["completed", "failed", "partial"] + business_id: str + place_id: str + business_info: BusinessInfo + reviews: list[RawReview] + scrape_time_ms: int + reviews_scraped: int + scraper_version: str + + +# ============================================================================= +# Stage 1: Normalization +# ============================================================================= + + +class Stage1Input(TypedDict): + """Input to Stage 1 normalization.""" + + job_id: str + business_id: str + place_id: str + reviews: list[RawReview] + + +class NormalizedReview(TypedDict, total=False): + """A normalized review ready for classification.""" + + # Identity (composite key) + source: Literal["google"] + review_id: str + review_version: int + + # Tenant context + business_id: str + place_id: str + + # Content + text: str + text_normalized: str + text_language: str + text_length: int + word_count: int + + # Metadata + rating: int + review_time: str + author_name: str + author_id: str | None + + # Dedup + content_hash: str + dedup_group_id: str | None + + # Reference + raw_id: int + + +class Stage1Stats(TypedDict): + """Statistics from Stage 1 processing.""" + + input_count: int + output_count: int + skipped_empty: int + skipped_duplicate: int + + +class Stage1Output(TypedDict): + """Output from Stage 1 normalization.""" + + job_id: str + business_id: str + place_id: str + reviews_normalized: list[NormalizedReview] + stats: Stage1Stats + + +# ============================================================================= +# Stage 2: LLM Classification +# 
=============================================================================
+
+
+class ReviewToClassify(TypedDict):
+    """A review to be classified by the LLM."""
+
+    source: str
+    review_id: str
+    review_version: int
+    business_id: str
+    place_id: str
+    text: str
+    text_normalized: str
+    rating: int
+    review_time: str
+
+
+class ClassificationConfig(TypedDict):
+    """Configuration for LLM classification."""
+
+    model: str
+    taxonomy_version: str
+    profile: ProfileType
+    max_spans_per_review: int
+
+
+class Stage2Input(TypedDict):
+    """Input to Stage 2 classification."""
+
+    reviews: list[ReviewToClassify]
+    config: ClassificationConfig
+
+
+class CausalLink(TypedDict):
+    """A link in a causal chain."""
+
+    code: str
+    role: Literal["cause", "effect", "context", "outcome"]
+    order: int
+
+
+class ExtractedSpan(TypedDict, total=False):
+    """A span extracted from a review with URT classification."""
+
+    # Identity
+    span_id: str
+    span_index: int
+
+    # Position (offsets into original text)
+    span_text: str
+    span_start: int
+    span_end: int
+
+    # Classification
+    profile: ProfileType
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    comparative: ComparativeType
+
+    # Extended (standard/full profile)
+    specificity: SpecificityType
+    actionability: ActionabilityType
+    temporal: TemporalType
+    evidence: EvidenceType
+
+    # Entity
+    entity: str | None
+    entity_type: EntityTypeValue | None
+    entity_normalized: str | None
+
+    # Causal (full profile)
+    relation_type: RelationType | None
+    related_span_index: int | None
+    causal_chain: list[CausalLink] | None
+
+    # Metadata
+    confidence: ConfidenceType
+    usn: str
+
+    # Flags
+    is_primary: bool
+
+
+class ClassifiedReview(TypedDict, total=False):
+    """A review with LLM classification results."""
+
+    # Identity
+    source: str
+    review_id: str
+    review_version: int
+
+    # Review-level classification (from primary span)
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    comparative: ComparativeType
+
+    # Extracted entities
+    staff_mentions: list[str]
+    quotes: dict[str, str]
+
+    # Trust score
+    trust_score: float
+
+    # Embedding
+    embedding: list[float]
+
+    # Spans
+    spans: list[ExtractedSpan]
+
+    # Processing metadata
+    classification_confidence: dict[str, float]
+    processing_time_ms: int
+
+
+class Stage2Stats(TypedDict):
+    """Statistics from Stage 2 processing."""
+
+    input_count: int
+    success_count: int
+    error_count: int
+    total_spans: int
+    avg_spans_per_review: float
+    llm_tokens_used: int
+    llm_cost_usd: float
+
+
+class Stage2Output(TypedDict):
+    """Output from Stage 2 classification."""
+
+    batch_id: str
+    taxonomy_version: str
+    model_version: str
+    prompt_version: str
+    reviews_classified: list[ClassifiedReview]
+    stats: Stage2Stats
+
+
+# =============================================================================
+# Stage 3: Issue Routing
+# =============================================================================
+
+
+class SpanToRoute(TypedDict):
+    """A span to be routed to an issue (same URT literal types as ReviewSpan)."""
+
+    span_id: str
+    business_id: str
+    place_id: str
+    urt_primary: str
+    valence: ValenceType
+    intensity: IntensityType
+    entity_normalized: str | None
+    review_time: str
+    confidence: ConfidenceType
+    trust_score: float
+
+
+class Stage3Input(TypedDict):
+    """Input to Stage 3 issue routing."""
+
+    spans: list[SpanToRoute]
+
+
+class RoutedSpan(TypedDict):
+    """A span that has been routed to an issue."""
+
+    span_id: str
+    issue_id: str
+    routing_key: str
+    is_new_issue: bool
+
+
+class Stage3Stats(TypedDict):
+    """Statistics from Stage 3 processing."""
+
+    spans_processed: int
+    spans_routed: int
+    spans_skipped: int
+    issues_created: int
+    issues_updated: int
+
+
+class Stage3Output(TypedDict):
+    """Output from Stage 3 issue routing."""
+
+    routed_spans: list[RoutedSpan]
+    issues_created: list[str]
+    issues_updated: list[str]
+    stats: Stage3Stats
+
+
+# =============================================================================
+# Stage 4: Fact Aggregation
+# =============================================================================
+
+
+class Stage4Input(TypedDict):
+    """Input to Stage 4 fact aggregation."""
+
+    business_id: str
+    date: str
+    bucket_types: list[BucketType]
+    taxonomy_version: str
+
+
+class FactRecord(TypedDict, total=False):
+    """An aggregated fact record for time series data."""
+
+    # Keys
+    business_id: str
+    place_id: str
+    period_date: str
+    bucket_type: BucketType
+    subject_type: SubjectType
+    subject_id: str
+    taxonomy_version: str
+
+    # Metrics
+    review_count: int
+    span_count: int
+    negative_count: int
+    positive_count: int
+    neutral_count: int
+    mixed_count: int
+    strength_score: float
+    negative_strength: float
+    positive_strength: float
+    avg_rating: float | None
+    i1_count: int
+    i2_count: int
+    i3_count: int
+    cr_better: int
+    cr_worse: int
+    cr_same: int
+    trust_weighted_strength: float
+    trust_weighted_negative: float
+
+
+class Stage4Stats(TypedDict):
+    """Statistics from Stage 4 processing."""
+
+    business_id: str
+    date: str
+    locations_processed: int
+    codes_aggregated: int
+    facts_upserted: int
+
+
+class Stage4Output(TypedDict):
+    """Output from Stage 4 fact aggregation."""
+
+    facts_written: list[FactRecord]
+    stats: Stage4Stats
+
+
+# =============================================================================
+# Database Entity Types
+# =============================================================================
+
+
+class ReviewRaw(TypedDict, total=False):
+    """A raw review record in the database."""
+
+    id: int
+    source: str
+    review_id: str
+    place_id: str
+    raw_payload: dict[str, Any]
+    review_text: str | None
+    rating: int
+    review_time: str
+    reviewer_name: str
+    reviewer_id: str | None
+    review_version: int
+    pulled_at: str
+    created_at: str
+
+
+class ReviewEnriched(TypedDict, total=False):
+    """An enriched review record in the database."""
+
+    id: int
+    source: str
+    review_id: str
+    review_version: int
+    is_latest: bool
+    raw_id: int
+    business_id: str
+    place_id: str
+    text: str
+    text_normalized: str
+    rating: int
+    review_time: str
+    language: str
+    taxonomy_version: str
+    urt_primary: str | None
+    urt_secondary: list[str] | None
+    valence: ValenceType | None
+    intensity: IntensityType | None
+    comparative: ComparativeType | None
+    staff_mentions: list[str] | None
+    quotes: dict[str, str] | None
+    embedding: list[float] | None
+    trust_score: float | None
+    classification_model: str | None
+    classification_confidence: dict[str, float] | None
+    processed_at: str | None
+    created_at: str
+
+
+class ReviewSpan(TypedDict, total=False):
+    """A span record in the database."""
+
+    id: int
+    span_id: str
+    business_id: str
+    place_id: str
+    source: str
+    review_id: str
+    review_version: int
+    span_index: int
+    span_text: str
+    span_start: int
+    span_end: int
+    profile: ProfileType
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    comparative: ComparativeType
+    specificity: SpecificityType | None
+    actionability: ActionabilityType | None
+    temporal: TemporalType | None
+    evidence: EvidenceType | None
+    entity: str | None
+    entity_type: EntityTypeValue | None
+    entity_normalized: str | None
+    relation_type: RelationType | None
+    related_span_id: str | None
+    causal_chain: list[CausalLink] | None
+    is_primary: bool
+    is_active: bool
+    review_time: str
+    confidence: ConfidenceType
+    usn: str
+    taxonomy_version: str
+    model_version: str
+    ingest_batch_id: str
+    created_at: str
+
+
+class Issue(TypedDict, total=False):
+    """An issue record in the database."""
+
+    id: int
+    issue_id: str
+    business_id: str
+    place_id: str
+    primary_subcode: str
+    domain: str
+    state: IssueState
+    priority_score: float
+    confidence_score: float
+    span_count: int
+    max_intensity: IntensityType
+    entity: str | None
+    entity_normalized: str | None
+    taxonomy_version: str
+    created_at: str
+    updated_at: str
+
+
+class IssueSpan(TypedDict):
+    """A link between an issue and a span."""
+
+    id: int
+    issue_id: str
+    span_id: str
+    source: str
+    review_id: str
+    review_version: int
+    is_primary_match: bool
+    intensity: IntensityType
+    review_time: str
+    created_at: str
+
+
+class IssueEvent(TypedDict, total=False):
+    """An event in the issue audit log."""
+
+    id: int
+    issue_id: str
+    event_type: str
+    span_id: str | None
+    old_value: str | None
+    new_value: str | None
+    metadata: dict[str, Any] | None
+    created_at: str
+
+
+class FactTimeseries(TypedDict, total=False):
+    """A fact time series record in the database."""
+
+    id: int
+    business_id: str
+    place_id: str
+    period_date: str
+    bucket_type: BucketType
+    subject_type: SubjectType
+    subject_id: str
+    taxonomy_version: str
+    review_count: int
+    span_count: int
+    negative_count: int
+    positive_count: int
+    neutral_count: int
+    mixed_count: int
+    strength_score: float
+    negative_strength: float
+    positive_strength: float
+    avg_rating: float | None
+    i1_count: int
+    i2_count: int
+    i3_count: int
+    cr_better: int
+    cr_worse: int
+    cr_same: int
+    trust_weighted_strength: float
+    trust_weighted_negative: float
+    computed_at: str
+    created_at: str
+
+
+# =============================================================================
+# LLM Response Types
+# =============================================================================
+
+
+class LLMSpanResponse(TypedDict, total=False):
+    """A span in the LLM response."""
+
+    span_index: int
+    span_text: str
+    span_start: int
+    span_end: int
+    urt_primary: str
+    urt_secondary: list[str]
+    valence: ValenceType
+    intensity: IntensityType
+    specificity: SpecificityType
+    actionability: ActionabilityType
+    temporal: TemporalType
+    evidence: EvidenceType
+    comparative: ComparativeType
+    is_primary: bool
+    confidence: ConfidenceType
+    entity: str | None
+    entity_type: EntityTypeValue | None
+    relation_type: RelationType | None
+    related_span_index: int | None
+    usn: str
+
+
+class LLMReviewSummary(TypedDict):
+    """Review summary in the LLM response."""
+
+    dominant_valence: ValenceType
+    dominant_domain: str
+    span_count: int
+    has_comparative: bool
+    has_entity: bool
+
+
+class LLMClassificationResponse(TypedDict):
+    """The full LLM classification response."""
+
+    spans: list[LLMSpanResponse]
+    review_summary: LLMReviewSummary
diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/__init__.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/__init__.py
new file mode 100644
index 0000000..56b3ba3
--- /dev/null
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/__init__.py
@@ -0,0 +1,17 @@
+"""Database layer for pipeline operations."""
+
+from reviewiq_pipeline.db.connection import DatabasePool
+from reviewiq_pipeline.db.repositories import (
+    FactRepository,
+    IssueRepository,
+    ReviewRepository,
+    SpanRepository,
+)
+
+__all__ = [
+    "DatabasePool",
+    "ReviewRepository",
+    "SpanRepository",
+    "IssueRepository",
+    "FactRepository",
+]
diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/connection.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/connection.py
new file mode 100644
index 0000000..7348071
--- /dev/null
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/connection.py
@@ -0,0 +1,157 @@
+"""Database connection management using asyncpg."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, AsyncGenerator
+
+import asyncpg
+
+if TYPE_CHECKING:
+    from reviewiq_pipeline.config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class DatabasePool:
+    """Manages an asyncpg connection pool (created lazily by initialize())."""
+
+    def __init__(self, config: Config):
+        self.config = config
+        self._pool: asyncpg.Pool | None = None
+        self._lock = asyncio.Lock()
+
+    async def initialize(self) -> None:
+        """Create the connection pool; a no-op if it already exists."""
+        async with self._lock:
+            if self._pool is not None:
+                return
+
+            logger.info("Creating database connection pool...")
+            self._pool = await asyncpg.create_pool(
+                self.config.database_url,
+                min_size=self.config.db_pool_min_size,
+                max_size=self.config.db_pool_max_size,
+                command_timeout=60,
+            )
+            logger.info("Database pool created successfully")
+
+    async def close(self) -> None:
+        """Close the connection pool; a no-op if it was never created."""
+        async with self._lock:
+            if self._pool is not None:
+                await self._pool.close()
+                self._pool = None
+                logger.info("Database pool closed")
+
+    @property
+    def pool(self) -> asyncpg.Pool:
+        """Get the connection pool, raising RuntimeError if not initialized."""
+        if self._pool is None:
+            raise RuntimeError("Database pool not initialized. Call initialize() first.")
+        return self._pool
+
+    @asynccontextmanager
+    async def acquire(self) -> AsyncGenerator[asyncpg.Connection, None]:
+        """Acquire a connection from the pool."""
+        async with self.pool.acquire() as conn:
+            yield conn
+
+    @asynccontextmanager
+    async def transaction(self) -> AsyncGenerator[asyncpg.Connection, None]:
+        """Acquire a connection and yield it inside conn.transaction()."""
+        async with self.pool.acquire() as conn:
+            async with conn.transaction():
+                yield conn
+
+    async def execute(self, query: str, *args: Any) -> str:
+        """Execute a query and return the status string."""
+        async with self.acquire() as conn:
+            return await conn.execute(query, *args)
+
+    async def executemany(self, query: str, args: list[tuple]) -> None:
+        """Execute a query with multiple argument sets."""
+        async with self.acquire() as conn:
+            await conn.executemany(query, args)
+
+    async def fetch(self, query: str, *args: Any) -> list[asyncpg.Record]:
+        """Fetch multiple rows."""
+        async with self.acquire() as conn:
+            return await conn.fetch(query, *args)
+
+    async def fetchrow(self, query: str, *args: Any) -> asyncpg.Record | None:
+        """Fetch a single row."""
+        async with self.acquire() as conn:
+            return await conn.fetchrow(query, *args)
+
+    async def fetchval(self, query: str, *args: Any) -> Any:
+        """Fetch a single value."""
+        async with self.acquire() as conn:
+            return await conn.fetchval(query, *args)
+
+    async def run_migrations(self, migrations_path: str | None = None) -> int:
+        """Run all pending *.sql migrations (sorted by filename) in one transaction.
+
+        Args:
+            migrations_path: Path to migrations directory. Uses config default if None.
+
+        Returns:
+            Number of migrations run (0 if the directory does not exist).
+        """
+        path = Path(migrations_path or self.config.effective_migrations_path)
+        if not path.exists():
+            logger.warning(f"Migrations path does not exist: {path}")
+            return 0
+
+        async with self.transaction() as conn:
+            # Create migrations tracking table
+            await conn.execute("""
+                CREATE TABLE IF NOT EXISTS _migrations (
+                    id SERIAL PRIMARY KEY,
+                    filename VARCHAR(255) UNIQUE NOT NULL,
+                    applied_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
+                )
+            """)
+
+            # Get already applied migrations
+            applied = await conn.fetch("SELECT filename FROM _migrations")
+            applied_set = {r["filename"] for r in applied}
+
+            # Find and run pending migrations
+            migration_files = sorted(path.glob("*.sql"))
+            migrations_run = 0
+
+            for migration_file in migration_files:
+                filename = migration_file.name
+                if filename in applied_set:
+                    continue
+
+                logger.info(f"Running migration: {filename}")
+                sql = migration_file.read_text(encoding="utf-8")  # SQL has non-ASCII ('V±')
+
+                try:
+                    await conn.execute(sql)
+                    await conn.execute(
+                        "INSERT INTO _migrations (filename) VALUES ($1)",
+                        filename,
+                    )
+                    migrations_run += 1
+                    logger.info(f"Migration {filename} applied successfully")
+                except Exception as e:
+                    logger.exception(f"Migration {filename} failed: {e}")  # keep traceback
+                    raise
+
+            logger.info(f"Ran {migrations_run} migrations")
+            return migrations_run
+
+    async def check_connection(self) -> bool:
+        """Return True if "SELECT 1" succeeds; log and return False on any error."""
+        try:
+            result = await self.fetchval("SELECT 1")
+            return result == 1
+        except Exception as e:
+            logger.error(f"Database connection check failed: {e}")
+            return False
diff --git 
a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/001_create_reviews_tables.sql b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/001_create_reviews_tables.sql new file mode 100644 index 0000000..d08a696 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/001_create_reviews_tables.sql @@ -0,0 +1,80 @@ +-- Migration: 001_create_reviews_tables.sql +-- Purpose: Create the core reviews tables for Stage 1 normalization + +-- Raw reviews table (immutable audit log) +CREATE TABLE IF NOT EXISTS reviews_raw ( + id BIGSERIAL PRIMARY KEY, + source VARCHAR(20) NOT NULL DEFAULT 'google', + review_id VARCHAR(255) NOT NULL, + place_id VARCHAR(255) NOT NULL, + raw_payload JSONB NOT NULL DEFAULT '{}', + review_text TEXT, + rating SMALLINT NOT NULL CHECK (rating BETWEEN 1 AND 5), + review_time TIMESTAMP WITH TIME ZONE NOT NULL, + reviewer_name VARCHAR(255) NOT NULL, + reviewer_id VARCHAR(255), + review_version INTEGER NOT NULL DEFAULT 1, + pulled_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + + CONSTRAINT reviews_raw_unique UNIQUE (source, review_id, review_version) +); + +-- Indexes for reviews_raw +CREATE INDEX IF NOT EXISTS idx_reviews_raw_place_id ON reviews_raw(place_id); +CREATE INDEX IF NOT EXISTS idx_reviews_raw_review_time ON reviews_raw(review_time); +CREATE INDEX IF NOT EXISTS idx_reviews_raw_pulled_at ON reviews_raw(pulled_at); + +-- Enriched reviews table (mutable, updated by classification) +CREATE TABLE IF NOT EXISTS reviews_enriched ( + id BIGSERIAL PRIMARY KEY, + source VARCHAR(20) NOT NULL DEFAULT 'google', + review_id VARCHAR(255) NOT NULL, + review_version INTEGER NOT NULL DEFAULT 1, + is_latest BOOLEAN NOT NULL DEFAULT TRUE, + raw_id BIGINT REFERENCES reviews_raw(id), + + -- Tenant context + business_id VARCHAR(255) NOT NULL, + place_id VARCHAR(255) NOT NULL, + + -- Content + text TEXT NOT NULL, + text_normalized TEXT NOT NULL, + 
rating SMALLINT NOT NULL CHECK (rating BETWEEN 1 AND 5), + review_time TIMESTAMP WITH TIME ZONE NOT NULL, + + -- Normalization fields + language VARCHAR(10) NOT NULL DEFAULT 'en', + taxonomy_version VARCHAR(20) NOT NULL DEFAULT 'v5.1', + + -- Classification fields (NULL until Stage 2) + urt_primary VARCHAR(10), + urt_secondary VARCHAR(10)[] DEFAULT '{}', + valence VARCHAR(5), + intensity VARCHAR(5), + comparative VARCHAR(10), + staff_mentions VARCHAR(255)[] DEFAULT '{}', + quotes JSONB DEFAULT '{}', + embedding REAL[] DEFAULT '{}', + trust_score REAL, + classification_model VARCHAR(100), + classification_confidence JSONB DEFAULT '{}', + processed_at TIMESTAMP WITH TIME ZONE, + + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + + CONSTRAINT reviews_enriched_unique UNIQUE (source, review_id, review_version) +); + +-- Indexes for reviews_enriched +CREATE INDEX IF NOT EXISTS idx_reviews_enriched_business_id ON reviews_enriched(business_id); +CREATE INDEX IF NOT EXISTS idx_reviews_enriched_place_id ON reviews_enriched(place_id); +CREATE INDEX IF NOT EXISTS idx_reviews_enriched_review_time ON reviews_enriched(review_time); +CREATE INDEX IF NOT EXISTS idx_reviews_enriched_urt_primary ON reviews_enriched(urt_primary) WHERE urt_primary IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_reviews_enriched_unclassified ON reviews_enriched(review_time DESC) WHERE urt_primary IS NULL AND is_latest = TRUE; +CREATE INDEX IF NOT EXISTS idx_reviews_enriched_valence ON reviews_enriched(valence) WHERE valence IS NOT NULL; + +-- Comment on tables +COMMENT ON TABLE reviews_raw IS 'Immutable raw review data as scraped from source'; +COMMENT ON TABLE reviews_enriched IS 'Enriched reviews with normalization and classification'; diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/002_create_spans_table.sql b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/002_create_spans_table.sql new file mode 100644 index 0000000..d160d1b --- /dev/null +++ 
b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/002_create_spans_table.sql @@ -0,0 +1,84 @@ +-- Migration: 002_create_spans_table.sql +-- Purpose: Create the review_spans table for Stage 2 classification output + +CREATE TABLE IF NOT EXISTS review_spans ( + id BIGSERIAL PRIMARY KEY, + span_id VARCHAR(50) NOT NULL UNIQUE, + + -- Context + business_id VARCHAR(255) NOT NULL, + place_id VARCHAR(255) NOT NULL, + source VARCHAR(20) NOT NULL DEFAULT 'google', + review_id VARCHAR(255) NOT NULL, + review_version INTEGER NOT NULL DEFAULT 1, + + -- Position + span_index INTEGER NOT NULL CHECK (span_index >= 0), + span_text TEXT NOT NULL, + span_start INTEGER NOT NULL CHECK (span_start >= 0), + span_end INTEGER NOT NULL CHECK (span_end > span_start), + + -- Classification profile + profile VARCHAR(20) NOT NULL DEFAULT 'standard', + + -- Core URT classification + urt_primary VARCHAR(10) NOT NULL, + urt_secondary VARCHAR(10)[] DEFAULT '{}', + valence VARCHAR(5) NOT NULL, + intensity VARCHAR(5) NOT NULL, + comparative VARCHAR(10) NOT NULL DEFAULT 'CR-N', + + -- Extended classification (standard/full profile) + specificity VARCHAR(5), + actionability VARCHAR(5), + temporal VARCHAR(5), + evidence VARCHAR(5), + + -- Entity extraction + entity VARCHAR(255), + entity_type VARCHAR(20), + entity_normalized VARCHAR(255), + + -- Causal relations (full profile) + relation_type VARCHAR(20), + related_span_id VARCHAR(50), + causal_chain JSONB, + + -- Flags + is_primary BOOLEAN NOT NULL DEFAULT FALSE, + is_active BOOLEAN NOT NULL DEFAULT TRUE, + + -- Time reference + review_time TIMESTAMP WITH TIME ZONE NOT NULL, + + -- Metadata + confidence VARCHAR(10) NOT NULL DEFAULT 'medium', + usn VARCHAR(100) NOT NULL, + taxonomy_version VARCHAR(20) NOT NULL, + model_version VARCHAR(100) NOT NULL, + ingest_batch_id VARCHAR(50) NOT NULL, + + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + + -- Foreign key to review + CONSTRAINT fk_review FOREIGN KEY (source, review_id, 
review_version) + REFERENCES reviews_enriched(source, review_id, review_version) +); + +-- Indexes for review_spans +CREATE INDEX IF NOT EXISTS idx_spans_business_id ON review_spans(business_id); +CREATE INDEX IF NOT EXISTS idx_spans_place_id ON review_spans(place_id); +CREATE INDEX IF NOT EXISTS idx_spans_review_time ON review_spans(review_time); +CREATE INDEX IF NOT EXISTS idx_spans_urt_primary ON review_spans(urt_primary); +CREATE INDEX IF NOT EXISTS idx_spans_valence ON review_spans(valence); +CREATE INDEX IF NOT EXISTS idx_spans_intensity ON review_spans(intensity); +CREATE INDEX IF NOT EXISTS idx_spans_is_active ON review_spans(is_active) WHERE is_active = TRUE; +CREATE INDEX IF NOT EXISTS idx_spans_is_primary ON review_spans(is_primary) WHERE is_primary = TRUE; +CREATE INDEX IF NOT EXISTS idx_spans_entity_normalized ON review_spans(entity_normalized) WHERE entity_normalized IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_spans_batch ON review_spans(ingest_batch_id); + +-- Index for unrouted negative spans (Stage 3 query) +CREATE INDEX IF NOT EXISTS idx_spans_unrouted_negative ON review_spans(review_time DESC) + WHERE is_active = TRUE AND valence IN ('V-', 'V±'); + +COMMENT ON TABLE review_spans IS 'Extracted semantic spans with URT classification from reviews'; diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/003_create_urt_enums.sql b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/003_create_urt_enums.sql new file mode 100644 index 0000000..03906b2 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/003_create_urt_enums.sql @@ -0,0 +1,111 @@ +-- Migration: 003_create_urt_enums.sql +-- Purpose: Create enum types and lookup tables for URT taxonomy + +-- Valence enum +DO $$ BEGIN + CREATE TYPE valence_type AS ENUM ('V+', 'V-', 'V0', 'V±'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Intensity enum +DO $$ BEGIN + CREATE TYPE intensity_type AS ENUM ('I1', 'I2', 'I3'); 
+EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Specificity enum +DO $$ BEGIN + CREATE TYPE specificity_type AS ENUM ('S1', 'S2', 'S3'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Actionability enum +DO $$ BEGIN + CREATE TYPE actionability_type AS ENUM ('A1', 'A2', 'A3'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Temporal enum +DO $$ BEGIN + CREATE TYPE temporal_type AS ENUM ('TC', 'TR', 'TH', 'TF'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Evidence enum +DO $$ BEGIN + CREATE TYPE evidence_type AS ENUM ('ES', 'EI', 'EC'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Comparative enum +DO $$ BEGIN + CREATE TYPE comparative_type AS ENUM ('CR-N', 'CR-B', 'CR-W', 'CR-S'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- URT Domain lookup table +CREATE TABLE IF NOT EXISTS urt_domains ( + code CHAR(1) PRIMARY KEY, + name VARCHAR(50) NOT NULL, + description TEXT +); + +INSERT INTO urt_domains (code, name, description) VALUES + ('O', 'Offering', 'Product/service quality, features, variety'), + ('P', 'Price', 'Value, pricing, promotions, payment'), + ('J', 'Journey', 'Timing, process, convenience, accessibility'), + ('E', 'Environment', 'Physical space, ambiance, cleanliness, digital UX'), + ('A', 'Attitude', 'Staff behavior, helpfulness, professionalism'), + ('V', 'Voice', 'Brand, communication, marketing, transparency'), + ('R', 'Relationship', 'Loyalty, trust, consistency, personalization') +ON CONFLICT (code) DO NOTHING; + +-- URT Tier-2 categories lookup table +CREATE TABLE IF NOT EXISTS urt_categories ( + code VARCHAR(5) PRIMARY KEY, + domain_code CHAR(1) NOT NULL REFERENCES urt_domains(code), + name VARCHAR(100) NOT NULL, + description TEXT +); + +-- Insert standard Tier-2 categories +INSERT INTO urt_categories (code, domain_code, name) VALUES + ('O1', 'O', 'Core Product/Service'), + ('O2', 'O', 'Product Features'), + ('O3', 'O', 'Variety & Selection'), + ('O4', 'O', 
'Customization'), + ('P1', 'P', 'Value Perception'), + ('P2', 'P', 'Pricing Structure'), + ('P3', 'P', 'Promotions & Deals'), + ('P4', 'P', 'Payment Process'), + ('J1', 'J', 'Wait Times'), + ('J2', 'J', 'Booking & Reservations'), + ('J3', 'J', 'Navigation & Convenience'), + ('J4', 'J', 'Accessibility'), + ('E1', 'E', 'Physical Environment'), + ('E2', 'E', 'Ambiance & Atmosphere'), + ('E3', 'E', 'Cleanliness'), + ('E4', 'E', 'Digital Experience'), + ('A1', 'A', 'Friendliness'), + ('A2', 'A', 'Helpfulness'), + ('A3', 'A', 'Professionalism'), + ('A4', 'A', 'Knowledge & Expertise'), + ('V1', 'V', 'Brand Identity'), + ('V2', 'V', 'Communication'), + ('V3', 'V', 'Marketing'), + ('V4', 'V', 'Transparency'), + ('R1', 'R', 'Loyalty'), + ('R2', 'R', 'Trust'), + ('R3', 'R', 'Consistency'), + ('R4', 'R', 'Personalization') +ON CONFLICT (code) DO NOTHING; + +COMMENT ON TABLE urt_domains IS 'URT v5.1 top-level domains'; +COMMENT ON TABLE urt_categories IS 'URT v5.1 Tier-2 categories'; diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/004_create_issues_tables.sql b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/004_create_issues_tables.sql new file mode 100644 index 0000000..a1944f8 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/004_create_issues_tables.sql @@ -0,0 +1,96 @@ +-- Migration: 004_create_issues_tables.sql +-- Purpose: Create tables for Stage 3 issue routing + +-- Issue state enum +DO $$ BEGIN + CREATE TYPE issue_state AS ENUM ('open', 'resolved', 'ignored', 'merged'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Issues table +CREATE TABLE IF NOT EXISTS issues ( + id BIGSERIAL PRIMARY KEY, + issue_id VARCHAR(50) NOT NULL UNIQUE, + + -- Context + business_id VARCHAR(255) NOT NULL, + place_id VARCHAR(255) NOT NULL, + + -- Classification + primary_subcode VARCHAR(10) NOT NULL, + domain CHAR(1) NOT NULL, + + -- State + state issue_state NOT NULL DEFAULT 'open', + priority_score 
REAL NOT NULL DEFAULT 1.0, + confidence_score REAL NOT NULL DEFAULT 1.0, + + -- Aggregates + span_count INTEGER NOT NULL DEFAULT 1, + max_intensity VARCHAR(5) NOT NULL DEFAULT 'I1', + + -- Entity (optional - for entity-specific issues) + entity VARCHAR(255), + entity_normalized VARCHAR(255), + + -- Metadata + taxonomy_version VARCHAR(20) NOT NULL, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() +); + +-- Issue-span links (1:1 - each span routes to exactly one issue) +CREATE TABLE IF NOT EXISTS issue_spans ( + id BIGSERIAL PRIMARY KEY, + issue_id VARCHAR(50) NOT NULL REFERENCES issues(issue_id), + span_id VARCHAR(50) NOT NULL UNIQUE, + + -- Review reference + source VARCHAR(20) NOT NULL DEFAULT 'google', + review_id VARCHAR(255) NOT NULL, + review_version INTEGER NOT NULL DEFAULT 1, + + -- Match info + is_primary_match BOOLEAN NOT NULL DEFAULT TRUE, + intensity VARCHAR(5) NOT NULL, + review_time TIMESTAMP WITH TIME ZONE NOT NULL, + + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() +); + +-- Issue events (audit log) +CREATE TABLE IF NOT EXISTS issue_events ( + id BIGSERIAL PRIMARY KEY, + issue_id VARCHAR(50) NOT NULL REFERENCES issues(issue_id), + event_type VARCHAR(50) NOT NULL, + span_id VARCHAR(50), + old_value TEXT, + new_value TEXT, + metadata JSONB, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() +); + +-- Indexes for issues +CREATE INDEX IF NOT EXISTS idx_issues_business_id ON issues(business_id); +CREATE INDEX IF NOT EXISTS idx_issues_place_id ON issues(place_id); +CREATE INDEX IF NOT EXISTS idx_issues_state ON issues(state); +CREATE INDEX IF NOT EXISTS idx_issues_primary_subcode ON issues(primary_subcode); +CREATE INDEX IF NOT EXISTS idx_issues_domain ON issues(domain); +CREATE INDEX IF NOT EXISTS idx_issues_entity_normalized ON issues(entity_normalized) WHERE entity_normalized IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_issues_priority ON 
issues(priority_score DESC) WHERE state = 'open'; +CREATE INDEX IF NOT EXISTS idx_issues_created ON issues(created_at); +CREATE INDEX IF NOT EXISTS idx_issues_updated ON issues(updated_at); + +-- Indexes for issue_spans +CREATE INDEX IF NOT EXISTS idx_issue_spans_issue_id ON issue_spans(issue_id); +CREATE INDEX IF NOT EXISTS idx_issue_spans_review_time ON issue_spans(review_time); + +-- Indexes for issue_events +CREATE INDEX IF NOT EXISTS idx_issue_events_issue_id ON issue_events(issue_id); +CREATE INDEX IF NOT EXISTS idx_issue_events_created ON issue_events(created_at); +CREATE INDEX IF NOT EXISTS idx_issue_events_type ON issue_events(event_type); + +COMMENT ON TABLE issues IS 'Aggregated issues derived from negative/mixed spans'; +COMMENT ON TABLE issue_spans IS 'Links between issues and their source spans'; +COMMENT ON TABLE issue_events IS 'Audit log for issue state changes'; diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/005_create_facts_table.sql b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/005_create_facts_table.sql new file mode 100644 index 0000000..c565700 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/migrations/005_create_facts_table.sql @@ -0,0 +1,97 @@ +-- Migration: 005_create_facts_table.sql +-- Purpose: Create the fact_timeseries table for Stage 4 aggregation + +-- Subject type enum +DO $$ BEGIN + CREATE TYPE subject_type AS ENUM ('overall', 'urt_code', 'domain', 'issue'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Bucket type enum +DO $$ BEGIN + CREATE TYPE bucket_type AS ENUM ('day', 'week', 'month'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Fact timeseries table +CREATE TABLE IF NOT EXISTS fact_timeseries ( + id BIGSERIAL PRIMARY KEY, + + -- Dimension keys + business_id VARCHAR(255) NOT NULL, + place_id VARCHAR(255) NOT NULL, -- Or 'ALL' for rollup + period_date DATE NOT NULL, + bucket_type bucket_type NOT NULL DEFAULT 'day', + 
subject_type subject_type NOT NULL DEFAULT 'urt_code', + subject_id VARCHAR(50) NOT NULL, -- URT code, domain letter, or issue_id + taxonomy_version VARCHAR(20) NOT NULL, + + -- Core counts + review_count INTEGER NOT NULL DEFAULT 0, + span_count INTEGER NOT NULL DEFAULT 0, + + -- Valence counts + negative_count INTEGER NOT NULL DEFAULT 0, + positive_count INTEGER NOT NULL DEFAULT 0, + neutral_count INTEGER NOT NULL DEFAULT 0, + mixed_count INTEGER NOT NULL DEFAULT 0, + + -- Strength scores + strength_score REAL NOT NULL DEFAULT 0.0, + negative_strength REAL NOT NULL DEFAULT 0.0, + positive_strength REAL NOT NULL DEFAULT 0.0, + + -- Rating + avg_rating REAL, + + -- Intensity counts + i1_count INTEGER NOT NULL DEFAULT 0, + i2_count INTEGER NOT NULL DEFAULT 0, + i3_count INTEGER NOT NULL DEFAULT 0, + + -- Comparative counts + cr_better INTEGER NOT NULL DEFAULT 0, + cr_worse INTEGER NOT NULL DEFAULT 0, + cr_same INTEGER NOT NULL DEFAULT 0, + + -- Trust-weighted metrics + trust_weighted_strength REAL NOT NULL DEFAULT 0.0, + trust_weighted_negative REAL NOT NULL DEFAULT 0.0, + + -- Metadata + computed_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + + -- Unique constraint for upsert + CONSTRAINT fact_timeseries_unique UNIQUE ( + business_id, place_id, period_date, bucket_type, + subject_type, subject_id, taxonomy_version + ) +); + +-- Indexes for fact_timeseries +CREATE INDEX IF NOT EXISTS idx_facts_business_id ON fact_timeseries(business_id); +CREATE INDEX IF NOT EXISTS idx_facts_place_id ON fact_timeseries(place_id); +CREATE INDEX IF NOT EXISTS idx_facts_period ON fact_timeseries(period_date); +CREATE INDEX IF NOT EXISTS idx_facts_bucket ON fact_timeseries(bucket_type); +CREATE INDEX IF NOT EXISTS idx_facts_subject_type ON fact_timeseries(subject_type); +CREATE INDEX IF NOT EXISTS idx_facts_subject_id ON fact_timeseries(subject_id); + +-- Composite index for common dashboard queries +CREATE INDEX IF NOT 
class ReviewRepository:
    """Repository for review data operations.

    Wraps the ``reviews_raw`` and ``reviews_enriched`` tables. Every method
    issues a single statement through the injected database pool.
    """

    def __init__(self, db: DatabasePool):
        """Store the database pool used by all queries."""
        self.db = db

    async def insert_raw_review(
        self,
        review: RawReview,
        place_id: str,
        source: str = "google",
    ) -> int:
        """Insert a raw review and return its ID.

        On a ``(source, review_id, review_version)`` conflict only
        ``pulled_at`` is refreshed; the existing row's id is returned.

        Args:
            review: Raw review mapping; ``review_id``, ``rating``,
                ``review_time`` and ``author_name`` are required keys.
            place_id: Place the review belongs to.
            source: Review source label.

        Returns:
            The ``id`` of the inserted or refreshed ``reviews_raw`` row.
        """
        query = """
            INSERT INTO reviews_raw (
                source, review_id, place_id, raw_payload,
                review_text, rating, review_time, reviewer_name, reviewer_id,
                review_version, pulled_at
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, NOW())
            ON CONFLICT (source, review_id, review_version) DO UPDATE SET
                pulled_at = NOW()
            RETURNING id
        """
        raw_id = await self.db.fetchval(
            query,
            source,
            review["review_id"],
            place_id,
            json.dumps(review.get("raw_payload", {})),
            review.get("text"),
            review["rating"],
            review["review_time"],
            review["author_name"],
            review.get("author_id"),
            1,  # Initial version
        )
        return raw_id

    async def insert_enriched_review(
        self,
        review: NormalizedReview,
        raw_id: int,
        taxonomy_version: str = "v5.1",
    ) -> int:
        """Insert an enriched review stub (pre-classification).

        Args:
            review: Normalized review produced by Stage 1.
            raw_id: ID of the matching ``reviews_raw`` row.
            taxonomy_version: Placeholder taxonomy version stored on the stub;
                Stage 2 overwrites it when the review is classified.

        Returns:
            The ``id`` of the inserted (or conflicting) ``reviews_enriched`` row.
        """
        query = """
            INSERT INTO reviews_enriched (
                source, review_id, review_version, is_latest, raw_id,
                business_id, place_id, text, text_normalized, rating, review_time,
                language, taxonomy_version
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
            ON CONFLICT (source, review_id, review_version) DO UPDATE SET
                is_latest = EXCLUDED.is_latest
            RETURNING id
        """
        enriched_id = await self.db.fetchval(
            query,
            review["source"],
            review["review_id"],
            review["review_version"],
            True,  # is_latest
            raw_id,
            review["business_id"],
            review["place_id"],
            review["text"],
            review["text_normalized"],
            review["rating"],
            review["review_time"],
            review["text_language"],
            taxonomy_version,
        )
        return enriched_id

    async def update_enriched_with_classification(
        self,
        classified: ClassifiedReview,
        model_version: str,
        taxonomy_version: str,
    ) -> None:
        """Update an enriched review with classification results.

        The row is addressed by the review's composite key
        ``(source, review_id, review_version)``. Optional fields missing from
        ``classified`` fall back to neutral defaults (empty collections,
        ``"CR-N"``, trust score ``0.5``).
        """
        query = """
            UPDATE reviews_enriched SET
                urt_primary = $1,
                urt_secondary = $2,
                valence = $3,
                intensity = $4,
                comparative = $5,
                staff_mentions = $6,
                quotes = $7,
                embedding = $8,
                trust_score = $9,
                classification_model = $10,
                classification_confidence = $11,
                taxonomy_version = $12,
                processed_at = NOW()
            WHERE source = $13
              AND review_id = $14
              AND review_version = $15
        """
        await self.db.execute(
            query,
            classified["urt_primary"],
            classified.get("urt_secondary", []),
            classified["valence"],
            classified["intensity"],
            classified.get("comparative", "CR-N"),
            classified.get("staff_mentions", []),
            json.dumps(classified.get("quotes", {})),
            classified.get("embedding", []),
            classified.get("trust_score", 0.5),
            model_version,
            json.dumps(classified.get("classification_confidence", {})),
            taxonomy_version,
            classified["source"],
            classified["review_id"],
            classified["review_version"],
        )

    async def get_unclassified_reviews(
        self,
        limit: int = 100,
    ) -> list[dict[str, Any]]:
        """Get the newest latest-version reviews that have no primary URT code yet."""
        query = """
            SELECT
                source, review_id, review_version, business_id, place_id,
                text, text_normalized, rating, review_time
            FROM reviews_enriched
            WHERE urt_primary IS NULL
              AND is_latest = TRUE
            ORDER BY review_time DESC
            LIMIT $1
        """
        rows = await self.db.fetch(query, limit)
        return [dict(r) for r in rows]

    async def get_review_by_id(
        self,
        source: str,
        review_id: str,
        review_version: int,
    ) -> dict[str, Any] | None:
        """Get a specific review by its composite key, or ``None`` if absent."""
        query = """
            SELECT * FROM reviews_enriched
            WHERE source = $1 AND review_id = $2 AND review_version = $3
        """
        row = await self.db.fetchrow(query, source, review_id, review_version)
        return dict(row) if row else None

    async def check_duplicate(
        self,
        content_hash: str,
        business_id: str,
    ) -> str | None:
        """Check if a content hash already exists, return dedup_group_id if so.

        Placeholder: no lookup is performed and ``None`` ("no duplicate") is
        always returned. A real implementation needs a dedicated dedup table
        with an index on ``content_hash``.
        """
        # TODO: back this with a dedup table keyed by (business_id, content_hash).
        # The previous body built a SQL string that was never executed.
        return None
business_id: str, + place_id: str, + source: str, + review_id: str, + review_version: int, + review_time: str, + batch_id: str, + model_version: str, + taxonomy_version: str, + ) -> None: + """Insert a span into the database.""" + query = """ + INSERT INTO review_spans ( + span_id, business_id, place_id, source, review_id, review_version, + span_index, span_text, span_start, span_end, + profile, urt_primary, urt_secondary, valence, intensity, comparative, + specificity, actionability, temporal, evidence, + entity, entity_type, entity_normalized, + relation_type, related_span_id, causal_chain, + is_primary, is_active, review_time, + confidence, usn, taxonomy_version, model_version, ingest_batch_id + ) VALUES ( + $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, + $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, + $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, + $31, $32, $33, $34 + ) + ON CONFLICT (span_id) DO UPDATE SET + is_active = EXCLUDED.is_active + """ + # Build related_span_id from index if needed + related_span_id = None + if span.get("related_span_index") is not None: + # Build the related span_id (would need the actual span_id mapping) + pass + + await self.db.execute( + query, + span["span_id"], + business_id, + place_id, + source, + review_id, + review_version, + span["span_index"], + span["span_text"], + span["span_start"], + span["span_end"], + span.get("profile", "standard"), + span["urt_primary"], + span.get("urt_secondary", []), + span["valence"], + span["intensity"], + span.get("comparative", "CR-N"), + span.get("specificity"), + span.get("actionability"), + span.get("temporal"), + span.get("evidence"), + span.get("entity"), + span.get("entity_type"), + span.get("entity_normalized"), + span.get("relation_type"), + related_span_id, + json.dumps(span.get("causal_chain")) if span.get("causal_chain") else None, + span.get("is_primary", False), + True, # is_active + review_time, + span.get("confidence", "medium"), + span["usn"], + taxonomy_version, + 
class IssueRepository:
    """Repository for issue data operations.

    Covers the ``issues``, ``issue_spans`` and ``issue_events`` tables.
    """

    def __init__(self, db: DatabasePool):
        """Store the database pool used by all queries."""
        self.db = db

    async def upsert_issue(
        self,
        issue_id: str,
        business_id: str,
        place_id: str,
        primary_subcode: str,
        intensity: str,
        entity: str | None,
        entity_normalized: str | None,
        taxonomy_version: str,
    ) -> bool:
        """Create or update an issue. Returns True if newly created.

        The insert is attempted first with ``ON CONFLICT DO NOTHING`` so two
        concurrent routers cannot both pass a "does it exist?" check and then
        collide on the insert. If the row already exists, its span count is
        bumped and ``max_intensity`` is raised when the new span is stronger.

        NOTE(review): assumes ``issues.issue_id`` carries a primary-key or
        unique constraint (required by ``ON CONFLICT (issue_id)``) - confirm
        against the issues migration.

        Args:
            issue_id: Deterministic issue identifier.
            business_id: Owning business.
            place_id: Place the issue belongs to.
            primary_subcode: URT code; its first character is stored as the
                domain (``"O"`` when the code is empty).
            intensity: Intensity of the span being routed (``I1``/``I2``/``I3``).
            entity: Raw entity mention, if any.
            entity_normalized: Normalized entity, if any.
            taxonomy_version: Taxonomy version stored on a new issue.

        Returns:
            ``True`` if a new issue row was inserted, ``False`` if an existing
            one was updated.
        """
        domain = primary_subcode[0] if primary_subcode else "O"
        inserted_id = await self.db.fetchval(
            """
            INSERT INTO issues (
                issue_id, business_id, place_id, primary_subcode, domain,
                state, priority_score, confidence_score, span_count, max_intensity,
                entity, entity_normalized, taxonomy_version
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
            ON CONFLICT (issue_id) DO NOTHING
            RETURNING issue_id
            """,
            issue_id,
            business_id,
            place_id,
            primary_subcode,
            domain,
            "open",
            1.0,  # Initial priority
            1.0,  # Initial confidence
            1,  # Initial span count
            intensity,
            entity,
            entity_normalized,
            taxonomy_version,
        )
        if inserted_id is not None:
            return True

        # No row came back, so the issue already existed: fold the new span in.
        await self.db.execute(
            """
            UPDATE issues SET
                span_count = span_count + 1,
                max_intensity = CASE
                    WHEN $1 = 'I3' THEN 'I3'
                    WHEN $1 = 'I2' AND max_intensity != 'I3' THEN 'I2'
                    ELSE max_intensity
                END,
                updated_at = NOW()
            WHERE issue_id = $2
            """,
            intensity,
            issue_id,
        )
        return False

    async def link_span_to_issue(
        self,
        routed: RoutedSpan,
        source: str,
        review_id: str,
        review_version: int,
        intensity: str,
        review_time: str,
        is_primary_match: bool = True,
    ) -> None:
        """Link a span to an issue.

        A span that is already linked is left untouched
        (``ON CONFLICT (span_id) DO NOTHING``).
        """
        await self.db.execute(
            """
            INSERT INTO issue_spans (
                issue_id, span_id, source, review_id, review_version,
                is_primary_match, intensity, review_time
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            ON CONFLICT (span_id) DO NOTHING
            """,
            routed["issue_id"],
            routed["span_id"],
            source,
            review_id,
            review_version,
            is_primary_match,
            intensity,
            review_time,
        )

    async def log_event(
        self,
        issue_id: str,
        event_type: str,
        span_id: str | None = None,
        old_value: str | None = None,
        new_value: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Log an issue event for audit trail.

        ``metadata`` is stored as a JSON string, or NULL when empty or omitted.
        """
        await self.db.execute(
            """
            INSERT INTO issue_events (
                issue_id, event_type, span_id, old_value, new_value, metadata
            ) VALUES ($1, $2, $3, $4, $5, $6)
            """,
            issue_id,
            event_type,
            span_id,
            old_value,
            new_value,
            json.dumps(metadata) if metadata else None,
        )

    async def get_issue_by_id(self, issue_id: str) -> dict[str, Any] | None:
        """Get an issue by its ID, or ``None`` if it does not exist."""
        query = "SELECT * FROM issues WHERE issue_id = $1"
        row = await self.db.fetchrow(query, issue_id)
        return dict(row) if row else None

    async def check_span_already_linked(self, span_id: str) -> str | None:
        """Return the issue ID a span is linked to, or ``None`` if unlinked."""
        return await self.db.fetchval(
            "SELECT issue_id FROM issue_spans WHERE span_id = $1",
            span_id,
        )
EXCLUDED.strength_score, + negative_strength = EXCLUDED.negative_strength, + positive_strength = EXCLUDED.positive_strength, + avg_rating = EXCLUDED.avg_rating, + i1_count = EXCLUDED.i1_count, + i2_count = EXCLUDED.i2_count, + i3_count = EXCLUDED.i3_count, + cr_better = EXCLUDED.cr_better, + cr_worse = EXCLUDED.cr_worse, + cr_same = EXCLUDED.cr_same, + trust_weighted_strength = EXCLUDED.trust_weighted_strength, + trust_weighted_negative = EXCLUDED.trust_weighted_negative, + computed_at = NOW() + """, + fact["business_id"], + fact["place_id"], + fact["period_date"], + fact["bucket_type"], + fact["subject_type"], + fact["subject_id"], + fact["taxonomy_version"], + fact["review_count"], + fact["span_count"], + fact["negative_count"], + fact["positive_count"], + fact["neutral_count"], + fact["mixed_count"], + fact["strength_score"], + fact["negative_strength"], + fact["positive_strength"], + fact.get("avg_rating"), + fact["i1_count"], + fact["i2_count"], + fact["i3_count"], + fact["cr_better"], + fact["cr_worse"], + fact["cr_same"], + fact["trust_weighted_strength"], + fact["trust_weighted_negative"], + ) + + async def get_aggregation_data( + self, + business_id: str, + start_date: date, + end_date: date, + ) -> list[dict[str, Any]]: + """Get span data for aggregation within a date range.""" + query = """ + SELECT + rs.business_id, + rs.place_id, + DATE(rs.review_time) as review_date, + rs.urt_primary, + rs.valence, + rs.intensity, + rs.comparative, + re.trust_score, + re.rating + FROM review_spans rs + JOIN reviews_enriched re ON ( + re.source = rs.source + AND re.review_id = rs.review_id + AND re.review_version = rs.review_version + ) + WHERE rs.business_id = $1 + AND rs.is_active = TRUE + AND DATE(rs.review_time) BETWEEN $2 AND $3 + """ + rows = await self.db.fetch(query, business_id, start_date, end_date) + return [dict(r) for r in rows] + + async def get_place_ids_for_business( + self, + business_id: str, + ) -> list[str]: + """Get all place IDs for a business.""" 
class PipelineResult:
    """Container for the outputs of a pipeline run.

    Holds one optional output per stage plus the validation results keyed by
    stage name. Stages that did not run stay ``None``.
    """

    def __init__(
        self,
        stage1: Stage1Output | None = None,
        stage2: Stage2Output | None = None,
        stage3: Stage3Output | None = None,
        stage4: Stage4Output | None = None,
        validation: dict[str, ValidationResult] | None = None,
    ):
        self.stage1, self.stage2 = stage1, stage2
        self.stage3, self.stage4 = stage3, stage4
        # A missing (or empty) mapping becomes a fresh dict.
        self.validation = validation if validation else {}

    @property
    def success(self) -> bool:
        """Whether every recorded validation result passed.

        True when no validation results are recorded.
        """
        for outcome in self.validation.values():
            if not outcome["passed"]:
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return the stage outputs, validation results and success flag as a dict."""
        payload: dict[str, Any] = {
            stage_key: getattr(self, stage_key)
            for stage_key in ("stage1", "stage2", "stage3", "stage4")
        }
        payload["validation"] = self.validation
        payload["success"] = self.success
        return payload
+ + Args: + config: Pipeline configuration + """ + self.config = config + self._db: DatabasePool | None = None + self._review_repo: ReviewRepository | None = None + self._span_repo: SpanRepository | None = None + self._issue_repo: IssueRepository | None = None + self._fact_repo: FactRepository | None = None + self._embedding_service: EmbeddingService | None = None + self._initialized = False + + async def initialize(self) -> None: + """Initialize database connections and services.""" + if self._initialized: + return + + logger.info("Initializing pipeline...") + + # Initialize database + self._db = DatabasePool(self.config) + await self._db.initialize() + + # Initialize repositories + self._review_repo = ReviewRepository(self._db) + self._span_repo = SpanRepository(self._db) + self._issue_repo = IssueRepository(self._db) + self._fact_repo = FactRepository(self._db) + + # Initialize embedding service + self._embedding_service = EmbeddingService(self.config) + + self._initialized = True + logger.info("Pipeline initialized") + + async def close(self) -> None: + """Close all connections and cleanup resources.""" + if self._db: + await self._db.close() + self._db = None + + self._initialized = False + logger.info("Pipeline closed") + + async def migrate(self) -> int: + """ + Run database migrations. + + Returns: + Number of migrations run + """ + if not self._db: + self._db = DatabasePool(self.config) + await self._db.initialize() + + return await self._db.run_migrations() + + async def process( + self, + scraper_output: ScraperOutput, + stages: list[int] | None = None, + validate: bool = True, + ) -> PipelineResult: + """ + Run the full pipeline on scraper output. 
+ + Args: + scraper_output: Output from the scraper (Stage 0) + stages: List of stages to run (default: all [1, 2, 3, 4]) + validate: Whether to validate each stage output + + Returns: + PipelineResult with all stage outputs and validation results + """ + await self.initialize() + + stages = stages or [1, 2, 3, 4] + result = PipelineResult() + validation_results: dict[str, ValidationResult] = {} + + # Stage 1: Normalize + if 1 in stages: + logger.info("Running Stage 1: Normalization") + result.stage1 = await self.normalize(scraper_output) + + if validate: + validation_results["stage1"] = validate_stage1_output(result.stage1) + + # Stage 2: Classify + if 2 in stages and result.stage1: + logger.info("Running Stage 2: Classification") + result.stage2 = await self.classify(result.stage1) + + if validate: + # Build input reviews map for validation + input_reviews = { + (r["source"], r["review_id"], r["review_version"]): r + for r in result.stage1["reviews_normalized"] + } + validation_results["stage2"] = validate_stage2_output( + result.stage2, input_reviews + ) + + # Stage 3: Route + if 3 in stages and result.stage2: + logger.info("Running Stage 3: Issue Routing") + result.stage3 = await self.route(result.stage2) + + if validate: + validation_results["stage3"] = await validate_stage3_output( + result.stage3, self._db + ) + + # Stage 4: Aggregate + if 4 in stages: + logger.info("Running Stage 4: Aggregation") + result.stage4 = await self.aggregate( + scraper_output["business_id"], + date.today().isoformat(), + ) + + if validate: + validation_results["stage4"] = validate_stage4_output(result.stage4) + + result.validation = validation_results + return result + + async def normalize(self, scraper_output: ScraperOutput) -> Stage1Output: + """ + Run Stage 1: Normalization. 
+ + Args: + scraper_output: Raw scraper output + + Returns: + Stage1Output with normalized reviews + """ + await self.initialize() + + stage1 = Stage1Normalizer( + self.config, + self._db, + self._review_repo, + ) + + input_data = Stage1Input( + job_id=scraper_output["job_id"], + business_id=scraper_output["business_id"], + place_id=scraper_output["place_id"], + reviews=scraper_output["reviews"], + ) + + return await stage1.process(input_data) + + async def classify(self, stage1_output: Stage1Output) -> Stage2Output: + """ + Run Stage 2: Classification. + + Args: + stage1_output: Output from Stage 1 + + Returns: + Stage2Output with classified reviews + """ + await self.initialize() + + stage2 = Stage2Classifier( + self.config, + self._db, + self._review_repo, + self._span_repo, + self._embedding_service, + ) + + # Convert normalized reviews to classification input + reviews_to_classify = [ + ReviewToClassify( + source=r["source"], + review_id=r["review_id"], + review_version=r["review_version"], + business_id=r["business_id"], + place_id=r["place_id"], + text=r["text"], + text_normalized=r["text_normalized"], + rating=r["rating"], + review_time=r["review_time"], + ) + for r in stage1_output["reviews_normalized"] + ] + + input_data = Stage2Input( + reviews=reviews_to_classify, + config=ClassificationConfig( + model=self.config.llm_model, + taxonomy_version=self.config.taxonomy_version, + profile=self.config.classification_profile, + max_spans_per_review=self.config.max_spans_per_review, + ), + ) + + try: + return await stage2.process(input_data) + finally: + await stage2.close() + + async def route(self, stage2_output: Stage2Output) -> Stage3Output: + """ + Run Stage 3: Issue Routing. 
+ + Args: + stage2_output: Output from Stage 2 + + Returns: + Stage3Output with routing results + """ + await self.initialize() + + stage3 = Stage3Router( + self.config, + self._db, + self._span_repo, + self._issue_repo, + ) + + # Extract negative/mixed spans for routing + spans_to_route = [] + for review in stage2_output["reviews_classified"]: + for span in review.get("spans", []): + if span["valence"] in ("V-", "V±"): + spans_to_route.append( + SpanToRoute( + span_id=span["span_id"], + business_id=review.get("business_id", ""), + place_id=review.get("place_id", ""), + urt_primary=span["urt_primary"], + valence=span["valence"], + intensity=span["intensity"], + entity_normalized=span.get("entity_normalized"), + review_time=review.get("review_time", ""), + confidence=span.get("confidence", "medium"), + trust_score=review.get("trust_score", 0.5), + ) + ) + + return await stage3.process(Stage3Input(spans=spans_to_route)) + + async def aggregate( + self, + business_id: str, + date_str: str, + bucket_types: list[str] | None = None, + ) -> Stage4Output: + """ + Run Stage 4: Fact Aggregation. + + Args: + business_id: Business identifier + date_str: Date string (YYYY-MM-DD) + bucket_types: List of bucket types (default: ['day']) + + Returns: + Stage4Output with aggregated facts + """ + await self.initialize() + + stage4 = Stage4Aggregator( + self.config, + self._db, + self._fact_repo, + ) + + input_data = Stage4Input( + business_id=business_id, + date=date_str, + bucket_types=bucket_types or ["day"], # type: ignore + taxonomy_version=self.config.taxonomy_version, + ) + + return await stage4.process(input_data) + + async def validate(self, job_id: str) -> dict[str, ValidationResult]: + """ + Validate pipeline output for a job. 
class EmbeddingService:
    """
    Service for generating text embeddings using sentence-transformers.

    Uses the all-MiniLM-L6-v2 model by default, which produces 384-dimensional
    embeddings suitable for semantic similarity and clustering.

    The model is loaded lazily, on the first call that actually needs to
    encode text. Empty or whitespace-only inputs are answered with zero
    vectors of ``config.embedding_dimension`` length without loading it.
    """

    def __init__(self, config: Config):
        """
        Initialize the embedding service.

        Args:
            config: Pipeline configuration with embedding model settings
        """
        self.config = config
        self.model_name = config.embedding_model
        self.dimension = config.embedding_dimension
        self._model = None
        self._initialized = False

    def _ensure_initialized(self) -> None:
        """Lazy initialization of the sentence-transformers model.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if self._initialized:
            return

        # Guard only the import, so an ImportError raised while the model is
        # being constructed is not mislabelled as "package not installed".
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as err:
            raise ImportError(
                "sentence-transformers is required for embeddings. "
                "Install with: pip install sentence-transformers"
            ) from err

        logger.info(f"Loading embedding model: {self.model_name}")
        self._model = SentenceTransformer(self.model_name)
        self._initialized = True
        logger.info(f"Embedding model loaded. Dimension: {self._model.get_sentence_embedding_dimension()}")

    def embed(self, text: str) -> list[float]:
        """
        Generate embedding for a single text.

        Args:
            text: Text to embed

        Returns:
            List of floats representing the embedding vector; a zero vector
            for empty or whitespace-only text
        """
        if not text or not text.strip():
            # Zero vector for empty text - no model needed for this
            return [0.0] * self.dimension

        self._ensure_initialized()
        embedding = self._model.encode(text, convert_to_numpy=True)
        return embedding.tolist()

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """
        Generate embeddings for multiple texts.

        More efficient than calling embed() repeatedly.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors, positionally aligned with ``texts``;
            empty or whitespace-only entries get zero vectors
        """
        if not texts:
            return []

        # Start from zero vectors; only non-empty entries are overwritten
        result = [[0.0] * self.dimension for _ in texts]
        non_empty_indices = [i for i, t in enumerate(texts) if t and t.strip()]
        if not non_empty_indices:
            return result

        # Load the model only now that there is real text to encode
        self._ensure_initialized()
        non_empty_texts = [texts[i] for i in non_empty_indices]
        embeddings = self._model.encode(non_empty_texts, convert_to_numpy=True)

        for idx, emb in zip(non_empty_indices, embeddings):
            result[idx] = emb.tolist()

        return result

    def similarity(self, embedding1: list[float], embedding2: list[float]) -> float:
        """
        Calculate cosine similarity between two embeddings.

        Args:
            embedding1: First embedding vector
            embedding2: Second embedding vector

        Returns:
            Cosine similarity score between -1 and 1; 0.0 if either vector
            has zero norm
        """
        vec1 = np.array(embedding1)
        vec2 = np.array(embedding2)

        # Handle zero vectors
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return float(np.dot(vec1, vec2) / (norm1 * norm2))

    def find_similar(
        self,
        query_embedding: list[float],
        candidate_embeddings: list[list[float]],
        top_k: int = 5,
        threshold: float = 0.0,
    ) -> list[tuple[int, float]]:
        """
        Find most similar embeddings to a query.

        Zero-norm candidates are scored 0.0. A zero-norm query yields no
        results.

        Args:
            query_embedding: Query embedding vector
            candidate_embeddings: List of candidate embeddings
            top_k: Number of top results to return
            threshold: Minimum similarity threshold

        Returns:
            List of (index, similarity) tuples, sorted by similarity descending
        """
        if not candidate_embeddings:
            return []

        query = np.array(query_embedding)
        candidates = np.array(candidate_embeddings)

        # Compute all similarities at once
        query_norm = np.linalg.norm(query)
        if query_norm == 0:
            return []

        candidate_norms = np.linalg.norm(candidates, axis=1)

        # Avoid division by zero
        valid_mask = candidate_norms > 0
        similarities = np.zeros(len(candidates))
        similarities[valid_mask] = (
            np.dot(candidates[valid_mask], query)
            / (candidate_norms[valid_mask] * query_norm)
        )

        # Filter by threshold and get top k
        results = [
            (i, float(sim))
            for i, sim in enumerate(similarities)
            if sim >= threshold
        ]
        results.sort(key=lambda x: x[1], reverse=True)

        return results[:top_k]

    @property
    def model(self):
        """Get the underlying sentence-transformers model, loading it if needed."""
        self._ensure_initialized()
        return self._model
+ + Args: + embeddings: List of embedding vectors + + Returns: + Averaged embedding vector + """ + if not embeddings: + raise ValueError("Cannot average empty embedding list") + + arr = np.array(embeddings) + return arr.mean(axis=0).tolist() diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/llm_client.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/llm_client.py new file mode 100644 index 0000000..738b409 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/services/llm_client.py @@ -0,0 +1,432 @@ +""" +LLM client abstraction supporting OpenAI and Anthropic. + +Provides a unified interface for classification requests with: +- Provider abstraction (OpenAI/Anthropic) +- Structured output (JSON mode) +- Retry handling +- Cost tracking +""" + +from __future__ import annotations + +import json +import logging +import time +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from reviewiq_pipeline.config import Config + from reviewiq_pipeline.contracts import LLMClassificationResponse + +logger = logging.getLogger(__name__) + +# System prompt for URT classification +SYSTEM_PROMPT = """You are a review classification system using URT (Universal Review Taxonomy) v5.1. + +Your task is to extract semantic spans from customer reviews and classify each span independently. + +## SPAN EXTRACTION RULES + +1. **Split on contrasting conjunctions**: but, however, although, despite, yet, though +2. **Split on topic/target change**: food → service → bathroom = 3 spans +3. **Split on valence change**: positive → negative = split +4. **Split on domain change**: O (Offering) → J (Journey) → E (Environment) = split +5. 
**Keep together**: cause→effect within same feedback unit ("X because Y" = 1 span) + +**Guardrails**: +- Max 3 spans per sentence (if 4+, re-check for over-splitting) +- Min 1 span per review (even single-word reviews) +- Spans must be non-overlapping and cover meaningful content + +## URT DOMAINS (Tier-3 codes: X#.##) + +| Domain | Code | Description | +|--------|------|-------------| +| Offering | O1-O4 | Product/service quality, features, variety | +| Price | P1-P4 | Value, pricing, promotions, payment | +| Journey | J1-J4 | Timing, process, convenience, accessibility | +| Environment | E1-E4 | Physical space, ambiance, cleanliness, digital UX | +| Attitude | A1-A4 | Staff behavior, helpfulness, professionalism | +| Voice | V1-V4 | Brand, communication, marketing, transparency | +| Relationship | R1-R4 | Loyalty, trust, consistency, personalization | + +## DIMENSION CODES + +### Valence +- V+ : Positive sentiment +- V- : Negative sentiment +- V0 : Neutral/factual +- V± : Mixed within the span + +### Intensity +- I1 : Low ("okay", "fine", "decent") +- I2 : Moderate ("good", "bad", "slow") +- I3 : High ("amazing", "terrible", "unacceptable") + +### Specificity +- S1 : Vague ("it was bad") +- S2 : Some detail ("the food was cold") +- S3 : Precise ("waited 45 minutes for appetizers") + +### Actionability +- A1 : No clear action possible +- A2 : Possible actions, unclear which +- A3 : Clear, specific action ("train staff on X", "fix Y") + +### Temporal +- TC : Current visit (default when no markers) +- TR : Recent pattern ("lately", "recently", "again") +- TH : Historical ("for years", "always", "used to") +- TF : Future ("won't return", "next time", "I expect") + +### Evidence +- ES : Stated explicitly in text (default) +- EI : Inferred logically (not stated, but entailed) +- EC : Contextual (depends on surrounding text) + +### Comparative +- CR-N : No comparison (default) +- CR-B : Better than alternatives +- CR-W : Worse than alternatives +- CR-S : Same as 
class LLMClientBase(ABC):
    """Common interface and usage bookkeeping shared by every LLM backend.

    Concrete clients implement ``classify`` and ``close``. The two running
    totals initialised here are meant to be incremented by subclasses after
    each request.
    """

    def __init__(self, config: Config):
        # Running totals start at zero; subclasses add to them per call.
        self.total_cost_usd = 0.0
        self.total_tokens_used = 0
        self.config = config

    @abstractmethod
    async def classify(
        self,
        review_text: str,
        profile: str = "standard",
    ) -> tuple[LLMClassificationResponse, dict[str, Any]]:
        """
        Extract and classify the spans of a single review.

        Args:
            review_text: The review text to classify
            profile: Classification profile (lite/core/standard/full)

        Returns:
            Tuple of (classification response, metadata dict with tokens/cost)
        """

    @abstractmethod
    async def close(self) -> None:
        """Release whatever resources the client holds."""
self.total_tokens_used += total_tokens + self.total_cost_usd += cost + + metadata = { + "model": self.model, + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + "cost_usd": cost, + "latency_ms": int((time.time() - start_time) * 1000), + } + + return result, metadata + + async def close(self) -> None: + """Close the OpenAI client.""" + await self.client.close() + + +class AnthropicClient(LLMClientBase): + """Anthropic LLM client implementation.""" + + # Pricing per 1M tokens (as of 2024) + PRICING = { + "claude-3-opus-20240229": {"input": 15.0, "output": 75.0}, + "claude-3-sonnet-20240229": {"input": 3.0, "output": 15.0}, + "claude-3-haiku-20240307": {"input": 0.25, "output": 1.25}, + "claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0}, + } + + def __init__(self, config: Config): + super().__init__(config) + from anthropic import AsyncAnthropic + + self.client = AsyncAnthropic(api_key=config.get_llm_api_key()) + self.model = config.llm_model + + async def classify( + self, + review_text: str, + profile: str = "standard", + ) -> tuple[LLMClassificationResponse, dict[str, Any]]: + """Classify using Anthropic.""" + start_time = time.time() + + response = await self.client.messages.create( + model=self.model, + max_tokens=4096, + system=SYSTEM_PROMPT, + messages=[ + { + "role": "user", + "content": f'Classify this review and return JSON only:\n\n"{review_text}"', + }, + ], + temperature=self.config.llm_temperature, + ) + + # Parse response + content = response.content[0].text if response.content else "" + if not content: + raise ValueError("Empty response from Anthropic") + + # Try to extract JSON from response + result = self._extract_json(content) + + # Calculate costs + input_tokens = response.usage.input_tokens + output_tokens = response.usage.output_tokens + total_tokens = input_tokens + output_tokens + + pricing = self.PRICING.get(self.model, {"input": 3.0, "output": 15.0}) + cost = (input_tokens * 
pricing["input"] + output_tokens * pricing["output"]) / 1_000_000 + + self.total_tokens_used += total_tokens + self.total_cost_usd += cost + + metadata = { + "model": self.model, + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + "cost_usd": cost, + "latency_ms": int((time.time() - start_time) * 1000), + } + + return result, metadata + + def _extract_json(self, content: str) -> dict[str, Any]: + """Extract JSON from response, handling markdown code blocks.""" + content = content.strip() + + # Try direct parse first + try: + return json.loads(content) + except json.JSONDecodeError: + pass + + # Try to find JSON in code blocks + import re + + json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content) + if json_match: + return json.loads(json_match.group(1)) + + # Try to find JSON object + json_match = re.search(r"\{[\s\S]*\}", content) + if json_match: + return json.loads(json_match.group(0)) + + raise ValueError(f"Could not extract JSON from response: {content[:200]}") + + async def close(self) -> None: + """Close the Anthropic client.""" + await self.client.close() + + +class LLMClient: + """ + Factory class for LLM clients. + + Usage: + client = LLMClient.create(config) + result, metadata = await client.classify(review_text) + await client.close() + """ + + @staticmethod + def create(config: Config) -> LLMClientBase: + """ + Create an LLM client based on configuration. + + Args: + config: Pipeline configuration + + Returns: + LLM client instance (OpenAI or Anthropic) + """ + if config.llm_provider == "openai": + return OpenAIClient(config) + elif config.llm_provider == "anthropic": + return AnthropicClient(config) + else: + raise ValueError(f"Unsupported LLM provider: {config.llm_provider}") + + +def create_fallback_response(review_text: str) -> LLMClassificationResponse: + """ + Create a fallback classification response when LLM fails. 
class TextProcessor:
    """Service for text normalization and processing.

    Provides normalization for hashing/deduplication (``normalize``), a
    lighter cleanup for LLM input (``clean_for_llm``), language detection
    and a few small text predicates.
    """

    # Emoji code-point ranges. Each range is listed explicitly: a single wide
    # range such as U+24C2..U+1F251 would also match CJK, Hangul and most of
    # the upper BMP, making ordinary non-Latin text look like emoji.
    EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"  # circled latin capital M
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "]+",
        flags=re.UNICODE,
    )

    # Control characters (except newlines and tabs we want to normalize)
    CONTROL_CHAR_PATTERN = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")

    # Multiple whitespace
    MULTI_WHITESPACE_PATTERN = re.compile(r"\s+")

    # URL pattern
    URL_PATTERN = re.compile(
        r"https?://[^\s<>\"{}|\\^`\[\]]+|www\.[^\s<>\"{}|\\^`\[\]]+"
    )

    def __init__(self):
        # langdetect is optional; without it every text is reported as "en".
        self._langdetect_available = False
        try:
            from langdetect import DetectorFactory
        except ImportError:
            logger.warning("langdetect not available, defaulting to 'en' for all text")
        else:
            # Make detection deterministic
            DetectorFactory.seed = 0
            self._langdetect_available = True

    def normalize(self, text: str) -> NormalizationResult:
        """
        Normalize text for classification.

        Steps:
        1. Remove control characters
        2. Normalize Unicode (NFC)
        3. Lowercase
        4. Normalize whitespace (collapse runs to one space, trim)
        5. Detect language (on the original, un-lowercased text)

        Emoji are left untouched.

        Args:
            text: Original review text

        Returns:
            NormalizationResult with normalized text and metadata
        """
        if not text:
            return NormalizationResult(
                normalized="",
                language="en",
                word_count=0,
                char_count=0,
            )

        # Step 1: Remove control characters
        normalized = self.CONTROL_CHAR_PATTERN.sub("", text)

        # Step 2: Unicode normalization (NFC - composed form)
        normalized = unicodedata.normalize("NFC", normalized)

        # Step 3: Lowercase
        normalized = normalized.lower()

        # Step 4: Normalize whitespace
        normalized = self.MULTI_WHITESPACE_PATTERN.sub(" ", normalized)
        normalized = normalized.strip()

        # Step 5: detect language on the original text, before lowercasing
        language = self.detect_language(text)

        # Calculate metrics
        word_count = len(normalized.split()) if normalized else 0
        char_count = len(normalized)

        return NormalizationResult(
            normalized=normalized,
            language=language,
            word_count=word_count,
            char_count=char_count,
        )

    def detect_language(self, text: str) -> str:
        """
        Detect the language of the text.

        Only the first 1000 characters are examined. Falls back to ``"en"``
        when the text is empty, langdetect is unavailable, or detection fails.

        Args:
            text: Text to analyze

        Returns:
            Two-letter ISO 639-1 language code (e.g., 'en', 'es', 'fr')
        """
        if not text or not self._langdetect_available:
            return "en"

        try:
            from langdetect import detect

            detected = detect(text[:1000])
            # langdetect reports regional variants such as "zh-cn"; keep only
            # the primary subtag so the result is a plain ISO 639-1 code.
            return detected.split("-")[0].lower()
        except Exception as e:  # best-effort: detection must never break the stage
            logger.debug("Language detection failed: %s", e)
            return "en"

    def generate_content_hash(self, text_normalized: str) -> str:
        """
        Generate a SHA256 hash of normalized text for deduplication.

        Args:
            text_normalized: Normalized text

        Returns:
            64-character hex string
        """
        return hashlib.sha256(text_normalized.encode("utf-8")).hexdigest()

    def has_control_characters(self, text: str) -> bool:
        """Check if text contains control characters."""
        return bool(self.CONTROL_CHAR_PATTERN.search(text))

    def extract_urls(self, text: str) -> list[str]:
        """Extract URLs from text."""
        return self.URL_PATTERN.findall(text)

    def count_emoji(self, text: str) -> int:
        """Count emoji code points in text.

        Adjacent emoji are counted individually. A flag, which is made of
        two regional-indicator code points, counts as two.
        """
        # The pattern matches runs, so sum the run lengths rather than
        # counting the number of runs.
        return sum(len(run) for run in self.EMOJI_PATTERN.findall(text))

    def is_empty_or_trivial(self, text: str | None, min_chars: int = 3) -> bool:
        """
        Check if text is empty or trivially short.

        Args:
            text: Text to check
            min_chars: Minimum meaningful character count

        Returns:
            True if text should be skipped
        """
        if not text:
            return True
        return len(text.strip()) < min_chars

    def clean_for_llm(self, text: str) -> str:
        """
        Clean text for LLM input.

        Similar to normalize but preserves case and paragraph breaks
        for better LLM understanding.

        Args:
            text: Original text

        Returns:
            Cleaned text suitable for LLM input
        """
        if not text:
            return ""

        # Remove control characters
        cleaned = self.CONTROL_CHAR_PATTERN.sub("", text)

        # Unicode normalization
        cleaned = unicodedata.normalize("NFC", cleaned)

        # Normalize whitespace but preserve newlines for paragraphs
        cleaned = re.sub(r"[^\S\n]+", " ", cleaned)  # Collapse horizontal space
        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)  # Max 2 consecutive newlines
        return cleaned.strip()
def is_valid_sha256(hash_str: str) -> bool:
    """
    Check if a string is a valid SHA256 hex hash.

    The check is strict: exactly 64 ASCII hexadecimal digits and nothing
    else. ``int(value, 16)`` is deliberately not used, because it also
    accepts a ``0x`` prefix, a leading sign, surrounding whitespace,
    underscores between digits and non-ASCII digits.

    Args:
        hash_str: Hash string to validate

    Returns:
        True if valid 64-character hex string
    """
    if not isinstance(hash_str, str):
        return False
    return re.fullmatch(r"[0-9a-fA-F]{64}", hash_str) is not None
async def process(self, input_data: Stage1Input) -> Stage1Output:
    """
    Run the normalization stage over one batch of raw reviews.

    Each review is normalized via ``_normalize_review``. Reviews that
    normalize to nothing are counted as empty, and reviews whose content
    hash was already seen earlier in this batch are counted as duplicates.
    When a review repository is configured, each kept review is persisted
    and the returned id is stored under ``raw_id``.

    Any exception is logged with the offending review id and re-raised.

    Args:
        input_data: Stage 1 input containing raw reviews

    Returns:
        Stage1Output with normalized reviews and stats
    """
    reviews = input_data["reviews"]
    logger.info(
        f"Stage 1: Processing {len(reviews)} reviews "
        f"for job {input_data['job_id']}"
    )

    stats = Stage1Stats(
        input_count=len(reviews),
        output_count=0,
        skipped_empty=0,
        skipped_duplicate=0,
    )
    kept: list[NormalizedReview] = []
    hashes_in_batch: set[str] = set()

    for raw_review in reviews:
        try:
            candidate = self._normalize_review(
                raw_review,
                input_data["business_id"],
                input_data["place_id"],
            )

            if candidate is None:
                stats["skipped_empty"] += 1
            elif candidate["content_hash"] in hashes_in_batch:
                # Same normalized text already seen in this batch
                stats["skipped_duplicate"] += 1
            else:
                hashes_in_batch.add(candidate["content_hash"])

                # Persist only when a repository was injected
                if self.review_repo:
                    candidate["raw_id"] = await self._persist_review(
                        raw_review, candidate, input_data
                    )

                kept.append(candidate)
                stats["output_count"] += 1

        except Exception as e:
            logger.error(f"Error normalizing review {raw_review.get('review_id')}: {e}")
            raise

    logger.info(
        f"Stage 1 complete: {stats['output_count']} normalized, "
        f"{stats['skipped_empty']} empty, {stats['skipped_duplicate']} duplicate"
    )

    return Stage1Output(
        job_id=input_data["job_id"],
        business_id=input_data["business_id"],
        place_id=input_data["place_id"],
        reviews_normalized=kept,
        stats=stats,
    )
+ + Args: + raw: Raw review from scraper + business_id: Business identifier + place_id: Google Place ID + + Returns: + NormalizedReview or None if should be skipped + """ + text = raw.get("text") + + # Skip empty reviews (rating-only) + if self.text_processor.is_empty_or_trivial(text): + logger.debug(f"Skipping empty review {raw['review_id']}") + return None + + # Normalize text + norm_result = self.text_processor.normalize(text) # type: ignore + + # Skip if normalized to empty + if not norm_result.normalized: + return None + + # Generate content hash + content_hash = self.text_processor.generate_content_hash(norm_result.normalized) + + return NormalizedReview( + source="google", + review_id=raw["review_id"], + review_version=1, + business_id=business_id, + place_id=place_id, + text=text, # type: ignore + text_normalized=norm_result.normalized, + text_language=norm_result.language, + text_length=norm_result.char_count, + word_count=norm_result.word_count, + rating=raw["rating"], + review_time=raw["review_time"], + author_name=raw["author_name"], + author_id=raw.get("author_id"), + content_hash=content_hash, + dedup_group_id=None, + ) + + async def _persist_review( + self, + raw: RawReview, + normalized: NormalizedReview, + input_data: Stage1Input, + ) -> int: + """ + Persist a normalized review to the database. 
def normalize_batch(
    self,
    reviews: list[RawReview],
    business_id: str,
    place_id: str,
) -> list[NormalizedReview]:
    """
    Normalize reviews in memory, with no database access.

    Reviews that ``_normalize_review`` rejects (returns ``None``) are
    dropped. Among reviews sharing a content hash, only the first one is
    kept. Output order follows input order.

    Args:
        reviews: List of raw reviews
        business_id: Business identifier
        place_id: Google Place ID

    Returns:
        List of normalized reviews (skipped reviews excluded)
    """
    # A dict keeps insertion order, and setdefault keeps the first review
    # seen for each hash.
    first_by_hash: dict[str, NormalizedReview] = {}

    for raw in reviews:
        normalized = self._normalize_review(raw, business_id, place_id)
        if normalized is not None:
            first_by_hash.setdefault(normalized["content_hash"], normalized)

    return list(first_by_hash.values())
+ +Responsibilities: +- Call LLM for span extraction and classification +- Generate embeddings +- Calculate trust scores +- Select primary span +- Write to reviews_enriched and review_spans tables +""" + +from __future__ import annotations + +import hashlib +import logging +import re +import uuid +from typing import TYPE_CHECKING, Any + +from reviewiq_pipeline.contracts import ( + ClassifiedReview, + ExtractedSpan, + ReviewToClassify, + Stage2Input, + Stage2Output, + Stage2Stats, +) +from reviewiq_pipeline.services.llm_client import LLMClient, create_fallback_response + +if TYPE_CHECKING: + from reviewiq_pipeline.config import Config + from reviewiq_pipeline.contracts import LLMClassificationResponse, LLMSpanResponse + from reviewiq_pipeline.db.connection import DatabasePool + from reviewiq_pipeline.db.repositories import ReviewRepository, SpanRepository + from reviewiq_pipeline.services.embeddings import EmbeddingService + from reviewiq_pipeline.services.llm_client import LLMClientBase + +logger = logging.getLogger(__name__) + +# URT code validation pattern +URT_CODE_PATTERN = re.compile(r"^[OPJEAVR][1-4]\.[0-9]{2}$") + +# Valence priority for primary span selection (lower = higher priority) +VALENCE_PRIORITY = {"V-": 0, "V±": 1, "V0": 2, "V+": 3} + +# Intensity priority (lower = higher priority for I3) +INTENSITY_PRIORITY = {"I3": 0, "I2": 1, "I1": 2} + + +class Stage2Classifier: + """ + Stage 2: Classify reviews using LLM and extract spans. + + This stage: + 1. Calls LLM to extract and classify spans + 2. Generates embeddings for each review + 3. Calculates trust scores + 4. Selects primary span + 5. 
async def process(self, input_data: Stage2Input) -> Stage2Output:
    """
    Process reviews through classification stage.

    Reviews are classified one at a time. A failure on one review is logged
    and counted in ``error_count`` without stopping the batch.

    A review is recorded as classified only after persistence (when
    repositories are configured) has succeeded, so a review can never be
    counted as both a success and an error. Token and cost totals are
    accumulated as soon as the LLM call returns, because that spend is
    incurred whether or not the result can be stored.

    Args:
        input_data: Stage 2 input with reviews and config

    Returns:
        Stage2Output with classified reviews and stats
    """
    batch_id = str(uuid.uuid4())[:8]
    logger.info(
        f"Stage 2: Classifying {len(input_data['reviews'])} reviews "
        f"(batch {batch_id})"
    )

    classified_reviews: list[ClassifiedReview] = []
    total_tokens = 0
    total_cost = 0.0
    total_spans = 0
    error_count = 0

    llm_client = await self._get_llm_client()

    for review in input_data["reviews"]:
        try:
            classified, metadata = await self._classify_review(
                review,
                input_data["config"]["profile"],
                llm_client,
                batch_id,
            )

            if not classified:
                continue

            # LLM usage is billed regardless of what happens next.
            total_tokens += metadata.get("total_tokens", 0)
            total_cost += metadata.get("cost_usd", 0.0)

            # Persist to database if configured. This runs before the review
            # is recorded as a success, so a persistence failure is counted
            # once, as an error.
            if self.review_repo and self.span_repo:
                await self._persist_classification(
                    classified,
                    review,
                    batch_id,
                    input_data["config"],
                )

            classified_reviews.append(classified)
            total_spans += len(classified.get("spans", []))

        except Exception as e:
            logger.error(
                f"Error classifying review {review['review_id']}: {e}",
                exc_info=True,
            )
            error_count += 1

    avg_spans = total_spans / len(classified_reviews) if classified_reviews else 0

    logger.info(
        f"Stage 2 complete: {len(classified_reviews)} classified, "
        f"{error_count} errors, {total_spans} spans total"
    )

    return Stage2Output(
        batch_id=batch_id,
        taxonomy_version=input_data["config"]["taxonomy_version"],
        model_version=self.config.llm_model,
        prompt_version="v1.0",
        reviews_classified=classified_reviews,
        stats=Stage2Stats(
            input_count=len(input_data["reviews"]),
            success_count=len(classified_reviews),
            error_count=error_count,
            total_spans=total_spans,
            avg_spans_per_review=avg_spans,
            llm_tokens_used=total_tokens,
            llm_cost_usd=total_cost,
        ),
    )
def _validate_and_fix_response(
    self,
    response: LLMClassificationResponse,
    original_text: str,
) -> LLMClassificationResponse:
    """
    Validate LLM response and fix common issues.

    The response is repaired in place:

    - ``span_index`` is rewritten to the span's position in the list.
    - ``span_start`` / ``span_end`` are coerced to integers inside
      ``[0, len(original_text)]``. An empty or inverted range is rebuilt
      from the length of ``span_text``, and falls back to the whole review
      when that still yields nothing.
    - ``urt_primary`` is always written back, defaulting to ``O1.01`` when
      missing, not a string, or not matching the URT pattern.
    - ``valence`` and ``intensity`` are reset to ``V0`` / ``I1`` when they
      are not one of the known codes.

    Args:
        response: Raw LLM response
        original_text: Original review text for offset validation

    Returns:
        Validated and fixed response (a fallback response if it had no spans)
    """
    spans = response.get("spans", [])
    if not spans:
        # Create fallback if no spans
        return create_fallback_response(original_text)

    text_len = len(original_text)
    fixed_spans = []

    for i, span in enumerate(spans):
        # Ensure required fields
        span["span_index"] = i

        # Offsets may be missing, JSON null, or a non-integer type.
        start = span.get("span_start")
        end = span.get("span_end")
        if not isinstance(start, int) or isinstance(start, bool):
            start = 0
        if not isinstance(end, int) or isinstance(end, bool):
            end = text_len

        start = min(max(start, 0), text_len)
        end = min(end, text_len)

        if end <= start:
            # Rebuild the end from the span text. The addition is done before
            # any fallback, so a non-zero start cannot mask an empty span.
            end = min(start + len(span.get("span_text") or ""), text_len)
            if end <= start:
                # Nothing usable: cover the whole review
                start, end = 0, text_len

        span["span_start"] = start
        span["span_end"] = end

        # Validate URT code, and always store it so later stages can index it.
        urt_primary = span.get("urt_primary", "O1.01")
        if not isinstance(urt_primary, str) or not URT_CODE_PATTERN.match(urt_primary):
            logger.warning(f"Invalid URT code '{urt_primary}', defaulting to O1.01")
            urt_primary = "O1.01"
        span["urt_primary"] = urt_primary

        # Ensure valid enums
        if span.get("valence") not in ("V+", "V-", "V0", "V±"):
            span["valence"] = "V0"
        if span.get("intensity") not in ("I1", "I2", "I3"):
            span["intensity"] = "I1"

        fixed_spans.append(span)

    response["spans"] = fixed_spans
    return response
+ + Args: + llm_spans: Spans from LLM response + review: Source review + profile: Classification profile + batch_id: Batch identifier + + Returns: + List of ExtractedSpan objects + """ + spans = [] + + for llm_span in llm_spans: + # Generate deterministic span ID + span_key = f"{review['review_id']}:{llm_span['span_index']}:{llm_span.get('span_text', '')[:50]}" + span_hash = hashlib.sha256(span_key.encode()).hexdigest()[:16] + span_id = f"SPN-{span_hash}" + + span = ExtractedSpan( + span_id=span_id, + span_index=llm_span["span_index"], + span_text=llm_span.get("span_text", ""), + span_start=llm_span.get("span_start", 0), + span_end=llm_span.get("span_end", 0), + profile=profile, # type: ignore + urt_primary=llm_span["urt_primary"], + urt_secondary=llm_span.get("urt_secondary", []), + valence=llm_span["valence"], + intensity=llm_span["intensity"], + comparative=llm_span.get("comparative", "CR-N"), + specificity=llm_span.get("specificity"), + actionability=llm_span.get("actionability"), + temporal=llm_span.get("temporal"), + evidence=llm_span.get("evidence"), + entity=llm_span.get("entity"), + entity_type=llm_span.get("entity_type"), + entity_normalized=llm_span.get("entity", "").lower() if llm_span.get("entity") else None, + relation_type=llm_span.get("relation_type"), + related_span_index=llm_span.get("related_span_index"), + confidence=llm_span.get("confidence", "medium"), + usn=llm_span.get("usn", self._generate_usn(llm_span)), + is_primary=llm_span.get("is_primary", False), + ) + spans.append(span) + + return spans + + def _ensure_primary_span(self, spans: list[ExtractedSpan]) -> list[ExtractedSpan]: + """ + Ensure exactly one span is marked as primary. + + Uses selection rules: + 1. Highest intensity (I3 > I2 > I1) + 2. Tie-break: negative over positive (V- > V± > V0 > V+) + 3. 
Tie-break: earliest span_index + + Args: + spans: List of spans + + Returns: + List of spans with exactly one primary + """ + if not spans: + return spans + + # Count current primaries + primary_count = sum(1 for s in spans if s.get("is_primary")) + + if primary_count == 1: + return spans + + # Clear all primaries and re-select + for span in spans: + span["is_primary"] = False + + # Sort by selection criteria + def sort_key(s: ExtractedSpan) -> tuple[int, int, int]: + return ( + INTENSITY_PRIORITY.get(s["intensity"], 2), + VALENCE_PRIORITY.get(s["valence"], 3), + s["span_index"], + ) + + sorted_spans = sorted(spans, key=sort_key) + sorted_spans[0]["is_primary"] = True + + return spans + + def _calculate_trust_score( + self, + review: ReviewToClassify, + spans: list[ExtractedSpan], + ) -> float: + """ + Calculate trust score for a review. + + Factors: + - Text length (longer = more trust) + - Specificity of spans + - Confidence levels + + Args: + review: Source review + spans: Classified spans + + Returns: + Trust score between 0.2 and 1.0 + """ + score = 0.5 # Base score + + # Length factor (up to +0.2) + text_len = len(review["text"]) + if text_len > 200: + score += 0.2 + elif text_len > 100: + score += 0.1 + elif text_len > 50: + score += 0.05 + + # Specificity factor (up to +0.2) + if spans: + high_spec_count = sum(1 for s in spans if s.get("specificity") == "S3") + if high_spec_count > 0: + score += 0.1 + (0.1 * min(high_spec_count / len(spans), 1.0)) + + # Confidence factor (up to +0.1) + if spans: + high_conf_count = sum(1 for s in spans if s.get("confidence") == "high") + score += 0.1 * (high_conf_count / len(spans)) + + # Ensure floor of 0.2 and ceiling of 1.0 + return max(self.config.trust_score_floor, min(1.0, score)) + + def _extract_staff_mentions(self, spans: list[ExtractedSpan]) -> list[str]: + """Extract staff names from spans.""" + staff = [] + for span in spans: + if span.get("entity_type") == "staff" and span.get("entity"): + 
staff.append(span["entity"]) + return list(set(staff)) + + def _extract_quotes(self, spans: list[ExtractedSpan]) -> dict[str, str]: + """Extract representative quotes by URT code.""" + quotes = {} + for span in spans: + code = span["urt_primary"] + if code not in quotes: + quotes[code] = span["span_text"][:100] + return quotes + + def _generate_usn(self, span: LLMSpanResponse) -> str: + """ + Generate USN (URT String Notation) for a span. + + Format: URT:S:{primary}[+{sec}]:{valence_sign}{intensity_num}:{S#}{A#}{temporal}.{evidence}.{CR_suffix} + """ + primary = span.get("urt_primary", "O1.01") + secondary = span.get("urt_secondary", []) + valence = span.get("valence", "V0") + intensity = span.get("intensity", "I1") + specificity = span.get("specificity", "S1") + actionability = span.get("actionability", "A1") + temporal = span.get("temporal", "TC") + evidence = span.get("evidence", "ES") + comparative = span.get("comparative", "CR-N") + + # Build code portion + code_part = primary + for sec in secondary[:2]: + code_part += f"+{sec}" + + # Valence encoding + valence_map = {"V+": "+", "V-": "-", "V0": "0", "V±": "±"} + valence_sign = valence_map.get(valence, "0") + + # Intensity number + intensity_num = intensity[1] if intensity.startswith("I") else "1" + + # Dimensions + spec_num = specificity[1] if specificity and specificity.startswith("S") else "1" + act_num = actionability[1] if actionability and actionability.startswith("A") else "1" + + # CR suffix + cr_map = {"CR-N": "N", "CR-B": "B", "CR-W": "W", "CR-S": "S"} + cr_suffix = cr_map.get(comparative, "N") + + return f"URT:S:{code_part}:{valence_sign}{intensity_num}:{spec_num}{act_num}{temporal}.{evidence}.{cr_suffix}" + + async def _persist_classification( + self, + classified: ClassifiedReview, + review: ReviewToClassify, + batch_id: str, + config: dict[str, Any], + ) -> None: + """Persist classification results to database.""" + if not self.review_repo or not self.span_repo: + return + + # Update 
reviews_enriched + await self.review_repo.update_enriched_with_classification( + classified, + self.config.llm_model, + config["taxonomy_version"], + ) + + # Insert spans + for span in classified.get("spans", []): + await self.span_repo.insert_span( + span, + review["business_id"], + review["place_id"], + review["source"], + review["review_id"], + review["review_version"], + review["review_time"], + batch_id, + self.config.llm_model, + config["taxonomy_version"], + ) diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage3_route.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage3_route.py new file mode 100644 index 0000000..3d16c7b --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage3_route.py @@ -0,0 +1,274 @@ +""" +Stage 3: Issue Routing + +Route classified spans to issues (create new or aggregate to existing). + +Responsibilities: +- Query unrouted V-/V± spans +- Generate deterministic issue IDs +- Create/update issues with span counts +- Insert issue_spans links +- Log events for audit trail +""" + +from __future__ import annotations + +import hashlib +import logging +from typing import TYPE_CHECKING + +from reviewiq_pipeline.contracts import ( + RoutedSpan, + SpanToRoute, + Stage3Input, + Stage3Output, + Stage3Stats, +) + +if TYPE_CHECKING: + from reviewiq_pipeline.config import Config + from reviewiq_pipeline.db.connection import DatabasePool + from reviewiq_pipeline.db.repositories import IssueRepository, SpanRepository + +logger = logging.getLogger(__name__) + + +class Stage3Router: + """ + Stage 3: Route negative/mixed spans to issues. + + This stage: + 1. Queries unrouted spans with V- or V± valence + 2. Generates deterministic issue IDs from routing keys + 3. Creates new issues or updates existing ones + 4. Links spans to issues (1:1 mapping) + 5. 
async def process(self, input_data: Stage3Input) -> Stage3Output:
    """
    Process spans through routing stage.

    Only spans with V- or V± valence are routed. Every other span is
    counted as skipped, and so is any span for which ``_route_span``
    returns ``None``. The reported counts therefore satisfy
    ``spans_processed == spans_routed + spans_skipped``.

    Args:
        input_data: Stage 3 input with spans to route

    Returns:
        Stage3Output with routing results and stats

    Raises:
        Exception: Any error raised while routing a span is logged with the
            span ID and re-raised unchanged.
    """
    spans = input_data["spans"]
    logger.info("Stage 3: Routing %d spans", len(spans))

    routed_spans: list[RoutedSpan] = []
    issues_created: list[str] = []
    issues_updated: list[str] = []
    spans_skipped = 0

    for span in spans:
        try:
            # Skip positive/neutral spans
            if span["valence"] not in ("V-", "V±"):
                spans_skipped += 1
                continue

            routed = await self._route_span(span)
        except Exception as e:
            logger.error("Error routing span %s: %s", span["span_id"], e)
            raise

        if not routed:
            # _route_span declined to route this span; count it as skipped
            # so it does not vanish from the stats.
            spans_skipped += 1
            continue

        routed_spans.append(routed)
        issue_id = routed["issue_id"]
        if routed["is_new_issue"]:
            issues_created.append(issue_id)
        elif issue_id not in issues_updated:
            # List (not set) keeps first-seen order in the output.
            issues_updated.append(issue_id)

    logger.info(
        "Stage 3 complete: %d routed, %d issues created, %d issues updated",
        len(routed_spans),
        len(issues_created),
        len(issues_updated),
    )

    return Stage3Output(
        routed_spans=routed_spans,
        issues_created=issues_created,
        issues_updated=issues_updated,
        stats=Stage3Stats(
            spans_processed=len(spans),
            spans_routed=len(routed_spans),
            spans_skipped=spans_skipped,
            issues_created=len(issues_created),
            issues_updated=len(issues_updated),
        ),
    )
+ + Args: + span: Span to route + + Returns: + RoutedSpan with routing info, or None if skipped + """ + # Generate routing key and issue ID + routing_key = self._generate_routing_key(span) + issue_id = self._generate_issue_id(routing_key) + + # Check if span already routed (should not happen, but defensive) + if self.issue_repo: + existing_issue = await self.issue_repo.check_span_already_linked(span["span_id"]) + if existing_issue: + logger.warning( + f"Span {span['span_id']} already linked to {existing_issue}" + ) + return None + + # Create or update issue + is_new_issue = True + if self.issue_repo: + is_new_issue = await self.issue_repo.upsert_issue( + issue_id=issue_id, + business_id=span["business_id"], + place_id=span["place_id"], + primary_subcode=span["urt_primary"], + intensity=span["intensity"], + entity=span.get("entity_normalized"), + entity_normalized=span.get("entity_normalized"), + taxonomy_version=self.config.taxonomy_version, + ) + + routed = RoutedSpan( + span_id=span["span_id"], + issue_id=issue_id, + routing_key=routing_key, + is_new_issue=is_new_issue, + ) + + # Link span to issue + if self.issue_repo: + await self.issue_repo.link_span_to_issue( + routed=routed, + source="google", # Assuming Google source + review_id="", # Would need to be passed from span metadata + review_version=1, + intensity=span["intensity"], + review_time=span["review_time"], + is_primary_match=True, + ) + + # Log event + event_type = "issue_created" if is_new_issue else "span_added" + await self.issue_repo.log_event( + issue_id=issue_id, + event_type=event_type, + span_id=span["span_id"], + metadata={ + "urt_primary": span["urt_primary"], + "valence": span["valence"], + "intensity": span["intensity"], + }, + ) + + return routed + + def _generate_routing_key(self, span: SpanToRoute) -> str: + """ + Generate routing key for a span. 
+ + Format: business_id|place_id|urt_primary|entity_normalized + + Args: + span: Span to generate key for + + Returns: + Routing key string + """ + entity = span.get("entity_normalized") or "" + return f"{span['business_id']}|{span['place_id']}|{span['urt_primary']}|{entity}" + + def _generate_issue_id(self, routing_key: str) -> str: + """ + Generate deterministic issue ID from routing key. + + Args: + routing_key: Routing key string + + Returns: + Issue ID in format ISS-{hash16} + """ + hash_value = hashlib.sha256(routing_key.encode()).hexdigest() + return f"ISS-{hash_value[:16]}" + + async def process_from_db(self, limit: int = 100) -> Stage3Output: + """ + Process unrouted spans directly from database. + + Convenience method that queries unrouted spans and processes them. + + Args: + limit: Maximum number of spans to process + + Returns: + Stage3Output with routing results + """ + if not self.span_repo: + raise RuntimeError("SpanRepository not configured") + + # Query unrouted negative spans + span_rows = await self.span_repo.get_unrouted_negative_spans(limit) + + # Convert to SpanToRoute format + spans = [ + SpanToRoute( + span_id=row["span_id"], + business_id=row["business_id"], + place_id=row["place_id"], + urt_primary=row["urt_primary"], + valence=row["valence"], + intensity=row["intensity"], + entity_normalized=row.get("entity_normalized"), + review_time=str(row["review_time"]), + confidence=row["confidence"], + trust_score=row.get("trust_score", 0.5), + ) + for row in span_rows + ] + + return await self.process(Stage3Input(spans=spans)) + + def route_span_sync(self, span: SpanToRoute) -> RoutedSpan: + """ + Route a span without database operations (for testing). 
+ + Args: + span: Span to route + + Returns: + RoutedSpan with routing info + """ + if span["valence"] not in ("V-", "V±"): + raise ValueError(f"Cannot route positive span (valence={span['valence']})") + + routing_key = self._generate_routing_key(span) + issue_id = self._generate_issue_id(routing_key) + + return RoutedSpan( + span_id=span["span_id"], + issue_id=issue_id, + routing_key=routing_key, + is_new_issue=True, # Can't know without DB + ) diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_aggregate.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_aggregate.py new file mode 100644 index 0000000..1bee8f0 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_aggregate.py @@ -0,0 +1,485 @@ +""" +Stage 4: Fact Aggregation + +Pre-aggregate span/review data into fact_timeseries for fast dashboard queries. + +Responsibilities: +- Aggregate spans by URT code per time bucket +- Calculate valence/intensity distributions +- Compute strength scores (trust-weighted) +- UPSERT into fact_timeseries table +""" + +from __future__ import annotations + +import logging +from collections import defaultdict +from datetime import date, datetime, timedelta +from typing import TYPE_CHECKING, Any + +from reviewiq_pipeline.contracts import ( + FactRecord, + Stage4Input, + Stage4Output, + Stage4Stats, +) + +if TYPE_CHECKING: + from reviewiq_pipeline.config import Config + from reviewiq_pipeline.db.connection import DatabasePool + from reviewiq_pipeline.db.repositories import FactRepository + +logger = logging.getLogger(__name__) + + +class Stage4Aggregator: + """ + Stage 4: Aggregate span data into time series facts. + + This stage: + 1. Queries span data for a business/date range + 2. Aggregates by URT code and time bucket + 3. Calculates valence/intensity distributions + 4. Computes trust-weighted strength scores + 5. 
async def process(self, input_data: Stage4Input) -> Stage4Output:
    """
    Process aggregation for a business and date.

    For each requested bucket type, and for each place plus the synthetic
    place "ALL", this builds per-URT-code, per-domain and overall facts.
    Each fact is recorded and, when a fact repository is configured,
    upserted. Places with no span data in a bucket produce no facts.

    Args:
        input_data: Stage 4 input with aggregation parameters

    Returns:
        Stage4Output with aggregated facts and stats.
        ``locations_processed`` counts distinct places that had data,
        regardless of how many bucket types were aggregated.

    Raises:
        ValueError: If ``date`` is not in YYYY-MM-DD format or a bucket
            type is not recognised by ``_get_bucket_range``.
    """
    business_id = input_data["business_id"]
    taxonomy_version = input_data["taxonomy_version"]

    logger.info(
        "Stage 4: Aggregating for business %s on %s",
        business_id,
        input_data["date"],
    )

    facts_written: list[FactRecord] = []
    codes_aggregated: set[str] = set()
    # A set, so a place seen under several bucket types is counted once.
    places_with_data: set[str] = set()

    target_date = datetime.strptime(input_data["date"], "%Y-%m-%d").date()

    # The place list does not depend on the bucket, so fetch it once.
    place_ids: list[str] = []
    if self.fact_repo:
        place_ids = list(
            await self.fact_repo.get_place_ids_for_business(business_id)
        )

    for bucket_type in input_data["bucket_types"]:
        start_date, end_date = self._get_bucket_range(target_date, bucket_type)
        period_date = self._get_period_date(target_date, bucket_type)

        # Get aggregation data from database (nothing to aggregate without one)
        span_data: list[dict[str, Any]] = []
        if self.fact_repo:
            span_data = await self.fact_repo.get_aggregation_data(
                business_id,
                start_date,
                end_date,
            )

        # "ALL" is the cross-location roll-up and sees every span.
        for place_id in [*place_ids, "ALL"]:
            place_data = [
                s for s in span_data
                if place_id == "ALL" or s["place_id"] == place_id
            ]
            if not place_data:
                continue

            if place_id != "ALL":
                places_with_data.add(place_id)

            group_args = (
                place_data,
                business_id,
                place_id,
                period_date,
                bucket_type,
                taxonomy_version,
            )

            code_facts = self._aggregate_by_code(*group_args)
            codes_aggregated.update(fact["subject_id"] for fact in code_facts)

            # Same write order as before: codes, then domains, then overall.
            facts = [
                *code_facts,
                *self._aggregate_by_domain(*group_args),
                self._aggregate_overall(*group_args),
            ]
            for fact in facts:
                facts_written.append(fact)
                if self.fact_repo:
                    await self.fact_repo.upsert_fact(fact)

    logger.info(
        "Stage 4 complete: %d facts written, %d unique codes",
        len(facts_written),
        len(codes_aggregated),
    )

    return Stage4Output(
        facts_written=facts_written,
        stats=Stage4Stats(
            business_id=business_id,
            date=input_data["date"],
            locations_processed=len(places_with_data),
            codes_aggregated=len(codes_aggregated),
            facts_upserted=len(facts_written),
        ),
    )
start = target_date - timedelta(days=target_date.weekday()) + return start.isoformat() + elif bucket_type == "month": + return target_date.replace(day=1).isoformat() + else: + return target_date.isoformat() + + def _aggregate_by_code( + self, + span_data: list[dict[str, Any]], + business_id: str, + place_id: str, + period_date: str, + bucket_type: str, + taxonomy_version: str, + ) -> list[FactRecord]: + """Aggregate spans by URT code.""" + code_groups: dict[str, list[dict]] = defaultdict(list) + + for span in span_data: + code_groups[span["urt_primary"]].append(span) + + facts = [] + for code, spans in code_groups.items(): + fact = self._compute_fact_metrics( + spans, + business_id, + place_id, + period_date, + bucket_type, + "urt_code", + code, + taxonomy_version, + ) + facts.append(fact) + + return facts + + def _aggregate_by_domain( + self, + span_data: list[dict[str, Any]], + business_id: str, + place_id: str, + period_date: str, + bucket_type: str, + taxonomy_version: str, + ) -> list[FactRecord]: + """Aggregate spans by domain (first letter of URT code).""" + domain_groups: dict[str, list[dict]] = defaultdict(list) + + for span in span_data: + domain = span["urt_primary"][0] # First letter + domain_groups[domain].append(span) + + facts = [] + for domain, spans in domain_groups.items(): + fact = self._compute_fact_metrics( + spans, + business_id, + place_id, + period_date, + bucket_type, + "domain", + domain, + taxonomy_version, + ) + facts.append(fact) + + return facts + + def _aggregate_overall( + self, + span_data: list[dict[str, Any]], + business_id: str, + place_id: str, + period_date: str, + bucket_type: str, + taxonomy_version: str, + ) -> FactRecord: + """Aggregate all spans for overall metrics.""" + return self._compute_fact_metrics( + span_data, + business_id, + place_id, + period_date, + bucket_type, + "overall", + "all", + taxonomy_version, + ) + + def _compute_fact_metrics( + self, + spans: list[dict[str, Any]], + business_id: str, + place_id: 
str, + period_date: str, + bucket_type: str, + subject_type: str, + subject_id: str, + taxonomy_version: str, + ) -> FactRecord: + """ + Compute aggregated metrics for a group of spans. + + Args: + spans: List of span data + business_id: Business identifier + place_id: Place ID or 'ALL' + period_date: Period date string + bucket_type: day/week/month + subject_type: overall/urt_code/domain/issue + subject_id: Subject identifier + taxonomy_version: Taxonomy version + + Returns: + FactRecord with computed metrics + """ + if not spans: + return self._empty_fact( + business_id, place_id, period_date, bucket_type, + subject_type, subject_id, taxonomy_version, + ) + + # Count unique reviews + review_ids = set() + for span in spans: + # Assuming span has review_id in metadata + review_id = span.get("review_id", span.get("span_id", "")) + review_ids.add(review_id) + + span_count = len(spans) + review_count = len(review_ids) if review_ids else span_count + + # Valence counts + negative_count = sum(1 for s in spans if s["valence"] == "V-") + positive_count = sum(1 for s in spans if s["valence"] == "V+") + neutral_count = sum(1 for s in spans if s["valence"] == "V0") + mixed_count = sum(1 for s in spans if s["valence"] == "V±") + + # Intensity counts + i1_count = sum(1 for s in spans if s["intensity"] == "I1") + i2_count = sum(1 for s in spans if s["intensity"] == "I2") + i3_count = sum(1 for s in spans if s["intensity"] == "I3") + + # Comparative counts + cr_better = sum(1 for s in spans if s.get("comparative") == "CR-B") + cr_worse = sum(1 for s in spans if s.get("comparative") == "CR-W") + cr_same = sum(1 for s in spans if s.get("comparative") == "CR-S") + + # Calculate strength scores + strength_score = self._compute_strength_score(spans) + negative_strength = self._compute_strength_score( + [s for s in spans if s["valence"] in ("V-", "V±")] + ) + positive_strength = self._compute_strength_score( + [s for s in spans if s["valence"] == "V+"] + ) + + # Trust-weighted scores 
+ trust_weighted_strength = self._compute_trust_weighted_strength(spans) + trust_weighted_negative = self._compute_trust_weighted_strength( + [s for s in spans if s["valence"] in ("V-", "V±")] + ) + + # Average rating + ratings = [s["rating"] for s in spans if s.get("rating")] + avg_rating = sum(ratings) / len(ratings) if ratings else None + + return FactRecord( + business_id=business_id, + place_id=place_id, + period_date=period_date, + bucket_type=bucket_type, + subject_type=subject_type, # type: ignore + subject_id=subject_id, + taxonomy_version=taxonomy_version, + review_count=review_count, + span_count=span_count, + negative_count=negative_count, + positive_count=positive_count, + neutral_count=neutral_count, + mixed_count=mixed_count, + strength_score=strength_score, + negative_strength=negative_strength, + positive_strength=positive_strength, + avg_rating=avg_rating, + i1_count=i1_count, + i2_count=i2_count, + i3_count=i3_count, + cr_better=cr_better, + cr_worse=cr_worse, + cr_same=cr_same, + trust_weighted_strength=trust_weighted_strength, + trust_weighted_negative=trust_weighted_negative, + ) + + def _compute_strength_score(self, spans: list[dict[str, Any]]) -> float: + """ + Compute strength score from intensity distribution. + + Score: sum of (intensity_weight * valence_multiplier) + I1=1, I2=2, I3=4 (exponential) + V-=1, V±=0.5, V0=0, V+=1 + """ + if not spans: + return 0.0 + + intensity_weights = {"I1": 1, "I2": 2, "I3": 4} + valence_multipliers = {"V-": 1.0, "V±": 0.5, "V0": 0.0, "V+": 1.0} + + total = 0.0 + for span in spans: + intensity = span.get("intensity", "I1") + valence = span.get("valence", "V0") + weight = intensity_weights.get(intensity, 1) + multiplier = valence_multipliers.get(valence, 0) + total += weight * multiplier + + return total + + def _compute_trust_weighted_strength(self, spans: list[dict[str, Any]]) -> float: + """ + Compute trust-weighted strength score. + + Similar to strength score but weighted by trust_score. 
+ """ + if not spans: + return 0.0 + + intensity_weights = {"I1": 1, "I2": 2, "I3": 4} + valence_multipliers = {"V-": 1.0, "V±": 0.5, "V0": 0.0, "V+": 1.0} + + total = 0.0 + for span in spans: + intensity = span.get("intensity", "I1") + valence = span.get("valence", "V0") + trust = span.get("trust_score", 0.5) + + weight = intensity_weights.get(intensity, 1) + multiplier = valence_multipliers.get(valence, 0) + total += weight * multiplier * trust + + return total + + def _empty_fact( + self, + business_id: str, + place_id: str, + period_date: str, + bucket_type: str, + subject_type: str, + subject_id: str, + taxonomy_version: str, + ) -> FactRecord: + """Create an empty fact record with zero counts.""" + return FactRecord( + business_id=business_id, + place_id=place_id, + period_date=period_date, + bucket_type=bucket_type, + subject_type=subject_type, # type: ignore + subject_id=subject_id, + taxonomy_version=taxonomy_version, + review_count=0, + span_count=0, + negative_count=0, + positive_count=0, + neutral_count=0, + mixed_count=0, + strength_score=0.0, + negative_strength=0.0, + positive_strength=0.0, + avg_rating=None, + i1_count=0, + i2_count=0, + i3_count=0, + cr_better=0, + cr_worse=0, + cr_same=0, + trust_weighted_strength=0.0, + trust_weighted_negative=0.0, + ) diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/__init__.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/__init__.py new file mode 100644 index 0000000..fa14508 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/__init__.py @@ -0,0 +1,23 @@ +"""Validation rules for pipeline stages.""" + +from reviewiq_pipeline.validation.validators import ( + Stage1Validator, + Stage2Validator, + Stage3Validator, + Stage4Validator, + validate_stage1_output, + validate_stage2_output, + validate_stage3_output, + validate_stage4_output, +) + +__all__ = [ + "Stage1Validator", + "Stage2Validator", + "Stage3Validator", + "Stage4Validator", + 
"validate_stage1_output", + "validate_stage2_output", + "validate_stage3_output", + "validate_stage4_output", +] diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/validators.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/validators.py new file mode 100644 index 0000000..05e23a6 --- /dev/null +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/validation/validators.py @@ -0,0 +1,506 @@ +""" +Validation rules for pipeline stages. + +Implements validation rules V1.x, V2.x, V3.x, V4.x from the contracts. +""" + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING, Any, Callable + +from reviewiq_pipeline.contracts import ( + ValidationError, + ValidationResult, +) +from reviewiq_pipeline.services.text_processor import is_valid_iso639, is_valid_sha256 + +if TYPE_CHECKING: + from reviewiq_pipeline.contracts import ( + FactRecord, + NormalizedReview, + RoutedSpan, + Stage1Output, + Stage2Output, + Stage3Output, + Stage4Output, + ) + from reviewiq_pipeline.db.connection import DatabasePool + +# URT code pattern +URT_CODE_PATTERN = re.compile(r"^[OPJEAVR][1-4]\.[0-9]{2}$") + +# Issue ID pattern +ISSUE_ID_PATTERN = re.compile(r"^ISS-[a-f0-9]{16}$") + +# Valid enum values +VALID_VALENCES = {"V+", "V-", "V0", "V±"} +VALID_INTENSITIES = {"I1", "I2", "I3"} +VALID_SPECIFICITIES = {"S1", "S2", "S3"} +VALID_ACTIONABILITIES = {"A1", "A2", "A3"} +VALID_TEMPORALS = {"TC", "TR", "TH", "TF"} +VALID_EVIDENCES = {"ES", "EI", "EC"} +VALID_COMPARATIVES = {"CR-N", "CR-B", "CR-W", "CR-S"} + + +def _has_control_chars(text: str) -> bool: + """Check if text contains control characters.""" + return bool(re.search(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", text)) + + +class Stage1Validator: + """Validator for Stage 1 output.""" + + def validate(self, output: Stage1Output) -> ValidationResult: + """ + Validate Stage 1 output. 
+ + Rules: + - V1.1: text is non-empty string + - V1.2: text_normalized contains no control chars + - V1.3: content_hash is 64-char hex + - V1.4: review_version >= 1 + - V1.5: text_language is valid ISO 639-1 + - V1.6: raw_id references valid reviews_raw row (requires DB) + """ + errors: list[ValidationError] = [] + + for review in output["reviews_normalized"]: + review_id = review["review_id"] + + # V1.1: Non-empty text + if not review.get("text") or not review["text"].strip(): + errors.append(ValidationError( + rule="V1.1", + identifier=review_id, + message="Empty text", + )) + + # V1.2: No control characters in normalized text + if review.get("text_normalized") and _has_control_chars(review["text_normalized"]): + errors.append(ValidationError( + rule="V1.2", + identifier=review_id, + message="Control chars in normalized text", + )) + + # V1.3: Valid content hash + if not is_valid_sha256(review.get("content_hash", "")): + errors.append(ValidationError( + rule="V1.3", + identifier=review_id, + message=f"Invalid content hash: {review.get('content_hash', '')[:20]}...", + )) + + # V1.4: Version >= 1 + if review.get("review_version", 0) < 1: + errors.append(ValidationError( + rule="V1.4", + identifier=review_id, + message=f"Invalid version: {review.get('review_version')}", + )) + + # V1.5: Valid language code + if not is_valid_iso639(review.get("text_language", "")): + errors.append(ValidationError( + rule="V1.5", + identifier=review_id, + message=f"Invalid language: {review.get('text_language')}", + )) + + return ValidationResult( + stage="stage1", + passed=len(errors) == 0, + error_count=len(errors), + errors=errors, + ) + + +class Stage2Validator: + """Validator for Stage 2 output.""" + + def validate( + self, + output: Stage2Output, + input_reviews: dict[tuple[str, str, int], dict[str, Any]] | None = None, + ) -> ValidationResult: + """ + Validate Stage 2 output. 
+ + Rules: + - V2.1: urt_primary matches pattern + - V2.2: urt_secondary has max 2 elements + - V2.3: valence is valid enum + - V2.4: intensity is valid enum + - V2.5: span_end > span_start + - V2.6: span_text matches text[span_start:span_end] + - V2.7: spans do not overlap + - V2.8: exactly one is_primary per review + - V2.9: trust_score between 0.2 and 1.0 + - V2.10: embedding is 384-dim array + - V2.11: usn matches profile-specific regex + - V2.12: related_span_index references valid span + + Args: + output: Stage 2 output to validate + input_reviews: Optional dict mapping (source, review_id, version) -> review data + """ + errors: list[ValidationError] = [] + + for review in output["reviews_classified"]: + review_id = review["review_id"] + + # V2.1: Valid URT code + if not URT_CODE_PATTERN.match(review.get("urt_primary", "")): + errors.append(ValidationError( + rule="V2.1", + identifier=review_id, + message=f"Invalid URT code: {review.get('urt_primary')}", + )) + + # V2.2: Max 2 secondary codes + if len(review.get("urt_secondary", [])) > 2: + errors.append(ValidationError( + rule="V2.2", + identifier=review_id, + message=f"Too many secondary codes: {len(review.get('urt_secondary', []))}", + )) + + # V2.3: Valid valence + if review.get("valence") not in VALID_VALENCES: + errors.append(ValidationError( + rule="V2.3", + identifier=review_id, + message=f"Invalid valence: {review.get('valence')}", + )) + + # V2.4: Valid intensity + if review.get("intensity") not in VALID_INTENSITIES: + errors.append(ValidationError( + rule="V2.4", + identifier=review_id, + message=f"Invalid intensity: {review.get('intensity')}", + )) + + # V2.9: Trust score bounds + trust = review.get("trust_score", 0) + if not (0.2 <= trust <= 1.0): + errors.append(ValidationError( + rule="V2.9", + identifier=review_id, + message=f"Trust score out of bounds: {trust}", + )) + + # V2.10: Embedding dimension + embedding = review.get("embedding", []) + if embedding and len(embedding) != 384: + 
errors.append(ValidationError( + rule="V2.10", + identifier=review_id, + message=f"Invalid embedding dimension: {len(embedding)}", + )) + + # Validate spans + spans = review.get("spans", []) + primary_count = 0 + span_ranges: list[tuple[int, int]] = [] + + # Get original text if available + original_text = "" + if input_reviews: + key = (review["source"], review["review_id"], review["review_version"]) + original_text = input_reviews.get(key, {}).get("text", "") + + for span in spans: + span_id = span.get("span_id", f"{review_id}:span") + + # V2.5: Valid bounds + start = span.get("span_start", 0) + end = span.get("span_end", 0) + if end <= start: + errors.append(ValidationError( + rule="V2.5", + identifier=span_id, + message=f"Invalid bounds: {start}:{end}", + )) + + # V2.6: Text matches (if we have original) + if original_text and span.get("span_text"): + expected = original_text[start:end] + # Allow whitespace normalization + expected_norm = " ".join(expected.split()) + actual_norm = " ".join(span["span_text"].split()) + if expected_norm != actual_norm: + errors.append(ValidationError( + rule="V2.6", + identifier=span_id, + message=f"Text mismatch at {start}:{end}", + )) + + # V2.7: Check overlap + for prev_start, prev_end in span_ranges: + if not (end <= prev_start or start >= prev_end): + errors.append(ValidationError( + rule="V2.7", + identifier=span_id, + message="Overlapping span", + )) + break + span_ranges.append((start, end)) + + # V2.8: Count primaries + if span.get("is_primary"): + primary_count += 1 + + # V2.12: Valid related_span_index + related_idx = span.get("related_span_index") + if related_idx is not None: + if related_idx < 0 or related_idx >= len(spans): + errors.append(ValidationError( + rule="V2.12", + identifier=span_id, + message=f"Invalid related_span_index: {related_idx}", + )) + elif related_idx == span.get("span_index"): + errors.append(ValidationError( + rule="V2.12", + identifier=span_id, + message="Self-referencing span", + )) + + # 
V2.8: Exactly one primary + if primary_count != 1: + errors.append(ValidationError( + rule="V2.8", + identifier=review_id, + message=f"Primary span count: {primary_count}", + )) + + return ValidationResult( + stage="stage2", + passed=len(errors) == 0, + error_count=len(errors), + errors=errors, + ) + + +class Stage3Validator: + """Validator for Stage 3 output.""" + + def __init__(self, db: DatabasePool | None = None): + self.db = db + + async def validate(self, output: Stage3Output) -> ValidationResult: + """ + Validate Stage 3 output. + + Rules: + - V3.1: issue_id matches pattern + - V3.2: routing_key is non-empty + - V3.3: span not already linked to different issue + - V3.4: issue exists in issues table + - V3.5: only V-/V± spans create issues + """ + errors: list[ValidationError] = [] + + for routed in output["routed_spans"]: + span_id = routed["span_id"] + + # V3.1: Valid issue ID format + if not ISSUE_ID_PATTERN.match(routed.get("issue_id", "")): + errors.append(ValidationError( + rule="V3.1", + identifier=span_id, + message=f"Invalid issue_id: {routed.get('issue_id')}", + )) + + # V3.2: Non-empty routing key + if not routed.get("routing_key"): + errors.append(ValidationError( + rule="V3.2", + identifier=span_id, + message="Empty routing key", + )) + + # V3.3, V3.4: Require database for these checks + if self.db: + # V3.3: Check no duplicate routing + existing = await self.db.fetchval( + "SELECT issue_id FROM issue_spans WHERE span_id = $1", + span_id, + ) + if existing and existing != routed["issue_id"]: + errors.append(ValidationError( + rule="V3.3", + identifier=span_id, + message=f"Already routed to {existing}", + )) + + # V3.4: Issue exists + issue_exists = await self.db.fetchval( + "SELECT 1 FROM issues WHERE issue_id = $1", + routed["issue_id"], + ) + if not issue_exists: + errors.append(ValidationError( + rule="V3.4", + identifier=span_id, + message=f"Issue not found: {routed['issue_id']}", + )) + + return ValidationResult( + stage="stage3", + 
passed=len(errors) == 0, + error_count=len(errors), + errors=errors, + ) + + def validate_sync(self, output: Stage3Output) -> ValidationResult: + """Synchronous validation without database checks.""" + errors: list[ValidationError] = [] + + for routed in output["routed_spans"]: + span_id = routed["span_id"] + + # V3.1: Valid issue ID format + if not ISSUE_ID_PATTERN.match(routed.get("issue_id", "")): + errors.append(ValidationError( + rule="V3.1", + identifier=span_id, + message=f"Invalid issue_id: {routed.get('issue_id')}", + )) + + # V3.2: Non-empty routing key + if not routed.get("routing_key"): + errors.append(ValidationError( + rule="V3.2", + identifier=span_id, + message="Empty routing key", + )) + + return ValidationResult( + stage="stage3", + passed=len(errors) == 0, + error_count=len(errors), + errors=errors, + ) + + +class Stage4Validator: + """Validator for Stage 4 output.""" + + def validate(self, output: Stage4Output) -> ValidationResult: + """ + Validate Stage 4 output. + + Rules: + - V4.1: place_id is valid or 'ALL' + - V4.2: period_date matches bucket + - V4.3: span_count >= review_count + - V4.4: valence counts sum to span_count + - V4.5: intensity counts sum to span_count + - V4.6: strength_score >= 0 + - V4.7: avg_rating between 1.0 and 5.0 (or NULL) + """ + errors: list[ValidationError] = [] + + for fact in output["facts_written"]: + fact_id = f"{fact['subject_type']}:{fact['subject_id']}" + + # V4.1: Valid place_id + place_id = fact.get("place_id", "") + if not place_id: + errors.append(ValidationError( + rule="V4.1", + identifier=fact_id, + message="Empty place_id", + )) + + # V4.3: span_count >= review_count + if fact.get("span_count", 0) < fact.get("review_count", 0): + errors.append(ValidationError( + rule="V4.3", + identifier=fact_id, + message=f"span_count ({fact.get('span_count')}) < review_count ({fact.get('review_count')})", + )) + + # V4.4: Valence sum + valence_sum = ( + fact.get("negative_count", 0) + + fact.get("positive_count", 0) 
+ + fact.get("neutral_count", 0) + + fact.get("mixed_count", 0) + ) + if valence_sum != fact.get("span_count", 0): + errors.append(ValidationError( + rule="V4.4", + identifier=fact_id, + message=f"Valence sum {valence_sum} != span_count {fact.get('span_count')}", + )) + + # V4.5: Intensity sum + intensity_sum = ( + fact.get("i1_count", 0) + + fact.get("i2_count", 0) + + fact.get("i3_count", 0) + ) + if intensity_sum != fact.get("span_count", 0): + errors.append(ValidationError( + rule="V4.5", + identifier=fact_id, + message=f"Intensity sum {intensity_sum} != span_count {fact.get('span_count')}", + )) + + # V4.6: Non-negative strength + if fact.get("strength_score", 0) < 0: + errors.append(ValidationError( + rule="V4.6", + identifier=fact_id, + message=f"Negative strength_score: {fact.get('strength_score')}", + )) + + # V4.7: Rating bounds + avg_rating = fact.get("avg_rating") + if avg_rating is not None and not (1.0 <= avg_rating <= 5.0): + errors.append(ValidationError( + rule="V4.7", + identifier=fact_id, + message=f"Invalid avg_rating: {avg_rating}", + )) + + return ValidationResult( + stage="stage4", + passed=len(errors) == 0, + error_count=len(errors), + errors=errors, + ) + + +# Convenience functions +def validate_stage1_output(output: Stage1Output) -> ValidationResult: + """Validate Stage 1 output.""" + return Stage1Validator().validate(output) + + +def validate_stage2_output( + output: Stage2Output, + input_reviews: dict[tuple[str, str, int], dict[str, Any]] | None = None, +) -> ValidationResult: + """Validate Stage 2 output.""" + return Stage2Validator().validate(output, input_reviews) + + +async def validate_stage3_output( + output: Stage3Output, + db: DatabasePool | None = None, +) -> ValidationResult: + """Validate Stage 3 output.""" + validator = Stage3Validator(db) + if db: + return await validator.validate(output) + return validator.validate_sync(output) + + +def validate_stage4_output(output: Stage4Output) -> ValidationResult: + """Validate Stage 4 
output.""" + return Stage4Validator().validate(output) diff --git a/packages/reviewiq-pipeline/tests/__init__.py b/packages/reviewiq-pipeline/tests/__init__.py new file mode 100644 index 0000000..4132be1 --- /dev/null +++ b/packages/reviewiq-pipeline/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for reviewiq-pipeline.""" diff --git a/packages/reviewiq-pipeline/tests/conftest.py b/packages/reviewiq-pipeline/tests/conftest.py new file mode 100644 index 0000000..4f634df --- /dev/null +++ b/packages/reviewiq-pipeline/tests/conftest.py @@ -0,0 +1,269 @@ +""" +Pytest configuration and fixtures for reviewiq-pipeline tests. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest + +# Sample data fixtures matching the contract examples + + +@pytest.fixture +def sample_raw_review() -> dict[str, Any]: + """Sample raw review from Stage 0 output.""" + return { + "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB", + "author_name": "John Smith", + "author_id": "103456789012345678901", + "rating": 2, + "text": "The food was great but the wait was absolutely terrible. We waited 45 minutes just to be seated, and another 30 minutes for our appetizers. The server Mike was rude and dismissive when we complained. 
However, the steak was cooked perfectly and the dessert was amazing.", + "review_time": "2026-01-20T14:30:00Z", + "response_text": None, + "photos": [], + "raw_payload": {}, + } + + +@pytest.fixture +def sample_scraper_output(sample_raw_review: dict) -> dict[str, Any]: + """Sample Stage 0 output.""" + return { + "job_id": "test-job-001", + "status": "completed", + "business_id": "acme-corp", + "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4", + "business_info": { + "name": "Acme Restaurant", + "address": "123 Main St, Anytown, USA", + "category": "Restaurant", + "total_reviews": 1247, + "average_rating": 4.2, + }, + "reviews": [sample_raw_review], + "scrape_time_ms": 12500, + "reviews_scraped": 1, + "scraper_version": "v1.0.0", + } + + +@pytest.fixture +def sample_normalized_review() -> dict[str, Any]: + """Sample normalized review from Stage 1 output.""" + return { + "source": "google", + "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB", + "review_version": 1, + "business_id": "acme-corp", + "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4", + "text": "The food was great but the wait was absolutely terrible. We waited 45 minutes just to be seated, and another 30 minutes for our appetizers. The server Mike was rude and dismissive when we complained. 
However, the steak was cooked perfectly and the dessert was amazing.", + "text_normalized": "the food was great but the wait was absolutely terrible we waited 45 minutes just to be seated and another 30 minutes for our appetizers the server mike was rude and dismissive when we complained however the steak was cooked perfectly and the dessert was amazing", + "text_language": "en", + "text_length": 267, + "word_count": 52, + "rating": 2, + "review_time": "2026-01-20T14:30:00Z", + "author_name": "John Smith", + "content_hash": "a1b2c3d4e5f6789012345678901234567890123456789012345678901234abcd", + "raw_id": 12345, + } + + +@pytest.fixture +def sample_stage1_output(sample_normalized_review: dict) -> dict[str, Any]: + """Sample Stage 1 output.""" + return { + "job_id": "test-job-001", + "business_id": "acme-corp", + "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4", + "reviews_normalized": [sample_normalized_review], + "stats": { + "input_count": 1, + "output_count": 1, + "skipped_empty": 0, + "skipped_duplicate": 0, + }, + } + + +@pytest.fixture +def sample_span() -> dict[str, Any]: + """Sample extracted span.""" + return { + "span_id": "SPN-b2c3d4e5f6789012", + "span_index": 1, + "span_text": "the wait was absolutely terrible. 
We waited 45 minutes just to be seated, and another 30 minutes for our appetizers", + "span_start": 23, + "span_end": 138, + "profile": "standard", + "urt_primary": "J1.01", + "urt_secondary": [], + "valence": "V-", + "intensity": "I3", + "comparative": "CR-N", + "specificity": "S3", + "actionability": "A2", + "temporal": "TC", + "evidence": "EC", + "confidence": "high", + "usn": "URT:S:J1.01:-3:32TC.EC.N", + "is_primary": True, + } + + +@pytest.fixture +def sample_classified_review(sample_span: dict) -> dict[str, Any]: + """Sample classified review from Stage 2 output.""" + return { + "source": "google", + "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB", + "review_version": 1, + "urt_primary": "J1.01", + "urt_secondary": ["P1.02"], + "valence": "V±", + "intensity": "I3", + "comparative": "CR-N", + "staff_mentions": ["Mike"], + "quotes": { + "J1.01": "waited 45 minutes just to be seated", + "P1.02": "rude and dismissive", + }, + "trust_score": 0.85, + "embedding": [0.1] * 384, # Placeholder + "spans": [ + { + "span_id": "SPN-a1b2c3d4e5f67890", + "span_index": 0, + "span_text": "The food was great", + "span_start": 0, + "span_end": 18, + "profile": "standard", + "urt_primary": "O1.01", + "urt_secondary": [], + "valence": "V+", + "intensity": "I2", + "comparative": "CR-N", + "confidence": "high", + "usn": "URT:S:O1.01:+2:21TC.ES.N", + "is_primary": False, + }, + sample_span, + ], + "classification_confidence": {"overall": 0.85}, + "processing_time_ms": 500, + } + + +@pytest.fixture +def sample_stage2_output(sample_classified_review: dict) -> dict[str, Any]: + """Sample Stage 2 output.""" + return { + "batch_id": "batch001", + "taxonomy_version": "v5.1", + "model_version": "gpt-4o-mini", + "prompt_version": "v1.0", + "reviews_classified": [sample_classified_review], + "stats": { + "input_count": 1, + "success_count": 1, + "error_count": 0, + "total_spans": 2, + "avg_spans_per_review": 2.0, + "llm_tokens_used": 1500, + "llm_cost_usd": 0.001, + }, + } + + 
+@pytest.fixture +def sample_routed_span() -> dict[str, Any]: + """Sample routed span from Stage 3.""" + return { + "span_id": "SPN-b2c3d4e5f6789012", + "issue_id": "ISS-7a8b9c0d1e2f3a4b", + "routing_key": "acme-corp|ChIJN1t_tDeuEmsRUsoyG83frY4|J1.01|", + "is_new_issue": True, + } + + +@pytest.fixture +def sample_stage3_output(sample_routed_span: dict) -> dict[str, Any]: + """Sample Stage 3 output.""" + return { + "routed_spans": [sample_routed_span], + "issues_created": ["ISS-7a8b9c0d1e2f3a4b"], + "issues_updated": [], + "stats": { + "spans_processed": 2, + "spans_routed": 1, + "spans_skipped": 1, + "issues_created": 1, + "issues_updated": 0, + }, + } + + +@pytest.fixture +def sample_fact() -> dict[str, Any]: + """Sample fact record from Stage 4.""" + return { + "business_id": "acme-corp", + "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4", + "period_date": "2026-01-20", + "bucket_type": "day", + "subject_type": "urt_code", + "subject_id": "J1.01", + "taxonomy_version": "v5.1", + "review_count": 1, + "span_count": 1, + "negative_count": 1, + "positive_count": 0, + "neutral_count": 0, + "mixed_count": 0, + "strength_score": 4.0, + "negative_strength": 4.0, + "positive_strength": 0.0, + "avg_rating": 2.0, + "i1_count": 0, + "i2_count": 0, + "i3_count": 1, + "cr_better": 0, + "cr_worse": 0, + "cr_same": 0, + "trust_weighted_strength": 3.4, + "trust_weighted_negative": 3.4, + } + + +@pytest.fixture +def sample_stage4_output(sample_fact: dict) -> dict[str, Any]: + """Sample Stage 4 output.""" + return { + "facts_written": [sample_fact], + "stats": { + "business_id": "acme-corp", + "date": "2026-01-20", + "locations_processed": 1, + "codes_aggregated": 1, + "facts_upserted": 1, + }, + } + + +@pytest.fixture +def fixtures_dir() -> Path: + """Get the path to the fixtures directory.""" + return Path(__file__).parent / "fixtures" + + +# Helper to load JSON fixtures +def load_fixture(name: str) -> dict[str, Any]: + """Load a JSON fixture by name.""" + fixtures_path = 
Path(__file__).parent / "fixtures" / f"{name}.json" + if fixtures_path.exists(): + return json.loads(fixtures_path.read_text()) + raise FileNotFoundError(f"Fixture not found: {name}") diff --git a/packages/reviewiq-pipeline/tests/integration/__init__.py b/packages/reviewiq-pipeline/tests/integration/__init__.py new file mode 100644 index 0000000..1e55d4b --- /dev/null +++ b/packages/reviewiq-pipeline/tests/integration/__init__.py @@ -0,0 +1 @@ +"""Integration tests for reviewiq-pipeline.""" diff --git a/packages/reviewiq-pipeline/tests/integration/test_e2e.py b/packages/reviewiq-pipeline/tests/integration/test_e2e.py new file mode 100644 index 0000000..0575d07 --- /dev/null +++ b/packages/reviewiq-pipeline/tests/integration/test_e2e.py @@ -0,0 +1,179 @@ +"""End-to-end integration tests for the pipeline.""" + +from __future__ import annotations + +import pytest + + +class TestPipelineE2E: + """End-to-end integration tests.""" + + def test_stage1_to_stage2_contract(self, sample_scraper_output): + """Test that Stage 1 output is valid Stage 2 input.""" + from reviewiq_pipeline.config import Config + from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer + from reviewiq_pipeline.contracts import Stage1Input + + config = Config() + normalizer = Stage1Normalizer(config) + + # Run Stage 1 + input_data = Stage1Input( + job_id=sample_scraper_output["job_id"], + business_id=sample_scraper_output["business_id"], + place_id=sample_scraper_output["place_id"], + reviews=sample_scraper_output["reviews"], + ) + + # Note: This is synchronous test, so we use the batch method + normalized = normalizer.normalize_batch( + sample_scraper_output["reviews"], + sample_scraper_output["business_id"], + sample_scraper_output["place_id"], + ) + + # Verify Stage 1 output can be used as Stage 2 input + assert len(normalized) > 0 + for review in normalized: + # Check required fields for Stage 2 + assert review["source"] is not None + assert review["review_id"] is not None + assert 
review["text"] is not None + assert review["text_normalized"] is not None + assert review["rating"] is not None + assert review["review_time"] is not None + + def test_stage2_to_stage3_contract(self, sample_stage2_output): + """Test that Stage 2 output spans can be routed by Stage 3.""" + from reviewiq_pipeline.config import Config + from reviewiq_pipeline.stages.stage3_route import Stage3Router + from reviewiq_pipeline.contracts import SpanToRoute + + config = Config() + router = Stage3Router(config) + + # Extract negative spans from Stage 2 output + spans_to_route = [] + for review in sample_stage2_output["reviews_classified"]: + for span in review.get("spans", []): + if span["valence"] in ("V-", "V±"): + spans_to_route.append( + SpanToRoute( + span_id=span["span_id"], + business_id="test-business", + place_id="test-place", + urt_primary=span["urt_primary"], + valence=span["valence"], + intensity=span["intensity"], + entity_normalized=span.get("entity_normalized"), + review_time="2026-01-20T14:30:00Z", + confidence=span.get("confidence", "medium"), + trust_score=0.85, + ) + ) + + # Verify we can route these spans + for span in spans_to_route: + routed = router.route_span_sync(span) + assert routed["span_id"] == span["span_id"] + assert routed["issue_id"].startswith("ISS-") + + def test_validation_chain( + self, + sample_stage1_output, + sample_stage2_output, + sample_stage3_output, + sample_stage4_output, + ): + """Test that all sample outputs pass validation.""" + from reviewiq_pipeline.validation.validators import ( + validate_stage1_output, + validate_stage2_output, + validate_stage4_output, + Stage3Validator, + ) + + # Validate Stage 1 + result1 = validate_stage1_output(sample_stage1_output) + assert result1["passed"], f"Stage 1 failed: {result1['errors']}" + + # Validate Stage 2 + result2 = validate_stage2_output(sample_stage2_output) + assert result2["passed"], f"Stage 2 failed: {result2['errors']}" + + # Validate Stage 3 (sync version) + validator3 = 
Stage3Validator() + result3 = validator3.validate_sync(sample_stage3_output) + assert result3["passed"], f"Stage 3 failed: {result3['errors']}" + + # Validate Stage 4 + result4 = validate_stage4_output(sample_stage4_output) + assert result4["passed"], f"Stage 4 failed: {result4['errors']}" + + def test_text_normalization_preserves_meaning(self, sample_raw_review): + """Test that normalization preserves review meaning.""" + from reviewiq_pipeline.services.text_processor import TextProcessor + + processor = TextProcessor() + result = processor.normalize(sample_raw_review["text"]) + + # Key terms should still be present (lowercased) + assert "food" in result.normalized + assert "wait" in result.normalized + assert "terrible" in result.normalized + assert "mike" in result.normalized + assert "steak" in result.normalized + + def test_issue_id_determinism(self): + """Test that same inputs always produce same issue ID.""" + from reviewiq_pipeline.config import Config + from reviewiq_pipeline.stages.stage3_route import Stage3Router + + config = Config() + router = Stage3Router(config) + + span = { + "span_id": "test-span", + "business_id": "acme-corp", + "place_id": "place123", + "urt_primary": "J1.01", + "valence": "V-", + "intensity": "I3", + "entity_normalized": "mike", + "review_time": "2026-01-20T14:30:00Z", + "confidence": "high", + "trust_score": 0.85, + } + + # Route the same span multiple times + ids = [router.route_span_sync(span)["issue_id"] for _ in range(10)] + + # All IDs should be identical + assert len(set(ids)) == 1 + + +@pytest.mark.asyncio +class TestAsyncPipeline: + """Async pipeline tests (require database).""" + + @pytest.mark.skip(reason="Requires database connection") + async def test_full_pipeline_flow(self, sample_scraper_output): + """Test full pipeline from scraper output to facts.""" + from reviewiq_pipeline import Pipeline, Config + + config = Config( + database_url="postgresql://localhost:5432/reviewiq_test", + llm_provider="openai", + ) + + 
pipeline = Pipeline(config) + + try: + await pipeline.initialize() + result = await pipeline.process(sample_scraper_output) + + assert result.stage1 is not None + assert result.success or len(result.validation) > 0 + + finally: + await pipeline.close() diff --git a/packages/reviewiq-pipeline/tests/test_stage1.py b/packages/reviewiq-pipeline/tests/test_stage1.py new file mode 100644 index 0000000..a930e48 --- /dev/null +++ b/packages/reviewiq-pipeline/tests/test_stage1.py @@ -0,0 +1,218 @@ +"""Tests for Stage 1: Normalization.""" + +from __future__ import annotations + +import pytest + +from reviewiq_pipeline.services.text_processor import ( + TextProcessor, + is_valid_iso639, + is_valid_sha256, +) +from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer +from reviewiq_pipeline.validation.validators import validate_stage1_output + + +class TestTextProcessor: + """Tests for the TextProcessor service.""" + + def test_normalize_basic(self): + """Test basic text normalization.""" + processor = TextProcessor() + result = processor.normalize(" Hello World! ") + + assert result.normalized == "hello world!" 
+ assert result.word_count == 2 + assert result.char_count == 12 + + def test_normalize_unicode(self): + """Test Unicode normalization.""" + processor = TextProcessor() + # NFC normalization test + result = processor.normalize("café") + + assert "cafe" in result.normalized or "café" in result.normalized + + def test_normalize_control_chars(self): + """Test removal of control characters.""" + processor = TextProcessor() + result = processor.normalize("Hello\x00World\x1fTest") + + assert "\x00" not in result.normalized + assert "\x1f" not in result.normalized + + def test_detect_language_english(self): + """Test English language detection.""" + processor = TextProcessor() + lang = processor.detect_language("This is a test sentence in English.") + + assert lang == "en" + + def test_generate_content_hash(self): + """Test content hash generation.""" + processor = TextProcessor() + hash1 = processor.generate_content_hash("test content") + hash2 = processor.generate_content_hash("test content") + hash3 = processor.generate_content_hash("different content") + + assert hash1 == hash2 # Same input = same hash + assert hash1 != hash3 # Different input = different hash + assert len(hash1) == 64 # SHA256 hex length + + def test_is_empty_or_trivial(self): + """Test empty/trivial text detection.""" + processor = TextProcessor() + + assert processor.is_empty_or_trivial(None) is True + assert processor.is_empty_or_trivial("") is True + assert processor.is_empty_or_trivial(" ") is True + assert processor.is_empty_or_trivial("ab") is True + assert processor.is_empty_or_trivial("abc") is False + assert processor.is_empty_or_trivial("Hello world") is False + + +class TestHelperFunctions: + """Tests for helper functions.""" + + def test_is_valid_iso639(self): + """Test ISO 639-1 validation.""" + assert is_valid_iso639("en") is True + assert is_valid_iso639("es") is True + assert is_valid_iso639("fr") is True + assert is_valid_iso639("de") is True + assert is_valid_iso639("xx") is False 
+ assert is_valid_iso639("") is False + assert is_valid_iso639("english") is False + + def test_is_valid_sha256(self): + """Test SHA256 hash validation.""" + valid_hash = "a" * 64 + invalid_short = "a" * 63 + invalid_long = "a" * 65 + invalid_chars = "g" * 64 # 'g' is not hex + + assert is_valid_sha256(valid_hash) is True + assert is_valid_sha256(invalid_short) is False + assert is_valid_sha256(invalid_long) is False + assert is_valid_sha256(invalid_chars) is False + assert is_valid_sha256("") is False + assert is_valid_sha256(None) is False # type: ignore + + +class TestStage1Normalizer: + """Tests for Stage 1 normalizer.""" + + def test_normalize_review_basic(self, sample_raw_review): + """Test basic review normalization.""" + from reviewiq_pipeline.config import Config + + config = Config() + normalizer = Stage1Normalizer(config) + + result = normalizer._normalize_review( + sample_raw_review, + "test-business", + "test-place", + ) + + assert result is not None + assert result["source"] == "google" + assert result["review_id"] == sample_raw_review["review_id"] + assert result["text"] == sample_raw_review["text"] + assert result["text_normalized"] is not None + assert result["text_language"] == "en" + assert len(result["content_hash"]) == 64 + + def test_normalize_empty_review(self): + """Test that empty reviews are skipped.""" + from reviewiq_pipeline.config import Config + + config = Config() + normalizer = Stage1Normalizer(config) + + empty_review = { + "review_id": "test-empty", + "author_name": "Test", + "rating": 5, + "text": "", + "review_time": "2026-01-20T14:30:00Z", + } + + result = normalizer._normalize_review(empty_review, "test-business", "test-place") + assert result is None + + def test_normalize_batch(self, sample_raw_review): + """Test batch normalization.""" + from reviewiq_pipeline.config import Config + + config = Config() + normalizer = Stage1Normalizer(config) + + reviews = [ + sample_raw_review, + { + "review_id": "second-review", + 
"author_name": "Jane", + "rating": 5, + "text": "Great service!", + "review_time": "2026-01-21T10:00:00Z", + }, + ] + + results = normalizer.normalize_batch(reviews, "test-business", "test-place") + assert len(results) == 2 + + def test_normalize_deduplication(self, sample_raw_review): + """Test that duplicate reviews are detected.""" + from reviewiq_pipeline.config import Config + + config = Config() + normalizer = Stage1Normalizer(config) + + # Two reviews with same text + reviews = [ + sample_raw_review, + {**sample_raw_review, "review_id": "duplicate-review"}, + ] + + results = normalizer.normalize_batch(reviews, "test-business", "test-place") + assert len(results) == 1 # Duplicate should be filtered + + +class TestStage1Validation: + """Tests for Stage 1 validation.""" + + def test_validate_valid_output(self, sample_stage1_output): + """Test validation of valid Stage 1 output.""" + result = validate_stage1_output(sample_stage1_output) + + assert result["stage"] == "stage1" + assert result["passed"] is True + assert result["error_count"] == 0 + + def test_validate_empty_text(self, sample_stage1_output): + """Test validation catches empty text.""" + sample_stage1_output["reviews_normalized"][0]["text"] = "" + + result = validate_stage1_output(sample_stage1_output) + + assert result["passed"] is False + assert any(e["rule"] == "V1.1" for e in result["errors"]) + + def test_validate_invalid_hash(self, sample_stage1_output): + """Test validation catches invalid content hash.""" + sample_stage1_output["reviews_normalized"][0]["content_hash"] = "invalid" + + result = validate_stage1_output(sample_stage1_output) + + assert result["passed"] is False + assert any(e["rule"] == "V1.3" for e in result["errors"]) + + def test_validate_invalid_language(self, sample_stage1_output): + """Test validation catches invalid language code.""" + sample_stage1_output["reviews_normalized"][0]["text_language"] = "invalid" + + result = validate_stage1_output(sample_stage1_output) + + 
assert result["passed"] is False + assert any(e["rule"] == "V1.5" for e in result["errors"]) diff --git a/packages/reviewiq-pipeline/tests/test_stage2.py b/packages/reviewiq-pipeline/tests/test_stage2.py new file mode 100644 index 0000000..ec94896 --- /dev/null +++ b/packages/reviewiq-pipeline/tests/test_stage2.py @@ -0,0 +1,193 @@ +"""Tests for Stage 2: LLM Classification.""" + +from __future__ import annotations + +import pytest + +from reviewiq_pipeline.services.llm_client import create_fallback_response +from reviewiq_pipeline.validation.validators import validate_stage2_output + + +class TestLLMClient: + """Tests for LLM client functionality.""" + + def test_fallback_response_structure(self): + """Test that fallback response has correct structure.""" + review_text = "This is a test review." + response = create_fallback_response(review_text) + + assert "spans" in response + assert "review_summary" in response + assert len(response["spans"]) == 1 + + span = response["spans"][0] + assert span["span_index"] == 0 + assert span["span_text"] == review_text + assert span["span_start"] == 0 + assert span["span_end"] == len(review_text) + assert span["is_primary"] is True + assert span["confidence"] == "low" + + def test_fallback_response_valid_urt(self): + """Test that fallback response has valid URT codes.""" + response = create_fallback_response("Test review") + span = response["spans"][0] + + assert span["urt_primary"] == "O1.01" + assert span["valence"] == "V0" + assert span["intensity"] == "I1" + + +class TestStage2Validation: + """Tests for Stage 2 validation.""" + + def test_validate_valid_output(self, sample_stage2_output): + """Test validation of valid Stage 2 output.""" + result = validate_stage2_output(sample_stage2_output) + + assert result["stage"] == "stage2" + assert result["passed"] is True + assert result["error_count"] == 0 + + def test_validate_invalid_urt_code(self, sample_stage2_output): + """Test validation catches invalid URT code.""" + 
sample_stage2_output["reviews_classified"][0]["urt_primary"] = "INVALID" + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert any(e["rule"] == "V2.1" for e in result["errors"]) + + def test_validate_too_many_secondary(self, sample_stage2_output): + """Test validation catches too many secondary codes.""" + sample_stage2_output["reviews_classified"][0]["urt_secondary"] = [ + "O1.01", "O1.02", "O1.03" + ] + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert any(e["rule"] == "V2.2" for e in result["errors"]) + + def test_validate_invalid_valence(self, sample_stage2_output): + """Test validation catches invalid valence.""" + sample_stage2_output["reviews_classified"][0]["valence"] = "INVALID" + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert any(e["rule"] == "V2.3" for e in result["errors"]) + + def test_validate_invalid_trust_score(self, sample_stage2_output): + """Test validation catches trust score out of bounds.""" + sample_stage2_output["reviews_classified"][0]["trust_score"] = 0.1 + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert any(e["rule"] == "V2.9" for e in result["errors"]) + + def test_validate_invalid_embedding_dim(self, sample_stage2_output): + """Test validation catches wrong embedding dimension.""" + sample_stage2_output["reviews_classified"][0]["embedding"] = [0.1] * 100 + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert any(e["rule"] == "V2.10" for e in result["errors"]) + + def test_validate_multiple_primaries(self, sample_stage2_output): + """Test validation catches multiple primary spans.""" + for span in sample_stage2_output["reviews_classified"][0]["spans"]: + span["is_primary"] = True + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert 
any(e["rule"] == "V2.8" for e in result["errors"]) + + def test_validate_no_primary(self, sample_stage2_output): + """Test validation catches no primary span.""" + for span in sample_stage2_output["reviews_classified"][0]["spans"]: + span["is_primary"] = False + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert any(e["rule"] == "V2.8" for e in result["errors"]) + + def test_validate_invalid_span_bounds(self, sample_stage2_output): + """Test validation catches invalid span bounds.""" + sample_stage2_output["reviews_classified"][0]["spans"][0]["span_start"] = 100 + sample_stage2_output["reviews_classified"][0]["spans"][0]["span_end"] = 50 + + result = validate_stage2_output(sample_stage2_output) + + assert result["passed"] is False + assert any(e["rule"] == "V2.5" for e in result["errors"]) + + +class TestSpanExtraction: + """Tests for span extraction logic.""" + + def test_primary_span_selection_by_intensity(self): + """Test that primary span is selected by highest intensity.""" + from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier + from reviewiq_pipeline.config import Config + + config = Config() + classifier = Stage2Classifier(config) + + spans = [ + { + "span_id": "span1", + "span_index": 0, + "valence": "V-", + "intensity": "I1", + "is_primary": False, + }, + { + "span_id": "span2", + "span_index": 1, + "valence": "V-", + "intensity": "I3", + "is_primary": False, + }, + ] + + result = classifier._ensure_primary_span(spans) + + # Span with I3 should be primary + assert result[1]["is_primary"] is True + assert result[0]["is_primary"] is False + + def test_primary_span_selection_by_valence(self): + """Test that negative valence is preferred over positive.""" + from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier + from reviewiq_pipeline.config import Config + + config = Config() + classifier = Stage2Classifier(config) + + spans = [ + { + "span_id": "span1", + "span_index": 0, + 
"valence": "V+", + "intensity": "I2", + "is_primary": False, + }, + { + "span_id": "span2", + "span_index": 1, + "valence": "V-", + "intensity": "I2", + "is_primary": False, + }, + ] + + result = classifier._ensure_primary_span(spans) + + # Span with V- should be primary + assert result[1]["is_primary"] is True + assert result[0]["is_primary"] is False diff --git a/packages/reviewiq-pipeline/tests/test_stage3.py b/packages/reviewiq-pipeline/tests/test_stage3.py new file mode 100644 index 0000000..fa254d5 --- /dev/null +++ b/packages/reviewiq-pipeline/tests/test_stage3.py @@ -0,0 +1,162 @@ +"""Tests for Stage 3: Issue Routing.""" + +from __future__ import annotations + +import pytest + +from reviewiq_pipeline.stages.stage3_route import Stage3Router +from reviewiq_pipeline.validation.validators import Stage3Validator + + +class TestStage3Router: + """Tests for Stage 3 router.""" + + def test_generate_routing_key(self): + """Test routing key generation.""" + from reviewiq_pipeline.config import Config + + config = Config() + router = Stage3Router(config) + + span = { + "span_id": "test-span", + "business_id": "acme-corp", + "place_id": "place123", + "urt_primary": "J1.01", + "valence": "V-", + "intensity": "I3", + "entity_normalized": "mike", + "review_time": "2026-01-20T14:30:00Z", + "confidence": "high", + "trust_score": 0.85, + } + + key = router._generate_routing_key(span) + assert key == "acme-corp|place123|J1.01|mike" + + def test_generate_routing_key_no_entity(self): + """Test routing key generation without entity.""" + from reviewiq_pipeline.config import Config + + config = Config() + router = Stage3Router(config) + + span = { + "span_id": "test-span", + "business_id": "acme-corp", + "place_id": "place123", + "urt_primary": "J1.01", + "valence": "V-", + "intensity": "I3", + "entity_normalized": None, + "review_time": "2026-01-20T14:30:00Z", + "confidence": "high", + "trust_score": 0.85, + } + + key = router._generate_routing_key(span) + assert key == 
"acme-corp|place123|J1.01|" + + def test_generate_issue_id(self): + """Test deterministic issue ID generation.""" + from reviewiq_pipeline.config import Config + + config = Config() + router = Stage3Router(config) + + key1 = "acme-corp|place123|J1.01|" + key2 = "acme-corp|place123|J1.01|" + key3 = "acme-corp|place123|J1.02|" + + id1 = router._generate_issue_id(key1) + id2 = router._generate_issue_id(key2) + id3 = router._generate_issue_id(key3) + + # Same key = same ID + assert id1 == id2 + # Different key = different ID + assert id1 != id3 + # Format check + assert id1.startswith("ISS-") + assert len(id1) == 20 # ISS- + 16 hex chars + + def test_route_span_sync(self): + """Test synchronous span routing.""" + from reviewiq_pipeline.config import Config + + config = Config() + router = Stage3Router(config) + + span = { + "span_id": "test-span", + "business_id": "acme-corp", + "place_id": "place123", + "urt_primary": "J1.01", + "valence": "V-", + "intensity": "I3", + "entity_normalized": None, + "review_time": "2026-01-20T14:30:00Z", + "confidence": "high", + "trust_score": 0.85, + } + + result = router.route_span_sync(span) + + assert result["span_id"] == "test-span" + assert result["issue_id"].startswith("ISS-") + assert "J1.01" in result["routing_key"] + + def test_route_span_rejects_positive(self): + """Test that positive spans cannot be routed.""" + from reviewiq_pipeline.config import Config + + config = Config() + router = Stage3Router(config) + + span = { + "span_id": "test-span", + "business_id": "acme-corp", + "place_id": "place123", + "urt_primary": "O1.01", + "valence": "V+", # Positive + "intensity": "I2", + "entity_normalized": None, + "review_time": "2026-01-20T14:30:00Z", + "confidence": "high", + "trust_score": 0.85, + } + + with pytest.raises(ValueError, match="Cannot route positive span"): + router.route_span_sync(span) + + +class TestStage3Validation: + """Tests for Stage 3 validation.""" + + def test_validate_valid_output(self, 
sample_stage3_output): + """Test validation of valid Stage 3 output.""" + validator = Stage3Validator() + result = validator.validate_sync(sample_stage3_output) + + assert result["stage"] == "stage3" + assert result["passed"] is True + + def test_validate_invalid_issue_id(self, sample_stage3_output): + """Test validation catches invalid issue ID format.""" + sample_stage3_output["routed_spans"][0]["issue_id"] = "INVALID" + + validator = Stage3Validator() + result = validator.validate_sync(sample_stage3_output) + + assert result["passed"] is False + assert any(e["rule"] == "V3.1" for e in result["errors"]) + + def test_validate_empty_routing_key(self, sample_stage3_output): + """Test validation catches empty routing key.""" + sample_stage3_output["routed_spans"][0]["routing_key"] = "" + + validator = Stage3Validator() + result = validator.validate_sync(sample_stage3_output) + + assert result["passed"] is False + assert any(e["rule"] == "V3.2" for e in result["errors"]) diff --git a/packages/reviewiq-pipeline/tests/test_stage4.py b/packages/reviewiq-pipeline/tests/test_stage4.py new file mode 100644 index 0000000..c9c660a --- /dev/null +++ b/packages/reviewiq-pipeline/tests/test_stage4.py @@ -0,0 +1,201 @@ +"""Tests for Stage 4: Fact Aggregation.""" + +from __future__ import annotations + +from datetime import date + +import pytest + +from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator +from reviewiq_pipeline.validation.validators import validate_stage4_output + + +class TestStage4Aggregator: + """Tests for Stage 4 aggregator.""" + + def test_get_bucket_range_day(self): + """Test day bucket range calculation.""" + from reviewiq_pipeline.config import Config + + config = Config() + aggregator = Stage4Aggregator(config) + + target = date(2026, 1, 20) + start, end = aggregator._get_bucket_range(target, "day") + + assert start == target + assert end == target + + def test_get_bucket_range_week(self): + """Test week bucket range calculation.""" + from 
reviewiq_pipeline.config import Config + + config = Config() + aggregator = Stage4Aggregator(config) + + # 2026-01-20 is a Tuesday + target = date(2026, 1, 20) + start, end = aggregator._get_bucket_range(target, "week") + + # Week should start on Monday (Jan 19) and end on Sunday (Jan 25) + assert start == date(2026, 1, 19) + assert end == date(2026, 1, 25) + + def test_get_bucket_range_month(self): + """Test month bucket range calculation.""" + from reviewiq_pipeline.config import Config + + config = Config() + aggregator = Stage4Aggregator(config) + + target = date(2026, 1, 20) + start, end = aggregator._get_bucket_range(target, "month") + + assert start == date(2026, 1, 1) + assert end == date(2026, 1, 31) + + def test_compute_strength_score(self): + """Test strength score calculation.""" + from reviewiq_pipeline.config import Config + + config = Config() + aggregator = Stage4Aggregator(config) + + spans = [ + {"valence": "V-", "intensity": "I3"}, # 4 * 1.0 = 4 + {"valence": "V-", "intensity": "I2"}, # 2 * 1.0 = 2 + {"valence": "V+", "intensity": "I2"}, # 2 * 1.0 = 2 + {"valence": "V0", "intensity": "I1"}, # 1 * 0.0 = 0 + ] + + score = aggregator._compute_strength_score(spans) + assert score == 8.0 + + def test_compute_trust_weighted_strength(self): + """Test trust-weighted strength calculation.""" + from reviewiq_pipeline.config import Config + + config = Config() + aggregator = Stage4Aggregator(config) + + spans = [ + {"valence": "V-", "intensity": "I3", "trust_score": 1.0}, # 4 * 1.0 * 1.0 = 4 + {"valence": "V-", "intensity": "I2", "trust_score": 0.5}, # 2 * 1.0 * 0.5 = 1 + ] + + score = aggregator._compute_trust_weighted_strength(spans) + assert score == 5.0 + + def test_compute_fact_metrics(self): + """Test fact metrics computation.""" + from reviewiq_pipeline.config import Config + + config = Config() + aggregator = Stage4Aggregator(config) + + spans = [ + { + "valence": "V-", + "intensity": "I3", + "comparative": "CR-N", + "trust_score": 0.8, + "rating": 
2, + }, + { + "valence": "V+", + "intensity": "I2", + "comparative": "CR-B", + "trust_score": 0.9, + "rating": 5, + }, + ] + + fact = aggregator._compute_fact_metrics( + spans, + "test-business", + "test-place", + "2026-01-20", + "day", + "urt_code", + "J1.01", + "v5.1", + ) + + assert fact["span_count"] == 2 + assert fact["negative_count"] == 1 + assert fact["positive_count"] == 1 + assert fact["i3_count"] == 1 + assert fact["i2_count"] == 1 + assert fact["cr_better"] == 1 + assert fact["avg_rating"] == 3.5 + + +class TestStage4Validation: + """Tests for Stage 4 validation.""" + + def test_validate_valid_output(self, sample_stage4_output): + """Test validation of valid Stage 4 output.""" + result = validate_stage4_output(sample_stage4_output) + + assert result["stage"] == "stage4" + assert result["passed"] is True + + def test_validate_span_less_than_review(self, sample_stage4_output): + """Test validation catches span_count < review_count.""" + sample_stage4_output["facts_written"][0]["span_count"] = 0 + sample_stage4_output["facts_written"][0]["review_count"] = 1 + + result = validate_stage4_output(sample_stage4_output) + + assert result["passed"] is False + assert any(e["rule"] == "V4.3" for e in result["errors"]) + + def test_validate_valence_sum(self, sample_stage4_output): + """Test validation catches valence sum mismatch.""" + # Set span_count to 5 but valence counts only sum to 1 + sample_stage4_output["facts_written"][0]["span_count"] = 5 + + result = validate_stage4_output(sample_stage4_output) + + assert result["passed"] is False + assert any(e["rule"] == "V4.4" for e in result["errors"]) + + def test_validate_intensity_sum(self, sample_stage4_output): + """Test validation catches intensity sum mismatch.""" + # Set span_count to 5 but intensity counts only sum to 1 + sample_stage4_output["facts_written"][0]["span_count"] = 5 + # Fix valence sum + sample_stage4_output["facts_written"][0]["negative_count"] = 5 + + result = 
validate_stage4_output(sample_stage4_output) + + assert result["passed"] is False + assert any(e["rule"] == "V4.5" for e in result["errors"]) + + def test_validate_negative_strength(self, sample_stage4_output): + """Test validation catches negative strength score.""" + sample_stage4_output["facts_written"][0]["strength_score"] = -1.0 + + result = validate_stage4_output(sample_stage4_output) + + assert result["passed"] is False + assert any(e["rule"] == "V4.6" for e in result["errors"]) + + def test_validate_invalid_rating(self, sample_stage4_output): + """Test validation catches invalid average rating.""" + sample_stage4_output["facts_written"][0]["avg_rating"] = 6.0 + + result = validate_stage4_output(sample_stage4_output) + + assert result["passed"] is False + assert any(e["rule"] == "V4.7" for e in result["errors"]) + + def test_validate_null_rating_allowed(self, sample_stage4_output): + """Test that NULL rating is allowed.""" + sample_stage4_output["facts_written"][0]["avg_rating"] = None + + result = validate_stage4_output(sample_stage4_output) + + # Should still pass (NULL is valid) + # Check no V4.7 errors + assert not any(e["rule"] == "V4.7" for e in result["errors"])