feat: Add reviewiq-pipeline package for LLM-powered review classification

Implement a standalone Python package for processing customer reviews through a 4-stage pipeline using URT (Universal Review Taxonomy) v5.1: - Stage 1: Normalization (text cleaning, language detection, deduplication) - Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes) - Stage 3: Issue Routing (deterministic issue ID generation, span linking) - Stage 4: Fact Aggregation (time series metrics for dashboards) Package includes: - TypedDict contracts matching Pipeline-Contracts-v1.md - Async database layer with asyncpg and 5 SQL migrations - LLM client abstraction supporting both OpenAI and Anthropic - Sentence-transformers integration for embeddings - Validation rules V1.x through V4.x - CLI commands: migrate, run, validate, check - 55 unit and integration tests (all passing) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:07:11 +00:00
parent b780a23b66
commit 7d720f5378
34 changed files with 7222 additions and 0 deletions
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
@@ -0,0 +1,402 @@
+"""
+Pipeline class - main public API for the ReviewIQ pipeline.
+
+Provides a unified interface for running pipeline stages.
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import date
+from typing import TYPE_CHECKING, Any
+
+from reviewiq_pipeline.config import Config
+from reviewiq_pipeline.contracts import (
+    ClassificationConfig,
+    NormalizedReview,
+    ReviewToClassify,
+    ScraperOutput,
+    SpanToRoute,
+    Stage1Input,
+    Stage1Output,
+    Stage2Input,
+    Stage2Output,
+    Stage3Input,
+    Stage3Output,
+    Stage4Input,
+    Stage4Output,
+    ValidationResult,
+)
+from reviewiq_pipeline.db.connection import DatabasePool
+from reviewiq_pipeline.db.repositories import (
+    FactRepository,
+    IssueRepository,
+    ReviewRepository,
+    SpanRepository,
+)
+from reviewiq_pipeline.services.embeddings import EmbeddingService
+from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
+from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
+from reviewiq_pipeline.stages.stage3_route import Stage3Router
+from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
+from reviewiq_pipeline.validation.validators import (
+    validate_stage1_output,
+    validate_stage2_output,
+    validate_stage3_output,
+    validate_stage4_output,
+)
+
+if TYPE_CHECKING:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+class PipelineResult:
+    """Result from running the full pipeline."""
+
+    def __init__(
+        self,
+        stage1: Stage1Output | None = None,
+        stage2: Stage2Output | None = None,
+        stage3: Stage3Output | None = None,
+        stage4: Stage4Output | None = None,
+        validation: dict[str, ValidationResult] | None = None,
+    ):
+        self.stage1 = stage1
+        self.stage2 = stage2
+        self.stage3 = stage3
+        self.stage4 = stage4
+        self.validation = validation or {}
+
+    @property
+    def success(self) -> bool:
+        """Check if all ran stages passed validation."""
+        return all(v["passed"] for v in self.validation.values())
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "stage1": self.stage1,
+            "stage2": self.stage2,
+            "stage3": self.stage3,
+            "stage4": self.stage4,
+            "validation": self.validation,
+            "success": self.success,
+        }
+
+
+class Pipeline:
+    """
+    Main pipeline class for processing reviews.
+
+    Usage:
+        config = Config(database_url="...", llm_provider="openai", ...)
+        pipeline = Pipeline(config)
+
+        # Run full pipeline
+        result = await pipeline.process(scraper_output)
+
+        # Or run individual stages
+        stage1_result = await pipeline.normalize(scraper_output)
+        stage2_result = await pipeline.classify(stage1_result)
+    """
+
+    def __init__(self, config: Config):
+        """
+        Initialize the pipeline.
+
+        Args:
+            config: Pipeline configuration
+        """
+        self.config = config
+        self._db: DatabasePool | None = None
+        self._review_repo: ReviewRepository | None = None
+        self._span_repo: SpanRepository | None = None
+        self._issue_repo: IssueRepository | None = None
+        self._fact_repo: FactRepository | None = None
+        self._embedding_service: EmbeddingService | None = None
+        self._initialized = False
+
+    async def initialize(self) -> None:
+        """Initialize database connections and services."""
+        if self._initialized:
+            return
+
+        logger.info("Initializing pipeline...")
+
+        # Initialize database
+        self._db = DatabasePool(self.config)
+        await self._db.initialize()
+
+        # Initialize repositories
+        self._review_repo = ReviewRepository(self._db)
+        self._span_repo = SpanRepository(self._db)
+        self._issue_repo = IssueRepository(self._db)
+        self._fact_repo = FactRepository(self._db)
+
+        # Initialize embedding service
+        self._embedding_service = EmbeddingService(self.config)
+
+        self._initialized = True
+        logger.info("Pipeline initialized")
+
+    async def close(self) -> None:
+        """Close all connections and cleanup resources."""
+        if self._db:
+            await self._db.close()
+            self._db = None
+
+        self._initialized = False
+        logger.info("Pipeline closed")
+
+    async def migrate(self) -> int:
+        """
+        Run database migrations.
+
+        Returns:
+            Number of migrations run
+        """
+        if not self._db:
+            self._db = DatabasePool(self.config)
+            await self._db.initialize()
+
+        return await self._db.run_migrations()
+
+    async def process(
+        self,
+        scraper_output: ScraperOutput,
+        stages: list[int] | None = None,
+        validate: bool = True,
+    ) -> PipelineResult:
+        """
+        Run the full pipeline on scraper output.
+
+        Args:
+            scraper_output: Output from the scraper (Stage 0)
+            stages: List of stages to run (default: all [1, 2, 3, 4])
+            validate: Whether to validate each stage output
+
+        Returns:
+            PipelineResult with all stage outputs and validation results
+        """
+        await self.initialize()
+
+        stages = stages or [1, 2, 3, 4]
+        result = PipelineResult()
+        validation_results: dict[str, ValidationResult] = {}
+
+        # Stage 1: Normalize
+        if 1 in stages:
+            logger.info("Running Stage 1: Normalization")
+            result.stage1 = await self.normalize(scraper_output)
+
+            if validate:
+                validation_results["stage1"] = validate_stage1_output(result.stage1)
+
+        # Stage 2: Classify
+        if 2 in stages and result.stage1:
+            logger.info("Running Stage 2: Classification")
+            result.stage2 = await self.classify(result.stage1)
+
+            if validate:
+                # Build input reviews map for validation
+                input_reviews = {
+                    (r["source"], r["review_id"], r["review_version"]): r
+                    for r in result.stage1["reviews_normalized"]
+                }
+                validation_results["stage2"] = validate_stage2_output(
+                    result.stage2, input_reviews
+                )
+
+        # Stage 3: Route
+        if 3 in stages and result.stage2:
+            logger.info("Running Stage 3: Issue Routing")
+            result.stage3 = await self.route(result.stage2)
+
+            if validate:
+                validation_results["stage3"] = await validate_stage3_output(
+                    result.stage3, self._db
+                )
+
+        # Stage 4: Aggregate
+        if 4 in stages:
+            logger.info("Running Stage 4: Aggregation")
+            result.stage4 = await self.aggregate(
+                scraper_output["business_id"],
+                date.today().isoformat(),
+            )
+
+            if validate:
+                validation_results["stage4"] = validate_stage4_output(result.stage4)
+
+        result.validation = validation_results
+        return result
+
+    async def normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
+        """
+        Run Stage 1: Normalization.
+
+        Args:
+            scraper_output: Raw scraper output
+
+        Returns:
+            Stage1Output with normalized reviews
+        """
+        await self.initialize()
+
+        stage1 = Stage1Normalizer(
+            self.config,
+            self._db,
+            self._review_repo,
+        )
+
+        input_data = Stage1Input(
+            job_id=scraper_output["job_id"],
+            business_id=scraper_output["business_id"],
+            place_id=scraper_output["place_id"],
+            reviews=scraper_output["reviews"],
+        )
+
+        return await stage1.process(input_data)
+
+    async def classify(self, stage1_output: Stage1Output) -> Stage2Output:
+        """
+        Run Stage 2: Classification.
+
+        Args:
+            stage1_output: Output from Stage 1
+
+        Returns:
+            Stage2Output with classified reviews
+        """
+        await self.initialize()
+
+        stage2 = Stage2Classifier(
+            self.config,
+            self._db,
+            self._review_repo,
+            self._span_repo,
+            self._embedding_service,
+        )
+
+        # Convert normalized reviews to classification input
+        reviews_to_classify = [
+            ReviewToClassify(
+                source=r["source"],
+                review_id=r["review_id"],
+                review_version=r["review_version"],
+                business_id=r["business_id"],
+                place_id=r["place_id"],
+                text=r["text"],
+                text_normalized=r["text_normalized"],
+                rating=r["rating"],
+                review_time=r["review_time"],
+            )
+            for r in stage1_output["reviews_normalized"]
+        ]
+
+        input_data = Stage2Input(
+            reviews=reviews_to_classify,
+            config=ClassificationConfig(
+                model=self.config.llm_model,
+                taxonomy_version=self.config.taxonomy_version,
+                profile=self.config.classification_profile,
+                max_spans_per_review=self.config.max_spans_per_review,
+            ),
+        )
+
+        try:
+            return await stage2.process(input_data)
+        finally:
+            await stage2.close()
+
+    async def route(self, stage2_output: Stage2Output) -> Stage3Output:
+        """
+        Run Stage 3: Issue Routing.
+
+        Args:
+            stage2_output: Output from Stage 2
+
+        Returns:
+            Stage3Output with routing results
+        """
+        await self.initialize()
+
+        stage3 = Stage3Router(
+            self.config,
+            self._db,
+            self._span_repo,
+            self._issue_repo,
+        )
+
+        # Extract negative/mixed spans for routing
+        spans_to_route = []
+        for review in stage2_output["reviews_classified"]:
+            for span in review.get("spans", []):
+                if span["valence"] in ("V-", "V±"):
+                    spans_to_route.append(
+                        SpanToRoute(
+                            span_id=span["span_id"],
+                            business_id=review.get("business_id", ""),
+                            place_id=review.get("place_id", ""),
+                            urt_primary=span["urt_primary"],
+                            valence=span["valence"],
+                            intensity=span["intensity"],
+                            entity_normalized=span.get("entity_normalized"),
+                            review_time=review.get("review_time", ""),
+                            confidence=span.get("confidence", "medium"),
+                            trust_score=review.get("trust_score", 0.5),
+                        )
+                    )
+
+        return await stage3.process(Stage3Input(spans=spans_to_route))
+
+    async def aggregate(
+        self,
+        business_id: str,
+        date_str: str,
+        bucket_types: list[str] | None = None,
+    ) -> Stage4Output:
+        """
+        Run Stage 4: Fact Aggregation.
+
+        Args:
+            business_id: Business identifier
+            date_str: Date string (YYYY-MM-DD)
+            bucket_types: List of bucket types (default: ['day'])
+
+        Returns:
+            Stage4Output with aggregated facts
+        """
+        await self.initialize()
+
+        stage4 = Stage4Aggregator(
+            self.config,
+            self._db,
+            self._fact_repo,
+        )
+
+        input_data = Stage4Input(
+            business_id=business_id,
+            date=date_str,
+            bucket_types=bucket_types or ["day"],  # type: ignore
+            taxonomy_version=self.config.taxonomy_version,
+        )
+
+        return await stage4.process(input_data)
+
+    async def validate(self, job_id: str) -> dict[str, ValidationResult]:
+        """
+        Validate pipeline output for a job.
+
+        Args:
+            job_id: Job identifier
+
+        Returns:
+            Dictionary of validation results by stage
+        """
+        # This would query the database for the job's output and validate
+        # For now, return empty results
+        logger.warning(f"validate() for job {job_id} not fully implemented")
+        return {}