feat: Add reviewiq-pipeline package for LLM-powered review classification

Implement a standalone Python package for processing customer reviews through
a 4-stage pipeline using URT (Universal Review Taxonomy) v5.1:

- Stage 1: Normalization (text cleaning, language detection, deduplication)
- Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes)
- Stage 3: Issue Routing (deterministic issue ID generation, span linking)
- Stage 4: Fact Aggregation (time series metrics for dashboards)

Package includes:
- TypedDict contracts matching Pipeline-Contracts-v1.md
- Async database layer with asyncpg and 5 SQL migrations
- LLM client abstraction supporting both OpenAI and Anthropic
- Sentence-transformers integration for embeddings
- Validation rules V1.x through V4.x
- CLI commands: migrate, run, validate, check
- 55 unit and integration tests (all passing)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 18:07:11 +00:00
parent b780a23b66
commit 7d720f5378
34 changed files with 7222 additions and 0 deletions

View File

@@ -0,0 +1,402 @@
"""
Pipeline class - main public API for the ReviewIQ pipeline.
Provides a unified interface for running pipeline stages.
"""
from __future__ import annotations
import logging
from datetime import date
from typing import TYPE_CHECKING, Any
from reviewiq_pipeline.config import Config
from reviewiq_pipeline.contracts import (
ClassificationConfig,
NormalizedReview,
ReviewToClassify,
ScraperOutput,
SpanToRoute,
Stage1Input,
Stage1Output,
Stage2Input,
Stage2Output,
Stage3Input,
Stage3Output,
Stage4Input,
Stage4Output,
ValidationResult,
)
from reviewiq_pipeline.db.connection import DatabasePool
from reviewiq_pipeline.db.repositories import (
FactRepository,
IssueRepository,
ReviewRepository,
SpanRepository,
)
from reviewiq_pipeline.services.embeddings import EmbeddingService
from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
from reviewiq_pipeline.stages.stage3_route import Stage3Router
from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
from reviewiq_pipeline.validation.validators import (
validate_stage1_output,
validate_stage2_output,
validate_stage3_output,
validate_stage4_output,
)
if TYPE_CHECKING:
pass
logger = logging.getLogger(__name__)
class PipelineResult:
"""Result from running the full pipeline."""
def __init__(
self,
stage1: Stage1Output | None = None,
stage2: Stage2Output | None = None,
stage3: Stage3Output | None = None,
stage4: Stage4Output | None = None,
validation: dict[str, ValidationResult] | None = None,
):
self.stage1 = stage1
self.stage2 = stage2
self.stage3 = stage3
self.stage4 = stage4
self.validation = validation or {}
@property
def success(self) -> bool:
"""Check if all ran stages passed validation."""
return all(v["passed"] for v in self.validation.values())
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary."""
return {
"stage1": self.stage1,
"stage2": self.stage2,
"stage3": self.stage3,
"stage4": self.stage4,
"validation": self.validation,
"success": self.success,
}
class Pipeline:
"""
Main pipeline class for processing reviews.
Usage:
config = Config(database_url="...", llm_provider="openai", ...)
pipeline = Pipeline(config)
# Run full pipeline
result = await pipeline.process(scraper_output)
# Or run individual stages
stage1_result = await pipeline.normalize(scraper_output)
stage2_result = await pipeline.classify(stage1_result)
"""
def __init__(self, config: Config):
"""
Initialize the pipeline.
Args:
config: Pipeline configuration
"""
self.config = config
self._db: DatabasePool | None = None
self._review_repo: ReviewRepository | None = None
self._span_repo: SpanRepository | None = None
self._issue_repo: IssueRepository | None = None
self._fact_repo: FactRepository | None = None
self._embedding_service: EmbeddingService | None = None
self._initialized = False
async def initialize(self) -> None:
"""Initialize database connections and services."""
if self._initialized:
return
logger.info("Initializing pipeline...")
# Initialize database
self._db = DatabasePool(self.config)
await self._db.initialize()
# Initialize repositories
self._review_repo = ReviewRepository(self._db)
self._span_repo = SpanRepository(self._db)
self._issue_repo = IssueRepository(self._db)
self._fact_repo = FactRepository(self._db)
# Initialize embedding service
self._embedding_service = EmbeddingService(self.config)
self._initialized = True
logger.info("Pipeline initialized")
async def close(self) -> None:
"""Close all connections and cleanup resources."""
if self._db:
await self._db.close()
self._db = None
self._initialized = False
logger.info("Pipeline closed")
async def migrate(self) -> int:
"""
Run database migrations.
Returns:
Number of migrations run
"""
if not self._db:
self._db = DatabasePool(self.config)
await self._db.initialize()
return await self._db.run_migrations()
async def process(
self,
scraper_output: ScraperOutput,
stages: list[int] | None = None,
validate: bool = True,
) -> PipelineResult:
"""
Run the full pipeline on scraper output.
Args:
scraper_output: Output from the scraper (Stage 0)
stages: List of stages to run (default: all [1, 2, 3, 4])
validate: Whether to validate each stage output
Returns:
PipelineResult with all stage outputs and validation results
"""
await self.initialize()
stages = stages or [1, 2, 3, 4]
result = PipelineResult()
validation_results: dict[str, ValidationResult] = {}
# Stage 1: Normalize
if 1 in stages:
logger.info("Running Stage 1: Normalization")
result.stage1 = await self.normalize(scraper_output)
if validate:
validation_results["stage1"] = validate_stage1_output(result.stage1)
# Stage 2: Classify
if 2 in stages and result.stage1:
logger.info("Running Stage 2: Classification")
result.stage2 = await self.classify(result.stage1)
if validate:
# Build input reviews map for validation
input_reviews = {
(r["source"], r["review_id"], r["review_version"]): r
for r in result.stage1["reviews_normalized"]
}
validation_results["stage2"] = validate_stage2_output(
result.stage2, input_reviews
)
# Stage 3: Route
if 3 in stages and result.stage2:
logger.info("Running Stage 3: Issue Routing")
result.stage3 = await self.route(result.stage2)
if validate:
validation_results["stage3"] = await validate_stage3_output(
result.stage3, self._db
)
# Stage 4: Aggregate
if 4 in stages:
logger.info("Running Stage 4: Aggregation")
result.stage4 = await self.aggregate(
scraper_output["business_id"],
date.today().isoformat(),
)
if validate:
validation_results["stage4"] = validate_stage4_output(result.stage4)
result.validation = validation_results
return result
async def normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
"""
Run Stage 1: Normalization.
Args:
scraper_output: Raw scraper output
Returns:
Stage1Output with normalized reviews
"""
await self.initialize()
stage1 = Stage1Normalizer(
self.config,
self._db,
self._review_repo,
)
input_data = Stage1Input(
job_id=scraper_output["job_id"],
business_id=scraper_output["business_id"],
place_id=scraper_output["place_id"],
reviews=scraper_output["reviews"],
)
return await stage1.process(input_data)
async def classify(self, stage1_output: Stage1Output) -> Stage2Output:
"""
Run Stage 2: Classification.
Args:
stage1_output: Output from Stage 1
Returns:
Stage2Output with classified reviews
"""
await self.initialize()
stage2 = Stage2Classifier(
self.config,
self._db,
self._review_repo,
self._span_repo,
self._embedding_service,
)
# Convert normalized reviews to classification input
reviews_to_classify = [
ReviewToClassify(
source=r["source"],
review_id=r["review_id"],
review_version=r["review_version"],
business_id=r["business_id"],
place_id=r["place_id"],
text=r["text"],
text_normalized=r["text_normalized"],
rating=r["rating"],
review_time=r["review_time"],
)
for r in stage1_output["reviews_normalized"]
]
input_data = Stage2Input(
reviews=reviews_to_classify,
config=ClassificationConfig(
model=self.config.llm_model,
taxonomy_version=self.config.taxonomy_version,
profile=self.config.classification_profile,
max_spans_per_review=self.config.max_spans_per_review,
),
)
try:
return await stage2.process(input_data)
finally:
await stage2.close()
async def route(self, stage2_output: Stage2Output) -> Stage3Output:
"""
Run Stage 3: Issue Routing.
Args:
stage2_output: Output from Stage 2
Returns:
Stage3Output with routing results
"""
await self.initialize()
stage3 = Stage3Router(
self.config,
self._db,
self._span_repo,
self._issue_repo,
)
# Extract negative/mixed spans for routing
spans_to_route = []
for review in stage2_output["reviews_classified"]:
for span in review.get("spans", []):
if span["valence"] in ("V-", ""):
spans_to_route.append(
SpanToRoute(
span_id=span["span_id"],
business_id=review.get("business_id", ""),
place_id=review.get("place_id", ""),
urt_primary=span["urt_primary"],
valence=span["valence"],
intensity=span["intensity"],
entity_normalized=span.get("entity_normalized"),
review_time=review.get("review_time", ""),
confidence=span.get("confidence", "medium"),
trust_score=review.get("trust_score", 0.5),
)
)
return await stage3.process(Stage3Input(spans=spans_to_route))
async def aggregate(
self,
business_id: str,
date_str: str,
bucket_types: list[str] | None = None,
) -> Stage4Output:
"""
Run Stage 4: Fact Aggregation.
Args:
business_id: Business identifier
date_str: Date string (YYYY-MM-DD)
bucket_types: List of bucket types (default: ['day'])
Returns:
Stage4Output with aggregated facts
"""
await self.initialize()
stage4 = Stage4Aggregator(
self.config,
self._db,
self._fact_repo,
)
input_data = Stage4Input(
business_id=business_id,
date=date_str,
bucket_types=bucket_types or ["day"], # type: ignore
taxonomy_version=self.config.taxonomy_version,
)
return await stage4.process(input_data)
async def validate(self, job_id: str) -> dict[str, ValidationResult]:
"""
Validate pipeline output for a job.
Args:
job_id: Job identifier
Returns:
Dictionary of validation results by stage
"""
# This would query the database for the job's output and validate
# For now, return empty results
logger.warning(f"validate() for job {job_id} not fully implemented")
return {}