feat: Add reviewiq-pipeline package for LLM-powered review classification
Implement a standalone Python package for processing customer reviews through a 4-stage pipeline using URT (Universal Review Taxonomy) v5.1: - Stage 1: Normalization (text cleaning, language detection, deduplication) - Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes) - Stage 3: Issue Routing (deterministic issue ID generation, span linking) - Stage 4: Fact Aggregation (time series metrics for dashboards) Package includes: - TypedDict contracts matching Pipeline-Contracts-v1.md - Async database layer with asyncpg and 5 SQL migrations - LLM client abstraction supporting both OpenAI and Anthropic - Sentence-transformers integration for embeddings - Validation rules V1.x through V4.x - CLI commands: migrate, run, validate, check - 55 unit and integration tests (all passing) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
402
packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
Normal file
402
packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
Normal file
@@ -0,0 +1,402 @@
|
||||
"""
|
||||
Pipeline class - main public API for the ReviewIQ pipeline.
|
||||
|
||||
Provides a unified interface for running pipeline stages.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import date
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.contracts import (
|
||||
ClassificationConfig,
|
||||
NormalizedReview,
|
||||
ReviewToClassify,
|
||||
ScraperOutput,
|
||||
SpanToRoute,
|
||||
Stage1Input,
|
||||
Stage1Output,
|
||||
Stage2Input,
|
||||
Stage2Output,
|
||||
Stage3Input,
|
||||
Stage3Output,
|
||||
Stage4Input,
|
||||
Stage4Output,
|
||||
ValidationResult,
|
||||
)
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
from reviewiq_pipeline.db.repositories import (
|
||||
FactRepository,
|
||||
IssueRepository,
|
||||
ReviewRepository,
|
||||
SpanRepository,
|
||||
)
|
||||
from reviewiq_pipeline.services.embeddings import EmbeddingService
|
||||
from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
|
||||
from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
|
||||
from reviewiq_pipeline.stages.stage3_route import Stage3Router
|
||||
from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
|
||||
from reviewiq_pipeline.validation.validators import (
|
||||
validate_stage1_output,
|
||||
validate_stage2_output,
|
||||
validate_stage3_output,
|
||||
validate_stage4_output,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PipelineResult:
|
||||
"""Result from running the full pipeline."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
stage1: Stage1Output | None = None,
|
||||
stage2: Stage2Output | None = None,
|
||||
stage3: Stage3Output | None = None,
|
||||
stage4: Stage4Output | None = None,
|
||||
validation: dict[str, ValidationResult] | None = None,
|
||||
):
|
||||
self.stage1 = stage1
|
||||
self.stage2 = stage2
|
||||
self.stage3 = stage3
|
||||
self.stage4 = stage4
|
||||
self.validation = validation or {}
|
||||
|
||||
@property
|
||||
def success(self) -> bool:
|
||||
"""Check if all ran stages passed validation."""
|
||||
return all(v["passed"] for v in self.validation.values())
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Convert to dictionary."""
|
||||
return {
|
||||
"stage1": self.stage1,
|
||||
"stage2": self.stage2,
|
||||
"stage3": self.stage3,
|
||||
"stage4": self.stage4,
|
||||
"validation": self.validation,
|
||||
"success": self.success,
|
||||
}
|
||||
|
||||
|
||||
class Pipeline:
|
||||
"""
|
||||
Main pipeline class for processing reviews.
|
||||
|
||||
Usage:
|
||||
config = Config(database_url="...", llm_provider="openai", ...)
|
||||
pipeline = Pipeline(config)
|
||||
|
||||
# Run full pipeline
|
||||
result = await pipeline.process(scraper_output)
|
||||
|
||||
# Or run individual stages
|
||||
stage1_result = await pipeline.normalize(scraper_output)
|
||||
stage2_result = await pipeline.classify(stage1_result)
|
||||
"""
|
||||
|
||||
def __init__(self, config: Config):
|
||||
"""
|
||||
Initialize the pipeline.
|
||||
|
||||
Args:
|
||||
config: Pipeline configuration
|
||||
"""
|
||||
self.config = config
|
||||
self._db: DatabasePool | None = None
|
||||
self._review_repo: ReviewRepository | None = None
|
||||
self._span_repo: SpanRepository | None = None
|
||||
self._issue_repo: IssueRepository | None = None
|
||||
self._fact_repo: FactRepository | None = None
|
||||
self._embedding_service: EmbeddingService | None = None
|
||||
self._initialized = False
|
||||
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize database connections and services."""
|
||||
if self._initialized:
|
||||
return
|
||||
|
||||
logger.info("Initializing pipeline...")
|
||||
|
||||
# Initialize database
|
||||
self._db = DatabasePool(self.config)
|
||||
await self._db.initialize()
|
||||
|
||||
# Initialize repositories
|
||||
self._review_repo = ReviewRepository(self._db)
|
||||
self._span_repo = SpanRepository(self._db)
|
||||
self._issue_repo = IssueRepository(self._db)
|
||||
self._fact_repo = FactRepository(self._db)
|
||||
|
||||
# Initialize embedding service
|
||||
self._embedding_service = EmbeddingService(self.config)
|
||||
|
||||
self._initialized = True
|
||||
logger.info("Pipeline initialized")
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close all connections and cleanup resources."""
|
||||
if self._db:
|
||||
await self._db.close()
|
||||
self._db = None
|
||||
|
||||
self._initialized = False
|
||||
logger.info("Pipeline closed")
|
||||
|
||||
async def migrate(self) -> int:
|
||||
"""
|
||||
Run database migrations.
|
||||
|
||||
Returns:
|
||||
Number of migrations run
|
||||
"""
|
||||
if not self._db:
|
||||
self._db = DatabasePool(self.config)
|
||||
await self._db.initialize()
|
||||
|
||||
return await self._db.run_migrations()
|
||||
|
||||
async def process(
|
||||
self,
|
||||
scraper_output: ScraperOutput,
|
||||
stages: list[int] | None = None,
|
||||
validate: bool = True,
|
||||
) -> PipelineResult:
|
||||
"""
|
||||
Run the full pipeline on scraper output.
|
||||
|
||||
Args:
|
||||
scraper_output: Output from the scraper (Stage 0)
|
||||
stages: List of stages to run (default: all [1, 2, 3, 4])
|
||||
validate: Whether to validate each stage output
|
||||
|
||||
Returns:
|
||||
PipelineResult with all stage outputs and validation results
|
||||
"""
|
||||
await self.initialize()
|
||||
|
||||
stages = stages or [1, 2, 3, 4]
|
||||
result = PipelineResult()
|
||||
validation_results: dict[str, ValidationResult] = {}
|
||||
|
||||
# Stage 1: Normalize
|
||||
if 1 in stages:
|
||||
logger.info("Running Stage 1: Normalization")
|
||||
result.stage1 = await self.normalize(scraper_output)
|
||||
|
||||
if validate:
|
||||
validation_results["stage1"] = validate_stage1_output(result.stage1)
|
||||
|
||||
# Stage 2: Classify
|
||||
if 2 in stages and result.stage1:
|
||||
logger.info("Running Stage 2: Classification")
|
||||
result.stage2 = await self.classify(result.stage1)
|
||||
|
||||
if validate:
|
||||
# Build input reviews map for validation
|
||||
input_reviews = {
|
||||
(r["source"], r["review_id"], r["review_version"]): r
|
||||
for r in result.stage1["reviews_normalized"]
|
||||
}
|
||||
validation_results["stage2"] = validate_stage2_output(
|
||||
result.stage2, input_reviews
|
||||
)
|
||||
|
||||
# Stage 3: Route
|
||||
if 3 in stages and result.stage2:
|
||||
logger.info("Running Stage 3: Issue Routing")
|
||||
result.stage3 = await self.route(result.stage2)
|
||||
|
||||
if validate:
|
||||
validation_results["stage3"] = await validate_stage3_output(
|
||||
result.stage3, self._db
|
||||
)
|
||||
|
||||
# Stage 4: Aggregate
|
||||
if 4 in stages:
|
||||
logger.info("Running Stage 4: Aggregation")
|
||||
result.stage4 = await self.aggregate(
|
||||
scraper_output["business_id"],
|
||||
date.today().isoformat(),
|
||||
)
|
||||
|
||||
if validate:
|
||||
validation_results["stage4"] = validate_stage4_output(result.stage4)
|
||||
|
||||
result.validation = validation_results
|
||||
return result
|
||||
|
||||
async def normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
|
||||
"""
|
||||
Run Stage 1: Normalization.
|
||||
|
||||
Args:
|
||||
scraper_output: Raw scraper output
|
||||
|
||||
Returns:
|
||||
Stage1Output with normalized reviews
|
||||
"""
|
||||
await self.initialize()
|
||||
|
||||
stage1 = Stage1Normalizer(
|
||||
self.config,
|
||||
self._db,
|
||||
self._review_repo,
|
||||
)
|
||||
|
||||
input_data = Stage1Input(
|
||||
job_id=scraper_output["job_id"],
|
||||
business_id=scraper_output["business_id"],
|
||||
place_id=scraper_output["place_id"],
|
||||
reviews=scraper_output["reviews"],
|
||||
)
|
||||
|
||||
return await stage1.process(input_data)
|
||||
|
||||
async def classify(self, stage1_output: Stage1Output) -> Stage2Output:
|
||||
"""
|
||||
Run Stage 2: Classification.
|
||||
|
||||
Args:
|
||||
stage1_output: Output from Stage 1
|
||||
|
||||
Returns:
|
||||
Stage2Output with classified reviews
|
||||
"""
|
||||
await self.initialize()
|
||||
|
||||
stage2 = Stage2Classifier(
|
||||
self.config,
|
||||
self._db,
|
||||
self._review_repo,
|
||||
self._span_repo,
|
||||
self._embedding_service,
|
||||
)
|
||||
|
||||
# Convert normalized reviews to classification input
|
||||
reviews_to_classify = [
|
||||
ReviewToClassify(
|
||||
source=r["source"],
|
||||
review_id=r["review_id"],
|
||||
review_version=r["review_version"],
|
||||
business_id=r["business_id"],
|
||||
place_id=r["place_id"],
|
||||
text=r["text"],
|
||||
text_normalized=r["text_normalized"],
|
||||
rating=r["rating"],
|
||||
review_time=r["review_time"],
|
||||
)
|
||||
for r in stage1_output["reviews_normalized"]
|
||||
]
|
||||
|
||||
input_data = Stage2Input(
|
||||
reviews=reviews_to_classify,
|
||||
config=ClassificationConfig(
|
||||
model=self.config.llm_model,
|
||||
taxonomy_version=self.config.taxonomy_version,
|
||||
profile=self.config.classification_profile,
|
||||
max_spans_per_review=self.config.max_spans_per_review,
|
||||
),
|
||||
)
|
||||
|
||||
try:
|
||||
return await stage2.process(input_data)
|
||||
finally:
|
||||
await stage2.close()
|
||||
|
||||
async def route(self, stage2_output: Stage2Output) -> Stage3Output:
|
||||
"""
|
||||
Run Stage 3: Issue Routing.
|
||||
|
||||
Args:
|
||||
stage2_output: Output from Stage 2
|
||||
|
||||
Returns:
|
||||
Stage3Output with routing results
|
||||
"""
|
||||
await self.initialize()
|
||||
|
||||
stage3 = Stage3Router(
|
||||
self.config,
|
||||
self._db,
|
||||
self._span_repo,
|
||||
self._issue_repo,
|
||||
)
|
||||
|
||||
# Extract negative/mixed spans for routing
|
||||
spans_to_route = []
|
||||
for review in stage2_output["reviews_classified"]:
|
||||
for span in review.get("spans", []):
|
||||
if span["valence"] in ("V-", "V±"):
|
||||
spans_to_route.append(
|
||||
SpanToRoute(
|
||||
span_id=span["span_id"],
|
||||
business_id=review.get("business_id", ""),
|
||||
place_id=review.get("place_id", ""),
|
||||
urt_primary=span["urt_primary"],
|
||||
valence=span["valence"],
|
||||
intensity=span["intensity"],
|
||||
entity_normalized=span.get("entity_normalized"),
|
||||
review_time=review.get("review_time", ""),
|
||||
confidence=span.get("confidence", "medium"),
|
||||
trust_score=review.get("trust_score", 0.5),
|
||||
)
|
||||
)
|
||||
|
||||
return await stage3.process(Stage3Input(spans=spans_to_route))
|
||||
|
||||
async def aggregate(
|
||||
self,
|
||||
business_id: str,
|
||||
date_str: str,
|
||||
bucket_types: list[str] | None = None,
|
||||
) -> Stage4Output:
|
||||
"""
|
||||
Run Stage 4: Fact Aggregation.
|
||||
|
||||
Args:
|
||||
business_id: Business identifier
|
||||
date_str: Date string (YYYY-MM-DD)
|
||||
bucket_types: List of bucket types (default: ['day'])
|
||||
|
||||
Returns:
|
||||
Stage4Output with aggregated facts
|
||||
"""
|
||||
await self.initialize()
|
||||
|
||||
stage4 = Stage4Aggregator(
|
||||
self.config,
|
||||
self._db,
|
||||
self._fact_repo,
|
||||
)
|
||||
|
||||
input_data = Stage4Input(
|
||||
business_id=business_id,
|
||||
date=date_str,
|
||||
bucket_types=bucket_types or ["day"], # type: ignore
|
||||
taxonomy_version=self.config.taxonomy_version,
|
||||
)
|
||||
|
||||
return await stage4.process(input_data)
|
||||
|
||||
async def validate(self, job_id: str) -> dict[str, ValidationResult]:
|
||||
"""
|
||||
Validate pipeline output for a job.
|
||||
|
||||
Args:
|
||||
job_id: Job identifier
|
||||
|
||||
Returns:
|
||||
Dictionary of validation results by stage
|
||||
"""
|
||||
# This would query the database for the job's output and validate
|
||||
# For now, return empty results
|
||||
logger.warning(f"validate() for job {job_id} not fully implemented")
|
||||
return {}
|
||||
Reference in New Issue
Block a user