feat: Add reviewiq-pipeline package for LLM-powered review classification
Implement a standalone Python package for processing customer reviews through a
4-stage pipeline using URT (Universal Review Taxonomy) v5.1:

- Stage 1: Normalization (text cleaning, language detection, deduplication)
- Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes)
- Stage 3: Issue Routing (deterministic issue ID generation, span linking)
- Stage 4: Fact Aggregation (time series metrics for dashboards)

Package includes:

- TypedDict contracts matching Pipeline-Contracts-v1.md
- Async database layer with asyncpg and 5 SQL migrations
- LLM client abstraction supporting both OpenAI and Anthropic
- Sentence-transformers integration for embeddings
- Validation rules V1.x through V4.x
- CLI commands: migrate, run, validate, check
- 55 unit and integration tests (all passing)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
56
packages/reviewiq-pipeline/src/reviewiq_pipeline/__init__.py
Normal file
56
packages/reviewiq-pipeline/src/reviewiq_pipeline/__init__.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""
|
||||
ReviewIQ Pipeline - LLM-powered review classification and analysis.
|
||||
|
||||
This package provides a complete pipeline for processing customer reviews:
|
||||
- Stage 1: Normalization (text cleaning, language detection, deduplication)
|
||||
- Stage 2: LLM Classification (span extraction with URT codes)
|
||||
- Stage 3: Issue Routing (route negative spans to issues)
|
||||
- Stage 4: Fact Aggregation (pre-aggregate metrics for dashboards)
|
||||
"""
|
||||
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.contracts import (
|
||||
ClassifiedReview,
|
||||
ExtractedSpan,
|
||||
FactRecord,
|
||||
NormalizedReview,
|
||||
RawReview,
|
||||
RoutedSpan,
|
||||
ScraperOutput,
|
||||
Stage1Input,
|
||||
Stage1Output,
|
||||
Stage2Input,
|
||||
Stage2Output,
|
||||
Stage3Input,
|
||||
Stage3Output,
|
||||
Stage4Input,
|
||||
Stage4Output,
|
||||
ValidationError,
|
||||
ValidationResult,
|
||||
)
|
||||
from reviewiq_pipeline.pipeline import Pipeline
|
||||
|
||||
# Package version string; the CLI module imports it for its version output.
__version__ = "0.1.0"

# Public API of the package: the names exposed by
# `from reviewiq_pipeline import *`. Every entry is imported above.
__all__ = [
    # Main API
    "Pipeline",
    "Config",
    # Contracts
    "ScraperOutput",
    "RawReview",
    "Stage1Input",
    "Stage1Output",
    "NormalizedReview",
    "Stage2Input",
    "Stage2Output",
    "ClassifiedReview",
    "ExtractedSpan",
    "Stage3Input",
    "Stage3Output",
    "RoutedSpan",
    "Stage4Input",
    "Stage4Output",
    "FactRecord",
    "ValidationResult",
    "ValidationError",
]
|
||||
322
packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py
Normal file
322
packages/reviewiq-pipeline/src/reviewiq_pipeline/cli.py
Normal file
@@ -0,0 +1,322 @@
|
||||
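"""
CLI for the ReviewIQ pipeline.

Usage:
    reviewiq-pipeline migrate --database-url $DATABASE_URL
    reviewiq-pipeline run --job-id <UUID> --stages 1,2,3,4
    reviewiq-pipeline validate --job-id <UUID>
    reviewiq-pipeline check --database-url $DATABASE_URL
    reviewiq-pipeline version

The migrate, run, validate and check commands require --database-url; when the
flag is omitted its value is read from the DATABASE_URL environment variable.
"""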
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
import click
|
||||
|
||||
from reviewiq_pipeline import __version__
|
||||
|
||||
# Process-wide logging setup for the CLI: INFO level, timestamped records.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger named after the package (not this module).
logger = logging.getLogger("reviewiq_pipeline")
|
||||
|
||||
|
||||
def get_config(**overrides: Any):
    """Build a ``Config`` from keyword overrides, dropping any that are ``None``.

    Filtering out ``None`` means an option the user did not supply is simply
    not passed, so ``Config`` resolves that setting on its own.

    Args:
        **overrides: Config field names mapped to the values to apply.

    Returns:
        A ``Config`` instance constructed from the non-``None`` overrides.
    """
    # Imported lazily, like the other project imports in this module.
    from reviewiq_pipeline.config import Config

    supplied = {}
    for field_name, value in overrides.items():
        if value is None:
            continue
        supplied[field_name] = value
    return Config(**supplied)
|
||||
|
||||
|
||||
# Root command group; the subcommands below attach to it via @main.command().
@click.group()
@click.version_option(version=__version__)
def main():
    """ReviewIQ Pipeline - LLM-powered review classification."""
    # Intentionally empty: the group itself performs no work.
|
||||
|
||||
|
||||
# `reviewiq-pipeline migrate`: applies database migrations and reports how
# many ran. Exits with status 1 if anything raises.
@main.command()
@click.option(
    "--database-url",
    envvar="DATABASE_URL",
    required=True,
    help="PostgreSQL connection string",
)
def migrate(database_url: str):
    """Run database migrations."""

    async def _apply_migrations():
        from reviewiq_pipeline.db.connection import DatabasePool

        pool = DatabasePool(get_config(database_url=database_url))

        try:
            await pool.initialize()
            applied = await pool.run_migrations()
            click.echo(f"Successfully ran {applied} migrations")
        except Exception as exc:
            # Any failure is reported on stderr and turned into exit code 1.
            click.echo(f"Migration failed: {exc}", err=True)
            sys.exit(1)
        finally:
            # Runs on success, on failure, and while SystemExit propagates.
            await pool.close()

    asyncio.run(_apply_migrations())
|
||||
|
||||
|
||||
def _parse_stages(stages: str) -> list[int]:
    """Convert the ``--stages`` value (e.g. ``"1,2,3,4"``) into stage numbers.

    Blank entries (such as a trailing comma) are ignored.

    Raises:
        click.BadParameter: If an entry is not an integer, is not one of the
            four pipeline stages, or no stage is left after parsing.
    """
    parsed: list[int] = []
    for token in stages.split(","):
        token = token.strip()
        if not token:
            continue
        try:
            number = int(token)
        except ValueError:
            raise click.BadParameter(
                f"{token!r} is not a stage number", param_hint="--stages"
            ) from None
        if number not in (1, 2, 3, 4):
            raise click.BadParameter(
                f"unknown stage {number}; expected 1, 2, 3 or 4",
                param_hint="--stages",
            )
        parsed.append(number)
    if not parsed:
        raise click.BadParameter(
            "at least one stage is required", param_hint="--stages"
        )
    return parsed


# `reviewiq-pipeline run`: loads a job row, rebuilds the scraper output from
# it, runs the requested stages and prints a per-stage summary.
# Exit status: 0 on success, 1 if the job is missing, the pipeline raised, or
# the result is not successful; a malformed --stages value is reported by
# click as a usage error before any connection is opened.
@main.command()
@click.option(
    "--job-id",
    required=True,
    help="Job ID to process",
)
@click.option(
    "--stages",
    default="1,2,3,4",
    help="Comma-separated list of stages to run (default: 1,2,3,4)",
)
@click.option(
    "--database-url",
    envvar="DATABASE_URL",
    required=True,
    help="PostgreSQL connection string",
)
@click.option(
    "--llm-provider",
    envvar="LLM_PROVIDER",
    type=click.Choice(["openai", "anthropic"]),
    default="openai",
    help="LLM provider",
)
@click.option(
    "--llm-model",
    envvar="LLM_MODEL",
    default="gpt-4o-mini",
    help="LLM model to use",
)
@click.option(
    "--openai-api-key",
    envvar="OPENAI_API_KEY",
    help="OpenAI API key",
)
@click.option(
    "--anthropic-api-key",
    envvar="ANTHROPIC_API_KEY",
    help="Anthropic API key",
)
@click.option(
    "--validate/--no-validate",
    default=True,
    help="Validate output after each stage",
)
@click.option(
    "--output",
    type=click.Path(),
    help="Output file for results (JSON)",
)
def run(
    job_id: str,
    stages: str,
    database_url: str,
    llm_provider: str,
    llm_model: str,
    openai_api_key: str | None,
    anthropic_api_key: str | None,
    validate: bool,
    output: str | None,
):
    """Run pipeline stages for a job."""
    # Parse up front so a bad --stages value becomes a usage error instead of
    # a traceback, and no database connection is opened for it.
    stage_list = _parse_stages(stages)

    async def _run():
        from reviewiq_pipeline import Pipeline

        config = get_config(
            database_url=database_url,
            llm_provider=llm_provider,
            llm_model=llm_model,
            openai_api_key=openai_api_key,
            anthropic_api_key=anthropic_api_key,
        )

        pipeline = Pipeline(config)

        try:
            await pipeline.initialize()

            # Fetch job from database
            # NOTE(review): this reaches into Pipeline's private `_db`
            # attribute; a public accessor would be safer.
            job_data = await pipeline._db.fetchrow(
                "SELECT * FROM jobs WHERE job_id = $1",
                job_id,
            )

            if not job_data:
                click.echo(f"Job {job_id} not found", err=True)
                sys.exit(1)

            # Build scraper output from job data
            reviews_data = job_data.get("reviews_data") or {}
            if isinstance(reviews_data, str):
                # A json/jsonb column comes back as text unless the driver has
                # a JSON codec registered; decode it before using it as a dict.
                reviews_data = json.loads(reviews_data)
            # Treat an explicit null the same as a missing list.
            reviews = reviews_data.get("reviews") or []
            scraper_output = {
                "job_id": job_id,
                "status": job_data.get("status", "completed"),
                "business_id": reviews_data.get("business_id", job_id),
                "place_id": reviews_data.get("place_id", ""),
                "business_info": reviews_data.get("business_info", {}),
                "reviews": reviews,
                "scrape_time_ms": 0,
                "reviews_scraped": len(reviews),
                "scraper_version": "v1.0.0",
            }

            # Run pipeline
            result = await pipeline.process(
                scraper_output,
                stages=stage_list,
                validate=validate,
            )

            # Output results
            if result.success:
                click.echo(click.style("Pipeline completed successfully!", fg="green"))
            else:
                click.echo(click.style("Pipeline completed with validation errors", fg="yellow"))

            # Print summary (only for stages that produced a result)
            if result.stage1:
                click.echo(f" Stage 1: {result.stage1['stats']['output_count']} reviews normalized")
            if result.stage2:
                click.echo(f" Stage 2: {result.stage2['stats']['success_count']} reviews classified")
            if result.stage3:
                click.echo(f" Stage 3: {result.stage3['stats']['spans_routed']} spans routed")
            if result.stage4:
                click.echo(f" Stage 4: {result.stage4['stats']['facts_upserted']} facts written")

            # Validation summary
            for stage, validation in result.validation.items():
                status = "PASS" if validation["passed"] else f"FAIL ({validation['error_count']} errors)"
                click.echo(f" {stage} validation: {status}")

            # Write output file
            if output:
                with open(output, "w", encoding="utf-8") as f:
                    # default=str stringifies values json cannot encode natively.
                    json.dump(result.to_dict(), f, indent=2, default=str)
                click.echo(f"Results written to {output}")

            if not result.success:
                sys.exit(1)

        except Exception as e:
            # SystemExit is not an Exception subclass, so the sys.exit() calls
            # above pass straight through this handler.
            click.echo(f"Pipeline failed: {e}", err=True)
            logger.exception("Pipeline error")
            sys.exit(1)
        finally:
            await pipeline.close()

    asyncio.run(_run())
|
||||
|
||||
|
||||
# `reviewiq-pipeline validate`: prints PASS/FAIL per stage for a job, listing
# at most the first 10 errors of each failing stage.
# Exit status: 0 only if at least one stage was reported and all reported
# stages passed; 1 otherwise.
@main.command()
@click.option(
    "--job-id",
    required=True,
    help="Job ID to validate",
)
@click.option(
    "--database-url",
    envvar="DATABASE_URL",
    required=True,
    help="PostgreSQL connection string",
)
@click.option(
    "--stage",
    type=click.Choice(["1", "2", "3", "4", "all"]),
    default="all",
    help="Stage to validate (default: all)",
)
def validate(job_id: str, database_url: str, stage: str):
    """Validate pipeline output for a job."""

    async def _validate():
        from reviewiq_pipeline import Pipeline

        config = get_config(database_url=database_url)
        pipeline = Pipeline(config)

        try:
            await pipeline.initialize()

            results = await pipeline.validate(job_id)

            all_passed = True
            reported = 0
            for stage_name, validation in results.items():
                # Results are matched by key name "stage<N>" when one stage
                # was requested.
                if stage != "all" and f"stage{stage}" != stage_name:
                    continue
                reported += 1

                status = "PASS" if validation["passed"] else "FAIL"
                color = "green" if validation["passed"] else "red"
                click.echo(click.style(f"{stage_name}: {status}", fg=color))

                if not validation["passed"]:
                    all_passed = False
                    for error in validation["errors"][:10]:
                        click.echo(f" - [{error['rule']}] {error['identifier']}: {error['message']}")
                    if validation["error_count"] > 10:
                        click.echo(f" ... and {validation['error_count'] - 10} more errors")

            if reported == 0:
                # Nothing was checked: do not let that look like a pass.
                scope = "any stage" if stage == "all" else f"stage {stage}"
                click.echo(
                    f"No validation results found for {scope} of job {job_id}",
                    err=True,
                )
                sys.exit(1)

            if not all_passed:
                sys.exit(1)

        except Exception as e:
            # sys.exit() raises SystemExit, which this handler does not catch.
            click.echo(f"Validation failed: {e}", err=True)
            sys.exit(1)
        finally:
            await pipeline.close()

    asyncio.run(_validate())
|
||||
|
||||
|
||||
# `reviewiq-pipeline check`: verifies that the database can be reached.
# Exit status: 0 when the connection check succeeds, 1 when it reports
# failure or when connecting raises.
@main.command()
@click.option(
    "--database-url",
    envvar="DATABASE_URL",
    required=True,
    help="PostgreSQL connection string",
)
def check(database_url: str):
    """Check database connection."""

    async def _check():
        from reviewiq_pipeline.db.connection import DatabasePool

        config = get_config(database_url=database_url)
        db = DatabasePool(config)

        try:
            await db.initialize()
            if await db.check_connection():
                click.echo(click.style("Database connection OK", fg="green"))
            else:
                click.echo(click.style("Database connection failed", fg="red"))
                sys.exit(1)
        except Exception as e:
            # An unreachable database typically fails inside initialize();
            # report it like the other commands do instead of a traceback.
            click.echo(
                click.style(f"Database connection failed: {e}", fg="red"),
                err=True,
            )
            sys.exit(1)
        finally:
            await db.close()

    asyncio.run(_check())
|
||||
|
||||
|
||||
# `reviewiq-pipeline version`: prints the package name and version.
@main.command()
def version():
    """Show version information."""
    banner = f"reviewiq-pipeline {__version__}"
    click.echo(banner)
|
||||
|
||||
|
||||
# Invoke the click group when this file is executed as a script rather than
# imported.
if __name__ == "__main__":
    main()
|
||||
177
packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py
Normal file
177
packages/reviewiq-pipeline/src/reviewiq_pipeline/config.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""Configuration management for the ReviewIQ pipeline."""
|
||||
|
||||
from __future__ import annotations

from typing import Literal

from pydantic import Field, SecretStr, field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Config(BaseSettings):
    """Pipeline configuration loaded from environment variables or passed directly.

    Settings may be supplied as keyword arguments, as ``REVIEWIQ_``-prefixed
    environment variables, or through a ``.env`` file; unknown keys are
    ignored (``extra="ignore"``).

    Raises:
        pydantic.ValidationError: If a value violates its declared bounds, or
            if ``db_pool_min_size`` is greater than ``db_pool_max_size``.
    """

    model_config = SettingsConfigDict(
        env_prefix="REVIEWIQ_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Database
    database_url: str = Field(
        default="postgresql://localhost:5432/reviewiq",
        description="PostgreSQL connection string",
    )
    db_pool_min_size: int = Field(default=2, ge=1, le=50)
    db_pool_max_size: int = Field(default=10, ge=1, le=100)

    # LLM Provider
    llm_provider: Literal["openai", "anthropic"] = Field(
        default="openai",
        description="LLM provider to use for classification",
    )
    # Keys are SecretStr so they are masked when the config is printed.
    openai_api_key: SecretStr | None = Field(
        default=None,
        description="OpenAI API key",
    )
    anthropic_api_key: SecretStr | None = Field(
        default=None,
        description="Anthropic API key",
    )

    # Model settings
    llm_model: str = Field(
        default="gpt-4o-mini",
        description="LLM model to use for classification",
    )
    llm_temperature: float = Field(default=0.0, ge=0.0, le=2.0)
    llm_max_retries: int = Field(default=3, ge=1, le=10)
    llm_timeout_seconds: int = Field(default=60, ge=10, le=300)

    # Embedding settings
    embedding_model: str = Field(
        default="all-MiniLM-L6-v2",
        description="Sentence transformer model for embeddings",
    )
    embedding_dimension: int = Field(
        default=384,
        description="Expected embedding dimension",
    )

    # Taxonomy
    taxonomy_version: str = Field(
        default="v5.1",
        description="URT taxonomy version",
    )

    # Classification
    classification_profile: Literal["lite", "core", "standard", "full"] = Field(
        default="standard",
        description="Classification profile to use",
    )
    max_spans_per_review: int = Field(default=10, ge=1, le=20)

    # Processing
    batch_size: int = Field(default=50, ge=1, le=500)
    trust_score_floor: float = Field(default=0.2, ge=0.0, le=1.0)

    # Migrations
    migrations_path: str = Field(
        default="",
        description="Path to migrations directory (empty for default)",
    )

    @field_validator("llm_provider")
    @classmethod
    def validate_provider_api_key(cls, v: str) -> str:
        """Validate that provider is supported.

        Despite the name, this does not inspect any API key; key presence is
        checked lazily in ``get_llm_api_key``. The check overlaps with the
        ``Literal`` annotation on the field and is kept as a safeguard.
        """
        if v not in ("openai", "anthropic"):
            raise ValueError(f"Unsupported LLM provider: {v}")
        return v

    @model_validator(mode="after")
    def validate_pool_bounds(self) -> Config:
        """Reject a connection pool whose minimum size exceeds its maximum.

        Each bound is range-checked on its own by ``Field``; this catches the
        inconsistent combination when the config is built rather than when
        the pool is created.
        """
        if self.db_pool_min_size > self.db_pool_max_size:
            raise ValueError(
                f"db_pool_min_size ({self.db_pool_min_size}) must not exceed "
                f"db_pool_max_size ({self.db_pool_max_size})"
            )
        return self

    def get_llm_api_key(self) -> str:
        """Get the API key for the configured LLM provider.

        Returns:
            The plain-text key for ``llm_provider``.

        Raises:
            ValueError: If the key for the selected provider is not set, or
                the provider is not recognised.
        """
        if self.llm_provider == "openai":
            if self.openai_api_key is None:
                raise ValueError("OpenAI API key is required when llm_provider is 'openai'")
            return self.openai_api_key.get_secret_value()
        elif self.llm_provider == "anthropic":
            if self.anthropic_api_key is None:
                raise ValueError("Anthropic API key is required when llm_provider is 'anthropic'")
            return self.anthropic_api_key.get_secret_value()
        else:
            # Defensive: the field's Literal type should already rule this out.
            raise ValueError(f"Unsupported LLM provider: {self.llm_provider}")

    @property
    def effective_migrations_path(self) -> str:
        """Get the effective migrations path.

        Returns ``migrations_path`` when it is non-empty; otherwise the
        ``migrations`` directory inside the ``reviewiq_pipeline.db`` package.
        """
        if self.migrations_path:
            return self.migrations_path
        # Default to package's migrations directory
        import importlib.resources

        try:
            # importlib.resources.files() is available from Python 3.9.
            return str(importlib.resources.files("reviewiq_pipeline.db") / "migrations")
        except (AttributeError, TypeError):
            # Fallback for interpreters without files(): resolve relative to
            # this module's location on disk.
            import os

            return os.path.join(os.path.dirname(__file__), "db", "migrations")
|
||||
|
||||
|
||||
class ClassificationConfig:
    """Settings view used by the LLM classification stage.

    Copies the model, taxonomy, profile and LLM call settings off a
    ``Config`` at construction time.
    """

    def __init__(self, config: Config):
        # Values that are also exported by to_dict().
        self.model = config.llm_model
        self.taxonomy_version = config.taxonomy_version
        self.profile = config.classification_profile
        self.max_spans_per_review = config.max_spans_per_review

        # LLM call tuning; not part of the exported dictionary.
        self.temperature = config.llm_temperature
        self.max_retries = config.llm_max_retries
        self.timeout_seconds = config.llm_timeout_seconds

    def to_dict(self) -> dict:
        """Return the four contract fields as a plain dictionary.

        Only model, taxonomy_version, profile and max_spans_per_review are
        included; temperature, retries and timeout are left out.
        """
        return dict(
            model=self.model,
            taxonomy_version=self.taxonomy_version,
            profile=self.profile,
            max_spans_per_review=self.max_spans_per_review,
        )
|
||||
|
||||
|
||||
class EmbeddingConfig:
    """Settings view used by the embedding service.

    Holds the embedding model name and the expected vector dimension taken
    from a ``Config``.
    """

    def __init__(self, config: Config):
        self.model, self.dimension = (
            config.embedding_model,
            config.embedding_dimension,
        )

    def to_dict(self) -> dict:
        """Return the model name and dimension as a plain dictionary."""
        return dict(model=self.model, dimension=self.dimension)
|
||||
|
||||
|
||||
class DatabaseConfig:
    """Settings view used for database connections.

    Captures the connection URL, the pool size bounds and the resolved
    migrations directory from a ``Config``.
    """

    def __init__(self, config: Config):
        self.url = config.database_url
        self.pool_min_size = config.db_pool_min_size
        self.pool_max_size = config.db_pool_max_size
        # Uses the resolved path, so an empty `migrations_path` setting has
        # already been replaced by the packaged default here.
        self.migrations_path = config.effective_migrations_path

    def to_dict(self) -> dict:
        """Return all four settings as a plain dictionary."""
        return dict(
            url=self.url,
            pool_min_size=self.pool_min_size,
            pool_max_size=self.pool_max_size,
            migrations_path=self.migrations_path,
        )
|
||||
648
packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py
Normal file
648
packages/reviewiq-pipeline/src/reviewiq_pipeline/contracts.py
Normal file
@@ -0,0 +1,648 @@
|
||||
"""
|
||||
TypedDict definitions for pipeline stage inputs and outputs.
|
||||
|
||||
These contracts define the data structures passed between pipeline stages,
|
||||
enabling independent development and validation of each stage.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Literal, TypedDict
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Common Types
|
||||
# =============================================================================
|
||||
|
||||
# Closed sets of string codes referenced by the TypedDicts in this module.
# NOTE(review): the meaning of the individual codes (V+, I1, TC, ...) is
# defined by the URT taxonomy, which is not visible here — confirm against
# the taxonomy documentation before documenting them further.
ValenceType = Literal["V+", "V-", "V0", "V±"]
IntensityType = Literal["I1", "I2", "I3"]
SpecificityType = Literal["S1", "S2", "S3"]
ActionabilityType = Literal["A1", "A2", "A3"]
TemporalType = Literal["TC", "TR", "TH", "TF"]
EvidenceType = Literal["ES", "EI", "EC"]
ComparativeType = Literal["CR-N", "CR-B", "CR-W", "CR-S"]
# Confidence label attached to an extracted span.
ConfidenceType = Literal["high", "medium", "low"]
# Kind of entity a span refers to.
EntityTypeValue = Literal["location", "staff", "product", "process", "time", "other"]
# How a span relates to another span.
RelationType = Literal["cause_of", "effect_of", "contrast", "resolution"]
# Classification profile names.
ProfileType = Literal["lite", "core", "standard", "full"]
# Time bucket granularities for aggregated facts.
BucketType = Literal["day", "week", "month"]
# What a fact record is about.
SubjectType = Literal["overall", "urt_code", "domain", "issue"]
# Lifecycle states of an issue.
IssueState = Literal["open", "resolved", "ignored", "merged"]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Validation Types
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ValidationError(TypedDict):
    """A single validation error.

    Note: despite the name this is a dictionary shape, not an exception class.
    """

    # Name of the validation rule that produced the error.
    rule: str
    # Identifies the item the error refers to.
    identifier: str
    # Human-readable description of the problem.
    message: str
|
||||
|
||||
|
||||
class ValidationResult(TypedDict):
    """Result of validating a stage output."""

    # Name of the stage this result belongs to.
    stage: str
    # True when the stage output passed validation.
    passed: bool
    # Number of errors; presumably len(errors) — TODO confirm with the producer.
    error_count: int
    errors: list[ValidationError]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 0: Raw Ingestion (from Scraper)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class BusinessInfo(TypedDict):
    """Business metadata from scraper."""

    # All five keys are required (the class does not set total=False).
    name: str
    address: str
    category: str
    total_reviews: int
    average_rating: float
|
||||
|
||||
|
||||
class RawReview(TypedDict, total=False):
    """Raw review as scraped from Google Maps."""

    # total=False: every key is optional as far as type checkers are
    # concerned, so consumers must not assume a key is present.
    review_id: str
    author_name: str
    author_id: str | None
    rating: int
    # May be None (explicitly allowed by the annotation).
    text: str | None
    # Timestamps are carried as strings; format not specified here.
    review_time: str
    response_text: str | None
    response_time: str | None
    photos: list[str] | None
    # Presumably the unmodified scraper payload — TODO confirm.
    raw_payload: dict[str, Any]
|
||||
|
||||
|
||||
class ScraperOutput(TypedDict):
    """Output from the scraper (Stage 0), input to pipeline."""

    job_id: str
    # Outcome of the scrape job; restricted to these three values.
    status: Literal["completed", "failed", "partial"]
    business_id: str
    place_id: str
    business_info: BusinessInfo
    reviews: list[RawReview]
    scrape_time_ms: int
    # Count of scraped reviews; presumably len(reviews) — TODO confirm.
    reviews_scraped: int
    scraper_version: str
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 1: Normalization
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class Stage1Input(TypedDict):
    """Input to Stage 1 normalization."""

    # A subset of the ScraperOutput keys: identifiers plus the raw reviews.
    job_id: str
    business_id: str
    place_id: str
    reviews: list[RawReview]
|
||||
|
||||
|
||||
class NormalizedReview(TypedDict, total=False):
    """A normalized review ready for classification."""

    # total=False: every key is optional as far as type checkers are concerned.

    # Identity (composite key)
    # Only "google" is accepted as a source.
    source: Literal["google"]
    review_id: str
    review_version: int

    # Tenant context
    business_id: str
    place_id: str

    # Content
    text: str
    text_normalized: str
    text_language: str
    text_length: int
    word_count: int

    # Metadata
    rating: int
    review_time: str
    author_name: str
    author_id: str | None

    # Dedup
    content_hash: str
    dedup_group_id: str | None

    # Reference
    # Presumably the id of the stored raw review (cf. ReviewRaw.id) — TODO confirm.
    raw_id: int
|
||||
|
||||
|
||||
class Stage1Stats(TypedDict):
    """Statistics from Stage 1 processing."""

    input_count: int
    # Reported by the CLI as the number of reviews normalized.
    output_count: int
    skipped_empty: int
    skipped_duplicate: int
|
||||
|
||||
|
||||
class Stage1Output(TypedDict):
    """Output from Stage 1 normalization."""

    # Identifiers mirror the keys of Stage1Input.
    job_id: str
    business_id: str
    place_id: str
    reviews_normalized: list[NormalizedReview]
    stats: Stage1Stats
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 2: LLM Classification
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ReviewToClassify(TypedDict):
    """A review to be classified by the LLM."""

    # All keys are required. The names match a subset of NormalizedReview.
    # NOTE(review): `source` is a plain str here but Literal["google"] on
    # NormalizedReview.
    source: str
    review_id: str
    review_version: int
    business_id: str
    place_id: str
    text: str
    text_normalized: str
    rating: int
    review_time: str
|
||||
|
||||
|
||||
class ClassificationConfig(TypedDict):
    """Configuration for LLM classification."""

    # Dictionary shape of the classification settings passed in Stage2Input.
    model: str
    taxonomy_version: str
    profile: ProfileType
    max_spans_per_review: int
|
||||
|
||||
|
||||
class Stage2Input(TypedDict):
    """Input to Stage 2 classification."""

    # Reviews to classify and the settings to classify them with.
    reviews: list[ReviewToClassify]
    config: ClassificationConfig
|
||||
|
||||
|
||||
class CausalLink(TypedDict):
    """A link in a causal chain."""

    # Presumably a URT code — TODO confirm.
    code: str
    # The part this link plays within the chain.
    role: Literal["cause", "effect", "context", "outcome"]
    # Position of the link; numbering base not specified here.
    order: int
|
||||
|
||||
|
||||
class ExtractedSpan(TypedDict, total=False):
    """A span extracted from a review with URT classification."""

    # total=False: every key is optional as far as type checkers are
    # concerned; the section comments below say which profiles use which keys.

    # Identity
    span_id: str
    span_index: int

    # Position (offsets into original text)
    span_text: str
    span_start: int
    # NOTE(review): whether span_end is inclusive or exclusive is not stated.
    span_end: int

    # Classification
    profile: ProfileType
    urt_primary: str
    urt_secondary: list[str]
    valence: ValenceType
    intensity: IntensityType
    comparative: ComparativeType

    # Extended (standard/full profile)
    specificity: SpecificityType
    actionability: ActionabilityType
    temporal: TemporalType
    evidence: EvidenceType

    # Entity
    entity: str | None
    entity_type: EntityTypeValue | None
    entity_normalized: str | None

    # Causal (full profile)
    relation_type: RelationType | None
    # Refers to another span by index (the database shape ReviewSpan uses
    # related_span_id instead).
    related_span_index: int | None
    causal_chain: list[CausalLink] | None

    # Metadata
    confidence: ConfidenceType
    usn: str

    # Flags
    is_primary: bool
|
||||
|
||||
|
||||
class ClassifiedReview(TypedDict, total=False):
    """A review with LLM classification results."""

    # total=False: every key is optional as far as type checkers are concerned.

    # Identity
    source: str
    review_id: str
    review_version: int

    # Review-level classification (from primary span)
    urt_primary: str
    urt_secondary: list[str]
    valence: ValenceType
    intensity: IntensityType
    comparative: ComparativeType

    # Extracted entities
    staff_mentions: list[str]
    quotes: dict[str, str]

    # Trust score
    trust_score: float

    # Embedding
    embedding: list[float]

    # Spans
    spans: list[ExtractedSpan]

    # Processing metadata
    classification_confidence: dict[str, float]
    processing_time_ms: int
|
||||
|
||||
|
||||
class Stage2Stats(TypedDict):
    """Statistics from Stage 2 processing."""

    input_count: int
    # Reported by the CLI as the number of reviews classified.
    success_count: int
    error_count: int
    total_spans: int
    avg_spans_per_review: float
    llm_tokens_used: int
    llm_cost_usd: float
|
||||
|
||||
|
||||
class Stage2Output(TypedDict):
    """Output from Stage 2 classification."""

    batch_id: str
    # Version identifiers recorded alongside the classified reviews.
    taxonomy_version: str
    model_version: str
    prompt_version: str
    reviews_classified: list[ClassifiedReview]
    stats: Stage2Stats
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 3: Issue Routing
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class SpanToRoute(TypedDict):
    """A span to be routed to an issue."""

    span_id: str
    business_id: str
    place_id: str
    urt_primary: str
    # NOTE(review): valence, intensity and confidence are plain str here,
    # whereas ExtractedSpan types them as ValenceType, IntensityType and
    # ConfidenceType.
    valence: str
    intensity: str
    entity_normalized: str | None
    review_time: str
    confidence: str
    trust_score: float
|
||||
|
||||
|
||||
class Stage3Input(TypedDict):
    """Input to Stage 3 issue routing."""

    # The only key: the spans to route.
    spans: list[SpanToRoute]
|
||||
|
||||
|
||||
class RoutedSpan(TypedDict):
    """A span that has been routed to an issue."""

    span_id: str
    # Issue the span was attached to.
    issue_id: str
    routing_key: str
    # Presumably True when routing created the issue — TODO confirm.
    is_new_issue: bool
|
||||
|
||||
|
||||
class Stage3Stats(TypedDict):
    """Statistics from Stage 3 processing."""

    spans_processed: int
    # Reported by the CLI as the number of spans routed.
    spans_routed: int
    spans_skipped: int
    # Counts here; Stage3Output carries the matching lists under the same names.
    issues_created: int
    issues_updated: int
|
||||
|
||||
|
||||
class Stage3Output(TypedDict):
    """Output from Stage 3 issue routing."""

    routed_spans: list[RoutedSpan]
    # Lists of strings, presumably issue ids — TODO confirm. The same key
    # names hold integer counts in Stage3Stats.
    issues_created: list[str]
    issues_updated: list[str]
    stats: Stage3Stats
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 4: Fact Aggregation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class Stage4Input(TypedDict):
    """Input to Stage 4 fact aggregation."""

    business_id: str
    # Date carried as a string; format not specified here.
    date: str
    # Bucket granularities to aggregate for.
    bucket_types: list[BucketType]
    taxonomy_version: str
|
||||
|
||||
|
||||
class FactRecord(TypedDict, total=False):
    """An aggregated fact record for time series data."""

    # total=False: every key is optional as far as type checkers are concerned.

    # Keys
    business_id: str
    place_id: str
    period_date: str
    # NOTE(review): typed as plain str although the BucketType alias exists
    # and Stage4Input uses it.
    bucket_type: str
    subject_type: SubjectType
    subject_id: str
    taxonomy_version: str

    # Metrics
    review_count: int
    span_count: int
    negative_count: int
    positive_count: int
    neutral_count: int
    mixed_count: int
    strength_score: float
    negative_strength: float
    positive_strength: float
    avg_rating: float | None
    # Presumably one counter per IntensityType value — TODO confirm.
    i1_count: int
    i2_count: int
    i3_count: int
    # Presumably counters for comparative codes — TODO confirm the mapping.
    cr_better: int
    cr_worse: int
    cr_same: int
    trust_weighted_strength: float
    trust_weighted_negative: float
|
||||
|
||||
|
||||
class Stage4Stats(TypedDict):
    """Statistics from Stage 4 processing."""

    business_id: str
    date: str
    locations_processed: int
    codes_aggregated: int
    # Reported by the CLI as the number of facts written.
    facts_upserted: int
|
||||
|
||||
|
||||
class Stage4Output(TypedDict):
    """Output from Stage 4 fact aggregation."""

    # The aggregated records together with the run statistics.
    facts_written: list[FactRecord]
    stats: Stage4Stats
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Database Entity Types
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ReviewRaw(TypedDict, total=False):
    """A raw review record in the database."""

    # total=False: every key is optional as far as type checkers are concerned.
    id: int
    source: str
    review_id: str
    place_id: str
    raw_payload: dict[str, Any]
    # Field names differ from RawReview here (review_text / reviewer_name /
    # reviewer_id versus text / author_name / author_id).
    review_text: str | None
    rating: int
    review_time: str
    reviewer_name: str
    reviewer_id: str | None
    review_version: int
    # Timestamps are carried as strings; format not specified here.
    pulled_at: str
    created_at: str
|
||||
|
||||
|
||||
class ReviewEnriched(TypedDict, total=False):
    """An enriched review record in the database."""

    # total=False: every key is optional as far as type checkers are concerned.
    id: int
    source: str
    review_id: str
    review_version: int
    is_latest: bool
    # Presumably references ReviewRaw.id — TODO confirm.
    raw_id: int
    business_id: str
    place_id: str
    text: str
    text_normalized: str
    rating: int
    review_time: str
    language: str
    taxonomy_version: str
    # The classification fields below all allow None.
    urt_primary: str | None
    urt_secondary: list[str] | None
    valence: ValenceType | None
    intensity: IntensityType | None
    comparative: ComparativeType | None
    staff_mentions: list[str] | None
    quotes: dict[str, str] | None
    embedding: list[float] | None
    trust_score: float | None
    classification_model: str | None
    classification_confidence: dict[str, float] | None
    processed_at: str | None
    created_at: str
|
||||
|
||||
|
||||
class ReviewSpan(TypedDict, total=False):
    """Row shape of the ``review_spans`` table.

    Declared with ``total=False``: every key is optional for type checkers.
    """

    # Identity
    id: int
    span_id: str

    # Context
    business_id: str
    place_id: str
    source: str
    review_id: str
    review_version: int

    # Position of the span
    span_index: int
    span_text: str
    span_start: int
    span_end: int

    # Classification profile
    profile: ProfileType

    # Core URT classification
    urt_primary: str
    urt_secondary: list[str]
    valence: ValenceType
    intensity: IntensityType
    comparative: ComparativeType

    # Extended classification (nullable)
    specificity: SpecificityType | None
    actionability: ActionabilityType | None
    temporal: TemporalType | None
    evidence: EvidenceType | None

    # Entity extraction (nullable)
    entity: str | None
    entity_type: EntityTypeValue | None
    entity_normalized: str | None

    # Relations to other spans (nullable)
    relation_type: RelationType | None
    related_span_id: str | None
    causal_chain: list[CausalLink] | None

    # Flags
    is_primary: bool
    is_active: bool

    # Time reference
    review_time: str

    # Metadata
    confidence: ConfidenceType
    usn: str
    taxonomy_version: str
    model_version: str
    ingest_batch_id: str
    created_at: str
|
||||
|
||||
|
||||
class Issue(TypedDict, total=False):
    """Row shape of the ``issues`` table.

    Declared with ``total=False``: every key is optional for type checkers.
    """

    # Identity
    id: int
    issue_id: str

    # Context
    business_id: str
    place_id: str

    # Classification
    primary_subcode: str
    domain: str

    # State and scoring
    state: IssueState
    priority_score: float
    confidence_score: float

    # Aggregates over the linked spans
    span_count: int
    max_intensity: IntensityType

    # Entity (nullable)
    entity: str | None
    entity_normalized: str | None

    # Metadata
    taxonomy_version: str
    created_at: str
    updated_at: str
|
||||
|
||||
|
||||
class IssueSpan(TypedDict):
    """Row shape of the ``issue_spans`` table, which ties a span to an issue.

    The TypedDict is total: all keys are required.
    """

    # Link
    id: int
    issue_id: str
    span_id: str

    # Review reference
    source: str
    review_id: str
    review_version: int

    # Match info
    is_primary_match: bool
    intensity: IntensityType
    review_time: str

    created_at: str
|
||||
|
||||
|
||||
class IssueEvent(TypedDict, total=False):
    """Row shape of the ``issue_events`` audit-log table.

    Declared with ``total=False``: every key is optional for type checkers.
    """

    # Which issue, and what kind of event
    id: int
    issue_id: str
    event_type: str

    # Event details (all nullable)
    span_id: str | None
    old_value: str | None
    new_value: str | None
    metadata: dict[str, Any] | None

    created_at: str
|
||||
|
||||
|
||||
class FactTimeseries(TypedDict, total=False):
    """Row shape of the ``fact_timeseries`` table.

    Declared with ``total=False``: every key is optional for type checkers.
    """

    id: int

    # Dimension keys
    business_id: str
    place_id: str
    period_date: str
    bucket_type: BucketType
    subject_type: SubjectType
    subject_id: str
    taxonomy_version: str

    # Core counts
    review_count: int
    span_count: int

    # Valence counts
    negative_count: int
    positive_count: int
    neutral_count: int
    mixed_count: int

    # Strength scores
    strength_score: float
    negative_strength: float
    positive_strength: float

    # Rating (nullable)
    avg_rating: float | None

    # Intensity counts
    i1_count: int
    i2_count: int
    i3_count: int

    # Comparative counts
    cr_better: int
    cr_worse: int
    cr_same: int

    # Trust-weighted metrics
    trust_weighted_strength: float
    trust_weighted_negative: float

    # Metadata
    computed_at: str
    created_at: str
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LLM Response Types
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class LLMSpanResponse(TypedDict, total=False):
    """One span entry inside the LLM classification response.

    Declared with ``total=False``: every key is optional for type checkers.
    """

    # Position of the span
    span_index: int
    span_text: str
    span_start: int
    span_end: int

    # URT classification
    urt_primary: str
    urt_secondary: list[str]
    valence: ValenceType
    intensity: IntensityType
    specificity: SpecificityType
    actionability: ActionabilityType
    temporal: TemporalType
    evidence: EvidenceType
    comparative: ComparativeType

    # Flags and confidence
    is_primary: bool
    confidence: ConfidenceType

    # Entity and relation info (nullable)
    entity: str | None
    entity_type: EntityTypeValue | None
    relation_type: RelationType | None
    related_span_index: int | None

    usn: str
|
||||
|
||||
|
||||
class LLMReviewSummary(TypedDict):
    """Review-level summary section of the LLM response.

    The TypedDict is total: all keys are required.
    """

    # Dominant classification across the review
    dominant_valence: ValenceType
    dominant_domain: str

    # Span statistics
    span_count: int
    has_comparative: bool
    has_entity: bool
|
||||
|
||||
|
||||
class LLMClassificationResponse(TypedDict):
    """Top-level shape of the LLM classification response.

    Both keys are required.
    """

    # Per-span classification entries.
    spans: list[LLMSpanResponse]
    # Review-level summary.
    review_summary: LLMReviewSummary
|
||||
@@ -0,0 +1,17 @@
|
||||
"""Database layer for pipeline operations."""
|
||||
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
from reviewiq_pipeline.db.repositories import (
|
||||
FactRepository,
|
||||
IssueRepository,
|
||||
ReviewRepository,
|
||||
SpanRepository,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"DatabasePool",
|
||||
"ReviewRepository",
|
||||
"SpanRepository",
|
||||
"IssueRepository",
|
||||
"FactRepository",
|
||||
]
|
||||
@@ -0,0 +1,157 @@
|
||||
"""Database connection management using asyncpg."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, AsyncGenerator
|
||||
|
||||
import asyncpg
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.config import Config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DatabasePool:
    """Own a lazily created asyncpg pool and expose thin query helpers.

    No connection is opened by the constructor; call :meth:`initialize`
    before using the pool and :meth:`close` when done. Each ``execute`` /
    ``fetch*`` helper borrows one pooled connection for a single statement.
    """

    def __init__(self, config: Config):
        """Remember *config*; the pool is created later by ``initialize``."""
        self.config = config
        self._pool: asyncpg.Pool | None = None
        # Held while creating or closing the pool so concurrent callers
        # cannot do either twice.
        self._lock = asyncio.Lock()

    async def initialize(self) -> None:
        """Create the connection pool unless one is already open.

        The DSN and pool bounds come from ``config.database_url``,
        ``config.db_pool_min_size`` and ``config.db_pool_max_size``.
        """
        async with self._lock:
            if self._pool is not None:
                return

            logger.info("Creating database connection pool...")
            self._pool = await asyncpg.create_pool(
                self.config.database_url,
                min_size=self.config.db_pool_min_size,
                max_size=self.config.db_pool_max_size,
                command_timeout=60,
            )
            logger.info("Database pool created successfully")

    async def close(self) -> None:
        """Close the pool if it is open; otherwise do nothing."""
        async with self._lock:
            if self._pool is not None:
                await self._pool.close()
                self._pool = None
                logger.info("Database pool closed")

    @property
    def pool(self) -> asyncpg.Pool:
        """Return the open pool.

        Raises:
            RuntimeError: If ``initialize()`` has not created a pool yet
                (or ``close()`` has discarded it).
        """
        if self._pool is None:
            raise RuntimeError("Database pool not initialized. Call initialize() first.")
        return self._pool

    @asynccontextmanager
    async def acquire(self) -> AsyncGenerator[asyncpg.Connection, None]:
        """Yield a pooled connection; it goes back to the pool on exit."""
        async with self.pool.acquire() as conn:
            yield conn

    @asynccontextmanager
    async def transaction(self) -> AsyncGenerator[asyncpg.Connection, None]:
        """Yield a pooled connection with a transaction open around the body."""
        async with self.pool.acquire() as conn:
            async with conn.transaction():
                yield conn

    async def execute(self, query: str, *args: Any) -> str:
        """Run *query* with positional *args* and return the status string."""
        async with self.acquire() as conn:
            return await conn.execute(query, *args)

    async def executemany(self, query: str, args: list[tuple]) -> None:
        """Run *query* once for every argument tuple in *args*."""
        async with self.acquire() as conn:
            await conn.executemany(query, args)

    async def fetch(self, query: str, *args: Any) -> list[asyncpg.Record]:
        """Run *query* and return all result rows."""
        async with self.acquire() as conn:
            return await conn.fetch(query, *args)

    async def fetchrow(self, query: str, *args: Any) -> asyncpg.Record | None:
        """Run *query* and return the first row, or ``None`` if there is none."""
        async with self.acquire() as conn:
            return await conn.fetchrow(query, *args)

    async def fetchval(self, query: str, *args: Any) -> Any:
        """Run *query* and return a single value from its result."""
        async with self.acquire() as conn:
            return await conn.fetchval(query, *args)

    async def run_migrations(self, migrations_path: str | None = None) -> int:
        """Apply every ``*.sql`` migration not yet recorded as applied.

        Files are applied in sorted order and recorded by filename in the
        ``_migrations`` table. The whole run shares one transaction, so the
        exception re-raised for a failing migration leaves that transaction
        block and the migrations applied earlier in the same call are not
        committed either.

        Args:
            migrations_path: Directory containing the ``*.sql`` files. Falls
                back to ``config.effective_migrations_path`` when ``None``
                or empty.

        Returns:
            Number of migrations applied by this call; 0 when the directory
            does not exist.

        Raises:
            Exception: Whatever was raised while applying a migration; it is
                logged and re-raised unchanged.
        """
        path = Path(migrations_path or self.config.effective_migrations_path)
        if not path.exists():
            logger.warning("Migrations path does not exist: %s", path)
            return 0

        async with self.transaction() as conn:
            # Tracking table: one row per applied migration file.
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS _migrations (
                    id SERIAL PRIMARY KEY,
                    filename VARCHAR(255) UNIQUE NOT NULL,
                    applied_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
                )
            """)

            applied = await conn.fetch("SELECT filename FROM _migrations")
            applied_set = {r["filename"] for r in applied}

            migrations_run = 0
            for migration_file in sorted(path.glob("*.sql")):
                filename = migration_file.name
                if filename in applied_set:
                    continue

                logger.info("Running migration: %s", filename)
                # Decode explicitly as UTF-8 (assumed file encoding): the
                # migrations contain non-ASCII literals such as 'V±', and the
                # platform default encoding is not guaranteed to be UTF-8.
                sql = migration_file.read_text(encoding="utf-8")

                try:
                    await conn.execute(sql)
                    await conn.execute(
                        "INSERT INTO _migrations (filename) VALUES ($1)",
                        filename,
                    )
                    migrations_run += 1
                    logger.info("Migration %s applied successfully", filename)
                except Exception as e:
                    logger.error("Migration %s failed: %s", filename, e)
                    raise

            logger.info("Ran %s migrations", migrations_run)
            return migrations_run

    async def check_connection(self) -> bool:
        """Return ``True`` when ``SELECT 1`` succeeds on a pooled connection.

        Any exception is logged and reported as ``False`` instead of being
        propagated, so callers can use this as a simple health probe.
        """
        try:
            result = await self.fetchval("SELECT 1")
            return result == 1
        except Exception as e:
            logger.error("Database connection check failed: %s", e)
            return False
|
||||
@@ -0,0 +1,80 @@
|
||||
-- Migration: 001_create_reviews_tables.sql
-- Purpose: Core review tables for Stage 1 (normalization).
--   reviews_raw      : review data as scraped, one row per review version
--   reviews_enriched : normalized text plus the columns filled by classification
-- All CREATE statements use IF NOT EXISTS.

-- ---------------------------------------------------------------------------
-- reviews_raw
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS reviews_raw (
    id             BIGSERIAL PRIMARY KEY,
    source         VARCHAR(20)  NOT NULL DEFAULT 'google',
    review_id      VARCHAR(255) NOT NULL,
    place_id       VARCHAR(255) NOT NULL,
    raw_payload    JSONB        NOT NULL DEFAULT '{}',
    review_text    TEXT,
    rating         SMALLINT     NOT NULL CHECK (rating BETWEEN 1 AND 5),
    review_time    TIMESTAMPTZ  NOT NULL,
    reviewer_name  VARCHAR(255) NOT NULL,
    reviewer_id    VARCHAR(255),
    review_version INTEGER      NOT NULL DEFAULT 1,
    pulled_at      TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    created_at     TIMESTAMPTZ  NOT NULL DEFAULT NOW(),

    -- One row per (source, review, version).
    CONSTRAINT reviews_raw_unique UNIQUE (source, review_id, review_version)
);

CREATE INDEX IF NOT EXISTS idx_reviews_raw_place_id ON reviews_raw(place_id);
CREATE INDEX IF NOT EXISTS idx_reviews_raw_review_time ON reviews_raw(review_time);
CREATE INDEX IF NOT EXISTS idx_reviews_raw_pulled_at ON reviews_raw(pulled_at);

-- ---------------------------------------------------------------------------
-- reviews_enriched (updated in place once a review is classified)
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS reviews_enriched (
    id             BIGSERIAL PRIMARY KEY,
    source         VARCHAR(20)  NOT NULL DEFAULT 'google',
    review_id      VARCHAR(255) NOT NULL,
    review_version INTEGER      NOT NULL DEFAULT 1,
    is_latest      BOOLEAN      NOT NULL DEFAULT TRUE,
    raw_id         BIGINT REFERENCES reviews_raw(id),

    -- Tenant context
    business_id VARCHAR(255) NOT NULL,
    place_id    VARCHAR(255) NOT NULL,

    -- Content
    text            TEXT        NOT NULL,
    text_normalized TEXT        NOT NULL,
    rating          SMALLINT    NOT NULL CHECK (rating BETWEEN 1 AND 5),
    review_time     TIMESTAMPTZ NOT NULL,

    -- Normalization fields
    language         VARCHAR(10) NOT NULL DEFAULT 'en',
    taxonomy_version VARCHAR(20) NOT NULL DEFAULT 'v5.1',

    -- Classification fields, populated by Stage 2. Scalar columns start as
    -- NULL; array and JSONB columns start as empty ('{}').
    urt_primary               VARCHAR(10),
    urt_secondary             VARCHAR(10)[] DEFAULT '{}',
    valence                   VARCHAR(5),
    intensity                 VARCHAR(5),
    comparative               VARCHAR(10),
    staff_mentions            VARCHAR(255)[] DEFAULT '{}',
    quotes                    JSONB DEFAULT '{}',
    embedding                 REAL[] DEFAULT '{}',
    trust_score               REAL,
    classification_model      VARCHAR(100),
    classification_confidence JSONB DEFAULT '{}',
    processed_at              TIMESTAMPTZ,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- One row per (source, review, version).
    CONSTRAINT reviews_enriched_unique UNIQUE (source, review_id, review_version)
);

CREATE INDEX IF NOT EXISTS idx_reviews_enriched_business_id ON reviews_enriched(business_id);
CREATE INDEX IF NOT EXISTS idx_reviews_enriched_place_id ON reviews_enriched(place_id);
CREATE INDEX IF NOT EXISTS idx_reviews_enriched_review_time ON reviews_enriched(review_time);
-- Partial indexes: only rows that already carry a classification value.
CREATE INDEX IF NOT EXISTS idx_reviews_enriched_urt_primary ON reviews_enriched(urt_primary) WHERE urt_primary IS NOT NULL;
-- Partial index over latest-version rows that have no primary code yet.
CREATE INDEX IF NOT EXISTS idx_reviews_enriched_unclassified ON reviews_enriched(review_time DESC) WHERE urt_primary IS NULL AND is_latest = TRUE;
CREATE INDEX IF NOT EXISTS idx_reviews_enriched_valence ON reviews_enriched(valence) WHERE valence IS NOT NULL;

-- Table descriptions
COMMENT ON TABLE reviews_raw IS 'Immutable raw review data as scraped from source';
COMMENT ON TABLE reviews_enriched IS 'Enriched reviews with normalization and classification';
|
||||
@@ -0,0 +1,84 @@
|
||||
-- Migration: 002_create_spans_table.sql
-- Purpose: review_spans table holding the Stage 2 classification output,
--          one row per extracted span. Requires reviews_enriched (001).

CREATE TABLE IF NOT EXISTS review_spans (
    id      BIGSERIAL PRIMARY KEY,
    span_id VARCHAR(50) NOT NULL UNIQUE,

    -- Context
    business_id    VARCHAR(255) NOT NULL,
    place_id       VARCHAR(255) NOT NULL,
    source         VARCHAR(20)  NOT NULL DEFAULT 'google',
    review_id      VARCHAR(255) NOT NULL,
    review_version INTEGER      NOT NULL DEFAULT 1,

    -- Position (span_end must lie strictly after span_start)
    span_index INTEGER NOT NULL CHECK (span_index >= 0),
    span_text  TEXT    NOT NULL,
    span_start INTEGER NOT NULL CHECK (span_start >= 0),
    span_end   INTEGER NOT NULL CHECK (span_end > span_start),

    -- Classification profile
    profile VARCHAR(20) NOT NULL DEFAULT 'standard',

    -- Core URT classification
    urt_primary   VARCHAR(10) NOT NULL,
    urt_secondary VARCHAR(10)[] DEFAULT '{}',
    valence       VARCHAR(5)  NOT NULL,
    intensity     VARCHAR(5)  NOT NULL,
    comparative   VARCHAR(10) NOT NULL DEFAULT 'CR-N',

    -- Extended classification (standard/full profile)
    specificity   VARCHAR(5),
    actionability VARCHAR(5),
    temporal      VARCHAR(5),
    evidence      VARCHAR(5),

    -- Entity extraction
    entity            VARCHAR(255),
    entity_type       VARCHAR(20),
    entity_normalized VARCHAR(255),

    -- Causal relations (full profile)
    relation_type   VARCHAR(20),
    related_span_id VARCHAR(50),
    causal_chain    JSONB,

    -- Flags
    is_primary BOOLEAN NOT NULL DEFAULT FALSE,
    is_active  BOOLEAN NOT NULL DEFAULT TRUE,

    -- Time reference
    review_time TIMESTAMPTZ NOT NULL,

    -- Metadata
    confidence       VARCHAR(10)  NOT NULL DEFAULT 'medium',
    usn              VARCHAR(100) NOT NULL,
    taxonomy_version VARCHAR(20)  NOT NULL,
    model_version    VARCHAR(100) NOT NULL,
    ingest_batch_id  VARCHAR(50)  NOT NULL,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Each span must belong to an existing enriched review version.
    CONSTRAINT fk_review FOREIGN KEY (source, review_id, review_version)
        REFERENCES reviews_enriched(source, review_id, review_version)
);

-- Single-column lookup indexes
CREATE INDEX IF NOT EXISTS idx_spans_business_id ON review_spans(business_id);
CREATE INDEX IF NOT EXISTS idx_spans_place_id ON review_spans(place_id);
CREATE INDEX IF NOT EXISTS idx_spans_review_time ON review_spans(review_time);
CREATE INDEX IF NOT EXISTS idx_spans_urt_primary ON review_spans(urt_primary);
CREATE INDEX IF NOT EXISTS idx_spans_valence ON review_spans(valence);
CREATE INDEX IF NOT EXISTS idx_spans_intensity ON review_spans(intensity);
-- Partial indexes covering only the rows where the flag/value is set
CREATE INDEX IF NOT EXISTS idx_spans_is_active ON review_spans(is_active) WHERE is_active = TRUE;
CREATE INDEX IF NOT EXISTS idx_spans_is_primary ON review_spans(is_primary) WHERE is_primary = TRUE;
CREATE INDEX IF NOT EXISTS idx_spans_entity_normalized ON review_spans(entity_normalized) WHERE entity_normalized IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_spans_batch ON review_spans(ingest_batch_id);

-- Supports the Stage 3 query: active spans with negative or mixed valence,
-- newest first.
CREATE INDEX IF NOT EXISTS idx_spans_unrouted_negative ON review_spans(review_time DESC)
    WHERE is_active = TRUE AND valence IN ('V-', 'V±');

COMMENT ON TABLE review_spans IS 'Extracted semantic spans with URT classification from reviews';
|
||||
@@ -0,0 +1,111 @@
|
||||
-- Migration: 003_create_urt_enums.sql
-- Purpose: Enum types and lookup tables for the URT taxonomy.
--
-- CREATE TYPE has no IF NOT EXISTS form, so each enum is created inside a
-- DO block that ignores duplicate_object; this keeps the script re-runnable.

-- Valence
DO $$
BEGIN
    CREATE TYPE valence_type AS ENUM ('V+', 'V-', 'V0', 'V±');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- Intensity
DO $$
BEGIN
    CREATE TYPE intensity_type AS ENUM ('I1', 'I2', 'I3');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- Specificity
DO $$
BEGIN
    CREATE TYPE specificity_type AS ENUM ('S1', 'S2', 'S3');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- Actionability
DO $$
BEGIN
    CREATE TYPE actionability_type AS ENUM ('A1', 'A2', 'A3');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- Temporal
DO $$
BEGIN
    CREATE TYPE temporal_type AS ENUM ('TC', 'TR', 'TH', 'TF');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- Evidence
DO $$
BEGIN
    CREATE TYPE evidence_type AS ENUM ('ES', 'EI', 'EC');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- Comparative
DO $$
BEGIN
    CREATE TYPE comparative_type AS ENUM ('CR-N', 'CR-B', 'CR-W', 'CR-S');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- ---------------------------------------------------------------------------
-- Lookup table: top-level URT domains (single-letter codes)
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS urt_domains (
    code        CHAR(1) PRIMARY KEY,
    name        VARCHAR(50) NOT NULL,
    description TEXT
);

-- Seed rows; existing codes are left untouched.
INSERT INTO urt_domains (code, name, description) VALUES
    ('O', 'Offering', 'Product/service quality, features, variety'),
    ('P', 'Price', 'Value, pricing, promotions, payment'),
    ('J', 'Journey', 'Timing, process, convenience, accessibility'),
    ('E', 'Environment', 'Physical space, ambiance, cleanliness, digital UX'),
    ('A', 'Attitude', 'Staff behavior, helpfulness, professionalism'),
    ('V', 'Voice', 'Brand, communication, marketing, transparency'),
    ('R', 'Relationship', 'Loyalty, trust, consistency, personalization')
ON CONFLICT (code) DO NOTHING;

-- ---------------------------------------------------------------------------
-- Lookup table: Tier-2 categories, each attached to one domain
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS urt_categories (
    code        VARCHAR(5) PRIMARY KEY,
    domain_code CHAR(1) NOT NULL REFERENCES urt_domains(code),
    name        VARCHAR(100) NOT NULL,
    description TEXT
);

-- Seed the standard Tier-2 categories; existing codes are left untouched.
INSERT INTO urt_categories (code, domain_code, name) VALUES
    ('O1', 'O', 'Core Product/Service'),
    ('O2', 'O', 'Product Features'),
    ('O3', 'O', 'Variety & Selection'),
    ('O4', 'O', 'Customization'),
    ('P1', 'P', 'Value Perception'),
    ('P2', 'P', 'Pricing Structure'),
    ('P3', 'P', 'Promotions & Deals'),
    ('P4', 'P', 'Payment Process'),
    ('J1', 'J', 'Wait Times'),
    ('J2', 'J', 'Booking & Reservations'),
    ('J3', 'J', 'Navigation & Convenience'),
    ('J4', 'J', 'Accessibility'),
    ('E1', 'E', 'Physical Environment'),
    ('E2', 'E', 'Ambiance & Atmosphere'),
    ('E3', 'E', 'Cleanliness'),
    ('E4', 'E', 'Digital Experience'),
    ('A1', 'A', 'Friendliness'),
    ('A2', 'A', 'Helpfulness'),
    ('A3', 'A', 'Professionalism'),
    ('A4', 'A', 'Knowledge & Expertise'),
    ('V1', 'V', 'Brand Identity'),
    ('V2', 'V', 'Communication'),
    ('V3', 'V', 'Marketing'),
    ('V4', 'V', 'Transparency'),
    ('R1', 'R', 'Loyalty'),
    ('R2', 'R', 'Trust'),
    ('R3', 'R', 'Consistency'),
    ('R4', 'R', 'Personalization')
ON CONFLICT (code) DO NOTHING;

COMMENT ON TABLE urt_domains IS 'URT v5.1 top-level domains';
COMMENT ON TABLE urt_categories IS 'URT v5.1 Tier-2 categories';
|
||||
@@ -0,0 +1,96 @@
|
||||
-- Migration: 004_create_issues_tables.sql
-- Purpose: Tables for Stage 3 (issue routing):
--   issues       : one row per issue
--   issue_spans  : span-to-issue links
--   issue_events : audit log of issue changes

-- Issue lifecycle states. CREATE TYPE has no IF NOT EXISTS form, so a
-- duplicate_object error is swallowed to keep the script re-runnable.
DO $$
BEGIN
    CREATE TYPE issue_state AS ENUM ('open', 'resolved', 'ignored', 'merged');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- ---------------------------------------------------------------------------
-- issues
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS issues (
    id       BIGSERIAL PRIMARY KEY,
    issue_id VARCHAR(50) NOT NULL UNIQUE,

    -- Context
    business_id VARCHAR(255) NOT NULL,
    place_id    VARCHAR(255) NOT NULL,

    -- Classification
    primary_subcode VARCHAR(10) NOT NULL,
    domain          CHAR(1)     NOT NULL,

    -- State
    state            issue_state NOT NULL DEFAULT 'open',
    priority_score   REAL        NOT NULL DEFAULT 1.0,
    confidence_score REAL        NOT NULL DEFAULT 1.0,

    -- Aggregates
    span_count    INTEGER    NOT NULL DEFAULT 1,
    max_intensity VARCHAR(5) NOT NULL DEFAULT 'I1',

    -- Entity (optional - for entity-specific issues)
    entity            VARCHAR(255),
    entity_normalized VARCHAR(255),

    -- Metadata
    taxonomy_version VARCHAR(20) NOT NULL,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- ---------------------------------------------------------------------------
-- issue_spans: the UNIQUE span_id means a span links to at most one issue
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS issue_spans (
    id       BIGSERIAL PRIMARY KEY,
    issue_id VARCHAR(50) NOT NULL REFERENCES issues(issue_id),
    span_id  VARCHAR(50) NOT NULL UNIQUE,

    -- Review reference
    source         VARCHAR(20)  NOT NULL DEFAULT 'google',
    review_id      VARCHAR(255) NOT NULL,
    review_version INTEGER      NOT NULL DEFAULT 1,

    -- Match info
    is_primary_match BOOLEAN     NOT NULL DEFAULT TRUE,
    intensity        VARCHAR(5)  NOT NULL,
    review_time      TIMESTAMPTZ NOT NULL,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- ---------------------------------------------------------------------------
-- issue_events: audit log
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS issue_events (
    id         BIGSERIAL PRIMARY KEY,
    issue_id   VARCHAR(50) NOT NULL REFERENCES issues(issue_id),
    event_type VARCHAR(50) NOT NULL,
    span_id    VARCHAR(50),
    old_value  TEXT,
    new_value  TEXT,
    metadata   JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Indexes: issues
CREATE INDEX IF NOT EXISTS idx_issues_business_id ON issues(business_id);
CREATE INDEX IF NOT EXISTS idx_issues_place_id ON issues(place_id);
CREATE INDEX IF NOT EXISTS idx_issues_state ON issues(state);
CREATE INDEX IF NOT EXISTS idx_issues_primary_subcode ON issues(primary_subcode);
CREATE INDEX IF NOT EXISTS idx_issues_domain ON issues(domain);
CREATE INDEX IF NOT EXISTS idx_issues_entity_normalized ON issues(entity_normalized) WHERE entity_normalized IS NOT NULL;
-- Open issues ordered by priority, highest first.
CREATE INDEX IF NOT EXISTS idx_issues_priority ON issues(priority_score DESC) WHERE state = 'open';
CREATE INDEX IF NOT EXISTS idx_issues_created ON issues(created_at);
CREATE INDEX IF NOT EXISTS idx_issues_updated ON issues(updated_at);

-- Indexes: issue_spans
CREATE INDEX IF NOT EXISTS idx_issue_spans_issue_id ON issue_spans(issue_id);
CREATE INDEX IF NOT EXISTS idx_issue_spans_review_time ON issue_spans(review_time);

-- Indexes: issue_events
CREATE INDEX IF NOT EXISTS idx_issue_events_issue_id ON issue_events(issue_id);
CREATE INDEX IF NOT EXISTS idx_issue_events_created ON issue_events(created_at);
CREATE INDEX IF NOT EXISTS idx_issue_events_type ON issue_events(event_type);

COMMENT ON TABLE issues IS 'Aggregated issues derived from negative/mixed spans';
COMMENT ON TABLE issue_spans IS 'Links between issues and their source spans';
COMMENT ON TABLE issue_events IS 'Audit log for issue state changes';
|
||||
@@ -0,0 +1,97 @@
|
||||
-- Migration: 005_create_facts_table.sql
-- Purpose: fact_timeseries table holding the Stage 4 pre-aggregated metrics.
--
-- CREATE TYPE has no IF NOT EXISTS form, so each enum is created inside a
-- DO block that ignores duplicate_object; this keeps the script re-runnable.

-- What a fact row describes
DO $$
BEGIN
    CREATE TYPE subject_type AS ENUM ('overall', 'urt_code', 'domain', 'issue');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- Time bucket granularity
DO $$
BEGIN
    CREATE TYPE bucket_type AS ENUM ('day', 'week', 'month');
EXCEPTION
    WHEN duplicate_object THEN NULL;
END
$$;

-- ---------------------------------------------------------------------------
-- fact_timeseries
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS fact_timeseries (
    id BIGSERIAL PRIMARY KEY,

    -- Dimension keys
    business_id      VARCHAR(255) NOT NULL,
    place_id         VARCHAR(255) NOT NULL,  -- Or 'ALL' for rollup
    period_date      DATE         NOT NULL,
    bucket_type      bucket_type  NOT NULL DEFAULT 'day',
    subject_type     subject_type NOT NULL DEFAULT 'urt_code',
    subject_id       VARCHAR(50)  NOT NULL,  -- URT code, domain letter, or issue_id
    taxonomy_version VARCHAR(20)  NOT NULL,

    -- Core counts
    review_count INTEGER NOT NULL DEFAULT 0,
    span_count   INTEGER NOT NULL DEFAULT 0,

    -- Valence counts
    negative_count INTEGER NOT NULL DEFAULT 0,
    positive_count INTEGER NOT NULL DEFAULT 0,
    neutral_count  INTEGER NOT NULL DEFAULT 0,
    mixed_count    INTEGER NOT NULL DEFAULT 0,

    -- Strength scores
    strength_score    REAL NOT NULL DEFAULT 0.0,
    negative_strength REAL NOT NULL DEFAULT 0.0,
    positive_strength REAL NOT NULL DEFAULT 0.0,

    -- Rating
    avg_rating REAL,

    -- Intensity counts
    i1_count INTEGER NOT NULL DEFAULT 0,
    i2_count INTEGER NOT NULL DEFAULT 0,
    i3_count INTEGER NOT NULL DEFAULT 0,

    -- Comparative counts
    cr_better INTEGER NOT NULL DEFAULT 0,
    cr_worse  INTEGER NOT NULL DEFAULT 0,
    cr_same   INTEGER NOT NULL DEFAULT 0,

    -- Trust-weighted metrics
    trust_weighted_strength REAL NOT NULL DEFAULT 0.0,
    trust_weighted_negative REAL NOT NULL DEFAULT 0.0,

    -- Metadata
    computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Natural key of a fact row; the conflict target for upserts.
    CONSTRAINT fact_timeseries_unique UNIQUE (
        business_id, place_id, period_date, bucket_type,
        subject_type, subject_id, taxonomy_version
    )
);

-- Single-column indexes
CREATE INDEX IF NOT EXISTS idx_facts_business_id ON fact_timeseries(business_id);
CREATE INDEX IF NOT EXISTS idx_facts_place_id ON fact_timeseries(place_id);
CREATE INDEX IF NOT EXISTS idx_facts_period ON fact_timeseries(period_date);
CREATE INDEX IF NOT EXISTS idx_facts_bucket ON fact_timeseries(bucket_type);
CREATE INDEX IF NOT EXISTS idx_facts_subject_type ON fact_timeseries(subject_type);
CREATE INDEX IF NOT EXISTS idx_facts_subject_id ON fact_timeseries(subject_id);

-- Composite index for common dashboard queries
CREATE INDEX IF NOT EXISTS idx_facts_dashboard ON fact_timeseries(
    business_id, place_id, bucket_type, period_date DESC
);

-- Trend of one URT code over time (rows with subject_type = 'urt_code' only)
CREATE INDEX IF NOT EXISTS idx_facts_code_trend ON fact_timeseries(
    business_id, subject_id, bucket_type, period_date DESC
) WHERE subject_type = 'urt_code';

-- Domain-level aggregates (rows with subject_type = 'domain' only)
CREATE INDEX IF NOT EXISTS idx_facts_domain ON fact_timeseries(
    business_id, subject_id, bucket_type, period_date DESC
) WHERE subject_type = 'domain';

COMMENT ON TABLE fact_timeseries IS 'Pre-aggregated time series facts for dashboard queries';
|
||||
@@ -0,0 +1,562 @@
|
||||
"""Data access layer for pipeline operations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import date, datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.contracts import (
|
||||
ClassifiedReview,
|
||||
ExtractedSpan,
|
||||
FactRecord,
|
||||
NormalizedReview,
|
||||
RawReview,
|
||||
RoutedSpan,
|
||||
)
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReviewRepository:
    """Repository for review data operations.

    Covers the ``reviews_raw`` and ``reviews_enriched`` tables. Every query is
    issued through the ``db`` object handed to the constructor, using its
    ``fetchval`` / ``fetchrow`` / ``fetch`` / ``execute`` coroutines.
    """

    def __init__(self, db: DatabasePool):
        self.db = db

    async def insert_raw_review(
        self,
        review: RawReview,
        place_id: str,
        source: str = "google",
    ) -> int:
        """Insert a raw review and return its ID.

        The row is always written with ``review_version`` 1. When a row with
        the same ``(source, review_id, review_version)`` already exists, only
        its ``pulled_at`` column is refreshed and that row's ID is returned.

        Args:
            review: Raw review mapping. ``review_id``, ``rating``,
                ``review_time`` and ``author_name`` are required keys;
                ``raw_payload``, ``text`` and ``author_id`` are optional.
            place_id: Place the review belongs to.
            source: Review source label stored on the row.

        Returns:
            The ``id`` value returned by the INSERT.

        Raises:
            KeyError: If a required key is missing from ``review``.
        """
        query = """
            INSERT INTO reviews_raw (
                source, review_id, place_id, raw_payload,
                review_text, rating, review_time, reviewer_name, reviewer_id,
                review_version, pulled_at
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, NOW())
            ON CONFLICT (source, review_id, review_version) DO UPDATE SET
                pulled_at = NOW()
            RETURNING id
        """
        raw_id = await self.db.fetchval(
            query,
            source,
            review["review_id"],
            place_id,
            json.dumps(review.get("raw_payload", {})),
            review.get("text"),
            review["rating"],
            review["review_time"],
            review["author_name"],
            review.get("author_id"),
            1,  # Initial version
        )
        return raw_id

    async def insert_enriched_review(
        self,
        review: NormalizedReview,
        raw_id: int,
    ) -> int:
        """Insert an enriched review stub (pre-classification).

        The row is written with ``is_latest = TRUE`` and a placeholder
        ``taxonomy_version`` of ``"v5.1"``. On a conflict for
        ``(source, review_id, review_version)`` only ``is_latest`` is updated.

        Args:
            review: Normalized review mapping supplying the column values.
            raw_id: ID of the matching ``reviews_raw`` row.

        Returns:
            The ``id`` value returned by the INSERT.
        """
        query = """
            INSERT INTO reviews_enriched (
                source, review_id, review_version, is_latest, raw_id,
                business_id, place_id, text, text_normalized, rating, review_time,
                language, taxonomy_version
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
            ON CONFLICT (source, review_id, review_version) DO UPDATE SET
                is_latest = EXCLUDED.is_latest
            RETURNING id
        """
        enriched_id = await self.db.fetchval(
            query,
            review["source"],
            review["review_id"],
            review["review_version"],
            True,  # is_latest
            raw_id,
            review["business_id"],
            review["place_id"],
            review["text"],
            review["text_normalized"],
            review["rating"],
            review["review_time"],
            review["text_language"],
            "v5.1",  # taxonomy_version - will be updated by Stage 2
        )
        return enriched_id

    async def update_enriched_with_classification(
        self,
        classified: ClassifiedReview,
        model_version: str,
        taxonomy_version: str,
    ) -> None:
        """Update an enriched review with classification results.

        The target row is addressed by the ``source`` / ``review_id`` /
        ``review_version`` keys of ``classified``. Optional keys fall back to
        defaults: empty lists for ``urt_secondary``, ``staff_mentions`` and
        ``embedding``; ``"CR-N"`` for ``comparative``; ``0.5`` for
        ``trust_score``; empty JSON objects for ``quotes`` and
        ``classification_confidence``.

        Args:
            classified: Classification result for one review.
            model_version: Stored in ``classification_model``.
            taxonomy_version: Stored in ``taxonomy_version``.
        """
        query = """
            UPDATE reviews_enriched SET
                urt_primary = $1,
                urt_secondary = $2,
                valence = $3,
                intensity = $4,
                comparative = $5,
                staff_mentions = $6,
                quotes = $7,
                embedding = $8,
                trust_score = $9,
                classification_model = $10,
                classification_confidence = $11,
                taxonomy_version = $12,
                processed_at = NOW()
            WHERE source = $13
              AND review_id = $14
              AND review_version = $15
        """
        await self.db.execute(
            query,
            classified["urt_primary"],
            classified.get("urt_secondary", []),
            classified["valence"],
            classified["intensity"],
            classified.get("comparative", "CR-N"),
            classified.get("staff_mentions", []),
            json.dumps(classified.get("quotes", {})),
            classified.get("embedding", []),
            classified.get("trust_score", 0.5),
            model_version,
            json.dumps(classified.get("classification_confidence", {})),
            taxonomy_version,
            classified["source"],
            classified["review_id"],
            classified["review_version"],
        )

    async def get_unclassified_reviews(
        self,
        limit: int = 100,
    ) -> list[dict[str, Any]]:
        """Get reviews that haven't been classified yet.

        A review counts as unclassified when ``urt_primary`` is NULL; only
        rows flagged ``is_latest`` are returned, newest ``review_time`` first.

        Args:
            limit: Maximum number of rows to return.

        Returns:
            One plain dict per row.
        """
        query = """
            SELECT
                source, review_id, review_version, business_id, place_id,
                text, text_normalized, rating, review_time
            FROM reviews_enriched
            WHERE urt_primary IS NULL
              AND is_latest = TRUE
            ORDER BY review_time DESC
            LIMIT $1
        """
        rows = await self.db.fetch(query, limit)
        return [dict(r) for r in rows]

    async def get_review_by_id(
        self,
        source: str,
        review_id: str,
        review_version: int,
    ) -> dict[str, Any] | None:
        """Get a specific review by its composite key.

        Returns:
            The ``reviews_enriched`` row as a dict, or ``None`` when no row
            matches ``(source, review_id, review_version)``.
        """
        query = """
            SELECT * FROM reviews_enriched
            WHERE source = $1 AND review_id = $2 AND review_version = $3
        """
        row = await self.db.fetchrow(query, source, review_id, review_version)
        return dict(row) if row else None

    async def check_duplicate(
        self,
        content_hash: str,
        business_id: str,
    ) -> str | None:
        """Check if a content hash already exists, return dedup_group_id if so.

        NOTE: this is a placeholder. No query is issued and the result is
        always ``None`` (i.e. "not a duplicate"); both arguments are currently
        unused. A real implementation needs a dedicated dedup table with an
        index on the content hash.

        Args:
            content_hash: Hash of the normalized review content (unused).
            business_id: Business the review belongs to (unused).

        Returns:
            Always ``None``.
        """
        # The previous version assembled a SELECT here that was never executed
        # and did not filter on content_hash at all; it was removed so the
        # method no longer looks like it performs a lookup.
        return None
|
||||
|
||||
|
||||
class SpanRepository:
    """Data access for rows of the ``review_spans`` table."""

    def __init__(self, db: DatabasePool):
        self.db = db

    async def insert_span(
        self,
        span: ExtractedSpan,
        business_id: str,
        place_id: str,
        source: str,
        review_id: str,
        review_version: int,
        review_time: str,
        batch_id: str,
        model_version: str,
        taxonomy_version: str,
    ) -> None:
        """Write one extracted span to ``review_spans``.

        The span is stored as active. If a row with the same ``span_id``
        already exists, only its ``is_active`` flag is overwritten.

        Args:
            span: Extracted span mapping; optional keys fall back to the
                defaults visible in the parameter list below.
            business_id: Owning business.
            place_id: Owning place.
            source: Source of the parent review.
            review_id: ID of the parent review.
            review_version: Version of the parent review.
            review_time: Timestamp of the parent review.
            batch_id: Stored in ``ingest_batch_id``.
            model_version: Stored in ``model_version``.
            taxonomy_version: Stored in ``taxonomy_version``.
        """
        sql = """
            INSERT INTO review_spans (
                span_id, business_id, place_id, source, review_id, review_version,
                span_index, span_text, span_start, span_end,
                profile, urt_primary, urt_secondary, valence, intensity, comparative,
                specificity, actionability, temporal, evidence,
                entity, entity_type, entity_normalized,
                relation_type, related_span_id, causal_chain,
                is_primary, is_active, review_time,
                confidence, usn, taxonomy_version, model_version, ingest_batch_id
            ) VALUES (
                $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
                $21, $22, $23, $24, $25, $26, $27, $28, $29, $30,
                $31, $32, $33, $34
            )
            ON CONFLICT (span_id) DO UPDATE SET
                is_active = EXCLUDED.is_active
        """
        # A span may carry "related_span_index", but turning that index into a
        # span_id needs a mapping this method does not have, so the column is
        # always written as NULL.
        linked_span_id = None

        params = (
            span["span_id"],
            business_id,
            place_id,
            source,
            review_id,
            review_version,
            span["span_index"],
            span["span_text"],
            span["span_start"],
            span["span_end"],
            span.get("profile", "standard"),
            span["urt_primary"],
            span.get("urt_secondary", []),
            span["valence"],
            span["intensity"],
            span.get("comparative", "CR-N"),
            span.get("specificity"),
            span.get("actionability"),
            span.get("temporal"),
            span.get("evidence"),
            span.get("entity"),
            span.get("entity_type"),
            span.get("entity_normalized"),
            span.get("relation_type"),
            linked_span_id,
            # Serialize only a non-empty chain; otherwise store NULL.
            json.dumps(span.get("causal_chain")) if span.get("causal_chain") else None,
            span.get("is_primary", False),
            True,  # is_active
            review_time,
            span.get("confidence", "medium"),
            span["usn"],
            taxonomy_version,
            model_version,
            batch_id,
        )
        await self.db.execute(sql, *params)

    async def get_unrouted_negative_spans(
        self,
        limit: int = 100,
    ) -> list[dict[str, Any]]:
        """Fetch active negative/mixed spans that have no ``issue_spans`` row.

        Each span is joined to its parent review to pick up ``trust_score``.
        Results are ordered by ``review_time`` descending.

        Args:
            limit: Maximum number of spans to return.

        Returns:
            One plain dict per matching span.
        """
        sql = """
            SELECT
                rs.span_id, rs.business_id, rs.place_id,
                rs.urt_primary, rs.valence, rs.intensity,
                rs.entity_normalized, rs.review_time, rs.confidence,
                re.trust_score
            FROM review_spans rs
            JOIN reviews_enriched re ON (
                re.source = rs.source
                AND re.review_id = rs.review_id
                AND re.review_version = rs.review_version
            )
            WHERE rs.is_active = TRUE
              AND rs.valence IN ('V-', 'V±')
              AND NOT EXISTS (
                  SELECT 1 FROM issue_spans iss WHERE iss.span_id = rs.span_id
              )
            ORDER BY rs.review_time DESC
            LIMIT $1
        """
        records = await self.db.fetch(sql, limit)
        return list(map(dict, records))

    async def get_span_by_id(self, span_id: str) -> dict[str, Any] | None:
        """Return the span row with the given ID as a dict, or ``None``."""
        record = await self.db.fetchrow(
            "SELECT * FROM review_spans WHERE span_id = $1",
            span_id,
        )
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
class IssueRepository:
    """Data access for ``issues``, ``issue_spans`` and ``issue_events``."""

    def __init__(self, db: DatabasePool):
        self.db = db

    async def upsert_issue(
        self,
        issue_id: str,
        business_id: str,
        place_id: str,
        primary_subcode: str,
        intensity: str,
        entity: str | None,
        entity_normalized: str | None,
        taxonomy_version: str,
    ) -> bool:
        """Create the issue if it is missing, otherwise bump its counters.

        For an existing issue, ``span_count`` is incremented and
        ``max_intensity`` is raised to ``intensity`` when that is higher
        (``I3`` always wins; ``I2`` wins unless the stored value is ``I3``).
        A new issue starts in state ``"open"`` with priority and confidence
        scores of 1.0 and a span count of 1.

        Returns:
            ``True`` when a new issue row was inserted, ``False`` when an
            existing one was updated.
        """
        # NOTE(review): the existence check and the write are two separate
        # statements; presumably callers do not race on the same issue_id.
        found = await self.db.fetchval(
            "SELECT 1 FROM issues WHERE issue_id = $1",
            issue_id,
        )

        if not found:
            # The domain is the first character of the subcode; "O" is the
            # fallback when no subcode is given.
            domain_code = (primary_subcode or "O")[0]
            insert_sql = """
                INSERT INTO issues (
                    issue_id, business_id, place_id, primary_subcode, domain,
                    state, priority_score, confidence_score, span_count, max_intensity,
                    entity, entity_normalized, taxonomy_version
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
            """
            await self.db.execute(
                insert_sql,
                issue_id,
                business_id,
                place_id,
                primary_subcode,
                domain_code,
                "open",
                1.0,  # Initial priority
                1.0,  # Initial confidence
                1,  # Initial span count
                intensity,
                entity,
                entity_normalized,
                taxonomy_version,
            )
            return True

        update_sql = """
            UPDATE issues SET
                span_count = span_count + 1,
                max_intensity = CASE
                    WHEN $1 = 'I3' THEN 'I3'
                    WHEN $1 = 'I2' AND max_intensity != 'I3' THEN 'I2'
                    ELSE max_intensity
                END,
                updated_at = NOW()
            WHERE issue_id = $2
        """
        await self.db.execute(update_sql, intensity, issue_id)
        return False

    async def link_span_to_issue(
        self,
        routed: RoutedSpan,
        source: str,
        review_id: str,
        review_version: int,
        intensity: str,
        review_time: str,
        is_primary_match: bool = True,
    ) -> None:
        """Record that a span belongs to an issue.

        The issue and span IDs come from ``routed``. A span that already has
        an ``issue_spans`` row is left untouched.
        """
        link_sql = """
            INSERT INTO issue_spans (
                issue_id, span_id, source, review_id, review_version,
                is_primary_match, intensity, review_time
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            ON CONFLICT (span_id) DO NOTHING
        """
        link_params = (
            routed["issue_id"],
            routed["span_id"],
            source,
            review_id,
            review_version,
            is_primary_match,
            intensity,
            review_time,
        )
        await self.db.execute(link_sql, *link_params)

    async def log_event(
        self,
        issue_id: str,
        event_type: str,
        span_id: str | None = None,
        old_value: str | None = None,
        new_value: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Append an audit-trail row to ``issue_events``.

        ``metadata`` is stored as a JSON string; an empty or missing mapping
        is stored as NULL.
        """
        if metadata:
            metadata_json = json.dumps(metadata)
        else:
            metadata_json = None

        event_sql = """
            INSERT INTO issue_events (
                issue_id, event_type, span_id, old_value, new_value, metadata
            ) VALUES ($1, $2, $3, $4, $5, $6)
        """
        await self.db.execute(
            event_sql,
            issue_id,
            event_type,
            span_id,
            old_value,
            new_value,
            metadata_json,
        )

    async def get_issue_by_id(self, issue_id: str) -> dict[str, Any] | None:
        """Return the issue row with the given ID as a dict, or ``None``."""
        record = await self.db.fetchrow(
            "SELECT * FROM issues WHERE issue_id = $1",
            issue_id,
        )
        if not record:
            return None
        return dict(record)

    async def check_span_already_linked(self, span_id: str) -> str | None:
        """Return the ``issue_id`` the span is linked to, if any."""
        linked_issue_id = await self.db.fetchval(
            "SELECT issue_id FROM issue_spans WHERE span_id = $1",
            span_id,
        )
        return linked_issue_id
|
||||
|
||||
|
||||
class FactRepository:
    """Data access for the ``fact_timeseries`` table and its source rows."""

    def __init__(self, db: DatabasePool):
        self.db = db

    async def upsert_fact(self, fact: FactRecord) -> None:
        """Write one fact row, overwriting the metrics of an existing row.

        Rows are keyed by ``(business_id, place_id, period_date, bucket_type,
        subject_type, subject_id, taxonomy_version)``. On a key conflict every
        metric column is replaced with the new value and ``computed_at`` is
        reset to the current time.

        Args:
            fact: Fact mapping. Every column key is required except
                ``avg_rating``, which is stored as NULL when absent.
        """
        upsert_sql = """
            INSERT INTO fact_timeseries (
                business_id, place_id, period_date, bucket_type,
                subject_type, subject_id, taxonomy_version,
                review_count, span_count, negative_count, positive_count,
                neutral_count, mixed_count, strength_score, negative_strength,
                positive_strength, avg_rating, i1_count, i2_count, i3_count,
                cr_better, cr_worse, cr_same,
                trust_weighted_strength, trust_weighted_negative,
                computed_at
            ) VALUES (
                $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
                $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, NOW()
            )
            ON CONFLICT (
                business_id, place_id, period_date, bucket_type,
                subject_type, subject_id, taxonomy_version
            ) DO UPDATE SET
                review_count = EXCLUDED.review_count,
                span_count = EXCLUDED.span_count,
                negative_count = EXCLUDED.negative_count,
                positive_count = EXCLUDED.positive_count,
                neutral_count = EXCLUDED.neutral_count,
                mixed_count = EXCLUDED.mixed_count,
                strength_score = EXCLUDED.strength_score,
                negative_strength = EXCLUDED.negative_strength,
                positive_strength = EXCLUDED.positive_strength,
                avg_rating = EXCLUDED.avg_rating,
                i1_count = EXCLUDED.i1_count,
                i2_count = EXCLUDED.i2_count,
                i3_count = EXCLUDED.i3_count,
                cr_better = EXCLUDED.cr_better,
                cr_worse = EXCLUDED.cr_worse,
                cr_same = EXCLUDED.cr_same,
                trust_weighted_strength = EXCLUDED.trust_weighted_strength,
                trust_weighted_negative = EXCLUDED.trust_weighted_negative,
                computed_at = NOW()
        """
        # Required keys that bind to $1..$16, in placeholder order.
        keys_before_rating = (
            "business_id",
            "place_id",
            "period_date",
            "bucket_type",
            "subject_type",
            "subject_id",
            "taxonomy_version",
            "review_count",
            "span_count",
            "negative_count",
            "positive_count",
            "neutral_count",
            "mixed_count",
            "strength_score",
            "negative_strength",
            "positive_strength",
        )
        # Required keys that bind to $18..$25, in placeholder order.
        keys_after_rating = (
            "i1_count",
            "i2_count",
            "i3_count",
            "cr_better",
            "cr_worse",
            "cr_same",
            "trust_weighted_strength",
            "trust_weighted_negative",
        )

        values = [fact[key] for key in keys_before_rating]
        values.append(fact.get("avg_rating"))  # $17, the only optional key
        values.extend(fact[key] for key in keys_after_rating)

        await self.db.execute(upsert_sql, *values)

    async def get_aggregation_data(
        self,
        business_id: str,
        start_date: date,
        end_date: date,
    ) -> list[dict[str, Any]]:
        """Fetch active spans of a business whose review date is in range.

        Each span is joined to its parent review for ``trust_score`` and
        ``rating``. Both date bounds are inclusive.

        Args:
            business_id: Business to aggregate.
            start_date: First review date to include.
            end_date: Last review date to include.

        Returns:
            One plain dict per span.
        """
        sql = """
            SELECT
                rs.business_id,
                rs.place_id,
                DATE(rs.review_time) as review_date,
                rs.urt_primary,
                rs.valence,
                rs.intensity,
                rs.comparative,
                re.trust_score,
                re.rating
            FROM review_spans rs
            JOIN reviews_enriched re ON (
                re.source = rs.source
                AND re.review_id = rs.review_id
                AND re.review_version = rs.review_version
            )
            WHERE rs.business_id = $1
              AND rs.is_active = TRUE
              AND DATE(rs.review_time) BETWEEN $2 AND $3
        """
        records = await self.db.fetch(sql, business_id, start_date, end_date)
        return list(map(dict, records))

    async def get_place_ids_for_business(
        self,
        business_id: str,
    ) -> list[str]:
        """Return the distinct ``place_id`` values seen for a business."""
        sql = """
            SELECT DISTINCT place_id FROM reviews_enriched
            WHERE business_id = $1
        """
        records = await self.db.fetch(sql, business_id)
        place_ids = []
        for record in records:
            place_ids.append(record["place_id"])
        return place_ids
|
||||
402
packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
Normal file
402
packages/reviewiq-pipeline/src/reviewiq_pipeline/pipeline.py
Normal file
@@ -0,0 +1,402 @@
|
||||
"""
|
||||
Pipeline class - main public API for the ReviewIQ pipeline.
|
||||
|
||||
Provides a unified interface for running pipeline stages.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import date
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.contracts import (
|
||||
ClassificationConfig,
|
||||
NormalizedReview,
|
||||
ReviewToClassify,
|
||||
ScraperOutput,
|
||||
SpanToRoute,
|
||||
Stage1Input,
|
||||
Stage1Output,
|
||||
Stage2Input,
|
||||
Stage2Output,
|
||||
Stage3Input,
|
||||
Stage3Output,
|
||||
Stage4Input,
|
||||
Stage4Output,
|
||||
ValidationResult,
|
||||
)
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
from reviewiq_pipeline.db.repositories import (
|
||||
FactRepository,
|
||||
IssueRepository,
|
||||
ReviewRepository,
|
||||
SpanRepository,
|
||||
)
|
||||
from reviewiq_pipeline.services.embeddings import EmbeddingService
|
||||
from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
|
||||
from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
|
||||
from reviewiq_pipeline.stages.stage3_route import Stage3Router
|
||||
from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
|
||||
from reviewiq_pipeline.validation.validators import (
|
||||
validate_stage1_output,
|
||||
validate_stage2_output,
|
||||
validate_stage3_output,
|
||||
validate_stage4_output,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PipelineResult:
    """Container for the outputs of one pipeline run.

    Holds the output of each stage that ran (``None`` for stages that did
    not) plus the per-stage validation results keyed by stage name.
    """

    def __init__(
        self,
        stage1: Stage1Output | None = None,
        stage2: Stage2Output | None = None,
        stage3: Stage3Output | None = None,
        stage4: Stage4Output | None = None,
        validation: dict[str, ValidationResult] | None = None,
    ):
        self.stage1 = stage1
        self.stage2 = stage2
        self.stage3 = stage3
        self.stage4 = stage4
        # A missing (or empty) mapping becomes a fresh dict per instance.
        self.validation = validation if validation else {}

    @property
    def success(self) -> bool:
        """Whether every recorded validation result has ``passed`` set.

        With no validation results recorded this is ``True``.
        """
        for outcome in self.validation.values():
            if not outcome["passed"]:
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return the stage outputs, validation results and success flag."""
        as_dict: dict[str, Any] = {
            field: getattr(self, field)
            for field in ("stage1", "stage2", "stage3", "stage4", "validation")
        }
        as_dict["success"] = self.success
        return as_dict
|
||||
|
||||
|
||||
class Pipeline:
    """
    Main pipeline class for processing reviews.

    Usage:
        config = Config(database_url="...", llm_provider="openai", ...)
        pipeline = Pipeline(config)

        # Run full pipeline
        result = await pipeline.process(scraper_output)

        # Or run individual stages
        stage1_result = await pipeline.normalize(scraper_output)
        stage2_result = await pipeline.classify(stage1_result)
    """

    def __init__(self, config: Config):
        """
        Initialize the pipeline.

        No connection is opened here; resources are created lazily by
        ``initialize()`` (or ``migrate()`` for the database pool).

        Args:
            config: Pipeline configuration
        """
        self.config = config
        self._db: DatabasePool | None = None
        self._review_repo: ReviewRepository | None = None
        self._span_repo: SpanRepository | None = None
        self._issue_repo: IssueRepository | None = None
        self._fact_repo: FactRepository | None = None
        self._embedding_service: EmbeddingService | None = None
        self._initialized = False

    async def initialize(self) -> None:
        """Initialize database connections and services.

        Safe to call repeatedly: once initialized, further calls return
        immediately until ``close()`` is called.
        """
        if self._initialized:
            return

        logger.info("Initializing pipeline...")

        # Reuse a pool that migrate() may already have opened. Creating a
        # second pool here would orphan the first one, which close() could
        # then never reach.
        if self._db is None:
            db = DatabasePool(self.config)
            await db.initialize()
            self._db = db

        # Initialize repositories
        self._review_repo = ReviewRepository(self._db)
        self._span_repo = SpanRepository(self._db)
        self._issue_repo = IssueRepository(self._db)
        self._fact_repo = FactRepository(self._db)

        # Initialize embedding service
        self._embedding_service = EmbeddingService(self.config)

        self._initialized = True
        logger.info("Pipeline initialized")

    async def close(self) -> None:
        """Close all connections and cleanup resources."""
        if self._db:
            await self._db.close()
            self._db = None

        self._initialized = False
        logger.info("Pipeline closed")

    async def migrate(self) -> int:
        """
        Run database migrations.

        Opens the database pool if none is open yet; the pool is kept for
        later use by ``initialize()``.

        Returns:
            Number of migrations run
        """
        if not self._db:
            # Publish the pool only after it initialized successfully, so a
            # failed attempt does not leave a half-initialized pool behind.
            db = DatabasePool(self.config)
            await db.initialize()
            self._db = db

        return await self._db.run_migrations()

    async def process(
        self,
        scraper_output: ScraperOutput,
        stages: list[int] | None = None,
        validate: bool = True,
    ) -> PipelineResult:
        """
        Run the full pipeline on scraper output.

        Stage 2 needs the Stage 1 output and Stage 3 needs the Stage 2
        output; a requested stage whose input is missing is skipped with a
        warning. Stage 4 always aggregates for today's date.

        Args:
            scraper_output: Output from the scraper (Stage 0)
            stages: List of stages to run (default: all [1, 2, 3, 4]);
                an empty list is treated like the default
            validate: Whether to validate each stage output

        Returns:
            PipelineResult with all stage outputs and validation results
        """
        await self.initialize()

        stages = stages or [1, 2, 3, 4]
        result = PipelineResult()
        validation_results: dict[str, ValidationResult] = {}

        # Stage 1: Normalize
        if 1 in stages:
            logger.info("Running Stage 1: Normalization")
            result.stage1 = await self.normalize(scraper_output)

            if validate:
                validation_results["stage1"] = validate_stage1_output(result.stage1)

        # Stage 2: Classify
        if 2 in stages:
            if result.stage1:
                logger.info("Running Stage 2: Classification")
                result.stage2 = await self.classify(result.stage1)

                if validate:
                    # Build input reviews map for validation
                    input_reviews = {
                        (r["source"], r["review_id"], r["review_version"]): r
                        for r in result.stage1["reviews_normalized"]
                    }
                    validation_results["stage2"] = validate_stage2_output(
                        result.stage2, input_reviews
                    )
            else:
                # Previously skipped without any trace in the logs.
                logger.warning("Skipping Stage 2: no Stage 1 output available")

        # Stage 3: Route
        if 3 in stages:
            if result.stage2:
                logger.info("Running Stage 3: Issue Routing")
                result.stage3 = await self.route(result.stage2)

                if validate:
                    validation_results["stage3"] = await validate_stage3_output(
                        result.stage3, self._db
                    )
            else:
                # Previously skipped without any trace in the logs.
                logger.warning("Skipping Stage 3: no Stage 2 output available")

        # Stage 4: Aggregate
        if 4 in stages:
            logger.info("Running Stage 4: Aggregation")
            result.stage4 = await self.aggregate(
                scraper_output["business_id"],
                date.today().isoformat(),
            )

            if validate:
                validation_results["stage4"] = validate_stage4_output(result.stage4)

        result.validation = validation_results
        return result

    async def normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
        """
        Run Stage 1: Normalization.

        Args:
            scraper_output: Raw scraper output

        Returns:
            Stage1Output with normalized reviews
        """
        await self.initialize()

        stage1 = Stage1Normalizer(
            self.config,
            self._db,
            self._review_repo,
        )

        input_data = Stage1Input(
            job_id=scraper_output["job_id"],
            business_id=scraper_output["business_id"],
            place_id=scraper_output["place_id"],
            reviews=scraper_output["reviews"],
        )

        return await stage1.process(input_data)

    async def classify(self, stage1_output: Stage1Output) -> Stage2Output:
        """
        Run Stage 2: Classification.

        The classifier is closed once processing finishes, whether it
        succeeded or raised.

        Args:
            stage1_output: Output from Stage 1

        Returns:
            Stage2Output with classified reviews
        """
        await self.initialize()

        stage2 = Stage2Classifier(
            self.config,
            self._db,
            self._review_repo,
            self._span_repo,
            self._embedding_service,
        )

        # Convert normalized reviews to classification input
        reviews_to_classify = [
            ReviewToClassify(
                source=r["source"],
                review_id=r["review_id"],
                review_version=r["review_version"],
                business_id=r["business_id"],
                place_id=r["place_id"],
                text=r["text"],
                text_normalized=r["text_normalized"],
                rating=r["rating"],
                review_time=r["review_time"],
            )
            for r in stage1_output["reviews_normalized"]
        ]

        input_data = Stage2Input(
            reviews=reviews_to_classify,
            config=ClassificationConfig(
                model=self.config.llm_model,
                taxonomy_version=self.config.taxonomy_version,
                profile=self.config.classification_profile,
                max_spans_per_review=self.config.max_spans_per_review,
            ),
        )

        try:
            return await stage2.process(input_data)
        finally:
            await stage2.close()

    async def route(self, stage2_output: Stage2Output) -> Stage3Output:
        """
        Run Stage 3: Issue Routing.

        Only spans with valence ``V-`` or ``V±`` are handed to the router.

        Args:
            stage2_output: Output from Stage 2

        Returns:
            Stage3Output with routing results
        """
        await self.initialize()

        stage3 = Stage3Router(
            self.config,
            self._db,
            self._span_repo,
            self._issue_repo,
        )

        # Extract negative/mixed spans for routing
        spans_to_route = []
        for review in stage2_output["reviews_classified"]:
            for span in review.get("spans", []):
                if span["valence"] in ("V-", "V±"):
                    spans_to_route.append(
                        SpanToRoute(
                            span_id=span["span_id"],
                            business_id=review.get("business_id", ""),
                            place_id=review.get("place_id", ""),
                            urt_primary=span["urt_primary"],
                            valence=span["valence"],
                            intensity=span["intensity"],
                            entity_normalized=span.get("entity_normalized"),
                            review_time=review.get("review_time", ""),
                            confidence=span.get("confidence", "medium"),
                            trust_score=review.get("trust_score", 0.5),
                        )
                    )

        return await stage3.process(Stage3Input(spans=spans_to_route))

    async def aggregate(
        self,
        business_id: str,
        date_str: str,
        bucket_types: list[str] | None = None,
    ) -> Stage4Output:
        """
        Run Stage 4: Fact Aggregation.

        Args:
            business_id: Business identifier
            date_str: Date string (YYYY-MM-DD)
            bucket_types: List of bucket types (default: ['day'])

        Returns:
            Stage4Output with aggregated facts
        """
        await self.initialize()

        stage4 = Stage4Aggregator(
            self.config,
            self._db,
            self._fact_repo,
        )

        input_data = Stage4Input(
            business_id=business_id,
            date=date_str,
            bucket_types=bucket_types or ["day"],  # type: ignore
            taxonomy_version=self.config.taxonomy_version,
        )

        return await stage4.process(input_data)

    async def validate(self, job_id: str) -> dict[str, ValidationResult]:
        """
        Validate pipeline output for a job.

        Not implemented yet: logs a warning and returns an empty dict.

        Args:
            job_id: Job identifier

        Returns:
            Dictionary of validation results by stage
        """
        # This would query the database for the job's output and validate
        # For now, return empty results
        logger.warning(f"validate() for job {job_id} not fully implemented")
        return {}
|
||||
@@ -0,0 +1,11 @@
|
||||
"""Services for pipeline operations."""
|
||||
|
||||
from reviewiq_pipeline.services.embeddings import EmbeddingService
|
||||
from reviewiq_pipeline.services.llm_client import LLMClient
|
||||
from reviewiq_pipeline.services.text_processor import TextProcessor
|
||||
|
||||
__all__ = [
|
||||
"LLMClient",
|
||||
"EmbeddingService",
|
||||
"TextProcessor",
|
||||
]
|
||||
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
Embedding service for generating text embeddings.
|
||||
|
||||
Uses sentence-transformers with the all-MiniLM-L6-v2 model (384 dimensions).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.config import Config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EmbeddingService:
|
||||
"""
|
||||
Service for generating text embeddings using sentence-transformers.
|
||||
|
||||
Uses the all-MiniLM-L6-v2 model by default, which produces 384-dimensional
|
||||
embeddings suitable for semantic similarity and clustering.
|
||||
"""
|
||||
|
||||
def __init__(self, config: Config):
|
||||
"""
|
||||
Initialize the embedding service.
|
||||
|
||||
Args:
|
||||
config: Pipeline configuration with embedding model settings
|
||||
"""
|
||||
self.config = config
|
||||
self.model_name = config.embedding_model
|
||||
self.dimension = config.embedding_dimension
|
||||
self._model = None
|
||||
self._initialized = False
|
||||
|
||||
def _ensure_initialized(self) -> None:
|
||||
"""Lazy initialization of the sentence-transformers model."""
|
||||
if self._initialized:
|
||||
return
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
logger.info(f"Loading embedding model: {self.model_name}")
|
||||
self._model = SentenceTransformer(self.model_name)
|
||||
self._initialized = True
|
||||
logger.info(f"Embedding model loaded. Dimension: {self._model.get_sentence_embedding_dimension()}")
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"sentence-transformers is required for embeddings. "
|
||||
"Install with: pip install sentence-transformers"
|
||||
)
|
||||
|
||||
def embed(self, text: str) -> list[float]:
    """
    Generate embedding for a single text.

    Empty or whitespace-only text yields a zero vector of length
    ``self.dimension`` without loading the model.

    Args:
        text: Text to embed

    Returns:
        List of floats representing the embedding vector
    """
    # Answer the trivial case first: a zero vector does not need the model,
    # so empty input must not trigger the lazy model load (or fail when
    # sentence-transformers is not installed).
    if not text or not text.strip():
        return [0.0] * self.dimension

    self._ensure_initialized()

    embedding = self._model.encode(text, convert_to_numpy=True)
    return embedding.tolist()
|
||||
|
||||
def embed_batch(self, texts: list[str]) -> list[list[float]]:
    """
    Generate embeddings for multiple texts.

    More efficient than calling embed() repeatedly. Empty or
    whitespace-only entries get a zero vector of length ``self.dimension``;
    the model is only loaded when at least one entry has content.

    Args:
        texts: List of texts to embed

    Returns:
        List of embedding vectors, one per input text and in input order
    """
    if not texts:
        return []

    # Handle empty strings
    non_empty_indices = [i for i, t in enumerate(texts) if t and t.strip()]

    # Start from zero vectors; slots of non-empty texts are overwritten below.
    result = [[0.0] * self.dimension for _ in texts]
    if not non_empty_indices:
        # Nothing to encode, so the model is not needed at all.
        return result

    self._ensure_initialized()

    # Batch encode
    non_empty_texts = [texts[i] for i in non_empty_indices]
    embeddings = self._model.encode(non_empty_texts, convert_to_numpy=True)

    for idx, emb in zip(non_empty_indices, embeddings):
        result[idx] = emb.tolist()

    return result
|
||||
|
||||
def similarity(self, embedding1: list[float], embedding2: list[float]) -> float:
    """
    Cosine similarity of two embedding vectors.

    If either vector has zero length the similarity is defined as 0.0
    rather than dividing by zero.

    Args:
        embedding1: First embedding vector
        embedding2: Second embedding vector

    Returns:
        Cosine similarity score between -1 and 1
    """
    left = np.array(embedding1)
    right = np.array(embedding2)

    left_len = np.linalg.norm(left)
    right_len = np.linalg.norm(right)

    # Only divide when both vectors have a non-zero norm
    if left_len != 0 and right_len != 0:
        return float(np.dot(left, right) / (left_len * right_len))

    return 0.0
|
||||
|
||||
def find_similar(
    self,
    query_embedding: list[float],
    candidate_embeddings: list[list[float]],
    top_k: int = 5,
    threshold: float = 0.0,
) -> list[tuple[int, float]]:
    """
    Find most similar embeddings to a query.

    Cosine similarity is computed against every candidate in one
    vectorized pass. Candidates with a zero norm get similarity 0.0
    (they are still returned if ``threshold`` allows it).

    Args:
        query_embedding: Query embedding vector
        candidate_embeddings: List of candidate embeddings
        top_k: Maximum number of results to return; values <= 0 yield
            an empty list
        threshold: Minimum similarity (inclusive) for a candidate to be
            returned

    Returns:
        List of (index, similarity) tuples, sorted by similarity descending.
        Empty when there are no candidates, when ``top_k`` is not
        positive, or when the query is a zero vector.
    """
    # A non-positive top_k must not reach the slice below: a negative
    # value would slice from the end and return nearly everything.
    if not candidate_embeddings or top_k <= 0:
        return []

    query = np.asarray(query_embedding, dtype=float)
    candidates = np.asarray(candidate_embeddings, dtype=float)

    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        return []

    candidate_norms = np.linalg.norm(candidates, axis=1)

    # Avoid division by zero: zero-norm candidates keep similarity 0.0
    valid_mask = candidate_norms > 0
    similarities = np.zeros(len(candidates))
    similarities[valid_mask] = (
        np.dot(candidates[valid_mask], query)
        / (candidate_norms[valid_mask] * query_norm)
    )

    # Filter by threshold, then rank (stable sort keeps index order on ties)
    results = [
        (i, float(sim))
        for i, sim in enumerate(similarities)
        if sim >= threshold
    ]
    results.sort(key=lambda pair: pair[1], reverse=True)

    return results[:top_k]
|
||||
|
||||
@property
def model(self):
    """Return the wrapped sentence-transformers model, loading it on first use."""
    self._ensure_initialized()
    loaded = self._model
    return loaded
|
||||
|
||||
|
||||
def normalize_embedding(embedding: list[float]) -> list[float]:
    """
    Scale an embedding so that its Euclidean norm is 1.

    A zero vector cannot be scaled and is handed back unchanged
    (the very same object that was passed in).

    Args:
        embedding: Embedding vector

    Returns:
        Unit-normalized embedding
    """
    values = np.array(embedding)
    magnitude = np.linalg.norm(values)

    if magnitude != 0:
        return (values / magnitude).tolist()

    return embedding
|
||||
|
||||
|
||||
def average_embeddings(embeddings: list[list[float]]) -> list[float]:
    """
    Element-wise mean of a collection of embeddings.

    Handy for building a centroid vector for a cluster.

    Args:
        embeddings: List of embedding vectors

    Returns:
        Averaged embedding vector

    Raises:
        ValueError: If ``embeddings`` is empty
    """
    if not embeddings:
        raise ValueError("Cannot average empty embedding list")

    stacked = np.array(embeddings)
    centroid = np.mean(stacked, axis=0)
    return centroid.tolist()
|
||||
@@ -0,0 +1,432 @@
|
||||
"""
|
||||
LLM client abstraction supporting OpenAI and Anthropic.
|
||||
|
||||
Provides a unified interface for classification requests with:
|
||||
- Provider abstraction (OpenAI/Anthropic)
|
||||
- Structured output (JSON mode)
|
||||
- Retry handling
|
||||
- Cost tracking
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.contracts import LLMClassificationResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# System prompt for URT classification
|
||||
SYSTEM_PROMPT = """You are a review classification system using URT (Universal Review Taxonomy) v5.1.
|
||||
|
||||
Your task is to extract semantic spans from customer reviews and classify each span independently.
|
||||
|
||||
## SPAN EXTRACTION RULES
|
||||
|
||||
1. **Split on contrasting conjunctions**: but, however, although, despite, yet, though
|
||||
2. **Split on topic/target change**: food → service → bathroom = 3 spans
|
||||
3. **Split on valence change**: positive → negative = split
|
||||
4. **Split on domain change**: O (Offering) → J (Journey) → E (Environment) = split
|
||||
5. **Keep together**: cause→effect within same feedback unit ("X because Y" = 1 span)
|
||||
|
||||
**Guardrails**:
|
||||
- Max 3 spans per sentence (if 4+, re-check for over-splitting)
|
||||
- Min 1 span per review (even single-word reviews)
|
||||
- Spans must be non-overlapping and cover meaningful content
|
||||
|
||||
## URT DOMAINS (Tier-3 codes: X#.##)
|
||||
|
||||
| Domain | Code | Description |
|
||||
|--------|------|-------------|
|
||||
| Offering | O1-O4 | Product/service quality, features, variety |
|
||||
| Price | P1-P4 | Value, pricing, promotions, payment |
|
||||
| Journey | J1-J4 | Timing, process, convenience, accessibility |
|
||||
| Environment | E1-E4 | Physical space, ambiance, cleanliness, digital UX |
|
||||
| Attitude | A1-A4 | Staff behavior, helpfulness, professionalism |
|
||||
| Voice | V1-V4 | Brand, communication, marketing, transparency |
|
||||
| Relationship | R1-R4 | Loyalty, trust, consistency, personalization |
|
||||
|
||||
## DIMENSION CODES
|
||||
|
||||
### Valence
|
||||
- V+ : Positive sentiment
|
||||
- V- : Negative sentiment
|
||||
- V0 : Neutral/factual
|
||||
- V± : Mixed within the span
|
||||
|
||||
### Intensity
|
||||
- I1 : Low ("okay", "fine", "decent")
|
||||
- I2 : Moderate ("good", "bad", "slow")
|
||||
- I3 : High ("amazing", "terrible", "unacceptable")
|
||||
|
||||
### Specificity
|
||||
- S1 : Vague ("it was bad")
|
||||
- S2 : Some detail ("the food was cold")
|
||||
- S3 : Precise ("waited 45 minutes for appetizers")
|
||||
|
||||
### Actionability
|
||||
- A1 : No clear action possible
|
||||
- A2 : Possible actions, unclear which
|
||||
- A3 : Clear, specific action ("train staff on X", "fix Y")
|
||||
|
||||
### Temporal
|
||||
- TC : Current visit (default when no markers)
|
||||
- TR : Recent pattern ("lately", "recently", "again")
|
||||
- TH : Historical ("for years", "always", "used to")
|
||||
- TF : Future ("won't return", "next time", "I expect")
|
||||
|
||||
### Evidence
|
||||
- ES : Stated explicitly in text (default)
|
||||
- EI : Inferred logically (not stated, but entailed)
|
||||
- EC : Contextual (depends on surrounding text)
|
||||
|
||||
### Comparative
|
||||
- CR-N : No comparison (default)
|
||||
- CR-B : Better than alternatives
|
||||
- CR-W : Worse than alternatives
|
||||
- CR-S : Same as alternatives
|
||||
|
||||
## PRIMARY SPAN SELECTION
|
||||
|
||||
Mark exactly ONE span as is_primary=true using this order:
|
||||
1. Highest intensity (I3 > I2 > I1)
|
||||
2. Tie-break: negative over positive (V- > V± > V0 > V+)
|
||||
3. Tie-break: earliest span_index
|
||||
|
||||
## USN (URT String Notation)
|
||||
|
||||
Generate a USN string for each span:
|
||||
```
|
||||
URT:S:{primary}[+{sec1}][+{sec2}]:{valence_sign}{intensity_num}:{S#}{A#}{temporal}.{evidence}.{CR_suffix}
|
||||
```
|
||||
|
||||
Examples:
|
||||
- `URT:S:J1.03:-2:22TC.ES.N` (J1.03, V-, I2, S2, A2, TC, ES, CR-N)
|
||||
- `URT:S:P1.01+O2.03:+3:33TR.ES.B` (P1.01 primary, O2.03 secondary, V+, I3, S3, A3, TR, ES, CR-B)
|
||||
|
||||
Valence encoding: + for V+, - for V-, 0 for V0, ± for V±
|
||||
CR suffix: N=CR-N, B=CR-B, W=CR-W, S=CR-S
|
||||
|
||||
## OUTPUT FORMAT
|
||||
|
||||
Return valid JSON matching this schema. No markdown, no explanations.
|
||||
|
||||
{
|
||||
"spans": [
|
||||
{
|
||||
"span_index": 0,
|
||||
"span_text": "exact text from review",
|
||||
"span_start": 0,
|
||||
"span_end": 25,
|
||||
"urt_primary": "O1.01",
|
||||
"urt_secondary": [],
|
||||
"valence": "V+",
|
||||
"intensity": "I2",
|
||||
"specificity": "S2",
|
||||
"actionability": "A1",
|
||||
"temporal": "TC",
|
||||
"evidence": "ES",
|
||||
"comparative": "CR-N",
|
||||
"is_primary": true,
|
||||
"confidence": "high",
|
||||
"entity": null,
|
||||
"entity_type": null,
|
||||
"relation_type": null,
|
||||
"related_span_index": null,
|
||||
"usn": "URT:S:O1.01:+2:21TC.ES.N"
|
||||
}
|
||||
],
|
||||
"review_summary": {
|
||||
"dominant_valence": "V+",
|
||||
"dominant_domain": "O",
|
||||
"span_count": 1,
|
||||
"has_comparative": false,
|
||||
"has_entity": false
|
||||
}
|
||||
}"""
|
||||
|
||||
|
||||
class LLMClientBase(ABC):
    """Common interface and usage counters shared by all LLM provider clients."""

    def __init__(self, config: Config):
        # Running totals; concrete clients add to these after each request
        self.total_cost_usd = 0.0
        self.total_tokens_used = 0
        self.config = config

    @abstractmethod
    async def classify(
        self,
        review_text: str,
        profile: str = "standard",
    ) -> tuple[LLMClassificationResponse, dict[str, Any]]:
        """
        Run span extraction and classification on one review.

        Args:
            review_text: The review text to classify
            profile: Classification profile (lite/core/standard/full)

        Returns:
            A pair: the classification response, and a metadata dict
            carrying token counts and cost for the request
        """
        ...

    @abstractmethod
    async def close(self) -> None:
        """Release any resources held by the client."""
        ...
|
||||
|
||||
|
||||
class OpenAIClient(LLMClientBase):
    """
    OpenAI LLM client implementation.

    Sends the review to the chat completions API in JSON mode, parses the
    JSON body, and tracks token usage and estimated cost per request and
    cumulatively on the instance.
    """

    # Pricing per 1M tokens (as of 2024)
    PRICING = {
        "gpt-4o": {"input": 5.0, "output": 15.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "gpt-3.5-turbo": {"input": 0.50, "output": 1.50},
    }

    def __init__(self, config: Config):
        super().__init__(config)
        # Imported here rather than at module level, so the openai package
        # is only needed when this client is actually constructed.
        from openai import AsyncOpenAI

        self.client = AsyncOpenAI(api_key=config.get_llm_api_key())
        self.model = config.llm_model

    async def classify(
        self,
        review_text: str,
        profile: str = "standard",
    ) -> tuple[LLMClassificationResponse, dict[str, Any]]:
        """
        Classify using OpenAI.

        Args:
            review_text: The review text to classify
            profile: Classification profile; accepted for interface
                compatibility but not used by this implementation

        Returns:
            Tuple of (parsed JSON response, metadata dict with model,
            input/output/total tokens, cost_usd and latency_ms)

        Raises:
            ValueError: If the API returns no choices or empty content
            json.JSONDecodeError: If the returned content is not valid JSON
        """
        # perf_counter is monotonic, so the latency cannot go negative or
        # jump when the system clock is adjusted mid-request.
        start_time = time.perf_counter()

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f'Classify this review:\n\n"{review_text}"',
            },
        ]

        response = await self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.config.llm_temperature,
            response_format={"type": "json_object"},
            timeout=self.config.llm_timeout_seconds,
        )

        # Parse response; an empty choices list is reported the same way as
        # empty content instead of surfacing as an IndexError.
        if not response.choices:
            raise ValueError("Empty response from OpenAI")
        content = response.choices[0].message.content
        if not content:
            raise ValueError("Empty response from OpenAI")

        result = json.loads(content)

        # Calculate costs (usage may be absent on the response)
        input_tokens = response.usage.prompt_tokens if response.usage else 0
        output_tokens = response.usage.completion_tokens if response.usage else 0
        total_tokens = input_tokens + output_tokens

        # Models missing from PRICING are costed at the gpt-4o-mini rates
        pricing = self.PRICING.get(self.model, {"input": 0.15, "output": 0.60})
        cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000

        self.total_tokens_used += total_tokens
        self.total_cost_usd += cost

        metadata = {
            "model": self.model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": total_tokens,
            "cost_usd": cost,
            "latency_ms": int((time.perf_counter() - start_time) * 1000),
        }

        return result, metadata

    async def close(self) -> None:
        """Close the OpenAI client."""
        await self.client.close()
|
||||
|
||||
|
||||
class AnthropicClient(LLMClientBase):
    """
    Anthropic LLM client implementation.

    Sends the review to the messages API, extracts a JSON object from the
    text reply (which may be wrapped in markdown), and tracks token usage
    and estimated cost per request and cumulatively on the instance.
    """

    # Pricing per 1M tokens (as of 2024)
    PRICING = {
        "claude-3-opus-20240229": {"input": 15.0, "output": 75.0},
        "claude-3-sonnet-20240229": {"input": 3.0, "output": 15.0},
        "claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
        "claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0},
    }

    def __init__(self, config: Config):
        super().__init__(config)
        # Imported here rather than at module level, so the anthropic
        # package is only needed when this client is actually constructed.
        from anthropic import AsyncAnthropic

        self.client = AsyncAnthropic(api_key=config.get_llm_api_key())
        self.model = config.llm_model

    async def classify(
        self,
        review_text: str,
        profile: str = "standard",
    ) -> tuple[LLMClassificationResponse, dict[str, Any]]:
        """
        Classify using Anthropic.

        Args:
            review_text: The review text to classify
            profile: Classification profile; accepted for interface
                compatibility but not used by this implementation

        Returns:
            Tuple of (parsed JSON response, metadata dict with model,
            input/output/total tokens, cost_usd and latency_ms)

        Raises:
            ValueError: If the reply is empty or no JSON can be extracted
        """
        # perf_counter is monotonic, so the latency cannot go negative or
        # jump when the system clock is adjusted mid-request.
        start_time = time.perf_counter()

        response = await self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            system=SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": f'Classify this review and return JSON only:\n\n"{review_text}"',
                },
            ],
            temperature=self.config.llm_temperature,
            # Honour the configured request timeout, as OpenAIClient does
            timeout=self.config.llm_timeout_seconds,
        )

        # Parse response
        content = response.content[0].text if response.content else ""
        if not content:
            raise ValueError("Empty response from Anthropic")

        # The model may wrap the JSON in prose or a code fence
        result = self._extract_json(content)

        # Calculate costs
        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens
        total_tokens = input_tokens + output_tokens

        # Models missing from PRICING are costed at the Sonnet rates
        pricing = self.PRICING.get(self.model, {"input": 3.0, "output": 15.0})
        cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000

        self.total_tokens_used += total_tokens
        self.total_cost_usd += cost

        metadata = {
            "model": self.model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": total_tokens,
            "cost_usd": cost,
            "latency_ms": int((time.perf_counter() - start_time) * 1000),
        }

        return result, metadata

    def _extract_json(self, content: str) -> dict[str, Any]:
        """
        Extract JSON from response, handling markdown code blocks.

        Candidates are tried in order: the whole reply, the body of the
        first fenced code block, then the span from the first ``{`` to the
        last ``}``. A candidate that fails to parse does not abort the
        search; the next one is tried.

        Args:
            content: Raw text returned by the model

        Returns:
            The first candidate that parses as JSON

        Raises:
            ValueError: If no candidate parses
        """
        import re

        content = content.strip()

        candidates = [content]

        fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
        if fenced:
            candidates.append(fenced.group(1))

        braced = re.search(r"\{[\s\S]*\}", content)
        if braced:
            candidates.append(braced.group(0))

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        raise ValueError(f"Could not extract JSON from response: {content[:200]}")

    async def close(self) -> None:
        """Close the Anthropic client."""
        await self.client.close()
|
||||
|
||||
|
||||
class LLMClient:
    """
    Entry point for obtaining a provider-specific LLM client.

    Usage:
        client = LLMClient.create(config)
        result, metadata = await client.classify(review_text)
        await client.close()
    """

    @staticmethod
    def create(config: Config) -> LLMClientBase:
        """
        Build the client matching ``config.llm_provider``.

        Args:
            config: Pipeline configuration

        Returns:
            LLM client instance (OpenAI or Anthropic)

        Raises:
            ValueError: If the configured provider is not supported
        """
        provider = config.llm_provider

        if provider == "openai":
            return OpenAIClient(config)
        if provider == "anthropic":
            return AnthropicClient(config)

        raise ValueError(f"Unsupported LLM provider: {provider}")
|
||||
|
||||
|
||||
def create_fallback_response(review_text: str) -> LLMClassificationResponse:
    """
    Build a stand-in classification for use when the LLM call fails.

    The whole review becomes a single low-confidence, neutral span coded
    O1.01, and the summary reflects that one span.

    Args:
        review_text: Original review text

    Returns:
        Minimal valid classification response
    """
    # One span covering the full text, with the lowest/neutral dimension values
    whole_review_span = {
        "span_index": 0,
        "span_text": review_text,
        "span_start": 0,
        "span_end": len(review_text),
        "urt_primary": "O1.01",
        "urt_secondary": [],
        "valence": "V0",
        "intensity": "I1",
        "specificity": "S1",
        "actionability": "A1",
        "temporal": "TC",
        "evidence": "ES",
        "comparative": "CR-N",
        "is_primary": True,
        "confidence": "low",
        "entity": None,
        "entity_type": None,
        "relation_type": None,
        "related_span_index": None,
        "usn": "URT:S:O1.01:01:11TC.ES.N",
    }

    summary = {
        "dominant_valence": "V0",
        "dominant_domain": "O",
        "span_count": 1,
        "has_comparative": False,
        "has_entity": False,
    }

    return {"spans": [whole_review_span], "review_summary": summary}
|
||||
@@ -0,0 +1,262 @@
|
||||
"""Text processing utilities for normalization."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import NamedTuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NormalizationResult(NamedTuple):
    """Outcome of normalizing one text: the cleaned string plus basic metrics."""

    # Text after normalization
    normalized: str
    # Detected language code
    language: str
    # Number of whitespace-separated tokens in the normalized text
    word_count: int
    # Length of the normalized text in characters
    char_count: int
|
||||
|
||||
|
||||
class TextProcessor:
    """Service for text normalization and processing."""

    # Emoji code-point ranges. Each range is limited to an actual
    # symbol/emoji block; a single span from U+24C2 up to U+1F251 would
    # also swallow CJK, Hangul and most other BMP scripts.
    EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"  # circled latin capital letter M
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "]+",
        flags=re.UNICODE,
    )

    # Control characters (except newlines and tabs we want to normalize)
    CONTROL_CHAR_PATTERN = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")

    # Multiple whitespace
    MULTI_WHITESPACE_PATTERN = re.compile(r"\s+")

    # URL pattern
    URL_PATTERN = re.compile(
        r"https?://[^\s<>\"{}|\\^`\[\]]+|www\.[^\s<>\"{}|\\^`\[\]]+"
    )

    def __init__(self):
        # Flipped to True only if langdetect imports successfully
        self._langdetect_available = False
        try:
            from langdetect import DetectorFactory

            # Make detection deterministic
            DetectorFactory.seed = 0
            self._langdetect_available = True
        except ImportError:
            logger.warning("langdetect not available, defaulting to 'en' for all text")

    def normalize(self, text: str) -> NormalizationResult:
        """
        Normalize text for classification.

        Steps:
        1. Remove control characters
        2. Normalize Unicode (NFC)
        3. Lowercase
        4. Normalize whitespace (collapse runs to one space, trim)
        5. Detect language (on the original, un-normalized text)

        Emoji are left in place; no emoji-specific processing is applied.

        Args:
            text: Original review text

        Returns:
            NormalizationResult with normalized text and metadata.
            Empty input yields an empty result with language "en".
        """
        if not text:
            return NormalizationResult(
                normalized="",
                language="en",
                word_count=0,
                char_count=0,
            )

        # Step 1: Remove control characters
        normalized = self.CONTROL_CHAR_PATTERN.sub("", text)

        # Step 2: Unicode normalization (NFC - composed form)
        normalized = unicodedata.normalize("NFC", normalized)

        # Step 3: Lowercase
        normalized = normalized.lower()

        # Step 4: Normalize whitespace
        normalized = self.MULTI_WHITESPACE_PATTERN.sub(" ", normalized)
        normalized = normalized.strip()

        # Step 5: Detect language on the original text
        language = self.detect_language(text)

        # Metrics are computed on the normalized text
        word_count = len(normalized.split()) if normalized else 0
        char_count = len(normalized)

        return NormalizationResult(
            normalized=normalized,
            language=language,
            word_count=word_count,
            char_count=char_count,
        )

    def detect_language(self, text: str) -> str:
        """
        Detect the language of the text.

        Falls back to "en" when the text is empty, langdetect is not
        installed, or detection raises.

        Args:
            text: Text to analyze

        Returns:
            Two-letter language code (e.g., 'en', 'es', 'fr'); regional
            variants reported by langdetect are reduced to their primary
            subtag
        """
        if not text or not self._langdetect_available:
            return "en"

        try:
            from langdetect import detect

            # Only the first 1000 characters are sampled
            code = detect(text[:1000])
        except Exception as e:
            logger.debug(f"Language detection failed: {e}")
            return "en"

        # langdetect reports Chinese as "zh-cn" / "zh-tw"; keep only the
        # primary subtag so the result is a plain two-letter code.
        return code.split("-", 1)[0].lower()

    def generate_content_hash(self, text_normalized: str) -> str:
        """
        Generate a SHA256 hash of normalized text for deduplication.

        Args:
            text_normalized: Normalized text

        Returns:
            64-character hex string
        """
        return hashlib.sha256(text_normalized.encode("utf-8")).hexdigest()

    def has_control_characters(self, text: str) -> bool:
        """Check if text contains control characters."""
        return bool(self.CONTROL_CHAR_PATTERN.search(text))

    def extract_urls(self, text: str) -> list[str]:
        """Extract URLs from text."""
        return self.URL_PATTERN.findall(text)

    def count_emoji(self, text: str) -> int:
        """
        Count emoji code points in text.

        Every matching code point is counted individually, so adjacent
        emoji are not merged into one. Multi-code-point emoji (flags,
        ZWJ sequences) therefore count once per matching code point.
        """
        # The pattern matches runs, so sum the run lengths rather than
        # counting the runs themselves.
        return sum(len(run) for run in self.EMOJI_PATTERN.findall(text))

    def is_empty_or_trivial(self, text: str | None, min_chars: int = 3) -> bool:
        """
        Check if text is empty or trivially short.

        Args:
            text: Text to check
            min_chars: Minimum meaningful character count

        Returns:
            True if text should be skipped
        """
        if not text:
            return True
        # An all-whitespace string strips to "" and is shorter than any
        # positive min_chars; the explicit emptiness check covers min_chars <= 0.
        stripped = text.strip()
        return not stripped or len(stripped) < min_chars

    def clean_for_llm(self, text: str) -> str:
        """
        Clean text for LLM input.

        Similar to normalize but preserves case and some formatting
        for better LLM understanding.

        Args:
            text: Original text

        Returns:
            Cleaned text suitable for LLM input
        """
        if not text:
            return ""

        # Remove control characters
        cleaned = self.CONTROL_CHAR_PATTERN.sub("", text)

        # Unicode normalization
        cleaned = unicodedata.normalize("NFC", cleaned)

        # Normalize whitespace but preserve newlines for paragraphs
        cleaned = re.sub(r"[^\S\n]+", " ", cleaned)  # Collapse horizontal space
        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)  # Max 2 consecutive newlines
        cleaned = cleaned.strip()

        return cleaned
|
||||
|
||||
|
||||
# Two-letter codes accepted by is_valid_iso639 (all lowercase)
_ISO_639_1_CODES = frozenset(
    """
    aa ab ae af ak am an ar as av ay az ba be bg bh bi bm bn bo
    br bs ca ce ch co cr cs cu cv cy da de dv dz ee el en eo es
    et eu fa ff fi fj fo fr fy ga gd gl gn gu gv ha he hi ho hr
    ht hu hy hz ia id ie ig ii ik io is it iu ja jv ka kg ki kj
    kk kl km kn ko kr ks ku kv kw ky la lb lg li ln lo lt lu lv
    mg mh mi mk ml mn mr ms mt my na nb nd ne ng nl nn no nr nv
    ny oc oj om or os pa pi pl ps pt qu rm rn ro ru rw sa sc sd
    se sg si sk sl sm sn so sq sr ss st su sv sw ta te tg th ti
    tk tl tn to tr ts tt tw ty ug uk ur uz ve vi vo wa wo xh yi
    yo za zh zu
    """.split()
)


def is_valid_iso639(code: str) -> bool:
    """
    Tell whether a language code is a recognised ISO 639-1 code.

    The comparison is case-insensitive.

    Args:
        code: Language code to validate

    Returns:
        True if valid ISO 639-1 code
    """
    lowered = code.lower()
    return lowered in _ISO_639_1_CODES
|
||||
|
||||
|
||||
def is_valid_sha256(hash_str: str) -> bool:
    """
    Check if a string is a valid SHA256 hex hash.

    The string must consist of exactly 64 hexadecimal digits (either
    case) and nothing else. Parsing with ``int(hash_str, 16)`` is
    deliberately avoided: it also accepts a ``0x`` prefix, a leading
    sign, underscores and surrounding whitespace, none of which can
    appear in a hex digest.

    Args:
        hash_str: Hash string to validate

    Returns:
        True if valid 64-character hex string; False otherwise,
        including for empty or non-string input
    """
    if not isinstance(hash_str, str):
        return False
    # fullmatch anchors both ends, so length and alphabet are checked together
    return re.fullmatch(r"[0-9a-fA-F]{64}", hash_str) is not None
|
||||
@@ -0,0 +1,13 @@
|
||||
"""Pipeline stages for review processing."""
|
||||
|
||||
from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
|
||||
from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
|
||||
from reviewiq_pipeline.stages.stage3_route import Stage3Router
|
||||
from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
|
||||
|
||||
__all__ = [
|
||||
"Stage1Normalizer",
|
||||
"Stage2Classifier",
|
||||
"Stage3Router",
|
||||
"Stage4Aggregator",
|
||||
]
|
||||
@@ -0,0 +1,247 @@
|
||||
"""
|
||||
Stage 1: Normalization
|
||||
|
||||
Transform raw scraped reviews into clean, versioned records ready for LLM classification.
|
||||
|
||||
Responsibilities:
|
||||
- Read raw reviews from input
|
||||
- Text normalization (control characters, Unicode NFC, lowercase, whitespace)
|
||||
- Language detection
|
||||
- Content hash generation for deduplication
|
||||
- Write to reviews_raw + reviews_enriched stub
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from reviewiq_pipeline.contracts import (
|
||||
NormalizedReview,
|
||||
RawReview,
|
||||
Stage1Input,
|
||||
Stage1Output,
|
||||
Stage1Stats,
|
||||
)
|
||||
from reviewiq_pipeline.services.text_processor import TextProcessor
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
from reviewiq_pipeline.db.repositories import ReviewRepository
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Stage1Normalizer:
|
||||
"""
|
||||
Stage 1: Normalize raw reviews for classification.
|
||||
|
||||
This stage:
|
||||
1. Reads raw reviews from Stage 0 output
|
||||
2. Normalizes text (lowercase, whitespace, unicode)
|
||||
3. Detects language
|
||||
4. Generates content hash for deduplication
|
||||
5. Writes to reviews_raw and reviews_enriched tables
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: Config,
|
||||
db: DatabasePool | None = None,
|
||||
review_repo: ReviewRepository | None = None,
|
||||
):
|
||||
self.config = config
|
||||
self.db = db
|
||||
self.review_repo = review_repo
|
||||
self.text_processor = TextProcessor()
|
||||
|
||||
async def process(self, input_data: Stage1Input) -> Stage1Output:
|
||||
"""
|
||||
Process raw reviews through normalization stage.
|
||||
|
||||
Args:
|
||||
input_data: Stage 1 input containing raw reviews
|
||||
|
||||
Returns:
|
||||
Stage1Output with normalized reviews and stats
|
||||
"""
|
||||
logger.info(
|
||||
f"Stage 1: Processing {len(input_data['reviews'])} reviews "
|
||||
f"for job {input_data['job_id']}"
|
||||
)
|
||||
|
||||
normalized_reviews: list[NormalizedReview] = []
|
||||
stats = Stage1Stats(
|
||||
input_count=len(input_data["reviews"]),
|
||||
output_count=0,
|
||||
skipped_empty=0,
|
||||
skipped_duplicate=0,
|
||||
)
|
||||
|
||||
seen_hashes: set[str] = set()
|
||||
|
||||
for raw_review in input_data["reviews"]:
|
||||
try:
|
||||
result = self._normalize_review(
|
||||
raw_review,
|
||||
input_data["business_id"],
|
||||
input_data["place_id"],
|
||||
)
|
||||
|
||||
if result is None:
|
||||
stats["skipped_empty"] += 1
|
||||
continue
|
||||
|
||||
# Check for duplicates within this batch
|
||||
if result["content_hash"] in seen_hashes:
|
||||
stats["skipped_duplicate"] += 1
|
||||
continue
|
||||
|
||||
seen_hashes.add(result["content_hash"])
|
||||
|
||||
# If we have a database, persist and check cross-batch duplicates
|
||||
if self.review_repo:
|
||||
raw_id = await self._persist_review(raw_review, result, input_data)
|
||||
result["raw_id"] = raw_id
|
||||
|
||||
normalized_reviews.append(result)
|
||||
stats["output_count"] += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error normalizing review {raw_review.get('review_id')}: {e}")
|
||||
raise
|
||||
|
||||
logger.info(
|
||||
f"Stage 1 complete: {stats['output_count']} normalized, "
|
||||
f"{stats['skipped_empty']} empty, {stats['skipped_duplicate']} duplicate"
|
||||
)
|
||||
|
||||
return Stage1Output(
|
||||
job_id=input_data["job_id"],
|
||||
business_id=input_data["business_id"],
|
||||
place_id=input_data["place_id"],
|
||||
reviews_normalized=normalized_reviews,
|
||||
stats=stats,
|
||||
)
|
||||
|
||||
def _normalize_review(
|
||||
self,
|
||||
raw: RawReview,
|
||||
business_id: str,
|
||||
place_id: str,
|
||||
) -> NormalizedReview | None:
|
||||
"""
|
||||
Normalize a single raw review.
|
||||
|
||||
Args:
|
||||
raw: Raw review from scraper
|
||||
business_id: Business identifier
|
||||
place_id: Google Place ID
|
||||
|
||||
Returns:
|
||||
NormalizedReview or None if should be skipped
|
||||
"""
|
||||
text = raw.get("text")
|
||||
|
||||
# Skip empty reviews (rating-only)
|
||||
if self.text_processor.is_empty_or_trivial(text):
|
||||
logger.debug(f"Skipping empty review {raw['review_id']}")
|
||||
return None
|
||||
|
||||
# Normalize text
|
||||
norm_result = self.text_processor.normalize(text) # type: ignore
|
||||
|
||||
# Skip if normalized to empty
|
||||
if not norm_result.normalized:
|
||||
return None
|
||||
|
||||
# Generate content hash
|
||||
content_hash = self.text_processor.generate_content_hash(norm_result.normalized)
|
||||
|
||||
return NormalizedReview(
|
||||
source="google",
|
||||
review_id=raw["review_id"],
|
||||
review_version=1,
|
||||
business_id=business_id,
|
||||
place_id=place_id,
|
||||
text=text, # type: ignore
|
||||
text_normalized=norm_result.normalized,
|
||||
text_language=norm_result.language,
|
||||
text_length=norm_result.char_count,
|
||||
word_count=norm_result.word_count,
|
||||
rating=raw["rating"],
|
||||
review_time=raw["review_time"],
|
||||
author_name=raw["author_name"],
|
||||
author_id=raw.get("author_id"),
|
||||
content_hash=content_hash,
|
||||
dedup_group_id=None,
|
||||
)
|
||||
|
||||
async def _persist_review(
|
||||
self,
|
||||
raw: RawReview,
|
||||
normalized: NormalizedReview,
|
||||
input_data: Stage1Input,
|
||||
) -> int:
|
||||
"""
|
||||
Persist a normalized review to the database.
|
||||
|
||||
Args:
|
||||
raw: Original raw review
|
||||
normalized: Normalized review data
|
||||
input_data: Stage 1 input for context
|
||||
|
||||
Returns:
|
||||
The raw_id from reviews_raw table
|
||||
"""
|
||||
if not self.review_repo:
|
||||
raise RuntimeError("ReviewRepository not configured")
|
||||
|
||||
# Insert raw review
|
||||
raw_id = await self.review_repo.insert_raw_review(
|
||||
raw,
|
||||
input_data["place_id"],
|
||||
source="google",
|
||||
)
|
||||
|
||||
# Insert enriched review stub
|
||||
await self.review_repo.insert_enriched_review(
|
||||
normalized,
|
||||
raw_id,
|
||||
)
|
||||
|
||||
return raw_id
|
||||
|
||||
def normalize_batch(
    self,
    reviews: list[RawReview],
    business_id: str,
    place_id: str,
) -> list[NormalizedReview]:
    """
    Normalize reviews purely in memory (no database access).

    Reviews that normalization rejects are dropped, and so is every review
    whose content hash was already produced by an earlier review in the
    same batch. Input order is preserved.

    Args:
        reviews: List of raw reviews
        business_id: Business identifier
        place_id: Google Place ID

    Returns:
        Normalized reviews, first occurrence per content hash
    """
    # dict keeps insertion order, and setdefault keeps the first entry per hash.
    first_by_hash: dict[str, NormalizedReview] = {}
    for raw in reviews:
        candidate = self._normalize_review(raw, business_id, place_id)
        if candidate is not None:
            first_by_hash.setdefault(candidate["content_hash"], candidate)
    return list(first_by_hash.values())
|
||||
@@ -0,0 +1,539 @@
|
||||
"""
|
||||
Stage 2: LLM Classification
|
||||
|
||||
Classify normalized reviews into URT codes with span-level extraction.
|
||||
|
||||
Responsibilities:
|
||||
- Call LLM for span extraction and classification
|
||||
- Generate embeddings
|
||||
- Calculate trust scores
|
||||
- Select primary span
|
||||
- Write to reviews_enriched and review_spans tables
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from reviewiq_pipeline.contracts import (
|
||||
ClassifiedReview,
|
||||
ExtractedSpan,
|
||||
ReviewToClassify,
|
||||
Stage2Input,
|
||||
Stage2Output,
|
||||
Stage2Stats,
|
||||
)
|
||||
from reviewiq_pipeline.services.llm_client import LLMClient, create_fallback_response
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.contracts import LLMClassificationResponse, LLMSpanResponse
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
from reviewiq_pipeline.db.repositories import ReviewRepository, SpanRepository
|
||||
from reviewiq_pipeline.services.embeddings import EmbeddingService
|
||||
from reviewiq_pipeline.services.llm_client import LLMClientBase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# URT code validation pattern: one letter from OPJEAVR, a digit 1-4, a dot
# and exactly two digits (e.g. "O1.01"). The end is anchored with \Z, not $,
# because $ also matches just before a trailing newline and would let a
# value such as "O1.01\n" through validation.
URT_CODE_PATTERN = re.compile(r"^[OPJEAVR][1-4]\.[0-9]{2}\Z")

# Valence priority for primary span selection (lower = higher priority)
VALENCE_PRIORITY = {"V-": 0, "V±": 1, "V0": 2, "V+": 3}

# Intensity priority (lower = higher priority for I3)
INTENSITY_PRIORITY = {"I3": 0, "I2": 1, "I1": 2}
|
||||
|
||||
|
||||
class Stage2Classifier:
|
||||
"""
|
||||
Stage 2: Classify reviews using LLM and extract spans.
|
||||
|
||||
This stage:
|
||||
1. Calls LLM to extract and classify spans
|
||||
2. Generates embeddings for each review
|
||||
3. Calculates trust scores
|
||||
4. Selects primary span
|
||||
5. Writes classification results to database
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: Config,
|
||||
db: DatabasePool | None = None,
|
||||
review_repo: ReviewRepository | None = None,
|
||||
span_repo: SpanRepository | None = None,
|
||||
embedding_service: EmbeddingService | None = None,
|
||||
):
|
||||
self.config = config
|
||||
self.db = db
|
||||
self.review_repo = review_repo
|
||||
self.span_repo = span_repo
|
||||
self.embedding_service = embedding_service
|
||||
self._llm_client: LLMClientBase | None = None
|
||||
|
||||
async def _get_llm_client(self) -> LLMClientBase:
|
||||
"""Get or create LLM client."""
|
||||
if self._llm_client is None:
|
||||
self._llm_client = LLMClient.create(self.config)
|
||||
return self._llm_client
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close resources."""
|
||||
if self._llm_client:
|
||||
await self._llm_client.close()
|
||||
self._llm_client = None
|
||||
|
||||
async def process(self, input_data: Stage2Input) -> Stage2Output:
    """
    Process reviews through classification stage.

    Reviews are handled one at a time. A review that raises during
    classification or persistence is logged, counted as an error and left
    out of ``reviews_classified``; the remaining reviews are still
    processed. Each review therefore counts as at most one of success or
    error.

    Args:
        input_data: Stage 2 input with reviews and config

    Returns:
        Stage2Output with classified reviews and stats
    """
    reviews = input_data["reviews"]
    batch_id = str(uuid.uuid4())[:8]
    logger.info(
        f"Stage 2: Classifying {len(reviews)} reviews "
        f"(batch {batch_id})"
    )

    classified_reviews: list[ClassifiedReview] = []
    total_tokens = 0
    total_cost = 0.0
    total_spans = 0
    error_count = 0

    llm_client = await self._get_llm_client()
    # Persistence needs both repositories; they do not change during the run.
    persist = bool(self.review_repo and self.span_repo)

    for review in reviews:
        try:
            classified, metadata = await self._classify_review(
                review,
                input_data["config"]["profile"],
                llm_client,
                batch_id,
            )
            if not classified:
                continue

            # The LLM call has already happened, so its usage is counted
            # even if storing the result fails below.
            total_tokens += metadata.get("total_tokens", 0)
            total_cost += metadata.get("cost_usd", 0.0)

            if persist:
                await self._persist_classification(
                    classified,
                    review,
                    batch_id,
                    input_data["config"],
                )

            # Record the success only after persistence: previously a
            # persistence failure left the review in the success list AND
            # bumped error_count, so the two could sum past input_count.
            classified_reviews.append(classified)
            total_spans += len(classified.get("spans", []))

        except Exception as e:
            # Deliberate best effort: one bad review must not abort the batch.
            logger.error(
                f"Error classifying review {review['review_id']}: {e}",
                exc_info=True,
            )
            error_count += 1

    avg_spans = total_spans / len(classified_reviews) if classified_reviews else 0

    logger.info(
        f"Stage 2 complete: {len(classified_reviews)} classified, "
        f"{error_count} errors, {total_spans} spans total"
    )

    return Stage2Output(
        batch_id=batch_id,
        taxonomy_version=input_data["config"]["taxonomy_version"],
        model_version=self.config.llm_model,
        prompt_version="v1.0",
        reviews_classified=classified_reviews,
        stats=Stage2Stats(
            input_count=len(reviews),
            success_count=len(classified_reviews),
            error_count=error_count,
            total_spans=total_spans,
            avg_spans_per_review=avg_spans,
            llm_tokens_used=total_tokens,
            llm_cost_usd=total_cost,
        ),
    )
|
||||
|
||||
async def _classify_review(
    self,
    review: ReviewToClassify,
    profile: str,
    llm_client: LLMClientBase,
    batch_id: str,
) -> tuple[ClassifiedReview | None, dict[str, Any]]:
    """
    Run one review through the LLM and assemble its classification.

    If the LLM call raises, a fallback response is used instead and the
    returned metadata carries ``fallback=True``. Review-level fields are
    copied from the primary span; when there are no spans at all, fixed
    defaults are used.

    Args:
        review: Review to classify
        profile: Classification profile
        llm_client: LLM client instance
        batch_id: Batch identifier

    Returns:
        Tuple of (classified review, metadata)
    """
    metadata: dict[str, Any] = {}

    try:
        llm_response, llm_metadata = await llm_client.classify(review["text"], profile)
        metadata.update(llm_metadata)
    except Exception as e:
        # Best effort: a failed LLM call degrades to the fallback response.
        logger.warning(
            f"LLM classification failed for {review['review_id']}, "
            f"using fallback: {e}"
        )
        llm_response = create_fallback_response(review["text"])
        metadata["fallback"] = True

    # Repair the response, convert it, then make sure one span is primary.
    llm_response = self._validate_and_fix_response(llm_response, review["text"])
    spans = self._ensure_primary_span(
        self._convert_spans(llm_response["spans"], review, profile, batch_id)
    )

    # The flagged span drives the review-level fields; otherwise the first span.
    primary_span = None
    for candidate in spans:
        if candidate.get("is_primary"):
            primary_span = candidate
            break
    if primary_span is None and spans:
        primary_span = spans[0]

    embedding: list[float] = []
    if self.embedding_service:
        embedding = self.embedding_service.embed(review["text_normalized"])

    trust_score = self._calculate_trust_score(review, spans)
    staff_mentions = self._extract_staff_mentions(spans)
    quotes = self._extract_quotes(spans)

    if primary_span:
        review_level = {
            "urt_primary": primary_span["urt_primary"],
            "urt_secondary": primary_span.get("urt_secondary", []),
            "valence": primary_span["valence"],
            "intensity": primary_span["intensity"],
            "comparative": primary_span.get("comparative", "CR-N"),
        }
    else:
        review_level = {
            "urt_primary": "O1.01",
            "urt_secondary": [],
            "valence": "V0",
            "intensity": "I1",
            "comparative": "CR-N",
        }

    classified = ClassifiedReview(
        source=review["source"],
        review_id=review["review_id"],
        review_version=review["review_version"],
        **review_level,
        staff_mentions=staff_mentions,
        quotes=quotes,
        trust_score=trust_score,
        embedding=embedding,
        spans=spans,
        classification_confidence={
            "overall": 0.3 if metadata.get("fallback") else 0.8
        },
        processing_time_ms=metadata.get("latency_ms", 0),
    )
    return classified, metadata
|
||||
|
||||
def _validate_and_fix_response(
    self,
    response: LLMClassificationResponse,
    original_text: str,
) -> LLMClassificationResponse:
    """
    Validate LLM response and fix common issues.

    The spans are repaired in place:
    - ``span_index`` is renumbered from 0.
    - Offsets are coerced to integers and clamped to the text.
    - An empty or inverted range is rebuilt from the span text, or widened
      to the whole review when that is not possible.
    - A missing, non-string or malformed ``urt_primary`` becomes ``O1.01``.
    - Unknown ``valence`` / ``intensity`` values become ``V0`` / ``I1``.

    Args:
        response: Raw LLM response
        original_text: Original review text for offset validation

    Returns:
        Validated and fixed response (a fallback response if it had no spans)
    """
    spans = response.get("spans", [])
    if not spans:
        # Create fallback if no spans
        return create_fallback_response(original_text)

    text_len = len(original_text)

    def as_offset(value: Any, default: int) -> int:
        # LLM output may carry null, strings or floats where an int is expected.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    fixed_spans = []
    for i, span in enumerate(spans):
        # Ensure required fields
        span["span_index"] = i

        # Clamp both offsets into [0, len(text)].
        start = min(max(as_offset(span.get("span_start"), 0), 0), text_len)
        end = min(as_offset(span.get("span_end"), text_len), text_len)

        if end <= start:
            # Rebuild the end from the span text. The previous expression
            # `start + len(text) or len(original)` only fell back when the
            # sum was 0, and never re-clamped the result.
            end = min(start + len(span.get("span_text") or ""), text_len)
        if end <= start:
            # Nothing usable (no span text, or start at the end of the
            # text): let the span cover the whole review.
            start, end = 0, text_len

        span["span_start"] = start
        span["span_end"] = end

        # Validate URT code. A missing key must also be written back,
        # otherwise later code that indexes span["urt_primary"] fails.
        urt_primary = span.get("urt_primary")
        if not isinstance(urt_primary, str) or not URT_CODE_PATTERN.match(urt_primary):
            logger.warning(f"Invalid URT code '{urt_primary}', defaulting to O1.01")
            span["urt_primary"] = "O1.01"

        # Ensure valid enums
        if span.get("valence") not in ("V+", "V-", "V0", "V±"):
            span["valence"] = "V0"
        if span.get("intensity") not in ("I1", "I2", "I3"):
            span["intensity"] = "I1"

        fixed_spans.append(span)

    response["spans"] = fixed_spans
    return response
|
||||
|
||||
def _convert_spans(
    self,
    llm_spans: list[LLMSpanResponse],
    review: ReviewToClassify,
    profile: str,
    batch_id: str,
) -> list[ExtractedSpan]:
    """
    Convert LLM spans to our ExtractedSpan format.

    Each span gets a deterministic id: ``SPN-`` plus the first 16 hex digits
    of the SHA-256 of ``review_id:span_index:first 50 chars of span text``.
    Fields the LLM sent as an explicit null are treated like missing fields
    wherever a default exists.

    Args:
        llm_spans: Spans from LLM response
        review: Source review
        profile: Classification profile
        batch_id: Batch identifier (not used by the conversion itself)

    Returns:
        List of ExtractedSpan objects
    """
    spans = []

    for llm_span in llm_spans:
        # `.get(key, default)` does not apply the default to an explicit
        # null, so nullable text fields are normalised with `or`.
        span_text = llm_span.get("span_text") or ""
        entity = llm_span.get("entity")

        # Generate deterministic span ID
        span_key = f"{review['review_id']}:{llm_span['span_index']}:{span_text[:50]}"
        span_hash = hashlib.sha256(span_key.encode()).hexdigest()[:16]

        spans.append(
            ExtractedSpan(
                span_id=f"SPN-{span_hash}",
                span_index=llm_span["span_index"],
                span_text=span_text,
                span_start=llm_span.get("span_start", 0),
                span_end=llm_span.get("span_end", 0),
                profile=profile,  # type: ignore
                urt_primary=llm_span["urt_primary"],
                urt_secondary=llm_span.get("urt_secondary") or [],
                valence=llm_span["valence"],
                intensity=llm_span["intensity"],
                comparative=llm_span.get("comparative") or "CR-N",
                specificity=llm_span.get("specificity"),
                actionability=llm_span.get("actionability"),
                temporal=llm_span.get("temporal"),
                evidence=llm_span.get("evidence"),
                entity=entity,
                entity_type=llm_span.get("entity_type"),
                entity_normalized=entity.lower() if entity else None,
                relation_type=llm_span.get("relation_type"),
                related_span_index=llm_span.get("related_span_index"),
                confidence=llm_span.get("confidence") or "medium",
                # Build the USN only when the LLM did not supply one; as a
                # `.get` default it was computed for every span regardless.
                usn=llm_span.get("usn") or self._generate_usn(llm_span),
                is_primary=bool(llm_span.get("is_primary", False)),
            )
        )

    return spans
|
||||
|
||||
def _ensure_primary_span(self, spans: list[ExtractedSpan]) -> list[ExtractedSpan]:
|
||||
"""
|
||||
Ensure exactly one span is marked as primary.
|
||||
|
||||
Uses selection rules:
|
||||
1. Highest intensity (I3 > I2 > I1)
|
||||
2. Tie-break: negative over positive (V- > V± > V0 > V+)
|
||||
3. Tie-break: earliest span_index
|
||||
|
||||
Args:
|
||||
spans: List of spans
|
||||
|
||||
Returns:
|
||||
List of spans with exactly one primary
|
||||
"""
|
||||
if not spans:
|
||||
return spans
|
||||
|
||||
# Count current primaries
|
||||
primary_count = sum(1 for s in spans if s.get("is_primary"))
|
||||
|
||||
if primary_count == 1:
|
||||
return spans
|
||||
|
||||
# Clear all primaries and re-select
|
||||
for span in spans:
|
||||
span["is_primary"] = False
|
||||
|
||||
# Sort by selection criteria
|
||||
def sort_key(s: ExtractedSpan) -> tuple[int, int, int]:
|
||||
return (
|
||||
INTENSITY_PRIORITY.get(s["intensity"], 2),
|
||||
VALENCE_PRIORITY.get(s["valence"], 3),
|
||||
s["span_index"],
|
||||
)
|
||||
|
||||
sorted_spans = sorted(spans, key=sort_key)
|
||||
sorted_spans[0]["is_primary"] = True
|
||||
|
||||
return spans
|
||||
|
||||
def _calculate_trust_score(
|
||||
self,
|
||||
review: ReviewToClassify,
|
||||
spans: list[ExtractedSpan],
|
||||
) -> float:
|
||||
"""
|
||||
Calculate trust score for a review.
|
||||
|
||||
Factors:
|
||||
- Text length (longer = more trust)
|
||||
- Specificity of spans
|
||||
- Confidence levels
|
||||
|
||||
Args:
|
||||
review: Source review
|
||||
spans: Classified spans
|
||||
|
||||
Returns:
|
||||
Trust score between 0.2 and 1.0
|
||||
"""
|
||||
score = 0.5 # Base score
|
||||
|
||||
# Length factor (up to +0.2)
|
||||
text_len = len(review["text"])
|
||||
if text_len > 200:
|
||||
score += 0.2
|
||||
elif text_len > 100:
|
||||
score += 0.1
|
||||
elif text_len > 50:
|
||||
score += 0.05
|
||||
|
||||
# Specificity factor (up to +0.2)
|
||||
if spans:
|
||||
high_spec_count = sum(1 for s in spans if s.get("specificity") == "S3")
|
||||
if high_spec_count > 0:
|
||||
score += 0.1 + (0.1 * min(high_spec_count / len(spans), 1.0))
|
||||
|
||||
# Confidence factor (up to +0.1)
|
||||
if spans:
|
||||
high_conf_count = sum(1 for s in spans if s.get("confidence") == "high")
|
||||
score += 0.1 * (high_conf_count / len(spans))
|
||||
|
||||
# Ensure floor of 0.2 and ceiling of 1.0
|
||||
return max(self.config.trust_score_floor, min(1.0, score))
|
||||
|
||||
def _extract_staff_mentions(self, spans: list[ExtractedSpan]) -> list[str]:
|
||||
"""Extract staff names from spans."""
|
||||
staff = []
|
||||
for span in spans:
|
||||
if span.get("entity_type") == "staff" and span.get("entity"):
|
||||
staff.append(span["entity"])
|
||||
return list(set(staff))
|
||||
|
||||
def _extract_quotes(self, spans: list[ExtractedSpan]) -> dict[str, str]:
|
||||
"""Extract representative quotes by URT code."""
|
||||
quotes = {}
|
||||
for span in spans:
|
||||
code = span["urt_primary"]
|
||||
if code not in quotes:
|
||||
quotes[code] = span["span_text"][:100]
|
||||
return quotes
|
||||
|
||||
def _generate_usn(self, span: LLMSpanResponse) -> str:
|
||||
"""
|
||||
Generate USN (URT String Notation) for a span.
|
||||
|
||||
Format: URT:S:{primary}[+{sec}]:{valence_sign}{intensity_num}:{S#}{A#}{temporal}.{evidence}.{CR_suffix}
|
||||
"""
|
||||
primary = span.get("urt_primary", "O1.01")
|
||||
secondary = span.get("urt_secondary", [])
|
||||
valence = span.get("valence", "V0")
|
||||
intensity = span.get("intensity", "I1")
|
||||
specificity = span.get("specificity", "S1")
|
||||
actionability = span.get("actionability", "A1")
|
||||
temporal = span.get("temporal", "TC")
|
||||
evidence = span.get("evidence", "ES")
|
||||
comparative = span.get("comparative", "CR-N")
|
||||
|
||||
# Build code portion
|
||||
code_part = primary
|
||||
for sec in secondary[:2]:
|
||||
code_part += f"+{sec}"
|
||||
|
||||
# Valence encoding
|
||||
valence_map = {"V+": "+", "V-": "-", "V0": "0", "V±": "±"}
|
||||
valence_sign = valence_map.get(valence, "0")
|
||||
|
||||
# Intensity number
|
||||
intensity_num = intensity[1] if intensity.startswith("I") else "1"
|
||||
|
||||
# Dimensions
|
||||
spec_num = specificity[1] if specificity and specificity.startswith("S") else "1"
|
||||
act_num = actionability[1] if actionability and actionability.startswith("A") else "1"
|
||||
|
||||
# CR suffix
|
||||
cr_map = {"CR-N": "N", "CR-B": "B", "CR-W": "W", "CR-S": "S"}
|
||||
cr_suffix = cr_map.get(comparative, "N")
|
||||
|
||||
return f"URT:S:{code_part}:{valence_sign}{intensity_num}:{spec_num}{act_num}{temporal}.{evidence}.{cr_suffix}"
|
||||
|
||||
async def _persist_classification(
|
||||
self,
|
||||
classified: ClassifiedReview,
|
||||
review: ReviewToClassify,
|
||||
batch_id: str,
|
||||
config: dict[str, Any],
|
||||
) -> None:
|
||||
"""Persist classification results to database."""
|
||||
if not self.review_repo or not self.span_repo:
|
||||
return
|
||||
|
||||
# Update reviews_enriched
|
||||
await self.review_repo.update_enriched_with_classification(
|
||||
classified,
|
||||
self.config.llm_model,
|
||||
config["taxonomy_version"],
|
||||
)
|
||||
|
||||
# Insert spans
|
||||
for span in classified.get("spans", []):
|
||||
await self.span_repo.insert_span(
|
||||
span,
|
||||
review["business_id"],
|
||||
review["place_id"],
|
||||
review["source"],
|
||||
review["review_id"],
|
||||
review["review_version"],
|
||||
review["review_time"],
|
||||
batch_id,
|
||||
self.config.llm_model,
|
||||
config["taxonomy_version"],
|
||||
)
|
||||
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Stage 3: Issue Routing
|
||||
|
||||
Route classified spans to issues (create new or aggregate to existing).
|
||||
|
||||
Responsibilities:
|
||||
- Query unrouted V-/V± spans
|
||||
- Generate deterministic issue IDs
|
||||
- Create/update issues with span counts
|
||||
- Insert issue_spans links
|
||||
- Log events for audit trail
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from reviewiq_pipeline.contracts import (
|
||||
RoutedSpan,
|
||||
SpanToRoute,
|
||||
Stage3Input,
|
||||
Stage3Output,
|
||||
Stage3Stats,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
from reviewiq_pipeline.db.repositories import IssueRepository, SpanRepository
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Stage3Router:
|
||||
"""
|
||||
Stage 3: Route negative/mixed spans to issues.
|
||||
|
||||
This stage:
|
||||
1. Queries unrouted spans with V- or V± valence
|
||||
2. Generates deterministic issue IDs from routing keys
|
||||
3. Creates new issues or updates existing ones
|
||||
4. Links spans to issues (1:1 mapping)
|
||||
5. Logs events for audit trail
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: Config,
|
||||
db: DatabasePool | None = None,
|
||||
span_repo: SpanRepository | None = None,
|
||||
issue_repo: IssueRepository | None = None,
|
||||
):
|
||||
self.config = config
|
||||
self.db = db
|
||||
self.span_repo = span_repo
|
||||
self.issue_repo = issue_repo
|
||||
|
||||
async def process(self, input_data: Stage3Input) -> Stage3Output:
    """
    Process spans through routing stage.

    Only spans with valence ``V-`` or ``V±`` are routed. All others, and
    spans that ``_route_span`` declines (returns ``None``), are counted as
    skipped. Each issue id is listed at most once in ``issues_created`` and
    at most once in ``issues_updated``. An issue reported as new a second
    time (which happens when no issue repository is configured) is treated
    as an update.

    Args:
        input_data: Stage 3 input with spans to route

    Returns:
        Stage3Output with routing results and stats

    Raises:
        Exception: Any error raised while routing a span is logged and
            re-raised, aborting the batch
    """
    spans = input_data["spans"]
    logger.info(f"Stage 3: Routing {len(spans)} spans")

    routed_spans: list[RoutedSpan] = []
    issues_created: list[str] = []
    issues_updated: list[str] = []
    # Sets mirror the two lists so membership checks stay O(1).
    created_seen: set[str] = set()
    updated_seen: set[str] = set()
    spans_skipped = 0

    for span in spans:
        try:
            # Skip everything that is not negative or mixed
            if span["valence"] not in ("V-", "V±"):
                spans_skipped += 1
                continue

            routed = await self._route_span(span)
            if not routed:
                # Declined by _route_span; count it so that
                # routed + skipped adds up to processed.
                spans_skipped += 1
                continue

            routed_spans.append(routed)
            issue_id = routed["issue_id"]

            if routed["is_new_issue"] and issue_id not in created_seen:
                created_seen.add(issue_id)
                issues_created.append(issue_id)
            elif issue_id not in updated_seen:
                updated_seen.add(issue_id)
                issues_updated.append(issue_id)

        except Exception as e:
            logger.error(f"Error routing span {span['span_id']}: {e}")
            raise

    logger.info(
        f"Stage 3 complete: {len(routed_spans)} routed, "
        f"{len(issues_created)} issues created, "
        f"{len(issues_updated)} issues updated"
    )

    return Stage3Output(
        routed_spans=routed_spans,
        issues_created=issues_created,
        issues_updated=issues_updated,
        stats=Stage3Stats(
            spans_processed=len(spans),
            spans_routed=len(routed_spans),
            spans_skipped=spans_skipped,
            issues_created=len(issues_created),
            issues_updated=len(issues_updated),
        ),
    )
|
||||
|
||||
async def _route_span(self, span: SpanToRoute) -> RoutedSpan | None:
    """
    Attach one span to the issue its routing key maps to.

    Without an issue repository nothing is stored and the span is reported
    as opening a new issue. With one, the issue is upserted, the span is
    linked to it and an audit event is logged.

    Args:
        span: Span to route

    Returns:
        RoutedSpan with routing info, or None if the span is already linked
    """
    routing_key = self._generate_routing_key(span)
    issue_id = self._generate_issue_id(routing_key)
    span_id = span["span_id"]

    repo = self.issue_repo
    if not repo:
        return RoutedSpan(
            span_id=span_id,
            issue_id=issue_id,
            routing_key=routing_key,
            is_new_issue=True,
        )

    # Check if span already routed (should not happen, but defensive)
    linked_to = await repo.check_span_already_linked(span_id)
    if linked_to:
        logger.warning(f"Span {span_id} already linked to {linked_to}")
        return None

    # Create or update issue
    is_new_issue = await repo.upsert_issue(
        issue_id=issue_id,
        business_id=span["business_id"],
        place_id=span["place_id"],
        primary_subcode=span["urt_primary"],
        intensity=span["intensity"],
        entity=span.get("entity_normalized"),
        entity_normalized=span.get("entity_normalized"),
        taxonomy_version=self.config.taxonomy_version,
    )

    routed = RoutedSpan(
        span_id=span_id,
        issue_id=issue_id,
        routing_key=routing_key,
        is_new_issue=is_new_issue,
    )

    # Link span to issue
    await repo.link_span_to_issue(
        routed=routed,
        source="google",  # Assuming Google source
        review_id="",  # Would need to be passed from span metadata
        review_version=1,
        intensity=span["intensity"],
        review_time=span["review_time"],
        is_primary_match=True,
    )

    # Log event
    await repo.log_event(
        issue_id=issue_id,
        event_type="issue_created" if is_new_issue else "span_added",
        span_id=span_id,
        metadata={
            "urt_primary": span["urt_primary"],
            "valence": span["valence"],
            "intensity": span["intensity"],
        },
    )

    return routed
|
||||
|
||||
def _generate_routing_key(self, span: SpanToRoute) -> str:
|
||||
"""
|
||||
Generate routing key for a span.
|
||||
|
||||
Format: business_id|place_id|urt_primary|entity_normalized
|
||||
|
||||
Args:
|
||||
span: Span to generate key for
|
||||
|
||||
Returns:
|
||||
Routing key string
|
||||
"""
|
||||
entity = span.get("entity_normalized") or ""
|
||||
return f"{span['business_id']}|{span['place_id']}|{span['urt_primary']}|{entity}"
|
||||
|
||||
def _generate_issue_id(self, routing_key: str) -> str:
|
||||
"""
|
||||
Generate deterministic issue ID from routing key.
|
||||
|
||||
Args:
|
||||
routing_key: Routing key string
|
||||
|
||||
Returns:
|
||||
Issue ID in format ISS-{hash16}
|
||||
"""
|
||||
hash_value = hashlib.sha256(routing_key.encode()).hexdigest()
|
||||
return f"ISS-{hash_value[:16]}"
|
||||
|
||||
async def process_from_db(self, limit: int = 100) -> Stage3Output:
|
||||
"""
|
||||
Process unrouted spans directly from database.
|
||||
|
||||
Convenience method that queries unrouted spans and processes them.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of spans to process
|
||||
|
||||
Returns:
|
||||
Stage3Output with routing results
|
||||
"""
|
||||
if not self.span_repo:
|
||||
raise RuntimeError("SpanRepository not configured")
|
||||
|
||||
# Query unrouted negative spans
|
||||
span_rows = await self.span_repo.get_unrouted_negative_spans(limit)
|
||||
|
||||
# Convert to SpanToRoute format
|
||||
spans = [
|
||||
SpanToRoute(
|
||||
span_id=row["span_id"],
|
||||
business_id=row["business_id"],
|
||||
place_id=row["place_id"],
|
||||
urt_primary=row["urt_primary"],
|
||||
valence=row["valence"],
|
||||
intensity=row["intensity"],
|
||||
entity_normalized=row.get("entity_normalized"),
|
||||
review_time=str(row["review_time"]),
|
||||
confidence=row["confidence"],
|
||||
trust_score=row.get("trust_score", 0.5),
|
||||
)
|
||||
for row in span_rows
|
||||
]
|
||||
|
||||
return await self.process(Stage3Input(spans=spans))
|
||||
|
||||
def route_span_sync(self, span: SpanToRoute) -> RoutedSpan:
    """
    Compute the routing of a span without touching the database (for testing).

    Args:
        span: Span to route

    Returns:
        RoutedSpan with routing info; ``is_new_issue`` is always True

    Raises:
        ValueError: If the span's valence is neither ``V-`` nor ``V±``
    """
    valence = span["valence"]
    if valence != "V-" and valence != "V±":
        raise ValueError(f"Cannot route positive span (valence={valence})")

    key = self._generate_routing_key(span)
    return RoutedSpan(
        span_id=span["span_id"],
        issue_id=self._generate_issue_id(key),
        routing_key=key,
        is_new_issue=True,  # Can't know without DB
    )
|
||||
@@ -0,0 +1,485 @@
|
||||
"""
|
||||
Stage 4: Fact Aggregation
|
||||
|
||||
Pre-aggregate span/review data into fact_timeseries for fast dashboard queries.
|
||||
|
||||
Responsibilities:
|
||||
- Aggregate spans by URT code per time bucket
|
||||
- Calculate valence/intensity distributions
|
||||
- Compute strength scores (trust-weighted)
|
||||
- UPSERT into fact_timeseries table
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from datetime import date, datetime, timedelta
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from reviewiq_pipeline.contracts import (
|
||||
FactRecord,
|
||||
Stage4Input,
|
||||
Stage4Output,
|
||||
Stage4Stats,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.config import Config
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
from reviewiq_pipeline.db.repositories import FactRepository
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Stage4Aggregator:
|
||||
"""
|
||||
Stage 4: Aggregate span data into time series facts.
|
||||
|
||||
This stage:
|
||||
1. Queries span data for a business/date range
|
||||
2. Aggregates by URT code and time bucket
|
||||
3. Calculates valence/intensity distributions
|
||||
4. Computes trust-weighted strength scores
|
||||
5. UPSERTs results into fact_timeseries table
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: Config,
|
||||
db: DatabasePool | None = None,
|
||||
fact_repo: FactRepository | None = None,
|
||||
):
|
||||
self.config = config
|
||||
self.db = db
|
||||
self.fact_repo = fact_repo
|
||||
|
||||
async def process(self, input_data: Stage4Input) -> Stage4Output:
    """
    Process aggregation for a business and date.

    For every requested bucket type, the spans of that bucket are aggregated
    per location, plus once for the synthetic ``"ALL"`` location, into
    per-code, per-domain and overall facts. Facts are upserted only when a
    fact repository is configured; without one there is no span data and
    the result is empty.

    ``locations_processed`` is the number of distinct locations that had
    data in at least one bucket. It used to grow once per bucket type, so a
    single location counted three times for day/week/month.

    Args:
        input_data: Stage 4 input with aggregation parameters

    Returns:
        Stage4Output with aggregated facts and stats

    Raises:
        ValueError: If the date is not ``YYYY-MM-DD`` or a bucket type is unknown
    """
    business_id = input_data["business_id"]
    logger.info(
        f"Stage 4: Aggregating for business {business_id} "
        f"on {input_data['date']}"
    )

    facts_written: list[FactRecord] = []
    codes_aggregated: set[str] = set()
    locations_seen: set[str] = set()

    target_date = datetime.strptime(input_data["date"], "%Y-%m-%d").date()

    async def store(fact: FactRecord) -> None:
        # Record the fact and, when a repository exists, upsert it.
        facts_written.append(fact)
        if self.fact_repo:
            await self.fact_repo.upsert_fact(fact)

    # The place list does not depend on the bucket, so it is fetched once
    # (on the first bucket) instead of once per bucket type.
    place_ids: list[str] | None = None

    for bucket_type in input_data["bucket_types"]:
        start_date, end_date = self._get_bucket_range(target_date, bucket_type)
        period_date = self._get_period_date(target_date, bucket_type)

        # Get aggregation data from database
        if self.fact_repo:
            span_data = await self.fact_repo.get_aggregation_data(
                business_id,
                start_date,
                end_date,
            )
            if place_ids is None:
                place_ids = list(
                    await self.fact_repo.get_place_ids_for_business(business_id)
                )
        else:
            span_data = []

        # Aggregate by place_id and URT code; "ALL" spans every location
        for place_id in [*(place_ids or []), "ALL"]:
            place_data = [
                s for s in span_data
                if place_id == "ALL" or s["place_id"] == place_id
            ]
            if not place_data:
                continue

            if place_id != "ALL":
                locations_seen.add(place_id)

            scope = (
                place_data,
                business_id,
                place_id,
                period_date,
                bucket_type,
                input_data["taxonomy_version"],
            )

            # Aggregate by URT code
            for fact in self._aggregate_by_code(*scope):
                codes_aggregated.add(fact["subject_id"])
                await store(fact)

            # Aggregate by domain
            for fact in self._aggregate_by_domain(*scope):
                await store(fact)

            # Overall aggregation
            await store(self._aggregate_overall(*scope))

    logger.info(
        f"Stage 4 complete: {len(facts_written)} facts written, "
        f"{len(codes_aggregated)} unique codes"
    )

    return Stage4Output(
        facts_written=facts_written,
        stats=Stage4Stats(
            business_id=business_id,
            date=input_data["date"],
            locations_processed=len(locations_seen),
            codes_aggregated=len(codes_aggregated),
            facts_upserted=len(facts_written),
        ),
    )
|
||||
|
||||
def _get_bucket_range(
    self,
    target_date: date,
    bucket_type: str,
) -> tuple[date, date]:
    """
    Return the inclusive (start, end) dates of the bucket containing a date.

    Args:
        target_date: Any date inside the bucket
        bucket_type: "day", "week" (Monday through Sunday) or "month"

    Returns:
        Tuple of (first day, last day) of the bucket

    Raises:
        ValueError: If bucket_type is not one of the supported values
    """
    if bucket_type == "day":
        return target_date, target_date

    if bucket_type == "week":
        # weekday() is 0 on Monday, so this steps back to the week's Monday
        monday = target_date - timedelta(days=target_date.weekday())
        return monday, monday + timedelta(days=6)

    if bucket_type == "month":
        # Last day of the month = first day of the following month minus one day
        wraps_year = target_date.month == 12
        following_year = target_date.year + 1 if wraps_year else target_date.year
        following_month = 1 if wraps_year else target_date.month + 1
        first_of_following = target_date.replace(
            year=following_year, month=following_month, day=1
        )
        return target_date.replace(day=1), first_of_following - timedelta(days=1)

    raise ValueError(f"Unknown bucket type: {bucket_type}")
|
||||
|
||||
def _get_period_date(self, target_date: date, bucket_type: str) -> str:
    """
    Return the ISO date string that identifies the bucket containing a date.

    Args:
        target_date: Any date inside the bucket
        bucket_type: "day", "week" or "month"

    Returns:
        ISO-formatted date: the Monday of the week for "week", the first of
        the month for "month", and target_date itself for "day" or any
        other bucket type (no error is raised for unknown types)
    """
    if bucket_type == "week":
        # Weeks are keyed by their Monday
        anchor = target_date - timedelta(days=target_date.weekday())
    elif bucket_type == "month":
        anchor = target_date.replace(day=1)
    else:
        # "day" and unrecognised bucket types are keyed by the date itself
        anchor = target_date

    return anchor.isoformat()
|
||||
|
||||
def _aggregate_by_code(
    self,
    span_data: list[dict[str, Any]],
    business_id: str,
    place_id: str,
    period_date: str,
    bucket_type: str,
    taxonomy_version: str,
) -> list[FactRecord]:
    """
    Build one fact per distinct URT primary code.

    Spans are grouped on their "urt_primary" value and each group is passed
    to _compute_fact_metrics with subject_type "urt_code". Facts come back
    in order of first appearance of each code.
    """
    by_code: dict[str, list[dict]] = defaultdict(list)
    for record in span_data:
        by_code[record["urt_primary"]].append(record)

    return [
        self._compute_fact_metrics(
            members,
            business_id,
            place_id,
            period_date,
            bucket_type,
            "urt_code",
            urt_code,
            taxonomy_version,
        )
        for urt_code, members in by_code.items()
    ]
|
||||
|
||||
def _aggregate_by_domain(
    self,
    span_data: list[dict[str, Any]],
    business_id: str,
    place_id: str,
    period_date: str,
    bucket_type: str,
    taxonomy_version: str,
) -> list[FactRecord]:
    """
    Build one fact per URT domain.

    The domain is the first character of each span's "urt_primary" code.
    Each group is passed to _compute_fact_metrics with subject_type
    "domain". Facts come back in order of first appearance of each domain.
    """
    by_domain: dict[str, list[dict]] = defaultdict(list)
    for record in span_data:
        letter = record["urt_primary"][0]  # leading character of the code
        by_domain[letter].append(record)

    return [
        self._compute_fact_metrics(
            members,
            business_id,
            place_id,
            period_date,
            bucket_type,
            "domain",
            letter,
            taxonomy_version,
        )
        for letter, members in by_domain.items()
    ]
|
||||
|
||||
def _aggregate_overall(
    self,
    span_data: list[dict[str, Any]],
    business_id: str,
    place_id: str,
    period_date: str,
    bucket_type: str,
    taxonomy_version: str,
) -> FactRecord:
    """
    Build the single "overall" fact covering every span passed in.

    Delegates to _compute_fact_metrics with subject_type "overall" and
    subject_id "all".
    """
    subject = ("overall", "all")  # (subject_type, subject_id)
    return self._compute_fact_metrics(
        span_data,
        business_id,
        place_id,
        period_date,
        bucket_type,
        *subject,
        taxonomy_version,
    )
|
||||
|
||||
def _compute_fact_metrics(
    self,
    spans: list[dict[str, Any]],
    business_id: str,
    place_id: str,
    period_date: str,
    bucket_type: str,
    subject_type: str,
    subject_id: str,
    taxonomy_version: str,
) -> FactRecord:
    """
    Roll a group of spans up into a single fact record.

    Args:
        spans: Span dicts; each must carry "valence" and "intensity", and may
            carry "review_id", "span_id", "comparative", "rating" and
            "trust_score"
        business_id: Business identifier
        place_id: Place ID or 'ALL'
        period_date: Period date string
        bucket_type: day/week/month
        subject_type: overall/urt_code/domain/issue
        subject_id: Subject identifier
        taxonomy_version: Taxonomy version

    Returns:
        FactRecord with counts, strength scores and average rating; an
        all-zero record when spans is empty
    """
    if not spans:
        return self._empty_fact(
            business_id, place_id, period_date, bucket_type,
            subject_type, subject_id, taxonomy_version,
        )

    # Distinct reviews: a span without "review_id" is keyed by its own
    # "span_id" (or "" if that is missing too)
    distinct_reviews = {
        item.get("review_id", item.get("span_id", "")) for item in spans
    }
    span_count = len(spans)
    review_count = len(distinct_reviews) or span_count

    # Pull each classified dimension out once, then tally by value
    valences = [item["valence"] for item in spans]
    intensities = [item["intensity"] for item in spans]
    comparatives = [item.get("comparative") for item in spans]

    # Mixed-valence spans count toward the negative side for strength
    adverse = [item for item in spans if item["valence"] in ("V-", "V±")]
    favourable = [item for item in spans if item["valence"] == "V+"]

    strength_score = self._compute_strength_score(spans)
    negative_strength = self._compute_strength_score(adverse)
    positive_strength = self._compute_strength_score(favourable)

    trust_weighted_strength = self._compute_trust_weighted_strength(spans)
    trust_weighted_negative = self._compute_trust_weighted_strength(adverse)

    # NOTE(review): ratings are averaged per span, so a review that
    # contributes several spans weighs more heavily - confirm intended.
    rated = [item["rating"] for item in spans if item.get("rating")]
    avg_rating = sum(rated) / len(rated) if rated else None

    return FactRecord(
        business_id=business_id,
        place_id=place_id,
        period_date=period_date,
        bucket_type=bucket_type,
        subject_type=subject_type,  # type: ignore
        subject_id=subject_id,
        taxonomy_version=taxonomy_version,
        review_count=review_count,
        span_count=span_count,
        negative_count=valences.count("V-"),
        positive_count=valences.count("V+"),
        neutral_count=valences.count("V0"),
        mixed_count=valences.count("V±"),
        strength_score=strength_score,
        negative_strength=negative_strength,
        positive_strength=positive_strength,
        avg_rating=avg_rating,
        i1_count=intensities.count("I1"),
        i2_count=intensities.count("I2"),
        i3_count=intensities.count("I3"),
        cr_better=comparatives.count("CR-B"),
        cr_worse=comparatives.count("CR-W"),
        cr_same=comparatives.count("CR-S"),
        trust_weighted_strength=trust_weighted_strength,
        trust_weighted_negative=trust_weighted_negative,
    )
|
||||
|
||||
def _compute_strength_score(self, spans: list[dict[str, Any]]) -> float:
    """
    Sum the intensity-weighted strength of a group of spans.

    Each span contributes intensity_weight * valence_multiplier:
        I1=1, I2=2, I3=4 (missing or unknown intensity counts as 1)
        V-=1, V±=0.5, V0=0, V+=1 (missing or unknown valence counts as 0)

    Returns:
        Total strength as a float; 0.0 for an empty list
    """
    if not spans:
        return 0.0

    weight_of = {"I1": 1, "I2": 2, "I3": 4}
    factor_of = {"V-": 1.0, "V±": 0.5, "V0": 0.0, "V+": 1.0}

    return sum(
        (
            weight_of.get(item.get("intensity", "I1"), 1)
            * factor_of.get(item.get("valence", "V0"), 0)
            for item in spans
        ),
        0.0,
    )
|
||||
|
||||
def _compute_trust_weighted_strength(self, spans: list[dict[str, Any]]) -> float:
    """
    Sum the strength of a group of spans, scaled by each span's trust.

    Each span contributes intensity_weight * valence_multiplier * trust_score
    with the same weights as the plain strength score (I1=1, I2=2, I3=4;
    V-=1, V±=0.5, V0=0, V+=1). A span without "trust_score" uses 0.5.

    Returns:
        Trust-weighted total as a float; 0.0 for an empty list
    """
    if not spans:
        return 0.0

    weight_of = {"I1": 1, "I2": 2, "I3": 4}
    factor_of = {"V-": 1.0, "V±": 0.5, "V0": 0.0, "V+": 1.0}

    running = 0.0
    for item in spans:
        base = (
            weight_of.get(item.get("intensity", "I1"), 1)
            * factor_of.get(item.get("valence", "V0"), 0)
        )
        # Plain left-to-right accumulation, one span at a time
        running += base * item.get("trust_score", 0.5)

    return running
|
||||
|
||||
def _empty_fact(
    self,
    business_id: str,
    place_id: str,
    period_date: str,
    bucket_type: str,
    subject_type: str,
    subject_id: str,
    taxonomy_version: str,
) -> FactRecord:
    """
    Build a fact record for a subject that has no spans.

    The identifying fields are copied from the arguments; every count is 0,
    every strength score is 0.0 and avg_rating is None.
    """
    # Kept in the same field order as a fully computed fact record
    blank_metrics = {
        "review_count": 0,
        "span_count": 0,
        "negative_count": 0,
        "positive_count": 0,
        "neutral_count": 0,
        "mixed_count": 0,
        "strength_score": 0.0,
        "negative_strength": 0.0,
        "positive_strength": 0.0,
        "avg_rating": None,
        "i1_count": 0,
        "i2_count": 0,
        "i3_count": 0,
        "cr_better": 0,
        "cr_worse": 0,
        "cr_same": 0,
        "trust_weighted_strength": 0.0,
        "trust_weighted_negative": 0.0,
    }

    return FactRecord(  # type: ignore
        business_id=business_id,
        place_id=place_id,
        period_date=period_date,
        bucket_type=bucket_type,
        subject_type=subject_type,
        subject_id=subject_id,
        taxonomy_version=taxonomy_version,
        **blank_metrics,
    )
|
||||
@@ -0,0 +1,23 @@
|
||||
"""Validation rules for pipeline stages."""
|
||||
|
||||
from reviewiq_pipeline.validation.validators import (
|
||||
Stage1Validator,
|
||||
Stage2Validator,
|
||||
Stage3Validator,
|
||||
Stage4Validator,
|
||||
validate_stage1_output,
|
||||
validate_stage2_output,
|
||||
validate_stage3_output,
|
||||
validate_stage4_output,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Stage1Validator",
|
||||
"Stage2Validator",
|
||||
"Stage3Validator",
|
||||
"Stage4Validator",
|
||||
"validate_stage1_output",
|
||||
"validate_stage2_output",
|
||||
"validate_stage3_output",
|
||||
"validate_stage4_output",
|
||||
]
|
||||
@@ -0,0 +1,506 @@
|
||||
"""
|
||||
Validation rules for pipeline stages.
|
||||
|
||||
Implements validation rules V1.x, V2.x, V3.x, V4.x from the contracts.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import TYPE_CHECKING, Any, Callable
|
||||
|
||||
from reviewiq_pipeline.contracts import (
|
||||
ValidationError,
|
||||
ValidationResult,
|
||||
)
|
||||
from reviewiq_pipeline.services.text_processor import is_valid_iso639, is_valid_sha256
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reviewiq_pipeline.contracts import (
|
||||
FactRecord,
|
||||
NormalizedReview,
|
||||
RoutedSpan,
|
||||
Stage1Output,
|
||||
Stage2Output,
|
||||
Stage3Output,
|
||||
Stage4Output,
|
||||
)
|
||||
from reviewiq_pipeline.db.connection import DatabasePool
|
||||
|
||||
# URT code pattern
|
||||
URT_CODE_PATTERN = re.compile(r"^[OPJEAVR][1-4]\.[0-9]{2}$")
|
||||
|
||||
# Issue ID pattern
|
||||
ISSUE_ID_PATTERN = re.compile(r"^ISS-[a-f0-9]{16}$")
|
||||
|
||||
# Valid enum values
|
||||
VALID_VALENCES = {"V+", "V-", "V0", "V±"}
|
||||
VALID_INTENSITIES = {"I1", "I2", "I3"}
|
||||
VALID_SPECIFICITIES = {"S1", "S2", "S3"}
|
||||
VALID_ACTIONABILITIES = {"A1", "A2", "A3"}
|
||||
VALID_TEMPORALS = {"TC", "TR", "TH", "TF"}
|
||||
VALID_EVIDENCES = {"ES", "EI", "EC"}
|
||||
VALID_COMPARATIVES = {"CR-N", "CR-B", "CR-W", "CR-S"}
|
||||
|
||||
|
||||
def _has_control_chars(text: str) -> bool:
    """
    Report whether text contains a C0/C1 control character.

    Tab, line feed and carriage return are not counted.
    """
    control_class = r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]"
    return re.search(control_class, text) is not None
|
||||
|
||||
|
||||
class Stage1Validator:
    """Validator for Stage 1 output."""

    def validate(self, output: Stage1Output) -> ValidationResult:
        """
        Validate Stage 1 output.

        Rules checked:
        - V1.1: text is non-empty string
        - V1.2: text_normalized contains no control chars
        - V1.3: content_hash is 64-char hex
        - V1.4: review_version >= 1
        - V1.5: text_language is valid ISO 639-1

        Not checked here:
        - V1.6: raw_id references valid reviews_raw row (requires DB)

        A field that is present but None or of the wrong type is reported
        under its rule rather than raising, so one malformed review cannot
        abort validation of the whole batch.

        Args:
            output: Stage 1 output; only "reviews_normalized" is read

        Returns:
            ValidationResult for stage "stage1" listing every violation found
        """
        errors: list[ValidationError] = []

        def report(rule: str, identifier: str, message: str) -> None:
            errors.append(ValidationError(
                rule=rule,
                identifier=identifier,
                message=message,
            ))

        for review in output["reviews_normalized"]:
            review_id = review["review_id"]

            # V1.1: Non-empty text (a non-string value also fails this rule)
            text = review.get("text")
            if not isinstance(text, str) or not text.strip():
                report("V1.1", review_id, "Empty text")

            # V1.2: No control characters in normalized text
            normalized = review.get("text_normalized")
            if isinstance(normalized, str) and _has_control_chars(normalized):
                report("V1.2", review_id, "Control chars in normalized text")

            # V1.3: Valid content hash
            content_hash = review.get("content_hash", "")
            if not isinstance(content_hash, str) or not is_valid_sha256(content_hash):
                # str() keeps the preview safe when the stored value is not a string
                report(
                    "V1.3",
                    review_id,
                    f"Invalid content hash: {str(content_hash)[:20]}...",
                )

            # V1.4: Version >= 1
            version = review.get("review_version", 0)
            try:
                version_ok = not (version < 1)
            except TypeError:
                # None or another non-numeric value cannot be a valid version
                version_ok = False
            if not version_ok:
                report(
                    "V1.4",
                    review_id,
                    f"Invalid version: {review.get('review_version')}",
                )

            # V1.5: Valid language code
            language = review.get("text_language", "")
            if not isinstance(language, str) or not is_valid_iso639(language):
                report(
                    "V1.5",
                    review_id,
                    f"Invalid language: {review.get('text_language')}",
                )

        return ValidationResult(
            stage="stage1",
            passed=len(errors) == 0,
            error_count=len(errors),
            errors=errors,
        )
|
||||
|
||||
|
||||
class Stage2Validator:
    """Validator for Stage 2 output."""

    def validate(
        self,
        output: Stage2Output,
        input_reviews: dict[tuple[str, str, int], dict[str, Any]] | None = None,
    ) -> ValidationResult:
        """
        Validate Stage 2 output.

        Rules checked:
        - V2.1: urt_primary matches pattern
        - V2.2: urt_secondary has max 2 elements
        - V2.3: valence is valid enum
        - V2.4: intensity is valid enum
        - V2.5: span_end > span_start
        - V2.6: span_text matches text[span_start:span_end]
          (only when input_reviews supplies the original text)
        - V2.7: spans do not overlap
        - V2.8: exactly one is_primary per review
        - V2.9: trust_score between 0.2 and 1.0
        - V2.10: embedding is 384-dim array (absent/empty embeddings are skipped)
        - V2.12: related_span_index references valid span

        Not checked here:
        - V2.11: usn matches profile-specific regex

        Fields that are present but None or ill-typed are reported under
        their rule instead of raising.

        Args:
            output: Stage 2 output to validate
            input_reviews: Optional dict mapping (source, review_id, version) -> review data

        Returns:
            ValidationResult for stage "stage2" listing every violation found
        """
        errors: list[ValidationError] = []

        for review in output["reviews_classified"]:
            review_id = review["review_id"]

            self._check_review_fields(review, review_id, errors)

            # Get original text if available
            original_text = ""
            if input_reviews:
                key = (review["source"], review["review_id"], review["review_version"])
                original_text = input_reviews.get(key, {}).get("text") or ""

            primary_count = self._check_spans(
                review.get("spans") or [], review_id, original_text, errors
            )

            # V2.8: Exactly one primary
            if primary_count != 1:
                errors.append(ValidationError(
                    rule="V2.8",
                    identifier=review_id,
                    message=f"Primary span count: {primary_count}",
                ))

        return ValidationResult(
            stage="stage2",
            passed=len(errors) == 0,
            error_count=len(errors),
            errors=errors,
        )

    def _check_review_fields(
        self,
        review: dict[str, Any],
        review_id: str,
        errors: list[ValidationError],
    ) -> None:
        """Apply the review-level rules (V2.1-V2.4, V2.9, V2.10), appending to errors."""
        # V2.1: Valid URT code. fullmatch is used because "$" in the pattern
        # would otherwise also accept a code followed by a trailing newline.
        urt_primary = review.get("urt_primary", "")
        if not isinstance(urt_primary, str) or not URT_CODE_PATTERN.fullmatch(urt_primary):
            errors.append(ValidationError(
                rule="V2.1",
                identifier=review_id,
                message=f"Invalid URT code: {review.get('urt_primary')}",
            ))

        # V2.2: Max 2 secondary codes
        secondary = review.get("urt_secondary") or []
        if len(secondary) > 2:
            errors.append(ValidationError(
                rule="V2.2",
                identifier=review_id,
                message=f"Too many secondary codes: {len(secondary)}",
            ))

        # V2.3: Valid valence
        valence = review.get("valence")
        if not isinstance(valence, str) or valence not in VALID_VALENCES:
            errors.append(ValidationError(
                rule="V2.3",
                identifier=review_id,
                message=f"Invalid valence: {valence}",
            ))

        # V2.4: Valid intensity
        intensity = review.get("intensity")
        if not isinstance(intensity, str) or intensity not in VALID_INTENSITIES:
            errors.append(ValidationError(
                rule="V2.4",
                identifier=review_id,
                message=f"Invalid intensity: {intensity}",
            ))

        # V2.9: Trust score bounds
        trust = review.get("trust_score", 0)
        try:
            trust_ok = 0.2 <= trust <= 1.0
        except TypeError:
            # None or another non-numeric value is out of bounds by definition
            trust_ok = False
        if not trust_ok:
            errors.append(ValidationError(
                rule="V2.9",
                identifier=review_id,
                message=f"Trust score out of bounds: {trust}",
            ))

        # V2.10: Embedding dimension. len() is used instead of truthiness so
        # array-like embeddings (whose truth value may be ambiguous) work;
        # absent or empty embeddings are skipped.
        embedding = review.get("embedding")
        dimension = len(embedding) if hasattr(embedding, "__len__") else 0
        if dimension not in (0, 384):
            errors.append(ValidationError(
                rule="V2.10",
                identifier=review_id,
                message=f"Invalid embedding dimension: {dimension}",
            ))

    def _check_spans(
        self,
        spans: list[dict[str, Any]],
        review_id: str,
        original_text: str,
        errors: list[ValidationError],
    ) -> int:
        """Apply the span-level rules (V2.5-V2.7, V2.12); return the number of primary spans."""
        primary_count = 0
        seen_ranges: list[tuple[int, int]] = []

        for position, span in enumerate(spans):
            span_id = span.get("span_id", f"{review_id}:span")

            # V2.5: Valid bounds
            start = span.get("span_start", 0)
            end = span.get("span_end", 0)
            bounds_are_ints = isinstance(start, int) and isinstance(end, int)
            if not bounds_are_ints or end <= start:
                errors.append(ValidationError(
                    rule="V2.5",
                    identifier=span_id,
                    message=f"Invalid bounds: {start}:{end}",
                ))

            # V2.6 and V2.7 slice and compare the bounds, so they only run
            # when both bounds are integers.
            if bounds_are_ints:
                # V2.6: Text matches (if we have original)
                span_text = span.get("span_text")
                if original_text and span_text:
                    # Allow whitespace normalization
                    expected_norm = " ".join(original_text[start:end].split())
                    actual_norm = " ".join(span_text.split())
                    if expected_norm != actual_norm:
                        errors.append(ValidationError(
                            rule="V2.6",
                            identifier=span_id,
                            message=f"Text mismatch at {start}:{end}",
                        ))

                # V2.7: Check overlap (reported once per span)
                if any(
                    not (end <= prev_start or start >= prev_end)
                    for prev_start, prev_end in seen_ranges
                ):
                    errors.append(ValidationError(
                        rule="V2.7",
                        identifier=span_id,
                        message="Overlapping span",
                    ))
                seen_ranges.append((start, end))

            # V2.8: Count primaries
            if span.get("is_primary"):
                primary_count += 1

            # V2.12: Valid related_span_index
            related_idx = span.get("related_span_index")
            if related_idx is not None:
                if (
                    not isinstance(related_idx, int)
                    or related_idx < 0
                    or related_idx >= len(spans)
                ):
                    errors.append(ValidationError(
                        rule="V2.12",
                        identifier=span_id,
                        message=f"Invalid related_span_index: {related_idx}",
                    ))
                # Fall back to the list position when span_index is absent so
                # a self-reference is still detected.
                elif related_idx == span.get("span_index", position):
                    errors.append(ValidationError(
                        rule="V2.12",
                        identifier=span_id,
                        message="Self-referencing span",
                    ))

        return primary_count
|
||||
|
||||
|
||||
class Stage3Validator:
    """Validator for Stage 3 output."""

    def __init__(self, db: DatabasePool | None = None):
        # Optional database handle; without it only the format rules run
        self.db = db

    async def validate(self, output: Stage3Output) -> ValidationResult:
        """
        Validate Stage 3 output.

        Rules checked:
        - V3.1: issue_id matches pattern
        - V3.2: routing_key is non-empty
        - V3.3: span not already linked to different issue (needs database)
        - V3.4: issue exists in issues table (needs database)

        Not checked here:
        - V3.5: only V-/V± spans create issues

        Args:
            output: Stage 3 output; only "routed_spans" is read

        Returns:
            ValidationResult for stage "stage3" listing every violation found
        """
        errors: list[ValidationError] = []

        for routed in output["routed_spans"]:
            errors.extend(self._format_errors(routed))

            # V3.3, V3.4: Require database for these checks
            if self.db:
                errors.extend(await self._database_errors(routed))

        return self._result(errors)

    def validate_sync(self, output: Stage3Output) -> ValidationResult:
        """
        Synchronous validation without database checks.

        Applies only V3.1 (issue_id format) and V3.2 (non-empty routing_key).
        """
        errors: list[ValidationError] = []

        for routed in output["routed_spans"]:
            errors.extend(self._format_errors(routed))

        return self._result(errors)

    def _format_errors(self, routed: RoutedSpan) -> list[ValidationError]:
        """Return the V3.1/V3.2 violations for one routed span."""
        found: list[ValidationError] = []
        span_id = routed["span_id"]

        # V3.1: Valid issue ID format. fullmatch is used because "$" in the
        # pattern would otherwise also accept an id with a trailing newline.
        issue_id = routed.get("issue_id", "")
        if not isinstance(issue_id, str) or not ISSUE_ID_PATTERN.fullmatch(issue_id):
            found.append(ValidationError(
                rule="V3.1",
                identifier=span_id,
                message=f"Invalid issue_id: {routed.get('issue_id')}",
            ))

        # V3.2: Non-empty routing key
        if not routed.get("routing_key"):
            found.append(ValidationError(
                rule="V3.2",
                identifier=span_id,
                message="Empty routing key",
            ))

        return found

    async def _database_errors(self, routed: RoutedSpan) -> list[ValidationError]:
        """Return the V3.3/V3.4 violations for one routed span (queries self.db)."""
        found: list[ValidationError] = []
        span_id = routed["span_id"]
        # .get so a missing issue_id (already reported as V3.1) cannot raise here
        issue_id = routed.get("issue_id")

        # V3.3: Check no duplicate routing
        existing = await self.db.fetchval(
            "SELECT issue_id FROM issue_spans WHERE span_id = $1",
            span_id,
        )
        if existing and existing != issue_id:
            found.append(ValidationError(
                rule="V3.3",
                identifier=span_id,
                message=f"Already routed to {existing}",
            ))

        # V3.4: Issue exists
        issue_exists = await self.db.fetchval(
            "SELECT 1 FROM issues WHERE issue_id = $1",
            issue_id,
        )
        if not issue_exists:
            found.append(ValidationError(
                rule="V3.4",
                identifier=span_id,
                message=f"Issue not found: {issue_id}",
            ))

        return found

    @staticmethod
    def _result(errors: list[ValidationError]) -> ValidationResult:
        """Wrap collected errors in a stage3 ValidationResult."""
        return ValidationResult(
            stage="stage3",
            passed=len(errors) == 0,
            error_count=len(errors),
            errors=errors,
        )
|
||||
|
||||
|
||||
class Stage4Validator:
    """Validator for Stage 4 output."""

    def validate(self, output: Stage4Output) -> ValidationResult:
        """
        Validate Stage 4 output.

        Rules checked for every fact in "facts_written":
        - V4.1: place_id is non-empty (no further validity check is made)
        - V4.3: span_count >= review_count
        - V4.4: valence counts sum to span_count
        - V4.5: intensity counts sum to span_count
        - V4.6: strength_score >= 0
        - V4.7: avg_rating between 1.0 and 5.0 (or NULL)

        Not checked here:
        - V4.2: period_date matches bucket

        Returns:
            ValidationResult for stage "stage4" listing every violation found
        """
        problems: list[ValidationError] = []

        for fact in output["facts_written"]:
            label = f"{fact['subject_type']}:{fact['subject_id']}"
            span_total = fact.get("span_count", 0)

            # (rule, message) pairs for this fact, in rule order
            findings: list[tuple[str, str]] = []

            # V4.1
            if not fact.get("place_id", ""):
                findings.append(("V4.1", "Empty place_id"))

            # V4.3
            if span_total < fact.get("review_count", 0):
                findings.append((
                    "V4.3",
                    f"span_count ({fact.get('span_count')}) < review_count ({fact.get('review_count')})",
                ))

            # V4.4
            negatives = fact.get("negative_count", 0)
            positives = fact.get("positive_count", 0)
            neutrals = fact.get("neutral_count", 0)
            mixed = fact.get("mixed_count", 0)
            valence_sum = negatives + positives + neutrals + mixed
            if valence_sum != span_total:
                findings.append((
                    "V4.4",
                    f"Valence sum {valence_sum} != span_count {fact.get('span_count')}",
                ))

            # V4.5
            intensity_sum = (
                fact.get("i1_count", 0)
                + fact.get("i2_count", 0)
                + fact.get("i3_count", 0)
            )
            if intensity_sum != span_total:
                findings.append((
                    "V4.5",
                    f"Intensity sum {intensity_sum} != span_count {fact.get('span_count')}",
                ))

            # V4.6
            if fact.get("strength_score", 0) < 0:
                findings.append((
                    "V4.6",
                    f"Negative strength_score: {fact.get('strength_score')}",
                ))

            # V4.7
            avg_rating = fact.get("avg_rating")
            if avg_rating is not None and not (1.0 <= avg_rating <= 5.0):
                findings.append(("V4.7", f"Invalid avg_rating: {avg_rating}"))

            problems.extend(
                ValidationError(rule=rule, identifier=label, message=message)
                for rule, message in findings
            )

        return ValidationResult(
            stage="stage4",
            passed=len(problems) == 0,
            error_count=len(problems),
            errors=problems,
        )
|
||||
|
||||
|
||||
# Convenience functions
|
||||
def validate_stage1_output(output: Stage1Output) -> ValidationResult:
    """Run a fresh Stage1Validator over output and return its result."""
    validator = Stage1Validator()
    return validator.validate(output)
|
||||
|
||||
|
||||
def validate_stage2_output(
    output: Stage2Output,
    input_reviews: dict[tuple[str, str, int], dict[str, Any]] | None = None,
) -> ValidationResult:
    """
    Run a fresh Stage2Validator over output and return its result.

    input_reviews is forwarded unchanged to Stage2Validator.validate.
    """
    validator = Stage2Validator()
    return validator.validate(output, input_reviews)
|
||||
|
||||
|
||||
async def validate_stage3_output(
    output: Stage3Output,
    db: DatabasePool | None = None,
) -> ValidationResult:
    """
    Validate Stage 3 output, using the database checks only when db is given.

    Without a (truthy) db the synchronous, format-only validation is used.
    """
    validator = Stage3Validator(db)
    if not db:
        return validator.validate_sync(output)
    return await validator.validate(output)
|
||||
|
||||
|
||||
def validate_stage4_output(output: Stage4Output) -> ValidationResult:
    """Run a fresh Stage4Validator over output and return its result."""
    validator = Stage4Validator()
    return validator.validate(output)
|
||||
Reference in New Issue
Block a user