Implement a standalone Python package for processing customer reviews through a 4-stage pipeline using URT (Universal Review Taxonomy) v5.1:

- Stage 1: Normalization (text cleaning, language detection, deduplication)
- Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes)
- Stage 3: Issue Routing (deterministic issue ID generation, span linking)
- Stage 4: Fact Aggregation (time series metrics for dashboards)

Package includes:

- TypedDict contracts matching Pipeline-Contracts-v1.md
- Async database layer with asyncpg and 5 SQL migrations
- LLM client abstraction supporting both OpenAI and Anthropic
- Sentence-transformers integration for embeddings
- Validation rules V1.x through V4.x
- CLI commands: migrate, run, validate, check
- 55 unit and integration tests (all passing)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
270 lines
8.4 KiB
Python
270 lines
8.4 KiB
Python
"""
|
|
Pytest configuration and fixtures for reviewiq-pipeline tests.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
# Sample data fixtures matching the contract examples
|
|
|
|
|
|
@pytest.fixture
def sample_raw_review() -> dict[str, Any]:
    """Build one raw review record shaped like a Stage 0 output entry.

    Returns:
        A newly created dict on every call, holding the review and author
        identifiers, a rating of 2, the review text and timestamp, no
        response text, an empty photo list and an empty raw payload.
    """
    # Adjacent literals are concatenated at compile time into one string.
    review_text = (
        "The food was great but the wait was absolutely terrible. "
        "We waited 45 minutes just to be seated, and another 30 minutes for our appetizers. "
        "The server Mike was rude and dismissive when we complained. "
        "However, the steak was cooked perfectly and the dessert was amazing."
    )
    raw_review: dict[str, Any] = {
        "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB",
        "author_name": "John Smith",
        "author_id": "103456789012345678901",
        "rating": 2,
        "text": review_text,
        "review_time": "2026-01-20T14:30:00Z",
        "response_text": None,
        "photos": [],
        "raw_payload": {},
    }
    return raw_review
|
|
|
|
|
|
@pytest.fixture
def sample_scraper_output(sample_raw_review: dict) -> dict[str, Any]:
    """Build a complete Stage 0 output wrapping a single raw review.

    Args:
        sample_raw_review: The raw review placed (by reference, not copied)
            as the only element of ``reviews``.

    Returns:
        A dict with job/business identifiers, a ``business_info`` block,
        the one-element ``reviews`` list and scrape bookkeeping values.
    """
    business_info = {
        "name": "Acme Restaurant",
        "address": "123 Main St, Anytown, USA",
        "category": "Restaurant",
        "total_reviews": 1247,
        "average_rating": 4.2,
    }
    scraped_reviews = [sample_raw_review]
    return {
        "job_id": "test-job-001",
        "status": "completed",
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "business_info": business_info,
        "reviews": scraped_reviews,
        "scrape_time_ms": 12500,
        "reviews_scraped": 1,
        "scraper_version": "v1.0.0",
    }
|
|
|
|
|
|
@pytest.fixture
def sample_normalized_review() -> dict[str, Any]:
    """Build one normalized review record shaped like a Stage 1 output entry.

    Returns:
        A newly created dict on every call carrying the original text, a
        lower-cased punctuation-free ``text_normalized`` variant, language
        and size fields, rating, timestamp, author, content hash and raw id.
    """
    # Adjacent literals are concatenated at compile time into one string.
    original_text = (
        "The food was great but the wait was absolutely terrible. "
        "We waited 45 minutes just to be seated, and another 30 minutes for our appetizers. "
        "The server Mike was rude and dismissive when we complained. "
        "However, the steak was cooked perfectly and the dessert was amazing."
    )
    normalized_text = (
        "the food was great but the wait was absolutely terrible "
        "we waited 45 minutes just to be seated and another 30 minutes for our appetizers "
        "the server mike was rude and dismissive when we complained "
        "however the steak was cooked perfectly and the dessert was amazing"
    )
    normalized_review: dict[str, Any] = {
        "source": "google",
        "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB",
        "review_version": 1,
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "text": original_text,
        "text_normalized": normalized_text,
        "text_language": "en",
        # NOTE(review): the two counts below are hard-coded rather than
        # derived from the text; by a manual count len(text) looks like 268
        # and a whitespace split yields 46 words. Confirm how Stage 1
        # defines these fields before asserting on them.
        "text_length": 267,
        "word_count": 52,
        "rating": 2,
        "review_time": "2026-01-20T14:30:00Z",
        "author_name": "John Smith",
        "content_hash": "a1b2c3d4e5f6789012345678901234567890123456789012345678901234abcd",
        "raw_id": 12345,
    }
    return normalized_review
|
|
|
|
|
|
@pytest.fixture
def sample_stage1_output(sample_normalized_review: dict) -> dict[str, Any]:
    """Build a complete Stage 1 output wrapping a single normalized review.

    Args:
        sample_normalized_review: The normalized review placed (by
            reference, not copied) as the only entry of
            ``reviews_normalized``.

    Returns:
        A dict with job/business identifiers, the one-element review list
        and a ``stats`` block reporting one input, one output and no skips.
    """
    stats = {
        "input_count": 1,
        "output_count": 1,
        "skipped_empty": 0,
        "skipped_duplicate": 0,
    }
    stage1_output: dict[str, Any] = {
        "job_id": "test-job-001",
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "reviews_normalized": [sample_normalized_review],
        "stats": stats,
    }
    return stage1_output
|
|
|
|
|
|
@pytest.fixture
def sample_span() -> dict[str, Any]:
    """Build one extracted span record.

    Returns:
        A newly created dict on every call. Keys appear in the order:
        span identity and position first, then the classification labels,
        the ``usn`` string and the ``is_primary`` flag (``True``).
    """
    # Where the span sits: id, index, quoted text and character offsets.
    position = {
        "span_id": "SPN-b2c3d4e5f6789012",
        "span_index": 1,
        "span_text": (
            "the wait was absolutely terrible. "
            "We waited 45 minutes just to be seated, and another 30 minutes for our appetizers"
        ),
        "span_start": 23,
        "span_end": 138,
    }
    # Labels attached to the span.
    labels = {
        "profile": "standard",
        "urt_primary": "J1.01",
        "urt_secondary": [],
        "valence": "V-",
        "intensity": "I3",
        "comparative": "CR-N",
        "specificity": "S3",
        "actionability": "A2",
        "temporal": "TC",
        "evidence": "EC",
        "confidence": "high",
        "usn": "URT:S:J1.01:-3:32TC.EC.N",
        "is_primary": True,
    }
    # Unpacking keeps insertion order: position keys, then label keys.
    return {**position, **labels}
|
|
|
|
|
|
@pytest.fixture
def sample_classified_review(sample_span: dict) -> dict[str, Any]:
    """Build one classified review record shaped like a Stage 2 output entry.

    Args:
        sample_span: Span placed (by reference, not copied) as the second
            element of ``spans``, after an inline span with index 0.

    Returns:
        A dict with review identifiers, review-level labels, staff
        mentions, per-code quotes, a trust score, a 384-element embedding,
        the two spans, a confidence block and a processing time.
    """
    # First span, defined inline; it is flagged as not primary.
    opening_span = {
        "span_id": "SPN-a1b2c3d4e5f67890",
        "span_index": 0,
        "span_text": "The food was great",
        "span_start": 0,
        "span_end": 18,
        "profile": "standard",
        "urt_primary": "O1.01",
        "urt_secondary": [],
        "valence": "V+",
        "intensity": "I2",
        "comparative": "CR-N",
        "confidence": "high",
        "usn": "URT:S:O1.01:+2:21TC.ES.N",
        "is_primary": False,
    }
    quotes_by_code = {
        "J1.01": "waited 45 minutes just to be seated",
        "P1.02": "rude and dismissive",
    }
    placeholder_embedding = [0.1] * 384  # Placeholder
    classified_review: dict[str, Any] = {
        "source": "google",
        "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB",
        "review_version": 1,
        "urt_primary": "J1.01",
        "urt_secondary": ["P1.02"],
        "valence": "V±",
        "intensity": "I3",
        "comparative": "CR-N",
        "staff_mentions": ["Mike"],
        "quotes": quotes_by_code,
        "trust_score": 0.85,
        "embedding": placeholder_embedding,
        "spans": [opening_span, sample_span],
        "classification_confidence": {"overall": 0.85},
        "processing_time_ms": 500,
    }
    return classified_review
|
|
|
|
|
|
@pytest.fixture
def sample_stage2_output(sample_classified_review: dict) -> dict[str, Any]:
    """Build a complete Stage 2 output wrapping a single classified review.

    Args:
        sample_classified_review: The classified review placed (by
            reference, not copied) as the only entry of
            ``reviews_classified``.

    Returns:
        A dict with batch/version identifiers, the one-element review list
        and a ``stats`` block of counts, token usage and cost.
    """
    stats = {
        "input_count": 1,
        "success_count": 1,
        "error_count": 0,
        "total_spans": 2,
        "avg_spans_per_review": 2.0,
        "llm_tokens_used": 1500,
        "llm_cost_usd": 0.001,
    }
    stage2_output: dict[str, Any] = {
        "batch_id": "batch001",
        "taxonomy_version": "v5.1",
        "model_version": "gpt-4o-mini",
        "prompt_version": "v1.0",
        "reviews_classified": [sample_classified_review],
        "stats": stats,
    }
    return stage2_output
|
|
|
|
|
|
@pytest.fixture
def sample_routed_span() -> dict[str, Any]:
    """Build one routed-span record shaped like a Stage 3 output entry.

    Returns:
        A newly created dict on every call that pairs a span id with an
        issue id and routing key, and flags the issue as new.
    """
    routed_span: dict[str, Any] = {
        "span_id": "SPN-b2c3d4e5f6789012",
        "issue_id": "ISS-7a8b9c0d1e2f3a4b",
        "routing_key": "acme-corp|ChIJN1t_tDeuEmsRUsoyG83frY4|J1.01|",
        "is_new_issue": True,
    }
    return routed_span
|
|
|
|
|
|
@pytest.fixture
def sample_stage3_output(sample_routed_span: dict) -> dict[str, Any]:
    """Build a complete Stage 3 output wrapping a single routed span.

    Args:
        sample_routed_span: The routed span placed (by reference, not
            copied) as the only entry of ``routed_spans``.

    Returns:
        A dict listing the routed span, one created issue id, no updated
        issues, and a ``stats`` block (2 processed, 1 routed, 1 skipped).
    """
    stats = {
        "spans_processed": 2,
        "spans_routed": 1,
        "spans_skipped": 1,
        "issues_created": 1,
        "issues_updated": 0,
    }
    created_issue_ids = ["ISS-7a8b9c0d1e2f3a4b"]
    stage3_output: dict[str, Any] = {
        "routed_spans": [sample_routed_span],
        "issues_created": created_issue_ids,
        "issues_updated": [],
        "stats": stats,
    }
    return stage3_output
|
|
|
|
|
|
@pytest.fixture
def sample_fact() -> dict[str, Any]:
    """Build one fact record shaped like a Stage 4 output entry.

    Returns:
        A newly created, flat dict on every call. Keys appear in the order:
        identifying fields, review/span/valence counts, strength values and
        average rating, ``i*`` counts, ``cr_*`` counts, then the
        trust-weighted values.
    """
    subject_keys = {
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "period_date": "2026-01-20",
        "bucket_type": "day",
        "subject_type": "urt_code",
        "subject_id": "J1.01",
        "taxonomy_version": "v5.1",
    }
    counts = {
        "review_count": 1,
        "span_count": 1,
        "negative_count": 1,
        "positive_count": 0,
        "neutral_count": 0,
        "mixed_count": 0,
    }
    strengths = {
        "strength_score": 4.0,
        "negative_strength": 4.0,
        "positive_strength": 0.0,
        "avg_rating": 2.0,
    }
    intensity_counts = {
        "i1_count": 0,
        "i2_count": 0,
        "i3_count": 1,
    }
    comparative_counts = {
        "cr_better": 0,
        "cr_worse": 0,
        "cr_same": 0,
    }
    trust_weighted = {
        "trust_weighted_strength": 3.4,
        "trust_weighted_negative": 3.4,
    }
    # Unpacking in this sequence keeps the key order of the flat record.
    return {
        **subject_keys,
        **counts,
        **strengths,
        **intensity_counts,
        **comparative_counts,
        **trust_weighted,
    }
|
|
|
|
|
|
@pytest.fixture
def sample_stage4_output(sample_fact: dict) -> dict[str, Any]:
    """Build a complete Stage 4 output wrapping a single fact record.

    Args:
        sample_fact: The fact placed (by reference, not copied) as the only
            entry of ``facts_written``.

    Returns:
        A dict with the one-element fact list and a ``stats`` block naming
        the business, the date and three counters, each set to 1.
    """
    stats = {
        "business_id": "acme-corp",
        "date": "2026-01-20",
        "locations_processed": 1,
        "codes_aggregated": 1,
        "facts_upserted": 1,
    }
    stage4_output: dict[str, Any] = {
        "facts_written": [sample_fact],
        "stats": stats,
    }
    return stage4_output
|
|
|
|
|
|
@pytest.fixture
def fixtures_dir() -> Path:
    """Return the ``fixtures`` directory path located beside this module.

    The path is computed only; nothing checks that the directory exists.
    """
    module_dir = Path(__file__).parent
    return module_dir.joinpath("fixtures")
|
|
|
|
|
|
# Helper to load JSON fixtures
|
|
def load_fixture(name: str) -> dict[str, Any]:
    """Load a JSON fixture by name.

    Reads ``fixtures/<name>.json`` from the directory containing this
    module and returns the decoded document.

    Args:
        name: Fixture file name without the ``.json`` suffix.

    Returns:
        The parsed JSON content of the fixture file.

    Raises:
        FileNotFoundError: If the fixture file does not exist. The message
            is ``"Fixture not found: <name>"``.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    fixture_path = Path(__file__).parent / "fixtures" / f"{name}.json"
    try:
        # Decode explicitly as UTF-8: the platform default codec would
        # garble or reject non-ASCII fixture content on some systems.
        # Reading directly (rather than checking exists() first) avoids a
        # race between the check and the read.
        raw = fixture_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # Keep one error type and message for every "no such fixture" case.
        raise FileNotFoundError(f"Fixture not found: {name}") from None
    return json.loads(raw)
|