Files
whyrating-engine-legacy/packages/reviewiq-pipeline/tests/conftest.py
Alejandro Gutiérrez 7d720f5378 feat: Add reviewiq-pipeline package for LLM-powered review classification
Implement a standalone Python package for processing customer reviews through
a 4-stage pipeline using URT (Universal Review Taxonomy) v5.1:

- Stage 1: Normalization (text cleaning, language detection, deduplication)
- Stage 2: LLM Classification (OpenAI/Anthropic span extraction with URT codes)
- Stage 3: Issue Routing (deterministic issue ID generation, span linking)
- Stage 4: Fact Aggregation (time series metrics for dashboards)

Package includes:
- TypedDict contracts matching Pipeline-Contracts-v1.md
- Async database layer with asyncpg and 5 SQL migrations
- LLM client abstraction supporting both OpenAI and Anthropic
- Sentence-transformers integration for embeddings
- Validation rules V1.x through V4.x
- CLI commands: migrate, run, validate, check
- 55 unit and integration tests (all passing)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:07:11 +00:00

270 lines
8.4 KiB
Python

"""
Pytest configuration and fixtures for reviewiq-pipeline tests.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import pytest
# Sample data fixtures matching the contract examples
@pytest.fixture
def sample_raw_review() -> dict[str, Any]:
    """Build a single raw review record shaped like a Stage 0 review entry.

    Returns:
        A new dict holding the review/author identifiers, rating, text,
        timestamp, and the empty response/photos/payload fields.
    """
    review_body = "The food was great but the wait was absolutely terrible. We waited 45 minutes just to be seated, and another 30 minutes for our appetizers. The server Mike was rude and dismissive when we complained. However, the steak was cooked perfectly and the dessert was amazing."
    raw_review: dict[str, Any] = {
        "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB",
        "author_name": "John Smith",
        "author_id": "103456789012345678901",
        "rating": 2,
        "text": review_body,
        "review_time": "2026-01-20T14:30:00Z",
        "response_text": None,
        "photos": [],
        "raw_payload": {},
    }
    return raw_review
@pytest.fixture
def sample_scraper_output(sample_raw_review: dict) -> dict[str, Any]:
    """Build a completed Stage 0 (scraper) job payload.

    Args:
        sample_raw_review: Raw review dict used as the only entry of the
            ``reviews`` list.

    Returns:
        A new dict with job metadata, business info, the review list, and
        scrape statistics.
    """
    business_info: dict[str, Any] = {
        "name": "Acme Restaurant",
        "address": "123 Main St, Anytown, USA",
        "category": "Restaurant",
        "total_reviews": 1247,
        "average_rating": 4.2,
    }
    scraped_reviews = [sample_raw_review]

    payload: dict[str, Any] = {
        "job_id": "test-job-001",
        "status": "completed",
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "business_info": business_info,
        "reviews": scraped_reviews,
        "scrape_time_ms": 12500,
        "reviews_scraped": 1,
        "scraper_version": "v1.0.0",
    }
    return payload
@pytest.fixture
def sample_normalized_review() -> dict[str, Any]:
    """Build a single normalized review record shaped like Stage 1 output.

    Returns:
        A new dict with the original and normalized text, language and
        length metadata, rating, author, content hash, and raw row id.
    """
    original_text = "The food was great but the wait was absolutely terrible. We waited 45 minutes just to be seated, and another 30 minutes for our appetizers. The server Mike was rude and dismissive when we complained. However, the steak was cooked perfectly and the dessert was amazing."
    normalized_text = "the food was great but the wait was absolutely terrible we waited 45 minutes just to be seated and another 30 minutes for our appetizers the server mike was rude and dismissive when we complained however the steak was cooked perfectly and the dessert was amazing"

    # NOTE(review): as written here, `text` appears to be 268 characters and
    # 46 whitespace-separated words, which disagrees with text_length (267)
    # and word_count (52) below. Values are kept unchanged; confirm against
    # the contract example before relying on them.
    normalized: dict[str, Any] = {
        "source": "google",
        "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB",
        "review_version": 1,
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "text": original_text,
        "text_normalized": normalized_text,
        "text_language": "en",
        "text_length": 267,
        "word_count": 52,
        "rating": 2,
        "review_time": "2026-01-20T14:30:00Z",
        "author_name": "John Smith",
        "content_hash": "a1b2c3d4e5f6789012345678901234567890123456789012345678901234abcd",
        "raw_id": 12345,
    }
    return normalized
@pytest.fixture
def sample_stage1_output(sample_normalized_review: dict) -> dict[str, Any]:
    """Build a Stage 1 result envelope around one normalized review.

    Args:
        sample_normalized_review: Normalized review dict used as the only
            entry of ``reviews_normalized``.

    Returns:
        A new dict with job/business identifiers, the review list, and the
        input/output/skip counters.
    """
    counters = {
        "input_count": 1,
        "output_count": 1,
        "skipped_empty": 0,
        "skipped_duplicate": 0,
    }
    envelope: dict[str, Any] = {
        "job_id": "test-job-001",
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "reviews_normalized": [sample_normalized_review],
        "stats": counters,
    }
    return envelope
@pytest.fixture
def sample_span() -> dict[str, Any]:
    """Build one extracted span record (negative-valence, marked primary).

    Returns:
        A new dict combining the span's location in the review text, its
        URT classification attributes, and its confidence/USN/primary flags.
    """
    # Where the span sits in the source review text.
    location: dict[str, Any] = {
        "span_id": "SPN-b2c3d4e5f6789012",
        "span_index": 1,
        "span_text": "the wait was absolutely terrible. We waited 45 minutes just to be seated, and another 30 minutes for our appetizers",
        "span_start": 23,
        "span_end": 138,
    }
    # Classification codes assigned to the span.
    classification: dict[str, Any] = {
        "profile": "standard",
        "urt_primary": "J1.01",
        "urt_secondary": [],
        "valence": "V-",
        "intensity": "I3",
        "comparative": "CR-N",
        "specificity": "S3",
        "actionability": "A2",
        "temporal": "TC",
        "evidence": "EC",
    }
    summary: dict[str, Any] = {
        "confidence": "high",
        "usn": "URT:S:J1.01:-3:32TC.EC.N",
        "is_primary": True,
    }
    # Merging in this order keeps the same key order as a single literal.
    return {**location, **classification, **summary}
@pytest.fixture
def sample_classified_review(sample_span: dict) -> dict[str, Any]:
    """Build a single classified review record shaped like Stage 2 output.

    Args:
        sample_span: Span dict appended as the second entry of ``spans``,
            after an inline positive span.

    Returns:
        A new dict with review-level URT codes, staff mentions, quotes,
        trust score, a 384-element placeholder embedding, and two spans.
    """
    positive_span: dict[str, Any] = {
        "span_id": "SPN-a1b2c3d4e5f67890",
        "span_index": 0,
        "span_text": "The food was great",
        "span_start": 0,
        "span_end": 18,
        "profile": "standard",
        "urt_primary": "O1.01",
        "urt_secondary": [],
        "valence": "V+",
        "intensity": "I2",
        "comparative": "CR-N",
        "confidence": "high",
        "usn": "URT:S:O1.01:+2:21TC.ES.N",
        "is_primary": False,
    }
    quotes_by_code = {
        "J1.01": "waited 45 minutes just to be seated",
        "P1.02": "rude and dismissive",
    }
    placeholder_embedding = [0.1] * 384  # Placeholder

    classified: dict[str, Any] = {
        "source": "google",
        "review_id": "ChdDSUhNMG9nS0VJQ0FnSURBdWJQX3h3RRAB",
        "review_version": 1,
        "urt_primary": "J1.01",
        "urt_secondary": ["P1.02"],
        # NOTE(review): the review-level valence is an empty string while the
        # spans carry "V+"/"V-". Kept as-is; confirm whether this is
        # intentional or a value that was lost.
        "valence": "",
        "intensity": "I3",
        "comparative": "CR-N",
        "staff_mentions": ["Mike"],
        "quotes": quotes_by_code,
        "trust_score": 0.85,
        "embedding": placeholder_embedding,
        "spans": [positive_span, sample_span],
        "classification_confidence": {"overall": 0.85},
        "processing_time_ms": 500,
    }
    return classified
@pytest.fixture
def sample_stage2_output(sample_classified_review: dict) -> dict[str, Any]:
    """Build a Stage 2 result envelope around one classified review.

    Args:
        sample_classified_review: Classified review dict used as the only
            entry of ``reviews_classified``.

    Returns:
        A new dict with batch/version identifiers, the review list, and
        the batch statistics.
    """
    batch_stats: dict[str, Any] = {
        "input_count": 1,
        "success_count": 1,
        "error_count": 0,
        "total_spans": 2,
        "avg_spans_per_review": 2.0,
        "llm_tokens_used": 1500,
        "llm_cost_usd": 0.001,
    }
    envelope: dict[str, Any] = {
        "batch_id": "batch001",
        "taxonomy_version": "v5.1",
        "model_version": "gpt-4o-mini",
        "prompt_version": "v1.0",
        "reviews_classified": [sample_classified_review],
        "stats": batch_stats,
    }
    return envelope
@pytest.fixture
def sample_routed_span() -> dict[str, Any]:
    """Build one routed-span record shaped like a Stage 3 routing result.

    Returns:
        A new dict linking a span id to an issue id, with the routing key
        and the new-issue flag.
    """
    routed: dict[str, Any] = {
        "span_id": "SPN-b2c3d4e5f6789012",
        "issue_id": "ISS-7a8b9c0d1e2f3a4b",
        "routing_key": "acme-corp|ChIJN1t_tDeuEmsRUsoyG83frY4|J1.01|",
        "is_new_issue": True,
    }
    return routed
@pytest.fixture
def sample_stage3_output(sample_routed_span: dict) -> dict[str, Any]:
    """Build a Stage 3 result envelope around one routed span.

    Args:
        sample_routed_span: Routed span dict used as the only entry of
            ``routed_spans``.

    Returns:
        A new dict with the routed spans, created/updated issue id lists,
        and the routing counters.
    """
    routing_stats = {
        "spans_processed": 2,
        "spans_routed": 1,
        "spans_skipped": 1,
        "issues_created": 1,
        "issues_updated": 0,
    }
    created_ids = ["ISS-7a8b9c0d1e2f3a4b"]
    updated_ids: list[str] = []

    return {
        "routed_spans": [sample_routed_span],
        "issues_created": created_ids,
        "issues_updated": updated_ids,
        "stats": routing_stats,
    }
@pytest.fixture
def sample_fact() -> dict[str, Any]:
    """Build one aggregated fact record shaped like a Stage 4 fact row.

    Returns:
        A new dict with the fact's identifying dimensions followed by its
        count, strength, intensity, comparative, and trust-weighted metrics.
    """
    # Dimensions identifying which bucket/subject the fact describes.
    dimensions: dict[str, Any] = {
        "business_id": "acme-corp",
        "place_id": "ChIJN1t_tDeuEmsRUsoyG83frY4",
        "period_date": "2026-01-20",
        "bucket_type": "day",
        "subject_type": "urt_code",
        "subject_id": "J1.01",
        "taxonomy_version": "v5.1",
    }
    volume_counts: dict[str, Any] = {
        "review_count": 1,
        "span_count": 1,
        "negative_count": 1,
        "positive_count": 0,
        "neutral_count": 0,
        "mixed_count": 0,
    }
    strengths: dict[str, Any] = {
        "strength_score": 4.0,
        "negative_strength": 4.0,
        "positive_strength": 0.0,
        "avg_rating": 2.0,
    }
    intensity_counts: dict[str, Any] = {
        "i1_count": 0,
        "i2_count": 0,
        "i3_count": 1,
    }
    comparative_counts: dict[str, Any] = {
        "cr_better": 0,
        "cr_worse": 0,
        "cr_same": 0,
    }
    trust_weighted: dict[str, Any] = {
        "trust_weighted_strength": 3.4,
        "trust_weighted_negative": 3.4,
    }
    # Merging in this order keeps the same key order as a single literal.
    return {
        **dimensions,
        **volume_counts,
        **strengths,
        **intensity_counts,
        **comparative_counts,
        **trust_weighted,
    }
@pytest.fixture
def sample_stage4_output(sample_fact: dict) -> dict[str, Any]:
    """Build a Stage 4 result envelope around one fact record.

    Args:
        sample_fact: Fact dict used as the only entry of ``facts_written``.

    Returns:
        A new dict with the written facts and the aggregation statistics.
    """
    aggregation_stats: dict[str, Any] = {
        "business_id": "acme-corp",
        "date": "2026-01-20",
        "locations_processed": 1,
        "codes_aggregated": 1,
        "facts_upserted": 1,
    }
    envelope: dict[str, Any] = {
        "facts_written": [sample_fact],
        "stats": aggregation_stats,
    }
    return envelope
@pytest.fixture
def fixtures_dir() -> Path:
    """Return the ``fixtures`` directory located next to this module."""
    module_dir = Path(__file__).parent
    return module_dir.joinpath("fixtures")
# Helper to load JSON fixtures
def load_fixture(name: str) -> dict[str, Any]:
    """Load and parse the JSON fixture ``<name>.json``.

    The file is looked up in the ``fixtures`` directory that sits next to
    this module.

    Args:
        name: Fixture file name without the ``.json`` suffix.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If the fixture file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    fixture_path = Path(__file__).parent / "fixtures" / f"{name}.json"
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # locale default, so non-ASCII fixture content decodes the same
        # everywhere.
        raw_json = fixture_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError) as err:
        # Attempt the read and translate the failure rather than checking
        # exists() first, which leaves a window between check and read.
        raise FileNotFoundError(f"Fixture not found: {name}") from err
    return json.loads(raw_json)