Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/packages/reviewiq-pipeline/validate_router.py
+++ b/packages/reviewiq-pipeline/validate_router.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python
+"""
+Validate router decisions against real reviews with minimal LLM cost.
+
+This script:
+1. Loads real reviews from database
+2. Routes them through the router
+3. Cherry-picks samples from each tier for validation
+4. Optionally runs LLM on small samples to validate decisions
+
+Usage:
+    # Dry run - just show routing decisions, no LLM calls
+    python validate_router.py <job_id> --dry-run
+
+    # Validate with LLM (costs ~$0.05-0.10)
+    python validate_router.py <job_id> --validate
+
+    # Custom sample sizes
+    python validate_router.py <job_id> --validate --skip-samples=3 --cheap-samples=5 --full-samples=3
+"""
+
+import asyncio
+import argparse
+import json
+import logging
+import os
+import sys
+from dataclasses import dataclass
+from typing import Any
+
+# Configure the root logger once for the whole script: timestamped,
+# level-tagged lines at INFO and above.
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    level=logging.INFO,
+)
+# Named logger used by every function in this script.
+logger = logging.getLogger("validate_router")
+
+
+@dataclass
+class ValidationResult:
+    """Result of validating a single review.
+
+    Pairs the router's decision for one sampled review with the outcome of
+    the optional LLM classification and the verdict derived from it. The
+    LLM and verdict fields keep their ``None``/empty defaults until
+    ``validate_with_llm`` fills them in.
+    """
+    # Identity and content of the review, copied from the sampled review dict.
+    review_id: str
+    text: str
+    rating: int
+    # Tier label the review was routed to: "skip", "cheap" or "full".
+    routed_tier: str
+    # The router's ``reason`` for the decision, or "N/A" when the review
+    # carries no routing record.
+    routing_reason: str
+    # The router's ``signals`` mapping; empty when there is no routing record.
+    routing_signals: dict
+    # LLM results (if validated)
+    # ``urt_primary`` and ``valence`` of the primary span; "N/A" when the
+    # span does not provide them.
+    llm_urt: str | None = None
+    llm_valence: str | None = None
+    # Number of spans the LLM returned for this review.
+    llm_span_count: int | None = None
+    # This review's even share of the batch cost (batch cost / sample count).
+    llm_cost: float | None = None
+    # Validation verdict
+    # True/False once a verdict exists; None means no verdict was produced.
+    routing_correct: bool | None = None
+    # Human-readable explanation of the verdict.
+    notes: str = ""
+
+
+async def load_reviews_from_db(job_id: str, database_url: str) -> list[dict]:
+    """Fetch a job's non-empty reviews and shape them as router input dicts.
+
+    Args:
+        job_id: Job identifier; the query casts it to ``uuid``.
+        database_url: Connection string passed to ``asyncpg.connect``.
+
+    Returns:
+        One dict per matching row of ``pipeline.reviews_enriched``, ordered
+        by ``id``. Besides the selected columns each dict carries a
+        lower-cased, stripped ``text_normalized`` plus fixed ``source``,
+        ``review_version`` and ``review_time`` values that are constants of
+        this script, not database fields.
+    """
+    import asyncpg
+
+    conn = await asyncpg.connect(database_url)
+    try:
+        # Rows whose text is NULL or empty are filtered out in SQL.
+        query = """
+            SELECT
+                re.review_id,
+                re.text,
+                re.rating,
+                re.business_id,
+                re.place_id
+            FROM pipeline.reviews_enriched re
+            WHERE re.job_id = $1::uuid
+            AND re.text IS NOT NULL
+            AND re.text != ''
+            ORDER BY re.id
+        """
+        records = await conn.fetch(query, job_id)
+
+        reviews = [
+            {
+                "review_id": record["review_id"],
+                "text": record["text"] or "",
+                "text_normalized": (record["text"] or "").lower().strip(),
+                "rating": record["rating"],
+                "business_id": record["business_id"],
+                "place_id": record["place_id"],
+                # Fixed placeholder values; not read from the database.
+                "source": "google",
+                "review_version": 1,
+                "review_time": "2024-01-01T00:00:00Z",
+            }
+            for record in records
+        ]
+
+        logger.info(f"Loaded {len(reviews)} reviews from job {job_id}")
+        return reviews
+
+    finally:
+        # Release the connection whether the query succeeded or raised.
+        await conn.close()
+
+
+def route_reviews(reviews: list[dict]) -> dict[str, list[dict]]:
+    """Run all reviews through a conservative router and group them by tier.
+
+    Args:
+        reviews: Review dicts as produced by ``load_reviews_from_db``.
+
+    Returns:
+        A dict with exactly the keys ``"skip"``, ``"cheap"`` and ``"full"``,
+        each holding the reviews the router placed in that tier.
+    """
+    from reviewiq_pipeline.services.review_router import (
+        ReviewRouter,
+        RoutingTier,
+        create_router,
+    )
+
+    # NOTE(review): the print/sampling helpers read a "_routing" entry from
+    # each review; presumably route_batch attaches it - confirm.
+    by_tier = create_router(conservative=True).route_batch(reviews)
+
+    # Short labels used throughout this script, mapped to the router's enum.
+    labels = (
+        ("skip", RoutingTier.SKIP),
+        ("cheap", RoutingTier.CHEAP_MODEL),
+        ("full", RoutingTier.FULL_MODEL),
+    )
+    return {label: by_tier[tier] for label, tier in labels}
+
+
+def select_diverse_samples(
+    reviews: list[dict],
+    tier: str,
+    n_samples: int,
+) -> list[dict]:
+    """
+    Select up to ``n_samples`` reviews, preferring distinct routing outcomes.
+
+    Strategy:
+    - First pass: walk the reviews in order and take the first review for
+      each unseen ``(routing reason, rating)`` combination.
+    - Second pass: if slots remain, fill them with the earliest reviews not
+      yet chosen (including reviews without a ``_routing`` record).
+
+    Args:
+        reviews: Reviews of one tier; routed ones carry a ``_routing``
+            object exposing ``reason``.
+        tier: Tier label. Accepted for interface compatibility; the
+            selection strategy is currently the same for every tier.
+        n_samples: Maximum number of reviews to return.
+
+    Returns:
+        The chosen reviews in their original input order; an empty list
+        when ``reviews`` is empty or ``n_samples`` is not positive.
+    """
+    if not reviews or n_samples <= 0:
+        return []
+
+    chosen: list[int] = []
+    seen_keys = set()
+
+    # First pass: one representative per distinct (reason, rating) pair.
+    for idx, review in enumerate(reviews):
+        if len(chosen) >= n_samples:
+            break
+        routing = review.get("_routing")
+        if not routing:
+            continue
+        key = (routing.reason, review["rating"])
+        if key in seen_keys:
+            continue
+        seen_keys.add(key)
+        chosen.append(idx)
+
+    # Second pass: top up with not-yet-chosen reviews. Tracking positions
+    # instead of comparing dicts keeps this linear and identity-based.
+    if len(chosen) < n_samples:
+        taken = set(chosen)
+        for idx in range(len(reviews)):
+            if len(chosen) >= n_samples:
+                break
+            if idx not in taken:
+                chosen.append(idx)
+
+    return [reviews[idx] for idx in sorted(chosen)]
+
+
+def print_routing_summary(routed: dict[str, list[dict]]):
+    """Print, per tier, the review count, its share of all reviews, and a
+    breakdown by routing reason (most frequent first).
+
+    Args:
+        routed: Mapping of tier label to the reviews routed to that tier.
+            Reviews without a ``_routing`` entry count toward the tier
+            total but not toward any reason.
+    """
+    banner = "=" * 70
+    grand_total = sum(map(len, routed.values()))
+
+    print("\n" + banner)
+    print("ROUTING SUMMARY")
+    print(banner)
+
+    for tier_name, tier_reviews in routed.items():
+        count = len(tier_reviews)
+        # Guard the percentage against an entirely empty routing result.
+        share = count / grand_total * 100 if grand_total > 0 else 0
+        print(f"\n{tier_name.upper()} TIER: {count} reviews ({share:.1f}%)")
+
+        # Tally how many reviews each routing reason accounts for.
+        tally = {}
+        for review in tier_reviews:
+            routing = review.get("_routing")
+            if not routing:
+                continue
+            why = routing.reason
+            tally[why] = tally.get(why, 0) + 1
+
+        # Stable descending sort: ties keep first-seen order.
+        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
+        for why, hits in ranked:
+            print(f"  - {why}: {hits}")
+
+
+def print_samples(samples: list[dict], tier: str):
+    """Print each sampled review with its rating, a text preview and the
+    router's reason and signals, for manual inspection.
+
+    Args:
+        samples: Review dicts; a ``_routing`` entry is optional and shown
+            as ``N/A`` / ``?`` when missing.
+        tier: Tier label used in the section heading.
+    """
+    banner = "=" * 70
+    print(f"\n{banner}")
+    print(f"{tier.upper()} TIER SAMPLES ({len(samples)} reviews)")
+    print(banner)
+
+    # (label shown, key looked up in the router's signal mapping)
+    signal_fields = (
+        ("words", "word_count"),
+        ("chars", "char_count"),
+        ("numbers", "has_numbers"),
+        ("sentences", "sentence_count"),
+    )
+
+    for position, review in enumerate(samples, start=1):
+        routing = review.get("_routing")
+        signals = routing.signals if routing else {}
+
+        print(f"\n[{position}] Review ID: {review['review_id']}")
+        print(f"    Rating: {'⭐' * review['rating']}")
+
+        # Preview is capped at 100 characters; the ellipsis marks truncation.
+        body = review["text"]
+        suffix = "..." if len(body) > 100 else ""
+        print(f"    Text: \"{body[:100]}{suffix}\"")
+
+        reason = routing.reason if routing else "N/A"
+        print(f"    Routing: {reason}")
+
+        signal_line = ", ".join(
+            f"{label}={signals.get(key, '?')}" for label, key in signal_fields
+        )
+        print(f"    Signals: {signal_line}")
+
+
+def _responses_from_partial(partial_results: list[dict], n_samples: int) -> list[dict]:
+    """Rebuild exactly one response per sample, in sample order, from the
+    recovered part of a failed batch.
+
+    Entries whose ``review_index`` is out of range are ignored and only the
+    first entry per index is kept, so the result always lines up with the
+    samples. Samples without a recovered entry get an empty placeholder
+    tagged with ``_error``.
+    """
+    recovered: dict[int, dict] = {}
+    for partial in partial_results:
+        idx = partial.get("review_index", -1)
+        if 0 <= idx < n_samples and idx not in recovered:
+            recovered[idx] = {
+                "spans": partial.get("spans", []),
+                "review_summary": partial.get("review_summary", {}),
+                "_index": idx,
+            }
+
+    return [
+        recovered.get(
+            i,
+            {
+                "spans": [],
+                "review_summary": {},
+                "_index": i,
+                "_error": "partial_recovery_failed",
+            },
+        )
+        for i in range(n_samples)
+    ]
+
+
+def _judge_routing(
+    tier: str,
+    spans: list[dict],
+    primary_span: dict,
+    urt: str,
+) -> tuple[bool | None, str]:
+    """Decide whether the LLM output agrees with the tier a review got.
+
+    Returns ``(verdict, notes)``; the verdict is ``None`` for a tier label
+    other than "skip", "cheap" or "full".
+    """
+    if tier == "skip":
+        # SKIP is correct if LLM gives generic code (V4.03) or single low-info span
+        is_generic = urt in ("V4.03", "V4.01", "V4.02", "O1.01")
+        is_simple = len(spans) == 1 and primary_span.get("intensity") == "I1"
+        if is_generic or is_simple:
+            return True, "Correctly skipped (generic/simple)"
+        return False, f"LLM found specific content: {urt}"
+
+    if tier == "cheap":
+        # CHEAP is correct if classification is straightforward
+        # (single domain, no complex causal chains)
+        if len(spans) <= 2:
+            return True, "Simple enough for cheap model"
+        return False, f"Complex: {len(spans)} spans found"
+
+    if tier == "full":
+        # FULL is correct if there's meaningful content
+        if len(spans) >= 1 and urt not in ("V4.03", "O1.01"):
+            return True, f"Correctly sent to full: {len(spans)} spans, {urt}"
+        return False, "Could have been cheaper"
+
+    return None, ""
+
+
+async def validate_with_llm(
+    samples: list[dict],
+    tier: str,
+    config: Any,
+) -> list[ValidationResult]:
+    """
+    Run LLM classification on samples to validate routing decisions.
+
+    All samples are sent as one batch. If the client raises
+    ``PartialBatchResult``, the recovered entries are used and the missing
+    ones become placeholders; placeholders are reported with
+    ``routing_correct=None`` instead of being scored as routing verdicts.
+
+    Args:
+        samples: Review dicts with ``review_id``, ``text``, ``rating`` and
+            optionally a ``_routing`` record.
+        tier: Tier label the samples were routed to ("skip", "cheap", "full").
+        config: Configuration object passed to ``LLMClient.create``.
+
+    Returns:
+        One ``ValidationResult`` per sample that has a matching LLM
+        response; an empty list when ``samples`` is empty. The batch cost
+        is split evenly across the samples.
+    """
+    from reviewiq_pipeline.services.llm_client import LLMClient, BatchReviewInput, PartialBatchResult
+
+    results = []
+
+    if not samples:
+        return results
+
+    # Create LLM client
+    client = LLMClient.create(config)
+
+    try:
+        # Prepare batch input
+        batch_input = [
+            BatchReviewInput(
+                review_id=r["review_id"],
+                text=r["text"],
+                rating=r["rating"],
+            )
+            for r in samples
+        ]
+
+        # Run classification
+        logger.info(f"Running LLM on {len(samples)} {tier} tier samples...")
+
+        llm_responses = []
+        metadata = {}
+
+        try:
+            llm_responses, metadata = await client.classify_batch(batch_input, "standard")
+        except PartialBatchResult as e:
+            # Handle partial results
+            logger.warning(f"Partial result for {tier} tier: {len(e.partial_results)} recovered")
+            metadata = e.metadata or {}
+            llm_responses = _responses_from_partial(e.partial_results, len(samples))
+
+        # zip() below stops at the shorter sequence; make a mismatch visible
+        # instead of silently dropping samples.
+        if len(llm_responses) != len(samples):
+            logger.warning(
+                f"LLM returned {len(llm_responses)} responses for "
+                f"{len(samples)} {tier} tier samples; unmatched samples are not validated"
+            )
+
+        cost = metadata.get("cost_usd", 0)
+        logger.info(f"LLM cost for {tier} tier: ${cost:.4f}")
+
+        # Process results
+        for review, llm_response in zip(samples, llm_responses):
+            routing = review.get("_routing")
+            signals = routing.signals if routing else {}
+
+            spans = llm_response.get("spans", [])
+            # Prefer the span flagged as primary, else the first span.
+            primary_span = next((s for s in spans if s.get("is_primary")), spans[0] if spans else {})
+
+            urt = primary_span.get("urt_primary", "N/A")
+            valence = primary_span.get("valence", "N/A")
+
+            error = llm_response.get("_error")
+            if error:
+                # The LLM never classified this review, so there is nothing
+                # to compare the routing decision against.
+                routing_correct, notes = None, f"LLM classification failed: {error}"
+            else:
+                routing_correct, notes = _judge_routing(tier, spans, primary_span, urt)
+
+            result = ValidationResult(
+                review_id=review["review_id"],
+                text=review["text"],
+                rating=review["rating"],
+                routed_tier=tier,
+                routing_reason=routing.reason if routing else "N/A",
+                routing_signals=signals,
+                llm_urt=urt,
+                llm_valence=valence,
+                llm_span_count=len(spans),
+                llm_cost=cost / len(samples),
+                routing_correct=routing_correct,
+                notes=notes,
+            )
+            results.append(result)
+
+    finally:
+        await client.close()
+
+    return results
+
+
+def print_validation_results(results: list[ValidationResult], tier: str):
+    """Print the accuracy and per-review verdicts for one tier.
+
+    Accuracy is computed over results that have a verdict; results whose
+    ``routing_correct`` is ``None`` are listed with a question mark and
+    counted separately rather than shown as misroutes. Prints nothing when
+    ``results`` is empty.
+
+    Args:
+        results: Validation results for the tier.
+        tier: Tier label used in the section heading.
+    """
+    if not results:
+        return
+
+    print(f"\n{'=' * 70}")
+    print(f"{tier.upper()} TIER VALIDATION RESULTS")
+    print("=" * 70)
+
+    judged = [r for r in results if r.routing_correct is not None]
+    correct = sum(1 for r in judged if r.routing_correct)
+    total = len(judged)
+    accuracy = correct / total * 100 if total > 0 else 0
+
+    print(f"\nAccuracy: {correct}/{total} ({accuracy:.1f}%)")
+    if total < len(results):
+        print(f"Unjudged: {len(results) - total}")
+
+    for r in results:
+        if r.routing_correct is None:
+            status = "❓"
+        else:
+            status = "✅" if r.routing_correct else "❌"
+        # Only mark the preview as truncated when text was actually cut.
+        suffix = "..." if len(r.text) > 60 else ""
+        print(f"\n{status} [{r.review_id}] \"{r.text[:60]}{suffix}\"")
+        print(f"   Rating: {r.rating}, Routed: {r.routed_tier} ({r.routing_reason})")
+        print(f"   LLM: URT={r.llm_urt}, Valence={r.llm_valence}, Spans={r.llm_span_count}")
+        print(f"   Notes: {r.notes}")
+
+
+async def main():
+    """Command-line entry point: load, route, sample and optionally validate.
+
+    Steps: parse arguments, load the job's reviews, route them, print the
+    routing summary and the chosen samples with a cost estimate, and, only
+    with ``--validate`` (and without ``--dry-run``), classify the samples
+    with the LLM and print per-tier accuracy and recommendations.
+
+    Environment:
+        DATABASE_URL: Database connection string (a local development
+            default is used when unset).
+        ANTHROPIC_API_KEY: Required for ``--validate``. No key is embedded
+            in the source.
+    """
+    parser = argparse.ArgumentParser(description="Validate router decisions")
+    parser.add_argument("job_id", help="Job ID to analyze")
+    parser.add_argument("--dry-run", action="store_true", help="Show routing only, no LLM")
+    parser.add_argument("--validate", action="store_true", help="Run LLM validation")
+    parser.add_argument("--skip-samples", type=int, default=3, help="SKIP tier samples")
+    parser.add_argument("--cheap-samples", type=int, default=5, help="CHEAP tier samples")
+    parser.add_argument("--full-samples", type=int, default=3, help="FULL tier samples")
+
+    args = parser.parse_args()
+
+    # Secrets come from the environment only. Fail fast, before any database
+    # work, when validation was requested without a key.
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if args.validate and not args.dry_run and not api_key:
+        parser.error("--validate requires the ANTHROPIC_API_KEY environment variable")
+
+    # Database URL
+    # NOTE(review): the fallback embeds local development credentials; set
+    # DATABASE_URL for any non-local database.
+    database_url = os.environ.get(
+        "DATABASE_URL",
+        "postgresql://scraper:scraper123@localhost:5437/scraper"
+    )
+
+    # Load reviews
+    reviews = await load_reviews_from_db(args.job_id, database_url)
+    if not reviews:
+        print("No reviews found for job")
+        return
+
+    # Route reviews
+    routed = route_reviews(reviews)
+
+    # Print summary
+    print_routing_summary(routed)
+
+    # Select samples
+    skip_samples = select_diverse_samples(routed["skip"], "skip", args.skip_samples)
+    cheap_samples = select_diverse_samples(routed["cheap"], "cheap", args.cheap_samples)
+    full_samples = select_diverse_samples(routed["full"], "full", args.full_samples)
+
+    # Print samples
+    print_samples(skip_samples, "skip")
+    print_samples(cheap_samples, "cheap")
+    print_samples(full_samples, "full")
+
+    # Estimate cost
+    total_samples = len(skip_samples) + len(cheap_samples) + len(full_samples)
+    estimated_cost = total_samples * 0.003  # ~$0.003 per review with Sonnet
+    print(f"\n{'=' * 70}")
+    print(f"VALIDATION COST ESTIMATE: ~${estimated_cost:.3f} for {total_samples} samples")
+    print("=" * 70)
+
+    # --dry-run takes precedence over --validate.
+    if args.dry_run:
+        print("\n[DRY RUN] No LLM calls made. Use --validate to run validation.")
+        return
+
+    if not args.validate:
+        print("\nUse --validate to run LLM validation on these samples.")
+        return
+
+    # Run validation
+    from reviewiq_pipeline.config import Config
+
+    config = Config(
+        database_url=database_url,
+        llm_provider="anthropic",
+        llm_model="claude-sonnet-4-5-20250929",
+        anthropic_api_key=api_key,
+    )
+
+    all_results = []
+    total_cost = 0
+
+    # Validate each tier
+    for tier, samples in [("skip", skip_samples), ("cheap", cheap_samples), ("full", full_samples)]:
+        if samples:
+            results = await validate_with_llm(samples, tier, config)
+            all_results.extend(results)
+            total_cost += sum(r.llm_cost or 0 for r in results)
+            print_validation_results(results, tier)
+
+    # Nothing to summarise (e.g. every sample size was 0); also avoids
+    # dividing by zero in the overall accuracy below.
+    if not all_results:
+        print("\nNo samples were validated; nothing to summarize.")
+        return
+
+    # Print summary
+    print(f"\n{'=' * 70}")
+    print("VALIDATION SUMMARY")
+    print("=" * 70)
+
+    for tier in ["skip", "cheap", "full"]:
+        tier_results = [r for r in all_results if r.routed_tier == tier]
+        if tier_results:
+            correct = sum(1 for r in tier_results if r.routing_correct)
+            total = len(tier_results)
+            print(f"{tier.upper()}: {correct}/{total} correct ({correct/total*100:.0f}%)")
+
+    overall_correct = sum(1 for r in all_results if r.routing_correct)
+    overall_total = len(all_results)
+    print(f"\nOVERALL: {overall_correct}/{overall_total} correct ({overall_correct/overall_total*100:.0f}%)")
+    print(f"TOTAL COST: ${total_cost:.4f}")
+
+    # Recommendations
+    print(f"\n{'=' * 70}")
+    print("RECOMMENDATIONS")
+    print("=" * 70)
+
+    skip_errors = [r for r in all_results if r.routed_tier == "skip" and not r.routing_correct]
+    if skip_errors:
+        print("\n⚠️  SKIP tier false negatives found:")
+        for r in skip_errors:
+            print(f"   - \"{r.text[:50]}...\" → {r.llm_urt}")
+        print("   Consider tightening SKIP criteria")
+    else:
+        print("\n✅ SKIP tier looks safe")
+
+    cheap_errors = [r for r in all_results if r.routed_tier == "cheap" and not r.routing_correct]
+    if cheap_errors:
+        print("\n⚠️  CHEAP tier may miss complexity:")
+        for r in cheap_errors:
+            print(f"   - \"{r.text[:50]}...\" → {r.llm_span_count} spans")
+    else:
+        print("\n✅ CHEAP tier thresholds look good")
+
+
+# Run the async CLI entry point only when this file is executed as a
+# script, not when it is imported.
+if __name__ == "__main__":
+    asyncio.run(main())