Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
+++ b/packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
@@ -0,0 +1,421 @@
+#!/usr/bin/env python3
+"""
+Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)
+
+Validates L1 primitive configs against SECTOR-SPECIFIC review data.
+Only validates sectors where we have real business data.
+
+Key improvement over v1: spans are filtered by business → sector mapping,
+ensuring "TASTE in HEALTHCARE" noise doesn't pollute results.
+
+Usage:
+    python validate_l1_configs_v2.py --sector ENTERTAINMENT
+    python validate_l1_configs_v2.py --sector AUTOMOTIVE
+    python validate_l1_configs_v2.py --all
+    python validate_l1_configs_v2.py --report  # JSON summary only (no per-sector reports)
+"""
+
+import argparse
+import asyncio
+import json
+import os
+from collections import Counter
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import asyncpg
+
+# Paths (anchored two directory levels above this file, independent of the CWD)
+DATA_DIR = Path(__file__).parent.parent / "data"
+CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
+BRIEFS_DIR = DATA_DIR / "sector_briefs"
+
+# Business → Sector mapping (ground truth)
+# The keys are the values passed to the span query as business identifiers.
+BUSINESS_TO_SECTOR = {
+    "Go Karts Mar Menor": "ENTERTAINMENT",
+    "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE",
+    "Soho Club": "ENTERTAINMENT",
+    "Fika": "FOOD_DINING",
+}
+
+# Sectors with real data.
+# Derived from the mapping so the two can never drift apart: a sector has data
+# exactly when at least one business is mapped to it.
+SECTORS_WITH_DATA = set(BUSINESS_TO_SECTOR.values())
+
+# URT code to primitive mapping.
+# Keys are the codes read from each span's ``urt_primary`` value; the first
+# character is the domain letter (O, P, J, E, A, V, R). Values are the name of
+# the L1 primitive the code rolls up to, or None when it has no primitive.
+# Codes that are absent from this table are handled like None by the analysis.
+URT_TO_PRIMITIVE = {
+    # Offering codes (O*)
+    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
+    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
+    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
+    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
+    # People codes (P*)
+    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
+    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
+    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
+    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
+    # Journey codes (J*)
+    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
+    "J1.04": "SPEED", "J1.05": "RELIABILITY",
+    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
+    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
+    # Environment codes (E*)
+    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
+    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
+    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
+    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
+    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
+    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
+    # Access codes (A*)
+    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
+    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
+    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
+    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
+    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
+    # Value codes (V*)
+    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
+    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
+    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
+    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
+    # Relationship codes (R*): treated as meta and deliberately mapped to None,
+    # so spans with these codes are tallied as "unmapped", not under a primitive.
+    "R1.01": None, "R1.02": None, "R1.03": None,
+    "R2.01": None, "R2.02": None, "R2.03": None,
+    "R3.01": None, "R3.02": None, "R3.03": None,
+    "R4.01": None, "R4.02": None, "R4.03": None,
+}
+
+# Minimum threshold for "enable" recommendations (% of sector spans).
+# A disabled primitive is only recommended for enabling when its share of the
+# sector's spans reaches this value; below it the hits are reported as info only.
+ENABLE_THRESHOLD_PCT = 3.0  # Only recommend enable if >= 3% of sector spans
+
+
+@dataclass
+class SectorValidation:
+    """Validation result for a single sector.
+
+    Produced by ``analyze_sector_spans`` and rendered by
+    ``print_sector_report``. The first four fields are required; the rest
+    default to empty values.
+    """
+    sector_code: str  # copied from the config's "sector_code" entry
+    businesses: list[str]  # businesses whose spans were analysed
+    span_count: int  # total number of spans analysed
+
+    # Coverage
+    enabled_coverage: float  # fraction (0-1) of spans whose primitive is enabled
+    disabled_hits: dict[str, int] = field(default_factory=dict)  # disabled primitive -> span count
+    unmapped_count: int = 0  # spans whose URT code has no primitive (None or not in the table)
+
+    # Distribution
+    primitive_counts: dict[str, int] = field(default_factory=dict)  # primitive -> span count
+    domain_distribution: dict[str, int] = field(default_factory=dict)  # first letter of URT code -> span count
+    valence_distribution: dict[str, int] = field(default_factory=dict)  # valence label -> span count
+    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)  # most frequent (urt_code, count) pairs
+
+    # Recommendations (threshold-gated)
+    recommended_enables: list[tuple[str, float]] = field(default_factory=list)  # (primitive, pct)
+    # NOTE(review): nothing in this file fills recommended_disables; it stays empty.
+    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
+    weight_issues: list[str] = field(default_factory=list)  # human-readable findings about weights
+
+    # Metadata
+    validated_at: str = ""  # ISO-format timestamp set when the analysis ran
+    config_version: str = ""  # the config's "config_version" ("1.0" when the config has none)
+
+
+def load_l1_config(sector_code: str) -> dict[str, Any] | None:
+    """Load the L1 config for a sector.
+
+    Args:
+        sector_code: Sector identifier (e.g. "ENTERTAINMENT"). It is
+            lower-cased to build the file name ``<sector>_config.json``
+            inside ``CONFIGS_DIR``.
+
+    Returns:
+        The parsed JSON document, or ``None`` when the config file does not
+        exist.
+
+    Raises:
+        json.JSONDecodeError: If the file exists but does not contain valid JSON.
+    """
+    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
+    try:
+        # Read as UTF-8 explicitly: without it the platform's default encoding
+        # is used, which can mangle non-ASCII text in the config.
+        with open(config_file, encoding="utf-8") as f:
+            return json.load(f)
+    except FileNotFoundError:
+        # A missing config is an expected condition and is reported as None.
+        return None
+
+
+def get_businesses_for_sector(sector_code: str) -> list[str]:
+    """Return every business that ``BUSINESS_TO_SECTOR`` assigns to ``sector_code``.
+
+    The result follows the mapping's own order and is empty when no business
+    belongs to the sector.
+    """
+    matching: list[str] = []
+    for business_name in BUSINESS_TO_SECTOR:
+        if BUSINESS_TO_SECTOR[business_name] == sector_code:
+            matching.append(business_name)
+    return matching
+
+
+async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
+    """Load the review spans that belong to the given businesses.
+
+    Args:
+        pool: Connection pool used to run the query.
+        businesses: Values matched against ``business_id``.
+
+    Returns:
+        One plain dict per row (``business_id``, ``urt_primary``, ``valence``,
+        ``intensity``, ``span_text``), ordered by ``created_at`` descending.
+        An empty list is returned, without querying, when ``businesses`` is empty.
+    """
+    if not businesses:
+        # Nothing to look up, so skip the database round trip.
+        return []
+
+    sql = """
+        SELECT
+            business_id,
+            urt_primary,
+            valence,
+            intensity,
+            span_text
+        FROM pipeline.review_spans
+        WHERE business_id = ANY($1)
+        ORDER BY created_at DESC
+    """
+    records = await pool.fetch(sql, businesses)
+
+    # Convert each record into an ordinary dict for the analysis step.
+    span_dicts = []
+    for record in records:
+        span_dicts.append(dict(record))
+    return span_dicts
+
+
+def analyze_sector_spans(
+    spans: list[dict],
+    config: dict[str, Any],
+    businesses: list[str],
+) -> SectorValidation:
+    """Analyze spans for a specific sector against its L1 config.
+
+    Args:
+        spans: Span rows. ``urt_primary`` and ``valence`` are read from each;
+            a missing/empty ``urt_primary`` is counted as unmapped and a
+            missing/empty ``valence`` is counted as ``"V0"``.
+        config: L1 config. ``sector_code`` is required; ``enabled``,
+            ``disabled``, ``weights`` and ``config_version`` are optional.
+        businesses: Businesses the spans came from (stored on the result as-is).
+
+    Returns:
+        A populated ``SectorValidation``. ``recommended_disables`` keeps its
+        empty default. ``validated_at`` is a timezone-aware UTC timestamp in
+        ISO format.
+
+    Raises:
+        KeyError: If ``config`` has no ``sector_code`` entry.
+    """
+    # Imported here because the module only imports ``datetime`` itself.
+    from datetime import timezone
+
+    sector_code = config["sector_code"]
+    enabled = set(config.get("enabled", []))
+    disabled = set(config.get("disabled", []))
+    weights = config.get("weights", {})
+    config_version = config.get("config_version", "1.0")
+
+    # Counters
+    primitive_counts: Counter = Counter()
+    domain_counts: Counter = Counter()
+    valence_counts: Counter = Counter()
+    urt_counts: Counter = Counter()
+    disabled_hits: Counter = Counter()
+    unmapped = 0
+    enabled_hits = 0
+
+    total = len(spans)
+
+    def pct_of_total(count: int) -> float:
+        """Return ``count`` as a percentage of all spans (0 if there are none)."""
+        return count / total * 100 if total > 0 else 0
+
+    for span in spans:
+        # A row that carries the key with a None value (e.g. a NULL column)
+        # would bypass a ``.get`` default, so fall back on any empty value.
+        valence = span.get("valence") or "V0"
+        valence_counts[valence] += 1
+
+        urt_code = span.get("urt_primary")
+        if not urt_code:
+            # No code to classify: tally as unmapped instead of failing on
+            # the domain-letter lookup below.
+            unmapped += 1
+            continue
+
+        urt_counts[urt_code] += 1
+        domain_counts[urt_code[0]] += 1  # first character is the domain letter
+
+        primitive = URT_TO_PRIMITIVE.get(urt_code)
+        if primitive:
+            primitive_counts[primitive] += 1
+            if primitive in enabled:
+                enabled_hits += 1
+            elif primitive in disabled:
+                disabled_hits[primitive] += 1
+        else:
+            # Either explicitly mapped to None or absent from the table.
+            unmapped += 1
+
+    enabled_coverage = enabled_hits / total if total > 0 else 0
+
+    # Threshold-gated recommendations: disabled primitives that still make up
+    # a meaningful share of the sector's spans, most frequent first.
+    recommended_enables = []
+    for prim, count in disabled_hits.most_common():
+        pct = pct_of_total(count)
+        if pct >= ENABLE_THRESHOLD_PCT:
+            recommended_enables.append((prim, pct))
+
+    # Weight issues: enabled primitives that are weighted but never appear.
+    weight_issues = []
+    for prim in weights:
+        if primitive_counts[prim] == 0 and prim in enabled:
+            weight_issues.append(f"{prim} weighted ({weights[prim]}x) but 0 appearances")
+
+    # High-frequency unweighted: only the five most common primitives are checked.
+    for prim, count in primitive_counts.most_common(5):
+        pct = pct_of_total(count)
+        if prim in enabled and prim not in weights and pct >= 10:
+            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")
+
+    return SectorValidation(
+        sector_code=sector_code,
+        businesses=businesses,
+        span_count=total,
+        enabled_coverage=enabled_coverage,
+        disabled_hits=dict(disabled_hits),
+        unmapped_count=unmapped,
+        primitive_counts=dict(primitive_counts),
+        domain_distribution=dict(domain_counts),
+        valence_distribution=dict(valence_counts),
+        top_urt_codes=urt_counts.most_common(15),
+        recommended_enables=recommended_enables,
+        weight_issues=weight_issues,
+        # datetime.utcnow() is deprecated and returns a naive value; use an
+        # explicit UTC-aware timestamp instead.
+        validated_at=datetime.now(timezone.utc).isoformat(),
+        config_version=config_version,
+    )
+
+
+def print_sector_report(result: SectorValidation, config: dict):
+    """Print detailed validation report for a sector to stdout.
+
+    Args:
+        result: Analysis outcome to render.
+        config: The L1 config the analysis ran against; its ``enabled``,
+            ``disabled`` and ``weights`` entries annotate the primitive list.
+
+    Every percentage is relative to ``result.span_count`` and is reported as
+    0 when that count is 0, so no section can divide by zero.
+    """
+
+    def pct_of_spans(count: int) -> float:
+        """Return ``count`` as a percentage of all spans (0 if there are none)."""
+        return count / result.span_count * 100 if result.span_count > 0 else 0
+
+    print("\n" + "=" * 70)
+    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
+    print("=" * 70)
+
+    print("\n📊 DATA SOURCE")
+    print(f"   Businesses: {', '.join(result.businesses)}")
+    print(f"   Total spans: {result.span_count:,}")
+    print(f"   Config version: {result.config_version}")
+
+    print("\n📈 COVERAGE")
+    print(f"   Enabled coverage: {result.enabled_coverage:.1%}")
+    # NOTE(review): the label says "R-domain", but the count covers every span
+    # without a primitive, including codes missing from the mapping table.
+    print(f"   Unmapped (R-domain): {result.unmapped_count} ({pct_of_spans(result.unmapped_count):.1f}%)")
+
+    # Domain distribution
+    print("\n📁 DOMAIN DISTRIBUTION")
+    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
+                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
+    for domain in "OPJEVRA":
+        count = result.domain_distribution.get(domain, 0)
+        pct = pct_of_spans(count)
+        bar = "█" * int(pct / 2)  # one bar character per two percentage points
+        print(f"   {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")
+
+    # Top primitives (at most 12, most frequent first)
+    print("\n🔝 TOP PRIMITIVES (sector-scoped)")
+    enabled_set = set(config.get("enabled", []))
+    disabled_set = set(config.get("disabled", []))
+    weights = config.get("weights", {})
+
+    for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]:
+        pct = pct_of_spans(count)
+        if prim in enabled_set:
+            status = "✓"
+        elif prim in disabled_set:
+            status = "✗"
+        else:
+            # Neither enabled nor disabled in the config.
+            status = "?"
+        weight = f"({weights[prim]}x)" if prim in weights else ""
+        print(f"   {status} {prim:20} {count:4} ({pct:5.1f}%) {weight}")
+
+    # Threshold-gated recommendations
+    if result.recommended_enables:
+        print(f"\n⚠️  RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
+        for prim, pct in result.recommended_enables:
+            count = result.disabled_hits.get(prim, 0)
+            print(f"   → ENABLE {prim}: {count} spans ({pct:.1f}%)")
+    else:
+        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")
+
+    # Low-frequency disabled (info only, top five)
+    low_freq_disabled = [(p, c) for p, c in result.disabled_hits.items()
+                         if pct_of_spans(c) < ENABLE_THRESHOLD_PCT]
+    if low_freq_disabled:
+        print("\n📋 DISABLED BUT APPEARING (below threshold - no action)")
+        for prim, count in sorted(low_freq_disabled, key=lambda x: -x[1])[:5]:
+            print(f"   {prim}: {count} ({pct_of_spans(count):.1f}%)")
+
+    # Weight issues
+    if result.weight_issues:
+        print("\n⚖️  WEIGHT ISSUES")
+        for issue in result.weight_issues:
+            print(f"   • {issue}")
+
+    print(f"\n⏱️  Validated at: {result.validated_at}")
+    print("=" * 70)
+
+
+async def validate_sector(
+    sector_code: str,
+    db_url: str | None = None,
+    verbose: bool = True,
+) -> SectorValidation | None:
+    """Run the sector-scoped validation for one sector.
+
+    Args:
+        sector_code: Sector to validate; must be in ``SECTORS_WITH_DATA``.
+        db_url: Database URL. When empty, ``DATABASE_URL`` from the
+            environment is used, then a built-in local default.
+        verbose: When true, print diagnostics and the full sector report.
+
+    Returns:
+        The analysis result, or ``None`` when the sector has no data, no
+        config, no mapped businesses, or no spans in the database.
+    """
+
+    def say(message: str) -> None:
+        # Diagnostics are only shown in verbose mode.
+        if verbose:
+            print(message)
+
+    if sector_code not in SECTORS_WITH_DATA:
+        say(f"⚠️  {sector_code}: No real business data available for validation")
+        return None
+
+    config = load_l1_config(sector_code)
+    if not config:
+        say(f"❌ No L1 config found for {sector_code}")
+        return None
+
+    businesses = get_businesses_for_sector(sector_code)
+    if not businesses:
+        say(f"⚠️  {sector_code}: No businesses mapped")
+        return None
+
+    if not db_url:
+        # NOTE(review): the fallback URL embeds credentials in source; it looks
+        # like a local development default — confirm it is not used elsewhere.
+        db_url = os.environ.get(
+            "DATABASE_URL",
+            "postgresql://scraper:scraper123@localhost:5437/scraper"
+        )
+
+    # The pool is closed on exit from the block, including on errors.
+    async with asyncpg.create_pool(db_url) as pool:
+        spans = await fetch_spans_for_businesses(pool, businesses)
+        if not spans:
+            say(f"⚠️  {sector_code}: No spans found for businesses")
+            return None
+
+        result = analyze_sector_spans(spans, config, businesses)
+
+        if verbose:
+            print_sector_report(result, config)
+
+        return result
+
+
+async def validate_all_sectors(db_url: str | None = None) -> dict[str, SectorValidation]:
+    """Validate all sectors with available data and print a summary table.
+
+    Args:
+        db_url: Database URL forwarded to ``validate_sector``.
+
+    Returns:
+        Mapping of sector code to its result, containing only the sectors
+        whose validation produced one. Sectors are processed in alphabetical
+        order.
+    """
+    # NOTE(review): 20 is the hard-coded overall sector count behind the
+    # "Sectors without data" line — confirm it matches the sector catalogue.
+    total_sector_count = 20
+
+    results = {}
+
+    # Sorted for a stable report order: iterating the set directly yields an
+    # order that can differ from one run to the next.
+    for sector in sorted(SECTORS_WITH_DATA):
+        result = await validate_sector(sector, db_url, verbose=True)
+        if result:
+            results[sector] = result
+
+    # Print summary
+    print("\n" + "=" * 70)
+    print("VALIDATION SUMMARY")
+    print("=" * 70)
+    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
+    print("-" * 50)
+
+    for sector, result in results.items():
+        enables = len(result.recommended_enables)
+        enables_str = f"{enables} recs" if enables > 0 else "✓ OK"
+        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")
+
+    print("-" * 50)
+    print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}")
+    print(f"Sectors without data: {total_sector_count - len(SECTORS_WITH_DATA)}")
+
+    return results
+
+
+async def generate_summary_report(db_url: str | None = None) -> dict:
+    """Generate a JSON-serialisable summary report for all sectors.
+
+    Each sector is validated quietly (no per-sector report is printed).
+
+    Args:
+        db_url: Database URL forwarded to ``validate_sector``.
+
+    Returns:
+        Mapping of sector code to a dict with ``span_count``,
+        ``enabled_coverage`` (rounded to three decimals),
+        ``recommended_enables``, ``weight_issues``, ``config_version`` and
+        ``validated_at``. Sectors without a result are omitted; keys are in
+        alphabetical order.
+    """
+    results = {}
+
+    # Sorted so the emitted JSON has a stable key order; iterating the set
+    # directly yields an order that can differ from one run to the next.
+    for sector in sorted(SECTORS_WITH_DATA):
+        result = await validate_sector(sector, db_url, verbose=False)
+        if result:
+            results[sector] = {
+                "span_count": result.span_count,
+                "enabled_coverage": round(result.enabled_coverage, 3),
+                "recommended_enables": result.recommended_enables,
+                "weight_issues": result.weight_issues,
+                "config_version": result.config_version,
+                "validated_at": result.validated_at,
+            }
+
+    return results
+
+
+def main():
+    """Command-line entry point: parse the flags and run the chosen mode.
+
+    When several mode flags are given, ``--report`` wins over ``--all``, which
+    wins over ``--sector``. With no mode flag the help text is printed,
+    followed by the list of sectors that have data.
+    """
+    cli = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
+    cli.add_argument("--sector", help="Validate specific sector")
+    cli.add_argument("--all", action="store_true", help="Validate all sectors with data")
+    cli.add_argument("--report", action="store_true", help="Generate JSON summary report")
+    cli.add_argument("--db-url", help="Database URL")
+
+    options = cli.parse_args()
+
+    if options.report:
+        summary = asyncio.run(generate_summary_report(options.db_url))
+        print(json.dumps(summary, indent=2))
+        return
+
+    if options.all:
+        asyncio.run(validate_all_sectors(options.db_url))
+        return
+
+    if options.sector:
+        # Sector codes are upper-case, so accept any casing on the command line.
+        asyncio.run(validate_sector(options.sector.upper(), options.db_url))
+        return
+
+    cli.print_help()
+    print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA)))
+
+
+if __name__ == "__main__":
+    main()