# whyrating-engine-legacy/packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py

#!/usr/bin/env python3
"""
Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)

Validates L1 primitive configs against SECTOR-SPECIFIC review data.
Only validates sectors where we have real business data.

Key improvement over v1: spans are filtered by business → sector mapping,
ensuring "TASTE in HEALTHCARE" noise doesn't pollute results.

Usage:
    python validate_l1_configs_v2.py --sector ENTERTAINMENT
    python validate_l1_configs_v2.py --sector AUTOMOTIVE
    python validate_l1_configs_v2.py --all
    python validate_l1_configs_v2.py --report  # Summary only
"""

import argparse
import asyncio
import json
import os
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any

import asyncpg

# Paths — all anchored on this script's own location rather than the working
# directory. DATA_DIR is the "data" directory that sits one level above the
# directory containing this file.
DATA_DIR = Path(__file__).parent.parent / "data"
# Directory holding the per-sector L1 configs; load_l1_config() reads
# "<sector>_config.json" (sector lower-cased) from here.
CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
# Directory for sector briefs. NOTE(review): not referenced anywhere else in
# the visible part of this file — confirm whether it is still needed.
BRIEFS_DIR = DATA_DIR / "sector_briefs"

# Business → Sector mapping (ground truth).
# Keys are the values matched against `business_id` when spans are fetched,
# so they must match the stored identifier exactly (including punctuation).
BUSINESS_TO_SECTOR = {
    "Go Karts Mar Menor": "ENTERTAINMENT",
    "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE",
    "Soho Club": "ENTERTAINMENT",
    "Fika": "FOOD_DINING",
}

# Sectors with real data.
# Derived from the mapping above instead of being maintained by hand, so that
# adding or removing a business can never leave the two out of sync.
SECTORS_WITH_DATA = set(BUSINESS_TO_SECTOR.values())

# URT code to primitive mapping.
# Keys are URT codes whose first character is the domain letter (O, P, J, E,
# A, V, R); values are the primitive name the code rolls up to, or None when
# the code deliberately has no primitive. Several codes may share a primitive.
# The table is read with `.get()`, so a code that is absent from it is treated
# the same way as a code mapped to None (i.e. counted as unmapped).
URT_TO_PRIMITIVE = {
    # Offering codes
    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
    # People codes
    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
    # Journey codes
    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
    "J1.04": "SPEED", "J1.05": "RELIABILITY",
    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
    # Environment codes
    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
    # Access codes
    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
    # Value codes
    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
    # Relationship codes: intentionally mapped to None so they are counted as
    # unmapped instead of being attributed to a primitive.
    "R1.01": None, "R1.02": None, "R1.03": None,
    "R2.01": None, "R2.02": None, "R2.03": None,
    "R3.01": None, "R3.02": None, "R3.03": None,
    "R4.01": None, "R4.02": None, "R4.03": None,
}

# Minimum threshold for "enable" recommendations (% of sector spans).
# Expressed in percent (3.0 means 3%) and compared inclusively (>=) against a
# disabled primitive's share of all spans in the sector.
ENABLE_THRESHOLD_PCT = 3.0  # Only recommend enable if >= 3% of sector spans


@dataclass
class SectorValidation:
    """Validation result for a single sector.

    Instances are produced by ``analyze_sector_spans`` and rendered by
    ``print_sector_report``. All percentages derived from the counts below
    use ``span_count`` as the denominator.
    """
    # Sector the analysis was scoped to (the config's "sector_code").
    sector_code: str
    # Business names whose spans were analyzed.
    businesses: list[str]
    # Total number of spans analyzed.
    span_count: int

    # Coverage
    # Fraction (0.0-1.0, not a percentage) of spans whose primitive is enabled.
    enabled_coverage: float
    # Span count per primitive that the config lists as disabled.
    disabled_hits: dict[str, int] = field(default_factory=dict)
    # Spans whose URT code resolves to no primitive in URT_TO_PRIMITIVE.
    unmapped_count: int = 0

    # Distribution
    # Span count per mapped primitive (enabled, disabled, or neither).
    primitive_counts: dict[str, int] = field(default_factory=dict)
    # Span count keyed by the first character of the URT code.
    domain_distribution: dict[str, int] = field(default_factory=dict)
    # Span count keyed by the span's valence value.
    valence_distribution: dict[str, int] = field(default_factory=dict)
    # Most frequent URT codes as (code, count), most common first.
    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)

    # Recommendations (threshold-gated)
    recommended_enables: list[tuple[str, float]] = field(default_factory=list)  # (primitive, pct)
    # NOTE(review): analyze_sector_spans never fills this in; it keeps its
    # empty default unless a caller sets it explicitly.
    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
    # Human-readable notes about weighted-but-absent or frequent-but-unweighted
    # primitives.
    weight_issues: list[str] = field(default_factory=list)

    # Metadata
    # ISO-8601 timestamp (UTC) of when the analysis ran.
    validated_at: str = ""
    # The config's "config_version" value.
    config_version: str = ""


def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """Load the L1 primitive config for a sector.

    Args:
        sector_code: Sector identifier such as ``"ENTERTAINMENT"``. It is
            lower-cased to build the file name ``<sector>_config.json``
            inside ``CONFIGS_DIR``.

    Returns:
        The parsed JSON document, or ``None`` when the sector has no config
        file.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON.
    """
    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    try:
        # Read as UTF-8 explicitly: without it the decoding depends on the
        # platform locale and non-ASCII content can fail or be mangled.
        with config_file.open(encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Opening directly (instead of exists() + open()) avoids a race when
        # the file disappears between the check and the read.
        return None


def get_businesses_for_sector(sector_code: str) -> list[str]:
    """Return the names of all businesses mapped to ``sector_code``.

    Names are returned in the declaration order of ``BUSINESS_TO_SECTOR``;
    a sector with no mapped business yields an empty list.
    """
    return [
        name
        for name, mapped_sector in BUSINESS_TO_SECTOR.items()
        if mapped_sector == sector_code
    ]


async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
    """Load every review span belonging to the given businesses.

    Args:
        pool: Connection pool the query is executed on.
        businesses: Values matched against ``business_id``. When empty, no
            query is issued.

    Returns:
        One plain ``dict`` per row (business_id, urt_primary, valence,
        intensity, span_text), newest ``created_at`` first; ``[]`` when
        ``businesses`` is empty.
    """
    # Nothing to look up: skip the database round trip entirely.
    if not businesses:
        return []

    sql = """
        SELECT
            business_id,
            urt_primary,
            valence,
            intensity,
            span_text
        FROM pipeline.review_spans
        WHERE business_id = ANY($1)
        ORDER BY created_at DESC
    """
    records = await pool.fetch(sql, businesses)
    # Convert the driver's row objects into ordinary dictionaries.
    return list(map(dict, records))


def analyze_sector_spans(
    spans: list[dict],
    config: dict[str, Any],
    businesses: list[str],
) -> SectorValidation:
    """Analyze a sector's spans against its L1 config.

    Args:
        spans: Span rows, each a mapping that carries ``urt_primary`` and,
            optionally, ``valence``. A missing, NULL or empty ``urt_primary``
            is counted as unmapped (and left out of the URT-code and domain
            tallies) instead of raising. A missing or NULL ``valence`` is
            counted as ``"V0"``.
        config: Parsed L1 config. ``sector_code`` is required; ``enabled``,
            ``disabled``, ``weights`` and ``config_version`` are optional and
            may be null.
        businesses: Business names the spans belong to; stored on the result
            unchanged.

    Returns:
        A ``SectorValidation`` with coverage, distributions, threshold-gated
        enable recommendations and weight notes. ``validated_at`` is a
        timezone-aware UTC ISO-8601 timestamp.

    Raises:
        KeyError: If ``config`` has no ``sector_code``.
    """
    # Local import: the module imports only `datetime` from the datetime
    # package, and `timezone` is needed for an aware UTC timestamp.
    from datetime import timezone

    sector_code = config["sector_code"]
    # `or` also covers keys that are present but null in the JSON.
    enabled = set(config.get("enabled") or [])
    disabled = set(config.get("disabled") or [])
    weights = config.get("weights") or {}
    config_version = config.get("config_version", "1.0")

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    total = len(spans)

    def pct_of_total(count: int) -> float:
        """Share of all spans, in percent; 0 when there are no spans."""
        return count / total * 100 if total > 0 else 0

    for span in spans:
        # Rows come from the database, so the key can be present with a NULL
        # value; normalise both fields before using them.
        urt_code = span.get("urt_primary") or ""
        valence = span.get("valence") or "V0"

        valence_counts[valence] += 1
        if urt_code:
            urt_counts[urt_code] += 1
            # The first character of a URT code is its domain letter.
            domain_counts[urt_code[0]] += 1

        primitive = URT_TO_PRIMITIVE.get(urt_code)
        if not primitive:
            # Either mapped to None on purpose or not in the table at all.
            unmapped += 1
            continue

        primitive_counts[primitive] += 1
        if primitive in enabled:
            enabled_hits += 1
        elif primitive in disabled:
            disabled_hits[primitive] += 1

    enabled_coverage = enabled_hits / total if total > 0 else 0

    # Threshold-gated recommendations: only disabled primitives that make up
    # a meaningful share of the sector's spans are suggested for enabling.
    recommended_enables = []
    for prim, count in disabled_hits.most_common():
        pct = pct_of_total(count)
        if pct >= ENABLE_THRESHOLD_PCT:
            recommended_enables.append((prim, pct))

    # Weight issues: enabled primitives that carry a weight but never appear.
    weight_issues = [
        f"{prim} weighted ({weights[prim]}x) but 0 appearances"
        for prim in weights
        if primitive_counts[prim] == 0 and prim in enabled
    ]

    # High-frequency unweighted: among the five most common primitives, flag
    # enabled ones at >= 10% of spans that have no weight configured.
    for prim, count in primitive_counts.most_common(5):
        pct = pct_of_total(count)
        if prim in enabled and prim not in weights and pct >= 10:
            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")

    return SectorValidation(
        sector_code=sector_code,
        businesses=businesses,
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommended_enables=recommended_enables,
        weight_issues=weight_issues,
        # datetime.utcnow() is deprecated and returns a naive value; use an
        # explicit UTC-aware timestamp instead.
        validated_at=datetime.now(timezone.utc).isoformat(),
        config_version=config_version,
    )


def print_sector_report(result: SectorValidation, config: dict):
    """Write a human-readable validation report for one sector to stdout.

    Args:
        result: Analysis outcome to render.
        config: The sector's L1 config; its ``enabled``, ``disabled`` and
            ``weights`` entries are used to annotate each primitive.
    """
    rule = "=" * 70
    total = result.span_count

    def guarded_pct(n):
        # Percentage of all spans; 0 when the sector has no spans.
        return n / total * 100 if total > 0 else 0

    # --- Header ---------------------------------------------------------
    print("\n" + rule)
    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
    print(rule)

    # --- Data source ----------------------------------------------------
    business_list = ", ".join(result.businesses)
    print("\n📊 DATA SOURCE")
    print(f"   Businesses: {business_list}")
    print(f"   Total spans: {total:,}")
    print(f"   Config version: {result.config_version}")

    # --- Coverage -------------------------------------------------------
    print("\n📈 COVERAGE")
    print(f"   Enabled coverage: {result.enabled_coverage:.1%}")
    if total > 0:
        unmapped_pct = result.unmapped_count / total * 100
        print(f"   Unmapped (R-domain): {result.unmapped_count} ({unmapped_pct:.1f}%)")
    else:
        # With no spans an empty line is emitted in place of the figure.
        print("")

    # --- Domain distribution --------------------------------------------
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {
        "O": "Offering",
        "P": "People",
        "J": "Journey",
        "E": "Environment",
        "A": "Access",
        "V": "Value",
        "R": "Relationship",
    }
    for letter in "OPJEVRA":
        hits = result.domain_distribution.get(letter, 0)
        share = guarded_pct(hits)
        # One bar character per two percentage points.
        bar = "█" * int(share / 2)
        label = domain_names.get(letter, '?')
        print(f"   {letter} {label:12} {hits:4} ({share:5.1f}%) {bar}")

    # --- Top primitives -------------------------------------------------
    print("\n🔝 TOP PRIMITIVES (sector-scoped)")
    enabled_set = set(config.get("enabled", []))
    disabled_set = set(config.get("disabled", []))
    weights = config.get("weights", {})

    ranked = sorted(
        result.primitive_counts.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    for name, hits in ranked[:12]:
        share = guarded_pct(hits)
        # ✓ enabled, ✗ disabled, ? not listed in the config at all.
        marker = "✓" if name in enabled_set else "✗" if name in disabled_set else "?"
        weight_tag = f"({weights[name]}x)" if name in weights else ""
        print(f"   {marker} {name:20} {hits:4} ({share:5.1f}%) {weight_tag}")

    # --- Threshold-gated recommendations --------------------------------
    if not result.recommended_enables:
        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")
    else:
        print(f"\n⚠️  RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
        for name, share in result.recommended_enables:
            hits = result.disabled_hits.get(name, 0)
            print(f"   → ENABLE {name}: {hits} spans ({share:.1f}%)")

    # --- Low-frequency disabled (info only) -----------------------------
    below_threshold = []
    for name, hits in result.disabled_hits.items():
        if hits / total * 100 < ENABLE_THRESHOLD_PCT:
            below_threshold.append((name, hits))
    if below_threshold:
        print("\n📋 DISABLED BUT APPEARING (below threshold - no action)")
        below_threshold = sorted(below_threshold, key=lambda item: item[1], reverse=True)
        for name, hits in below_threshold[:5]:
            share = hits / total * 100
            print(f"   {name}: {hits} ({share:.1f}%)")

    # --- Weight issues --------------------------------------------------
    if result.weight_issues:
        print("\n⚖️  WEIGHT ISSUES")
        for note in result.weight_issues:
            print(f"   • {note}")

    # --- Footer ---------------------------------------------------------
    print(f"\n⏱️  Validated at: {result.validated_at}")
    print(rule)


async def validate_sector(
    sector_code: str,
    db_url: str | None = None,
    verbose: bool = True,
) -> SectorValidation | None:
    """Run the sector-scoped validation for one sector.

    Args:
        sector_code: Sector to validate; must be in ``SECTORS_WITH_DATA``.
        db_url: Database URL. Falls back to the ``DATABASE_URL`` environment
            variable and then to a built-in local default.
        verbose: When true, print status messages and the full report.

    Returns:
        The validation result, or ``None`` when the sector has no data, no
        config, no mapped businesses, or no spans.
    """

    def say(message: str) -> None:
        # Status lines are suppressed in quiet mode.
        if verbose:
            print(message)

    if sector_code not in SECTORS_WITH_DATA:
        say(f"⚠️  {sector_code}: No real business data available for validation")
        return None

    config = load_l1_config(sector_code)
    if not config:
        say(f"❌ No L1 config found for {sector_code}")
        return None

    businesses = get_businesses_for_sector(sector_code)
    if not businesses:
        say(f"⚠️  {sector_code}: No businesses mapped")
        return None

    if not db_url:
        # NOTE(review): the fallback URL embeds credentials in source; it
        # looks like a local development default — confirm before reuse.
        db_url = os.environ.get(
            "DATABASE_URL",
            "postgresql://scraper:scraper123@localhost:5437/scraper"
        )

    pool = await asyncpg.create_pool(db_url)
    try:
        spans = await fetch_spans_for_businesses(pool, businesses)
        if spans:
            outcome = analyze_sector_spans(spans, config, businesses)
            if verbose:
                print_sector_report(outcome, config)
            return outcome

        say(f"⚠️  {sector_code}: No spans found for businesses")
        return None
    finally:
        # Always release the pool, including on early return or error.
        await pool.close()


async def validate_all_sectors(
    db_url: str | None = None,
    total_sectors: int = 20,
) -> dict[str, SectorValidation]:
    """Validate every sector that has data and print a summary table.

    Args:
        db_url: Database URL forwarded to ``validate_sector``.
        total_sectors: Total number of sectors, used only for the
            "Sectors without data" line. Defaults to 20, the figure that
            was previously hard-coded here.

    Returns:
        Mapping of sector code to its validation result, containing only the
        sectors that validated successfully, in alphabetical order.
    """
    results = {}

    # Iterate in sorted order: a set of strings has no stable iteration
    # order between interpreter runs, which made the report order vary.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=True)
        if result:
            results[sector] = result

    # Print summary
    print("\n" + "=" * 70)
    print("VALIDATION SUMMARY")
    print("=" * 70)
    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
    print("-" * 50)

    for sector, result in results.items():
        enables = len(result.recommended_enables)
        enables_str = f"{enables} recs" if enables > 0 else "✓ OK"
        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")

    print("-" * 50)
    print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}")
    print(f"Sectors without data: {total_sectors - len(SECTORS_WITH_DATA)}")

    return results


async def generate_summary_report(db_url: str | None = None) -> dict:
    """Build a JSON-serialisable summary for all sectors with data.

    Sectors are validated quietly (no per-sector report is printed).

    Args:
        db_url: Database URL forwarded to ``validate_sector``.

    Returns:
        Mapping of sector code to a dict with ``span_count``,
        ``enabled_coverage`` (fraction rounded to 3 decimals),
        ``recommended_enables``, ``weight_issues``, ``config_version`` and
        ``validated_at``. Sectors that could not be validated are omitted.
        Keys are in alphabetical order so the output is stable across runs.
    """
    results = {}

    # Sorted for reproducible output: iterating the raw set would give a
    # different key order from one interpreter run to the next.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=False)
        if result:
            results[sector] = {
                "span_count": result.span_count,
                "enabled_coverage": round(result.enabled_coverage, 3),
                "recommended_enables": result.recommended_enables,
                "weight_issues": result.weight_issues,
                "config_version": result.config_version,
                "validated_at": result.validated_at,
            }

    return results


def main():
    """Command-line entry point.

    Exactly one mode runs per invocation, chosen in this order of
    precedence: ``--report`` (JSON summary), ``--all`` (every sector with
    data), ``--sector`` (a single sector, upper-cased). With none of them,
    the help text and the list of sectors with data are printed.
    """
    cli = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
    cli.add_argument("--sector", help="Validate specific sector")
    cli.add_argument("--all", action="store_true", help="Validate all sectors with data")
    cli.add_argument("--report", action="store_true", help="Generate JSON summary report")
    cli.add_argument("--db-url", help="Database URL")

    opts = cli.parse_args()

    if opts.report:
        summary = asyncio.run(generate_summary_report(opts.db_url))
        print(json.dumps(summary, indent=2))
        return

    if opts.all:
        asyncio.run(validate_all_sectors(opts.db_url))
        return

    if opts.sector:
        asyncio.run(validate_sector(opts.sector.upper(), opts.db_url))
        return

    # No mode selected: show usage plus the sectors that can be validated.
    cli.print_help()
    available = ", ".join(sorted(SECTORS_WITH_DATA))
    print("\n\nSectors with real data:", available)


if __name__ == "__main__":
    main()