#!/usr/bin/env python3 """ Wave 1 L1 Config Validation Script - V2 (Sector-Scoped) Validates L1 primitive configs against SECTOR-SPECIFIC review data. Only validates sectors where we have real business data. Key improvement over v1: spans are filtered by business → sector mapping, ensuring "TASTE in HEALTHCARE" noise doesn't pollute results. Usage: python validate_l1_configs_v2.py --sector ENTERTAINMENT python validate_l1_configs_v2.py --sector AUTOMOTIVE python validate_l1_configs_v2.py --all python validate_l1_configs_v2.py --report # Summary only """ import argparse import asyncio import json import os from collections import Counter from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from typing import Any import asyncpg # Paths DATA_DIR = Path(__file__).parent.parent / "data" CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1" BRIEFS_DIR = DATA_DIR / "sector_briefs" # Business → Sector mapping (ground truth) BUSINESS_TO_SECTOR = { "Go Karts Mar Menor": "ENTERTAINMENT", "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE", "Soho Club": "ENTERTAINMENT", "Fika": "FOOD_DINING", } # Sectors with real data SECTORS_WITH_DATA = {"ENTERTAINMENT", "AUTOMOTIVE", "FOOD_DINING"} # URT code to primitive mapping URT_TO_PRIMITIVE = { # Offering codes "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS", "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION", "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT", "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS", # People codes "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS", "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS", "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE", "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION", # Journey codes "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION", "J1.04": "SPEED", "J1.05": "RELIABILITY", "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION", "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION", # Environment codes "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE", "E1.04": "AMBIANCE", "E1.05": "COMFORT", "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT", "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX", "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY", "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX", # Access codes "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY", "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY", "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX", "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED", "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY", # Value codes "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY", "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY", "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS", "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY", # Relationship codes (map to meta - these should stay unmapped) "R1.01": None, "R1.02": None, "R1.03": None, "R2.01": None, "R2.02": None, "R2.03": None, "R3.01": None, "R3.02": None, "R3.03": None, "R4.01": None, "R4.02": None, "R4.03": None, } # Minimum threshold for "enable" recommendations (% of sector spans) ENABLE_THRESHOLD_PCT = 3.0 # Only recommend enable if >= 3% of sector spans @dataclass class SectorValidation: """Validation result for a single sector.""" sector_code: str businesses: list[str] span_count: int # Coverage enabled_coverage: float disabled_hits: dict[str, int] = field(default_factory=dict) unmapped_count: int = 0 # Distribution primitive_counts: dict[str, int] = field(default_factory=dict) domain_distribution: dict[str, int] = field(default_factory=dict) valence_distribution: dict[str, int] = field(default_factory=dict) top_urt_codes: list[tuple[str, int]] = field(default_factory=list) # Recommendations (threshold-gated) recommended_enables: list[tuple[str, float]] = field(default_factory=list) # (primitive, pct) recommended_disables: list[tuple[str, float]] = field(default_factory=list) weight_issues: list[str] = field(default_factory=list) # Metadata validated_at: str = "" config_version: str = "" def load_l1_config(sector_code: str) -> dict[str, Any] | None: """Load L1 config for a sector.""" config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json" if not config_file.exists(): return None with open(config_file) as f: return json.load(f) def get_businesses_for_sector(sector_code: str) -> list[str]: """Get list of businesses belonging to a sector.""" return [biz for biz, sector in BUSINESS_TO_SECTOR.items() if sector == sector_code] async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]: """Fetch spans for specific businesses only.""" if not businesses: return [] query = """ SELECT business_id, urt_primary, valence, intensity, span_text FROM pipeline.review_spans WHERE business_id = ANY($1) ORDER BY created_at DESC """ rows = await pool.fetch(query, businesses) return [dict(row) for row in rows] def analyze_sector_spans( spans: list[dict], config: dict[str, Any], businesses: list[str], ) -> SectorValidation: """Analyze spans for a specific sector.""" sector_code = config["sector_code"] enabled = set(config.get("enabled", [])) disabled = set(config.get("disabled", [])) weights = config.get("weights", {}) config_version = config.get("config_version", "1.0") # Counters primitive_counts: Counter = Counter() domain_counts: Counter = Counter() valence_counts: Counter = Counter() urt_counts: Counter = Counter() disabled_hits: Counter = Counter() unmapped = 0 enabled_hits = 0 for span in spans: urt_code = span["urt_primary"] valence = span.get("valence", "V0") urt_counts[urt_code] += 1 valence_counts[valence] += 1 domain_counts[urt_code[0]] += 1 primitive = URT_TO_PRIMITIVE.get(urt_code) if primitive: primitive_counts[primitive] += 1 if primitive in enabled: enabled_hits += 1 elif primitive in disabled: disabled_hits[primitive] += 1 else: unmapped += 1 total = len(spans) enabled_coverage = enabled_hits / total if total > 0 else 0 # Threshold-gated recommendations recommended_enables = [] for prim, count in disabled_hits.most_common(): pct = count / total * 100 if total > 0 else 0 if pct >= ENABLE_THRESHOLD_PCT: recommended_enables.append((prim, pct)) # Weight issues weight_issues = [] for prim in weights: if primitive_counts[prim] == 0 and prim in enabled: weight_issues.append(f"{prim} weighted ({weights[prim]}x) but 0 appearances") # High-frequency unweighted for prim, count in primitive_counts.most_common(5): pct = count / total * 100 if total > 0 else 0 if prim in enabled and prim not in weights and pct >= 10: weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted") return SectorValidation( sector_code=sector_code, businesses=businesses, span_count=total, enabled_coverage=enabled_coverage, disabled_hits=dict(disabled_hits), unmapped_count=unmapped, primitive_counts=dict(primitive_counts), domain_distribution=dict(domain_counts), valence_distribution=dict(valence_counts), top_urt_codes=urt_counts.most_common(15), recommended_enables=recommended_enables, weight_issues=weight_issues, validated_at=datetime.utcnow().isoformat(), config_version=config_version, ) def print_sector_report(result: SectorValidation, config: dict): """Print detailed validation report for a sector.""" print("\n" + "=" * 70) print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}") print("=" * 70) print(f"\nšŸ“Š DATA SOURCE") print(f" Businesses: {', '.join(result.businesses)}") print(f" Total spans: {result.span_count:,}") print(f" Config version: {result.config_version}") print(f"\nšŸ“ˆ COVERAGE") print(f" Enabled coverage: {result.enabled_coverage:.1%}") print(f" Unmapped (R-domain): {result.unmapped_count} ({result.unmapped_count/result.span_count*100:.1f}%)" if result.span_count > 0 else "") # Domain distribution print(f"\nšŸ“ DOMAIN DISTRIBUTION") domain_names = {"O": "Offering", "P": "People", "J": "Journey", "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"} for domain in "OPJEVRA": count = result.domain_distribution.get(domain, 0) pct = count / result.span_count * 100 if result.span_count > 0 else 0 bar = "ā–ˆ" * int(pct / 2) print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}") # Top primitives print(f"\nšŸ” TOP PRIMITIVES (sector-scoped)") enabled_set = set(config.get("enabled", [])) disabled_set = set(config.get("disabled", [])) weights = config.get("weights", {}) for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]: pct = count / result.span_count * 100 if result.span_count > 0 else 0 if prim in enabled_set: status = "āœ“" elif prim in disabled_set: status = "āœ—" else: status = "?" weight = f"({weights[prim]}x)" if prim in weights else "" print(f" {status} {prim:20} {count:4} ({pct:5.1f}%) {weight}") # Threshold-gated recommendations if result.recommended_enables: print(f"\nāš ļø RECOMMENDED ENABLES (≄{ENABLE_THRESHOLD_PCT}% threshold)") for prim, pct in result.recommended_enables: count = result.disabled_hits.get(prim, 0) print(f" → ENABLE {prim}: {count} spans ({pct:.1f}%)") else: print(f"\nāœ… No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling") # Low-frequency disabled (info only) low_freq_disabled = [(p, c) for p, c in result.disabled_hits.items() if c / result.span_count * 100 < ENABLE_THRESHOLD_PCT] if low_freq_disabled: print(f"\nšŸ“‹ DISABLED BUT APPEARING (below threshold - no action)") for prim, count in sorted(low_freq_disabled, key=lambda x: -x[1])[:5]: pct = count / result.span_count * 100 print(f" {prim}: {count} ({pct:.1f}%)") # Weight issues if result.weight_issues: print(f"\nāš–ļø WEIGHT ISSUES") for issue in result.weight_issues: print(f" • {issue}") print(f"\nā±ļø Validated at: {result.validated_at}") print("=" * 70) async def validate_sector( sector_code: str, db_url: str | None = None, verbose: bool = True, ) -> SectorValidation | None: """Validate a single sector with sector-scoped data.""" if sector_code not in SECTORS_WITH_DATA: if verbose: print(f"āš ļø {sector_code}: No real business data available for validation") return None config = load_l1_config(sector_code) if not config: if verbose: print(f"āŒ No L1 config found for {sector_code}") return None businesses = get_businesses_for_sector(sector_code) if not businesses: if verbose: print(f"āš ļø {sector_code}: No businesses mapped") return None db_url = db_url or os.environ.get( "DATABASE_URL", "postgresql://scraper:scraper123@localhost:5437/scraper" ) pool = await asyncpg.create_pool(db_url) try: spans = await fetch_spans_for_businesses(pool, businesses) if not spans: if verbose: print(f"āš ļø {sector_code}: No spans found for businesses") return None result = analyze_sector_spans(spans, config, businesses) if verbose: print_sector_report(result, config) return result finally: await pool.close() async def validate_all_sectors(db_url: str | None = None) -> dict[str, SectorValidation]: """Validate all sectors with available data.""" results = {} for sector in SECTORS_WITH_DATA: result = await validate_sector(sector, db_url, verbose=True) if result: results[sector] = result # Print summary print("\n" + "=" * 70) print("VALIDATION SUMMARY") print("=" * 70) print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}") print("-" * 50) for sector, result in results.items(): enables = len(result.recommended_enables) enables_str = f"{enables} recs" if enables > 0 else "āœ“ OK" print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}") print("-" * 50) print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}") print(f"Sectors without data: {20 - len(SECTORS_WITH_DATA)}") return results async def generate_summary_report(db_url: str | None = None) -> dict: """Generate a JSON summary report for all sectors.""" results = {} for sector in SECTORS_WITH_DATA: result = await validate_sector(sector, db_url, verbose=False) if result: results[sector] = { "span_count": result.span_count, "enabled_coverage": round(result.enabled_coverage, 3), "recommended_enables": result.recommended_enables, "weight_issues": result.weight_issues, "config_version": result.config_version, "validated_at": result.validated_at, } return results def main(): parser = argparse.ArgumentParser(description="Sector-scoped L1 config validation") parser.add_argument("--sector", help="Validate specific sector") parser.add_argument("--all", action="store_true", help="Validate all sectors with data") parser.add_argument("--report", action="store_true", help="Generate JSON summary report") parser.add_argument("--db-url", help="Database URL") args = parser.parse_args() if args.report: results = asyncio.run(generate_summary_report(args.db_url)) print(json.dumps(results, indent=2)) elif args.all: asyncio.run(validate_all_sectors(args.db_url)) elif args.sector: asyncio.run(validate_sector(args.sector.upper(), args.db_url)) else: parser.print_help() print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA))) if __name__ == "__main__": main()