Files
whyrating-engine-legacy/packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
2026-02-02 18:19:00 +00:00

422 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)

Validates L1 primitive configs against SECTOR-SPECIFIC review data.
Only validates sectors where we have real business data.

Key improvement over v1: spans are filtered by a business → sector mapping,
ensuring noise such as "TASTE in HEALTHCARE" doesn't pollute results.

Usage:
    python validate_l1_configs_v2.py --sector ENTERTAINMENT
    python validate_l1_configs_v2.py --sector AUTOMOTIVE
    python validate_l1_configs_v2.py --all
    python validate_l1_configs_v2.py --report  # Summary only
"""
import argparse
import asyncio
import json
import os
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import asyncpg
# Filesystem layout, resolved relative to this script: the "data" directory
# sits next to the directory that contains this file.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
# Per-sector L1 primitive configs.
CONFIGS_DIR = DATA_DIR.joinpath("primitive_configs", "l1")
# Sector briefs directory.
BRIEFS_DIR = DATA_DIR.joinpath("sector_briefs")
# Business → Sector mapping (ground truth).
# Keys are the business identifiers matched against `business_id` when spans
# are fetched; values are sector codes.
BUSINESS_TO_SECTOR: dict[str, str] = {
    "Go Karts Mar Menor": "ENTERTAINMENT",
    "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE",
    "Soho Club": "ENTERTAINMENT",
    "Fika": "FOOD_DINING",
}

# Sectors with real data. Derived from the mapping above rather than listed by
# hand, so adding a business for a new sector cannot leave the two out of sync.
SECTORS_WITH_DATA: set[str] = set(BUSINESS_TO_SECTOR.values())
# URT code to primitive mapping
URT_TO_PRIMITIVE = {
# Offering codes
"O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
"O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
"O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
"O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
# People codes
"P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
"P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
"P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
"P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
# Journey codes
"J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
"J1.04": "SPEED", "J1.05": "RELIABILITY",
"J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
"J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
# Environment codes
"E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
"E1.04": "AMBIANCE", "E1.05": "COMFORT",
"E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
"E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
"E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
"E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
# Access codes
"A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
"A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
"A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
"A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
"A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
# Value codes
"V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
"V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
"V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
"V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
# Relationship codes (map to meta - these should stay unmapped)
"R1.01": None, "R1.02": None, "R1.03": None,
"R2.01": None, "R2.02": None, "R2.03": None,
"R3.01": None, "R3.02": None, "R3.03": None,
"R4.01": None, "R4.02": None, "R4.03": None,
}
# Gate for "enable" recommendations, expressed as a percentage of the sector's
# spans: a disabled primitive is only recommended once it reaches this share.
ENABLE_THRESHOLD_PCT: float = 3.0
@dataclass
class SectorValidation:
    """Result record for validating one sector's L1 config against its spans.

    The first four fields are required; every other field has a default.

    Attributes:
        sector_code: Sector the record describes.
        businesses: Businesses whose spans were analysed.
        span_count: Number of spans analysed.
        enabled_coverage: Share of spans that hit an enabled primitive
            (``analyze_sector_spans`` stores a 0-1 fraction here).
        disabled_hits: Span count per primitive that is listed as disabled.
        unmapped_count: Spans whose URT code has no primitive.
        primitive_counts: Span count per primitive.
        domain_distribution: Span count per URT domain letter.
        valence_distribution: Span count per valence label.
        top_urt_codes: Most frequent ``(urt_code, count)`` pairs.
        recommended_enables: ``(primitive, pct)`` pairs that met the enable
            threshold.
        recommended_disables: ``(primitive, pct)`` pairs; no code visible in
            this file populates it.
        weight_issues: Human-readable notes about weight configuration.
        validated_at: Timestamp string of the validation run.
        config_version: Version string taken from the config.
    """

    # -- Required: identity and data source --
    sector_code: str
    businesses: list[str]
    span_count: int

    # -- Coverage --
    enabled_coverage: float
    disabled_hits: dict[str, int] = field(default_factory=dict)
    unmapped_count: int = 0

    # -- Distribution --
    primitive_counts: dict[str, int] = field(default_factory=dict)
    domain_distribution: dict[str, int] = field(default_factory=dict)
    valence_distribution: dict[str, int] = field(default_factory=dict)
    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)

    # -- Recommendations (threshold-gated); tuples are (primitive, pct) --
    recommended_enables: list[tuple[str, float]] = field(default_factory=list)
    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
    weight_issues: list[str] = field(default_factory=list)

    # -- Metadata --
    validated_at: str = ""
    config_version: str = ""
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """Load the L1 config for a sector.

    The file is looked up as ``<sector_code lower-cased>_config.json`` inside
    ``CONFIGS_DIR``.

    Args:
        sector_code: Sector code; it is lower-cased to build the file name.

    Returns:
        The parsed JSON document, or ``None`` when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    try:
        # Read as UTF-8 explicitly so parsing does not depend on the platform's
        # locale encoding.
        with open(config_file, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Opening directly (instead of exists() followed by open()) avoids a
        # race if the file disappears between the two calls.
        return None
def get_businesses_for_sector(sector_code: str) -> list[str]:
    """Return every business that ``BUSINESS_TO_SECTOR`` assigns to *sector_code*.

    The result keeps the mapping's insertion order and is empty when no
    business belongs to the sector.
    """
    matches: list[str] = []
    for business, mapped_sector in BUSINESS_TO_SECTOR.items():
        if mapped_sector == sector_code:
            matches.append(business)
    return matches
async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
    """Fetch review spans restricted to the given businesses.

    Args:
        pool: Connection pool the query is run on.
        businesses: Values matched against ``business_id``.

    Returns:
        One dict per row (``business_id``, ``urt_primary``, ``valence``,
        ``intensity``, ``span_text``), newest ``created_at`` first. An empty
        list is returned without touching the database when *businesses* is
        empty.
    """
    # Nothing to match against: skip the round trip entirely.
    if not businesses:
        return []

    query = (
        "\n"
        "SELECT\n"
        "business_id,\n"
        "urt_primary,\n"
        "valence,\n"
        "intensity,\n"
        "span_text\n"
        "FROM pipeline.review_spans\n"
        "WHERE business_id = ANY($1)\n"
        "ORDER BY created_at DESC\n"
    )
    records = await pool.fetch(query, businesses)

    spans: list[dict] = []
    for record in records:
        spans.append(dict(record))
    return spans
def analyze_sector_spans(
    spans: list[dict],
    config: dict[str, Any],
    businesses: list[str],
) -> SectorValidation:
    """Analyze spans for a specific sector against its L1 config.

    Each span's ``urt_primary`` code is translated through ``URT_TO_PRIMITIVE``
    and tallied as enabled, disabled or unmapped according to *config*.

    Args:
        spans: Span rows; ``urt_primary`` and ``valence`` are read from each.
        config: L1 config. ``sector_code`` is required; ``enabled``,
            ``disabled``, ``weights`` and ``config_version`` are optional.
        businesses: Businesses the spans came from (stored on the result).

    Returns:
        A populated ``SectorValidation``. ``enabled_coverage`` is a 0-1
        fraction of all spans; ``recommended_enables`` holds
        ``(primitive, pct)`` for disabled primitives at or above
        ``ENABLE_THRESHOLD_PCT``; ``validated_at`` is a timezone-aware UTC
        ISO-8601 string.

    Raises:
        KeyError: If *config* has no ``sector_code``.
    """
    sector_code = config["sector_code"]
    # `or` also covers keys that are present but null in the JSON config.
    enabled = set(config.get("enabled") or [])
    disabled = set(config.get("disabled") or [])
    weights = config.get("weights") or {}
    config_version = config.get("config_version", "1.0")

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        # A NULL column arrives as an explicit None, which dict.get's default
        # does not replace; fall back to "V0" for both missing and None.
        valence = span.get("valence") or "V0"
        valence_counts[valence] += 1

        urt_code = span.get("urt_primary")
        if not urt_code:
            # No code at all (NULL / empty): nothing to index or look up, so
            # the span only contributes to the total and the unmapped count.
            unmapped += 1
            continue

        urt_counts[urt_code] += 1
        domain_counts[urt_code[0]] += 1  # first character is the domain letter

        primitive = URT_TO_PRIMITIVE.get(urt_code)
        if not primitive:
            # Either an R-domain code (mapped to None) or an unknown code.
            unmapped += 1
            continue

        primitive_counts[primitive] += 1
        if primitive in enabled:
            enabled_hits += 1
        elif primitive in disabled:
            disabled_hits[primitive] += 1

    total = len(spans)

    def pct_of_total(count: int) -> float:
        """Return *count* as a percentage of all spans (0 when there are none)."""
        return count / total * 100 if total > 0 else 0

    enabled_coverage = enabled_hits / total if total > 0 else 0

    # Threshold-gated recommendations, most frequent first.
    recommended_enables = [
        (prim, pct_of_total(count))
        for prim, count in disabled_hits.most_common()
        if pct_of_total(count) >= ENABLE_THRESHOLD_PCT
    ]

    # Weight issues: weighted-and-enabled primitives that never appear ...
    weight_issues = [
        f"{prim} weighted ({weights[prim]}x) but 0 appearances"
        for prim in weights
        if primitive_counts[prim] == 0 and prim in enabled
    ]
    # ... and enabled primitives among the five most frequent that carry no
    # weight despite covering at least 10% of spans.
    for prim, count in primitive_counts.most_common(5):
        pct = pct_of_total(count)
        if prim in enabled and prim not in weights and pct >= 10:
            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")

    return SectorValidation(
        sector_code=sector_code,
        businesses=businesses,
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommended_enables=recommended_enables,
        weight_issues=weight_issues,
        # datetime.utcnow() is deprecated and returns a naive value; use an
        # aware UTC timestamp so the ISO string carries its offset.
        validated_at=datetime.now(timezone.utc).isoformat(),
        config_version=config_version,
    )
def print_sector_report(result: SectorValidation, config: dict) -> None:
    """Print a detailed validation report for a sector to stdout.

    Args:
        result: Validation result to render.
        config: The sector's L1 config; ``enabled``, ``disabled`` and
            ``weights`` are read to annotate each primitive.

    Sections, in order: data source, coverage, domain distribution (with a
    bar of one block per 2%), top 12 primitives, enable recommendations,
    up to five below-threshold disabled primitives, weight issues, and the
    validation timestamp.
    """
    total = result.span_count

    def pct_of_total(count: int) -> float:
        """Return *count* as a percentage of all spans (0.0 when there are none)."""
        return count / total * 100 if total > 0 else 0.0

    print("\n" + "=" * 70)
    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
    print("=" * 70)

    print("\n📊 DATA SOURCE")
    print(f" Businesses: {', '.join(result.businesses)}")
    print(f" Total spans: {result.span_count:,}")
    print(f" Config version: {result.config_version}")

    print("\n📈 COVERAGE")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    if total > 0:
        print(f" Unmapped (R-domain): {result.unmapped_count} ({pct_of_total(result.unmapped_count):.1f}%)")
    else:
        # With no spans there is no percentage to show; emit a blank line.
        print("")

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = pct_of_total(count)
        # The bar glyph was an empty string, so no bar was ever drawn.
        bar = "█" * int(pct / 2)
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES (sector-scoped)")
    enabled_set = set(config.get("enabled", []))
    disabled_set = set(config.get("disabled", []))
    weights = config.get("weights", {})
    for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]:
        pct = pct_of_total(count)
        # Enabled and disabled previously shared the same (empty) marker and
        # could not be told apart in the output.
        if prim in enabled_set:
            status = "✓"
        elif prim in disabled_set:
            status = "✗"
        else:
            status = "?"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({pct:5.1f}%) {weight}")

    # Threshold-gated recommendations
    if result.recommended_enables:
        print(f"\n⚠️ RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
        for prim, pct in result.recommended_enables:
            count = result.disabled_hits.get(prim, 0)
            print(f" → ENABLE {prim}: {count} spans ({pct:.1f}%)")
    else:
        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")

    # Low-frequency disabled (info only)
    low_freq_disabled = [
        (prim, count)
        for prim, count in result.disabled_hits.items()
        if pct_of_total(count) < ENABLE_THRESHOLD_PCT
    ]
    if low_freq_disabled:
        print("\n📋 DISABLED BUT APPEARING (below threshold - no action)")
        for prim, count in sorted(low_freq_disabled, key=lambda x: -x[1])[:5]:
            print(f" {prim}: {count} ({pct_of_total(count):.1f}%)")

    # Weight issues
    if result.weight_issues:
        print("\n⚖️ WEIGHT ISSUES")
        for issue in result.weight_issues:
            print(f"{issue}")

    print(f"\n⏱️ Validated at: {result.validated_at}")
    print("=" * 70)
async def validate_sector(
    sector_code: str,
    db_url: str | None = None,
    verbose: bool = True,
) -> SectorValidation | None:
    """Run sector-scoped validation for one sector.

    Args:
        sector_code: Sector to validate; must be in ``SECTORS_WITH_DATA``.
        db_url: Database URL. When falsy, ``DATABASE_URL`` from the
            environment is used, then a local default.
        verbose: When true, print skip reasons and the full report.

    Returns:
        The validation result, or ``None`` when the sector has no data, no
        config, no mapped businesses, or no spans.
    """

    def say(message: str) -> None:
        # All skip notices are suppressed in non-verbose mode.
        if verbose:
            print(message)

    if sector_code not in SECTORS_WITH_DATA:
        say(f"⚠️ {sector_code}: No real business data available for validation")
        return None

    config = load_l1_config(sector_code)
    if not config:
        say(f"❌ No L1 config found for {sector_code}")
        return None

    businesses = get_businesses_for_sector(sector_code)
    if not businesses:
        say(f"⚠️ {sector_code}: No businesses mapped")
        return None

    if not db_url:
        # NOTE(review): the fallback DSN embeds credentials for a local
        # database; prefer setting DATABASE_URL or passing --db-url.
        db_url = os.environ.get(
            "DATABASE_URL",
            "postgresql://scraper:scraper123@localhost:5437/scraper"
        )

    pool = await asyncpg.create_pool(db_url)
    try:
        spans = await fetch_spans_for_businesses(pool, businesses)
        if spans:
            result = analyze_sector_spans(spans, config, businesses)
            if verbose:
                print_sector_report(result, config)
            return result
        say(f"⚠️ {sector_code}: No spans found for businesses")
        return None
    finally:
        # Always release the pool, including on the early "no spans" return.
        await pool.close()
async def validate_all_sectors(
    db_url: str | None = None,
    total_sectors: int = 20,
) -> dict[str, SectorValidation]:
    """Validate all sectors with available data and print a summary table.

    Args:
        db_url: Database URL forwarded to ``validate_sector``.
        total_sectors: Sector count used for the "Sectors without data" line.
            Defaults to 20, the value that was previously hard-coded
            (presumably the size of the full sector taxonomy — confirm).

    Returns:
        Mapping of sector code to its result, for sectors that validated.
    """
    results = {}
    # Iterate in sorted order: a set of strings has no stable order between
    # processes, which made the report order change from run to run.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=True)
        if result:
            results[sector] = result

    # Print summary
    print("\n" + "=" * 70)
    print("VALIDATION SUMMARY")
    print("=" * 70)
    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
    print("-" * 50)
    for sector, result in results.items():
        enables = len(result.recommended_enables)
        enables_str = f"{enables} recs" if enables > 0 else "✓ OK"
        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")
    print("-" * 50)
    print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}")
    print(f"Sectors without data: {total_sectors - len(SECTORS_WITH_DATA)}")
    return results
async def generate_summary_report(db_url: str | None = None) -> dict:
    """Generate a JSON-serializable summary report for all sectors with data.

    Each sector is validated quietly (``verbose=False``); sectors that yield
    no result are omitted.

    Args:
        db_url: Database URL forwarded to ``validate_sector``.

    Returns:
        Mapping of sector code to a dict with ``span_count``,
        ``enabled_coverage`` (rounded to 3 decimals), ``recommended_enables``,
        ``weight_issues``, ``config_version`` and ``validated_at``. Keys are
        inserted in sorted sector order.
    """
    results = {}
    # Sorted iteration keeps the emitted JSON key order identical between
    # runs; iterating the set directly varies with string hash randomization.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=False)
        if result:
            results[sector] = {
                "span_count": result.span_count,
                "enabled_coverage": round(result.enabled_coverage, 3),
                "recommended_enables": result.recommended_enables,
                "weight_issues": result.weight_issues,
                "config_version": result.config_version,
                "validated_at": result.validated_at,
            }
    return results
def main():
    """Command-line entry point.

    Modes are checked in priority order: ``--report`` (JSON summary), then
    ``--all`` (full reports plus summary table), then ``--sector`` (one
    sector, upper-cased). With none of them, the help text and the list of
    sectors with data are printed.
    """
    parser = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
    parser.add_argument("--sector", help="Validate specific sector")
    parser.add_argument("--all", action="store_true", help="Validate all sectors with data")
    parser.add_argument("--report", action="store_true", help="Generate JSON summary report")
    parser.add_argument("--db-url", help="Database URL")
    args = parser.parse_args()

    if args.report:
        summary = asyncio.run(generate_summary_report(args.db_url))
        print(json.dumps(summary, indent=2))
        return

    if args.all:
        asyncio.run(validate_all_sectors(args.db_url))
        return

    if args.sector:
        asyncio.run(validate_sector(args.sector.upper(), args.db_url))
        return

    # No mode selected: show usage and which sectors can be validated.
    parser.print_help()
    print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA)))


if __name__ == "__main__":
    main()