Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
421
packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
Normal file
421
packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
Normal file
@@ -0,0 +1,421 @@
|
||||
#!/usr/bin/env python3
"""
Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)

Validates L1 primitive configs against SECTOR-SPECIFIC review data.
Only validates sectors where we have real business data.

Key improvement over v1: spans are filtered by business → sector mapping,
ensuring "TASTE in HEALTHCARE" noise doesn't pollute results.

Usage:
    python validate_l1_configs_v2.py --sector ENTERTAINMENT
    python validate_l1_configs_v2.py --sector AUTOMOTIVE
    python validate_l1_configs_v2.py --all
    python validate_l1_configs_v2.py --report  # JSON summary only
"""
|
||||
|
||||
import argparse
import asyncio
import json
import os
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import asyncpg
|
||||
|
||||
# Paths
# The data directory sits two levels above this file (next to the directory
# that contains the script).
DATA_DIR = Path(__file__).parent.parent / "data"
# L1 primitive configs, one JSON file per sector.
CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
# Sector brief documents (not read by any function visible in this script).
BRIEFS_DIR = DATA_DIR / "sector_briefs"
|
||||
|
||||
# Business → Sector mapping (ground truth)
# Keys are the business identifiers used to select spans; values are sector
# codes. Insertion order is preserved and determines the order in which
# businesses are listed for a sector.
BUSINESS_TO_SECTOR: dict[str, str] = {
    "Go Karts Mar Menor": "ENTERTAINMENT",
    "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE",
    "Soho Club": "ENTERTAINMENT",
    "Fika": "FOOD_DINING",
}
|
||||
|
||||
# Sectors with real data
# Only these sector codes are accepted for validation; any other sector is
# reported as having no business data.
SECTORS_WITH_DATA: set[str] = {"ENTERTAINMENT", "AUTOMOTIVE", "FOOD_DINING"}
|
||||
|
||||
# URT code to primitive mapping
# Keys are URT codes ("<domain letter><group>.<item>"); values are primitive
# names, or None for codes that are deliberately left without a primitive.
URT_TO_PRIMITIVE = {
    # Offering codes
    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
    # People codes
    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
    # Journey codes
    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
    "J1.04": "SPEED", "J1.05": "RELIABILITY",
    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
    # Environment codes
    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
    # Access codes
    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
    # Value codes
    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
    # Relationship codes (map to meta - these should stay unmapped)
    "R1.01": None, "R1.02": None, "R1.03": None,
    "R2.01": None, "R2.02": None, "R2.03": None,
    "R3.01": None, "R3.02": None, "R3.03": None,
    "R4.01": None, "R4.02": None, "R4.03": None,
}
|
||||
|
||||
# Minimum threshold for "enable" recommendations (% of sector spans)
# A disabled primitive is only recommended for enabling when its share of the
# sector's spans is at least this percentage.
ENABLE_THRESHOLD_PCT = 3.0
|
||||
|
||||
|
||||
@dataclass
class SectorValidation:
    """Validation result for a single sector."""

    sector_code: str
    # Businesses whose spans were analyzed for this sector.
    businesses: list[str]
    # Total number of spans analyzed.
    span_count: int

    # Coverage
    # Fraction (0..1) of spans whose primitive is enabled in the config.
    enabled_coverage: float
    # Span count per primitive that the config lists as disabled.
    disabled_hits: dict[str, int] = field(default_factory=dict)
    # Spans whose URT code has no primitive.
    unmapped_count: int = 0

    # Distribution
    primitive_counts: dict[str, int] = field(default_factory=dict)
    # Keyed by the first character of the URT code.
    domain_distribution: dict[str, int] = field(default_factory=dict)
    valence_distribution: dict[str, int] = field(default_factory=dict)
    # (urt_code, count) pairs, most frequent first.
    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)

    # Recommendations (threshold-gated)
    # (primitive, pct of sector spans)
    recommended_enables: list[tuple[str, float]] = field(default_factory=list)
    # NOTE(review): nothing visible in this script populates this field.
    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
    # Human-readable descriptions of weight/frequency mismatches.
    weight_issues: list[str] = field(default_factory=list)

    # Metadata
    validated_at: str = ""
    config_version: str = ""
|
||||
|
||||
|
||||
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """Load the L1 config for a sector.

    The config is read from ``CONFIGS_DIR / "<sector_code lowercased>_config.json"``.

    Args:
        sector_code: Sector code; lowercased to build the file name.

    Returns:
        The parsed JSON document, or None when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    try:
        # Explicit UTF-8: the platform default encoding is not reliable for
        # JSON. Opening directly (instead of exists() + open()) also avoids a
        # race if the file disappears between the two calls.
        with open(config_file, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
||||
|
||||
|
||||
def get_businesses_for_sector(sector_code: str) -> list[str]:
    """Return the businesses mapped to ``sector_code`` in BUSINESS_TO_SECTOR.

    The result follows the mapping's insertion order and is empty when no
    business belongs to the sector.
    """
    return [
        business
        for business, sector in BUSINESS_TO_SECTOR.items()
        if sector == sector_code
    ]
|
||||
|
||||
|
||||
async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
    """Fetch spans for specific businesses only.

    Args:
        pool: Connection pool used to run the query.
        businesses: Values matched against ``business_id``; passed as a single
            array parameter, so nothing is interpolated into the SQL text.

    Returns:
        One dict per row (business_id, urt_primary, valence, intensity,
        span_text), newest first by ``created_at``. Empty when ``businesses``
        is empty; in that case no query is issued.
    """
    if not businesses:
        return []

    # NOTE(review): callers in this script pass the keys of BUSINESS_TO_SECTOR,
    # which look like display names; presumably `business_id` stores the same
    # values - confirm against the table.
    query = """
        SELECT
            business_id,
            urt_primary,
            valence,
            intensity,
            span_text
        FROM pipeline.review_spans
        WHERE business_id = ANY($1)
        ORDER BY created_at DESC
    """
    rows = await pool.fetch(query, businesses)
    return [dict(row) for row in rows]
|
||||
|
||||
|
||||
def analyze_sector_spans(
    spans: list[dict],
    config: dict[str, Any],
    businesses: list[str],
) -> SectorValidation:
    """Analyze spans for a specific sector.

    Args:
        spans: Span rows; each is read for ``urt_primary`` and ``valence``.
        config: L1 config. ``sector_code`` is required; ``enabled``,
            ``disabled``, ``weights`` and ``config_version`` are optional.
        businesses: Businesses the spans were fetched for (stored on the
            result unchanged).

    Returns:
        A SectorValidation with coverage, distributions, threshold-gated
        enable recommendations and weight issues.

    Raises:
        KeyError: If ``config`` has no ``sector_code``.
    """
    sector_code = config["sector_code"]
    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})
    config_version = config.get("config_version", "1.0")

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        urt_code = span.get("urt_primary")
        # A NULL valence arrives as None with the key present, so `or` is
        # needed for the "V0" fallback to apply.
        valence = span.get("valence") or "V0"
        valence_counts[valence] += 1

        if not urt_code:
            # No URT code at all: nothing to index a domain from, so count
            # the span as unmapped instead of crashing on urt_code[0].
            unmapped += 1
            continue

        urt_counts[urt_code] += 1
        # The domain is the first character of the URT code.
        domain_counts[urt_code[0]] += 1

        primitive = URT_TO_PRIMITIVE.get(urt_code)
        if not primitive:
            # Covers both codes mapped to None and codes absent from the map.
            unmapped += 1
            continue

        primitive_counts[primitive] += 1
        if primitive in enabled:
            enabled_hits += 1
        elif primitive in disabled:
            disabled_hits[primitive] += 1
        # Primitives listed in neither set only show up in primitive_counts.

    total = len(spans)
    enabled_coverage = enabled_hits / total if total > 0 else 0

    def pct_of_total(count: int) -> float:
        # Share of all sector spans, as a percentage; 0 for an empty sector.
        return count / total * 100 if total > 0 else 0

    # Threshold-gated recommendations: disabled primitives that still account
    # for a meaningful share of the sector's spans.
    recommended_enables = []
    for prim, count in disabled_hits.most_common():
        pct = pct_of_total(count)
        if pct >= ENABLE_THRESHOLD_PCT:
            recommended_enables.append((prim, pct))

    # Weight issues: enabled primitives that carry a weight but never appear.
    weight_issues = [
        f"{prim} weighted ({weights[prim]}x) but 0 appearances"
        for prim in weights
        if primitive_counts[prim] == 0 and prim in enabled
    ]

    # High-frequency unweighted: among the five most common primitives, flag
    # enabled ones with >= 10% share that have no weight.
    for prim, count in primitive_counts.most_common(5):
        pct = pct_of_total(count)
        if prim in enabled and prim not in weights and pct >= 10:
            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")

    # datetime.utcnow() is deprecated; dropping tzinfo keeps the same naive
    # UTC ISO string the report has always carried.
    validated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return SectorValidation(
        sector_code=sector_code,
        businesses=businesses,
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommended_enables=recommended_enables,
        weight_issues=weight_issues,
        validated_at=validated_at,
        config_version=config_version,
    )
|
||||
|
||||
|
||||
def print_sector_report(result: SectorValidation, config: dict):
    """Print detailed validation report for a sector.

    Args:
        result: Validation result to render.
        config: The L1 config the result was computed from; read for
            ``enabled``, ``disabled`` and ``weights`` to annotate primitives.
    """
    total = result.span_count

    def pct_of_total(count: int) -> float:
        # Share of the sector's spans, as a percentage; 0 for an empty sector
        # so no section can divide by zero.
        return count / total * 100 if total > 0 else 0

    rule = "=" * 70
    print("\n" + rule)
    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
    print(rule)

    print("\n📊 DATA SOURCE")
    print(f" Businesses: {', '.join(result.businesses)}")
    print(f" Total spans: {total:,}")
    print(f" Config version: {result.config_version}")

    print("\n📈 COVERAGE")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    if total > 0:
        print(f" Unmapped (R-domain): {result.unmapped_count} ({pct_of_total(result.unmapped_count):.1f}%)")
    else:
        # With no spans the unmapped line is replaced by an empty line.
        print("")

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = pct_of_total(count)
        # One bar character per two percentage points.
        bar = "█" * int(pct / 2)
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES (sector-scoped)")
    enabled_set = set(config.get("enabled", []))
    disabled_set = set(config.get("disabled", []))
    weights = config.get("weights", {})

    top_primitives = sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]
    for prim, count in top_primitives:
        pct = pct_of_total(count)
        if prim in enabled_set:
            status = "✓"
        elif prim in disabled_set:
            status = "✗"
        else:
            # Listed in neither the enabled nor the disabled set.
            status = "?"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({pct:5.1f}%) {weight}")

    # Threshold-gated recommendations
    if result.recommended_enables:
        print(f"\n⚠️ RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
        for prim, pct in result.recommended_enables:
            count = result.disabled_hits.get(prim, 0)
            print(f" → ENABLE {prim}: {count} spans ({pct:.1f}%)")
    else:
        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")

    # Low-frequency disabled (info only)
    low_freq_disabled = [
        (prim, count)
        for prim, count in result.disabled_hits.items()
        if pct_of_total(count) < ENABLE_THRESHOLD_PCT
    ]
    if low_freq_disabled:
        print("\n📋 DISABLED BUT APPEARING (below threshold - no action)")
        for prim, count in sorted(low_freq_disabled, key=lambda x: -x[1])[:5]:
            print(f" {prim}: {count} ({pct_of_total(count):.1f}%)")

    # Weight issues
    if result.weight_issues:
        print("\n⚖️ WEIGHT ISSUES")
        for issue in result.weight_issues:
            print(f" • {issue}")

    print(f"\n⏱️ Validated at: {result.validated_at}")
    print(rule)
|
||||
|
||||
|
||||
async def validate_sector(
    sector_code: str,
    db_url: str | None = None,
    verbose: bool = True,
) -> SectorValidation | None:
    """Validate a single sector with sector-scoped data.

    Args:
        sector_code: Sector to validate; must be in SECTORS_WITH_DATA.
        db_url: Database URL. Falls back to the DATABASE_URL environment
            variable, then to a built-in local default.
        verbose: When true, print skip reasons and the full sector report.

    Returns:
        The validation result, or None when the sector has no data, no
        config, no mapped businesses, or no spans in the database.
    """

    def report(message: str) -> None:
        # Skip reasons are only shown in verbose mode.
        if verbose:
            print(message)

    if sector_code not in SECTORS_WITH_DATA:
        report(f"⚠️ {sector_code}: No real business data available for validation")
        return None

    config = load_l1_config(sector_code)
    if not config:
        report(f"❌ No L1 config found for {sector_code}")
        return None

    businesses = get_businesses_for_sector(sector_code)
    if not businesses:
        report(f"⚠️ {sector_code}: No businesses mapped")
        return None

    # SECURITY NOTE(review): the fallback URL embeds a username and password
    # in source. Kept for backward compatibility; prefer DATABASE_URL or
    # --db-url and consider removing the literal.
    db_url = db_url or os.environ.get(
        "DATABASE_URL",
        "postgresql://scraper:scraper123@localhost:5437/scraper",
    )

    # The pool is only needed for the fetch; `async with` closes it even if
    # the query raises, and releases it before the CPU-only analysis.
    async with asyncpg.create_pool(db_url) as pool:
        spans = await fetch_spans_for_businesses(pool, businesses)

    if not spans:
        report(f"⚠️ {sector_code}: No spans found for businesses")
        return None

    result = analyze_sector_spans(spans, config, businesses)

    if verbose:
        print_sector_report(result, config)

    return result
|
||||
|
||||
|
||||
async def validate_all_sectors(db_url: str | None = None) -> dict[str, SectorValidation]:
    """Validate all sectors with available data and print a summary table.

    Args:
        db_url: Database URL forwarded to validate_sector.

    Returns:
        Mapping of sector code to its result, for sectors that produced one.
    """
    results = {}

    # SECTORS_WITH_DATA is a set, whose iteration order can differ between
    # runs; sorting keeps the reports and the summary table reproducible.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=True)
        if result:
            results[sector] = result

    # Print summary
    rule = "=" * 70
    divider = "-" * 50
    print("\n" + rule)
    print("VALIDATION SUMMARY")
    print(rule)
    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
    print(divider)

    for sector, result in results.items():
        enables = len(result.recommended_enables)
        enables_str = f"{enables} recs" if enables > 0 else "✓ OK"
        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")

    print(divider)
    print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}")
    # NOTE(review): 20 is hard-coded; presumably the total number of sectors
    # in the taxonomy - confirm and derive it from the configs if possible.
    total_sector_count = 20
    print(f"Sectors without data: {total_sector_count - len(SECTORS_WITH_DATA)}")

    return results
|
||||
|
||||
|
||||
async def generate_summary_report(db_url: str | None = None) -> dict:
    """Generate a JSON-serializable summary report for all sectors.

    Sectors are validated quietly (no per-sector report is printed).

    Args:
        db_url: Database URL forwarded to validate_sector.

    Returns:
        Mapping of sector code to a summary dict (span_count,
        enabled_coverage rounded to 3 decimals, recommended_enables,
        weight_issues, config_version, validated_at). Sectors that yield no
        result are omitted.
    """
    results = {}

    # Sorted so the key order of the emitted JSON is stable between runs;
    # iterating the set directly is not.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=False)
        if not result:
            continue
        results[sector] = {
            "span_count": result.span_count,
            "enabled_coverage": round(result.enabled_coverage, 3),
            "recommended_enables": result.recommended_enables,
            "weight_issues": result.weight_issues,
            "config_version": result.config_version,
            "validated_at": result.validated_at,
        }

    return results
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Flags are checked in priority order: ``--report`` (JSON summary), then
    ``--all`` (full reports plus summary table), then ``--sector`` (single
    sector, upper-cased). With none of them, the help text and the list of
    sectors with data are printed.
    """
    parser = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
    parser.add_argument("--sector", help="Validate specific sector")
    parser.add_argument("--all", action="store_true", help="Validate all sectors with data")
    parser.add_argument("--report", action="store_true", help="Generate JSON summary report")
    parser.add_argument("--db-url", help="Database URL")

    args = parser.parse_args()

    if args.report:
        results = asyncio.run(generate_summary_report(args.db_url))
        print(json.dumps(results, indent=2))
    elif args.all:
        asyncio.run(validate_all_sectors(args.db_url))
    elif args.sector:
        # Sector codes are upper-case; accept any casing on the command line.
        asyncio.run(validate_sector(args.sector.upper(), args.db_url))
    else:
        parser.print_help()
        print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA)))
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user