Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
238
packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
Normal file
238
packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Guarded L1 Config Fixer - V2 (Threshold-based, Sector-scoped)
|
||||
|
||||
Only applies fixes when:
|
||||
1. Evidence is from sector-scoped validation
|
||||
2. Frequency exceeds threshold (default 3%)
|
||||
3. Changes are logged with version bump
|
||||
|
||||
Usage:
|
||||
python fix_l1_configs_v2.py --apply # Apply fixes from validation
|
||||
python fix_l1_configs_v2.py --dry-run # Show what would change
|
||||
python fix_l1_configs_v2.py --revert SECTOR # Not yet implemented: only reports that a manual restore is required
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Directory holding the per-sector L1 primitive configs, resolved relative to
# this script: <script dir>/../data/primitive_configs/l1
CONFIGS_DIR = Path(__file__).parent.parent.joinpath("data", "primitive_configs", "l1")

# History of applied fixes, stored next to the configs it describes.
CHANGELOG_FILE = CONFIGS_DIR.joinpath("CHANGELOG.json")

# Minimum threshold for auto-enabling (% of sector spans)
ENABLE_THRESHOLD_PCT = 3.0
|
||||
|
||||
# Fixes derived from sector-scoped validation (validate_l1_configs_v2.py output)
# These are the ONLY fixes that should be applied
#
# Layout of each sector entry, as consumed by apply_fixes():
#   "evidence":      free-text description of the data behind the fixes
#   "enable":        list of (primitive, frequency_pct, reason)
#   "add_weight":    list of (primitive, weight, reason)
#   "remove_weight": list of (primitive, reason)
SECTOR_SCOPED_FIXES: dict[str, dict[str, Any]] = {
    "ENTERTAINMENT": {
        "evidence": "2,320 spans from Go Karts + Soho Club",
        "enable": [
            ("TASTE", 4.3, "Entertainment venues have concessions/food service"),
        ],
        "add_weight": [
            ("CRAFT", 1.3, "13.4% frequency but unweighted"),
        ],
        "remove_weight": [],
    },
    "FOOD_DINING": {
        "evidence": "61 spans from Fika cafe",
        "enable": [
            ("COMFORT", 9.8, "Seating/atmosphere comfort matters for cafes"),
        ],
        "add_weight": [
            ("AVAILABILITY", 1.2, "16.4% frequency but unweighted"),
        ],
        "remove_weight": [
            # Note: Small sample size (61 spans) - these may be false negatives
            # Keep weights but flag for review with more data
        ],
    },
    "AUTOMOTIVE": {
        "evidence": "1,201 spans from ClickRent car rental",
        "enable": [],  # Nothing exceeds 3% threshold
        "add_weight": [],
        "remove_weight": [
            # CONDITION, HONESTY, PROMISES, RECOVERY all have 0 appearances
            # However, may be specific to rental vs repair - keep for now
        ],
    },
}
|
||||
|
||||
|
||||
def load_changelog() -> list[dict]:
    """Return the changelog entries stored in ``CHANGELOG_FILE``.

    Returns:
        The parsed JSON content of the changelog file, or an empty list when
        the file does not exist.
    """
    if not CHANGELOG_FILE.exists():
        return []

    with open(CHANGELOG_FILE) as handle:
        entries = json.load(handle)
    return entries
|
||||
|
||||
|
||||
def save_changelog(entries: list[dict]) -> None:
    """Write *entries* to ``CHANGELOG_FILE`` as indented JSON.

    The parent directory is created first if it is missing, and the file is
    terminated with a trailing newline.
    """
    target_dir = CHANGELOG_FILE.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(CHANGELOG_FILE, "w") as handle:
        json.dump(entries, handle, indent=2)
        handle.write("\n")
|
||||
|
||||
|
||||
def load_config(sector_code: str) -> dict[str, Any] | None:
    """Read the JSON config for *sector_code* from ``CONFIGS_DIR``.

    The file name is the lower-cased sector code followed by
    ``_config.json``.

    Returns:
        The parsed config, or ``None`` when no such file exists.
    """
    file_name = f"{sector_code.lower()}_config.json"
    config_path = CONFIGS_DIR / file_name

    if config_path.exists():
        with open(config_path) as handle:
            return json.load(handle)
    return None
|
||||
|
||||
|
||||
def save_config(sector_code: str, config: dict[str, Any]) -> None:
    """Write *config* as indented JSON to the sector's config file.

    The destination is ``CONFIGS_DIR/<sector_code lower-cased>_config.json``;
    the file is overwritten and terminated with a trailing newline.
    """
    file_name = f"{sector_code.lower()}_config.json"
    destination = CONFIGS_DIR / file_name

    with open(destination, "w") as handle:
        json.dump(config, handle, indent=2)
        handle.write("\n")
|
||||
|
||||
|
||||
def _bump_minor_version(version: Any) -> str:
    """Return *version* with its minor component incremented by one.

    ``"MAJOR.MINOR"`` is the expected form. A bare ``"MAJOR"`` is treated as
    ``"MAJOR.0"``, components after the minor one are dropped, and non-string
    values (e.g. a JSON number) are converted with ``str`` first.

    Raises:
        ValueError: If the minor component is not an integer.
    """
    major, _, remainder = str(version).partition(".")
    minor = remainder.partition(".")[0] or "0"
    try:
        return f"{major}.{int(minor) + 1}"
    except ValueError as err:
        raise ValueError(f"Unsupported config_version: {version!r}") from err


def apply_fixes(sector_code: str, fixes: dict, dry_run: bool = False) -> list[str]:
    """Apply the given fixes to a sector config and report what happened.

    Args:
        sector_code: Sector whose config is loaded via ``load_config``.
        fixes: Mapping with optional keys ``"evidence"`` (str), ``"enable"``
            (list of ``(primitive, pct, reason)``), ``"add_weight"`` (list of
            ``(primitive, weight, reason)``) and ``"remove_weight"`` (list of
            ``(primitive, reason)``).
        dry_run: When true, compute and report the changes without writing
            the config or the changelog.

    Returns:
        Human-readable report lines. If the config is missing, a single
        error line. If nothing in the config would actually change, any
        threshold SKIP notices followed by ``"✓ No changes needed"``.
        Otherwise the change lines, plus a version line when the changes
        were written.

    Raises:
        ValueError: If the stored ``config_version`` cannot be bumped.
    """
    config = load_config(sector_code)
    if not config:
        return [f"❌ Config not found for {sector_code}"]

    # `or` guards against keys that are present but null in the JSON file.
    enabled = set(config.get("enabled") or [])
    disabled = set(config.get("disabled") or [])
    weights = dict(config.get("weights") or {})

    changes: list[str] = []
    # Tracks whether the config itself was altered. SKIP notices are
    # informational and must not trigger a write or a version bump.
    modified = False
    evidence = fixes.get("evidence", "unknown")

    # Enable primitives
    for prim, pct, reason in fixes.get("enable", []):
        if pct < ENABLE_THRESHOLD_PCT:
            changes.append(f"⚠️ SKIP {prim}: {pct:.1f}% below {ENABLE_THRESHOLD_PCT}% threshold")
            continue

        if prim in disabled:
            disabled.remove(prim)
            enabled.add(prim)
            changes.append(f"✓ ENABLE {prim}: {pct:.1f}% in sector data ({reason})")
            modified = True
        elif prim not in enabled:
            enabled.add(prim)
            changes.append(f"✓ ADD {prim}: {pct:.1f}% in sector data ({reason})")
            modified = True

    # Add weights (existing weights are never overwritten)
    for prim, weight, reason in fixes.get("add_weight", []):
        if prim not in weights:
            weights[prim] = weight
            changes.append(f"⚖️ WEIGHT {prim}: {weight}x ({reason})")
            modified = True

    # Remove weights
    for prim, reason in fixes.get("remove_weight", []):
        if prim in weights:
            del weights[prim]
            changes.append(f"⚖️ UNWEIGHT {prim}: ({reason})")
            modified = True

    if not modified:
        return changes + ["✓ No changes needed"]

    if not dry_run:
        # Bump version (done before any write so a bad version aborts cleanly)
        old_version = config.get("config_version", "1.0")
        new_version = _bump_minor_version(old_version)
        # One timestamp shared by the config and its changelog entry.
        timestamp = datetime.now(timezone.utc).isoformat()

        config["enabled"] = sorted(enabled)
        config["disabled"] = sorted(disabled)
        config["weights"] = dict(sorted(weights.items()))
        config["config_version"] = new_version
        config["config_updated_at"] = timestamp

        save_config(sector_code, config)

        # Log to changelog
        changelog = load_changelog()
        changelog.append({
            "sector": sector_code,
            "version": new_version,
            "previous_version": old_version,
            "timestamp": timestamp,
            "evidence": evidence,
            "changes": list(changes),
        })
        save_changelog(changelog)

        # Added after logging so the version line is report-only.
        changes.append(f"📝 Version: {old_version} → {new_version}")

    return changes
|
||||
|
||||
|
||||
def revert_config(sector_code: str, to_version: str | None = None) -> list[str]:
    """Report on a requested revert of a sector config.

    No revert is performed yet: the function only checks whether the
    changelog holds entries for *sector_code*. ``to_version`` is accepted
    but currently unused.

    Returns:
        A single-element list: an error line when the changelog has no
        entries for the sector, otherwise a "not yet implemented" notice.
    """
    history = load_changelog()

    # Collect the changelog entries recorded for this sector
    matching = []
    for entry in history:
        if entry["sector"] == sector_code:
            matching.append(entry)

    if matching:
        # TODO: Implement actual revert by storing full config snapshots
        return ["⚠️ Revert not yet implemented - manual restore required"]

    return [f"❌ No changelog entries for {sector_code}"]
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Modes, checked in this order:
      --show-changelog   print the changelog as JSON
      --revert SECTOR    print the result of ``revert_config``
      --apply/--dry-run  run ``apply_fixes`` for one sector (``--sector``)
                         or for every sector in ``SECTOR_SCOPED_FIXES``;
                         ``--dry-run`` takes precedence when both are given
    With none of these, the help text is printed.
    """
    parser = argparse.ArgumentParser(description="Guarded L1 config fixer")
    parser.add_argument("--apply", action="store_true", help="Apply sector-scoped fixes")
    parser.add_argument("--dry-run", action="store_true", help="Show what would change")
    parser.add_argument("--revert", metavar="SECTOR", help="Revert sector to previous version")
    parser.add_argument("--sector", help="Apply to specific sector only")
    parser.add_argument("--show-changelog", action="store_true", help="Show changelog")

    args = parser.parse_args()

    if args.show_changelog:
        print(json.dumps(load_changelog(), indent=2))
        return

    if args.revert:
        for line in revert_config(args.revert.upper()):
            print(line)
        return

    if not (args.apply or args.dry_run):
        parser.print_help()
        return

    banner = "=" * 60
    mode = "DRY RUN" if args.dry_run else "APPLYING FIXES"

    print(banner)
    print(f"L1 CONFIG FIXER V2 - {mode}")
    print(f"Threshold: {ENABLE_THRESHOLD_PCT}%")
    print(banner)

    if args.sector:
        targets = [args.sector.upper()]
    else:
        targets = SECTOR_SCOPED_FIXES.keys()

    for name in targets:
        if name not in SECTOR_SCOPED_FIXES:
            print(f"\n⚠️ {name}: No sector-scoped fixes defined")
            continue

        sector_fixes = SECTOR_SCOPED_FIXES[name]
        print(f"\n📁 {name}")
        print(f" Evidence: {sector_fixes['evidence']}")

        for line in apply_fixes(name, sector_fixes, dry_run=args.dry_run):
            print(f" {line}")

    print("\n" + banner)
    if args.dry_run:
        print("DRY RUN - No changes applied")
    else:
        print("Fixes applied - see CHANGELOG.json for history")
    print(banner)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user