Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -0,0 +1,409 @@
#!/usr/bin/env python3
"""
Backfill review_facts_v1 from public.jobs.reviews_data.
Parses relative timestamps ("17 hours ago", "2 weeks ago") into absolute
timestamps anchored to job.created_at.
Usage:
python backfill_review_facts.py
python backfill_review_facts.py --dry-run
python backfill_review_facts.py --job-id <uuid>
"""
import argparse
import asyncio
import json
import os
import re
from datetime import datetime, timedelta, timezone
from typing import Any
import asyncpg
# Database connection string. The DATABASE_URL environment variable wins;
# otherwise the built-in default below is used.
_DEFAULT_DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
DB_URL = os.getenv("DATABASE_URL", _DEFAULT_DB_URL)
# =============================================================================
# RELATIVE TIMESTAMP PARSER
# =============================================================================
# Regex patterns for relative timestamps, stored as (pattern, label) pairs.
# NOTE(review): nothing in the visible part of this file reads this list;
# parse_relative_timestamp() carries its own inline pattern. Confirm whether
# this table is still needed.
RELATIVE_PATTERNS = [
    # "17 hours ago", "2 weeks ago", "a month ago" (optionally prefixed by "edited")
    (r"(?:edited\s+)?(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", "standard"),
    # "just now"
    (r"just\s+now", "just_now"),
    # "yesterday"
    (r"yesterday", "yesterday"),
    # "today"
    (r"today", "today"),
]
# Time unit multipliers (in seconds)
TIME_UNITS = {
"second": 1,
"minute": 60,
"hour": 3600,
"day": 86400,
"week": 604800,
"month": 2592000, # 30 days
"year": 31536000, # 365 days
}
def parse_relative_timestamp(raw: str, reference_time: datetime) -> datetime | None:
"""
Parse a relative timestamp string into an absolute datetime.
Args:
raw: Relative timestamp like "17 hours ago", "Edited 2 weeks ago"
reference_time: The reference point (usually job.created_at)
Returns:
Absolute datetime or None if parsing failed
"""
if not raw:
return None
text = raw.lower().strip()
# Handle "just now"
if "just now" in text:
return reference_time
# Handle "yesterday"
if text == "yesterday":
return reference_time - timedelta(days=1)
# Handle "today"
if text == "today":
return reference_time
# Handle standard relative format
# Remove "edited " prefix if present
text = re.sub(r"^edited\s+", "", text)
# Match "N unit(s) ago"
match = re.match(r"(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", text)
if match:
quantity_str = match.group(1)
unit = match.group(2)
# Convert "a"/"an" to 1
if quantity_str in ("a", "an"):
quantity = 1
else:
quantity = int(quantity_str)
seconds = quantity * TIME_UNITS.get(unit, 0)
return reference_time - timedelta(seconds=seconds)
# Unknown format
return None
def parse_relative_timestamp_safe(raw: str, reference_time: datetime) -> tuple[datetime | None, bool]:
    """
    Never-raising wrapper around parse_relative_timestamp.

    Returns:
        (parsed_time, success). success is False both when the parser
        returns None and when it raises; parsed_time is None in either case.
    """
    try:
        parsed = parse_relative_timestamp(raw, reference_time)
    except Exception:
        # Any parser failure is reported as "not parsed" rather than propagated.
        return None, False
    return parsed, parsed is not None
# =============================================================================
# BACKFILL LOGIC
# =============================================================================
async def get_jobs_with_reviews(pool: asyncpg.Pool, job_id: str | None = None) -> list[dict]:
    """
    Fetch jobs whose reviews_data is a non-null JSONB array.

    Args:
        pool: Connection pool used to run the query
        job_id: When given, only that job is looked up; otherwise every
            qualifying job is returned, newest first

    Returns:
        One dict per row with job_id, created_at, reviews_data and
        business_id (the metadata business_name, falling back to url)
    """
    if not job_id:
        # No filter: all jobs, ordered by creation time descending.
        records = await pool.fetch("""
SELECT job_id, created_at, reviews_data,
COALESCE(metadata->>'business_name', url) as business_id
FROM public.jobs
WHERE reviews_data IS NOT NULL
AND jsonb_typeof(reviews_data) = 'array'
ORDER BY created_at DESC
""")
    else:
        # Single-job lookup.
        records = await pool.fetch("""
SELECT job_id, created_at, reviews_data,
COALESCE(metadata->>'business_name', url) as business_id
FROM public.jobs
WHERE job_id = $1
AND reviews_data IS NOT NULL
AND jsonb_typeof(reviews_data) = 'array'
""", job_id)
    return list(map(dict, records))
async def get_run_id_for_job(pool: asyncpg.Pool, job_id: str) -> str | None:
    """
    Look up a run_id recorded for the job in pipeline.detected_spans_v2.

    Returns:
        The run_id as a string, or None when no row with a non-null run_id
        exists for this job
    """
    record = await pool.fetchrow("""
SELECT DISTINCT run_id FROM pipeline.detected_spans_v2
WHERE job_id = $1 AND run_id IS NOT NULL
LIMIT 1
""", job_id)
    if not record:
        return None
    found = record["run_id"]
    return str(found) if found else None
async def get_language_for_review(pool: asyncpg.Pool, review_id: str) -> str | None:
    """
    Look up a detected language for the review in pipeline.detected_spans_v2.

    Returns:
        The language value of one matching span row, or None when no span
        row with a non-null language exists for this review
    """
    record = await pool.fetchrow("""
SELECT language FROM pipeline.detected_spans_v2
WHERE review_id = $1 AND language IS NOT NULL
LIMIT 1
""", review_id)
    if record:
        return record["language"]
    return None
async def upsert_review_facts(
    pool: asyncpg.Pool,
    facts: list[dict],
    dry_run: bool = False,
) -> tuple[int, int]:
    """
    Upsert review facts into pipeline.review_facts_v1, keyed on review_id.

    On conflict every column is overwritten, except run_id and language,
    which keep their stored value when the incoming value is NULL.

    Args:
        pool: Connection pool used for the write
        facts: Fact dicts; review_id, business_id and job_id are required
            keys, the remaining columns default to None when absent
        dry_run: When True nothing is written

    Returns:
        (number_of_records_sent, 0). Inserts and updates are not
        distinguished, so the second element is always 0. (0, 0) is
        returned for a dry run or an empty input.
    """
    if dry_run or not facts:
        return 0, 0

    required_columns = ("review_id", "business_id", "job_id")
    optional_columns = ("run_id", "rating", "review_time_utc", "raw_timestamp", "author", "language")

    # Positional order must match the $1..$9 placeholders in the statement.
    rows = [
        tuple(fact[column] for column in required_columns)
        + tuple(fact.get(column) for column in optional_columns)
        for fact in facts
    ]

    statement = """
INSERT INTO pipeline.review_facts_v1
(review_id, business_id, job_id, run_id, rating, review_time_utc, raw_timestamp, author, language)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
ON CONFLICT (review_id) DO UPDATE SET
business_id = EXCLUDED.business_id,
job_id = EXCLUDED.job_id,
run_id = COALESCE(EXCLUDED.run_id, pipeline.review_facts_v1.run_id),
rating = EXCLUDED.rating,
review_time_utc = EXCLUDED.review_time_utc,
raw_timestamp = EXCLUDED.raw_timestamp,
author = EXCLUDED.author,
language = COALESCE(EXCLUDED.language, pipeline.review_facts_v1.language)
"""
    await pool.executemany(statement, rows)
    return len(rows), 0
async def backfill_job(
    pool: asyncpg.Pool,
    job: dict,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Backfill review facts for a single job.

    Each review's relative timestamp is resolved against the job's
    created_at, and one fact per review with a review_id is upserted.

    Args:
        pool: Connection pool used for lookups and the final upsert
        job: Row dict with job_id, created_at, business_id and reviews_data
        dry_run: When True, skips the per-review language lookup and the upsert
        verbose: When True, prints a per-job summary

    Returns:
        Stats dict with job_id, total_reviews, parsed_ok, parsed_failed,
        inserted and up to five sample_failures. total_reviews counts every
        element of reviews_data, including elements skipped as unusable.
    """
    job_id = job["job_id"]
    job_created = job["created_at"]
    business_id = job["business_id"]
    reviews_data = job["reviews_data"]

    # asyncpg may return JSONB as string
    if isinstance(reviews_data, str):
        reviews_data = json.loads(reviews_data)

    # Make job_created timezone-aware if it isn't (naive values are taken as UTC)
    if job_created.tzinfo is None:
        job_created = job_created.replace(tzinfo=timezone.utc)

    # Get run_id for this job
    run_id = await get_run_id_for_job(pool, str(job_id))

    stats = {
        "job_id": str(job_id),
        "total_reviews": 0,
        "parsed_ok": 0,
        "parsed_failed": 0,
        "inserted": 0,
        "sample_failures": [],
    }
    facts = []

    for review in reviews_data:
        stats["total_reviews"] += 1

        # Handle both dict and JSON string
        if isinstance(review, str):
            try:
                review = json.loads(review)
            except json.JSONDecodeError:
                continue

        # An array element (or a decoded string) can be any JSON type such as
        # a number, list or null. Only objects carry review fields, so skip
        # the rest instead of crashing on .get().
        if not isinstance(review, dict):
            continue

        review_id = review.get("review_id")
        if not review_id:
            continue

        raw_timestamp = review.get("timestamp", "")
        review_time, success = parse_relative_timestamp_safe(raw_timestamp, job_created)
        if success:
            stats["parsed_ok"] += 1
        else:
            stats["parsed_failed"] += 1
            if len(stats["sample_failures"]) < 5:
                stats["sample_failures"].append(raw_timestamp)

        # Get language from spans if available (one lookup per review;
        # skipped entirely on dry runs)
        language = await get_language_for_review(pool, review_id) if not dry_run else None

        facts.append({
            "review_id": review_id,
            "business_id": business_id,
            "job_id": job_id,
            "run_id": run_id,
            "rating": review.get("rating"),
            "review_time_utc": review_time,
            "raw_timestamp": raw_timestamp,
            "author": review.get("author"),
            "language": language,
        })

    # Upsert
    inserted, _ = await upsert_review_facts(pool, facts, dry_run=dry_run)
    stats["inserted"] = inserted

    if verbose:
        print(f" Job {job_id}: {stats['total_reviews']} reviews, "
              f"{stats['parsed_ok']} parsed OK, {stats['parsed_failed']} failed")
        if stats["sample_failures"]:
            print(f" Sample failures: {stats['sample_failures'][:3]}")

    return stats
async def backfill_all(
    pool: asyncpg.Pool,
    job_id: str | None = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Run backfill_job over every job with reviews (or just one job).

    Args:
        pool: Connection pool passed through to the per-job backfill
        job_id: Restrict the run to this job when given
        dry_run: Passed through; also prefixes the banner with "[DRY RUN] "
        verbose: Print a progress line per job

    Returns:
        Aggregate stats: jobs_processed, the summed per-job counters, and
        unique_failure_patterns (at most 20 distinct sample failures, in
        no particular order)
    """
    jobs = await get_jobs_with_reviews(pool, job_id)
    job_count = len(jobs)
    banner_prefix = "[DRY RUN] " if dry_run else ""
    print(f"\n{banner_prefix}Backfilling review_facts_v1 from {job_count} jobs...")

    summed_counters = ("total_reviews", "parsed_ok", "parsed_failed", "inserted")
    aggregate: dict[str, Any] = {"jobs_processed": 0}
    aggregate.update(dict.fromkeys(summed_counters, 0))
    failure_patterns = set()

    for position, job in enumerate(jobs, 1):
        if verbose:
            print(f"\n[{position}/{job_count}] Processing job {job['job_id']}...")
        job_stats = await backfill_job(pool, job, dry_run=dry_run, verbose=verbose)
        aggregate["jobs_processed"] += 1
        for counter in summed_counters:
            aggregate[counter] += job_stats[counter]
        failure_patterns.update(job_stats["sample_failures"])

    # Stored as a list (not a set) so the result is JSON-serializable.
    aggregate["unique_failure_patterns"] = list(failure_patterns)[:20]
    return aggregate
# =============================================================================
# CLI
# =============================================================================
async def main_async(args):
    """
    Open the connection pool, run the backfill and print the summary report.

    Args:
        args: Parsed CLI namespace providing job_id, dry_run and verbose

    The pool is closed even when the backfill raises.
    """
    pool = await asyncpg.create_pool(DB_URL)
    try:
        stats = await backfill_all(
            pool,
            job_id=args.job_id,
            dry_run=args.dry_run,
            verbose=args.verbose,
        )

        rule = "=" * 60
        # Guard against division by zero when no reviews were seen.
        denominator = max(stats["total_reviews"], 1)
        parsed_pct = stats["parsed_ok"] / denominator * 100
        failed_pct = stats["parsed_failed"] / denominator * 100

        print("\n" + rule)
        print("BACKFILL COMPLETE")
        print(rule)
        print(f"Jobs processed: {stats['jobs_processed']}")
        print(f"Total reviews: {stats['total_reviews']}")
        print(f"Timestamps parsed: {stats['parsed_ok']} ({parsed_pct:.1f}%)")
        print(f"Timestamps failed: {stats['parsed_failed']} ({failed_pct:.1f}%)")
        if not args.dry_run:
            print(f"Records upserted: {stats['inserted']}")

        unparsed = stats["unique_failure_patterns"]
        if unparsed:
            print(f"\nUnparsed timestamp patterns ({len(unparsed)}):")
            for pattern in unparsed[:10]:
                print(f' - "{pattern}"')

        # Coverage is the share of reviews whose timestamp parsed.
        coverage = parsed_pct
        if coverage < 90:
            print(f"\n⚠️ WARNING: Timestamp coverage is {coverage:.1f}% (target: >90%)")
        else:
            print(f"\n✅ Timestamp coverage: {coverage:.1f}%")
    finally:
        await pool.close()
def main():
    """Parse the command line and run the async backfill to completion."""
    cli = argparse.ArgumentParser(description="Backfill review_facts_v1")
    cli.add_argument("--job-id", help="Process a specific job only")
    cli.add_argument("--dry-run", action="store_true", help="Don't write to database")
    cli.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    asyncio.run(main_async(cli.parse_args()))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,226 @@
"""
Config Resolver - Standalone version for scripts.
Resolves L1 config + sector brief for classification.
"""
import json
import logging
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
# Filesystem layout. Everything is rooted at the "data" directory that sits
# next to this file's own directory.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
_PRIMITIVE_CONFIGS_DIR = DATA_DIR.joinpath("primitive_configs")
CONFIGS_DIR = _PRIMITIVE_CONFIGS_DIR.joinpath("l1")  # sector-level (L1) configs
L2_CONFIGS_DIR = _PRIMITIVE_CONFIGS_DIR.joinpath("l2")  # L2 delta configs
BRIEFS_DIR = DATA_DIR.joinpath("sector_briefs")
# Meta primitives: merged into every resolved enabled set, whatever the
# sector config says.
META_PRIMITIVES = frozenset({
    "HONESTY",
    "ETHICS",
    "PROMISES",
    "ACKNOWLEDGMENT",
    "RESPONSE_QUALITY",
    "RECOVERY",
    "RETURN_INTENT",
    "RECOMMEND",
    "RECOGNITION",
    "UNMAPPED",
})
# Core primitives dictionary.
# Maps a primitive code to its metadata:
#   "domain": single-letter group code (O, P, J, E or V in this table)
#   "name":   display name
#   "def":    short definition
# ConfigResolver.build_primitives_for_prompt() copies these entries and adds
# a "weight" key when the resolved config supplies one.
CORE_PRIMITIVES = {
    # domain "O"
    "TASTE": {"domain": "O", "name": "Taste/Flavor", "def": "Sensory quality of food/beverage"},
    "CRAFT": {"domain": "O", "name": "Craftsmanship", "def": "Skill of execution/preparation"},
    "FRESHNESS": {"domain": "O", "name": "Freshness", "def": "Newness, not stale or old"},
    "TEMPERATURE": {"domain": "O", "name": "Temperature", "def": "Hot/cold as expected"},
    "EFFECTIVENESS": {"domain": "O", "name": "Effectiveness", "def": "Achieves intended purpose"},
    "ACCURACY": {"domain": "O", "name": "Accuracy", "def": "Correct, as ordered/specified"},
    "CONDITION": {"domain": "O", "name": "Condition", "def": "Physical state, wear, damage"},
    "CONSISTENCY": {"domain": "O", "name": "Consistency", "def": "Same quality each time"},
    # domain "P"
    "MANNER": {"domain": "P", "name": "Manner/Attitude", "def": "Friendliness, respect, warmth"},
    "COMPETENCE": {"domain": "P", "name": "Competence", "def": "Knowledge and skill of staff"},
    "ATTENTIVENESS": {"domain": "P", "name": "Attentiveness", "def": "Being present, responsive"},
    "COMMUNICATION": {"domain": "P", "name": "Communication", "def": "Clarity, listening, updates"},
    # domain "J"
    "SPEED": {"domain": "J", "name": "Speed/Wait", "def": "Time to service, waiting"},
    "FRICTION": {"domain": "J", "name": "Friction", "def": "Obstacles, hassles, complexity"},
    "RELIABILITY": {"domain": "J", "name": "Reliability", "def": "Dependable, keeps promises"},
    "AVAILABILITY": {"domain": "J", "name": "Availability", "def": "Open when needed, bookable"},
    # domain "E"
    "CLEANLINESS": {"domain": "E", "name": "Cleanliness", "def": "Hygiene, tidiness"},
    "COMFORT": {"domain": "E", "name": "Comfort", "def": "Physical ease, seating"},
    "SAFETY": {"domain": "E", "name": "Safety", "def": "Free from harm/danger"},
    "AMBIANCE": {"domain": "E", "name": "Ambiance", "def": "Atmosphere, mood, vibe"},
    "ACCESSIBILITY": {"domain": "E", "name": "Accessibility", "def": "Easy to reach, navigate"},
    "DIGITAL_UX": {"domain": "E", "name": "Digital Experience", "def": "Website, app, online"},
    # domain "V"
    "PRICE_LEVEL": {"domain": "V", "name": "Price Level", "def": "Absolute cost (cheap/expensive)"},
    "PRICE_FAIRNESS": {"domain": "V", "name": "Price Fairness", "def": "Reasonable for what you get"},
    "PRICE_TRANSPARENCY": {"domain": "V", "name": "Price Transparency", "def": "No hidden fees, clear pricing"},
    "VALUE_FOR_MONEY": {"domain": "V", "name": "Value for Money", "def": "Worth what you paid"},
}
class ConfigResolver:
"""Resolves classification config for a business."""
def __init__(self):
self._l1_cache: dict[str, dict] = {}
self._l2_cache: dict[str, dict] = {}
self._brief_cache: dict[str, dict] = {}
def _load_l2_configs(self) -> list[dict[str, Any]]:
"""Load all L2 config files."""
if not L2_CONFIGS_DIR.exists():
return []
configs = []
for config_path in L2_CONFIGS_DIR.glob("*_config.json"):
try:
with open(config_path) as f:
config = json.load(f)
configs.append(config)
except Exception as e:
logger.warning(f"Failed to load L2 config {config_path}: {e}")
return configs
def _find_matching_l2(self, gbp_path: str) -> dict[str, Any] | None:
"""Find L2 config that matches the GBP path (most specific wins)."""
l2_configs = self._load_l2_configs()
# Find all matching configs (path starts with L2 gbp_path)
matches = []
for config in l2_configs:
l2_path = config.get("gbp_path", "")
if gbp_path.startswith(l2_path) or gbp_path == l2_path:
matches.append((len(l2_path), config))
if not matches:
return None
# Return most specific match (longest path)
matches.sort(key=lambda x: x[0], reverse=True)
return matches[0][1]
def _apply_l2_delta(self, l1_config: dict, l2_config: dict) -> dict:
"""Apply L2 delta to L1 config."""
result = l1_config.copy()
delta = l2_config.get("delta", {})
# Enable additional primitives
if "enable" in delta:
enabled = set(result.get("enabled", []))
enabled.update(delta["enable"])
result["enabled"] = list(enabled)
# Merge weights
if "weights" in delta:
weights = dict(result.get("weights", {}))
weights.update(delta["weights"])
result["weights"] = weights
# Update config version to indicate L2
result["config_version"] = l2_config.get("config_version", result.get("config_version", "1.0"))
result["l2_applied"] = l2_config.get("gbp_path")
return result
def _load_l1_config(self, sector_code: str) -> dict[str, Any] | None:
if sector_code in self._l1_cache:
return self._l1_cache[sector_code]
config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
if not config_path.exists():
return None
with open(config_path) as f:
config = json.load(f)
self._l1_cache[sector_code] = config
return config
def _load_sector_brief(self, sector_code: str) -> dict[str, Any] | None:
if sector_code in self._brief_cache:
return self._brief_cache[sector_code]
brief_path = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
if not brief_path.exists():
return None
with open(brief_path) as f:
brief = json.load(f)
self._brief_cache[sector_code] = brief
return brief
async def get_business_mapping(self, pool, business_id: str) -> dict[str, Any] | None:
query = """
SELECT business_id, gbp_path::text, sector_code
FROM pipeline.business_taxonomy_map
WHERE business_id = $1
"""
row = await pool.fetchrow(query, business_id)
return dict(row) if row else None
def resolve_enabled_set(self, l1_config: dict) -> set[str]:
enabled = set(l1_config.get("enabled", []))
enabled.update(META_PRIMITIVES)
return enabled
def build_primitives_for_prompt(self, enabled: set[str], weights: dict[str, float]) -> dict[str, dict]:
result = {}
for prim in enabled:
if prim in CORE_PRIMITIVES:
entry = CORE_PRIMITIVES[prim].copy()
if prim in weights:
entry["weight"] = weights[prim]
result[prim] = entry
elif prim in META_PRIMITIVES:
result[prim] = {"domain": "M", "name": prim.replace("_", " ").title(), "meta": True}
return result
def extract_brief_signals(self, brief: dict) -> dict[str, Any]:
if not brief:
return {}
return {
"sector": brief.get("sector_code"),
"what_customers_judge": brief.get("what_customers_judge"),
"critical_pain_points": brief.get("critical_pain_points"),
"industry_terminology": brief.get("industry_terminology"),
}
async def resolve(self, business_id: str, pool, mode: str | None = None) -> dict[str, Any] | None:
mapping = await self.get_business_mapping(pool, business_id)
if not mapping:
return None
sector_code = mapping["sector_code"]
gbp_path = mapping["gbp_path"]
# Load L1 config (sector-level)
l1_config = self._load_l1_config(sector_code)
if not l1_config:
l1_config = {"enabled": list(CORE_PRIMITIVES.keys()), "weights": {}}
# Check for L2 config (category-level delta)
l2_config = self._find_matching_l2(gbp_path)
if l2_config:
logger.info(f"Applying L2 delta for {gbp_path}: {l2_config.get('gbp_path')}")
l1_config = self._apply_l2_delta(l1_config, l2_config)
brief = self._load_sector_brief(sector_code)
enabled = self.resolve_enabled_set(l1_config)
weights = dict(l1_config.get("weights", {}))
primitives = self.build_primitives_for_prompt(enabled, weights)
brief_signals = self.extract_brief_signals(brief)
return {
"business_id": business_id,
"gbp_path": gbp_path,
"sector_code": sector_code,
"config_version": l1_config.get("config_version", "1.0"),
"l2_applied": l1_config.get("l2_applied"),
"modes": [mode] if mode else ["in_person"],
"default_mode": mode or "in_person",
"enabled_primitives": sorted(enabled),
"disabled_primitives": sorted(l1_config.get("disabled", [])),
"weights": weights,
"brief": brief_signals,
"primitives": primitives,
}

View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
Fix L1 configs based on validation results.
Applies fixes discovered during validation:
1. Enable primitives that were disabled but appearing frequently
2. Remove weights for primitives with zero appearances
3. Add weights for high-frequency unweighted primitives
"""
import json
from pathlib import Path
# Directory holding the per-sector L1 config files; fix_config() looks for
# "<sector_code lowercased>_config.json" in here.
CONFIGS_DIR = Path(__file__).parent.parent / "data" / "primitive_configs" / "l1"
# Fixes based on validation results
# Format: { sector: { "enable": [primitives], "disable": [primitives], "add_weight": {prim: weight}, "remove_weight": [prims] } }
# Every sector below currently only enables primitives; ENTERTAINMENT also
# removes one weight. All "disable" and "add_weight" entries are empty.
FIXES = {
    "ENTERTAINMENT": {
        "enable": ["CRAFT", "CONSISTENCY", "COMMUNICATION", "FRICTION"],
        "disable": [],
        "add_weight": {},
        "remove_weight": ["CONDITION"],  # 0 appearances despite 1.4x weight
    },
    "FOOD_DINING": {
        "enable": ["PRICE_LEVEL", "ACCESSIBILITY", "PRICE_TRANSPARENCY", "FRICTION", "EFFECTIVENESS"],
        "disable": [],
        "add_weight": {},
        "remove_weight": [],
    },
    "AUTOMOTIVE": {
        "enable": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL", "AMBIANCE"],
        "disable": [],
        "add_weight": {},
        "remove_weight": [],
    },
    "HEALTHCARE": {
        "enable": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
        "disable": [],
        "add_weight": {},
        "remove_weight": [],
    },
    "RETAIL_SHOPPING": {
        "enable": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
        "disable": [],
        "add_weight": {},
        "remove_weight": [],
    },
    "HOSPITALITY_TRAVEL": {
        "enable": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL"],
        "disable": [],
        "add_weight": {},
        "remove_weight": [],
    },
    "PERSONAL_SERVICES": {
        "enable": ["PRICE_LEVEL", "SPEED", "FRICTION"],
        "disable": [],
        "add_weight": {},
        "remove_weight": [],
    },
}
def fix_config(sector_code: str, fixes: dict) -> "list[str] | None":
    """Apply fixes to a sector config.

    Args:
        sector_code: Sector whose "<sector_code lowercased>_config.json" file
            in CONFIGS_DIR is edited
        fixes: Dict with optional keys "enable" and "disable" (lists of
            primitives), "add_weight" ({primitive: weight}) and
            "remove_weight" (list of primitives)

    Returns:
        A list with one human-readable line per change made, or None when
        the config file does not exist. An empty list means every fix was
        already in place; in that case the file is left untouched and its
        version is not changed.
    """
    config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    if not config_path.exists():
        print(f" ⚠️ Config not found: {config_path}")
        return None

    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})
    changes = []

    # Apply enables (move from disabled to enabled)
    for prim in fixes.get("enable", []):
        if prim in disabled:
            disabled.remove(prim)
            enabled.add(prim)
            changes.append(f"✓ Enabled {prim}")
        elif prim not in enabled:
            enabled.add(prim)
            changes.append(f"✓ Added {prim} to enabled")

    # Apply disables (move from enabled to disabled)
    for prim in fixes.get("disable", []):
        if prim in enabled:
            enabled.remove(prim)
            disabled.add(prim)
            changes.append(f"✗ Disabled {prim}")

    # Add weights (an existing weight is never overwritten)
    for prim, weight in fixes.get("add_weight", {}).items():
        if prim not in weights:
            weights[prim] = weight
            changes.append(f"⚖️ Added weight {prim}: {weight}x")

    # Remove weights
    for prim in fixes.get("remove_weight", []):
        if prim in weights:
            del weights[prim]
            changes.append(f"⚖️ Removed weight for {prim}")

    # Nothing to do: do not rewrite the file, and in particular do not reset
    # a config_version that may already be newer than "1.1".
    if not changes:
        return changes

    # Update config
    config["enabled"] = sorted(enabled)
    config["disabled"] = sorted(disabled)
    config["weights"] = dict(sorted(weights.items()))
    config["config_version"] = "1.1"  # Bump version

    # Save
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
        f.write("\n")

    return changes
def main():
    """Apply every entry of FIXES and print a per-sector change report."""
    rule = "=" * 60
    print(rule)
    print("L1 CONFIG FIXER - Applying validation-based fixes")
    print(rule)

    total_changes = 0
    for sector, sector_fixes in FIXES.items():
        print(f"\n📁 {sector}")
        applied = fix_config(sector, sector_fixes)
        if not applied:
            # Covers both "config missing" (None) and "nothing to change" ([]).
            print(" No changes applied")
            continue
        for line in applied:
            print(f" {line}")
        total_changes += len(applied)

    print(f"\n{rule}")
    print(f"Total changes applied: {total_changes}")
    print("Config version bumped to 1.1")
    print(rule)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,238 @@
#!/usr/bin/env python3
"""
Guarded L1 Config Fixer - V2 (Threshold-based, Sector-scoped)
Only applies fixes when:
1. Evidence is from sector-scoped validation
2. Frequency exceeds threshold (default 3%)
3. Changes are logged with version bump
Usage:
python fix_l1_configs_v2.py --apply # Apply fixes from validation
python fix_l1_configs_v2.py --dry-run # Show what would change
python fix_l1_configs_v2.py --revert SECTOR # Revert to previous version
"""
import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Directory holding the per-sector L1 config files
# ("<sector_code lowercased>_config.json") and the changelog written by
# apply_fixes().
CONFIGS_DIR = Path(__file__).parent.parent / "data" / "primitive_configs" / "l1"
CHANGELOG_FILE = CONFIGS_DIR / "CHANGELOG.json"
# Minimum threshold for auto-enabling (% of sector spans)
ENABLE_THRESHOLD_PCT = 3.0
# Fixes derived from sector-scoped validation (validate_l1_configs_v2.py output)
# These are the ONLY fixes that should be applied
# Entry shapes, as unpacked by apply_fixes():
#   "enable":        (primitive, pct_of_sector_spans, reason)
#   "add_weight":    (primitive, weight, reason)
#   "remove_weight": (primitive, reason)
SECTOR_SCOPED_FIXES = {
    "ENTERTAINMENT": {
        "evidence": "2,320 spans from Go Karts + Soho Club",
        "enable": [
            ("TASTE", 4.3, "Entertainment venues have concessions/food service"),
        ],
        "add_weight": [
            ("CRAFT", 1.3, "13.4% frequency but unweighted"),
        ],
        "remove_weight": [],
    },
    "FOOD_DINING": {
        "evidence": "61 spans from Fika cafe",
        "enable": [
            ("COMFORT", 9.8, "Seating/atmosphere comfort matters for cafes"),
        ],
        "add_weight": [
            ("AVAILABILITY", 1.2, "16.4% frequency but unweighted"),
        ],
        "remove_weight": [
            # Note: Small sample size (61 spans) - these may be false negatives
            # Keep weights but flag for review with more data
        ],
    },
    "AUTOMOTIVE": {
        "evidence": "1,201 spans from ClickRent car rental",
        "enable": [],  # Nothing exceeds 3% threshold
        "add_weight": [],
        "remove_weight": [
            # CONDITION, HONESTY, PROMISES, RECOVERY all have 0 appearances
            # However, may be specific to rental vs repair - keep for now
        ],
    },
}
def load_changelog() -> list[dict]:
    """Return the parsed contents of CHANGELOG_FILE, or [] if it does not exist."""
    if not CHANGELOG_FILE.exists():
        return []
    with open(CHANGELOG_FILE) as changelog_fh:
        entries = json.load(changelog_fh)
    return entries
def save_changelog(entries: list[dict]) -> None:
    """Write *entries* to CHANGELOG_FILE as indented JSON with a trailing newline.

    The parent directory is created first if it is missing.
    """
    target_dir = CHANGELOG_FILE.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(CHANGELOG_FILE, "w") as changelog_fh:
        json.dump(entries, changelog_fh, indent=2)
        changelog_fh.write("\n")
def load_config(sector_code: str) -> dict[str, Any] | None:
"""Load a sector config."""
config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
if not config_path.exists():
return None
with open(config_path) as f:
return json.load(f)
def save_config(sector_code: str, config: dict[str, Any]) -> None:
    """Write *config* to the sector's L1 config file.

    Output is indented JSON followed by a newline; the file name is the
    lower-cased sector code plus "_config.json".
    """
    destination = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    with open(destination, "w") as config_fh:
        json.dump(config, config_fh, indent=2)
        config_fh.write("\n")
def _next_minor_version(version) -> str:
    """Return *version* with its last dot-separated number incremented."""
    head, dot, tail = str(version).rpartition(".")
    if dot and tail.isdigit():
        return f"{head}.{int(tail) + 1}"
    # Not in "x.y" form (e.g. "1"): start a minor series instead of failing.
    return f"{version}.1"


def apply_fixes(sector_code: str, fixes: dict, dry_run: bool = False) -> list[str]:
    """Apply fixes to a sector config.

    Args:
        sector_code: Sector whose L1 config is edited
        fixes: Dict with "evidence" (free text) and optional lists
            "enable" [(primitive, pct, reason)], "add_weight"
            [(primitive, weight, reason)] and "remove_weight"
            [(primitive, reason)]
        dry_run: When True, report what would change without writing

    Returns:
        Human-readable report lines. Entries below ENABLE_THRESHOLD_PCT are
        reported as SKIP and do not count as changes. The config is saved,
        its version bumped and a changelog entry written only when at least
        one real change was made and dry_run is False.
    """
    config = load_config(sector_code)
    if not config:
        return [f"❌ Config not found for {sector_code}"]

    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})
    changes = []  # report lines, including SKIP notices
    modified = False  # True only once the config content actually changes
    evidence = fixes.get("evidence", "unknown")

    # Enable primitives
    for prim, pct, reason in fixes.get("enable", []):
        if pct < ENABLE_THRESHOLD_PCT:
            changes.append(f"⚠️ SKIP {prim}: {pct:.1f}% below {ENABLE_THRESHOLD_PCT}% threshold")
            continue
        if prim in disabled:
            disabled.remove(prim)
            enabled.add(prim)
            modified = True
            changes.append(f"✓ ENABLE {prim}: {pct:.1f}% in sector data ({reason})")
        elif prim not in enabled:
            enabled.add(prim)
            modified = True
            changes.append(f"✓ ADD {prim}: {pct:.1f}% in sector data ({reason})")

    # Add weights (an existing weight is never overwritten)
    for prim, weight, reason in fixes.get("add_weight", []):
        if prim not in weights:
            weights[prim] = weight
            modified = True
            changes.append(f"⚖️ WEIGHT {prim}: {weight}x ({reason})")

    # Remove weights
    for prim, reason in fixes.get("remove_weight", []):
        if prim in weights:
            del weights[prim]
            modified = True
            changes.append(f"⚖️ UNWEIGHT {prim}: ({reason})")

    if not changes:
        return ["✓ No changes needed"]

    # SKIP notices alone must not bump the version or touch any file.
    if dry_run or not modified:
        return changes

    # Bump version
    old_version = config.get("config_version", "1.0")
    new_version = _next_minor_version(old_version)
    timestamp = datetime.now(timezone.utc).isoformat()

    config["enabled"] = sorted(enabled)
    config["disabled"] = sorted(disabled)
    config["weights"] = dict(sorted(weights.items()))
    config["config_version"] = new_version
    config["config_updated_at"] = timestamp
    save_config(sector_code, config)

    # Log to changelog (a copy, so the version line added below stays out of it)
    changelog = load_changelog()
    changelog.append({
        "sector": sector_code,
        "version": new_version,
        "previous_version": old_version,
        "timestamp": timestamp,
        "evidence": evidence,
        "changes": list(changes),
    })
    save_changelog(changelog)

    changes.append(f"📝 Version: {old_version} → {new_version}")
    return changes
def revert_config(sector_code: str, to_version: str | None = None) -> list[str]:
    """Report on reverting a sector config (placeholder, nothing is restored).

    Args:
        sector_code: Sector to look up in the changelog
        to_version: Currently unused

    Returns:
        A single-line report: an error when the changelog has no entry for
        the sector, otherwise a notice that revert is not implemented.
    """
    history = list(filter(lambda entry: entry["sector"] == sector_code, load_changelog()))
    if len(history) == 0:
        return [f"❌ No changelog entries for {sector_code}"]
    # TODO: Implement actual revert by storing full config snapshots
    return ["⚠️ Revert not yet implemented - manual restore required"]
def main():
    """CLI entry point: show the changelog, revert, or apply/dry-run fixes.

    Flags are checked in that order and the first match wins; with no
    action flag the help text is printed.
    """
    parser = argparse.ArgumentParser(description="Guarded L1 config fixer")
    parser.add_argument("--apply", action="store_true", help="Apply sector-scoped fixes")
    parser.add_argument("--dry-run", action="store_true", help="Show what would change")
    parser.add_argument("--revert", metavar="SECTOR", help="Revert sector to previous version")
    parser.add_argument("--sector", help="Apply to specific sector only")
    parser.add_argument("--show-changelog", action="store_true", help="Show changelog")
    args = parser.parse_args()

    if args.show_changelog:
        print(json.dumps(load_changelog(), indent=2))
        return

    if args.revert:
        for line in revert_config(args.revert.upper()):
            print(line)
        return

    if not (args.apply or args.dry_run):
        parser.print_help()
        return

    rule = "=" * 60
    mode_label = "DRY RUN" if args.dry_run else "APPLYING FIXES"
    print(rule)
    print(f"L1 CONFIG FIXER V2 - {mode_label}")
    print(f"Threshold: {ENABLE_THRESHOLD_PCT}%")
    print(rule)

    if args.sector:
        sectors = [args.sector.upper()]
    else:
        sectors = SECTOR_SCOPED_FIXES.keys()

    for sector in sectors:
        if sector not in SECTOR_SCOPED_FIXES:
            print(f"\n⚠️ {sector}: No sector-scoped fixes defined")
            continue
        sector_fixes = SECTOR_SCOPED_FIXES[sector]
        print(f"\n📁 {sector}")
        print(f" Evidence: {sector_fixes['evidence']}")
        # --dry-run takes precedence when combined with --apply.
        for line in apply_fixes(sector, sector_fixes, dry_run=args.dry_run):
            print(f" {line}")

    print("\n" + rule)
    if args.dry_run:
        print("DRY RUN - No changes applied")
    else:
        print("Fixes applied - see CHANGELOG.json for history")
    print(rule)
    return


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,372 @@
#!/usr/bin/env python3
"""
Wave 0: Sector Brief Generator
Generates alignment context briefs for each sector.
These briefs inform Wave 1 and Wave 2 primitive config generation.
Usage:
python generate_sector_briefs.py # Generate all sectors
python generate_sector_briefs.py --sector FOOD_DINING # Generate one sector
python generate_sector_briefs.py --dry-run # Show what would be generated
python generate_sector_briefs.py --validate # Validate existing briefs
"""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
try:
from openai import OpenAI
except ImportError:
print("ERROR: openai package required. Install with: pip install openai")
sys.exit(1)
# Prompt sent to the LLM for one sector. generate_sector_brief() fills it in
# with str.format() using sector_code, sector_name, description and
# business_types, which is why every literal brace of the embedded JSON
# schema is doubled ({{ / }}).
PROMPT_TEMPLATE = '''You are an expert in customer experience analysis across industries.
Your task: Generate a **sector brief** for the "{sector_name}" sector.
This brief will be used to align classification agents with industry-specific context.
It describes what customers care about — NOT how to classify, NOT what primitives to use.
## Sector Information
- **Code**: {sector_code}
- **Name**: {sector_name}
- **Description**: {description}
- **Sample Business Types**: {business_types}
## Output Requirements
Generate a JSON object with this exact structure:
```json
{{
"sector_code": "{sector_code}",
"sector_name": "{sector_name}",
"generated_at": "<ISO timestamp>",
"version": "1.0",
"what_customers_judge": {{
"description": "The primary dimensions customers evaluate in this sector",
"items": [
{{
"aspect": "string (2-5 words)",
"importance": "critical | high | moderate",
"why_it_matters": "string (1 sentence)"
}}
]
}},
"critical_pain_points": {{
"description": "What damages reputation most severely",
"items": [
{{
"pain_point": "string (2-5 words)",
"typical_language": ["phrases customers actually use in reviews"],
"reputation_impact": "severe | significant | moderate"
}}
]
}},
"common_praise": {{
"description": "What earns customer loyalty and positive reviews",
"items": [
{{
"praise_area": "string (2-5 words)",
"typical_language": ["phrases customers actually use in reviews"],
"loyalty_impact": "high | moderate"
}}
]
}},
"industry_terminology": {{
"description": "Domain-specific vocabulary",
"staff_terms": ["terms for staff roles in this sector"],
"product_terms": ["terms for products/services"],
"process_terms": ["terms for processes/interactions"],
"quality_terms": ["positive quality descriptors"],
"problem_terms": ["negative quality descriptors"]
}},
"mode_specific_concerns": {{
"description": "Different service modes have different priorities",
"modes": [
{{
"mode": "string (e.g., 'In-person', 'Online', 'Phone')",
"primary_concerns": ["top concerns for this mode"],
"unique_pain_points": ["pain points specific to this mode"]
}}
]
}},
"what_is_actionable": {{
"description": "Feedback businesses can act on",
"actionable_examples": [
{{
"feedback_type": "string",
"example": "string (realistic review excerpt)",
"action_owner": "role/team that can fix it"
}}
],
"not_actionable_examples": [
{{
"feedback_type": "string",
"example": "string (realistic review excerpt)",
"why_not_actionable": "string"
}}
]
}},
"sector_specific_signals": {{
"description": "Signals with sector-specific meaning",
"examples": [
{{
"signal": "string (word or phrase)",
"meaning_in_this_sector": "string",
"contrast_with": "how it differs in other sectors"
}}
]
}}
}}
```
## Critical Rules
1. **Use realistic review language** in `typical_language` arrays - actual phrases customers write
2. **Include 4-8 items** per array (not too few, not excessive)
3. **Be sector-specific** - don't use generic phrases that apply to all businesses
4. **Include appropriate modes** - only modes that actually exist in this sector
5. **NO primitive codes, priorities, weights, or solutions**
6. **Focus on WHAT customers care about**, not HOW to classify it
Return ONLY the JSON object, no markdown formatting or explanation.'''
def load_sectors(data_path: Path) -> list[dict]:
    """Load the sector definitions from a JSON file.

    Args:
        data_path: Path to a JSON document whose top-level object has a
            ``"sectors"`` key.

    Returns:
        The value stored under the ``"sectors"`` key.

    Raises:
        KeyError: If the document has no ``"sectors"`` key.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale encoding when reading it.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)
    return data["sectors"]
def generate_sector_brief(client: OpenAI, sector: dict, model: str) -> dict:
    """Generate a sector brief using the LLM.

    Args:
        client: OpenAI client used for the chat-completions call.
        sector: Sector definition; must provide ``sector_code``,
            ``sector_name``, ``description`` and ``sample_business_types``.
        model: Name of the model to request.

    Returns:
        The parsed JSON brief, with ``sector_code``, ``sector_name``,
        ``generated_at`` and ``version`` set (overwriting any values the
        model returned for those keys).

    Raises:
        KeyError: If ``sector`` lacks one of the required fields.
        ValueError: If the model returns no content. Invalid JSON raises
            ``json.JSONDecodeError``, which is itself a ``ValueError``.
    """
    # Local import: only the ``datetime`` class is known to be in scope here.
    from datetime import timezone

    prompt = PROMPT_TEMPLATE.format(
        sector_code=sector["sector_code"],
        sector_name=sector["sector_name"],
        description=sector["description"],
        business_types=", ".join(sector["sample_business_types"]),
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert customer experience analyst. Return only valid JSON, no markdown."
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
        max_tokens=4000,
        response_format={"type": "json_object"},
    )

    # ``content`` can be None (e.g. a refusal); fail with a clear message
    # instead of an AttributeError on ``None.strip()``.
    content = response.choices[0].message.content
    if not content or not content.strip():
        raise ValueError("Model returned an empty response")

    # Parse JSON
    brief = json.loads(content.strip())

    # Ensure required fields
    brief["sector_code"] = sector["sector_code"]
    brief["sector_name"] = sector["sector_name"]
    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the "+00:00" offset is rewritten so the stored format still ends in "Z".
    brief["generated_at"] = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    brief["version"] = "1.0"
    return brief
def validate_brief(brief: dict) -> list[str]:
    """Validate a sector brief and return a list of human-readable issues.

    Checks performed:
      * every required top-level section is present;
      * ``what_customers_judge`` has between 3 and 10 items;
      * ``critical_pain_points`` and ``common_praise`` have at least 3 items;
      * the serialized brief does not contain classifier vocabulary
        ("priority", "weight", "primitive", "enabled", "disabled").

    A section may be either ``{"items": [...]}`` or a bare list; anything
    else is treated as having zero items.

    Args:
        brief: Parsed brief document.

    Returns:
        A list of issue strings; empty when the brief passes every check.
    """
    issues = []
    required_keys = [
        "what_customers_judge",
        "critical_pain_points",
        "common_praise",
        "industry_terminology",
        "mode_specific_concerns",
        "what_is_actionable",
        "sector_specific_signals",
    ]
    for key in required_keys:
        if key not in brief:
            issues.append(f"Missing required key: {key}")

    def _item_count(key: str) -> int:
        """Count the items of a section given as a dict or a bare list."""
        section = brief[key]
        if isinstance(section, dict):
            section = section.get("items", [])
        return len(section) if isinstance(section, list) else 0

    # (section, minimum items, maximum items or None for "no upper bound")
    length_rules = (
        ("what_customers_judge", 3, 10),
        ("critical_pain_points", 3, None),
        ("common_praise", 3, None),
    )
    for key, minimum, maximum in length_rules:
        if key not in brief:
            continue
        count = _item_count(key)
        if count < minimum:
            issues.append(f"{key} has only {count} items (need {minimum}+)")
        if maximum is not None and count > maximum:
            issues.append(f"{key} has {count} items (max {maximum})")

    # Check for forbidden content. This is a plain substring match on the
    # serialized document. "solution" is intentionally not flagged: it can
    # legitimately appear in review language.
    text = json.dumps(brief).lower()
    forbidden = ["priority", "weight", "primitive", "enabled", "disabled"]
    for word in forbidden:
        if word in text:
            issues.append(f"Contains potentially forbidden term: {word}")
    return issues
def save_brief(brief: dict, output_dir: Path) -> Path:
    """Write a brief to ``<output_dir>/<sector_code lowercased>_brief.json``.

    The output directory (and any missing parents) is created first. The
    document is serialized with a two-space indent.

    Args:
        brief: Brief to persist; its ``sector_code`` names the file.
        output_dir: Directory that receives the file.

    Returns:
        Path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    sector_slug = brief['sector_code'].lower()
    target = output_dir / f"{sector_slug}_brief.json"
    with open(target, "w") as handle:
        json.dump(brief, handle, indent=2)
    return target
def validate_existing_briefs(output_dir: Path) -> None:
    """Validate every ``*_brief.json`` file in a directory and print a report.

    Each file is listed with a pass/fail marker followed by its issues.
    A file that is not valid JSON is reported as an issue for that file
    instead of aborting the whole run.

    Args:
        output_dir: Directory containing the brief files.
    """
    if not output_dir.exists():
        print(f"Output directory does not exist: {output_dir}")
        return
    files = sorted(output_dir.glob("*_brief.json"))
    if not files:
        print("No brief files found")
        return
    print(f"Validating {len(files)} brief files...\n")
    all_valid = True
    for filepath in files:
        try:
            with open(filepath, encoding="utf-8") as f:
                brief = json.load(f)
        except json.JSONDecodeError as exc:
            issues = [f"Invalid JSON: {exc}"]
        else:
            issues = validate_brief(brief)
        # Distinct markers so a failing file is visible at a glance
        # (same glyphs the generation loop in main() prints).
        status = "✓" if not issues else "✗"
        print(f"{status} {filepath.name}")
        if issues:
            all_valid = False
            for issue in issues:
                print(f" - {issue}")
    print()
    if all_valid:
        print("All briefs valid!")
    else:
        print("Some briefs have issues.")
def main():
    """Command-line entry point for generating or validating sector briefs.

    Modes, in order of precedence:
      * ``--validate``: validate the briefs already in the output directory.
      * ``--dry-run``: list the sectors that would be generated.
      * otherwise: generate, validate and save one brief per sector.

    Exits with status 1 when the requested sector is unknown, when
    ``OPENAI_API_KEY`` is unset, or when any sector failed to generate.
    """
    cli = argparse.ArgumentParser(description="Generate sector briefs for Wave 0")
    cli.add_argument("--sector", help="Generate only this sector code")
    cli.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    cli.add_argument("--validate", action="store_true", help="Validate existing briefs")
    cli.add_argument("--output-dir", default="data/sector_briefs", help="Output directory")
    cli.add_argument("--model", default="gpt-4o", help="OpenAI model to use")
    args = cli.parse_args()

    # All paths are resolved relative to the parent of this script's directory.
    package_dir = Path(__file__).parent.parent
    data_path = package_dir / "data" / "sectors.json"
    output_dir = package_dir / args.output_dir

    if args.validate:
        validate_existing_briefs(output_dir)
        return

    sectors = load_sectors(data_path)
    print(f"Loaded {len(sectors)} sectors")

    if args.sector:
        sectors = [entry for entry in sectors if entry["sector_code"] == args.sector]
        if not sectors:
            print(f"ERROR: Sector '{args.sector}' not found")
            sys.exit(1)

    if args.dry_run:
        print("\n[DRY RUN] Would generate briefs for:")
        for entry in sectors:
            print(f" - {entry['sector_code']}: {entry['sector_name']}")
        print(f"\nOutput directory: {output_dir}")
        return

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: OPENAI_API_KEY environment variable required")
        sys.exit(1)

    client = OpenAI(api_key=api_key)
    print(f"Using model: {args.model}")

    succeeded = []
    failed = []
    sector_total = len(sectors)
    for position, entry in enumerate(sectors, 1):
        print(f"\n[{position}/{sector_total}] Generating brief for: {entry['sector_name']}")
        try:
            brief = generate_sector_brief(client, entry, args.model)
            # Validation problems are reported as warnings; the brief is
            # still written.
            problems = validate_brief(brief)
            if problems:
                print(" Warnings:")
                for problem in problems:
                    print(f" - {problem}")
            written_to = save_brief(brief, output_dir)
            print(f" ✓ Saved to: {written_to}")
            succeeded.append(entry["sector_code"])
        except Exception as e:
            # One failing sector must not stop the remaining ones.
            print(f" ✗ FAILED: {e}")
            failed.append(entry["sector_code"])

    rule = "=" * 60
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f"Success: {len(succeeded)}")
    print(f"Failed: {len(failed)}")
    if failed:
        print(f"\nFailed sectors: {', '.join(failed)}")
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,523 @@
"""
LLM Classifier - Real classification using the OpenAI Chat Completions API.
Uses JSON Schema to enforce strict output format.
Validates primitives against enabled set.
Stores raw response for audit.
Supports multilingual reviews with language detection.
"""
import hashlib
import json
import os
import re
import time
from typing import Any
from openai import OpenAI
# Language detection - try langdetect, fall back to heuristics
try:
from langdetect import detect as langdetect_detect, LangDetectException
LANGDETECT_AVAILABLE = True
except ImportError:
LANGDETECT_AVAILABLE = False
LangDetectException = Exception # Placeholder
def detect_language(text: str) -> tuple[str, float]:
    """
    Detect the language of a text.

    Uses langdetect when it was importable; otherwise (or when langdetect
    raises) falls back to a heuristic based on Unicode script ranges and,
    for Latin script, common-word counts for Spanish, English, German and
    French.

    Args:
        text: Text to inspect.

    Returns:
        ``(language_code, confidence)``. Returns ``("unknown", 0.0)`` for
        empty text or text shorter than 3 characters after stripping, and
        ``("unknown", 0.1)`` when no script dominates.
    """
    if not text or len(text.strip()) < 3:
        return "unknown", 0.0
    text = text.strip()

    # Try langdetect first (most accurate)
    if LANGDETECT_AVAILABLE:
        try:
            lang = langdetect_detect(text)
            # langdetect_detect returns only a code, so confidence is
            # estimated from text length (capped at 0.95).
            confidence = min(0.95, 0.5 + len(text) / 200)
            return lang, confidence
        except LangDetectException:
            pass

    # Fallback: simple heuristic detection based on character ranges.
    # Less accurate, but works without dependencies.
    latin = sum(1 for c in text if '\u0041' <= c <= '\u024F')  # Latin extended
    cyrillic = sum(1 for c in text if '\u0400' <= c <= '\u04FF')  # Cyrillic
    cjk = sum(1 for c in text if '\u4E00' <= c <= '\u9FFF')  # CJK Unified
    japanese = sum(1 for c in text if '\u3040' <= c <= '\u30FF')  # Hiragana + Katakana
    korean = sum(1 for c in text if '\uAC00' <= c <= '\uD7AF')  # Hangul
    arabic = sum(1 for c in text if '\u0600' <= c <= '\u06FF')  # Arabic

    # The guard above guarantees at least 3 characters, so total is never 0.
    total = len(text)

    # Kana is tested BEFORE the generic CJK check: Japanese text mixes kana
    # with kanji, which fall in the CJK Unified range, so testing CJK first
    # labels kanji-heavy Japanese as Chinese. When kana is present, kanji
    # count toward the Japanese share.
    if japanese / total > 0.2 or (japanese > 0 and (japanese + cjk) / total > 0.3):
        return "ja", 0.6  # Japanese
    if cjk / total > 0.3:
        return "zh", 0.6  # Chinese
    if korean / total > 0.3:
        return "ko", 0.6  # Korean
    if cyrillic / total > 0.3:
        return "ru", 0.5  # Russian (could be other Cyrillic)
    if arabic / total > 0.3:
        return "ar", 0.5  # Arabic

    if latin / total > 0.5:
        # Latin script - try to distinguish languages by common words
        text_lower = text.lower()

        def count_hits(words: list[str]) -> int:
            """Count how many of the words occur as whole words in the text."""
            return sum(1 for w in words if re.search(rf'\b{w}\b', text_lower))

        # Spanish indicators (expanded for better detection)
        es_words = ['el', 'la', 'los', 'las', 'de', 'que', 'es', 'en', 'un', 'una',
                    'muy', 'pero', 'con', 'está', 'están', 'para', 'por', 'como',
                    'excelente', 'recomendado', 'servicio', 'bueno', 'malo', 'bien',
                    'todo', 'nada', 'más', 'sin', 'nunca', 'siempre', 'también']
        es_score = count_hits(es_words)
        # Spanish-specific patterns (accents, ñ, inverted punctuation)
        if 'ñ' in text_lower or '¿' in text or '¡' in text:
            es_score += 3
        if any(c in text_lower for c in 'áéíóúü'):
            es_score += 1

        # English indicators
        en_words = ['the', 'and', 'is', 'are', 'was', 'were', 'this', 'that',
                    'with', 'for', 'but', 'not', 'very', 'great', 'good',
                    'service', 'place', 'food', 'staff', 'friendly', 'amazing',
                    'would', 'recommend', 'will', 'definitely', 'really']
        en_score = count_hits(en_words)

        # German indicators
        de_words = ['der', 'die', 'das', 'und', 'ist', 'sind', 'war', 'sehr',
                    'mit', 'für', 'aber', 'nicht', 'ein', 'eine', 'wir', 'ich',
                    'auch', 'gut', 'schlecht', 'toll', 'super']
        de_score = count_hits(de_words)
        # German umlauts
        if any(c in text_lower for c in 'äöüß'):
            de_score += 2

        # French indicators
        fr_words = ['le', 'la', 'les', 'est', 'sont', 'très', 'mais', 'avec',
                    'pour', 'pas', 'un', 'une', 'et', 'nous', 'vous', 'bien',
                    'bon', 'mauvais', 'excellent', 'super', "c'est", "j'ai"]
        fr_score = count_hits(fr_words)
        # French accents and patterns
        if any(c in text_lower for c in 'àâçèêëîïôùûÿœæ'):
            fr_score += 2

        # On a tie, max() keeps the first key in insertion order: es, en, de, fr.
        scores = {'es': es_score, 'en': en_score, 'de': de_score, 'fr': fr_score}
        best_lang = max(scores, key=scores.get)
        best_score = scores[best_lang]
        if best_score >= 1:  # Lowered threshold
            confidence = min(0.75, 0.3 + best_score * 0.08)
            return best_lang, confidence

        # Default to English for Latin script
        return "en", 0.3

    return "unknown", 0.1
# Module-level client, created on the first get_client() call and reused
# by every later call.
_client = None


def get_client() -> OpenAI:
    """Return the shared OpenAI client, creating it on first use.

    Raises:
        RuntimeError: If no client exists yet and ``OPENAI_API_KEY`` is
            unset or empty.
    """
    global _client
    if _client is not None:
        return _client

    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable not set. "
            "Set it or use --dry-run / mock classifier."
        )
    _client = OpenAI(api_key=key)
    return _client
# Default model: the OPENAI_MODEL environment variable when set, otherwise
# "gpt-4o-mini". Read once, when this module is imported.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
# Meta primitives - always available, regardless of which primitives a
# config enables. The same names are spelled out in SYSTEM_PROMPT.
META_PRIMITIVES = frozenset([
    "HONESTY", "ETHICS", "PROMISES",
    "ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
    "RETURN_INTENT", "RECOMMEND", "RECOGNITION",
    "UNMAPPED",
])
# JSON Schema for structured output; passed as the "json_schema" value of
# response_format in classify_review.
# Every object sets additionalProperties to False and lists all of its
# properties under "required". Optional values are expressed as nullable
# types (start_char / end_char), and "details" only admits null.
SPAN_SCHEMA = {
    "name": "review_classification",
    "strict": True,
    "schema": {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            # Classified spans, one entry per extracted piece of evidence.
            "spans": {
                "type": "array",
                "items": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "primitive": {"type": "string"},
                        "valence": {"type": "string", "enum": ["positive", "negative", "mixed", "neutral"]},
                        "intensity": {"type": "integer", "minimum": 1, "maximum": 5},
                        "evidence": {"type": "string"},
                        "start_char": {"type": ["integer", "null"]},
                        "end_char": {"type": ["integer", "null"]},
                        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
                        "details": {"type": "null"}
                    },
                    "required": ["primitive", "valence", "intensity", "evidence", "confidence", "start_char", "end_char", "details"]
                }
            },
            # Content the model could not map to a primitive, with a free-text label.
            "unmapped": {
                "type": "array",
                "items": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "label": {"type": "string"},
                        "evidence": {"type": "string"},
                        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0}
                    },
                    "required": ["label", "evidence", "confidence"]
                }
            }
        },
        "required": ["spans", "unmapped"]
    }
}
# System prompt
SYSTEM_PROMPT = """You are a review classification system that extracts semantic spans and maps them to primitives.
## RULES (MUST FOLLOW)
1. Use ONLY primitives from the enabled_primitives list provided. Do NOT invent new primitives.
2. Meta primitives are always available: HONESTY, ETHICS, PROMISES, ACKNOWLEDGMENT, RESPONSE_QUALITY, RECOVERY, RETURN_INTENT, RECOMMEND, RECOGNITION, UNMAPPED
3. If content doesn't fit any enabled primitive, use UNMAPPED or put it in the unmapped array with a descriptive label.
4. Output MUST match the JSON schema exactly. No extra keys.
5. Evidence must be a SHORT EXACT QUOTE from the review text (in original language).
6. Extract 1-5 spans per review. Prefer fewer, larger spans over many small ones.
7. If unsure about classification, lower the confidence score.
## VALENCE
- positive: praise, satisfaction, recommendation
- negative: complaint, dissatisfaction, warning
- mixed: both positive and negative in same span
- neutral: factual observation, no sentiment
## INTENSITY (1-5)
- 1: mild ("okay", "fine")
- 2: moderate ("good", "bad")
- 3: strong ("great", "terrible")
- 4: very strong ("amazing", "awful")
- 5: extreme ("best ever", "worst nightmare")
## CONFIDENCE
- 0.9+: Very certain the primitive fits
- 0.7-0.9: Confident
- 0.5-0.7: Moderate confidence
- <0.5: Low confidence (consider UNMAPPED)
Output valid JSON only. No markdown, no explanations."""
def compute_review_hash(text: str, config_version: str) -> str:
    """Return a short cache key for a review under a given config version.

    The key is the first 16 hex characters of the SHA-256 digest of
    ``"<config_version>:<text>"``.
    """
    material = "{}:{}".format(config_version, text).encode()
    full_digest = hashlib.sha256(material).hexdigest()
    return full_digest[:16]
def build_user_payload(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
) -> dict[str, Any]:
    """Build the user message payload for the LLM.

    Only the parts of the config that the model needs are included: the
    enabled primitives (always extended with the meta primitives), a short
    definition for each, the weights, and a trimmed summary of the sector
    brief.

    Args:
        review_text: Review text to classify.
        rating: Star rating, or None when unavailable.
        config: Resolved config. Keys read here: ``enabled_primitives``,
            ``primitives``, ``brief``, ``weights``, ``business_id``,
            ``sector_code``, ``config_version``.
        language: Language code to report for the review.

    Returns:
        A dict that can be serialized to JSON.
    """
    # ``or`` fallbacks let a config carry explicit nulls for these keys.
    enabled = set(config.get("enabled_primitives") or [])
    enabled.update(META_PRIMITIVES)
    # Sort once and reuse: iterating the set directly would make the key
    # order of primitive_definitions, and so the prompt text, vary between
    # processes because of string hash randomization.
    enabled_sorted = sorted(enabled)

    # Build primitive definitions (minimal)
    primitives_dict = config.get("primitives") or {}
    primitive_defs = {}
    for prim in enabled_sorted:
        if prim in primitives_dict:
            info = primitives_dict[prim]
            primitive_defs[prim] = info.get("def", info.get("name", prim))
        elif prim in META_PRIMITIVES:
            primitive_defs[prim] = f"Meta primitive: {prim.replace('_', ' ').lower()}"

    # Extract brief signals (keep it short). A section is either
    # {"items": [...]} or a bare list.
    brief = config.get("brief") or {}
    brief_summary = {}

    judged = brief.get("what_customers_judge")
    if judged:
        if isinstance(judged, dict):
            judged = judged.get("items", [])
        brief_summary["key_judgment_areas"] = [
            item.get("aspect", item.get("area", str(item))) if isinstance(item, dict) else str(item)
            for item in judged[:5]
        ]

    pains = brief.get("critical_pain_points")
    if pains:
        if isinstance(pains, dict):
            pains = pains.get("items", [])
        brief_summary["critical_pains"] = [
            item.get("pain", str(item)) if isinstance(item, dict) else str(item)
            for item in pains[:3]
        ]

    return {
        "business": {
            "name": config.get("business_id"),
            "sector": config.get("sector_code"),
            "config_version": config.get("config_version"),
        },
        "enabled_primitives": enabled_sorted,
        "primitive_definitions": primitive_defs,
        "weights": config.get("weights", {}),
        "sector_brief": brief_summary,
        "review": {
            "text": review_text,
            "rating": rating,
            "language": language,
        },
    }
def validate_response(
    response: dict[str, Any],
    enabled_primitives: set[str],
) -> tuple[dict[str, Any], list[str]]:
    """
    Check every span's primitive against the allowed set.

    A span whose primitive is neither enabled nor a meta primitive is
    rewritten in place to "UNMAPPED" and a warning is recorded.

    Returns (validated_response, warnings), where validated_response has
    the keys "spans" and "unmapped".
    """
    allowed = enabled_primitives | META_PRIMITIVES
    notes = []

    def _coerce(span):
        """Replace a disallowed primitive with UNMAPPED; return the span."""
        prim = span.get("primitive")
        if prim not in allowed:
            notes.append(f"Invalid primitive '{prim}' → UNMAPPED (original: {prim})")
            span["primitive"] = "UNMAPPED"
        return span

    checked_spans = [_coerce(span) for span in response.get("spans", [])]
    validated = {
        "spans": checked_spans,
        "unmapped": response.get("unmapped", []),
    }
    return validated, notes
def classify_review(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
    model: str | None = None,
    max_retries: int = 3,
) -> dict[str, Any]:
    """
    Classify a single review using OpenAI.

    Errors from the API call or from parsing its output never propagate:
    the function returns a fallback result containing one low-confidence
    UNMAPPED span and a warning describing the failure.

    Args:
        review_text: The review text to classify
        rating: Star rating (1-5) if available
        config: Resolved config from ConfigResolver
        language: Language hint; "auto" runs detect_language on the text
        model: Model to use (default: DEFAULT_MODEL)
        max_retries: Maximum number of attempts. Only errors whose message
            looks like a rate limit (429) or a 500/502/503 are retried.

    Returns:
        {
            "spans": [...],
            "unmapped": [...],
            "model": str,
            "raw_response": str,
            "review_hash": str,
            "warnings": [...],
            "tokens": {"prompt": int, "completion": int},
            "detected_language": str,
            "language_confidence": float,
        }

    Raises:
        RuntimeError: From get_client() when OPENAI_API_KEY is not set.
    """
    model = model or DEFAULT_MODEL

    # Detect language if auto; a caller-supplied language is trusted fully.
    if language == "auto":
        detected_lang, lang_confidence = detect_language(review_text)
    else:
        detected_lang, lang_confidence = language, 1.0

    # Build payload with detected language
    payload = build_user_payload(review_text, rating, config, detected_lang)
    user_content = json.dumps(payload, ensure_ascii=False, indent=None)

    # Compute hash for caching
    review_hash = compute_review_hash(review_text, config.get("config_version", "1.0"))

    # Loop-invariant: the enabled set does not change between attempts.
    enabled = set(config.get("enabled_primitives", []))

    # Call OpenAI with retries
    last_error = None
    client = get_client()
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_content},
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": SPAN_SCHEMA,
                },
                temperature=0.1,  # Low temperature for consistency
                max_tokens=2000,
            )

            # Parse response
            raw_text = response.choices[0].message.content
            parsed = json.loads(raw_text)

            # Validate primitives
            validated, warnings = validate_response(parsed, enabled)

            return {
                "spans": validated["spans"],
                "unmapped": validated["unmapped"],
                "model": model,
                "raw_response": raw_text,
                "review_hash": review_hash,
                "warnings": warnings,
                "tokens": {
                    "prompt": response.usage.prompt_tokens if response.usage else 0,
                    "completion": response.usage.completion_tokens if response.usage else 0,
                },
                "detected_language": detected_lang,
                "language_confidence": lang_confidence,
            }
        except json.JSONDecodeError as e:
            last_error = f"JSON parse error: {e}"
            # Don't retry parse errors - return the fallback
            break
        except Exception as e:
            last_error = str(e)
            if "rate_limit" in last_error.lower() or "429" in last_error:
                # Exponential backoff for rate limits
                delay = 2 ** attempt
            elif "500" in last_error or "502" in last_error or "503" in last_error:
                # Short fixed pause before retrying server errors
                delay = 1
            else:
                # Don't retry other errors
                break
            # Sleeping after the final attempt would only delay the fallback.
            if attempt < max_retries - 1:
                time.sleep(delay)

    # Fallback response on error
    return {
        "spans": [{
            "primitive": "UNMAPPED",
            "valence": "neutral",
            "intensity": 1,
            "evidence": review_text[:100] if review_text else "",
            "start_char": 0,
            "end_char": min(100, len(review_text)) if review_text else 0,
            "confidence": 0.1,
            "details": {"error": last_error},
        }],
        "unmapped": [],
        "model": model,
        "raw_response": json.dumps({"error": last_error}),
        "review_hash": review_hash,
        "warnings": [f"Classification failed: {last_error}"],
        "tokens": {"prompt": 0, "completion": 0},
        "detected_language": detected_lang,
        "language_confidence": lang_confidence,
    }
async def classify_review_async(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
    model: str | None = None,
) -> dict[str, Any]:
    """Async wrapper for classify_review.

    Runs the blocking classify_review call in a worker thread so the event
    loop stays free while the request is in flight.

    Args:
        review_text: The review text to classify.
        rating: Star rating if available.
        config: Resolved config.
        language: Language hint, "auto" to detect.
        model: Model to use, or None for the default.

    Returns:
        The dict returned by classify_review.
    """
    import asyncio

    # asyncio.to_thread binds to the running loop; calling
    # get_event_loop() inside a coroutine is discouraged.
    return await asyncio.to_thread(
        classify_review, review_text, rating, config, language, model
    )
# Batch classification (for later optimization)
async def classify_batch(
    reviews: list[dict[str, Any]],
    config: dict[str, Any],
    model: str | None = None,
    max_concurrent: int = 5,
) -> list[dict[str, Any]]:
    """
    Classify several reviews concurrently, preserving input order.

    Args:
        reviews: List of {"text": str, "rating": int, "language": str};
            missing keys default to "", None and "auto" respectively
        config: Resolved config shared by every review
        model: Model to use
        max_concurrent: Upper bound on classifications in flight at once

    Returns:
        One classification result per review, in the order given
    """
    import asyncio

    gate = asyncio.Semaphore(max_concurrent)

    async def _guarded(entry: dict) -> dict:
        # Hold a semaphore slot for the whole duration of the call.
        async with gate:
            text = entry.get("text", "")
            stars = entry.get("rating")
            lang = entry.get("language", "auto")
            return await classify_review_async(text, stars, config, lang, model)

    pending = [_guarded(entry) for entry in reviews]
    return await asyncio.gather(*pending)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,457 @@
#!/usr/bin/env python3
"""
Wave 1 L1 Config Validation Script
Validates L1 primitive configs against real review data by analyzing:
1. Coverage: % of spans mapped to enabled primitives
2. Top primitives by frequency
3. Disabled primitives appearing (potential misconfig)
4. Weight effectiveness
Usage:
python validate_l1_configs.py --sector ENTERTAINMENT --job-url "gokarts"
python validate_l1_configs.py --sector AUTOMOTIVE --job-url "clickrent"
python validate_l1_configs.py --all
"""
import argparse
import asyncio
import json
import os
import sys
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import asyncpg
# Paths, resolved relative to the parent of this script's directory.
DATA_DIR = Path(__file__).parent.parent / "data"
CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
BRIEFS_DIR = DATA_DIR / "sector_briefs"
# Primitive to URT domain mapping
# Primitives map to URT domains: O=Offering, P=People, J=Journey, E=Environment, A=Access, V=Value, R=Relationship
# NOTE: no primitive in this table maps to A or R, so a span counted through
# this table is never attributed to those two domains.
PRIMITIVE_TO_DOMAIN = {
    # Quality -> Offering (O)
    "TASTE": "O", "CRAFT": "O", "FRESHNESS": "O", "TEMPERATURE": "O",
    "EFFECTIVENESS": "O", "ACCURACY": "O", "CONDITION": "O", "CONSISTENCY": "O",
    # Service -> People (P)
    "MANNER": "P", "COMPETENCE": "P", "ATTENTIVENESS": "P", "COMMUNICATION": "P",
    # Process -> Journey (J)
    "SPEED": "J", "FRICTION": "J", "RELIABILITY": "J", "AVAILABILITY": "J",
    # Environment -> Environment (E)
    "CLEANLINESS": "E", "COMFORT": "E", "SAFETY": "E", "AMBIANCE": "E",
    "ACCESSIBILITY": "E", "DIGITAL_UX": "E",
    # Value -> Value (V)
    "PRICE_LEVEL": "V", "PRICE_FAIRNESS": "V", "PRICE_TRANSPARENCY": "V", "VALUE_FOR_MONEY": "V",
}
# URT code to primitive mapping (simplified - maps URT codes to closest primitive)
# Keys are URT codes whose first character is the domain letter
# (O, P, J, E, A, V or R). The mapping is many-to-one. A code that is not
# listed here has no primitive and is counted as unmapped by analyze_spans.
URT_TO_PRIMITIVE = {
    # Offering codes
    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
    # People codes
    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
    # Journey codes
    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
    "J1.04": "SPEED", "J1.05": "RELIABILITY",
    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
    # Environment codes
    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
    # Access codes
    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
    # Value codes
    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
    # Relationship codes
    "R1.01": "RELIABILITY", "R1.02": "RELIABILITY", "R1.03": "RELIABILITY",
    "R2.01": "RELIABILITY", "R2.02": "CONSISTENCY", "R2.03": "RELIABILITY",
    "R3.01": "MANNER", "R3.02": "MANNER", "R3.03": "COMMUNICATION",
    "R4.01": "CONSISTENCY", "R4.02": "RELIABILITY", "R4.03": "CONSISTENCY",
}
@dataclass
class ValidationResult:
    """Outcome of checking one sector's L1 config against a set of spans.

    Attributes:
        sector_code: Sector the config belongs to.
        job_count: Number of jobs covered by the analysis.
        review_count: Number of reviews covered by the analysis.
        span_count: Number of spans analyzed.
        enabled_coverage: Fraction of spans whose primitive is enabled.
        disabled_hits: Count per disabled primitive that still appeared.
        unmapped_count: Spans whose URT code maps to no primitive.
        primitive_counts: Count per primitive, enabled or not.
        domain_distribution: Count per URT domain (O, P, J, E, A, V, R).
        valence_distribution: Count per valence (V+, V-, V0, V±).
        top_urt_codes: Most frequent URT codes as (code, count) pairs.
        recommendations: Suggested config changes, as display strings.
    """

    # --- identity and volume ---
    sector_code: str
    job_count: int
    review_count: int
    span_count: int

    # --- coverage ---
    enabled_coverage: float
    disabled_hits: dict[str, int]
    unmapped_count: int

    # --- distributions ---
    primitive_counts: dict[str, int]
    domain_distribution: dict[str, int]
    valence_distribution: dict[str, int]

    # --- ranked codes and advice ---
    top_urt_codes: list[tuple[str, int]]
    recommendations: list[str]
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """Read the L1 config of a sector from CONFIGS_DIR.

    The file name is ``<sector_code lowercased>_config.json``.

    Returns:
        The parsed JSON document, or None when the file does not exist.
    """
    path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    if path.exists():
        with open(path) as handle:
            return json.load(handle)
    return None
def load_sector_brief(sector_code: str) -> dict[str, Any] | None:
    """Read the sector brief of a sector from BRIEFS_DIR.

    The file name is ``<sector_code lowercased>_brief.json``.

    Returns:
        The parsed JSON document, or None when the file does not exist.
    """
    path = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
    if path.exists():
        with open(path) as handle:
            return json.load(handle)
    return None
def map_urt_to_primitive(urt_code: str) -> str | None:
    """Look up the primitive for a URT code; None when the code is unknown."""
    if urt_code in URT_TO_PRIMITIVE:
        return URT_TO_PRIMITIVE[urt_code]
    return None
async def fetch_spans_for_jobs(pool: asyncpg.Pool, job_url_pattern: str) -> list[dict]:
    """Fetch the spans of every job whose URL contains the given pattern.

    The match is a case-insensitive substring match (``LIKE '%pattern%'``
    on the lowercased URL); the pattern is passed as a bound parameter.

    Returns:
        One dict per span with the keys urt_primary, valence, intensity,
        span_text and url, ordered by the span's created_at, newest first.
    """
    # Written as adjacent literals so the statement text stays exactly the
    # same while the code remains indented.
    sql = (
        "\n"
        "SELECT\n"
        "rs.urt_primary,\n"
        "rs.valence,\n"
        "rs.intensity,\n"
        "rs.span_text,\n"
        "j.url\n"
        "FROM pipeline.review_spans rs\n"
        "JOIN pipeline.reviews_raw rr ON rs.review_id = rr.review_id\n"
        "JOIN public.jobs j ON rr.job_id = j.job_id\n"
        "WHERE LOWER(j.url) LIKE $1\n"
        "ORDER BY rs.created_at DESC\n"
    )
    like_arg = f"%{job_url_pattern.lower()}%"
    records = await pool.fetch(sql, like_arg)
    return [dict(record) for record in records]
async def fetch_all_spans(pool: asyncpg.Pool) -> list[dict]:
    """Fetch every span in pipeline.review_spans.

    Returns:
        One dict per span with the keys urt_primary, valence, intensity
        and span_text, ordered by created_at, newest first.
    """
    # Written as adjacent literals so the statement text stays exactly the
    # same while the code remains indented.
    sql = (
        "\n"
        "SELECT\n"
        "urt_primary,\n"
        "valence,\n"
        "intensity,\n"
        "span_text\n"
        "FROM pipeline.review_spans\n"
        "ORDER BY created_at DESC\n"
    )
    records = await pool.fetch(sql)
    return [dict(record) for record in records]
def analyze_spans(
    spans: list[dict],
    config: dict[str, Any],
) -> ValidationResult:
    """Analyze spans against an L1 config.

    Each span's URT code is mapped to a primitive. Mapped spans are counted
    per primitive and classified as enabled or disabled; spans whose code
    has no primitive are counted as unmapped. Recommendations are then
    derived from those counts.

    Args:
        spans: Span rows; ``urt_primary`` and ``valence`` are read. A span
            with a missing or empty code is counted as unmapped and adds
            nothing to the domain distribution. A missing or null valence
            is counted as "V0".
        config: L1 config; ``sector_code`` is required, ``enabled``,
            ``disabled`` and ``weights`` are optional.

    Returns:
        A ValidationResult. ``job_count`` is set to 1 and ``review_count``
        to 0; neither is derived from the spans.

    Raises:
        KeyError: If ``config`` has no ``sector_code``.
    """
    sector_code = config["sector_code"]
    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        # A NULL column arrives as None; tolerate it instead of crashing
        # on urt_code[0] below.
        urt_code = span.get("urt_primary")
        valence = span.get("valence") or "V0"

        urt_counts[urt_code] += 1
        valence_counts[valence] += 1

        # Map to primitive
        primitive = map_urt_to_primitive(urt_code)
        if primitive:
            primitive_counts[primitive] += 1
            # Domain comes from the primitive; the code's first letter is
            # only a fallback for primitives missing from the table.
            domain = PRIMITIVE_TO_DOMAIN.get(primitive, urt_code[0])
            domain_counts[domain] += 1
            # Check if enabled or disabled
            if primitive in enabled:
                enabled_hits += 1
            elif primitive in disabled:
                disabled_hits[primitive] += 1
        else:
            unmapped += 1
            # Still count domain from URT code, when there is one
            if urt_code:
                domain_counts[urt_code[0]] += 1

    # Calculate coverage
    total = len(spans)
    enabled_coverage = enabled_hits / total if total > 0 else 0

    # Generate recommendations
    recommendations = []

    # Check disabled primitives that appeared frequently
    for prim, count in disabled_hits.most_common(5):
        if count >= 10:
            pct = count / total * 100
            recommendations.append(
                f"ENABLE {prim}: Disabled but appeared {count} times ({pct:.1f}%)"
            )

    # Check for missing high-weight primitives. Iterating the dict itself
    # keeps the config's order, so the report is the same on every run.
    for prim in weights:
        if primitive_counts[prim] == 0 and prim in enabled:
            recommendations.append(
                f"CHECK {prim}: Weighted ({weights[prim]}x) but no appearances"
            )

    # Check for frequently appearing unweighted primitives
    for prim, count in primitive_counts.most_common(10):
        if prim in enabled and prim not in weights and count >= total * 0.1:
            pct = count / total * 100
            recommendations.append(
                f"WEIGHT {prim}: High frequency ({count}, {pct:.1f}%) but not weighted"
            )

    return ValidationResult(
        sector_code=sector_code,
        job_count=1,  # Placeholder; not derived from the spans
        review_count=0,  # Not tracked at span level
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommendations=recommendations,
    )
def print_validation_report(result: ValidationResult, config: dict, brief: dict | None):
    """Print a formatted validation report to stdout.

    Sections: overview, config summary, domain distribution, valence
    distribution, top primitives, top URT codes, disabled-but-appearing
    primitives, recommendations and, when a brief is given, the brief's
    key judgment areas.

    Args:
        result: Analysis outcome to report.
        config: L1 config the analysis ran against; ``enabled``,
            ``disabled`` and ``weights`` are read.
        brief: Sector brief, or None to omit the brief section.
    """
    span_count = result.span_count

    def share(count: int) -> float:
        """Percentage of all spans; 0 when there are no spans."""
        return count / span_count * 100 if span_count > 0 else 0

    print("\n" + "=" * 70)
    print(f"VALIDATION REPORT: {result.sector_code}")
    print("=" * 70)

    # Overview
    print("\n📊 OVERVIEW")
    print(f" Spans analyzed: {result.span_count:,}")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    print(f" Unmapped spans: {result.unmapped_count} ({result.unmapped_count/result.span_count*100:.1f}%)" if result.span_count > 0 else " No spans")

    # Config summary
    print("\n⚙️ CONFIG SUMMARY")
    print(f" Enabled: {len(config.get('enabled', []))} primitives")
    print(f" Disabled: {len(config.get('disabled', []))} primitives")
    print(f" Weighted: {len(config.get('weights', {}))} primitives")

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = share(count)
        # One block per two percentage points. An empty glyph would make
        # the bar invisible whatever the percentage.
        bar = "█" * int(pct / 2)
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Valence distribution; the four labels are those documented on
    # ValidationResult.valence_distribution.
    print("\n😊 VALENCE DISTRIBUTION")
    for val in ["V+", "V-", "V0", "V±"]:
        count = result.valence_distribution.get(val, 0)
        print(f" {val}: {count:4} ({share(count):5.1f}%)")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES")
    enabled_set = set(config.get("enabled", []))
    weights = config.get("weights", {})
    for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]:
        # Mark whether the primitive is enabled in this config.
        status = "✓" if prim in enabled_set else "✗"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({share(count):5.1f}%) {weight}")

    # Top URT codes
    print("\n📋 TOP URT CODES")
    for code, count in result.top_urt_codes[:10]:
        mapped = URT_TO_PRIMITIVE.get(code, "UNMAPPED")
        print(f" {code}: {count:4} ({share(count):5.1f}%) → {mapped}")

    # Disabled but appearing
    if result.disabled_hits:
        print("\n⚠️ DISABLED BUT APPEARING")
        for prim, count in sorted(result.disabled_hits.items(), key=lambda x: -x[1]):
            print(f" {prim}: {count} ({share(count):.1f}%)")

    # Recommendations
    if result.recommendations:
        print("\n💡 RECOMMENDATIONS")
        for rec in result.recommendations:
            print(f"{rec}")

    # Brief signals check (if available)
    if brief:
        print("\n📝 BRIEF SIGNALS CHECK")
        # The section is either {"items": [...]} or a bare list.
        what_customers_judge = brief.get("what_customers_judge", {})
        if isinstance(what_customers_judge, dict):
            items = what_customers_judge.get("items", [])
        else:
            items = what_customers_judge if isinstance(what_customers_judge, list) else []
        print(" Key judgment areas from brief:")
        for item in items[:5]:
            if isinstance(item, dict):
                print(f"{item.get('area', item)}")
            else:
                print(f"{item}")

    print("\n" + "=" * 70)
async def run_validation(
    sector_code: str,
    job_url_pattern: str | None = None,
    db_url: str | None = None,
):
    """Validate one sector's L1 config against stored spans and print a report.

    Args:
        sector_code: Sector whose L1 config and brief are loaded.
        job_url_pattern: When given, only spans of jobs matching this pattern
            are analyzed; otherwise every span is fetched.
        db_url: Database URL. Falls back to the ``DATABASE_URL`` environment
            variable, then to the local development default.

    Returns:
        The analysis result, or ``None`` when the sector has no L1 config or
        the job pattern matched no spans.
    """
    config = load_l1_config(sector_code)
    if not config:
        print(f"❌ No L1 config found for {sector_code}")
        return None

    brief = load_sector_brief(sector_code)

    if not db_url:
        db_url = os.environ.get(
            "DATABASE_URL",
            "postgresql://scraper:scraper123@localhost:5437/scraper",
        )

    pool = await asyncpg.create_pool(db_url)
    try:
        if not job_url_pattern:
            spans = await fetch_all_spans(pool)
        else:
            spans = await fetch_spans_for_jobs(pool, job_url_pattern)
            if not spans:
                print(f"⚠️ No spans found for jobs matching '{job_url_pattern}'")
                return None

        report = analyze_spans(spans, config)
        print_validation_report(report, config, brief)
        return report
    finally:
        # Release the pool on every exit path, including the early return.
        await pool.close()
async def run_all_validations(db_url: str | None = None):
    """Validate every known sector/job pair, then print a one-screen summary.

    Args:
        db_url: Database URL forwarded unchanged to ``run_validation``.
    """
    # Job URL patterns to validate, grouped by the sector they belong to.
    jobs_by_sector = {
        "ENTERTAINMENT": ["gokarts", "soho"],
        "AUTOMOTIVE": ["clickrent"],
        "PERSONAL_SERVICES": ["fleitas"],
        "FOOD_DINING": ["fika"],
    }
    rule = "=" * 70

    collected = {}
    for sector_name, patterns in jobs_by_sector.items():
        print(f"\n{rule}")
        print(f"Validating {sector_name}...")
        print(f"{rule}")
        for job_pattern in patterns:
            outcome = await run_validation(sector_name, job_pattern, db_url)
            if not outcome:
                continue
            collected[f"{sector_name}:{job_pattern}"] = outcome

    print("\n" + rule)
    print("VALIDATION SUMMARY")
    print(rule)
    for label, outcome in collected.items():
        sector_name, job_pattern = label.split(":")
        print(f"\n{sector_name} ({job_pattern}):")
        print(f" Coverage: {outcome.enabled_coverage:.1%}")
        print(f" Spans: {outcome.span_count}")
        if outcome.disabled_hits:
            print(f" ⚠️ Disabled hits: {sum(outcome.disabled_hits.values())}")
        if outcome.recommendations:
            print(f" Recommendations: {len(outcome.recommendations)}")
def main():
    """Command-line entry point: validate all sectors or a single one.

    Prints the usage text and exits with status 1 when neither ``--all`` nor
    ``--sector`` is supplied.
    """
    cli = argparse.ArgumentParser(description="Validate L1 primitive configs")
    cli.add_argument("--sector", help="Sector code (e.g., ENTERTAINMENT)")
    cli.add_argument("--job-url", help="Job URL pattern to filter (e.g., 'gokarts')")
    cli.add_argument("--all", action="store_true", help="Run all validations")
    cli.add_argument("--db-url", help="Database URL")
    opts = cli.parse_args()

    # --all takes precedence over --sector when both are given.
    if opts.all:
        asyncio.run(run_all_validations(opts.db_url))
        return

    if opts.sector:
        asyncio.run(run_validation(opts.sector, opts.job_url, opts.db_url))
        return

    cli.print_help()
    sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,421 @@
#!/usr/bin/env python3
"""
Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)
Validates L1 primitive configs against SECTOR-SPECIFIC review data.
Only validates sectors where we have real business data.
Key improvement over v1: spans are filtered by business → sector mapping,
ensuring "TASTE in HEALTHCARE" noise doesn't pollute results.
Usage:
python validate_l1_configs_v2.py --sector ENTERTAINMENT
python validate_l1_configs_v2.py --sector AUTOMOTIVE
python validate_l1_configs_v2.py --all
python validate_l1_configs_v2.py --report # Summary only
"""
import argparse
import asyncio
import json
import os
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
import asyncpg
# Paths
# DATA_DIR is the "data" directory one level above the directory that
# contains this script; the other locations hang off it.
DATA_DIR = Path(__file__).parent.parent / "data"
# L1 primitive configs; load_l1_config() reads "<sector>_config.json" from here.
CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
# Sector briefs directory.
BRIEFS_DIR = DATA_DIR / "sector_briefs"
# Business → Sector mapping (ground truth)
# Keys are the values matched against review_spans.business_id when spans
# are fetched for a sector; values are sector codes.
BUSINESS_TO_SECTOR = {
    "Go Karts Mar Menor": "ENTERTAINMENT",
    "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE",
    "Soho Club": "ENTERTAINMENT",
    "Fika": "FOOD_DINING",
}
# Sectors with real data
# validate_sector() refuses any sector not listed here. Keep this in sync
# with the values of BUSINESS_TO_SECTOR: a sector listed here without a
# mapped business is reported as "No businesses mapped".
SECTORS_WITH_DATA = {"ENTERTAINMENT", "AUTOMOTIVE", "FOOD_DINING"}
# URT code to primitive mapping
# The first character of a code is its domain letter (O, P, J, E, A, V, R).
# A code that maps to None, or that is missing from this table, is counted
# as "unmapped" by analyze_sector_spans().
URT_TO_PRIMITIVE = {
    # Offering codes
    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
    # People codes
    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
    # Journey codes
    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
    "J1.04": "SPEED", "J1.05": "RELIABILITY",
    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
    # Environment codes
    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
    # Access codes
    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
    # Value codes
    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
    # Relationship codes (map to meta - these should stay unmapped)
    "R1.01": None, "R1.02": None, "R1.03": None,
    "R2.01": None, "R2.02": None, "R2.03": None,
    "R3.01": None, "R3.02": None, "R3.03": None,
    "R4.01": None, "R4.02": None, "R4.03": None,
}
# Minimum threshold for "enable" recommendations (% of sector spans).
# A disabled primitive is recommended for enabling only when its share of
# the sector's spans is at or above this value; below it, the report lists
# the primitive for information only.
ENABLE_THRESHOLD_PCT = 3.0  # Only recommend enable if >= 3% of sector spans
@dataclass
class SectorValidation:
    """Outcome of validating one sector's L1 config against its own spans.

    The first four fields are required; every other field defaults to an
    empty container, zero or an empty string.
    """

    # --- Identity and data volume ---
    sector_code: str
    businesses: list[str]
    span_count: int

    # --- Coverage ---
    # Fraction (0..1) of all spans whose primitive is enabled in the config.
    enabled_coverage: float
    # Span count per primitive that the config lists as disabled.
    disabled_hits: dict[str, int] = field(default_factory=dict)
    # Spans whose URT code resolves to no primitive.
    unmapped_count: int = 0

    # --- Distributions ---
    primitive_counts: dict[str, int] = field(default_factory=dict)
    domain_distribution: dict[str, int] = field(default_factory=dict)
    valence_distribution: dict[str, int] = field(default_factory=dict)
    # (URT code, span count) pairs, most frequent first.
    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)

    # --- Threshold-gated recommendations, as (primitive, pct of spans) ---
    recommended_enables: list[tuple[str, float]] = field(default_factory=list)
    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
    # Human-readable notes about weights that look inconsistent with the data.
    weight_issues: list[str] = field(default_factory=list)

    # --- Metadata ---
    validated_at: str = ""
    config_version: str = ""
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """Load the L1 primitive config for a sector.

    Args:
        sector_code: Sector code; it is lower-cased to build the file name
            ``<sector_code>_config.json`` inside ``CONFIGS_DIR``.

    Returns:
        The parsed JSON document, or ``None`` when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    if not config_file.exists():
        return None
    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and can mis-read non-ASCII text in the config.
    with open(config_file, encoding="utf-8") as f:
        return json.load(f)
def get_businesses_for_sector(sector_code: str) -> list[str]:
    """Return every business that ``BUSINESS_TO_SECTOR`` assigns to a sector.

    The result keeps the mapping's insertion order and is empty when no
    business belongs to ``sector_code``.
    """
    matches: list[str] = []
    for business_name, mapped_sector in BUSINESS_TO_SECTOR.items():
        if mapped_sector == sector_code:
            matches.append(business_name)
    return matches
async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
    """Fetch the review spans that belong to the given businesses.

    Args:
        pool: Connection pool used to run the query.
        businesses: Values matched against ``review_spans.business_id``.

    Returns:
        One dict per row with ``business_id``, ``urt_primary``, ``valence``,
        ``intensity`` and ``span_text``, newest ``created_at`` first. An
        empty list is returned, without querying, when ``businesses`` is empty.
    """
    if not businesses:
        return []

    sql = """
        SELECT
            business_id,
            urt_primary,
            valence,
            intensity,
            span_text
        FROM pipeline.review_spans
        WHERE business_id = ANY($1)
        ORDER BY created_at DESC
    """
    records = await pool.fetch(sql, businesses)
    return list(map(dict, records))
def analyze_sector_spans(
    spans: list[dict],
    config: dict[str, Any],
    businesses: list[str],
) -> SectorValidation:
    """Aggregate a sector's spans and compare them with its L1 config.

    Args:
        spans: Span rows as dicts. Only ``urt_primary`` and ``valence`` are
            read; other keys are ignored.
        config: L1 config. ``sector_code`` is required; ``enabled``,
            ``disabled``, ``weights`` and ``config_version`` are optional.
        businesses: Businesses the spans were fetched for; stored on the
            result unchanged.

    Returns:
        A ``SectorValidation`` with the per-primitive, per-domain, per-valence
        and per-URT-code counts, the enabled coverage (fraction of all spans
        whose primitive is enabled), the disabled primitives whose share
        reaches ``ENABLE_THRESHOLD_PCT``, and any weight issues.

    Raises:
        KeyError: If ``config`` has no ``sector_code``.
    """
    # Local import: the module itself only imports ``datetime``.
    from datetime import timezone

    # Share of spans above which an enabled, unweighted primitive is flagged.
    high_freq_unweighted_pct = 10

    sector_code = config["sector_code"]
    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})
    config_version = config.get("config_version", "1.0")

    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        # Rows built from DB records always carry the key, so a NULL valence
        # arrives as None rather than as a missing key; treat both as "V0".
        valence_counts[span.get("valence") or "V0"] += 1

        urt_code = span.get("urt_primary")
        if not urt_code:
            # A missing/NULL/empty code has no domain letter and no
            # primitive: count the span as unmapped instead of crashing.
            unmapped += 1
            continue

        urt_counts[urt_code] += 1
        domain_counts[urt_code[0]] += 1

        primitive = URT_TO_PRIMITIVE.get(urt_code)
        if not primitive:
            unmapped += 1
            continue

        primitive_counts[primitive] += 1
        if primitive in enabled:
            enabled_hits += 1
        elif primitive in disabled:
            disabled_hits[primitive] += 1

    total = len(spans)

    def share(count: int) -> float:
        """Return *count* as a percentage of all spans (0 when there are none)."""
        return count / total * 100 if total > 0 else 0

    enabled_coverage = enabled_hits / total if total > 0 else 0

    # Threshold-gated recommendations: disabled primitives that show up often
    # enough in this sector's data to be worth enabling.
    recommended_enables = [
        (prim, share(count))
        for prim, count in disabled_hits.most_common()
        if share(count) >= ENABLE_THRESHOLD_PCT
    ]

    # Enabled primitives that carry a weight but never occur in the data.
    weight_issues = [
        f"{prim} weighted ({weights[prim]}x) but 0 appearances"
        for prim in weights
        if primitive_counts[prim] == 0 and prim in enabled
    ]
    # Frequent enabled primitives (among the top five) that have no weight.
    for prim, count in primitive_counts.most_common(5):
        pct = share(count)
        if prim in enabled and prim not in weights and pct >= high_freq_unweighted_pct:
            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the ISO string keeps the same offset-less format as before.
    validated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return SectorValidation(
        sector_code=sector_code,
        businesses=businesses,
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommended_enables=recommended_enables,
        weight_issues=weight_issues,
        validated_at=validated_at,
        config_version=config_version,
    )
def print_sector_report(result: SectorValidation, config: dict):
    """Print a detailed, human-readable validation report for one sector.

    Args:
        result: Aggregated validation figures for the sector.
        config: The sector's L1 config; only ``enabled``, ``disabled`` and
            ``weights`` are read, to annotate the primitive table.

    Every percentage is relative to ``result.span_count`` and is reported as
    0.0 when the sector has no spans, so the report never divides by zero.
    """
    total = result.span_count

    def share(count: int) -> float:
        """Return *count* as a percentage of all spans (0.0 when there are none)."""
        return count / total * 100 if total > 0 else 0.0

    print("\n" + "=" * 70)
    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
    print("=" * 70)

    print("\n📊 DATA SOURCE")
    print(f" Businesses: {', '.join(result.businesses)}")
    print(f" Total spans: {total:,}")
    print(f" Config version: {result.config_version}")

    print("\n📈 COVERAGE")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    print(f" Unmapped (R-domain): {result.unmapped_count} ({share(result.unmapped_count):.1f}%)")

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = share(count)
        # One block per two percentage points. The glyph must be non-empty:
        # repeating an empty string always yields an empty bar.
        bar = "█" * int(pct / 2)
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES (sector-scoped)")
    enabled_set = set(config.get("enabled", []))
    disabled_set = set(config.get("disabled", []))
    weights = config.get("weights", {})
    top_primitives = sorted(result.primitive_counts.items(), key=lambda item: -item[1])[:12]
    for prim, count in top_primitives:
        # Distinct markers so enabled, disabled and unlisted primitives can
        # be told apart in the table.
        if prim in enabled_set:
            status = "✓"
        elif prim in disabled_set:
            status = "✗"
        else:
            status = "?"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({share(count):5.1f}%) {weight}")

    # Threshold-gated recommendations
    if result.recommended_enables:
        print(f"\n⚠️ RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
        for prim, pct in result.recommended_enables:
            count = result.disabled_hits.get(prim, 0)
            print(f" → ENABLE {prim}: {count} spans ({pct:.1f}%)")
    else:
        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")

    # Low-frequency disabled (info only)
    low_freq_disabled = [
        (prim, count)
        for prim, count in result.disabled_hits.items()
        if share(count) < ENABLE_THRESHOLD_PCT
    ]
    if low_freq_disabled:
        print("\n📋 DISABLED BUT APPEARING (below threshold - no action)")
        for prim, count in sorted(low_freq_disabled, key=lambda item: -item[1])[:5]:
            print(f" {prim}: {count} ({share(count):.1f}%)")

    # Weight issues
    if result.weight_issues:
        print("\n⚖️ WEIGHT ISSUES")
        for issue in result.weight_issues:
            print(f"{issue}")

    print(f"\n⏱️ Validated at: {result.validated_at}")
    print("=" * 70)
async def validate_sector(
    sector_code: str,
    db_url: str | None = None,
    verbose: bool = True,
) -> SectorValidation | None:
    """Validate one sector using only the spans of its own businesses.

    Args:
        sector_code: Sector to validate; must be in ``SECTORS_WITH_DATA``.
        db_url: Database URL. Falls back to the ``DATABASE_URL`` environment
            variable, then to the local development default.
        verbose: When true, print diagnostics and the full sector report.

    Returns:
        The validation result, or ``None`` when the sector has no data, no
        L1 config, no mapped businesses, or no spans.
    """

    def say(message: str) -> None:
        # Diagnostics are suppressed entirely in quiet mode.
        if verbose:
            print(message)

    if sector_code not in SECTORS_WITH_DATA:
        say(f"⚠️ {sector_code}: No real business data available for validation")
        return None

    config = load_l1_config(sector_code)
    if not config:
        say(f"❌ No L1 config found for {sector_code}")
        return None

    businesses = get_businesses_for_sector(sector_code)
    if not businesses:
        say(f"⚠️ {sector_code}: No businesses mapped")
        return None

    if not db_url:
        db_url = os.environ.get(
            "DATABASE_URL",
            "postgresql://scraper:scraper123@localhost:5437/scraper",
        )

    pool = await asyncpg.create_pool(db_url)
    try:
        spans = await fetch_spans_for_businesses(pool, businesses)
        if not spans:
            say(f"⚠️ {sector_code}: No spans found for businesses")
            return None

        validation = analyze_sector_spans(spans, config, businesses)
        if verbose:
            print_sector_report(validation, config)
        return validation
    finally:
        # Release the pool on every exit path, including the early return.
        await pool.close()
async def validate_all_sectors(db_url: str | None = None) -> dict[str, SectorValidation]:
    """Validate every sector in ``SECTORS_WITH_DATA`` and print a summary table.

    Args:
        db_url: Database URL forwarded unchanged to ``validate_sector``.

    Returns:
        Mapping of sector code to its validation result, containing only the
        sectors for which validation produced a result.
    """
    # Figure subtracted from in the original "without data" line.
    # NOTE(review): presumably the total number of sectors - confirm.
    total_sectors = 20

    results = {}
    # SECTORS_WITH_DATA is a set of strings, whose iteration order changes
    # between interpreter runs; sort it so reports come out in a stable order.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=True)
        if result:
            results[sector] = result

    # Print summary
    print("\n" + "=" * 70)
    print("VALIDATION SUMMARY")
    print("=" * 70)
    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
    print("-" * 50)
    for sector, result in results.items():
        enables = len(result.recommended_enables)
        enables_str = f"{enables} recs" if enables > 0 else "✓ OK"
        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")
    print("-" * 50)
    print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}")
    print(f"Sectors without data: {total_sectors - len(SECTORS_WITH_DATA)}")
    return results
async def generate_summary_report(db_url: str | None = None) -> dict:
    """Build a JSON-serializable summary for every sector with data.

    Sectors are validated quietly (nothing is printed); sectors for which
    validation yields no result are left out.

    Args:
        db_url: Database URL forwarded unchanged to ``validate_sector``.

    Returns:
        Mapping of sector code to a dict with ``span_count``,
        ``enabled_coverage`` (rounded to 3 decimals), ``recommended_enables``,
        ``weight_issues``, ``config_version`` and ``validated_at``.
    """
    results = {}
    # SECTORS_WITH_DATA is a set of strings, whose iteration order changes
    # between interpreter runs; sort it so the report's key order is stable.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=False)
        if result:
            results[sector] = {
                "span_count": result.span_count,
                "enabled_coverage": round(result.enabled_coverage, 3),
                "recommended_enables": result.recommended_enables,
                "weight_issues": result.weight_issues,
                "config_version": result.config_version,
                "validated_at": result.validated_at,
            }
    return results
def main():
    """Command-line entry point for sector-scoped L1 config validation.

    ``--report`` prints a JSON summary, ``--all`` validates every sector with
    data, and ``--sector`` validates one sector (upper-cased first). With no
    option, the usage text and the list of sectors with data are printed.
    """
    cli = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
    cli.add_argument("--sector", help="Validate specific sector")
    cli.add_argument("--all", action="store_true", help="Validate all sectors with data")
    cli.add_argument("--report", action="store_true", help="Generate JSON summary report")
    cli.add_argument("--db-url", help="Database URL")
    opts = cli.parse_args()

    # Precedence when several options are given: --report, --all, --sector.
    if opts.report:
        summary = asyncio.run(generate_summary_report(opts.db_url))
        print(json.dumps(summary, indent=2))
        return

    if opts.all:
        asyncio.run(validate_all_sectors(opts.db_url))
        return

    if opts.sector:
        asyncio.run(validate_sector(opts.sector.upper(), opts.db_url))
        return

    cli.print_help()
    print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA)))
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()