Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
409
packages/reviewiq-pipeline/scripts/backfill_review_facts.py
Normal file
409
packages/reviewiq-pipeline/scripts/backfill_review_facts.py
Normal file
@@ -0,0 +1,409 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill review_facts_v1 from public.jobs.reviews_data.
|
||||
|
||||
Parses relative timestamps ("17 hours ago", "2 weeks ago") into absolute
|
||||
timestamps anchored to job.created_at.
|
||||
|
||||
Usage:
|
||||
python backfill_review_facts.py
|
||||
python backfill_review_facts.py --dry-run
|
||||
python backfill_review_facts.py --job-id <uuid>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
|
||||
# Database connection string. The DATABASE_URL environment variable wins;
# otherwise a Postgres instance on localhost:5437 is used.
# NOTE(review): the fallback embeds a username and password in source --
# confirm these are throwaway development credentials.
_DEFAULT_DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
DB_URL = os.getenv("DATABASE_URL", _DEFAULT_DB_URL)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# RELATIVE TIMESTAMP PARSER
|
||||
# =============================================================================
|
||||
|
||||
# Known relative-timestamp shapes as (regex, label) pairs.
# NOTE(review): nothing in the visible part of this file reads this table;
# parse_relative_timestamp carries its own copy of the "standard" pattern.
RELATIVE_PATTERNS = [
    (
        # "17 hours ago", "2 weeks ago", "a month ago", optionally "edited ..."
        r"(?:edited\s+)?(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago",
        "standard",
    ),
    (r"just\s+now", "just_now"),
    (r"yesterday", "yesterday"),
    (r"today", "today"),
]
|
||||
|
||||
# Seconds per supported unit. Months and years are fixed-length
# approximations (30 and 365 days), not calendar-aware.
TIME_UNITS = dict(
    second=1,
    minute=60,
    hour=60 * 60,
    day=24 * 60 * 60,
    week=7 * 24 * 60 * 60,
    month=30 * 24 * 60 * 60,
    year=365 * 24 * 60 * 60,
)
|
||||
|
||||
|
||||
def parse_relative_timestamp(raw: str, reference_time: datetime) -> datetime | None:
|
||||
"""
|
||||
Parse a relative timestamp string into an absolute datetime.
|
||||
|
||||
Args:
|
||||
raw: Relative timestamp like "17 hours ago", "Edited 2 weeks ago"
|
||||
reference_time: The reference point (usually job.created_at)
|
||||
|
||||
Returns:
|
||||
Absolute datetime or None if parsing failed
|
||||
"""
|
||||
if not raw:
|
||||
return None
|
||||
|
||||
text = raw.lower().strip()
|
||||
|
||||
# Handle "just now"
|
||||
if "just now" in text:
|
||||
return reference_time
|
||||
|
||||
# Handle "yesterday"
|
||||
if text == "yesterday":
|
||||
return reference_time - timedelta(days=1)
|
||||
|
||||
# Handle "today"
|
||||
if text == "today":
|
||||
return reference_time
|
||||
|
||||
# Handle standard relative format
|
||||
# Remove "edited " prefix if present
|
||||
text = re.sub(r"^edited\s+", "", text)
|
||||
|
||||
# Match "N unit(s) ago"
|
||||
match = re.match(r"(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", text)
|
||||
if match:
|
||||
quantity_str = match.group(1)
|
||||
unit = match.group(2)
|
||||
|
||||
# Convert "a"/"an" to 1
|
||||
if quantity_str in ("a", "an"):
|
||||
quantity = 1
|
||||
else:
|
||||
quantity = int(quantity_str)
|
||||
|
||||
seconds = quantity * TIME_UNITS.get(unit, 0)
|
||||
return reference_time - timedelta(seconds=seconds)
|
||||
|
||||
# Unknown format
|
||||
return None
|
||||
|
||||
|
||||
def parse_relative_timestamp_safe(raw: str, reference_time: datetime) -> tuple[datetime | None, bool]:
    """
    Non-raising wrapper around parse_relative_timestamp.

    Returns:
        (parsed_time, success). success is False both when the parser
        returns None and when it raises; parsed_time is None in either case.
    """
    try:
        parsed = parse_relative_timestamp(raw, reference_time)
    except Exception:
        # Best-effort: any parser error is reported as a failed parse.
        return None, False
    return parsed, parsed is not None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# BACKFILL LOGIC
|
||||
# =============================================================================
|
||||
|
||||
async def get_jobs_with_reviews(pool: asyncpg.Pool, job_id: str | None = None) -> list[dict]:
    """Fetch jobs whose reviews_data is a non-null JSONB array.

    Args:
        pool: Pool used to run the query.
        job_id: When truthy, only this job is looked up; otherwise every
            matching job is returned, newest created_at first.

    Returns:
        One plain dict per row with job_id, created_at, reviews_data and
        business_id (metadata business_name, falling back to url).
    """
    if not job_id:
        records = await pool.fetch("""
            SELECT job_id, created_at, reviews_data,
                   COALESCE(metadata->>'business_name', url) as business_id
            FROM public.jobs
            WHERE reviews_data IS NOT NULL
              AND jsonb_typeof(reviews_data) = 'array'
            ORDER BY created_at DESC
        """)
    else:
        records = await pool.fetch("""
            SELECT job_id, created_at, reviews_data,
                   COALESCE(metadata->>'business_name', url) as business_id
            FROM public.jobs
            WHERE job_id = $1
              AND reviews_data IS NOT NULL
              AND jsonb_typeof(reviews_data) = 'array'
        """, job_id)

    return [dict(record) for record in records]
|
||||
|
||||
|
||||
async def get_run_id_for_job(pool: asyncpg.Pool, job_id: str) -> str | None:
    """Look up a run_id for a job in pipeline.detected_spans_v2.

    Returns:
        The run_id as a string, or None when the job has no row with a
        non-null (and truthy) run_id. If several run_ids exist, which one
        comes back is left to the database (LIMIT 1 without ORDER BY).
    """
    sql = """
        SELECT DISTINCT run_id FROM pipeline.detected_spans_v2
        WHERE job_id = $1 AND run_id IS NOT NULL
        LIMIT 1
    """
    record = await pool.fetchrow(sql, job_id)
    if not record or not record["run_id"]:
        return None
    return str(record["run_id"])
|
||||
|
||||
|
||||
async def get_language_for_review(pool: asyncpg.Pool, review_id: str) -> str | None:
    """Look up a detected language for a review in pipeline.detected_spans_v2.

    Returns:
        The language of one span row with a non-null language, or None when
        no such row exists.
    """
    sql = """
        SELECT language FROM pipeline.detected_spans_v2
        WHERE review_id = $1 AND language IS NOT NULL
        LIMIT 1
    """
    record = await pool.fetchrow(sql, review_id)
    if not record:
        return None
    return record["language"]
|
||||
|
||||
|
||||
async def upsert_review_facts(
    pool: asyncpg.Pool,
    facts: list[dict],
    dry_run: bool = False,
) -> tuple[int, int]:
    """
    Write review facts to pipeline.review_facts_v1, updating on review_id conflict.

    On conflict every column is overwritten except run_id and language,
    which keep their stored value when the incoming value is NULL.

    Args:
        pool: Pool used to execute the statement.
        facts: Dicts with required keys review_id, business_id, job_id and
            optional keys run_id, rating, review_time_utc, raw_timestamp,
            author, language.
        dry_run: When True nothing is written.

    Returns:
        (count, 0) where count is the number of records sent; inserts and
        updates are not distinguished. (0, 0) for a dry run or empty input.
    """
    if not facts or dry_run:
        return 0, 0

    required_keys = ("review_id", "business_id", "job_id")
    optional_keys = (
        "run_id",
        "rating",
        "review_time_utc",
        "raw_timestamp",
        "author",
        "language",
    )

    upsert_sql = """
        INSERT INTO pipeline.review_facts_v1
        (review_id, business_id, job_id, run_id, rating, review_time_utc, raw_timestamp, author, language)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
        ON CONFLICT (review_id) DO UPDATE SET
            business_id = EXCLUDED.business_id,
            job_id = EXCLUDED.job_id,
            run_id = COALESCE(EXCLUDED.run_id, pipeline.review_facts_v1.run_id),
            rating = EXCLUDED.rating,
            review_time_utc = EXCLUDED.review_time_utc,
            raw_timestamp = EXCLUDED.raw_timestamp,
            author = EXCLUDED.author,
            language = COALESCE(EXCLUDED.language, pipeline.review_facts_v1.language)
    """

    # Positional parameters in column order: required keys must be present,
    # optional keys default to None.
    rows = [
        tuple(fact[key] for key in required_keys)
        + tuple(fact.get(key) for key in optional_keys)
        for fact in facts
    ]

    await pool.executemany(upsert_sql, rows)
    return len(rows), 0
|
||||
|
||||
|
||||
async def backfill_job(
    pool: asyncpg.Pool,
    job: dict,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Backfill review facts for a single job.

    Each review's relative timestamp is resolved against job.created_at
    (treated as UTC when it carries no tzinfo) and the resulting facts are
    upserted in one batch.

    Args:
        pool: Pool used for the lookups and the upsert.
        job: Row dict with job_id, created_at, business_id and reviews_data.
        dry_run: When True, skips the per-review language lookup and the
            upsert; the timestamp statistics are still computed.
        verbose: When True, prints a per-job summary.

    Returns:
        Stats dict with job_id, total_reviews, parsed_ok, parsed_failed,
        inserted and up to five sample_failures. Entries that are skipped
        (undecodable, not a JSON object, or without review_id) count towards
        total_reviews only.
    """
    job_id = job["job_id"]
    job_created = job["created_at"]
    business_id = job["business_id"]
    reviews_data = job["reviews_data"]

    # asyncpg may return JSONB as string
    if isinstance(reviews_data, str):
        reviews_data = json.loads(reviews_data)

    # Make job_created timezone-aware if it isn't
    if job_created.tzinfo is None:
        job_created = job_created.replace(tzinfo=timezone.utc)

    # Get run_id for this job
    run_id = await get_run_id_for_job(pool, str(job_id))

    stats = {
        "job_id": str(job_id),
        "total_reviews": 0,
        "parsed_ok": 0,
        "parsed_failed": 0,
        "inserted": 0,
        "sample_failures": [],
    }

    facts = []

    for review in reviews_data:
        stats["total_reviews"] += 1

        # Handle both dict and JSON string
        if isinstance(review, str):
            try:
                review = json.loads(review)
            except json.JSONDecodeError:
                continue

        # A JSON null/number/list (or a string that decodes to one) has no
        # .get(); skip it instead of aborting the whole backfill.
        if not isinstance(review, dict):
            continue

        review_id = review.get("review_id")
        if not review_id:
            continue

        raw_timestamp = review.get("timestamp", "")
        review_time, success = parse_relative_timestamp_safe(raw_timestamp, job_created)

        if success:
            stats["parsed_ok"] += 1
        else:
            stats["parsed_failed"] += 1
            if len(stats["sample_failures"]) < 5:
                stats["sample_failures"].append(raw_timestamp)

        # Get language from spans if available
        language = await get_language_for_review(pool, review_id) if not dry_run else None

        facts.append({
            "review_id": review_id,
            "business_id": business_id,
            "job_id": job_id,
            "run_id": run_id,
            "rating": review.get("rating"),
            "review_time_utc": review_time,
            "raw_timestamp": raw_timestamp,
            "author": review.get("author"),
            "language": language,
        })

    # Upsert
    inserted, _ = await upsert_review_facts(pool, facts, dry_run=dry_run)
    stats["inserted"] = inserted

    if verbose:
        print(f" Job {job_id}: {stats['total_reviews']} reviews, "
              f"{stats['parsed_ok']} parsed OK, {stats['parsed_failed']} failed")
        if stats["sample_failures"]:
            print(f" Sample failures: {stats['sample_failures'][:3]}")

    return stats
|
||||
|
||||
|
||||
async def backfill_all(
    pool: asyncpg.Pool,
    job_id: str | None = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Run the backfill over every job with reviews (or only job_id when given).

    Returns:
        Aggregate stats: jobs_processed, the summed total_reviews /
        parsed_ok / parsed_failed / inserted counters, and
        unique_failure_patterns (at most 20 distinct unparsed timestamp
        strings collected from the per-job samples, in no particular order).
    """
    jobs = await get_jobs_with_reviews(pool, job_id)
    job_count = len(jobs)

    banner_prefix = "[DRY RUN] " if dry_run else ""
    print(f"\n{banner_prefix}Backfilling review_facts_v1 from {job_count} jobs...")

    counter_names = ("total_reviews", "parsed_ok", "parsed_failed", "inserted")
    totals = dict.fromkeys(counter_names, 0)
    failure_patterns = set()
    jobs_done = 0

    for position, job in enumerate(jobs, start=1):
        if verbose:
            print(f"\n[{position}/{job_count}] Processing job {job['job_id']}...")

        job_stats = await backfill_job(pool, job, dry_run=dry_run, verbose=verbose)

        jobs_done += 1
        for name in counter_names:
            totals[name] += job_stats[name]
        failure_patterns.update(job_stats["sample_failures"])

    # The set is turned into a (capped) list so the result is JSON-serialisable.
    return {
        "jobs_processed": jobs_done,
        **totals,
        "unique_failure_patterns": list(failure_patterns)[:20],
    }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI
|
||||
# =============================================================================
|
||||
|
||||
async def main_async(args):
    """Open a pool on DB_URL, run the backfill and print the summary report.

    Args:
        args: Parsed CLI namespace; job_id, dry_run and verbose are read.

    The pool is closed even when the backfill raises.
    """
    pool = await asyncpg.create_pool(DB_URL)

    try:
        stats = await backfill_all(
            pool,
            job_id=args.job_id,
            dry_run=args.dry_run,
            verbose=args.verbose,
        )

        rule = "=" * 60
        # Guard the percentage maths against an empty run.
        denominator = max(stats['total_reviews'], 1)
        ok_pct = stats['parsed_ok'] / denominator * 100
        failed_pct = stats['parsed_failed'] / denominator * 100

        print("\n" + rule)
        print("BACKFILL COMPLETE")
        print(rule)
        print(f"Jobs processed: {stats['jobs_processed']}")
        print(f"Total reviews: {stats['total_reviews']}")
        print(f"Timestamps parsed: {stats['parsed_ok']} ({ok_pct:.1f}%)")
        print(f"Timestamps failed: {stats['parsed_failed']} ({failed_pct:.1f}%)")
        if not args.dry_run:
            print(f"Records upserted: {stats['inserted']}")

        patterns = stats["unique_failure_patterns"]
        if patterns:
            print(f"\nUnparsed timestamp patterns ({len(patterns)}):")
            for pattern in patterns[:10]:
                print(f' - "{pattern}"')

        # Coverage is the parsed share of all reviews; below 90% is flagged.
        coverage = ok_pct
        if coverage >= 90:
            print(f"\n✅ Timestamp coverage: {coverage:.1f}%")
        else:
            print(f"\n⚠️ WARNING: Timestamp coverage is {coverage:.1f}% (target: >90%)")

    finally:
        await pool.close()
|
||||
|
||||
|
||||
def main():
    """Parse the command-line flags and run the async backfill to completion."""
    cli = argparse.ArgumentParser(description="Backfill review_facts_v1")
    cli.add_argument("--job-id", help="Process a specific job only")
    cli.add_argument("--dry-run", action="store_true", help="Don't write to database")
    cli.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    asyncio.run(main_async(cli.parse_args()))


if __name__ == "__main__":
    main()
|
||||
226
packages/reviewiq-pipeline/scripts/config_resolver_standalone.py
Normal file
226
packages/reviewiq-pipeline/scripts/config_resolver_standalone.py
Normal file
@@ -0,0 +1,226 @@
|
||||
"""
|
||||
Config Resolver - Standalone version for scripts.
|
||||
|
||||
Resolves L1 config + sector brief for classification.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Data locations, all under "<parent of this file's directory>/data".
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
_PRIMITIVE_CONFIGS_DIR = DATA_DIR.joinpath("primitive_configs")
CONFIGS_DIR = _PRIMITIVE_CONFIGS_DIR.joinpath("l1")
L2_CONFIGS_DIR = _PRIMITIVE_CONFIGS_DIR.joinpath("l2")
BRIEFS_DIR = DATA_DIR.joinpath("sector_briefs")
|
||||
|
||||
# Meta primitives: always added to the enabled set, whatever the sector
# config says.
META_PRIMITIVES = frozenset({
    "HONESTY",
    "ETHICS",
    "PROMISES",
    "ACKNOWLEDGMENT",
    "RESPONSE_QUALITY",
    "RECOVERY",
    "RETURN_INTENT",
    "RECOMMEND",
    "RECOGNITION",
    "UNMAPPED",
})
|
||||
|
||||
# Core primitives as (code, domain letter, display name, definition) rows.
_CORE_PRIMITIVE_ROWS = (
    ("TASTE", "O", "Taste/Flavor", "Sensory quality of food/beverage"),
    ("CRAFT", "O", "Craftsmanship", "Skill of execution/preparation"),
    ("FRESHNESS", "O", "Freshness", "Newness, not stale or old"),
    ("TEMPERATURE", "O", "Temperature", "Hot/cold as expected"),
    ("EFFECTIVENESS", "O", "Effectiveness", "Achieves intended purpose"),
    ("ACCURACY", "O", "Accuracy", "Correct, as ordered/specified"),
    ("CONDITION", "O", "Condition", "Physical state, wear, damage"),
    ("CONSISTENCY", "O", "Consistency", "Same quality each time"),
    ("MANNER", "P", "Manner/Attitude", "Friendliness, respect, warmth"),
    ("COMPETENCE", "P", "Competence", "Knowledge and skill of staff"),
    ("ATTENTIVENESS", "P", "Attentiveness", "Being present, responsive"),
    ("COMMUNICATION", "P", "Communication", "Clarity, listening, updates"),
    ("SPEED", "J", "Speed/Wait", "Time to service, waiting"),
    ("FRICTION", "J", "Friction", "Obstacles, hassles, complexity"),
    ("RELIABILITY", "J", "Reliability", "Dependable, keeps promises"),
    ("AVAILABILITY", "J", "Availability", "Open when needed, bookable"),
    ("CLEANLINESS", "E", "Cleanliness", "Hygiene, tidiness"),
    ("COMFORT", "E", "Comfort", "Physical ease, seating"),
    ("SAFETY", "E", "Safety", "Free from harm/danger"),
    ("AMBIANCE", "E", "Ambiance", "Atmosphere, mood, vibe"),
    ("ACCESSIBILITY", "E", "Accessibility", "Easy to reach, navigate"),
    ("DIGITAL_UX", "E", "Digital Experience", "Website, app, online"),
    ("PRICE_LEVEL", "V", "Price Level", "Absolute cost (cheap/expensive)"),
    ("PRICE_FAIRNESS", "V", "Price Fairness", "Reasonable for what you get"),
    ("PRICE_TRANSPARENCY", "V", "Price Transparency", "No hidden fees, clear pricing"),
    ("VALUE_FOR_MONEY", "V", "Value for Money", "Worth what you paid"),
)

# Core primitives dictionary: code -> {"domain", "name", "def"}.
CORE_PRIMITIVES = {
    code: {"domain": domain, "name": name, "def": definition}
    for code, domain, name, definition in _CORE_PRIMITIVE_ROWS
}
|
||||
|
||||
|
||||
class ConfigResolver:
|
||||
"""Resolves classification config for a business."""
|
||||
|
||||
def __init__(self):
|
||||
self._l1_cache: dict[str, dict] = {}
|
||||
self._l2_cache: dict[str, dict] = {}
|
||||
self._brief_cache: dict[str, dict] = {}
|
||||
|
||||
def _load_l2_configs(self) -> list[dict[str, Any]]:
|
||||
"""Load all L2 config files."""
|
||||
if not L2_CONFIGS_DIR.exists():
|
||||
return []
|
||||
|
||||
configs = []
|
||||
for config_path in L2_CONFIGS_DIR.glob("*_config.json"):
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
configs.append(config)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load L2 config {config_path}: {e}")
|
||||
return configs
|
||||
|
||||
def _find_matching_l2(self, gbp_path: str) -> dict[str, Any] | None:
|
||||
"""Find L2 config that matches the GBP path (most specific wins)."""
|
||||
l2_configs = self._load_l2_configs()
|
||||
|
||||
# Find all matching configs (path starts with L2 gbp_path)
|
||||
matches = []
|
||||
for config in l2_configs:
|
||||
l2_path = config.get("gbp_path", "")
|
||||
if gbp_path.startswith(l2_path) or gbp_path == l2_path:
|
||||
matches.append((len(l2_path), config))
|
||||
|
||||
if not matches:
|
||||
return None
|
||||
|
||||
# Return most specific match (longest path)
|
||||
matches.sort(key=lambda x: x[0], reverse=True)
|
||||
return matches[0][1]
|
||||
|
||||
def _apply_l2_delta(self, l1_config: dict, l2_config: dict) -> dict:
|
||||
"""Apply L2 delta to L1 config."""
|
||||
result = l1_config.copy()
|
||||
delta = l2_config.get("delta", {})
|
||||
|
||||
# Enable additional primitives
|
||||
if "enable" in delta:
|
||||
enabled = set(result.get("enabled", []))
|
||||
enabled.update(delta["enable"])
|
||||
result["enabled"] = list(enabled)
|
||||
|
||||
# Merge weights
|
||||
if "weights" in delta:
|
||||
weights = dict(result.get("weights", {}))
|
||||
weights.update(delta["weights"])
|
||||
result["weights"] = weights
|
||||
|
||||
# Update config version to indicate L2
|
||||
result["config_version"] = l2_config.get("config_version", result.get("config_version", "1.0"))
|
||||
result["l2_applied"] = l2_config.get("gbp_path")
|
||||
|
||||
return result
|
||||
|
||||
def _load_l1_config(self, sector_code: str) -> dict[str, Any] | None:
|
||||
if sector_code in self._l1_cache:
|
||||
return self._l1_cache[sector_code]
|
||||
|
||||
config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
|
||||
if not config_path.exists():
|
||||
return None
|
||||
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
|
||||
self._l1_cache[sector_code] = config
|
||||
return config
|
||||
|
||||
def _load_sector_brief(self, sector_code: str) -> dict[str, Any] | None:
|
||||
if sector_code in self._brief_cache:
|
||||
return self._brief_cache[sector_code]
|
||||
|
||||
brief_path = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
|
||||
if not brief_path.exists():
|
||||
return None
|
||||
|
||||
with open(brief_path) as f:
|
||||
brief = json.load(f)
|
||||
|
||||
self._brief_cache[sector_code] = brief
|
||||
return brief
|
||||
|
||||
async def get_business_mapping(self, pool, business_id: str) -> dict[str, Any] | None:
|
||||
query = """
|
||||
SELECT business_id, gbp_path::text, sector_code
|
||||
FROM pipeline.business_taxonomy_map
|
||||
WHERE business_id = $1
|
||||
"""
|
||||
row = await pool.fetchrow(query, business_id)
|
||||
return dict(row) if row else None
|
||||
|
||||
def resolve_enabled_set(self, l1_config: dict) -> set[str]:
|
||||
enabled = set(l1_config.get("enabled", []))
|
||||
enabled.update(META_PRIMITIVES)
|
||||
return enabled
|
||||
|
||||
def build_primitives_for_prompt(self, enabled: set[str], weights: dict[str, float]) -> dict[str, dict]:
|
||||
result = {}
|
||||
for prim in enabled:
|
||||
if prim in CORE_PRIMITIVES:
|
||||
entry = CORE_PRIMITIVES[prim].copy()
|
||||
if prim in weights:
|
||||
entry["weight"] = weights[prim]
|
||||
result[prim] = entry
|
||||
elif prim in META_PRIMITIVES:
|
||||
result[prim] = {"domain": "M", "name": prim.replace("_", " ").title(), "meta": True}
|
||||
return result
|
||||
|
||||
def extract_brief_signals(self, brief: dict) -> dict[str, Any]:
|
||||
if not brief:
|
||||
return {}
|
||||
return {
|
||||
"sector": brief.get("sector_code"),
|
||||
"what_customers_judge": brief.get("what_customers_judge"),
|
||||
"critical_pain_points": brief.get("critical_pain_points"),
|
||||
"industry_terminology": brief.get("industry_terminology"),
|
||||
}
|
||||
|
||||
async def resolve(self, business_id: str, pool, mode: str | None = None) -> dict[str, Any] | None:
|
||||
mapping = await self.get_business_mapping(pool, business_id)
|
||||
if not mapping:
|
||||
return None
|
||||
|
||||
sector_code = mapping["sector_code"]
|
||||
gbp_path = mapping["gbp_path"]
|
||||
|
||||
# Load L1 config (sector-level)
|
||||
l1_config = self._load_l1_config(sector_code)
|
||||
if not l1_config:
|
||||
l1_config = {"enabled": list(CORE_PRIMITIVES.keys()), "weights": {}}
|
||||
|
||||
# Check for L2 config (category-level delta)
|
||||
l2_config = self._find_matching_l2(gbp_path)
|
||||
if l2_config:
|
||||
logger.info(f"Applying L2 delta for {gbp_path}: {l2_config.get('gbp_path')}")
|
||||
l1_config = self._apply_l2_delta(l1_config, l2_config)
|
||||
|
||||
brief = self._load_sector_brief(sector_code)
|
||||
|
||||
enabled = self.resolve_enabled_set(l1_config)
|
||||
weights = dict(l1_config.get("weights", {}))
|
||||
primitives = self.build_primitives_for_prompt(enabled, weights)
|
||||
brief_signals = self.extract_brief_signals(brief)
|
||||
|
||||
return {
|
||||
"business_id": business_id,
|
||||
"gbp_path": gbp_path,
|
||||
"sector_code": sector_code,
|
||||
"config_version": l1_config.get("config_version", "1.0"),
|
||||
"l2_applied": l1_config.get("l2_applied"),
|
||||
"modes": [mode] if mode else ["in_person"],
|
||||
"default_mode": mode or "in_person",
|
||||
"enabled_primitives": sorted(enabled),
|
||||
"disabled_primitives": sorted(l1_config.get("disabled", [])),
|
||||
"weights": weights,
|
||||
"brief": brief_signals,
|
||||
"primitives": primitives,
|
||||
}
|
||||
148
packages/reviewiq-pipeline/scripts/fix_l1_configs.py
Normal file
148
packages/reviewiq-pipeline/scripts/fix_l1_configs.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix L1 configs based on validation results.
|
||||
|
||||
Applies fixes discovered during validation:
|
||||
1. Enable primitives that were disabled but appearing frequently
|
||||
2. Remove weights for primitives with zero appearances
|
||||
3. Add weights for high-frequency unweighted primitives
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Directory holding the per-sector L1 config JSON files.
CONFIGS_DIR = Path(__file__).parent.parent.joinpath("data", "primitive_configs", "l1")
|
||||
|
||||
# Primitives to enable per sector, based on validation results.
_ENABLE_BY_SECTOR = {
    "ENTERTAINMENT": ["CRAFT", "CONSISTENCY", "COMMUNICATION", "FRICTION"],
    "FOOD_DINING": ["PRICE_LEVEL", "ACCESSIBILITY", "PRICE_TRANSPARENCY", "FRICTION", "EFFECTIVENESS"],
    "AUTOMOTIVE": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL", "AMBIANCE"],
    "HEALTHCARE": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
    "RETAIL_SHOPPING": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
    "HOSPITALITY_TRAVEL": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL"],
    "PERSONAL_SERVICES": ["PRICE_LEVEL", "SPEED", "FRICTION"],
}

# Weights to drop per sector; sectors not listed drop none.
_REMOVE_WEIGHT_BY_SECTOR = {
    "ENTERTAINMENT": ["CONDITION"],  # 0 appearances despite 1.4x weight
}

# Fixes based on validation results
# Format: { sector: { "enable": [primitives], "disable": [primitives], "add_weight": {prim: weight}, "remove_weight": [prims] } }
FIXES = {
    sector: {
        "enable": enable,
        "disable": [],
        "add_weight": {},
        "remove_weight": list(_REMOVE_WEIGHT_BY_SECTOR.get(sector, [])),
    }
    for sector, enable in _ENABLE_BY_SECTOR.items()
}
|
||||
|
||||
|
||||
def fix_config(sector_code: str, fixes: dict) -> list[str] | None:
    """Apply fixes to a sector config.

    Args:
        sector_code: Sector name; its lower-cased form selects
            "<sector>_config.json" under CONFIGS_DIR.
        fixes: Dict with optional keys "enable" and "disable" (lists of
            primitives), "add_weight" ({primitive: weight}) and
            "remove_weight" (list of primitives).

    Returns:
        None when the config file does not exist, otherwise the list of
        human-readable change descriptions (empty when nothing applied).

    The file is rewritten, and its config_version set to "1.1", only when
    at least one change was made, so a re-run that reports no changes
    leaves the file untouched.

    NOTE(review): the version is set to the literal "1.1" rather than
    incremented -- confirm this cannot downgrade a newer config.
    """
    config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"

    if not config_path.exists():
        print(f" ⚠️ Config not found: {config_path}")
        return None

    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})

    changes: list[str] = []

    # Apply enables (move from disabled to enabled)
    for prim in fixes.get("enable", []):
        if prim in disabled:
            disabled.remove(prim)
            enabled.add(prim)
            changes.append(f"✓ Enabled {prim}")
        elif prim not in enabled:
            enabled.add(prim)
            changes.append(f"✓ Added {prim} to enabled")

    # Apply disables (move from enabled to disabled)
    for prim in fixes.get("disable", []):
        if prim in enabled:
            enabled.remove(prim)
            disabled.add(prim)
            changes.append(f"✗ Disabled {prim}")

    # Add weights (existing weights are never overwritten)
    for prim, weight in fixes.get("add_weight", {}).items():
        if prim not in weights:
            weights[prim] = weight
            changes.append(f"⚖️ Added weight {prim}: {weight}x")

    # Remove weights
    for prim in fixes.get("remove_weight", []):
        if prim in weights:
            del weights[prim]
            changes.append(f"⚖️ Removed weight for {prim}")

    # Nothing applied: leave the file and its version alone.
    if not changes:
        return changes

    # Update config
    config["enabled"] = sorted(enabled)
    config["disabled"] = sorted(disabled)
    config["weights"] = dict(sorted(weights.items()))
    config["config_version"] = "1.1"  # Bump version

    # Save
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
        f.write("\n")

    return changes
|
||||
|
||||
|
||||
def main():
    """Apply every entry of FIXES and print the per-sector changes and a total."""
    rule = "=" * 60
    print(rule)
    print("L1 CONFIG FIXER - Applying validation-based fixes")
    print(rule)

    total_changes = 0

    for sector, sector_fixes in FIXES.items():
        print(f"\n📁 {sector}")
        applied = fix_config(sector, sector_fixes)
        # None (config missing) and an empty list are reported the same way.
        if not applied:
            print(" No changes applied")
            continue
        for line in applied:
            print(f" {line}")
        total_changes += len(applied)

    print(f"\n{rule}")
    print(f"Total changes applied: {total_changes}")
    print("Config version bumped to 1.1")
    print(rule)


if __name__ == "__main__":
    main()
|
||||
238
packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
Normal file
238
packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Guarded L1 Config Fixer - V2 (Threshold-based, Sector-scoped)
|
||||
|
||||
Only applies fixes when:
|
||||
1. Evidence is from sector-scoped validation
|
||||
2. Frequency exceeds threshold (default 3%)
|
||||
3. Changes are logged with version bump
|
||||
|
||||
Usage:
|
||||
python fix_l1_configs_v2.py --apply # Apply fixes from validation
|
||||
python fix_l1_configs_v2.py --dry-run # Show what would change
|
||||
python fix_l1_configs_v2.py --revert SECTOR # Revert to previous version
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Directory holding the per-sector L1 config JSON files, and the changelog
# file kept alongside them.
CONFIGS_DIR = Path(__file__).parent.parent.joinpath("data", "primitive_configs", "l1")
CHANGELOG_FILE = CONFIGS_DIR.joinpath("CHANGELOG.json")

# Minimum threshold for auto-enabling (% of sector spans)
ENABLE_THRESHOLD_PCT = 3.0
|
||||
|
||||
# Fixes derived from sector-scoped validation (validate_l1_configs_v2.py output)
# These are the ONLY fixes that should be applied
#
# Per sector:
#   evidence      - free-text description of the data behind the fixes
#   enable        - (primitive, frequency %, reason) tuples
#   add_weight    - (primitive, weight, reason) tuples
#   remove_weight - currently empty for every sector
SECTOR_SCOPED_FIXES = {
    "ENTERTAINMENT": dict(
        evidence="2,320 spans from Go Karts + Soho Club",
        enable=[
            ("TASTE", 4.3, "Entertainment venues have concessions/food service"),
        ],
        add_weight=[
            ("CRAFT", 1.3, "13.4% frequency but unweighted"),
        ],
        remove_weight=[],
    ),
    "FOOD_DINING": dict(
        evidence="61 spans from Fika cafe",
        enable=[
            ("COMFORT", 9.8, "Seating/atmosphere comfort matters for cafes"),
        ],
        add_weight=[
            ("AVAILABILITY", 1.2, "16.4% frequency but unweighted"),
        ],
        # Note: Small sample size (61 spans) - these may be false negatives
        # Keep weights but flag for review with more data
        remove_weight=[],
    ),
    "AUTOMOTIVE": dict(
        evidence="1,201 spans from ClickRent car rental",
        enable=[],  # Nothing exceeds 3% threshold
        add_weight=[],
        # CONDITION, HONESTY, PROMISES, RECOVERY all have 0 appearances
        # However, may be specific to rental vs repair - keep for now
        remove_weight=[],
    ),
}
|
||||
|
||||
|
||||
def load_changelog() -> list[dict]:
    """Return the parsed contents of CHANGELOG_FILE, or [] when it does not exist."""
    if not CHANGELOG_FILE.exists():
        return []
    with CHANGELOG_FILE.open() as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def save_changelog(entries: list[dict]) -> None:
    """Write entries to CHANGELOG_FILE as indented JSON with a trailing newline.

    The parent directory is created first if it is missing.
    """
    target_dir = CHANGELOG_FILE.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with CHANGELOG_FILE.open("w") as handle:
        json.dump(entries, handle, indent=2)
        handle.write("\n")
|
||||
|
||||
|
||||
def load_config(sector_code: str) -> dict[str, Any] | None:
    """Return the parsed "<sector>_config.json" for a sector, or None if the file is missing.

    The file name uses the lower-cased sector_code and is looked up under
    CONFIGS_DIR.
    """
    filename = f"{sector_code.lower()}_config.json"
    source = CONFIGS_DIR / filename
    if source.exists():
        with source.open() as handle:
            return json.load(handle)
    return None
|
||||
|
||||
|
||||
def save_config(sector_code: str, config: dict[str, Any]) -> None:
    """Write config to the sector's "<sector>_config.json" under CONFIGS_DIR.

    Output is indented JSON followed by a newline; an existing file is
    overwritten.
    """
    filename = f"{sector_code.lower()}_config.json"
    with (CONFIGS_DIR / filename).open("w") as handle:
        json.dump(config, handle, indent=2)
        handle.write("\n")
|
||||
|
||||
|
||||
def _bump_minor_version(version: Any) -> str:
    """Return *version* with its last dot-separated numeric component incremented."""
    text = str(version)
    head, sep, tail = text.rpartition(".")
    if sep and tail.isdecimal():
        return f"{head}.{int(tail) + 1}"
    # No numeric trailing component (e.g. "2"): start a minor counter.
    return f"{text}.1"


def apply_fixes(sector_code: str, fixes: dict, dry_run: bool = False) -> list[str]:
    """Apply sector-scoped fixes to a sector config.

    Args:
        sector_code: Sector whose config is loaded via ``load_config``.
        fixes: Mapping with optional keys ``"evidence"`` (str), ``"enable"``
            (iterable of ``(primitive, pct, reason)``), ``"add_weight"``
            (iterable of ``(primitive, weight, reason)``) and
            ``"remove_weight"`` (iterable of ``(primitive, reason)``).
        dry_run: When true, only report what would change; nothing is written.

    Returns:
        Human-readable messages: one per change or skipped candidate, plus a
        version line when the config was actually saved. A single
        "no changes" or "config not found" message otherwise.
    """
    config = load_config(sector_code)
    if not config:
        return [f"❌ Config not found for {sector_code}"]

    # "or" guards against keys that are present but null in the JSON file.
    enabled = set(config.get("enabled") or [])
    disabled = set(config.get("disabled") or [])
    weights = dict(config.get("weights") or {})

    changes = []
    # Tracks real modifications separately from informational SKIP messages,
    # so a run that only skips candidates does not bump the version.
    modified = False
    evidence = fixes.get("evidence", "unknown")

    # Enable primitives
    for prim, pct, reason in fixes.get("enable", []):
        if pct < ENABLE_THRESHOLD_PCT:
            changes.append(f"⚠️ SKIP {prim}: {pct:.1f}% below {ENABLE_THRESHOLD_PCT}% threshold")
            continue

        if prim in disabled:
            disabled.remove(prim)
            enabled.add(prim)
            changes.append(f"✓ ENABLE {prim}: {pct:.1f}% in sector data ({reason})")
            modified = True
        elif prim not in enabled:
            enabled.add(prim)
            changes.append(f"✓ ADD {prim}: {pct:.1f}% in sector data ({reason})")
            modified = True

    # Add weights (existing weights are never overwritten)
    for prim, weight, reason in fixes.get("add_weight", []):
        if prim not in weights:
            weights[prim] = weight
            changes.append(f"⚖️ WEIGHT {prim}: {weight}x ({reason})")
            modified = True

    # Remove weights
    for prim, reason in fixes.get("remove_weight", []):
        if prim in weights:
            del weights[prim]
            changes.append(f"⚖️ UNWEIGHT {prim}: ({reason})")
            modified = True

    if not changes:
        return ["✓ No changes needed"]

    if modified and not dry_run:
        # Bump version
        old_version = config.get("config_version", "1.0")
        new_version = _bump_minor_version(old_version)
        # One timestamp so the config and its changelog entry agree.
        now = datetime.now(timezone.utc).isoformat()

        config["enabled"] = sorted(enabled)
        config["disabled"] = sorted(disabled)
        config["weights"] = dict(sorted(weights.items()))
        config["config_version"] = new_version
        config["config_updated_at"] = now

        save_config(sector_code, config)

        # Log to changelog
        changelog = load_changelog()
        changelog.append({
            "sector": sector_code,
            "version": new_version,
            "previous_version": old_version,
            "timestamp": now,
            "evidence": evidence,
            "changes": list(changes),
        })
        save_changelog(changelog)

        changes.append(f"📝 Version: {old_version} → {new_version}")

    return changes
|
||||
|
||||
|
||||
def revert_config(sector_code: str, to_version: str | None = None) -> list[str]:
    """Report on reverting a sector config to an earlier version.

    Reverting is not implemented yet: the function only checks whether the
    changelog holds any entry for *sector_code* and returns a one-element
    message list either way. *to_version* is accepted but currently unused.
    """
    history = list(filter(lambda entry: entry["sector"] == sector_code, load_changelog()))
    if len(history) == 0:
        return [f"❌ No changelog entries for {sector_code}"]

    # TODO: Implement actual revert by storing full config snapshots
    return ["⚠️ Revert not yet implemented - manual restore required"]
|
||||
|
||||
|
||||
def main():
    """Command-line entry point of the guarded L1 config fixer.

    Modes, checked in this order: ``--show-changelog`` prints the changelog as
    JSON; ``--revert SECTOR`` prints the result of ``revert_config``;
    ``--apply`` / ``--dry-run`` run ``apply_fixes`` for one sector
    (``--sector``) or for every sector in ``SECTOR_SCOPED_FIXES``. With no
    mode flag the help text is printed.
    """
    parser = argparse.ArgumentParser(description="Guarded L1 config fixer")
    parser.add_argument("--apply", action="store_true", help="Apply sector-scoped fixes")
    parser.add_argument("--dry-run", action="store_true", help="Show what would change")
    parser.add_argument("--revert", metavar="SECTOR", help="Revert sector to previous version")
    parser.add_argument("--sector", help="Apply to specific sector only")
    parser.add_argument("--show-changelog", action="store_true", help="Show changelog")

    args = parser.parse_args()

    if args.show_changelog:
        print(json.dumps(load_changelog(), indent=2))
        return

    if args.revert:
        for line in revert_config(args.revert.upper()):
            print(line)
        return

    if not (args.apply or args.dry_run):
        parser.print_help()
        return

    rule = "=" * 60
    print(rule)
    print(f"L1 CONFIG FIXER V2 - {'DRY RUN' if args.dry_run else 'APPLYING FIXES'}")
    print(f"Threshold: {ENABLE_THRESHOLD_PCT}%")
    print(rule)

    if args.sector:
        targets = [args.sector.upper()]
    else:
        targets = SECTOR_SCOPED_FIXES.keys()

    for sector in targets:
        if sector not in SECTOR_SCOPED_FIXES:
            print(f"\n⚠️ {sector}: No sector-scoped fixes defined")
            continue

        sector_fixes = SECTOR_SCOPED_FIXES[sector]
        print(f"\n📁 {sector}")
        print(f" Evidence: {sector_fixes['evidence']}")

        for change in apply_fixes(sector, sector_fixes, dry_run=args.dry_run):
            print(f" {change}")

    print("\n" + rule)
    if args.dry_run:
        print("DRY RUN - No changes applied")
    else:
        print("Fixes applied - see CHANGELOG.json for history")
    print(rule)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
372
packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
Normal file
372
packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
Normal file
@@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Wave 0: Sector Brief Generator
|
||||
|
||||
Generates alignment context briefs for each sector.
|
||||
These briefs inform Wave 1 and Wave 2 primitive config generation.
|
||||
|
||||
Usage:
|
||||
python generate_sector_briefs.py # Generate all sectors
|
||||
python generate_sector_briefs.py --sector FOOD_DINING # Generate one sector
|
||||
python generate_sector_briefs.py --dry-run # Show what would be generated
|
||||
python generate_sector_briefs.py --validate # Validate existing briefs
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
print("ERROR: openai package required. Install with: pip install openai")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Prompt sent to the model for each sector. It is filled with str.format, so
# literal braces of the JSON example are doubled ({{ }}); the placeholders are
# {sector_code}, {sector_name}, {description} and {business_types}.
PROMPT_TEMPLATE = '''You are an expert in customer experience analysis across industries.

Your task: Generate a **sector brief** for the "{sector_name}" sector.

This brief will be used to align classification agents with industry-specific context.
It describes what customers care about — NOT how to classify, NOT what primitives to use.

## Sector Information

- **Code**: {sector_code}
- **Name**: {sector_name}
- **Description**: {description}
- **Sample Business Types**: {business_types}

## Output Requirements

Generate a JSON object with this exact structure:

```json
{{
  "sector_code": "{sector_code}",
  "sector_name": "{sector_name}",
  "generated_at": "<ISO timestamp>",
  "version": "1.0",

  "what_customers_judge": {{
    "description": "The primary dimensions customers evaluate in this sector",
    "items": [
      {{
        "aspect": "string (2-5 words)",
        "importance": "critical | high | moderate",
        "why_it_matters": "string (1 sentence)"
      }}
    ]
  }},

  "critical_pain_points": {{
    "description": "What damages reputation most severely",
    "items": [
      {{
        "pain_point": "string (2-5 words)",
        "typical_language": ["phrases customers actually use in reviews"],
        "reputation_impact": "severe | significant | moderate"
      }}
    ]
  }},

  "common_praise": {{
    "description": "What earns customer loyalty and positive reviews",
    "items": [
      {{
        "praise_area": "string (2-5 words)",
        "typical_language": ["phrases customers actually use in reviews"],
        "loyalty_impact": "high | moderate"
      }}
    ]
  }},

  "industry_terminology": {{
    "description": "Domain-specific vocabulary",
    "staff_terms": ["terms for staff roles in this sector"],
    "product_terms": ["terms for products/services"],
    "process_terms": ["terms for processes/interactions"],
    "quality_terms": ["positive quality descriptors"],
    "problem_terms": ["negative quality descriptors"]
  }},

  "mode_specific_concerns": {{
    "description": "Different service modes have different priorities",
    "modes": [
      {{
        "mode": "string (e.g., 'In-person', 'Online', 'Phone')",
        "primary_concerns": ["top concerns for this mode"],
        "unique_pain_points": ["pain points specific to this mode"]
      }}
    ]
  }},

  "what_is_actionable": {{
    "description": "Feedback businesses can act on",
    "actionable_examples": [
      {{
        "feedback_type": "string",
        "example": "string (realistic review excerpt)",
        "action_owner": "role/team that can fix it"
      }}
    ],
    "not_actionable_examples": [
      {{
        "feedback_type": "string",
        "example": "string (realistic review excerpt)",
        "why_not_actionable": "string"
      }}
    ]
  }},

  "sector_specific_signals": {{
    "description": "Signals with sector-specific meaning",
    "examples": [
      {{
        "signal": "string (word or phrase)",
        "meaning_in_this_sector": "string",
        "contrast_with": "how it differs in other sectors"
      }}
    ]
  }}
}}
```

## Critical Rules

1. **Use realistic review language** in `typical_language` arrays - actual phrases customers write
2. **Include 4-8 items** per array (not too few, not excessive)
3. **Be sector-specific** - don't use generic phrases that apply to all businesses
4. **Include appropriate modes** - only modes that actually exist in this sector
5. **NO primitive codes, priorities, weights, or solutions**
6. **Focus on WHAT customers care about**, not HOW to classify it

Return ONLY the JSON object, no markdown formatting or explanation.'''
|
||||
|
||||
|
||||
def load_sectors(data_path: Path) -> list[dict]:
    """Load sector definitions from a JSON file.

    Args:
        data_path: Path of a JSON document with a top-level ``"sectors"`` key.

    Returns:
        The value stored under ``"sectors"``.

    Raises:
        KeyError: If the document has no ``"sectors"`` key.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)
    return data["sectors"]
|
||||
|
||||
|
||||
def generate_sector_brief(client: OpenAI, sector: dict, model: str) -> dict:
    """Generate a sector brief using an LLM.

    Args:
        client: OpenAI client used for the chat completion request.
        sector: Sector definition; the keys ``sector_code``, ``sector_name``,
            ``description`` and ``sample_business_types`` are read.
        model: Name of the model to query.

    Returns:
        The parsed brief. ``sector_code``, ``sector_name``, ``generated_at``
        and ``version`` are always overwritten locally, whatever the model
        returned for them.

    Raises:
        ValueError: If the model returns no content, invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or JSON that is
            not an object.
    """
    # Local import: this module only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    prompt = PROMPT_TEMPLATE.format(
        sector_code=sector["sector_code"],
        sector_name=sector["sector_name"],
        description=sector["description"],
        business_types=", ".join(sector["sample_business_types"])
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert customer experience analyst. Return only valid JSON, no markdown."
            },
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        max_tokens=4000,
        response_format={"type": "json_object"}
    )

    content = response.choices[0].message.content
    if not content:
        # Fail with a readable message instead of AttributeError on None.strip().
        raise ValueError(f"Model returned no content for sector {sector['sector_code']}")

    # Parse JSON
    brief = json.loads(content.strip())
    if not isinstance(brief, dict):
        raise ValueError(f"Model returned JSON that is not an object for sector {sector['sector_code']}")

    # Ensure required fields
    brief["sector_code"] = sector["sector_code"]
    brief["sector_name"] = sector["sector_name"]
    # Same "<naive ISO timestamp>Z" format as before, without the deprecated utcnow().
    brief["generated_at"] = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    brief["version"] = "1.0"

    return brief
|
||||
|
||||
|
||||
def validate_brief(brief: dict) -> list[str]:
    """Validate a sector brief and return a list of issues (empty when valid).

    Checks, in order: presence of the seven required top-level sections, item
    counts of the three list-bearing sections, and occurrence of forbidden
    terms anywhere in the serialized brief.

    A section that is not an object, or whose ``items`` is not a list, is
    reported as an issue instead of raising.
    """
    issues = []

    required_keys = [
        "what_customers_judge",
        "critical_pain_points",
        "common_praise",
        "industry_terminology",
        "mode_specific_concerns",
        "what_is_actionable",
        "sector_specific_signals"
    ]

    for key in required_keys:
        if key not in brief:
            issues.append(f"Missing required key: {key}")

    # Check array lengths: (section, minimum items, maximum items or None)
    item_limits = [
        ("what_customers_judge", 3, 10),
        ("critical_pain_points", 3, None),
        ("common_praise", 3, None),
    ]
    for key, minimum, maximum in item_limits:
        if key not in brief:
            continue  # already reported as missing above
        section = brief[key]
        items = section.get("items", []) if isinstance(section, dict) else None
        if not isinstance(items, list):
            issues.append(f"{key} must be an object with an 'items' array")
            continue
        if len(items) < minimum:
            issues.append(f"{key} has only {len(items)} items (need {minimum}+)")
        if maximum is not None and len(items) > maximum:
            issues.append(f"{key} has {len(items)} items (max {maximum})")

    # Check for forbidden content. This is a plain substring search over the
    # whole serialized brief (keys included), so e.g. "lightweight" matches
    # "weight". "solution" is deliberately not checked: it can appear in context.
    text = json.dumps(brief).lower()
    forbidden = ["priority", "weight", "primitive", "enabled", "disabled"]
    for word in forbidden:
        if word in text:
            issues.append(f"Contains potentially forbidden term: {word}")

    return issues
|
||||
|
||||
|
||||
def save_brief(brief: dict, output_dir: Path) -> Path:
    """Write *brief* as indented JSON into *output_dir* and return the file path.

    The directory is created when missing. The file is named
    ``<sector_code lower-cased>_brief.json``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / (brief["sector_code"].lower() + "_brief.json")

    with open(target, "w") as handle:
        json.dump(brief, handle, indent=2)

    return target
|
||||
|
||||
|
||||
def validate_existing_briefs(output_dir: Path) -> None:
    """Validate every ``*_brief.json`` file in *output_dir* and print a report.

    Prints one status line per file (in sorted order), followed by its issues,
    and a final summary line. Returns early with a message when the directory
    is missing or holds no brief files.
    """
    if not output_dir.exists():
        print(f"Output directory does not exist: {output_dir}")
        return

    brief_paths = sorted(output_dir.glob("*_brief.json"))
    if not brief_paths:
        print("No brief files found")
        return

    print(f"Validating {len(brief_paths)} brief files...\n")

    failures = 0
    for path in brief_paths:
        with open(path) as handle:
            loaded = json.load(handle)

        problems = validate_brief(loaded)
        marker = "✗" if problems else "✓"
        print(f"{marker} {path.name}")

        if problems:
            failures += 1
        for problem in problems:
            print(f" - {problem}")

    print()
    print("Some briefs have issues." if failures else "All briefs valid!")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point of the Wave 0 sector brief generator.

    ``--validate`` checks existing briefs and returns. Otherwise the sectors
    are loaded (optionally narrowed by ``--sector``); ``--dry-run`` lists them
    and returns; else one brief per sector is generated, validated (issues are
    printed as warnings only) and saved. Exits with status 1 when the sector is
    unknown, the API key is missing, or any sector failed.
    """
    parser = argparse.ArgumentParser(description="Generate sector briefs for Wave 0")
    parser.add_argument("--sector", help="Generate only this sector code")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--validate", action="store_true", help="Validate existing briefs")
    parser.add_argument("--output-dir", default="data/sector_briefs", help="Output directory")
    parser.add_argument("--model", default="gpt-4o", help="OpenAI model to use")
    args = parser.parse_args()

    # Paths are resolved relative to the package directory (parent of scripts/)
    package_dir = Path(__file__).parent.parent
    data_path = package_dir / "data" / "sectors.json"
    output_dir = package_dir / args.output_dir

    # Validate mode
    if args.validate:
        validate_existing_briefs(output_dir)
        return

    # Load sectors
    sectors = load_sectors(data_path)
    print(f"Loaded {len(sectors)} sectors")

    # Filter to single sector if specified
    if args.sector:
        sectors = [entry for entry in sectors if entry["sector_code"] == args.sector]
        if not sectors:
            print(f"ERROR: Sector '{args.sector}' not found")
            sys.exit(1)

    if args.dry_run:
        print("\n[DRY RUN] Would generate briefs for:")
        for sector in sectors:
            print(f" - {sector['sector_code']}: {sector['sector_name']}")
        print(f"\nOutput directory: {output_dir}")
        return

    # Check API key
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: OPENAI_API_KEY environment variable required")
        sys.exit(1)

    # Initialize client
    client = OpenAI(api_key=api_key)
    print(f"Using model: {args.model}")

    # Generate briefs, collecting the sector codes of each outcome
    succeeded = []
    failed = []
    total = len(sectors)

    for position, sector in enumerate(sectors, 1):
        print(f"\n[{position}/{total}] Generating brief for: {sector['sector_name']}")

        try:
            brief = generate_sector_brief(client, sector, args.model)

            # Validate
            issues = validate_brief(brief)
            if issues:
                print(" Warnings:")
                for issue in issues:
                    print(f" - {issue}")

            # Save
            output_path = save_brief(brief, output_dir)
            print(f" ✓ Saved to: {output_path}")
            succeeded.append(sector["sector_code"])

        except Exception as e:
            # Top-level boundary: report the failure and continue with the next sector.
            print(f" ✗ FAILED: {e}")
            failed.append(sector["sector_code"])

    # Summary
    rule = "=" * 60
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f"Success: {len(succeeded)}")
    print(f"Failed: {len(failed)}")

    if failed:
        print(f"\nFailed sectors: {', '.join(failed)}")
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
523
packages/reviewiq-pipeline/scripts/llm_classifier.py
Normal file
523
packages/reviewiq-pipeline/scripts/llm_classifier.py
Normal file
@@ -0,0 +1,523 @@
|
||||
"""
|
||||
LLM Classifier - Real classification using the OpenAI Chat Completions API.
|
||||
|
||||
Uses JSON Schema to enforce strict output format.
|
||||
Validates primitives against enabled set.
|
||||
Stores raw response for audit.
|
||||
Supports multilingual reviews with language detection.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Language detection - try langdetect, fall back to heuristics
|
||||
try:
|
||||
from langdetect import detect as langdetect_detect, LangDetectException
|
||||
LANGDETECT_AVAILABLE = True
|
||||
except ImportError:
|
||||
LANGDETECT_AVAILABLE = False
|
||||
LangDetectException = Exception # Placeholder
|
||||
|
||||
|
||||
def detect_language(text: str) -> tuple[str, float]:
    """
    Detect the language of a text.

    Returns (language_code, confidence). Text shorter than three characters
    after stripping yields ("unknown", 0.0).

    Uses langdetect when it is available; otherwise (or when langdetect
    raises) falls back to a heuristic: the share of characters per script
    decides ja / zh / ko / ru / ar, and for Latin script common-word and
    accent scoring picks one of es / en / de / fr, defaulting to "en".
    Confidence values are rough estimates, not probabilities.
    """
    if not text or len(text.strip()) < 3:
        return "unknown", 0.0

    text = text.strip()

    # Try langdetect first (most accurate)
    if LANGDETECT_AVAILABLE:
        try:
            lang = langdetect_detect(text)
        except LangDetectException:
            pass  # fall through to the heuristics below
        else:
            # Only a language code is returned here, so confidence is
            # estimated from the text length.
            return lang, min(0.95, 0.5 + len(text) / 200)

    # Fallback: simple heuristic detection based on character ranges.
    # Less accurate, but works without dependencies.
    total = len(text)  # always >= 3 because of the guard above

    def script_share(first: str, last: str) -> float:
        """Fraction of all characters whose code point lies in [first, last]."""
        return sum(1 for ch in text if first <= ch <= last) / total

    # Kana is tested before CJK ideographs: Japanese mixes kanji (which live
    # in the CJK Unified range) with kana, so testing ideographs first would
    # label kanji-heavy Japanese as Chinese.
    if script_share('\u3040', '\u30FF') > 0.2:  # Hiragana + Katakana
        return "ja", 0.6
    if script_share('\u4E00', '\u9FFF') > 0.3:  # CJK Unified
        return "zh", 0.6
    if script_share('\uAC00', '\uD7AF') > 0.3:  # Hangul
        return "ko", 0.6
    if script_share('\u0400', '\u04FF') > 0.3:  # Cyrillic
        return "ru", 0.5  # Russian (could be other Cyrillic)
    if script_share('\u0600', '\u06FF') > 0.3:  # Arabic
        return "ar", 0.5

    if script_share('\u0041', '\u024F') <= 0.5:  # Latin extended
        return "unknown", 0.1

    # Latin script - try to distinguish languages by common words
    text_lower = text.lower()

    def word_hits(words: list[str]) -> int:
        """Number of *words* that occur as whole words in the text."""
        return sum(1 for w in words if re.search(rf'\b{w}\b', text_lower))

    # Spanish indicators
    es_score = word_hits([
        'el', 'la', 'los', 'las', 'de', 'que', 'es', 'en', 'un', 'una',
        'muy', 'pero', 'con', 'está', 'están', 'para', 'por', 'como',
        'excelente', 'recomendado', 'servicio', 'bueno', 'malo', 'bien',
        'todo', 'nada', 'más', 'sin', 'nunca', 'siempre', 'también',
    ])
    # Spanish-specific patterns (accents, ñ, inverted punctuation)
    if 'ñ' in text_lower or '¿' in text or '¡' in text:
        es_score += 3
    if any(c in text_lower for c in 'áéíóúü'):
        es_score += 1

    # English indicators
    en_score = word_hits([
        'the', 'and', 'is', 'are', 'was', 'were', 'this', 'that',
        'with', 'for', 'but', 'not', 'very', 'great', 'good',
        'service', 'place', 'food', 'staff', 'friendly', 'amazing',
        'would', 'recommend', 'will', 'definitely', 'really',
    ])

    # German indicators
    de_score = word_hits([
        'der', 'die', 'das', 'und', 'ist', 'sind', 'war', 'sehr',
        'mit', 'für', 'aber', 'nicht', 'ein', 'eine', 'wir', 'ich',
        'auch', 'gut', 'schlecht', 'toll', 'super',
    ])
    # German umlauts
    if any(c in text_lower for c in 'äöüß'):
        de_score += 2

    # French indicators
    fr_score = word_hits([
        'le', 'la', 'les', 'est', 'sont', 'très', 'mais', 'avec',
        'pour', 'pas', 'un', 'une', 'et', 'nous', 'vous', 'bien',
        'bon', 'mauvais', 'excellent', 'super', "c'est", "j'ai",
    ])
    # French accents and patterns
    if any(c in text_lower for c in 'àâçèêëîïôùûÿœæ'):
        fr_score += 2

    # On a tie, max() keeps the first key in insertion order (es, en, de, fr).
    scores = {'es': es_score, 'en': en_score, 'de': de_score, 'fr': fr_score}
    best_lang = max(scores, key=scores.get)
    best_score = scores[best_lang]

    if best_score >= 1:
        confidence = min(0.75, 0.3 + best_score * 0.08)
        return best_lang, confidence

    # Default to English for Latin script
    return "en", 0.3
|
||||
|
||||
# Module-level cache for the lazily created OpenAI client
_client = None


def get_client() -> OpenAI:
    """Return the shared OpenAI client, creating it on first use.

    Raises:
        RuntimeError: If no client exists yet and ``OPENAI_API_KEY`` is unset
            or empty.
    """
    global _client
    if _client is not None:
        return _client

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable not set. "
            "Set it or use --dry-run / mock classifier."
        )

    _client = OpenAI(api_key=api_key)
    return _client
|
||||
|
||||
# Default model; can be overridden through the OPENAI_MODEL environment variable
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

# Meta primitives - always available, independent of the enabled set
META_PRIMITIVES = frozenset({
    "HONESTY",
    "ETHICS",
    "PROMISES",
    "ACKNOWLEDGMENT",
    "RESPONSE_QUALITY",
    "RECOVERY",
    "RETURN_INTENT",
    "RECOMMEND",
    "RECOGNITION",
    "UNMAPPED",
})
|
||||
|
||||
# JSON Schema for structured output

# One classified span. Every property is listed in "required"; optional data
# is expressed through nullable types instead.
_SPAN_ITEM_SCHEMA = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "primitive": {"type": "string"},
        "valence": {"type": "string", "enum": ["positive", "negative", "mixed", "neutral"]},
        "intensity": {"type": "integer", "minimum": 1, "maximum": 5},
        "evidence": {"type": "string"},
        "start_char": {"type": ["integer", "null"]},
        "end_char": {"type": ["integer", "null"]},
        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
        "details": {"type": "null"},
    },
    "required": [
        "primitive", "valence", "intensity", "evidence",
        "confidence", "start_char", "end_char", "details",
    ],
}

# One piece of content that did not fit any primitive.
_UNMAPPED_ITEM_SCHEMA = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "label": {"type": "string"},
        "evidence": {"type": "string"},
        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
    },
    "required": ["label", "evidence", "confidence"],
}

SPAN_SCHEMA = {
    "name": "review_classification",
    "strict": True,
    "schema": {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "spans": {"type": "array", "items": _SPAN_ITEM_SCHEMA},
            "unmapped": {"type": "array", "items": _UNMAPPED_ITEM_SCHEMA},
        },
        "required": ["spans", "unmapped"],
    },
}
|
||||
|
||||
# System prompt: fixed instructions sent as the "system" message of every
# classification request. The meta primitives listed in rule 2 mirror
# META_PRIMITIVES.
SYSTEM_PROMPT = """You are a review classification system that extracts semantic spans and maps them to primitives.

## RULES (MUST FOLLOW)

1. Use ONLY primitives from the enabled_primitives list provided. Do NOT invent new primitives.

2. Meta primitives are always available: HONESTY, ETHICS, PROMISES, ACKNOWLEDGMENT, RESPONSE_QUALITY, RECOVERY, RETURN_INTENT, RECOMMEND, RECOGNITION, UNMAPPED

3. If content doesn't fit any enabled primitive, use UNMAPPED or put it in the unmapped array with a descriptive label.

4. Output MUST match the JSON schema exactly. No extra keys.

5. Evidence must be a SHORT EXACT QUOTE from the review text (in original language).

6. Extract 1-5 spans per review. Prefer fewer, larger spans over many small ones.

7. If unsure about classification, lower the confidence score.

## VALENCE
- positive: praise, satisfaction, recommendation
- negative: complaint, dissatisfaction, warning
- mixed: both positive and negative in same span
- neutral: factual observation, no sentiment

## INTENSITY (1-5)
- 1: mild ("okay", "fine")
- 2: moderate ("good", "bad")
- 3: strong ("great", "terrible")
- 4: very strong ("amazing", "awful")
- 5: extreme ("best ever", "worst nightmare")

## CONFIDENCE
- 0.9+: Very certain the primitive fits
- 0.7-0.9: Confident
- 0.5-0.7: Moderate confidence
- <0.5: Low confidence (consider UNMAPPED)

Output valid JSON only. No markdown, no explanations."""
|
||||
|
||||
|
||||
def compute_review_hash(text: str, config_version: str) -> str:
    """Return a 16-hex-character cache key for a review under a config version.

    The key is the start of the SHA-256 digest of ``"<config_version>:<text>"``,
    so the same text hashes differently under a different config version.
    """
    digest = hashlib.sha256()
    digest.update(f"{config_version}:{text}".encode())
    return digest.hexdigest()[:16]
|
||||
|
||||
|
||||
def build_user_payload(
|
||||
review_text: str,
|
||||
rating: int | None,
|
||||
config: dict[str, Any],
|
||||
language: str = "auto",
|
||||
) -> dict[str, Any]:
|
||||
"""Build the user message payload for the LLM."""
|
||||
# Extract only what the model needs
|
||||
enabled = set(config.get("enabled_primitives", []))
|
||||
enabled.update(META_PRIMITIVES)
|
||||
|
||||
# Build primitive definitions (minimal)
|
||||
primitives_dict = config.get("primitives", {})
|
||||
primitive_defs = {}
|
||||
for prim in enabled:
|
||||
if prim in primitives_dict:
|
||||
info = primitives_dict[prim]
|
||||
primitive_defs[prim] = info.get("def", info.get("name", prim))
|
||||
elif prim in META_PRIMITIVES:
|
||||
primitive_defs[prim] = f"Meta primitive: {prim.replace('_', ' ').lower()}"
|
||||
|
||||
# Extract brief signals (keep it short)
|
||||
brief = config.get("brief", {})
|
||||
brief_summary = {}
|
||||
if brief.get("what_customers_judge"):
|
||||
items = brief["what_customers_judge"]
|
||||
if isinstance(items, dict):
|
||||
items = items.get("items", [])
|
||||
brief_summary["key_judgment_areas"] = [
|
||||
item.get("aspect", item.get("area", str(item))) if isinstance(item, dict) else str(item)
|
||||
for item in items[:5]
|
||||
]
|
||||
if brief.get("critical_pain_points"):
|
||||
pains = brief["critical_pain_points"]
|
||||
if isinstance(pains, dict):
|
||||
pains = pains.get("items", [])
|
||||
brief_summary["critical_pains"] = [
|
||||
item.get("pain", str(item)) if isinstance(item, dict) else str(item)
|
||||
for item in pains[:3]
|
||||
]
|
||||
|
||||
return {
|
||||
"business": {
|
||||
"name": config.get("business_id"),
|
||||
"sector": config.get("sector_code"),
|
||||
"config_version": config.get("config_version"),
|
||||
},
|
||||
"enabled_primitives": sorted(enabled),
|
||||
"primitive_definitions": primitive_defs,
|
||||
"weights": config.get("weights", {}),
|
||||
"sector_brief": brief_summary,
|
||||
"review": {
|
||||
"text": review_text,
|
||||
"rating": rating,
|
||||
"language": language,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def validate_response(
    response: dict[str, Any],
    enabled_primitives: set[str],
) -> tuple[dict[str, Any], list[str]]:
    """
    Validate LLM response and fix invalid primitives.

    A span whose primitive is neither enabled nor a meta primitive is
    re-labelled as UNMAPPED and a warning is recorded for it.

    The input is not modified: spans are shallow-copied before being
    re-labelled, so the caller's parsed response still shows what the model
    actually returned.

    Returns (validated_response, warnings), where validated_response has the
    keys "spans" and "unmapped".
    """
    warnings = []
    all_valid = enabled_primitives | META_PRIMITIVES

    validated_spans = []
    for span in response.get("spans", []):
        checked = dict(span)  # copy so the caller's span is left untouched
        prim = checked.get("primitive")
        if prim not in all_valid:
            warnings.append(f"Invalid primitive '{prim}' → UNMAPPED (original: {prim})")
            checked["primitive"] = "UNMAPPED"
        validated_spans.append(checked)

    return {
        "spans": validated_spans,
        "unmapped": response.get("unmapped", []),
    }, warnings
|
||||
|
||||
|
||||
def classify_review(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
    model: str | None = None,
    max_retries: int = 3,
) -> dict[str, Any]:
    """
    Classify a single review using OpenAI.

    The request is retried on rate-limit and 500/502/503 errors (detected by
    substring match on the exception text). JSON parse errors and all other
    errors are not retried. When no attempt succeeds, a low-confidence
    ``UNMAPPED`` fallback result is returned instead of raising.

    Args:
        review_text: The review text to classify
        rating: Star rating (1-5) if available
        config: Resolved config from ConfigResolver
        language: Language hint; "auto" runs detect_language() on the text
        model: Model to use (falls back to DEFAULT_MODEL)
        max_retries: Maximum number of API attempts

    Returns:
        {
            "spans": [...],
            "unmapped": [...],
            "model": str,
            "raw_response": str,
            "review_hash": str,
            "warnings": [...],
            "tokens": {"prompt": int, "completion": int},
            "detected_language": str,
            "language_confidence": float,
        }
    """
    model = model or DEFAULT_MODEL

    # Resolve the language: detect it, or trust the caller's explicit hint.
    if language == "auto":
        detected_lang, lang_confidence = detect_language(review_text)
    else:
        detected_lang, lang_confidence = language, 1.0  # User-specified

    # Build payload with detected language
    payload = build_user_payload(review_text, rating, config, detected_lang)
    user_content = json.dumps(payload, ensure_ascii=False, indent=None)

    # Compute hash for caching
    review_hash = compute_review_hash(review_text, config.get("config_version", "1.0"))

    # Call OpenAI with retries
    last_error: str | None = None
    client = get_client()
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_content},
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": SPAN_SCHEMA,
                },
                temperature=0.1,  # Low temperature for consistency
                max_tokens=2000,
            )

            # Parse response
            raw_text = response.choices[0].message.content
            parsed = json.loads(raw_text)

            # Validate primitives
            enabled = set(config.get("enabled_primitives", []))
            validated, warnings = validate_response(parsed, enabled)

            usage = response.usage
            return {
                "spans": validated["spans"],
                "unmapped": validated["unmapped"],
                "model": model,
                "raw_response": raw_text,
                "review_hash": review_hash,
                "warnings": warnings,
                "tokens": {
                    "prompt": usage.prompt_tokens if usage else 0,
                    "completion": usage.completion_tokens if usage else 0,
                },
                "detected_language": detected_lang,
                "language_confidence": lang_confidence,
            }

        except json.JSONDecodeError as e:
            # Don't retry parse errors - return the fallback
            last_error = f"JSON parse error: {e}"
            break

        except Exception as e:
            last_error = str(e)
            if "rate_limit" in last_error.lower() or "429" in last_error:
                delay = 2 ** attempt  # Exponential backoff for rate limits
            elif any(code in last_error for code in ("500", "502", "503")):
                delay = 1  # Short pause before retrying server errors
            else:
                # Don't retry other errors
                break

            # Sleeping after the last attempt would only delay the fallback.
            if attempt + 1 < max_retries:
                time.sleep(delay)

    if last_error is None:
        # The loop body never ran (max_retries < 1); say so explicitly.
        last_error = "no attempt made (max_retries < 1)"

    # Fallback response on error
    evidence = review_text[:100] if review_text else ""
    return {
        "spans": [{
            "primitive": "UNMAPPED",
            "valence": "neutral",
            "intensity": 1,
            "evidence": evidence,
            "start_char": 0,
            "end_char": len(evidence),
            "confidence": 0.1,
            "details": {"error": last_error},
        }],
        "unmapped": [],
        "model": model,
        "raw_response": json.dumps({"error": last_error}),
        "review_hash": review_hash,
        "warnings": [f"Classification failed: {last_error}"],
        "tokens": {"prompt": 0, "completion": 0},
        "detected_language": detected_lang,
        "language_confidence": lang_confidence,
    }
|
||||
|
||||
|
||||
async def classify_review_async(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
    model: str | None = None,
) -> dict[str, Any]:
    """
    Async wrapper for classify_review.

    Runs the blocking ``classify_review`` call in the running loop's default
    executor and returns its result. ``max_retries`` is left at the
    ``classify_review`` default.
    """
    import asyncio

    # Inside a coroutine there is always a running loop; get_running_loop()
    # is the documented way to obtain it.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        lambda: classify_review(review_text, rating, config, language, model),
    )
|
||||
|
||||
|
||||
# Batch classification (for later optimization)
|
||||
async def classify_batch(
    reviews: list[dict[str, Any]],
    config: dict[str, Any],
    model: str | None = None,
    max_concurrent: int = 5,
) -> list[dict[str, Any]]:
    """
    Classify several reviews concurrently, capped by a semaphore.

    Args:
        reviews: Items shaped like {"text": str, "rating": int, "language": str};
            missing keys default to "", None and "auto" respectively
        config: Resolved config shared by every review
        model: Model to use
        max_concurrent: Upper bound on classifications in flight at once

    Returns:
        Classification results, in the same order as ``reviews``
    """
    import asyncio

    gate = asyncio.Semaphore(max_concurrent)

    async def _bounded(entry: dict) -> dict:
        # Hold a semaphore slot for the duration of one classification.
        async with gate:
            text = entry.get("text", "")
            stars = entry.get("rating")
            lang = entry.get("language", "auto")
            return await classify_review_async(text, stars, config, lang, model)

    return await asyncio.gather(*(_bounded(entry) for entry in reviews))
|
||||
1102
packages/reviewiq-pipeline/scripts/run_classification_v2.py
Normal file
1102
packages/reviewiq-pipeline/scripts/run_classification_v2.py
Normal file
File diff suppressed because it is too large
Load Diff
457
packages/reviewiq-pipeline/scripts/validate_l1_configs.py
Normal file
457
packages/reviewiq-pipeline/scripts/validate_l1_configs.py
Normal file
@@ -0,0 +1,457 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Wave 1 L1 Config Validation Script
|
||||
|
||||
Validates L1 primitive configs against real review data by analyzing:
|
||||
1. Coverage: % of spans mapped to enabled primitives
|
||||
2. Top primitives by frequency
|
||||
3. Disabled primitives appearing (potential misconfig)
|
||||
4. Weight effectiveness
|
||||
|
||||
Usage:
|
||||
python validate_l1_configs.py --sector ENTERTAINMENT --job-url "gokarts"
|
||||
python validate_l1_configs.py --sector AUTOMOTIVE --job-url "clickrent"
|
||||
python validate_l1_configs.py --all
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
|
||||
# Paths: data lives in a "data" directory next to this script's parent folder.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
CONFIGS_DIR = DATA_DIR.joinpath("primitive_configs", "l1")  # per-sector L1 configs
BRIEFS_DIR = DATA_DIR.joinpath("sector_briefs")  # per-sector briefs
|
||||
|
||||
# Primitive -> URT domain letter.
# URT domains: O=Offering, P=People, J=Journey, E=Environment, A=Access,
# V=Value, R=Relationship. No primitive in this table maps to A or R.
PRIMITIVE_TO_DOMAIN = {
    # Quality -> Offering (O)
    **dict.fromkeys(
        (
            "TASTE", "CRAFT", "FRESHNESS", "TEMPERATURE",
            "EFFECTIVENESS", "ACCURACY", "CONDITION", "CONSISTENCY",
        ),
        "O",
    ),
    # Service -> People (P)
    **dict.fromkeys(
        ("MANNER", "COMPETENCE", "ATTENTIVENESS", "COMMUNICATION"),
        "P",
    ),
    # Process -> Journey (J)
    **dict.fromkeys(
        ("SPEED", "FRICTION", "RELIABILITY", "AVAILABILITY"),
        "J",
    ),
    # Environment -> Environment (E)
    **dict.fromkeys(
        (
            "CLEANLINESS", "COMFORT", "SAFETY", "AMBIANCE",
            "ACCESSIBILITY", "DIGITAL_UX",
        ),
        "E",
    ),
    # Value -> Value (V)
    **dict.fromkeys(
        ("PRICE_LEVEL", "PRICE_FAIRNESS", "PRICE_TRANSPARENCY", "VALUE_FOR_MONEY"),
        "V",
    ),
}
|
||||
|
||||
# URT code to primitive mapping (simplified - maps URT codes to closest primitive).
# Each row lists the primitives for codes "<group>.01", "<group>.02", ... in order.
URT_TO_PRIMITIVE = {
    f"{group}.{position:02d}": primitive
    for group, primitives in (
        # Offering codes
        ("O1", ("CONSISTENCY", "CRAFT", "FRESHNESS", "EFFECTIVENESS", "TASTE", "CONDITION")),
        ("O2", ("ACCURACY", "EFFECTIVENESS", "CRAFT")),
        ("O3", ("ACCURACY", "CONSISTENCY", "EFFECTIVENESS")),
        # People codes
        ("P1", ("MANNER", "MANNER", "ATTENTIVENESS", "COMMUNICATION", "ATTENTIVENESS")),
        ("P2", ("COMPETENCE", "COMPETENCE", "COMPETENCE")),
        ("P3", ("COMMUNICATION", "COMMUNICATION", "COMMUNICATION")),
        # Journey codes
        ("J1", ("SPEED", "RELIABILITY", "FRICTION", "SPEED", "RELIABILITY")),
        ("J2", ("RELIABILITY", "RELIABILITY", "FRICTION")),
        ("J3", ("FRICTION", "FRICTION", "FRICTION")),
        # Environment codes
        ("E1", ("CLEANLINESS", "COMFORT", "AMBIANCE", "AMBIANCE", "COMFORT")),
        ("E2", ("AMBIANCE", "COMFORT", "COMFORT", "AMBIANCE", "DIGITAL_UX")),
        ("E3", ("SAFETY", "SAFETY", "ACCESSIBILITY")),
        ("E4", ("ACCESSIBILITY", "ACCESSIBILITY", "DIGITAL_UX")),
        # Access codes
        ("A1", ("AVAILABILITY", "AVAILABILITY", "AVAILABILITY", "ACCESSIBILITY", "ACCESSIBILITY")),
        ("A2", ("ACCESSIBILITY", "ACCESSIBILITY", "DIGITAL_UX")),
        ("A3", ("ACCESSIBILITY", "ACCESSIBILITY", "SPEED")),
        ("A4", ("ACCESSIBILITY", "ACCESSIBILITY", "AVAILABILITY")),
        # Value codes
        ("V1", ("PRICE_LEVEL", "PRICE_FAIRNESS", "PRICE_TRANSPARENCY")),
        ("V2", ("PRICE_FAIRNESS", "PRICE_TRANSPARENCY", "VALUE_FOR_MONEY")),
        ("V3", ("VALUE_FOR_MONEY", "VALUE_FOR_MONEY", "PRICE_FAIRNESS")),
        ("V4", ("VALUE_FOR_MONEY", "VALUE_FOR_MONEY", "VALUE_FOR_MONEY")),
        # Relationship codes
        ("R1", ("RELIABILITY", "RELIABILITY", "RELIABILITY")),
        ("R2", ("RELIABILITY", "CONSISTENCY", "RELIABILITY")),
        ("R3", ("MANNER", "MANNER", "COMMUNICATION")),
        ("R4", ("CONSISTENCY", "RELIABILITY", "CONSISTENCY")),
    )
    for position, primitive in enumerate(primitives, start=1)
}
|
||||
|
||||
|
||||
@dataclass
class ValidationResult:
    """
    Outcome of checking one sector's L1 config against a set of spans.

    Attributes:
        sector_code: Sector the config belongs to.
        job_count: Number of jobs covered by the analysis.
        review_count: Number of reviews covered by the analysis.
        span_count: Number of spans analysed.
        enabled_coverage: Share of spans mapped to an enabled primitive;
            reports format it as a percentage.
        disabled_hits: Count per disabled primitive that still appeared.
        unmapped_count: Spans whose URT code has no primitive mapping.
        primitive_counts: Count per primitive, enabled or not.
        domain_distribution: Count per domain letter (O, P, J, E, A, V, R).
        valence_distribution: Count per valence label (V+, V-, V0, V±).
        top_urt_codes: (URT code, count) pairs, most frequent first.
        recommendations: Human-readable suggestions for the config.
    """

    # Identity and volume
    sector_code: str
    job_count: int
    review_count: int
    span_count: int

    # Coverage
    enabled_coverage: float
    disabled_hits: dict[str, int]
    unmapped_count: int

    # Distributions
    primitive_counts: dict[str, int]
    domain_distribution: dict[str, int]
    valence_distribution: dict[str, int]

    # Most frequent raw URT codes
    top_urt_codes: list[tuple[str, int]]

    # Suggested config changes
    recommendations: list[str]
|
||||
|
||||
|
||||
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """
    Load the L1 config for a sector.

    Reads ``<CONFIGS_DIR>/<sector_code lowercased>_config.json``.

    Args:
        sector_code: Sector code, e.g. "ENTERTAINMENT" (case-insensitive).

    Returns:
        The parsed JSON document, or None when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    try:
        # Explicit UTF-8: the locale default would make parsing depend on the
        # machine the script runs on.
        with open(config_file, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
||||
|
||||
|
||||
def load_sector_brief(sector_code: str) -> dict[str, Any] | None:
    """
    Load the sector brief for a sector.

    Reads ``<BRIEFS_DIR>/<sector_code lowercased>_brief.json``.

    Args:
        sector_code: Sector code, e.g. "ENTERTAINMENT" (case-insensitive).

    Returns:
        The parsed JSON document, or None when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    brief_file = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
    try:
        # Explicit UTF-8: the locale default would make parsing depend on the
        # machine the script runs on.
        with open(brief_file, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
||||
|
||||
|
||||
def map_urt_to_primitive(urt_code: str) -> str | None:
    """Return the primitive registered for ``urt_code`` in URT_TO_PRIMITIVE, or None."""
    try:
        return URT_TO_PRIMITIVE[urt_code]
    except KeyError:
        return None
|
||||
|
||||
|
||||
async def fetch_spans_for_jobs(pool: asyncpg.Pool, job_url_pattern: str) -> list[dict]:
    """
    Fetch spans whose job URL contains ``job_url_pattern`` (case-insensitive).

    Spans are joined to jobs through pipeline.reviews_raw and returned newest
    first, one plain dict per row.
    """
    # Substring match: lowercase the pattern and wrap it in SQL wildcards.
    like_pattern = "%" + job_url_pattern.lower() + "%"
    sql = """
        SELECT
            rs.urt_primary,
            rs.valence,
            rs.intensity,
            rs.span_text,
            j.url
        FROM pipeline.review_spans rs
        JOIN pipeline.reviews_raw rr ON rs.review_id = rr.review_id
        JOIN public.jobs j ON rr.job_id = j.job_id
        WHERE LOWER(j.url) LIKE $1
        ORDER BY rs.created_at DESC
    """
    records = await pool.fetch(sql, like_pattern)
    return list(map(dict, records))
|
||||
|
||||
|
||||
async def fetch_all_spans(pool: asyncpg.Pool) -> list[dict]:
    """
    Fetch every row of pipeline.review_spans, newest first.

    Only urt_primary, valence, intensity and span_text are selected; each row
    is returned as a plain dict.
    """
    sql = """
        SELECT
            urt_primary,
            valence,
            intensity,
            span_text
        FROM pipeline.review_spans
        ORDER BY created_at DESC
    """
    records = await pool.fetch(sql)
    return list(map(dict, records))
|
||||
|
||||
|
||||
def analyze_spans(
    spans: list[dict],
    config: dict[str, Any],
) -> ValidationResult:
    """
    Analyze spans against an L1 config.

    Each span's ``urt_primary`` code is mapped to a primitive. The function
    counts primitives, domains, valences and raw URT codes, measures how many
    spans land on enabled primitives, and derives recommendations:

    * ENABLE: a disabled primitive appeared at least 10 times.
    * CHECK: a weighted, enabled primitive never appeared.
    * WEIGHT: an enabled, unweighted primitive among the 10 most frequent
      covers at least 10% of the spans.

    Args:
        spans: Rows with at least ``urt_primary``; ``valence`` is optional.
        config: L1 config; ``sector_code`` is required, while ``enabled``,
            ``disabled`` and ``weights`` are optional.

    Returns:
        A ValidationResult. ``job_count`` and ``review_count`` are
        placeholders (1 and 0) because they are not tracked at span level.
    """
    sector_code = config["sector_code"]
    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        urt_code = span["urt_primary"]
        # A NULL column arrives as None with the key present, so a .get()
        # default would not apply. Treating it as neutral "V0" presumably
        # matches the original intent of that default.
        valence = span.get("valence") or "V0"

        urt_counts[urt_code] += 1
        valence_counts[valence] += 1

        primitive = map_urt_to_primitive(urt_code)
        if primitive:
            primitive_counts[primitive] += 1

            # Domain comes from the primitive, falling back to the code's letter.
            domain = PRIMITIVE_TO_DOMAIN.get(primitive, urt_code[0])
            domain_counts[domain] += 1

            if primitive in enabled:
                enabled_hits += 1
            elif primitive in disabled:
                disabled_hits[primitive] += 1
        else:
            unmapped += 1
            # Still count the domain from the URT code; a NULL or empty code
            # has no letter to count.
            if urt_code:
                domain_counts[urt_code[0]] += 1

    # Calculate coverage
    total = len(spans)
    enabled_coverage = enabled_hits / total if total > 0 else 0

    # Generate recommendations
    recommendations = []

    # Disabled primitives that appeared frequently
    for prim, count in disabled_hits.most_common(5):
        if count >= 10:
            pct = count / total * 100
            recommendations.append(
                f"ENABLE {prim}: Disabled but appeared {count} times ({pct:.1f}%)"
            )

    # Weighted primitives that never appeared. Walking the dict (not a set)
    # keeps the recommendation order stable between runs.
    for prim, weight in weights.items():
        if primitive_counts[prim] == 0 and prim in enabled:
            recommendations.append(
                f"CHECK {prim}: Weighted ({weight}x) but no appearances"
            )

    # Frequently appearing unweighted primitives
    for prim, count in primitive_counts.most_common(10):
        if prim in enabled and prim not in weights and count >= total * 0.1:
            pct = count / total * 100
            recommendations.append(
                f"WEIGHT {prim}: High frequency ({count}, {pct:.1f}%) but not weighted"
            )

    return ValidationResult(
        sector_code=sector_code,
        job_count=1,  # Will be updated by caller
        review_count=0,  # Not tracked at span level
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommendations=recommendations,
    )
|
||||
|
||||
|
||||
def print_validation_report(result: ValidationResult, config: dict, brief: dict | None):
    """
    Print a formatted validation report to stdout.

    Sections: overview, config summary, domain and valence distributions,
    top 12 primitives, top 10 URT codes, disabled-but-appearing primitives,
    recommendations and, when a brief is supplied, its first five judgment
    areas.

    Args:
        result: Analysis output to render.
        config: L1 config; ``enabled``, ``disabled`` and ``weights`` are read.
        brief: Optional sector brief; ``what_customers_judge`` is read.
    """
    span_total = result.span_count

    def pct_of(count: int) -> float:
        """Share of all spans as a percentage; 0 when there are no spans."""
        return count / span_total * 100 if span_total > 0 else 0

    print("\n" + "=" * 70)
    print(f"VALIDATION REPORT: {result.sector_code}")
    print("=" * 70)

    # Overview
    print("\n📊 OVERVIEW")
    print(f" Spans analyzed: {span_total:,}")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    if span_total > 0:
        print(f" Unmapped spans: {result.unmapped_count} ({pct_of(result.unmapped_count):.1f}%)")
    else:
        print(" No spans")

    # Config summary
    print("\n⚙️ CONFIG SUMMARY")
    print(f" Enabled: {len(config.get('enabled', []))} primitives")
    print(f" Disabled: {len(config.get('disabled', []))} primitives")
    print(f" Weighted: {len(config.get('weights', {}))} primitives")

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = pct_of(count)
        bar = "█" * int(pct / 2)  # one block per 2 percentage points
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Valence distribution
    print("\n😊 VALENCE DISTRIBUTION")
    for val in ["V+", "V-", "V0", "V±"]:
        count = result.valence_distribution.get(val, 0)
        print(f" {val}: {count:4} ({pct_of(count):5.1f}%)")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES")
    enabled_set = set(config.get("enabled", []))
    weights = config.get("weights", {})
    for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]:
        status = "✓" if prim in enabled_set else "✗"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({pct_of(count):5.1f}%) {weight}")

    # Top URT codes
    print("\n📋 TOP URT CODES")
    for code, count in result.top_urt_codes[:10]:
        mapped = URT_TO_PRIMITIVE.get(code, "UNMAPPED")
        print(f" {code}: {count:4} ({pct_of(count):5.1f}%) → {mapped}")

    # Disabled but appearing
    if result.disabled_hits:
        print("\n⚠️ DISABLED BUT APPEARING")
        for prim, count in sorted(result.disabled_hits.items(), key=lambda x: -x[1]):
            print(f" {prim}: {count} ({pct_of(count):.1f}%)")

    # Recommendations
    if result.recommendations:
        print("\n💡 RECOMMENDATIONS")
        for rec in result.recommendations:
            print(f" • {rec}")

    # Brief signals check (if available)
    if brief:
        print("\n📝 BRIEF SIGNALS CHECK")
        what_customers_judge = brief.get("what_customers_judge", {})
        if isinstance(what_customers_judge, dict):
            items = what_customers_judge.get("items", [])
        else:
            items = what_customers_judge if isinstance(what_customers_judge, list) else []

        print(" Key judgment areas from brief:")
        for item in items[:5]:
            if isinstance(item, dict):
                # Briefs label an entry with either "aspect" or "area"; accept
                # both so an "aspect" entry is not printed as a raw dict.
                print(f" • {item.get('aspect', item.get('area', item))}")
            else:
                print(f" • {item}")

    print("\n" + "=" * 70)
|
||||
|
||||
|
||||
async def run_validation(
    sector_code: str,
    job_url_pattern: str | None = None,
    db_url: str | None = None,
):
    """
    Validate one sector's L1 config against spans from the database.

    Args:
        sector_code: Sector whose config (and optional brief) is loaded.
        job_url_pattern: When given, only spans of jobs whose URL contains it
            are analysed; otherwise every span in the database is used.
        db_url: Database URL; falls back to the DATABASE_URL environment
            variable, then to a built-in local default.

    Returns:
        The ValidationResult, or None when the sector has no config or the
        URL pattern matched no spans.
    """
    config = load_l1_config(sector_code)
    if not config:
        print(f"❌ No L1 config found for {sector_code}")
        return None

    brief = load_sector_brief(sector_code)

    if not db_url:
        # NOTE(review): the built-in default embeds credentials.
        db_url = os.environ.get(
            "DATABASE_URL",
            "postgresql://scraper:scraper123@localhost:5437/scraper",
        )

    pool = await asyncpg.create_pool(db_url)
    try:
        if not job_url_pattern:
            spans = await fetch_all_spans(pool)
        else:
            spans = await fetch_spans_for_jobs(pool, job_url_pattern)
            if not spans:
                print(f"⚠️ No spans found for jobs matching '{job_url_pattern}'")
                return None

        result = analyze_spans(spans, config)
        print_validation_report(result, config, brief)
        return result
    finally:
        # Release the pool on every exit path, including the early return.
        await pool.close()
|
||||
|
||||
|
||||
async def run_all_validations(db_url: str | None = None):
    """
    Run validation for every known sector/job pair and print a summary.

    The sector -> job URL pattern table is hard-coded below. Each pair is
    validated with run_validation(); pairs that yield no result are left out
    of the summary.
    """
    # Known jobs and their sectors
    jobs_by_sector = {
        "ENTERTAINMENT": ["gokarts", "soho"],
        "AUTOMOTIVE": ["clickrent"],
        "PERSONAL_SERVICES": ["fleitas"],
        "FOOD_DINING": ["fika"],
    }
    rule = "=" * 70

    # Keyed by (sector, pattern), in the order the pairs were validated.
    outcomes = {}
    for sector, job_patterns in jobs_by_sector.items():
        print("\n" + rule)
        print(f"Validating {sector}...")
        print(rule)

        for pattern in job_patterns:
            outcome = await run_validation(sector, pattern, db_url)
            if outcome:
                outcomes[(sector, pattern)] = outcome

    # Summary
    print("\n" + rule)
    print("VALIDATION SUMMARY")
    print(rule)

    for (sector, pattern), outcome in outcomes.items():
        print(f"\n{sector} ({pattern}):")
        print(f" Coverage: {outcome.enabled_coverage:.1%}")
        print(f" Spans: {outcome.span_count}")
        if outcome.disabled_hits:
            print(f" ⚠️ Disabled hits: {sum(outcome.disabled_hits.values())}")
        if outcome.recommendations:
            print(f" Recommendations: {len(outcome.recommendations)}")
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point.

    ``--all`` validates every known sector; ``--sector`` validates one sector,
    optionally narrowed by ``--job-url``. With neither flag the help text is
    printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Validate L1 primitive configs")
    parser.add_argument("--sector", help="Sector code (e.g., ENTERTAINMENT)")
    parser.add_argument("--job-url", help="Job URL pattern to filter (e.g., 'gokarts')")
    parser.add_argument("--all", action="store_true", help="Run all validations")
    parser.add_argument("--db-url", help="Database URL")
    args = parser.parse_args()

    # Nothing to do without --all or --sector.
    if not (args.all or args.sector):
        parser.print_help()
        sys.exit(1)

    # --all takes precedence over --sector when both are given.
    if args.all:
        job = run_all_validations(args.db_url)
    else:
        job = run_validation(args.sector, args.job_url, args.db_url)
    asyncio.run(job)


if __name__ == "__main__":
    main()
|
||||
421
packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
Normal file
421
packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
Normal file
@@ -0,0 +1,421 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)
|
||||
|
||||
Validates L1 primitive configs against SECTOR-SPECIFIC review data.
|
||||
Only validates sectors where we have real business data.
|
||||
|
||||
Key improvement over v1: spans are filtered by business → sector mapping,
|
||||
ensuring "TASTE in HEALTHCARE" noise doesn't pollute results.
|
||||
|
||||
Usage:
|
||||
python validate_l1_configs_v2.py --sector ENTERTAINMENT
|
||||
python validate_l1_configs_v2.py --sector AUTOMOTIVE
|
||||
python validate_l1_configs_v2.py --all
|
||||
python validate_l1_configs_v2.py --report # Summary only
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
|
||||
# Paths: data lives in a "data" directory next to this script's parent folder.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
CONFIGS_DIR = DATA_DIR.joinpath("primitive_configs", "l1")  # per-sector L1 configs
BRIEFS_DIR = DATA_DIR.joinpath("sector_briefs")  # per-sector briefs
|
||||
|
||||
# Business → Sector mapping (ground truth), keyed by the exact business name.
BUSINESS_TO_SECTOR = dict(
    [
        ("Go Karts Mar Menor", "ENTERTAINMENT"),
        ("ClickRent Gran Canaria | Alquiler de Coches y Furgonetas", "AUTOMOTIVE"),
        ("Soho Club", "ENTERTAINMENT"),
        ("Fika", "FOOD_DINING"),
    ]
)

# Sectors with real data: exactly the sectors named in the mapping above.
SECTORS_WITH_DATA = set(BUSINESS_TO_SECTOR.values())
|
||||
|
||||
# URT code to primitive mapping.
# Each row lists the primitives for codes "<group>.01", "<group>.02", ... in order.
URT_TO_PRIMITIVE = {
    f"{group}.{position:02d}": primitive
    for group, primitives in (
        # Offering codes
        ("O1", ("CONSISTENCY", "CRAFT", "FRESHNESS", "EFFECTIVENESS", "TASTE", "CONDITION")),
        ("O2", ("ACCURACY", "EFFECTIVENESS", "CRAFT")),
        ("O3", ("ACCURACY", "CONSISTENCY", "EFFECTIVENESS")),
        # People codes
        ("P1", ("MANNER", "MANNER", "ATTENTIVENESS", "COMMUNICATION", "ATTENTIVENESS")),
        ("P2", ("COMPETENCE", "COMPETENCE", "COMPETENCE")),
        ("P3", ("COMMUNICATION", "COMMUNICATION", "COMMUNICATION")),
        # Journey codes
        ("J1", ("SPEED", "RELIABILITY", "FRICTION", "SPEED", "RELIABILITY")),
        ("J2", ("RELIABILITY", "RELIABILITY", "FRICTION")),
        ("J3", ("FRICTION", "FRICTION", "FRICTION")),
        # Environment codes
        ("E1", ("CLEANLINESS", "COMFORT", "AMBIANCE", "AMBIANCE", "COMFORT")),
        ("E2", ("AMBIANCE", "COMFORT", "COMFORT", "AMBIANCE", "DIGITAL_UX")),
        ("E3", ("SAFETY", "SAFETY", "ACCESSIBILITY")),
        ("E4", ("ACCESSIBILITY", "ACCESSIBILITY", "DIGITAL_UX")),
        # Access codes
        ("A1", ("AVAILABILITY", "AVAILABILITY", "AVAILABILITY", "ACCESSIBILITY", "ACCESSIBILITY")),
        ("A2", ("ACCESSIBILITY", "ACCESSIBILITY", "DIGITAL_UX")),
        ("A3", ("ACCESSIBILITY", "ACCESSIBILITY", "SPEED")),
        ("A4", ("ACCESSIBILITY", "ACCESSIBILITY", "AVAILABILITY")),
        # Value codes
        ("V1", ("PRICE_LEVEL", "PRICE_FAIRNESS", "PRICE_TRANSPARENCY")),
        ("V2", ("PRICE_FAIRNESS", "PRICE_TRANSPARENCY", "VALUE_FOR_MONEY")),
        ("V3", ("VALUE_FOR_MONEY", "VALUE_FOR_MONEY", "PRICE_FAIRNESS")),
        ("V4", ("VALUE_FOR_MONEY", "VALUE_FOR_MONEY", "VALUE_FOR_MONEY")),
        # Relationship codes (map to meta - these should stay unmapped)
        ("R1", (None, None, None)),
        ("R2", (None, None, None)),
        ("R3", (None, None, None)),
        ("R4", (None, None, None)),
    )
    for position, primitive in enumerate(primitives, start=1)
}

# Minimum threshold for "enable" recommendations (% of sector spans)
ENABLE_THRESHOLD_PCT = 3.0  # Only recommend enable if >= 3% of sector spans
|
||||
|
||||
|
||||
@dataclass
class SectorValidation:
    """
    Validation result for a single sector.

    Attributes:
        sector_code: Sector the config belongs to.
        businesses: Business names whose spans were analysed.
        span_count: Number of spans analysed.
        enabled_coverage: Share of spans mapped to an enabled primitive;
            reports format it as a percentage.
        disabled_hits: Count per disabled primitive that still appeared.
        unmapped_count: Spans whose URT code has no primitive.
        primitive_counts: Count per primitive.
        domain_distribution: Count per URT domain letter.
        valence_distribution: Count per valence label.
        top_urt_codes: (URT code, count) pairs, most frequent first.
        recommended_enables: (primitive, pct) pairs above the enable threshold.
        recommended_disables: (primitive, pct) pairs suggested for disabling.
        weight_issues: Human-readable notes about weights.
        validated_at: Timestamp string of the validation run.
        config_version: Version string of the validated config.
    """

    # Required identity and volume
    sector_code: str
    businesses: list[str]
    span_count: int

    # Coverage
    enabled_coverage: float
    disabled_hits: dict[str, int] = field(default_factory=dict)
    unmapped_count: int = 0

    # Distributions
    primitive_counts: dict[str, int] = field(default_factory=dict)
    domain_distribution: dict[str, int] = field(default_factory=dict)
    valence_distribution: dict[str, int] = field(default_factory=dict)
    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)

    # Recommendations (threshold-gated)
    recommended_enables: list[tuple[str, float]] = field(default_factory=list)
    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
    weight_issues: list[str] = field(default_factory=list)

    # Metadata
    validated_at: str = ""
    config_version: str = ""
|
||||
|
||||
|
||||
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """
    Load the L1 config for a sector.

    Reads ``<CONFIGS_DIR>/<sector_code lowercased>_config.json``.

    Args:
        sector_code: Sector code, e.g. "ENTERTAINMENT" (case-insensitive).

    Returns:
        The parsed JSON document, or None when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    try:
        # Explicit UTF-8: the locale default would make parsing depend on the
        # machine the script runs on.
        with open(config_file, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
||||
|
||||
|
||||
def get_businesses_for_sector(sector_code: str) -> list[str]:
    """Return the businesses that BUSINESS_TO_SECTOR assigns to ``sector_code``, in mapping order."""
    matches: list[str] = []
    for business, assigned_sector in BUSINESS_TO_SECTOR.items():
        if assigned_sector == sector_code:
            matches.append(business)
    return matches
|
||||
|
||||
|
||||
async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
    """
    Fetch spans belonging to the given businesses only, newest first.

    An empty ``businesses`` list returns an empty list without querying the
    database. Rows are matched on ``business_id`` and returned as plain dicts.
    """
    if len(businesses) == 0:
        return []

    sql = """
        SELECT
            business_id,
            urt_primary,
            valence,
            intensity,
            span_text
        FROM pipeline.review_spans
        WHERE business_id = ANY($1)
        ORDER BY created_at DESC
    """
    records = await pool.fetch(sql, businesses)
    return list(map(dict, records))
|
||||
|
||||
|
||||
def analyze_sector_spans(
    spans: list[dict],
    config: dict[str, Any],
    businesses: list[str],
) -> SectorValidation:
    """
    Analyze spans for a specific sector against its L1 config.

    Counts URT codes, valences, domains (first letter of the URT code) and
    mapped primitives, then derives:

    * ``recommended_enables``: disabled primitives covering at least
      ENABLE_THRESHOLD_PCT percent of the spans.
    * ``weight_issues``: weighted, enabled primitives with no appearances, and
      enabled, unweighted primitives among the top 5 that cover >= 10%.

    Args:
        spans: Rows with at least ``urt_primary``; ``valence`` is optional.
        config: L1 config; ``sector_code`` is required, while ``enabled``,
            ``disabled``, ``weights`` and ``config_version`` are optional.
        businesses: Business names the spans were fetched for (recorded only).

    Returns:
        A SectorValidation stamped with the current UTC time.
    """
    # Local import: only needed to build the timestamp without utcnow().
    from datetime import timezone

    sector_code = config["sector_code"]
    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})
    config_version = config.get("config_version", "1.0")

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        urt_code = span["urt_primary"]
        # A NULL column arrives as None with the key present, so a .get()
        # default would not apply. Treating it as neutral "V0" presumably
        # matches the original intent of that default.
        valence = span.get("valence") or "V0"

        urt_counts[urt_code] += 1
        valence_counts[valence] += 1
        # A NULL or empty code has no domain letter to count.
        if urt_code:
            domain_counts[urt_code[0]] += 1

        primitive = URT_TO_PRIMITIVE.get(urt_code)
        if primitive:
            primitive_counts[primitive] += 1
            if primitive in enabled:
                enabled_hits += 1
            elif primitive in disabled:
                disabled_hits[primitive] += 1
        else:
            # Unknown codes and codes deliberately mapped to None end up here.
            unmapped += 1

    total = len(spans)
    enabled_coverage = enabled_hits / total if total > 0 else 0

    def pct_of(count: int) -> float:
        """Share of the sector's spans as a percentage; 0 when there are none."""
        return count / total * 100 if total > 0 else 0

    # Threshold-gated recommendations
    recommended_enables = [
        (prim, pct_of(count))
        for prim, count in disabled_hits.most_common()
        if pct_of(count) >= ENABLE_THRESHOLD_PCT
    ]

    # Weight issues: weighted but never seen
    weight_issues = [
        f"{prim} weighted ({weight}x) but 0 appearances"
        for prim, weight in weights.items()
        if primitive_counts[prim] == 0 and prim in enabled
    ]

    # High-frequency unweighted
    for prim, count in primitive_counts.most_common(5):
        pct = pct_of(count)
        if prim in enabled and prim not in weights and pct >= 10:
            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")

    # Same naive-UTC ISO string that utcnow().isoformat() produced, without
    # calling the deprecated utcnow().
    validated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return SectorValidation(
        sector_code=sector_code,
        businesses=businesses,
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommended_enables=recommended_enables,
        weight_issues=weight_issues,
        validated_at=validated_at,
        config_version=config_version,
    )
|
||||
|
||||
|
||||
def print_sector_report(result: SectorValidation, config: dict):
    """Print a detailed, human-readable validation report for one sector.

    Args:
        result: Validation outcome for the sector. The report reads its
            ``sector_code``, ``businesses``, ``span_count``,
            ``config_version``, ``enabled_coverage``, ``unmapped_count``,
            ``domain_distribution``, ``primitive_counts``,
            ``recommended_enables``, ``disabled_hits``, ``weight_issues``
            and ``validated_at`` fields.
        config: Sector config dict; the ``"enabled"``, ``"disabled"`` and
            ``"weights"`` keys are consulted (each optional).

    Returns:
        None. Everything is written to stdout via ``print``.
    """
    total = result.span_count

    def pct_of(count):
        # Single guarded percentage helper: a zero-span result must never
        # raise ZeroDivisionError, in any section of the report.
        return count / total * 100 if total > 0 else 0

    print("\n" + "=" * 70)
    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
    print("=" * 70)

    print("\n📊 DATA SOURCE")
    print(f" Businesses: {', '.join(result.businesses)}")
    print(f" Total spans: {total:,}")
    print(f" Config version: {result.config_version}")

    print("\n📈 COVERAGE")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    # With zero spans an empty line is printed instead of the unmapped ratio
    # (kept from the original behavior).
    print(
        f" Unmapped (R-domain): {result.unmapped_count} ({pct_of(result.unmapped_count):.1f}%)"
        if total > 0
        else ""
    )

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {
        "O": "Offering",
        "P": "People",
        "J": "Journey",
        "E": "Environment",
        "A": "Access",
        "V": "Value",
        "R": "Relationship",
    }
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = pct_of(count)
        # One bar character per 2 percentage points.
        bar = "█" * int(pct / 2)
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES (sector-scoped)")
    enabled_set = set(config.get("enabled", []))
    disabled_set = set(config.get("disabled", []))
    weights = config.get("weights", {})

    top_primitives = sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]
    for prim, count in top_primitives:
        pct = pct_of(count)
        if prim in enabled_set:
            status = "✓"
        elif prim in disabled_set:
            status = "✗"
        else:
            # Present in the data but listed in neither config set.
            status = "?"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({pct:5.1f}%) {weight}")

    # Threshold-gated recommendations
    if result.recommended_enables:
        print(f"\n⚠️ RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
        for prim, pct in result.recommended_enables:
            count = result.disabled_hits.get(prim, 0)
            print(f" → ENABLE {prim}: {count} spans ({pct:.1f}%)")
    else:
        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")

    # Low-frequency disabled (info only). Uses the guarded helper so that a
    # result with disabled hits but a zero span count cannot crash here.
    low_freq_disabled = [
        (p, c)
        for p, c in result.disabled_hits.items()
        if pct_of(c) < ENABLE_THRESHOLD_PCT
    ]
    if low_freq_disabled:
        print("\n📋 DISABLED BUT APPEARING (below threshold - no action)")
        for prim, count in sorted(low_freq_disabled, key=lambda x: -x[1])[:5]:
            print(f" {prim}: {count} ({pct_of(count):.1f}%)")

    # Weight issues
    if result.weight_issues:
        print("\n⚖️ WEIGHT ISSUES")
        for issue in result.weight_issues:
            print(f" • {issue}")

    print(f"\n⏱️ Validated at: {result.validated_at}")
    print("=" * 70)
|
||||
|
||||
|
||||
async def validate_sector(
    sector_code: str,
    db_url: str | None = None,
    verbose: bool = True,
) -> SectorValidation | None:
    """Run the sector-scoped validation for one sector.

    Args:
        sector_code: Sector identifier; must be a member of
            ``SECTORS_WITH_DATA`` for validation to proceed.
        db_url: Database URL. When falsy, the ``DATABASE_URL`` environment
            variable is used, then a hard-coded local default.
        verbose: When true, print diagnostics for every early exit and the
            full sector report on success.

    Returns:
        The analysis result, or ``None`` when the sector has no data, no L1
        config, no mapped businesses, or no spans.
    """

    def _notify(message: str) -> None:
        # All diagnostics are suppressed unless the caller asked for them.
        if verbose:
            print(message)

    if sector_code not in SECTORS_WITH_DATA:
        _notify(f"⚠️ {sector_code}: No real business data available for validation")
        return None

    config = load_l1_config(sector_code)
    if not config:
        _notify(f"❌ No L1 config found for {sector_code}")
        return None

    businesses = get_businesses_for_sector(sector_code)
    if not businesses:
        _notify(f"⚠️ {sector_code}: No businesses mapped")
        return None

    # NOTE(review): the fallback DSN embeds credentials; presumably a local
    # development default -- confirm it is never relied on in deployment.
    dsn = db_url if db_url else os.environ.get(
        "DATABASE_URL",
        "postgresql://scraper:scraper123@localhost:5437/scraper",
    )

    pool = await asyncpg.create_pool(dsn)
    try:
        spans = await fetch_spans_for_businesses(pool, businesses)
        if spans:
            outcome = analyze_sector_spans(spans, config, businesses)
            if verbose:
                print_sector_report(outcome, config)
            return outcome

        _notify(f"⚠️ {sector_code}: No spans found for businesses")
        return None
    finally:
        # The pool is closed on every exit path once it has been created.
        await pool.close()
|
||||
|
||||
|
||||
async def validate_all_sectors(db_url: str | None = None) -> dict[str, SectorValidation]:
    """Validate every sector in ``SECTORS_WITH_DATA`` and print a summary table.

    Each sector is validated verbosely (its full report is printed), then a
    one-line-per-sector summary follows.

    Args:
        db_url: Optional database URL passed through to ``validate_sector``.

    Returns:
        Mapping of sector code to its validation result, containing only the
        sectors for which ``validate_sector`` returned a truthy result.
    """
    validated = {}
    for sector in SECTORS_WITH_DATA:
        outcome = await validate_sector(sector, db_url, verbose=True)
        if not outcome:
            continue
        validated[sector] = outcome

    # Summary table
    heavy_rule = "=" * 70
    light_rule = "-" * 50

    print("\n" + heavy_rule)
    print("VALIDATION SUMMARY")
    print(heavy_rule)
    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
    print(light_rule)

    for sector, result in validated.items():
        enables = len(result.recommended_enables)
        if enables > 0:
            enables_str = f"{enables} recs"
        else:
            enables_str = "✓ OK"
        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")

    print(light_rule)
    print(f"Sectors validated: {len(validated)}/{len(SECTORS_WITH_DATA)}")
    # NOTE(review): 20 is presumably the total number of defined sectors --
    # confirm, and consider deriving it instead of hard-coding.
    print(f"Sectors without data: {20 - len(SECTORS_WITH_DATA)}")

    return validated
|
||||
|
||||
|
||||
async def generate_summary_report(db_url: str | None = None) -> dict:
    """Build a plain-dict summary of the validation for all sectors.

    Sectors are validated silently (``verbose=False``); sectors whose
    validation yields a falsy result are left out.

    Args:
        db_url: Optional database URL passed through to ``validate_sector``.

    Returns:
        Mapping of sector code to a dict with the keys ``span_count``,
        ``enabled_coverage`` (rounded to 3 decimals), ``recommended_enables``,
        ``weight_issues``, ``config_version`` and ``validated_at``.
    """
    summary = {}

    for sector in SECTORS_WITH_DATA:
        validation = await validate_sector(sector, db_url, verbose=False)
        if not validation:
            continue

        entry = {}
        entry["span_count"] = validation.span_count
        entry["enabled_coverage"] = round(validation.enabled_coverage, 3)
        entry["recommended_enables"] = validation.recommended_enables
        entry["weight_issues"] = validation.weight_issues
        entry["config_version"] = validation.config_version
        entry["validated_at"] = validation.validated_at
        summary[sector] = entry

    return summary
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for sector-scoped L1 config validation.

    Modes are checked in priority order: ``--report`` (print a JSON summary),
    then ``--all`` (validate every sector with data), then ``--sector``
    (validate one sector, upper-cased). With none of them, the help text and
    the list of sectors with real data are printed.
    """
    cli = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
    cli.add_argument("--sector", help="Validate specific sector")
    cli.add_argument("--all", action="store_true", help="Validate all sectors with data")
    cli.add_argument("--report", action="store_true", help="Generate JSON summary report")
    cli.add_argument("--db-url", help="Database URL")

    opts = cli.parse_args()

    if opts.report:
        summary = asyncio.run(generate_summary_report(opts.db_url))
        print(json.dumps(summary, indent=2))
        return

    if opts.all:
        asyncio.run(validate_all_sectors(opts.db_url))
        return

    if opts.sector:
        asyncio.run(validate_sector(opts.sector.upper(), opts.db_url))
        return

    # No mode selected: show usage plus the sectors that can be validated.
    cli.print_help()
    print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA)))


# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user