Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/packages/reviewiq-pipeline/scripts/backfill_review_facts.py
+++ b/packages/reviewiq-pipeline/scripts/backfill_review_facts.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+"""
+Backfill review_facts_v1 from public.jobs.reviews_data.
+
+Parses relative timestamps ("17 hours ago", "2 weeks ago") into absolute
+timestamps anchored to job.created_at.
+
+Usage:
+    python backfill_review_facts.py
+    python backfill_review_facts.py --dry-run
+    python backfill_review_facts.py --job-id <uuid>
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import re
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import asyncpg
+
+# Connection string: DATABASE_URL when set, otherwise a localhost fallback with embedded credentials
+DB_URL = os.environ.get(
+    "DATABASE_URL",
+    "postgresql://scraper:scraper123@localhost:5437/scraper"
+)
+
+
+# =============================================================================
+# RELATIVE TIMESTAMP PARSER
+# =============================================================================
+
+# Regex patterns for relative timestamps (not read anywhere in this script; the parser below has its own regex)
+RELATIVE_PATTERNS = [
+    # "17 hours ago", "2 weeks ago", "a month ago"
+    (r"(?:edited\s+)?(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", "standard"),
+    # "just now"
+    (r"just\s+now", "just_now"),
+    # "yesterday"
+    (r"yesterday", "yesterday"),
+    # "today"
+    (r"today", "today"),
+]
+
+# Seconds per unit; month and year are fixed approximations (30 and 365 days)
+TIME_UNITS = {
+    "second": 1,
+    "minute": 60,
+    "hour": 3600,
+    "day": 86400,
+    "week": 604800,
+    "month": 2592000,  # 30 days
+    "year": 31536000,  # 365 days
+}
+
+
+def parse_relative_timestamp(raw: str, reference_time: datetime) -> datetime | None:
+    """
+    Parse a relative timestamp string into an absolute datetime.
+
+    A leading "Edited " marker is stripped before any form is matched, so
+    "Edited yesterday" resolves exactly like "yesterday". Recognised forms
+    are "just now", "today", "yesterday" and "<N|a|an> <unit>(s) ago"; the
+    last one is matched at the start of the text only.
+
+    Args:
+        raw: Relative timestamp like "17 hours ago", "Edited 2 weeks ago"
+        reference_time: The instant that relative offsets are subtracted from
+
+    Returns:
+        Absolute datetime, or None if raw is empty or not a recognised form
+    """
+    if not raw:
+        return None
+
+    # Normalise case/whitespace and drop the "edited " prefix up front so
+    # every branch below sees the same text.
+    text = re.sub(r"^edited\s+", "", raw.lower().strip())
+
+    # "just now" may appear anywhere in the text; it carries no offset.
+    if "just now" in text:
+        return reference_time
+
+    # "today" maps to the reference instant itself.
+    if text == "today":
+        return reference_time
+
+    # "yesterday" is exactly one day before the reference.
+    if text == "yesterday":
+        return reference_time - timedelta(days=1)
+
+    # Match "N unit(s) ago" anchored at the start of the text.
+    match = re.match(r"(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", text)
+    if match is None:
+        # Unknown format
+        return None
+
+    quantity_str, unit = match.groups()
+
+    # "a"/"an" mean a quantity of one.
+    quantity = 1 if quantity_str in ("a", "an") else int(quantity_str)
+
+    # Months and years use the fixed-length approximations in TIME_UNITS.
+    seconds = quantity * TIME_UNITS.get(unit, 0)
+    return reference_time - timedelta(seconds=seconds)
+
+
+def parse_relative_timestamp_safe(raw: str, reference_time: datetime) -> tuple[datetime | None, bool]:
+    """
+    Non-raising variant: (parsed_time, True) on success, else (None, False).
+    """
+    try:
+        parsed = parse_relative_timestamp(raw, reference_time)
+    except Exception:
+        return None, False
+    return parsed, parsed is not None
+
+
+# =============================================================================
+# BACKFILL LOGIC
+# =============================================================================
+
+async def get_jobs_with_reviews(pool: asyncpg.Pool, job_id: str | None = None) -> list[dict]:
+    """Fetch jobs whose reviews_data is a JSON array (only the given job when job_id is set)."""
+    if not job_id:
+        query = """
+            SELECT job_id, created_at, reviews_data,
+                   COALESCE(metadata->>'business_name', url) as business_id
+            FROM public.jobs
+            WHERE reviews_data IS NOT NULL
+              AND jsonb_typeof(reviews_data) = 'array'
+            ORDER BY created_at DESC
+        """
+        records = await pool.fetch(query)
+    else:
+        query = """
+            SELECT job_id, created_at, reviews_data,
+                   COALESCE(metadata->>'business_name', url) as business_id
+            FROM public.jobs
+            WHERE job_id = $1
+              AND reviews_data IS NOT NULL
+              AND jsonb_typeof(reviews_data) = 'array'
+        """
+        records = await pool.fetch(query, job_id)
+
+    return list(map(dict, records))
+
+
+async def get_run_id_for_job(pool: asyncpg.Pool, job_id: str) -> str | None:
+    """Look up one non-null run_id recorded for the job in detected_spans_v2."""
+    record = await pool.fetchrow("""
+        SELECT DISTINCT run_id FROM pipeline.detected_spans_v2
+        WHERE job_id = $1 AND run_id IS NOT NULL
+        LIMIT 1
+    """, job_id)
+    return None if not record or not record["run_id"] else str(record["run_id"])
+
+
+async def get_language_for_review(pool: asyncpg.Pool, review_id: str) -> str | None:
+    """Return one non-null language stored for the review's spans, if any."""
+    record = await pool.fetchrow("""
+        SELECT language FROM pipeline.detected_spans_v2
+        WHERE review_id = $1 AND language IS NOT NULL
+        LIMIT 1
+    """, review_id)
+    return record["language"] if record else None
+
+
+async def upsert_review_facts(
+    pool: asyncpg.Pool,
+    facts: list[dict],
+    dry_run: bool = False,
+) -> tuple[int, int]:
+    """
+    Write review facts with INSERT ... ON CONFLICT (review_id) DO UPDATE.
+
+    Args:
+        pool: Connection pool the batched statement is executed on
+        facts: Dicts with review_id, business_id, job_id plus optional fields
+        dry_run: When True nothing is executed
+
+    Returns:
+        (rows_sent, 0); inserts and updates are not told apart, so the
+        second element is always 0. (0, 0) on dry run or empty input.
+    """
+    if dry_run or not facts:
+        return 0, 0
+
+    # On conflict a NULL incoming run_id/language keeps the stored value
+    query = """
+        INSERT INTO pipeline.review_facts_v1
+        (review_id, business_id, job_id, run_id, rating, review_time_utc, raw_timestamp, author, language)
+        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
+        ON CONFLICT (review_id) DO UPDATE SET
+            business_id = EXCLUDED.business_id,
+            job_id = EXCLUDED.job_id,
+            run_id = COALESCE(EXCLUDED.run_id, pipeline.review_facts_v1.run_id),
+            rating = EXCLUDED.rating,
+            review_time_utc = EXCLUDED.review_time_utc,
+            raw_timestamp = EXCLUDED.raw_timestamp,
+            author = EXCLUDED.author,
+            language = COALESCE(EXCLUDED.language, pipeline.review_facts_v1.language)
+    """
+
+    # Required keys are indexed directly; optional ones default to None
+    required = ("review_id", "business_id", "job_id")
+    optional = (
+        "run_id", "rating", "review_time_utc", "raw_timestamp", "author", "language",
+    )
+    records = [
+        tuple(f[k] for k in required) + tuple(f.get(k) for k in optional)
+        for f in facts
+    ]
+
+    await pool.executemany(query, records)
+    return len(records), 0
+
+
+async def backfill_job(
+    pool: asyncpg.Pool,
+    job: dict,
+    dry_run: bool = False,
+    verbose: bool = False,
+) -> dict[str, Any]:
+    """
+    Backfill review facts for a single job row (job_id, created_at, business_id, reviews_data).
+
+    Returns:
+        Stats dict; entries that are not JSON objects or lack review_id are counted but skipped
+    """
+    job_id = job["job_id"]
+    job_created = job["created_at"]
+    business_id = job["business_id"]
+    reviews_data = job["reviews_data"]
+
+    # asyncpg may return JSONB as string
+    if isinstance(reviews_data, str):
+        reviews_data = json.loads(reviews_data)
+
+    # Make job_created timezone-aware if it isn't
+    if job_created.tzinfo is None:
+        job_created = job_created.replace(tzinfo=timezone.utc)
+
+    # Get run_id for this job
+    run_id = await get_run_id_for_job(pool, str(job_id))
+
+    stats = {
+        "job_id": str(job_id),
+        "total_reviews": 0,
+        "parsed_ok": 0,
+        "parsed_failed": 0,
+        "inserted": 0,
+        "sample_failures": [],
+    }
+
+    facts = []
+
+    for review in reviews_data:
+        stats["total_reviews"] += 1
+
+        # Handle both dict and JSON string; any other JSON value is skipped below
+        if isinstance(review, str):
+            try:
+                review = json.loads(review)
+            except json.JSONDecodeError:
+                continue
+
+        review_id = review.get("review_id") if isinstance(review, dict) else None
+        if not review_id:
+            continue
+
+        raw_timestamp = review.get("timestamp", "")
+        review_time, success = parse_relative_timestamp_safe(raw_timestamp, job_created)
+
+        if success:
+            stats["parsed_ok"] += 1
+        else:
+            stats["parsed_failed"] += 1
+            if len(stats["sample_failures"]) < 5:
+                stats["sample_failures"].append(raw_timestamp)
+
+        # Get language from spans if available (one query per review; skipped on dry run)
+        language = await get_language_for_review(pool, review_id) if not dry_run else None
+
+        facts.append({
+            "review_id": review_id,
+            "business_id": business_id,
+            "job_id": job_id,
+            "run_id": run_id,
+            "rating": review.get("rating"),
+            "review_time_utc": review_time,
+            "raw_timestamp": raw_timestamp,
+            "author": review.get("author"),
+            "language": language,
+        })
+
+    # Upsert
+    inserted, _ = await upsert_review_facts(pool, facts, dry_run=dry_run)
+    stats["inserted"] = inserted
+
+    if verbose:
+        print(f"  Job {job_id}: {stats['total_reviews']} reviews, "
+              f"{stats['parsed_ok']} parsed OK, {stats['parsed_failed']} failed")
+        if stats["sample_failures"]:
+            print(f"    Sample failures: {stats['sample_failures'][:3]}")
+
+    return stats
+
+
+async def backfill_all(
+    pool: asyncpg.Pool,
+    job_id: str | None = None,
+    dry_run: bool = False,
+    verbose: bool = False,
+) -> dict[str, Any]:
+    """
+    Backfill review facts for all jobs (or a specific job).
+
+    Returns:
+        Aggregate stats; unique_failure_patterns is a sorted list of at most 20 samples
+    """
+    jobs = await get_jobs_with_reviews(pool, job_id)
+
+    print(f"\n{'[DRY RUN] ' if dry_run else ''}Backfilling review_facts_v1 from {len(jobs)} jobs...")
+
+    aggregate = {
+        "jobs_processed": 0,
+        "total_reviews": 0,
+        "parsed_ok": 0,
+        "parsed_failed": 0,
+        "inserted": 0,
+        "unique_failure_patterns": set(),
+    }
+
+    for i, job in enumerate(jobs, 1):
+        if verbose:
+            print(f"\n[{i}/{len(jobs)}] Processing job {job['job_id']}...")
+
+        stats = await backfill_job(pool, job, dry_run=dry_run, verbose=verbose)
+
+        aggregate["jobs_processed"] += 1
+        aggregate["total_reviews"] += stats["total_reviews"]
+        aggregate["parsed_ok"] += stats["parsed_ok"]
+        aggregate["parsed_failed"] += stats["parsed_failed"]
+        aggregate["inserted"] += stats["inserted"]
+        aggregate["unique_failure_patterns"].update(stats["sample_failures"])
+
+    # Set -> list for JSON serialization; sorted by text so the 20-item sample is stable between runs
+    aggregate["unique_failure_patterns"] = sorted(aggregate["unique_failure_patterns"], key=str)[:20]
+
+    return aggregate
+
+
+# =============================================================================
+# CLI
+# =============================================================================
+
+async def main_async(args):
+    """Run the backfill with the parsed CLI args and print a summary report."""
+    pool = await asyncpg.create_pool(DB_URL)
+
+    try:
+        stats = await backfill_all(
+            pool,
+            job_id=args.job_id,
+            dry_run=args.dry_run,
+            verbose=args.verbose,
+        )
+        denom = max(stats['total_reviews'], 1)  # avoids dividing by zero when no reviews were seen
+
+        print("\n" + "=" * 60)
+        print("BACKFILL COMPLETE")
+        print("=" * 60)
+        print(f"Jobs processed:    {stats['jobs_processed']}")
+        print(f"Total reviews:     {stats['total_reviews']}")
+        print(f"Timestamps parsed: {stats['parsed_ok']} ({stats['parsed_ok']/denom*100:.1f}%)")
+        print(f"Timestamps failed: {stats['parsed_failed']} ({stats['parsed_failed']/denom*100:.1f}%)")
+        if not args.dry_run:
+            print(f"Records upserted:  {stats['inserted']}")
+
+        patterns = stats["unique_failure_patterns"]
+        if patterns:
+            print(f"\nUnparsed timestamp patterns ({len(patterns)}):")
+            for p in patterns[:10]:
+                print(f"  - \"{p}\"")
+
+        coverage = stats['parsed_ok'] / denom * 100
+        if coverage >= 90:
+            print(f"\n✅ Timestamp coverage: {coverage:.1f}%")
+        else:
+            print(f"\n⚠️  WARNING: Timestamp coverage is {coverage:.1f}% (target: >90%)")
+    finally:
+        await pool.close()
+
+
+def main():
+    """Parse command-line flags and drive the async backfill to completion."""
+    cli = argparse.ArgumentParser(description="Backfill review_facts_v1")
+    cli.add_argument("--job-id", help="Process a specific job only")
+    cli.add_argument("--dry-run", action="store_true", help="Don't write to database")
+    cli.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+    asyncio.run(main_async(cli.parse_args()))
+
+
+if __name__ == "__main__":
+    main()
--- a/packages/reviewiq-pipeline/scripts/config_resolver_standalone.py
+++ b/packages/reviewiq-pipeline/scripts/config_resolver_standalone.py
@@ -0,0 +1,226 @@
+"""
+Config Resolver - Standalone version for scripts.
+
+Resolves L1 config + sector brief for classification.
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Paths
+DATA_DIR = Path(__file__).parent.parent / "data"
+CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
+L2_CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l2"
+BRIEFS_DIR = DATA_DIR / "sector_briefs"
+
+# Meta primitives - resolve_enabled_set() adds these to every enabled set
+META_PRIMITIVES = frozenset([
+    "HONESTY", "ETHICS", "PROMISES",
+    "ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
+    "RETURN_INTENT", "RECOMMEND", "RECOGNITION",
+    "UNMAPPED",
+])
+
+# Core primitives: code -> {"domain", "name", "def"}; build_primitives_for_prompt() copies these entries
+CORE_PRIMITIVES = {
+    "TASTE": {"domain": "O", "name": "Taste/Flavor", "def": "Sensory quality of food/beverage"},
+    "CRAFT": {"domain": "O", "name": "Craftsmanship", "def": "Skill of execution/preparation"},
+    "FRESHNESS": {"domain": "O", "name": "Freshness", "def": "Newness, not stale or old"},
+    "TEMPERATURE": {"domain": "O", "name": "Temperature", "def": "Hot/cold as expected"},
+    "EFFECTIVENESS": {"domain": "O", "name": "Effectiveness", "def": "Achieves intended purpose"},
+    "ACCURACY": {"domain": "O", "name": "Accuracy", "def": "Correct, as ordered/specified"},
+    "CONDITION": {"domain": "O", "name": "Condition", "def": "Physical state, wear, damage"},
+    "CONSISTENCY": {"domain": "O", "name": "Consistency", "def": "Same quality each time"},
+    "MANNER": {"domain": "P", "name": "Manner/Attitude", "def": "Friendliness, respect, warmth"},
+    "COMPETENCE": {"domain": "P", "name": "Competence", "def": "Knowledge and skill of staff"},
+    "ATTENTIVENESS": {"domain": "P", "name": "Attentiveness", "def": "Being present, responsive"},
+    "COMMUNICATION": {"domain": "P", "name": "Communication", "def": "Clarity, listening, updates"},
+    "SPEED": {"domain": "J", "name": "Speed/Wait", "def": "Time to service, waiting"},
+    "FRICTION": {"domain": "J", "name": "Friction", "def": "Obstacles, hassles, complexity"},
+    "RELIABILITY": {"domain": "J", "name": "Reliability", "def": "Dependable, keeps promises"},
+    "AVAILABILITY": {"domain": "J", "name": "Availability", "def": "Open when needed, bookable"},
+    "CLEANLINESS": {"domain": "E", "name": "Cleanliness", "def": "Hygiene, tidiness"},
+    "COMFORT": {"domain": "E", "name": "Comfort", "def": "Physical ease, seating"},
+    "SAFETY": {"domain": "E", "name": "Safety", "def": "Free from harm/danger"},
+    "AMBIANCE": {"domain": "E", "name": "Ambiance", "def": "Atmosphere, mood, vibe"},
+    "ACCESSIBILITY": {"domain": "E", "name": "Accessibility", "def": "Easy to reach, navigate"},
+    "DIGITAL_UX": {"domain": "E", "name": "Digital Experience", "def": "Website, app, online"},
+    "PRICE_LEVEL": {"domain": "V", "name": "Price Level", "def": "Absolute cost (cheap/expensive)"},
+    "PRICE_FAIRNESS": {"domain": "V", "name": "Price Fairness", "def": "Reasonable for what you get"},
+    "PRICE_TRANSPARENCY": {"domain": "V", "name": "Price Transparency", "def": "No hidden fees, clear pricing"},
+    "VALUE_FOR_MONEY": {"domain": "V", "name": "Value for Money", "def": "Worth what you paid"},
+}
+
+
+class ConfigResolver:
+    """Resolves classification config (L1 sector config + optional L2 delta + brief) for a business."""
+
+    def __init__(self):
+        self._l1_cache: dict[str, dict] = {}  # keyed by sector_code
+        self._l2_cache: dict[str, dict] = {}  # keyed by L2 config file name
+        self._brief_cache: dict[str, dict] = {}  # keyed by sector_code
+
+    def _load_l2_configs(self) -> list[dict[str, Any]]:
+        """Load all L2 config files, parsing each file at most once per resolver."""
+        if not L2_CONFIGS_DIR.exists():
+            return []
+        configs = []
+        for config_path in L2_CONFIGS_DIR.glob("*_config.json"):
+            try:
+                if config_path.name not in self._l2_cache:
+                    with open(config_path) as f:
+                        self._l2_cache[config_path.name] = json.load(f)
+                configs.append(self._l2_cache[config_path.name])
+            except Exception as e:
+                logger.warning(f"Failed to load L2 config {config_path}: {e}")
+        return configs
+
+    def _find_matching_l2(self, gbp_path: str) -> dict[str, Any] | None:
+        """Find the L2 config whose gbp_path is the longest prefix of gbp_path."""
+        if not gbp_path:
+            return None
+
+        # Skip L2 configs without a gbp_path: "" is a prefix of every path
+        matches = []
+        for config in self._load_l2_configs():
+            l2_path = config.get("gbp_path") or ""
+            if l2_path and gbp_path.startswith(l2_path):
+                matches.append((len(l2_path), config))
+
+        if not matches:
+            return None
+
+        # Most specific (longest) path wins; ties keep load order
+        return max(matches, key=lambda m: m[0])[1]
+
+    def _apply_l2_delta(self, l1_config: dict, l2_config: dict) -> dict:
+        """Return a copy of l1_config with the L2 delta's enables and weights merged in."""
+        result = l1_config.copy()
+        delta = l2_config.get("delta", {})
+
+        # Enable additional primitives
+        if "enable" in delta:
+            enabled = set(result.get("enabled", []))
+            enabled.update(delta["enable"])
+            result["enabled"] = list(enabled)
+
+        # Merge weights
+        if "weights" in delta:
+            weights = dict(result.get("weights", {}))
+            weights.update(delta["weights"])
+            result["weights"] = weights
+
+        # Update config version to indicate L2
+        result["config_version"] = l2_config.get("config_version", result.get("config_version", "1.0"))
+        result["l2_applied"] = l2_config.get("gbp_path")
+
+        return result
+
+    def _load_l1_config(self, sector_code: str) -> dict[str, Any] | None:
+        """Return the cached-or-loaded L1 config for a sector, or None if its file is missing."""
+        if sector_code in self._l1_cache:
+            return self._l1_cache[sector_code]
+
+        config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
+        if not config_path.exists():
+            return None
+
+        with open(config_path) as f:
+            config = json.load(f)
+        self._l1_cache[sector_code] = config
+        return config
+
+    def _load_sector_brief(self, sector_code: str) -> dict[str, Any] | None:
+        """Return the cached-or-loaded sector brief, or None if its file is missing."""
+        if sector_code in self._brief_cache:
+            return self._brief_cache[sector_code]
+
+        brief_path = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
+        if not brief_path.exists():
+            return None
+
+        with open(brief_path) as f:
+            brief = json.load(f)
+        self._brief_cache[sector_code] = brief
+        return brief
+
+    async def get_business_mapping(self, pool, business_id: str) -> dict[str, Any] | None:
+        """Fetch the taxonomy mapping row for a business as a dict, or None if there is no row."""
+        row = await pool.fetchrow("""
+            SELECT business_id, gbp_path::text, sector_code
+            FROM pipeline.business_taxonomy_map
+            WHERE business_id = $1
+        """, business_id)
+        return dict(row) if row else None
+
+    def resolve_enabled_set(self, l1_config: dict) -> set[str]:
+        """Return the config's enabled primitives plus every meta primitive."""
+        enabled = set(l1_config.get("enabled", []))
+        return enabled | META_PRIMITIVES
+
+    def build_primitives_for_prompt(self, enabled: set[str], weights: dict[str, float]) -> dict[str, dict]:
+        """Describe each enabled primitive for the prompt; names that are neither core nor meta are dropped."""
+        result = {}
+        for prim in enabled:
+            if prim in CORE_PRIMITIVES:
+                result[prim] = entry = CORE_PRIMITIVES[prim].copy()
+                if prim in weights:
+                    entry["weight"] = weights[prim]
+            elif prim in META_PRIMITIVES:
+                result[prim] = {"domain": "M", "name": prim.replace("_", " ").title(), "meta": True}
+        return result
+
+    def extract_brief_signals(self, brief: dict) -> dict[str, Any]:
+        """Pick the prompt-facing fields out of a sector brief."""
+        # A missing or empty brief yields no signals
+        return {} if not brief else {
+            "sector": brief.get("sector_code"),
+            "what_customers_judge": brief.get("what_customers_judge"),
+            "critical_pain_points": brief.get("critical_pain_points"),
+            "industry_terminology": brief.get("industry_terminology"),
+        }
+
+    async def resolve(self, business_id: str, pool, mode: str | None = None) -> dict[str, Any] | None:
+        """Build the resolved classification config for a business, or None if it has no taxonomy mapping."""
+        mapping = await self.get_business_mapping(pool, business_id)
+        if not mapping:
+            return None
+
+        sector_code = mapping["sector_code"]
+        gbp_path = mapping["gbp_path"]
+
+        # Load L1 config (sector-level); without one, every core primitive is enabled
+        l1_config = self._load_l1_config(sector_code)
+        if not l1_config:
+            l1_config = {"enabled": list(CORE_PRIMITIVES.keys()), "weights": {}}
+
+        # Check for L2 config (category-level delta)
+        l2_config = self._find_matching_l2(gbp_path)
+        if l2_config:
+            logger.info(f"Applying L2 delta for {gbp_path}: {l2_config.get('gbp_path')}")
+            l1_config = self._apply_l2_delta(l1_config, l2_config)
+
+        brief = self._load_sector_brief(sector_code)
+        enabled = self.resolve_enabled_set(l1_config)
+        weights = dict(l1_config.get("weights", {}))
+        primitives = self.build_primitives_for_prompt(enabled, weights)
+        brief_signals = self.extract_brief_signals(brief)
+
+        return {
+            "business_id": business_id,
+            "gbp_path": gbp_path,
+            "sector_code": sector_code,
+            "config_version": l1_config.get("config_version", "1.0"),
+            "l2_applied": l1_config.get("l2_applied"),
+            "modes": [mode] if mode else ["in_person"],
+            "default_mode": mode or "in_person",
+            "enabled_primitives": sorted(enabled),
+            "disabled_primitives": sorted(set(l1_config.get("disabled", [])) - enabled),  # never also enabled
+            "weights": weights,
+            "brief": brief_signals,
+            "primitives": primitives,
+        }
--- a/packages/reviewiq-pipeline/scripts/fix_l1_configs.py
+++ b/packages/reviewiq-pipeline/scripts/fix_l1_configs.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+"""
+Fix L1 configs based on validation results.
+
+Applies fixes discovered during validation:
+1. Enable primitives that were disabled but appearing frequently
+2. Remove weights for primitives with zero appearances
+3. Add weights for high-frequency unweighted primitives
+"""
+
+import json
+from pathlib import Path
+
+CONFIGS_DIR = Path(__file__).parent.parent / "data" / "primitive_configs" / "l1"
+
+# Per-sector fixes from validation results; each key is a sector code that fix_config() lower-cases into a file name
+# Format: { sector: { "enable": [primitives], "disable": [primitives], "add_weight": {prim: weight}, "remove_weight": [prims] } }
+FIXES = {
+    "ENTERTAINMENT": {
+        "enable": ["CRAFT", "CONSISTENCY", "COMMUNICATION", "FRICTION"],
+        "disable": [],
+        "add_weight": {},
+        "remove_weight": ["CONDITION"],  # 0 appearances despite 1.4x weight
+    },
+    "FOOD_DINING": {
+        "enable": ["PRICE_LEVEL", "ACCESSIBILITY", "PRICE_TRANSPARENCY", "FRICTION", "EFFECTIVENESS"],
+        "disable": [],
+        "add_weight": {},
+        "remove_weight": [],
+    },
+    "AUTOMOTIVE": {
+        "enable": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL", "AMBIANCE"],
+        "disable": [],
+        "add_weight": {},
+        "remove_weight": [],
+    },
+    "HEALTHCARE": {
+        "enable": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
+        "disable": [],
+        "add_weight": {},
+        "remove_weight": [],
+    },
+    "RETAIL_SHOPPING": {
+        "enable": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
+        "disable": [],
+        "add_weight": {},
+        "remove_weight": [],
+    },
+    "HOSPITALITY_TRAVEL": {
+        "enable": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL"],
+        "disable": [],
+        "add_weight": {},
+        "remove_weight": [],
+    },
+    "PERSONAL_SERVICES": {
+        "enable": ["PRICE_LEVEL", "SPEED", "FRICTION"],
+        "disable": [],
+        "add_weight": {},
+        "remove_weight": [],
+    },
+}
+
+
+def fix_config(sector_code: str, fixes: dict) -> list[str] | None:
+    """Apply fixes to a sector config; returns change notes ([] = file left untouched) or None if the file is missing."""
+    config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
+
+    if not config_path.exists():
+        print(f"  ⚠️  Config not found: {config_path}")
+        return None
+
+    with open(config_path) as f:
+        config = json.load(f)
+
+    enabled = set(config.get("enabled", []))
+    disabled = set(config.get("disabled", []))
+    weights = config.get("weights", {})
+
+    changes = []
+
+    # Apply enables (move from disabled to enabled)
+    for prim in fixes.get("enable", []):
+        if prim in disabled:
+            disabled.remove(prim)
+            enabled.add(prim)
+            changes.append(f"✓ Enabled {prim}")
+        elif prim not in enabled:
+            enabled.add(prim)
+            changes.append(f"✓ Added {prim} to enabled")
+
+    # Apply disables (move from enabled to disabled)
+    for prim in fixes.get("disable", []):
+        if prim in enabled:
+            enabled.remove(prim)
+            disabled.add(prim)
+            changes.append(f"✗ Disabled {prim}")
+
+    # Add weights (an existing weight is never overwritten)
+    for prim, weight in fixes.get("add_weight", {}).items():
+        if prim not in weights:
+            weights[prim] = weight
+            changes.append(f"⚖️ Added weight {prim}: {weight}x")
+
+    # Remove weights
+    for prim in fixes.get("remove_weight", []):
+        if prim in weights:
+            del weights[prim]
+            changes.append(f"⚖️ Removed weight for {prim}")
+
+    # Nothing to change: leave the file and its version untouched
+    if not changes:
+        return changes
+
+    config["enabled"] = sorted(enabled)
+    config["disabled"] = sorted(disabled)
+    config["weights"] = dict(sorted(weights.items()))
+    config["config_version"] = "1.1"  # Bump version
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2)
+        f.write("\n")
+    return changes
+
+
+def main():
+    """Apply every entry in FIXES and print a per-sector change report."""
+    rule = "=" * 60
+    print(rule)
+    print("L1 CONFIG FIXER - Applying validation-based fixes")
+    print(rule)
+    applied = 0
+    for sector, fixes in FIXES.items():
+        print(f"\n📁 {sector}")
+        changes = fix_config(sector, fixes)
+        if not changes:
+            print("   No changes applied")
+            continue
+        for change in changes:
+            print(f"   {change}")
+        applied += len(changes)
+
+    print(f"\n{rule}")
+    print(f"Total changes applied: {applied}")
+    print("Config version bumped to 1.1")
+    print(rule)
+
+
+if __name__ == "__main__":
+    main()
--- a/packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
+++ b/packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+"""
+Guarded L1 Config Fixer - V2 (Threshold-based, Sector-scoped)
+
+Only applies fixes when:
+1. Evidence is from sector-scoped validation
+2. Frequency exceeds threshold (default 3%)
+3. Changes are logged with version bump
+
+Usage:
+    python fix_l1_configs_v2.py --apply         # Apply fixes from validation
+    python fix_l1_configs_v2.py --dry-run       # Show what would change
+    python fix_l1_configs_v2.py --revert SECTOR # Revert to previous version
+"""
+
+import argparse
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+CONFIGS_DIR = Path(__file__).parent.parent / "data" / "primitive_configs" / "l1"
+CHANGELOG_FILE = CONFIGS_DIR / "CHANGELOG.json"
+
+# Minimum threshold for auto-enabling (% of sector spans)
+ENABLE_THRESHOLD_PCT = 3.0
+
+# Fixes derived from sector-scoped validation (validate_l1_configs_v2.py output); the ONLY fixes that should be applied
+# Tuple shapes read by apply_fixes(): enable=(prim, pct, reason), add_weight=(prim, weight, reason), remove_weight=(prim, reason)
+SECTOR_SCOPED_FIXES = {
+    "ENTERTAINMENT": {
+        "evidence": "2,320 spans from Go Karts + Soho Club",
+        "enable": [
+            ("TASTE", 4.3, "Entertainment venues have concessions/food service"),
+        ],
+        "add_weight": [
+            ("CRAFT", 1.3, "13.4% frequency but unweighted"),
+        ],
+        "remove_weight": [],
+    },
+    "FOOD_DINING": {
+        "evidence": "61 spans from Fika cafe",
+        "enable": [
+            ("COMFORT", 9.8, "Seating/atmosphere comfort matters for cafes"),
+        ],
+        "add_weight": [
+            ("AVAILABILITY", 1.2, "16.4% frequency but unweighted"),
+        ],
+        "remove_weight": [
+            # Note: Small sample size (61 spans) - these may be false negatives
+            # Keep weights but flag for review with more data
+        ],
+    },
+    "AUTOMOTIVE": {
+        "evidence": "1,201 spans from ClickRent car rental",
+        "enable": [],  # Nothing exceeds 3% threshold
+        "add_weight": [],
+        "remove_weight": [
+            # CONDITION, HONESTY, PROMISES, RECOVERY all have 0 appearances
+            # However, may be specific to rental vs repair - keep for now
+        ],
+    },
+}
+
+
+def load_changelog() -> list[dict]:
+    """Return the parsed changelog file contents, or [] when the file does not exist."""
+    if not CHANGELOG_FILE.exists():
+        return []
+    with open(CHANGELOG_FILE) as fh:
+        return json.load(fh)
+
+
+def save_changelog(entries: list[dict]) -> None:
+    """Write entries to the changelog as indented JSON, creating the parent directory if needed."""
+    CHANGELOG_FILE.parent.mkdir(parents=True, exist_ok=True)
+    with CHANGELOG_FILE.open("w") as out:
+        json.dump(entries, out, indent=2)
+        out.write("\n")
+
+
+def load_config(sector_code: str) -> dict[str, Any] | None:
+    """Read <sector>_config.json (lower-cased sector code); None if the file is absent."""
+    path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
+    if path.exists():
+        with path.open() as src:
+            return json.load(src)
+    return None
+
+
+def save_config(sector_code: str, config: dict[str, Any]) -> None:
+    """Overwrite <sector>_config.json with config as indented JSON plus a final newline."""
+    target = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
+    with target.open("w") as out:
+        json.dump(config, out, indent=2)
+        out.write("\n")
+
+
+def apply_fixes(sector_code: str, fixes: dict, dry_run: bool = False) -> list[str]:
+    """Apply one sector's fixes and return notes; below-threshold skips alone never bump the version or write files."""
+    config = load_config(sector_code)
+    if not config:
+        return [f"❌ Config not found for {sector_code}"]
+
+    enabled = set(config.get("enabled", []))
+    disabled = set(config.get("disabled", []))
+    weights = config.get("weights", {})
+
+    changes, skipped = [], []  # real edits vs. informational skip notes
+    evidence = fixes.get("evidence", "unknown")
+
+    # Enable primitives
+    for prim, pct, reason in fixes.get("enable", []):
+        if pct < ENABLE_THRESHOLD_PCT:
+            skipped.append(f"⚠️  SKIP {prim}: {pct:.1f}% below {ENABLE_THRESHOLD_PCT}% threshold")
+            continue
+
+        if prim in disabled:
+            disabled.remove(prim)
+            enabled.add(prim)
+            changes.append(f"✓ ENABLE {prim}: {pct:.1f}% in sector data ({reason})")
+        elif prim not in enabled:
+            enabled.add(prim)
+            changes.append(f"✓ ADD {prim}: {pct:.1f}% in sector data ({reason})")
+
+    # Add weights (an existing weight is never overwritten)
+    for prim, weight, reason in fixes.get("add_weight", []):
+        if prim not in weights:
+            weights[prim] = weight
+            changes.append(f"⚖️  WEIGHT {prim}: {weight}x ({reason})")
+
+    # Remove weights
+    for prim, reason in fixes.get("remove_weight", []):
+        if prim in weights:
+            del weights[prim]
+            changes.append(f"⚖️  UNWEIGHT {prim}: ({reason})")
+
+    if not changes:
+        return skipped + ["✓ No changes needed"]
+
+    if not dry_run:
+        # Bump the last numeric component; tolerates "1", "1.2.3" or a numeric JSON value
+        old_version = str(config.get("config_version", "1.0"))
+        head, dot, tail = old_version.rpartition(".")
+        new_version = f"{head}.{int(tail) + 1}" if dot and tail.isdecimal() else f"{old_version}.1"
+
+        config["enabled"] = sorted(enabled)
+        config["disabled"] = sorted(disabled)
+        config["weights"] = dict(sorted(weights.items()))
+        config["config_version"] = new_version
+        config["config_updated_at"] = datetime.now(timezone.utc).isoformat()
+
+        save_config(sector_code, config)
+
+        # Log to changelog
+        changelog = load_changelog()
+        changelog.append({
+            "sector": sector_code,
+            "version": new_version,
+            "previous_version": old_version,
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "evidence": evidence,
+            "changes": skipped + changes,
+        })
+        save_changelog(changelog)
+
+        changes.append(f"📝 Version: {old_version} → {new_version}")
+
+    return skipped + changes
+
+
+def revert_config(sector_code: str, to_version: str | None = None) -> list[str]:
+    """Revert a config to a previous version."""
+    changelog = load_changelog()
+
+    # Find entries for this sector
+    sector_entries = [e for e in changelog if e["sector"] == sector_code]
+    if not sector_entries:
+        return [f"❌ No changelog entries for {sector_code}"]
+
+    # TODO: Implement actual revert by storing full config snapshots
+    return [f"⚠️  Revert not yet implemented - manual restore required"]
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Guarded L1 config fixer")
+    parser.add_argument("--apply", action="store_true", help="Apply sector-scoped fixes")
+    parser.add_argument("--dry-run", action="store_true", help="Show what would change")
+    parser.add_argument("--revert", metavar="SECTOR", help="Revert sector to previous version")
+    parser.add_argument("--sector", help="Apply to specific sector only")
+    parser.add_argument("--show-changelog", action="store_true", help="Show changelog")
+
+    args = parser.parse_args()
+
+    if args.show_changelog:
+        changelog = load_changelog()
+        print(json.dumps(changelog, indent=2))
+        return
+
+    if args.revert:
+        changes = revert_config(args.revert.upper())
+        for change in changes:
+            print(change)
+        return
+
+    if args.apply or args.dry_run:
+        print("=" * 60)
+        print(f"L1 CONFIG FIXER V2 - {'DRY RUN' if args.dry_run else 'APPLYING FIXES'}")
+        print(f"Threshold: {ENABLE_THRESHOLD_PCT}%")
+        print("=" * 60)
+
+        sectors = [args.sector.upper()] if args.sector else SECTOR_SCOPED_FIXES.keys()
+
+        for sector in sectors:
+            if sector not in SECTOR_SCOPED_FIXES:
+                print(f"\n⚠️  {sector}: No sector-scoped fixes defined")
+                continue
+
+            print(f"\n📁 {sector}")
+            print(f"   Evidence: {SECTOR_SCOPED_FIXES[sector]['evidence']}")
+
+            changes = apply_fixes(sector, SECTOR_SCOPED_FIXES[sector], dry_run=args.dry_run)
+            for change in changes:
+                print(f"   {change}")
+
+        print("\n" + "=" * 60)
+        if args.dry_run:
+            print("DRY RUN - No changes applied")
+        else:
+            print("Fixes applied - see CHANGELOG.json for history")
+        print("=" * 60)
+        return
+
+    parser.print_help()
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
+++ b/packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python3
+"""
+Wave 0: Sector Brief Generator
+
+Generates alignment context briefs for each sector.
+These briefs inform Wave 1 and Wave 2 primitive config generation.
+
+Usage:
+    python generate_sector_briefs.py                       # Generate all sectors
+    python generate_sector_briefs.py --sector FOOD_DINING  # Generate one sector
+    python generate_sector_briefs.py --dry-run             # Show what would be generated
+    python generate_sector_briefs.py --validate            # Validate existing briefs
+"""
+
+import argparse
+import json
+import os
+import sys
+from datetime import datetime
+from pathlib import Path
+
+try:
+    from openai import OpenAI
+except ImportError:
+    print("ERROR: openai package required. Install with: pip install openai")
+    sys.exit(1)
+
+
+# User-message template for brief generation. It is filled with str.format()
+# using the fields sector_code, sector_name, description and business_types;
+# literal JSON braces are therefore doubled ({{ and }}) throughout.
+PROMPT_TEMPLATE = '''You are an expert in customer experience analysis across industries.
+
+Your task: Generate a **sector brief** for the "{sector_name}" sector.
+
+This brief will be used to align classification agents with industry-specific context.
+It describes what customers care about — NOT how to classify, NOT what primitives to use.
+
+## Sector Information
+
+- **Code**: {sector_code}
+- **Name**: {sector_name}
+- **Description**: {description}
+- **Sample Business Types**: {business_types}
+
+## Output Requirements
+
+Generate a JSON object with this exact structure:
+
+```json
+{{
+  "sector_code": "{sector_code}",
+  "sector_name": "{sector_name}",
+  "generated_at": "<ISO timestamp>",
+  "version": "1.0",
+
+  "what_customers_judge": {{
+    "description": "The primary dimensions customers evaluate in this sector",
+    "items": [
+      {{
+        "aspect": "string (2-5 words)",
+        "importance": "critical | high | moderate",
+        "why_it_matters": "string (1 sentence)"
+      }}
+    ]
+  }},
+
+  "critical_pain_points": {{
+    "description": "What damages reputation most severely",
+    "items": [
+      {{
+        "pain_point": "string (2-5 words)",
+        "typical_language": ["phrases customers actually use in reviews"],
+        "reputation_impact": "severe | significant | moderate"
+      }}
+    ]
+  }},
+
+  "common_praise": {{
+    "description": "What earns customer loyalty and positive reviews",
+    "items": [
+      {{
+        "praise_area": "string (2-5 words)",
+        "typical_language": ["phrases customers actually use in reviews"],
+        "loyalty_impact": "high | moderate"
+      }}
+    ]
+  }},
+
+  "industry_terminology": {{
+    "description": "Domain-specific vocabulary",
+    "staff_terms": ["terms for staff roles in this sector"],
+    "product_terms": ["terms for products/services"],
+    "process_terms": ["terms for processes/interactions"],
+    "quality_terms": ["positive quality descriptors"],
+    "problem_terms": ["negative quality descriptors"]
+  }},
+
+  "mode_specific_concerns": {{
+    "description": "Different service modes have different priorities",
+    "modes": [
+      {{
+        "mode": "string (e.g., 'In-person', 'Online', 'Phone')",
+        "primary_concerns": ["top concerns for this mode"],
+        "unique_pain_points": ["pain points specific to this mode"]
+      }}
+    ]
+  }},
+
+  "what_is_actionable": {{
+    "description": "Feedback businesses can act on",
+    "actionable_examples": [
+      {{
+        "feedback_type": "string",
+        "example": "string (realistic review excerpt)",
+        "action_owner": "role/team that can fix it"
+      }}
+    ],
+    "not_actionable_examples": [
+      {{
+        "feedback_type": "string",
+        "example": "string (realistic review excerpt)",
+        "why_not_actionable": "string"
+      }}
+    ]
+  }},
+
+  "sector_specific_signals": {{
+    "description": "Signals with sector-specific meaning",
+    "examples": [
+      {{
+        "signal": "string (word or phrase)",
+        "meaning_in_this_sector": "string",
+        "contrast_with": "how it differs in other sectors"
+      }}
+    ]
+  }}
+}}
+```
+
+## Critical Rules
+
+1. **Use realistic review language** in `typical_language` arrays - actual phrases customers write
+2. **Include 4-8 items** per array (not too few, not excessive)
+3. **Be sector-specific** - don't use generic phrases that apply to all businesses
+4. **Include appropriate modes** - only modes that actually exist in this sector
+5. **NO primitive codes, priorities, weights, or solutions**
+6. **Focus on WHAT customers care about**, not HOW to classify it
+
+Return ONLY the JSON object, no markdown formatting or explanation.'''
+
+
+def load_sectors(data_path: Path) -> list[dict]:
+    """Load sector definitions from JSON file."""
+    with open(data_path) as f:
+        data = json.load(f)
+    return data["sectors"]
+
+
+def generate_sector_brief(client: OpenAI, sector: dict, model: str) -> dict:
+    """Generate a sector brief using LLM."""
+    prompt = PROMPT_TEMPLATE.format(
+        sector_code=sector["sector_code"],
+        sector_name=sector["sector_name"],
+        description=sector["description"],
+        business_types=", ".join(sector["sample_business_types"])
+    )
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "system",
+                "content": "You are an expert customer experience analyst. Return only valid JSON, no markdown."
+            },
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0.3,
+        max_tokens=4000,
+        response_format={"type": "json_object"}
+    )
+
+    text = response.choices[0].message.content.strip()
+
+    # Parse JSON
+    brief = json.loads(text)
+
+    # Ensure required fields
+    brief["sector_code"] = sector["sector_code"]
+    brief["sector_name"] = sector["sector_name"]
+    brief["generated_at"] = datetime.utcnow().isoformat() + "Z"
+    brief["version"] = "1.0"
+
+    return brief
+
+
+def validate_brief(brief: dict) -> list[str]:
+    """Validate a sector brief, return list of issues."""
+    issues = []
+
+    required_keys = [
+        "what_customers_judge",
+        "critical_pain_points",
+        "common_praise",
+        "industry_terminology",
+        "mode_specific_concerns",
+        "what_is_actionable",
+        "sector_specific_signals"
+    ]
+
+    for key in required_keys:
+        if key not in brief:
+            issues.append(f"Missing required key: {key}")
+
+    # Check array lengths
+    if "what_customers_judge" in brief:
+        items = brief["what_customers_judge"].get("items", [])
+        if len(items) < 3:
+            issues.append(f"what_customers_judge has only {len(items)} items (need 3+)")
+        if len(items) > 10:
+            issues.append(f"what_customers_judge has {len(items)} items (max 10)")
+
+    if "critical_pain_points" in brief:
+        items = brief["critical_pain_points"].get("items", [])
+        if len(items) < 3:
+            issues.append(f"critical_pain_points has only {len(items)} items (need 3+)")
+
+    if "common_praise" in brief:
+        items = brief["common_praise"].get("items", [])
+        if len(items) < 3:
+            issues.append(f"common_praise has only {len(items)} items (need 3+)")
+
+    # Check for forbidden content
+    text = json.dumps(brief).lower()
+    forbidden = ["priority", "weight", "primitive", "enabled", "disabled", "solution"]
+    for word in forbidden:
+        if word in text and word != "solution":  # solution can appear in context
+            issues.append(f"Contains potentially forbidden term: {word}")
+
+    return issues
+
+
+def save_brief(brief: dict, output_dir: Path) -> Path:
+    """Save brief to JSON file."""
+    output_dir.mkdir(parents=True, exist_ok=True)
+    filename = f"{brief['sector_code'].lower()}_brief.json"
+    output_path = output_dir / filename
+
+    with open(output_path, "w") as f:
+        json.dump(brief, f, indent=2)
+
+    return output_path
+
+
+def validate_existing_briefs(output_dir: Path) -> None:
+    """Validate all existing brief files."""
+    if not output_dir.exists():
+        print(f"Output directory does not exist: {output_dir}")
+        return
+
+    files = list(output_dir.glob("*_brief.json"))
+    if not files:
+        print("No brief files found")
+        return
+
+    print(f"Validating {len(files)} brief files...\n")
+
+    all_valid = True
+    for filepath in sorted(files):
+        with open(filepath) as f:
+            brief = json.load(f)
+
+        issues = validate_brief(brief)
+        status = "✓" if not issues else "✗"
+        print(f"{status} {filepath.name}")
+
+        if issues:
+            all_valid = False
+            for issue in issues:
+                print(f"    - {issue}")
+
+    print()
+    if all_valid:
+        print("All briefs valid!")
+    else:
+        print("Some briefs have issues.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate sector briefs for Wave 0")
+    parser.add_argument("--sector", help="Generate only this sector code")
+    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
+    parser.add_argument("--validate", action="store_true", help="Validate existing briefs")
+    parser.add_argument("--output-dir", default="data/sector_briefs", help="Output directory")
+    parser.add_argument("--model", default="gpt-4o", help="OpenAI model to use")
+    args = parser.parse_args()
+
+    # Paths
+    script_dir = Path(__file__).parent
+    package_dir = script_dir.parent
+    data_path = package_dir / "data" / "sectors.json"
+    output_dir = package_dir / args.output_dir
+
+    # Validate mode
+    if args.validate:
+        validate_existing_briefs(output_dir)
+        return
+
+    # Load sectors
+    sectors = load_sectors(data_path)
+    print(f"Loaded {len(sectors)} sectors")
+
+    # Filter to single sector if specified
+    if args.sector:
+        sectors = [s for s in sectors if s["sector_code"] == args.sector]
+        if not sectors:
+            print(f"ERROR: Sector '{args.sector}' not found")
+            sys.exit(1)
+
+    if args.dry_run:
+        print("\n[DRY RUN] Would generate briefs for:")
+        for sector in sectors:
+            print(f"  - {sector['sector_code']}: {sector['sector_name']}")
+        print(f"\nOutput directory: {output_dir}")
+        return
+
+    # Check API key
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        print("ERROR: OPENAI_API_KEY environment variable required")
+        sys.exit(1)
+
+    # Initialize client
+    client = OpenAI(api_key=api_key)
+    print(f"Using model: {args.model}")
+
+    # Generate briefs
+    results = {"success": [], "failed": []}
+
+    for i, sector in enumerate(sectors, 1):
+        print(f"\n[{i}/{len(sectors)}] Generating brief for: {sector['sector_name']}")
+
+        try:
+            brief = generate_sector_brief(client, sector, args.model)
+
+            # Validate
+            issues = validate_brief(brief)
+            if issues:
+                print(f"  Warnings:")
+                for issue in issues:
+                    print(f"    - {issue}")
+
+            # Save
+            output_path = save_brief(brief, output_dir)
+            print(f"  ✓ Saved to: {output_path}")
+            results["success"].append(sector["sector_code"])
+
+        except Exception as e:
+            print(f"  ✗ FAILED: {e}")
+            results["failed"].append(sector["sector_code"])
+
+    # Summary
+    print(f"\n{'='*60}")
+    print(f"SUMMARY")
+    print(f"{'='*60}")
+    print(f"Success: {len(results['success'])}")
+    print(f"Failed:  {len(results['failed'])}")
+
+    if results["failed"]:
+        print(f"\nFailed sectors: {', '.join(results['failed'])}")
+        sys.exit(1)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/packages/reviewiq-pipeline/scripts/llm_classifier.py
+++ b/packages/reviewiq-pipeline/scripts/llm_classifier.py
@@ -0,0 +1,523 @@
+"""
+LLM Classifier - Real classification using the OpenAI Chat Completions API.
+
+Uses JSON Schema to enforce strict output format.
+Validates primitives against enabled set.
+Stores raw response for audit.
+Supports multilingual reviews with language detection.
+"""
+
+import hashlib
+import json
+import os
+import re
+import time
+from typing import Any
+
+from openai import OpenAI
+
+# Language detection - try langdetect, fall back to heuristics
+try:
+    from langdetect import detect as langdetect_detect, LangDetectException
+    LANGDETECT_AVAILABLE = True
+except ImportError:
+    LANGDETECT_AVAILABLE = False
+    LangDetectException = Exception  # Placeholder
+
+
+def detect_language(text: str) -> tuple[str, float]:
+    """
+    Detect the language of a text.
+
+    Returns (language_code, confidence).
+    Supported languages: en, es, de, fr, it, pt, ru, zh, ja, ko, ar, etc.
+
+    Falls back to heuristic detection if langdetect unavailable.
+    """
+    if not text or len(text.strip()) < 3:
+        return "unknown", 0.0
+
+    text = text.strip()
+
+    # Try langdetect first (most accurate)
+    if LANGDETECT_AVAILABLE:
+        try:
+            lang = langdetect_detect(text)
+            # langdetect doesn't provide confidence directly, estimate based on text length
+            confidence = min(0.95, 0.5 + len(text) / 200)
+            return lang, confidence
+        except LangDetectException:
+            pass
+
+    # Fallback: Simple heuristic detection based on character ranges
+    # This is less accurate but works without dependencies
+
+    # Count characters in different scripts
+    latin = sum(1 for c in text if '\u0041' <= c <= '\u024F')  # Latin extended
+    cyrillic = sum(1 for c in text if '\u0400' <= c <= '\u04FF')  # Cyrillic
+    cjk = sum(1 for c in text if '\u4E00' <= c <= '\u9FFF')  # CJK Unified
+    japanese = sum(1 for c in text if '\u3040' <= c <= '\u30FF')  # Hiragana + Katakana
+    korean = sum(1 for c in text if '\uAC00' <= c <= '\uD7AF')  # Hangul
+    arabic = sum(1 for c in text if '\u0600' <= c <= '\u06FF')  # Arabic
+
+    total = len(text)
+    if total == 0:
+        return "unknown", 0.0
+
+    # Determine primary script
+    if cjk / total > 0.3:
+        return "zh", 0.6  # Chinese
+    if japanese / total > 0.2:
+        return "ja", 0.6  # Japanese
+    if korean / total > 0.3:
+        return "ko", 0.6  # Korean
+    if cyrillic / total > 0.3:
+        return "ru", 0.5  # Russian (could be other Cyrillic)
+    if arabic / total > 0.3:
+        return "ar", 0.5  # Arabic
+
+    if latin / total > 0.5:
+        # Latin script - try to distinguish languages by common words
+        text_lower = text.lower()
+
+        # Spanish indicators (expanded for better detection)
+        es_words = ['el', 'la', 'los', 'las', 'de', 'que', 'es', 'en', 'un', 'una',
+                    'muy', 'pero', 'con', 'está', 'están', 'para', 'por', 'como',
+                    'excelente', 'recomendado', 'servicio', 'bueno', 'malo', 'bien',
+                    'todo', 'nada', 'más', 'sin', 'nunca', 'siempre', 'también']
+        es_score = sum(1 for w in es_words if re.search(rf'\b{w}\b', text_lower))
+
+        # Spanish-specific patterns (accents, ñ, inverted punctuation)
+        if 'ñ' in text_lower or '¿' in text or '¡' in text:
+            es_score += 3
+        if any(c in text_lower for c in 'áéíóúü'):
+            es_score += 1
+
+        # English indicators
+        en_words = ['the', 'and', 'is', 'are', 'was', 'were', 'this', 'that',
+                    'with', 'for', 'but', 'not', 'very', 'great', 'good',
+                    'service', 'place', 'food', 'staff', 'friendly', 'amazing',
+                    'would', 'recommend', 'will', 'definitely', 'really']
+        en_score = sum(1 for w in en_words if re.search(rf'\b{w}\b', text_lower))
+
+        # German indicators
+        de_words = ['der', 'die', 'das', 'und', 'ist', 'sind', 'war', 'sehr',
+                    'mit', 'für', 'aber', 'nicht', 'ein', 'eine', 'wir', 'ich',
+                    'auch', 'gut', 'schlecht', 'toll', 'super']
+        de_score = sum(1 for w in de_words if re.search(rf'\b{w}\b', text_lower))
+        # German umlauts
+        if any(c in text_lower for c in 'äöüß'):
+            de_score += 2
+
+        # French indicators
+        fr_words = ['le', 'la', 'les', 'est', 'sont', 'très', 'mais', 'avec',
+                    'pour', 'pas', 'un', 'une', 'et', 'nous', 'vous', 'bien',
+                    'bon', 'mauvais', 'excellent', 'super', "c'est", "j'ai"]
+        fr_score = sum(1 for w in fr_words if re.search(rf'\b{w}\b', text_lower))
+        # French accents and patterns
+        if any(c in text_lower for c in 'àâçèêëîïôùûÿœæ'):
+            fr_score += 2
+
+        scores = {'es': es_score, 'en': en_score, 'de': de_score, 'fr': fr_score}
+        best_lang = max(scores, key=scores.get)
+        best_score = scores[best_lang]
+
+        if best_score >= 1:  # Lowered threshold
+            confidence = min(0.75, 0.3 + best_score * 0.08)
+            return best_lang, confidence
+
+        # Default to English for Latin script
+        return "en", 0.3
+
+    return "unknown", 0.1
+
+# Lazy client initialization
+_client = None
+
+
+def get_client() -> OpenAI:
+    """Get OpenAI client, initializing lazily on first use."""
+    global _client
+    if _client is None:
+        api_key = os.environ.get("OPENAI_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "OPENAI_API_KEY environment variable not set. "
+                "Set it or use --dry-run / mock classifier."
+            )
+        _client = OpenAI(api_key=api_key)
+    return _client
+
+# Default model
+DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
+
+# Meta primitives - always available
+META_PRIMITIVES = frozenset([
+    "HONESTY", "ETHICS", "PROMISES",
+    "ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
+    "RETURN_INTENT", "RECOMMEND", "RECOGNITION",
+    "UNMAPPED",
+])
+
+# JSON Schema for structured output.
+# Passed as the "json_schema" value of response_format in classify_review.
+# The model must return an object with exactly two arrays:
+#   - "spans":    classified excerpts (primitive, valence, intensity 1-5,
+#                 evidence quote, optional char offsets, confidence 0-1)
+#   - "unmapped": content that fits no primitive (label, evidence, confidence)
+# Every property is listed in "required" and additionalProperties is False,
+# so optional values (start_char, end_char) are expressed as nullable types.
+# NOTE(review): "details" is constrained to null in model output — confirm
+# that downstream consumers do not expect the model to populate it.
+SPAN_SCHEMA = {
+    "name": "review_classification",
+    "strict": True,
+    "schema": {
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {
+            "spans": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "additionalProperties": False,
+                    "properties": {
+                        "primitive": {"type": "string"},
+                        "valence": {"type": "string", "enum": ["positive", "negative", "mixed", "neutral"]},
+                        "intensity": {"type": "integer", "minimum": 1, "maximum": 5},
+                        "evidence": {"type": "string"},
+                        "start_char": {"type": ["integer", "null"]},
+                        "end_char": {"type": ["integer", "null"]},
+                        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
+                        "details": {"type": "null"}
+                    },
+                    "required": ["primitive", "valence", "intensity", "evidence", "confidence", "start_char", "end_char", "details"]
+                }
+            },
+            "unmapped": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "additionalProperties": False,
+                    "properties": {
+                        "label": {"type": "string"},
+                        "evidence": {"type": "string"},
+                        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0}
+                    },
+                    "required": ["label", "evidence", "confidence"]
+                }
+            }
+        },
+        "required": ["spans", "unmapped"]
+    }
+}
+
+# System prompt.
+# Sent verbatim as the system message on every classification request.
+# The meta primitives listed in rule 2 mirror META_PRIMITIVES; keep the two
+# in sync when either changes.
+SYSTEM_PROMPT = """You are a review classification system that extracts semantic spans and maps them to primitives.
+
+## RULES (MUST FOLLOW)
+
+1. Use ONLY primitives from the enabled_primitives list provided. Do NOT invent new primitives.
+
+2. Meta primitives are always available: HONESTY, ETHICS, PROMISES, ACKNOWLEDGMENT, RESPONSE_QUALITY, RECOVERY, RETURN_INTENT, RECOMMEND, RECOGNITION, UNMAPPED
+
+3. If content doesn't fit any enabled primitive, use UNMAPPED or put it in the unmapped array with a descriptive label.
+
+4. Output MUST match the JSON schema exactly. No extra keys.
+
+5. Evidence must be a SHORT EXACT QUOTE from the review text (in original language).
+
+6. Extract 1-5 spans per review. Prefer fewer, larger spans over many small ones.
+
+7. If unsure about classification, lower the confidence score.
+
+## VALENCE
+- positive: praise, satisfaction, recommendation
+- negative: complaint, dissatisfaction, warning
+- mixed: both positive and negative in same span
+- neutral: factual observation, no sentiment
+
+## INTENSITY (1-5)
+- 1: mild ("okay", "fine")
+- 2: moderate ("good", "bad")
+- 3: strong ("great", "terrible")
+- 4: very strong ("amazing", "awful")
+- 5: extreme ("best ever", "worst nightmare")
+
+## CONFIDENCE
+- 0.9+: Very certain the primitive fits
+- 0.7-0.9: Confident
+- 0.5-0.7: Moderate confidence
+- <0.5: Low confidence (consider UNMAPPED)
+
+Output valid JSON only. No markdown, no explanations."""
+
+
+def compute_review_hash(text: str, config_version: str) -> str:
+    """Compute hash for caching."""
+    key = f"{config_version}:{text}"
+    return hashlib.sha256(key.encode()).hexdigest()[:16]
+
+
+def build_user_payload(
+    review_text: str,
+    rating: int | None,
+    config: dict[str, Any],
+    language: str = "auto",
+) -> dict[str, Any]:
+    """Build the user message payload for the LLM."""
+    # Extract only what the model needs
+    enabled = set(config.get("enabled_primitives", []))
+    enabled.update(META_PRIMITIVES)
+
+    # Build primitive definitions (minimal)
+    primitives_dict = config.get("primitives", {})
+    primitive_defs = {}
+    for prim in enabled:
+        if prim in primitives_dict:
+            info = primitives_dict[prim]
+            primitive_defs[prim] = info.get("def", info.get("name", prim))
+        elif prim in META_PRIMITIVES:
+            primitive_defs[prim] = f"Meta primitive: {prim.replace('_', ' ').lower()}"
+
+    # Extract brief signals (keep it short)
+    brief = config.get("brief", {})
+    brief_summary = {}
+    if brief.get("what_customers_judge"):
+        items = brief["what_customers_judge"]
+        if isinstance(items, dict):
+            items = items.get("items", [])
+        brief_summary["key_judgment_areas"] = [
+            item.get("aspect", item.get("area", str(item))) if isinstance(item, dict) else str(item)
+            for item in items[:5]
+        ]
+    if brief.get("critical_pain_points"):
+        pains = brief["critical_pain_points"]
+        if isinstance(pains, dict):
+            pains = pains.get("items", [])
+        brief_summary["critical_pains"] = [
+            item.get("pain", str(item)) if isinstance(item, dict) else str(item)
+            for item in pains[:3]
+        ]
+
+    return {
+        "business": {
+            "name": config.get("business_id"),
+            "sector": config.get("sector_code"),
+            "config_version": config.get("config_version"),
+        },
+        "enabled_primitives": sorted(enabled),
+        "primitive_definitions": primitive_defs,
+        "weights": config.get("weights", {}),
+        "sector_brief": brief_summary,
+        "review": {
+            "text": review_text,
+            "rating": rating,
+            "language": language,
+        },
+    }
+
+
+def validate_response(
+    response: dict[str, Any],
+    enabled_primitives: set[str],
+) -> tuple[dict[str, Any], list[str]]:
+    """
+    Validate LLM response and fix invalid primitives.
+
+    Returns (validated_response, warnings).
+    """
+    warnings = []
+    all_valid = enabled_primitives | META_PRIMITIVES
+
+    validated_spans = []
+    for span in response.get("spans", []):
+        prim = span.get("primitive")
+        if prim not in all_valid:
+            warnings.append(f"Invalid primitive '{prim}' → UNMAPPED (original: {prim})")
+            span["primitive"] = "UNMAPPED"
+        validated_spans.append(span)
+
+    return {
+        "spans": validated_spans,
+        "unmapped": response.get("unmapped", []),
+    }, warnings
+
+
+def classify_review(
+    review_text: str,
+    rating: int | None,
+    config: dict[str, Any],
+    language: str = "auto",
+    model: str | None = None,
+    max_retries: int = 3,
+) -> dict[str, Any]:
+    """
+    Classify a single review using OpenAI.
+
+    Args:
+        review_text: The review text to classify
+        rating: Star rating (1-5) if available
+        config: Resolved config from ConfigResolver
+        language: Language hint (default: auto-detect)
+        model: Model to use (default: gpt-4o-mini)
+        max_retries: Max retries on transient errors
+
+    Returns:
+        {
+            "spans": [...],
+            "unmapped": [...],
+            "model": str,
+            "raw_response": str,
+            "review_hash": str,
+            "warnings": [...],
+            "detected_language": str,
+            "language_confidence": float,
+        }
+    """
+    model = model or DEFAULT_MODEL
+
+    # Detect language if auto
+    detected_lang = "unknown"
+    lang_confidence = 0.0
+    if language == "auto":
+        detected_lang, lang_confidence = detect_language(review_text)
+        language = detected_lang
+    else:
+        detected_lang = language
+        lang_confidence = 1.0  # User-specified
+
+    # Build payload with detected language
+    payload = build_user_payload(review_text, rating, config, detected_lang)
+    user_content = json.dumps(payload, ensure_ascii=False, indent=None)
+
+    # Compute hash for caching
+    review_hash = compute_review_hash(review_text, config.get("config_version", "1.0"))
+
+    # Call OpenAI with retries
+    last_error = None
+    client = get_client()
+    for attempt in range(max_retries):
+        try:
+            response = client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": user_content},
+                ],
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": SPAN_SCHEMA,
+                },
+                temperature=0.1,  # Low temperature for consistency
+                max_tokens=2000,
+            )
+
+            # Parse response
+            raw_text = response.choices[0].message.content
+            parsed = json.loads(raw_text)
+
+            # Validate primitives
+            enabled = set(config.get("enabled_primitives", []))
+            validated, warnings = validate_response(parsed, enabled)
+
+            return {
+                "spans": validated["spans"],
+                "unmapped": validated["unmapped"],
+                "model": model,
+                "raw_response": raw_text,
+                "review_hash": review_hash,
+                "warnings": warnings,
+                "tokens": {
+                    "prompt": response.usage.prompt_tokens if response.usage else 0,
+                    "completion": response.usage.completion_tokens if response.usage else 0,
+                },
+                "detected_language": detected_lang,
+                "language_confidence": lang_confidence,
+            }
+
+        except json.JSONDecodeError as e:
+            last_error = f"JSON parse error: {e}"
+            # Don't retry parse errors - log and return fallback
+            break
+
+        except Exception as e:
+            last_error = str(e)
+            if "rate_limit" in str(e).lower() or "429" in str(e):
+                # Exponential backoff for rate limits
+                wait = 2 ** attempt
+                time.sleep(wait)
+                continue
+            elif "500" in str(e) or "502" in str(e) or "503" in str(e):
+                # Retry on server errors
+                time.sleep(1)
+                continue
+            else:
+                # Don't retry other errors
+                break
+
+    # Fallback response on error
+    return {
+        "spans": [{
+            "primitive": "UNMAPPED",
+            "valence": "neutral",
+            "intensity": 1,
+            "evidence": review_text[:100] if review_text else "",
+            "start_char": 0,
+            "end_char": min(100, len(review_text)) if review_text else 0,
+            "confidence": 0.1,
+            "details": {"error": last_error},
+        }],
+        "unmapped": [],
+        "model": model,
+        "raw_response": json.dumps({"error": last_error}),
+        "review_hash": review_hash,
+        "warnings": [f"Classification failed: {last_error}"],
+        "tokens": {"prompt": 0, "completion": 0},
+        "detected_language": detected_lang,
+        "language_confidence": lang_confidence,
+    }
+
+
+async def classify_review_async(
+    review_text: str,
+    rating: int | None,
+    config: dict[str, Any],
+    language: str = "auto",
+    model: str | None = None,
+) -> dict[str, Any]:
+    """Async wrapper for classify_review."""
+    import asyncio
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(
+        None,
+        lambda: classify_review(review_text, rating, config, language, model),
+    )
+
+
+# Batch classification (for later optimization)
+async def classify_batch(
+    reviews: list[dict[str, Any]],
+    config: dict[str, Any],
+    model: str | None = None,
+    max_concurrent: int = 5,
+) -> list[dict[str, Any]]:
+    """
+    Classify multiple reviews concurrently.
+
+    Args:
+        reviews: List of {"text": str, "rating": int, "language": str}
+        config: Resolved config
+        model: Model to use
+        max_concurrent: Max concurrent requests
+
+    Returns:
+        List of classification results
+    """
+    import asyncio
+
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def classify_one(review: dict) -> dict:
+        async with semaphore:
+            return await classify_review_async(
+                review.get("text", ""),
+                review.get("rating"),
+                config,
+                review.get("language", "auto"),
+                model,
+            )
+
+    tasks = [classify_one(r) for r in reviews]
+    return await asyncio.gather(*tasks)
--- a/packages/reviewiq-pipeline/scripts/run_classification_v2.py
+++ b/packages/reviewiq-pipeline/scripts/run_classification_v2.py
--- a/packages/reviewiq-pipeline/scripts/validate_l1_configs.py
+++ b/packages/reviewiq-pipeline/scripts/validate_l1_configs.py
@@ -0,0 +1,457 @@
+#!/usr/bin/env python3
+"""
+Wave 1 L1 Config Validation Script
+
+Validates L1 primitive configs against real review data by analyzing:
+1. Coverage: % of spans mapped to enabled primitives
+2. Top primitives by frequency
+3. Disabled primitives appearing (potential misconfig)
+4. Weight effectiveness
+
+Usage:
+    python validate_l1_configs.py --sector ENTERTAINMENT --job-url "gokarts"
+    python validate_l1_configs.py --sector AUTOMOTIVE --job-url "clickrent"
+    python validate_l1_configs.py --all
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import sys
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import asyncpg
+
+# Paths
+# DATA_DIR is the "data" directory that is a sibling of the directory holding
+# this script; the config and brief directories live underneath it.
+DATA_DIR = Path(__file__).parent.parent / "data"
+CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
+BRIEFS_DIR = DATA_DIR / "sector_briefs"
+
+# Primitive to URT domain mapping
+# Primitives map to URT domains: O=Offering, P=People, J=Journey, E=Environment, A=Access, V=Value, R=Relationship
+# NOTE: no primitive in this table is assigned to A or R. AVAILABILITY is filed
+# under J and ACCESSIBILITY / DIGITAL_UX under E, so a domain looked up through
+# this table is never "A" or "R".
+PRIMITIVE_TO_DOMAIN = {
+    # Quality -> Offering (O)
+    "TASTE": "O", "CRAFT": "O", "FRESHNESS": "O", "TEMPERATURE": "O",
+    "EFFECTIVENESS": "O", "ACCURACY": "O", "CONDITION": "O", "CONSISTENCY": "O",
+    # Service -> People (P)
+    "MANNER": "P", "COMPETENCE": "P", "ATTENTIVENESS": "P", "COMMUNICATION": "P",
+    # Process -> Journey (J)
+    "SPEED": "J", "FRICTION": "J", "RELIABILITY": "J", "AVAILABILITY": "J",
+    # Environment -> Environment (E)
+    "CLEANLINESS": "E", "COMFORT": "E", "SAFETY": "E", "AMBIANCE": "E",
+    "ACCESSIBILITY": "E", "DIGITAL_UX": "E",
+    # Value -> Value (V)
+    "PRICE_LEVEL": "V", "PRICE_FAIRNESS": "V", "PRICE_TRANSPARENCY": "V", "VALUE_FOR_MONEY": "V",
+}
+
+# URT code to primitive mapping (simplified - maps URT codes to closest primitive)
+# Keys are URT codes whose first character is the domain letter (O, P, J, E,
+# A, V, R). Codes that are not listed here are treated as unmapped by the
+# analysis. Access (A*) and Relationship (R*) codes are mapped onto primitives
+# that PRIMITIVE_TO_DOMAIN files under other domains.
+URT_TO_PRIMITIVE = {
+    # Offering codes
+    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
+    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
+    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
+    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
+    # People codes
+    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
+    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
+    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
+    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
+    # Journey codes
+    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
+    "J1.04": "SPEED", "J1.05": "RELIABILITY",
+    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
+    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
+    # Environment codes
+    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
+    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
+    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
+    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
+    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
+    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
+    # Access codes
+    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
+    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
+    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
+    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
+    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
+    # Value codes
+    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
+    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
+    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
+    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
+    # Relationship codes
+    "R1.01": "RELIABILITY", "R1.02": "RELIABILITY", "R1.03": "RELIABILITY",
+    "R2.01": "RELIABILITY", "R2.02": "CONSISTENCY", "R2.03": "RELIABILITY",
+    "R3.01": "MANNER", "R3.02": "MANNER", "R3.03": "COMMUNICATION",
+    "R4.01": "CONSISTENCY", "R4.02": "RELIABILITY", "R4.03": "CONSISTENCY",
+}
+
+
+@dataclass
+class ValidationResult:
+    """
+    Validation results for a sector.
+
+    Built by analyze_spans() and rendered by print_validation_report().
+    """
+    sector_code: str
+    job_count: int  # analyze_spans() stores a placeholder of 1
+    review_count: int  # analyze_spans() stores 0 (not tracked at span level)
+    span_count: int  # number of spans analyzed
+
+    # Coverage metrics
+    enabled_coverage: float  # fraction (0.0-1.0) of spans using enabled primitives
+    disabled_hits: dict[str, int]  # disabled primitives that appeared
+    unmapped_count: int  # spans that couldn't be mapped
+
+    # Distribution
+    primitive_counts: dict[str, int]  # all primitives by count
+    domain_distribution: dict[str, int]  # O, P, J, E, A, V, R
+    valence_distribution: dict[str, int]  # V+, V-, V0, V±
+
+    # Top codes
+    top_urt_codes: list[tuple[str, int]]  # (URT code, count), most frequent first
+
+    # Recommendations
+    recommendations: list[str]  # human-readable suggestions, one per entry
+
+
+def load_l1_config(sector_code: str) -> dict[str, Any] | None:
+    """
+    Load L1 config for a sector.
+
+    Args:
+        sector_code: Sector code such as "ENTERTAINMENT"; it is lower-cased to
+            build the file name "<sector>_config.json" under CONFIGS_DIR
+
+    Returns:
+        The parsed JSON document, or None when the config file does not exist
+
+    Raises:
+        json.JSONDecodeError: If the file exists but does not contain valid JSON
+    """
+    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
+    if not config_file.exists():
+        return None
+    # Read as UTF-8 explicitly instead of relying on the platform's locale
+    # encoding, which would misread non-ASCII text on some systems.
+    with open(config_file, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def load_sector_brief(sector_code: str) -> dict[str, Any] | None:
+    """
+    Load sector brief for a sector.
+
+    Args:
+        sector_code: Sector code such as "ENTERTAINMENT"; it is lower-cased to
+            build the file name "<sector>_brief.json" under BRIEFS_DIR
+
+    Returns:
+        The parsed JSON document, or None when the brief file does not exist
+
+    Raises:
+        json.JSONDecodeError: If the file exists but does not contain valid JSON
+    """
+    brief_file = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
+    if not brief_file.exists():
+        return None
+    # Read as UTF-8 explicitly instead of relying on the platform's locale
+    # encoding, which would misread non-ASCII text on some systems.
+    with open(brief_file, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def map_urt_to_primitive(urt_code: str) -> str | None:
+    """Return the primitive listed for `urt_code` in URT_TO_PRIMITIVE, else None."""
+    if urt_code in URT_TO_PRIMITIVE:
+        return URT_TO_PRIMITIVE[urt_code]
+    return None
+
+
+async def fetch_spans_for_jobs(pool: asyncpg.Pool, job_url_pattern: str) -> list[dict]:
+    """
+    Return spans belonging to jobs whose URL contains `job_url_pattern`.
+
+    The match is a case-insensitive substring match; rows come back newest
+    first and are converted to plain dicts.
+    """
+    # Lower-case both sides and wrap the pattern in wildcards for LIKE.
+    like_pattern = "%" + job_url_pattern.lower() + "%"
+    sql = """
+        SELECT
+            rs.urt_primary,
+            rs.valence,
+            rs.intensity,
+            rs.span_text,
+            j.url
+        FROM pipeline.review_spans rs
+        JOIN pipeline.reviews_raw rr ON rs.review_id = rr.review_id
+        JOIN public.jobs j ON rr.job_id = j.job_id
+        WHERE LOWER(j.url) LIKE $1
+        ORDER BY rs.created_at DESC
+    """
+    records = await pool.fetch(sql, like_pattern)
+    return list(map(dict, records))
+
+
+async def fetch_all_spans(pool: asyncpg.Pool) -> list[dict]:
+    """Return every row of pipeline.review_spans as a dict, newest first."""
+    sql = """
+        SELECT
+            urt_primary,
+            valence,
+            intensity,
+            span_text
+        FROM pipeline.review_spans
+        ORDER BY created_at DESC
+    """
+    records = await pool.fetch(sql)
+    return list(map(dict, records))
+
+
+def analyze_spans(
+    spans: list[dict],
+    config: dict[str, Any],
+) -> ValidationResult:
+    """
+    Analyze spans against L1 config.
+
+    Every span's URT code is mapped to a primitive and tallied; the tallies are
+    then compared with the config's enabled / disabled / weighted primitives to
+    produce coverage figures and textual recommendations.
+
+    Args:
+        spans: Span dicts; the keys read are "urt_primary" and "valence".
+            A missing or empty URT code counts as unmapped, a missing or
+            empty valence counts as "V0".
+        config: L1 config. "sector_code" is required; "enabled", "disabled"
+            and "weights" are optional.
+
+    Returns:
+        A ValidationResult. Its job_count and review_count are placeholders
+        (1 and 0); they are not computed here.
+
+    Raises:
+        KeyError: If `config` has no "sector_code"
+    """
+    sector_code = config["sector_code"]
+    enabled = set(config.get("enabled", []))
+    disabled = set(config.get("disabled", []))
+    weights = config.get("weights", {})
+
+    # Counters
+    primitive_counts: Counter = Counter()
+    domain_counts: Counter = Counter()
+    valence_counts: Counter = Counter()
+    urt_counts: Counter = Counter()
+    disabled_hits: Counter = Counter()
+    unmapped = 0
+    enabled_hits = 0
+
+    for span in spans:
+        # A span built from a database row has the key even when the column is
+        # NULL, so a .get() default alone would let None through.
+        valence = span.get("valence") or "V0"
+        valence_counts[valence] += 1
+
+        urt_code = span.get("urt_primary")
+        if not urt_code:
+            # Nothing to map and no domain letter to read: count the span as
+            # unmapped instead of failing on urt_code[0].
+            unmapped += 1
+            continue
+
+        # Count URT codes
+        urt_counts[urt_code] += 1
+
+        # Map to primitive
+        primitive = map_urt_to_primitive(urt_code)
+        if not primitive:
+            unmapped += 1
+            # Still count domain from URT code
+            domain_counts[urt_code[0]] += 1
+            continue
+
+        primitive_counts[primitive] += 1
+
+        # The domain is taken from the primitive, falling back to the code's
+        # first letter when the primitive has no domain entry.
+        domain = PRIMITIVE_TO_DOMAIN.get(primitive, urt_code[0])
+        domain_counts[domain] += 1
+
+        # Check if enabled or disabled
+        if primitive in enabled:
+            enabled_hits += 1
+        elif primitive in disabled:
+            disabled_hits[primitive] += 1
+
+    # Calculate coverage
+    total = len(spans)
+    enabled_coverage = enabled_hits / total if total > 0 else 0
+
+    # Generate recommendations
+    recommendations = []
+
+    # Check disabled primitives that appeared frequently
+    for prim, count in disabled_hits.most_common(5):
+        if count >= 10:
+            pct = count / total * 100
+            recommendations.append(
+                f"ENABLE {prim}: Disabled but appeared {count} times ({pct:.1f}%)"
+            )
+
+    # Check for missing high-weight primitives. Iterating the mapping itself
+    # (rather than a set of its keys) keeps the output order stable.
+    for prim in weights:
+        if primitive_counts[prim] == 0 and prim in enabled:
+            recommendations.append(
+                f"CHECK {prim}: Weighted ({weights[prim]}x) but no appearances"
+            )
+
+    # Check for frequently appearing unweighted primitives
+    for prim, count in primitive_counts.most_common(10):
+        if prim in enabled and prim not in weights and count >= total * 0.1:
+            pct = count / total * 100
+            recommendations.append(
+                f"WEIGHT {prim}: High frequency ({count}, {pct:.1f}%) but not weighted"
+            )
+
+    return ValidationResult(
+        sector_code=sector_code,
+        job_count=1,  # placeholder; not computed here
+        review_count=0,  # Not tracked at span level
+        span_count=total,
+        enabled_coverage=enabled_coverage,
+        disabled_hits=dict(disabled_hits),
+        unmapped_count=unmapped,
+        primitive_counts=dict(primitive_counts),
+        domain_distribution=dict(domain_counts),
+        valence_distribution=dict(valence_counts),
+        top_urt_codes=urt_counts.most_common(15),
+        recommendations=recommendations,
+    )
+
+
+def print_validation_report(result: ValidationResult, config: dict, brief: dict | None):
+    """
+    Write a human-readable validation report to stdout.
+
+    Sections, in order: overview, config summary, domain distribution, valence
+    distribution, top primitives and top URT codes. The disabled-but-appearing,
+    recommendations and brief sections are printed only when they have content.
+    """
+    total = result.span_count
+    rule = "=" * 70
+
+    def percent(count):
+        # Share of all spans as a percentage; 0 when there are no spans.
+        return count / total * 100 if total > 0 else 0
+
+    def by_count_desc(mapping):
+        # (name, count) pairs, highest count first.
+        return sorted(mapping.items(), key=lambda pair: -pair[1])
+
+    print("\n" + rule)
+    print(f"VALIDATION REPORT: {result.sector_code}")
+    print(rule)
+
+    # Overview
+    print(f"\n📊 OVERVIEW")
+    print(f"   Spans analyzed: {total:,}")
+    print(f"   Enabled coverage: {result.enabled_coverage:.1%}")
+    if total > 0:
+        print(f"   Unmapped spans: {result.unmapped_count} ({percent(result.unmapped_count):.1f}%)")
+    else:
+        print("   No spans")
+
+    # Config summary
+    enabled_set = set(config.get("enabled", []))
+    weights = config.get("weights", {})
+    print(f"\n⚙️  CONFIG SUMMARY")
+    print(f"   Enabled: {len(config.get('enabled', []))} primitives")
+    print(f"   Disabled: {len(config.get('disabled', []))} primitives")
+    print(f"   Weighted: {len(config.get('weights', {}))} primitives")
+
+    # Domain distribution (fixed display order O, P, J, E, V, R, A)
+    print(f"\n📁 DOMAIN DISTRIBUTION")
+    domain_rows = (
+        ("O", "Offering"), ("P", "People"), ("J", "Journey"), ("E", "Environment"),
+        ("V", "Value"), ("R", "Relationship"), ("A", "Access"),
+    )
+    for letter, label in domain_rows:
+        count = result.domain_distribution.get(letter, 0)
+        pct = percent(count)
+        # One bar character per two percentage points.
+        bar = "█" * int(pct / 2)
+        print(f"   {letter} {label:12} {count:4} ({pct:5.1f}%) {bar}")
+
+    # Valence distribution
+    print(f"\n😊 VALENCE DISTRIBUTION")
+    for val in ("V+", "V-", "V0", "V±"):
+        count = result.valence_distribution.get(val, 0)
+        print(f"   {val}: {count:4} ({percent(count):5.1f}%)")
+
+    # Top primitives
+    print(f"\n🔝 TOP PRIMITIVES")
+    for prim, count in by_count_desc(result.primitive_counts)[:12]:
+        if prim in enabled_set:
+            status = "✓"
+        else:
+            status = "✗"
+        if prim in weights:
+            weight = f"({weights[prim]}x)"
+        else:
+            weight = ""
+        print(f"   {status} {prim:20} {count:4} ({percent(count):5.1f}%) {weight}")
+
+    # Top URT codes
+    print(f"\n📋 TOP URT CODES")
+    for code, count in result.top_urt_codes[:10]:
+        mapped = URT_TO_PRIMITIVE.get(code, "UNMAPPED")
+        print(f"   {code}: {count:4} ({percent(count):5.1f}%) → {mapped}")
+
+    # Disabled but appearing
+    if result.disabled_hits:
+        print(f"\n⚠️  DISABLED BUT APPEARING")
+        for prim, count in by_count_desc(result.disabled_hits):
+            print(f"   {prim}: {count} ({percent(count):.1f}%)")
+
+    # Recommendations
+    if result.recommendations:
+        print(f"\n💡 RECOMMENDATIONS")
+        for rec in result.recommendations:
+            print(f"   • {rec}")
+
+    # Brief signals check (if available)
+    if brief:
+        print(f"\n📝 BRIEF SIGNALS CHECK")
+        # "what_customers_judge" may be a dict with an "items" list or a list.
+        judged = brief.get("what_customers_judge", {})
+        if isinstance(judged, dict):
+            items = judged.get("items", [])
+        elif isinstance(judged, list):
+            items = judged
+        else:
+            items = []
+
+        print(f"   Key judgment areas from brief:")
+        for item in items[:5]:
+            label = item.get('area', item) if isinstance(item, dict) else item
+            print(f"   • {label}")
+
+    print("\n" + rule)
+
+
+async def run_validation(
+    sector_code: str,
+    job_url_pattern: str | None = None,
+    db_url: str | None = None,
+):
+    """
+    Validate one sector's L1 config and print the report.
+
+    Args:
+        sector_code: Sector whose config (and optional brief) is loaded
+        job_url_pattern: When given, only spans of jobs whose URL matches are
+            analyzed; otherwise all spans are used
+        db_url: Database URL; falls back to DATABASE_URL, then a local default
+
+    Returns:
+        The ValidationResult, or None when the config is missing or the
+        pattern matched no spans
+    """
+    # Load config
+    config = load_l1_config(sector_code)
+    if not config:
+        print(f"❌ No L1 config found for {sector_code}")
+        return None
+
+    # Load brief
+    brief = load_sector_brief(sector_code)
+
+    # Connect to database
+    if not db_url:
+        db_url = os.environ.get(
+            "DATABASE_URL",
+            "postgresql://scraper:scraper123@localhost:5437/scraper"
+        )
+    pool = await asyncpg.create_pool(db_url)
+
+    try:
+        # Fetch spans
+        if not job_url_pattern:
+            spans = await fetch_all_spans(pool)
+        else:
+            spans = await fetch_spans_for_jobs(pool, job_url_pattern)
+            if not spans:
+                print(f"⚠️  No spans found for jobs matching '{job_url_pattern}'")
+                return None
+
+        # Analyze, then print the report
+        outcome = analyze_spans(spans, config)
+        print_validation_report(outcome, config, brief)
+        return outcome
+    finally:
+        # The pool is closed on every exit path, including early returns.
+        await pool.close()
+
+
+async def run_all_validations(db_url: str | None = None):
+    """
+    Validate every known (sector, job pattern) pair, then print a summary.
+
+    The pairs are hard-coded below; pairs whose validation returns nothing are
+    left out of the summary.
+    """
+    # Known jobs and their sectors
+    jobs_by_sector = {
+        "ENTERTAINMENT": ["gokarts", "soho"],
+        "AUTOMOTIVE": ["clickrent"],
+        "PERSONAL_SERVICES": ["fleitas"],
+        "FOOD_DINING": ["fika"],
+    }
+    rule = "=" * 70
+
+    # (sector, pattern, result) for every validation that produced a result
+    completed = []
+
+    for sector, job_patterns in jobs_by_sector.items():
+        print("\n" + rule)
+        print(f"Validating {sector}...")
+        print(rule)
+
+        for pattern in job_patterns:
+            outcome = await run_validation(sector, pattern, db_url)
+            if outcome:
+                completed.append((sector, pattern, outcome))
+
+    # Summary
+    print("\n" + rule)
+    print("VALIDATION SUMMARY")
+    print(rule)
+
+    for sector, pattern, outcome in completed:
+        print(f"\n{sector} ({pattern}):")
+        print(f"  Coverage: {outcome.enabled_coverage:.1%}")
+        print(f"  Spans: {outcome.span_count}")
+        if outcome.disabled_hits:
+            print(f"  ⚠️ Disabled hits: {sum(outcome.disabled_hits.values())}")
+        if outcome.recommendations:
+            print(f"  Recommendations: {len(outcome.recommendations)}")
+
+
+def main():
+    """Parse the command line and run the requested validation.
+
+    --all takes precedence over --sector; with neither, the help text is
+    printed and the process exits with status 1.
+    """
+    parser = argparse.ArgumentParser(description="Validate L1 primitive configs")
+    parser.add_argument("--sector", help="Sector code (e.g., ENTERTAINMENT)")
+    parser.add_argument("--job-url", help="Job URL pattern to filter (e.g., 'gokarts')")
+    parser.add_argument("--all", action="store_true", help="Run all validations")
+    parser.add_argument("--db-url", help="Database URL")
+
+    args = parser.parse_args()
+
+    if not args.all and not args.sector:
+        parser.print_help()
+        sys.exit(1)
+
+    if args.all:
+        job = run_all_validations(args.db_url)
+    else:
+        job = run_validation(args.sector, args.job_url, args.db_url)
+    asyncio.run(job)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
+++ b/packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
@@ -0,0 +1,421 @@
+#!/usr/bin/env python3
+"""
+Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)
+
+Validates L1 primitive configs against SECTOR-SPECIFIC review data.
+Only validates sectors where we have real business data.
+
+Key improvement over v1: spans are filtered by business → sector mapping,
+ensuring "TASTE in HEALTHCARE" noise doesn't pollute results.
+
+Usage:
+    python validate_l1_configs_v2.py --sector ENTERTAINMENT
+    python validate_l1_configs_v2.py --sector AUTOMOTIVE
+    python validate_l1_configs_v2.py --all
+    python validate_l1_configs_v2.py --report  # Summary only
+"""
+
+import argparse
+import asyncio
+import json
+import os
+from collections import Counter
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import asyncpg
+
+# Paths
+# DATA_DIR is the "data" directory that is a sibling of the directory holding
+# this script; the config and brief directories live underneath it.
+DATA_DIR = Path(__file__).parent.parent / "data"
+CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
+BRIEFS_DIR = DATA_DIR / "sector_briefs"
+
+# Business → Sector mapping (ground truth)
+# Keys are matched against review_spans.business_id when spans are fetched,
+# so they must equal the stored value exactly.
+BUSINESS_TO_SECTOR = {
+    "Go Karts Mar Menor": "ENTERTAINMENT",
+    "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE",
+    "Soho Club": "ENTERTAINMENT",
+    "Fika": "FOOD_DINING",
+}
+
+# Sectors with real data
+# Maintained by hand; currently the same sectors that appear as values of
+# BUSINESS_TO_SECTOR. Sectors not listed here are not validated.
+SECTORS_WITH_DATA = {"ENTERTAINMENT", "AUTOMOTIVE", "FOOD_DINING"}
+
+# URT code to primitive mapping
+# Keys are URT codes whose first character is the domain letter (O, P, J, E,
+# A, V, R). A value of None, like a missing key, makes the analysis count the
+# span as unmapped.
+URT_TO_PRIMITIVE = {
+    # Offering codes
+    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
+    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
+    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
+    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
+    # People codes
+    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
+    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
+    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
+    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
+    # Journey codes
+    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
+    "J1.04": "SPEED", "J1.05": "RELIABILITY",
+    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
+    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
+    # Environment codes
+    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
+    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
+    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
+    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
+    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
+    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
+    # Access codes
+    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
+    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
+    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
+    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
+    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
+    # Value codes
+    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
+    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
+    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
+    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
+    # Relationship codes (map to meta - these should stay unmapped)
+    "R1.01": None, "R1.02": None, "R1.03": None,
+    "R2.01": None, "R2.02": None, "R2.03": None,
+    "R3.01": None, "R3.02": None, "R3.03": None,
+    "R4.01": None, "R4.02": None, "R4.03": None,
+}
+
+# Minimum threshold for "enable" recommendations (% of sector spans)
+# Expressed in percentage points (3.0 means 3%), and the comparison is inclusive.
+ENABLE_THRESHOLD_PCT = 3.0  # Only recommend enable if >= 3% of sector spans
+
+
+@dataclass
+class SectorValidation:
+    """
+    Validation result for a single sector.
+
+    Built by analyze_sector_spans() and rendered by print_sector_report().
+    """
+    sector_code: str
+    businesses: list[str]  # businesses whose spans were analyzed
+    span_count: int  # number of spans analyzed
+
+    # Coverage
+    enabled_coverage: float  # fraction (0.0-1.0) of spans using enabled primitives
+    disabled_hits: dict[str, int] = field(default_factory=dict)  # disabled primitive -> span count
+    unmapped_count: int = 0  # spans whose URT code has no primitive
+
+    # Distribution
+    primitive_counts: dict[str, int] = field(default_factory=dict)
+    domain_distribution: dict[str, int] = field(default_factory=dict)  # keyed by first letter of the URT code
+    valence_distribution: dict[str, int] = field(default_factory=dict)
+    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)  # (URT code, count), most frequent first
+
+    # Recommendations (threshold-gated)
+    recommended_enables: list[tuple[str, float]] = field(default_factory=list)  # (primitive, pct)
+    # NOTE(review): analyze_sector_spans() never fills recommended_disables;
+    # it keeps its empty default.
+    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
+    weight_issues: list[str] = field(default_factory=list)  # human-readable weight findings
+
+    # Metadata
+    validated_at: str = ""  # ISO-8601 timestamp of the analysis
+    config_version: str = ""  # "config_version" from the config
+
+
+def load_l1_config(sector_code: str) -> dict[str, Any] | None:
+    """
+    Load L1 config for a sector.
+
+    Args:
+        sector_code: Sector code such as "ENTERTAINMENT"; it is lower-cased to
+            build the file name "<sector>_config.json" under CONFIGS_DIR
+
+    Returns:
+        The parsed JSON document, or None when the config file does not exist
+
+    Raises:
+        json.JSONDecodeError: If the file exists but does not contain valid JSON
+    """
+    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
+    if not config_file.exists():
+        return None
+    # Read as UTF-8 explicitly instead of relying on the platform's locale
+    # encoding, which would misread non-ASCII text on some systems.
+    with open(config_file, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_businesses_for_sector(sector_code: str) -> list[str]:
+    """Return the businesses that BUSINESS_TO_SECTOR assigns to `sector_code`."""
+    matches = []
+    for business, mapped_sector in BUSINESS_TO_SECTOR.items():
+        if mapped_sector == sector_code:
+            matches.append(business)
+    return matches
+
+
+async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
+    """
+    Return spans whose business_id is one of `businesses`, newest first.
+
+    An empty `businesses` list short-circuits to an empty result without
+    touching the database.
+    """
+    if not businesses:
+        return []
+
+    sql = """
+        SELECT
+            business_id,
+            urt_primary,
+            valence,
+            intensity,
+            span_text
+        FROM pipeline.review_spans
+        WHERE business_id = ANY($1)
+        ORDER BY created_at DESC
+    """
+    records = await pool.fetch(sql, businesses)
+    return list(map(dict, records))
+
+
+def analyze_sector_spans(
+    spans: list[dict],
+    config: dict[str, Any],
+    businesses: list[str],
+) -> SectorValidation:
+    """
+    Analyze spans for a specific sector.
+
+    Every span's URT code is mapped to a primitive and tallied; the tallies are
+    compared with the config's enabled / disabled / weighted primitives to
+    produce coverage figures, threshold-gated enable recommendations and
+    weight findings.
+
+    Args:
+        spans: Span dicts; the keys read are "urt_primary" and "valence".
+            A missing or empty URT code counts as unmapped, a missing or
+            empty valence counts as "V0".
+        config: L1 config. "sector_code" is required; "enabled", "disabled",
+            "weights" and "config_version" are optional.
+        businesses: Businesses the spans belong to; stored on the result as-is.
+
+    Returns:
+        A SectorValidation whose validated_at is a timezone-aware UTC
+        timestamp in ISO-8601 form.
+
+    Raises:
+        KeyError: If `config` has no "sector_code"
+    """
+    # Local import: this module only imports `datetime` at file level.
+    from datetime import timezone
+
+    sector_code = config["sector_code"]
+    enabled = set(config.get("enabled", []))
+    disabled = set(config.get("disabled", []))
+    weights = config.get("weights", {})
+    config_version = config.get("config_version", "1.0")
+
+    # Counters
+    primitive_counts: Counter = Counter()
+    domain_counts: Counter = Counter()
+    valence_counts: Counter = Counter()
+    urt_counts: Counter = Counter()
+    disabled_hits: Counter = Counter()
+    unmapped = 0
+    enabled_hits = 0
+
+    for span in spans:
+        # A span built from a database row has the key even when the column is
+        # NULL, so a .get() default alone would let None through.
+        valence = span.get("valence") or "V0"
+        valence_counts[valence] += 1
+
+        urt_code = span.get("urt_primary")
+        if not urt_code:
+            # Nothing to map and no domain letter to read: count the span as
+            # unmapped instead of failing on urt_code[0].
+            unmapped += 1
+            continue
+
+        urt_counts[urt_code] += 1
+        # The domain is the first character of the URT code.
+        domain_counts[urt_code[0]] += 1
+
+        primitive = URT_TO_PRIMITIVE.get(urt_code)
+        if primitive:
+            primitive_counts[primitive] += 1
+            if primitive in enabled:
+                enabled_hits += 1
+            elif primitive in disabled:
+                disabled_hits[primitive] += 1
+        else:
+            unmapped += 1
+
+    total = len(spans)
+    enabled_coverage = enabled_hits / total if total > 0 else 0
+
+    # Threshold-gated recommendations
+    recommended_enables = []
+    for prim, count in disabled_hits.most_common():
+        pct = count / total * 100 if total > 0 else 0
+        if pct >= ENABLE_THRESHOLD_PCT:
+            recommended_enables.append((prim, pct))
+
+    # Weight issues
+    weight_issues = []
+    for prim in weights:
+        if primitive_counts[prim] == 0 and prim in enabled:
+            weight_issues.append(f"{prim} weighted ({weights[prim]}x) but 0 appearances")
+
+    # High-frequency unweighted
+    for prim, count in primitive_counts.most_common(5):
+        pct = count / total * 100 if total > 0 else 0
+        if prim in enabled and prim not in weights and pct >= 10:
+            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")
+
+    return SectorValidation(
+        sector_code=sector_code,
+        businesses=businesses,
+        span_count=total,
+        enabled_coverage=enabled_coverage,
+        disabled_hits=dict(disabled_hits),
+        unmapped_count=unmapped,
+        primitive_counts=dict(primitive_counts),
+        domain_distribution=dict(domain_counts),
+        valence_distribution=dict(valence_counts),
+        top_urt_codes=urt_counts.most_common(15),
+        recommended_enables=recommended_enables,
+        weight_issues=weight_issues,
+        # datetime.utcnow() is deprecated and yields a naive value; record an
+        # explicit UTC offset instead.
+        validated_at=datetime.now(timezone.utc).isoformat(),
+        config_version=config_version,
+    )
+
+
+def print_sector_report(result: SectorValidation, config: dict):
+    """
+    Print detailed validation report for a sector.
+
+    Sections, in order: data source, coverage, domain distribution, top
+    primitives, recommended enables (or a note that none qualify), disabled
+    primitives below the threshold, weight issues and the validation time.
+
+    Args:
+        result: Validation result to render
+        config: The L1 config the result was computed against; "enabled",
+            "disabled" and "weights" are read to annotate primitives
+    """
+    total = result.span_count
+
+    def percent(count):
+        # Share of all spans as a percentage; 0 when there are no spans, so no
+        # section can divide by zero.
+        return count / total * 100 if total > 0 else 0
+
+    print("\n" + "=" * 70)
+    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
+    print("=" * 70)
+
+    print(f"\n📊 DATA SOURCE")
+    print(f"   Businesses: {', '.join(result.businesses)}")
+    print(f"   Total spans: {total:,}")
+    print(f"   Config version: {result.config_version}")
+
+    print(f"\n📈 COVERAGE")
+    print(f"   Enabled coverage: {result.enabled_coverage:.1%}")
+    # With no spans this prints an empty line.
+    print(f"   Unmapped (R-domain): {result.unmapped_count} ({percent(result.unmapped_count):.1f}%)" if total > 0 else "")
+
+    # Domain distribution
+    print(f"\n📁 DOMAIN DISTRIBUTION")
+    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
+                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
+    for domain in "OPJEVRA":
+        count = result.domain_distribution.get(domain, 0)
+        pct = percent(count)
+        # One bar character per two percentage points.
+        bar = "█" * int(pct / 2)
+        print(f"   {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")
+
+    # Top primitives
+    print(f"\n🔝 TOP PRIMITIVES (sector-scoped)")
+    enabled_set = set(config.get("enabled", []))
+    disabled_set = set(config.get("disabled", []))
+    weights = config.get("weights", {})
+
+    for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]:
+        pct = percent(count)
+        # ✓ enabled, ✗ disabled, ? listed in neither set
+        if prim in enabled_set:
+            status = "✓"
+        elif prim in disabled_set:
+            status = "✗"
+        else:
+            status = "?"
+        weight = f"({weights[prim]}x)" if prim in weights else ""
+        print(f"   {status} {prim:20} {count:4} ({pct:5.1f}%) {weight}")
+
+    # Threshold-gated recommendations
+    if result.recommended_enables:
+        print(f"\n⚠️  RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
+        for prim, pct in result.recommended_enables:
+            count = result.disabled_hits.get(prim, 0)
+            print(f"   → ENABLE {prim}: {count} spans ({pct:.1f}%)")
+    else:
+        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")
+
+    # Low-frequency disabled (info only)
+    low_freq_disabled = [(p, c) for p, c in result.disabled_hits.items()
+                         if percent(c) < ENABLE_THRESHOLD_PCT]
+    if low_freq_disabled:
+        print(f"\n📋 DISABLED BUT APPEARING (below threshold - no action)")
+        for prim, count in sorted(low_freq_disabled, key=lambda x: -x[1])[:5]:
+            print(f"   {prim}: {count} ({percent(count):.1f}%)")
+
+    # Weight issues
+    if result.weight_issues:
+        print(f"\n⚖️  WEIGHT ISSUES")
+        for issue in result.weight_issues:
+            print(f"   • {issue}")
+
+    print(f"\n⏱️  Validated at: {result.validated_at}")
+    print("=" * 70)
+
+
+async def validate_sector(
+    sector_code: str,
+    db_url: str | None = None,
+    verbose: bool = True,
+) -> SectorValidation | None:
+    """
+    Validate one sector using only spans of the businesses mapped to it.
+
+    Args:
+        sector_code: Sector to validate; must be listed in SECTORS_WITH_DATA
+        db_url: Database URL; falls back to DATABASE_URL, then a local default
+        verbose: When true, print diagnostics and the full report
+
+    Returns:
+        The SectorValidation, or None when the sector has no data, no config,
+        no mapped businesses or no spans
+    """
+
+    def _say(message: str) -> None:
+        # Diagnostics are emitted only in verbose mode.
+        if verbose:
+            print(message)
+
+    if sector_code not in SECTORS_WITH_DATA:
+        _say(f"⚠️  {sector_code}: No real business data available for validation")
+        return None
+
+    config = load_l1_config(sector_code)
+    if not config:
+        _say(f"❌ No L1 config found for {sector_code}")
+        return None
+
+    businesses = get_businesses_for_sector(sector_code)
+    if not businesses:
+        _say(f"⚠️  {sector_code}: No businesses mapped")
+        return None
+
+    if not db_url:
+        db_url = os.environ.get(
+            "DATABASE_URL",
+            "postgresql://scraper:scraper123@localhost:5437/scraper"
+        )
+    pool = await asyncpg.create_pool(db_url)
+
+    try:
+        spans = await fetch_spans_for_businesses(pool, businesses)
+        if not spans:
+            _say(f"⚠️  {sector_code}: No spans found for businesses")
+            return None
+
+        outcome = analyze_sector_spans(spans, config, businesses)
+        if verbose:
+            print_sector_report(outcome, config)
+        return outcome
+    finally:
+        # The pool is closed on every exit path, including early returns.
+        await pool.close()
+
+
+async def validate_all_sectors(db_url: str | None = None) -> dict[str, SectorValidation]:
+    """
+    Validate all sectors with available data and print a summary table.
+
+    Args:
+        db_url: Database URL forwarded to validate_sector()
+
+    Returns:
+        Mapping of sector code to its SectorValidation; sectors whose
+        validation produced nothing are left out
+    """
+    # NOTE(review): 20 is presumably the total number of sectors in the
+    # taxonomy - confirm and keep in sync if sectors are added.
+    total_sector_count = 20
+
+    results = {}
+
+    # SECTORS_WITH_DATA is a set of strings, whose iteration order can differ
+    # between runs; sort it so reports come out in a stable order.
+    for sector in sorted(SECTORS_WITH_DATA):
+        result = await validate_sector(sector, db_url, verbose=True)
+        if result:
+            results[sector] = result
+
+    # Print summary
+    print("\n" + "=" * 70)
+    print("VALIDATION SUMMARY")
+    print("=" * 70)
+    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
+    print("-" * 50)
+
+    for sector, result in results.items():
+        enables = len(result.recommended_enables)
+        enables_str = f"{enables} recs" if enables > 0 else "✓ OK"
+        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")
+
+    print("-" * 50)
+    print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}")
+    print(f"Sectors without data: {total_sector_count - len(SECTORS_WITH_DATA)}")
+
+    return results
+
+
+async def generate_summary_report(db_url: str | None = None) -> dict:
+    """
+    Generate a JSON summary report for all sectors.
+
+    Each sector in SECTORS_WITH_DATA is validated quietly (no printing);
+    sectors whose validation produced nothing are left out.
+
+    Args:
+        db_url: Database URL forwarded to validate_sector()
+
+    Returns:
+        Mapping of sector code to a dict of summary fields
+    """
+    results = {}
+
+    # Sort the set so the key order of the emitted report is the same on
+    # every run.
+    for sector in sorted(SECTORS_WITH_DATA):
+        result = await validate_sector(sector, db_url, verbose=False)
+        if result:
+            results[sector] = {
+                "span_count": result.span_count,
+                "enabled_coverage": round(result.enabled_coverage, 3),
+                "recommended_enables": result.recommended_enables,
+                "weight_issues": result.weight_issues,
+                "config_version": result.config_version,
+                "validated_at": result.validated_at,
+            }
+
+    return results
+
+
+def main():
+    """Parse the command line and run the requested validation mode.
+
+    Precedence is --report, then --all, then --sector; with none of them the
+    help text and the list of sectors with data are printed.
+    """
+    parser = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
+    parser.add_argument("--sector", help="Validate specific sector")
+    parser.add_argument("--all", action="store_true", help="Validate all sectors with data")
+    parser.add_argument("--report", action="store_true", help="Generate JSON summary report")
+    parser.add_argument("--db-url", help="Database URL")
+
+    args = parser.parse_args()
+
+    if args.report:
+        summary = asyncio.run(generate_summary_report(args.db_url))
+        print(json.dumps(summary, indent=2))
+        return
+
+    if args.all:
+        asyncio.run(validate_all_sectors(args.db_url))
+        return
+
+    if args.sector:
+        # Sector codes are upper-case; accept any casing on the command line.
+        asyncio.run(validate_sector(args.sector.upper(), args.db_url))
+        return
+
+    parser.print_help()
+    print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA)))
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()