whyrating-engine-legacy/packages/reviewiq-pipeline/scripts/run_classification_v2.py

#!/usr/bin/env python3
"""
Classification Run Harness V2

Runs classification on real reviews using resolved L1 config + sector brief.
Stores results to detected_spans_v2 with full config versioning.

Usage:
    python run_classification_v2.py --business "Go Karts Mar Menor" --limit 100
    python run_classification_v2.py --business "ClickRent Gran Canaria" --limit 100 --dry-run
    python run_classification_v2.py --evaluate "Go Karts Mar Menor"
"""

import argparse
import asyncio
import hashlib
import json
import os
import re
import sys
import unicodedata
import uuid
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

import asyncpg

# Use standalone resolver to avoid package import issues
from config_resolver_standalone import ConfigResolver

# Import LLM classifier (optional - falls back to mock if unavailable)
try:
    from llm_classifier import classify_review as llm_classify_review
    LLM_AVAILABLE = True
except ImportError:
    LLM_AVAILABLE = False

# Database URL: taken from the DATABASE_URL environment variable; when that is
# unset, the hard-coded connection string below is used instead.
# NOTE(review): the fallback embeds a username and password - presumably for a
# local development database; confirm it is not a real credential.
DB_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://scraper:scraper123@localhost:5437/scraper"
)

# Non-informative review detection (rule-based, conservative)


def _compute_text_stats(text: str) -> dict:
    """Compute character and token statistics for non-informative detection."""
    if not text:
        return {"empty": True}

    text = text.strip()
    total_chars = len(text)

    if total_chars == 0:
        return {"empty": True}

    # Character counts by unicode category (reliable, no emoji heuristics)
    alpha_chars = sum(1 for c in text if unicodedata.category(c).startswith('L'))
    digit_chars = sum(1 for c in text if unicodedata.category(c).startswith('N'))
    punct_chars = sum(1 for c in text if unicodedata.category(c).startswith('P'))

    # Token stats
    tokens = text.split()
    token_count = len(tokens)
    unique_tokens = len(set(t.lower() for t in tokens))

    # Ratios
    alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0
    punct_ratio = punct_chars / total_chars if total_chars > 0 else 0

    # Repetition check (same token repeated)
    if tokens:
        most_common_count = Counter(t.lower() for t in tokens).most_common(1)[0][1]
        repetition_ratio = most_common_count / token_count if token_count > 0 else 0
    else:
        repetition_ratio = 0

    return {
        "empty": False,
        "total_chars": total_chars,
        "alpha_chars": alpha_chars,
        "digit_chars": digit_chars,
        "punct_chars": punct_chars,
        "token_count": token_count,
        "unique_tokens": unique_tokens,
        "alpha_ratio": alpha_ratio,
        "punct_ratio": punct_ratio,
        "repetition_ratio": repetition_ratio,
    }


# Safe regex for truly content-free strings (high confidence only).
# Apart from the two literal "translated by google" banners there are no word
# lists - only structural patterns. Every alternative is anchored with ^...$,
# so the whole string must consist of the listed characters to match.
# re.IGNORECASE makes the banner alternative case-insensitive.
PURE_JUNK_RE = re.compile(
    r'^[\s\.\!\?\,\-\_\~\*\#\@]+$'  # Only punctuation/whitespace
    r'|^[\U0001F300-\U0001F9FF\U0001FA00-\U0001FAFF\U00002600-\U000027BF\s\.\!\?]+$'  # Only emoji + punct
    r'|^(translated by google|traducido por google)[\.\s]*$',  # Translation artifacts
    re.IGNORECASE
)


def is_non_informative(text: str) -> tuple[bool, str]:
    """
    Decide, conservatively, whether a review carries no usable content.

    Only text that is certainly junk is flagged, so the LLM can be skipped
    for it; anything doubtful is reported as informative. Keeping noise
    (false negative) is preferred over dropping real content (false positive).

    Returns:
        ``(True, reason)`` where reason is one of "empty", "junk_pattern",
        "no_content" or "pure_repetition"; otherwise ``(False, "")``.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return True, "empty"

    stats = _compute_text_stats(stripped)
    if stats.get("empty"):
        return True, "empty"

    # Rule A: structural junk (punctuation-only, emoji-only, translation banner)
    if PURE_JUNK_RE.match(stripped):
        return True, "junk_pattern"

    # Rule B: not a single letter or digit anywhere
    has_letters = stats["alpha_chars"] != 0
    has_digits = stats["digit_chars"] != 0
    if not (has_letters or has_digits):
        return True, "no_content"

    # Rule C: one short token repeated at least three times ("good good good")
    single_token_repeated = stats["token_count"] >= 3 and stats["unique_tokens"] == 1
    if single_token_repeated and stats["alpha_chars"] < 20:
        return True, "pure_repetition"

    # Everything else (typo-heavy, short but meaningful, ...) goes to the LLM
    return False, ""


# Classification prompt template.
# Filled in with str.format() by build_classification_prompt(); the single-brace
# fields ({business_id}, {sector_code}, {gbp_path}, {primitives_list},
# {brief_signals}, {rating}, {review_text}) are the placeholders. Literal JSON
# braces in the example output are doubled ({{ }}) so format() leaves them alone.
CLASSIFICATION_PROMPT = """You are a review classifier using primitive-based analysis.

## TASK
Extract semantic spans from this review and classify each span to exactly ONE primitive.

## BUSINESS CONTEXT
- Business: {business_id}
- Sector: {sector_code}
- Path: {gbp_path}

## ENABLED PRIMITIVES (use ONLY these)
{primitives_list}

## SECTOR SIGNALS (what customers typically judge)
{brief_signals}

## RULES
1. Extract 1-5 spans per review (prefer fewer, larger spans)
2. Each span gets exactly ONE primitive (the most specific match)
3. If nothing fits with confidence ≥ 0.5, use UNMAPPED with keywords
4. Valence: + (positive), - (negative), 0 (neutral), ± (mixed)
5. Intensity: 1 (low), 2 (moderate), 3 (high/extreme)
6. Detail: 1 (vague), 2 (some detail), 3 (specific/actionable)

## OUTPUT FORMAT (JSON only, no markdown)
{{
  "spans": [
    {{
      "text": "exact text from review",
      "start": 0,
      "end": 25,
      "primitive": "MANNER",
      "valence": "+",
      "intensity": 2,
      "detail": 2,
      "confidence": 0.85,
      "entity": null,
      "entity_type": null
    }}
  ]
}}

## REVIEW TO CLASSIFY
Rating: {rating}/5
Text: {review_text}

Return JSON only."""


async def resolve_business_id(pool: asyncpg.Pool, search_term: str) -> str | None:
    """
    Resolve a partial business name/pattern to a canonical business_id via DB.

    Looks in pipeline.business_taxonomy_map, first for an exact match on
    business_id, then with a series of increasingly loose case-insensitive
    LIKE patterns (tried in order; the first pattern that matches wins).

    Args:
        pool: Connection pool used for the lookups.
        search_term: Exact id, partial name, camelCase or concatenated name.

    Returns:
        The matching business_id, or None if the term is blank or nothing
        matches.
    """
    # A blank term would turn into the pattern "%%", which matches every row
    # and would silently resolve to an arbitrary business.
    if not search_term or not search_term.strip():
        return None

    # Try exact match first
    row = await pool.fetchrow("""
        SELECT business_id
        FROM pipeline.business_taxonomy_map
        WHERE business_id = $1
    """, search_term)
    if row:
        return row["business_id"]

    # Build search patterns with varying flexibility
    search_patterns = [
        f"%{search_term}%",                    # Basic wildcard
        f"%{search_term.replace(' ', '%')}%",  # "go karts" -> "%go%karts%"
    ]

    # camelCase input: put a wildcard at each lower->upper transition,
    # e.g. "GoKarts" -> "%Go%Karts%"
    camel_pattern = re.sub(r'([a-z])([A-Z])', r'\1%\2', search_term)
    if camel_pattern != search_term:
        search_patterns.append(f"%{camel_pattern}%")

    # Lowercase concatenated words: try every split point that leaves at least
    # two characters on each side, e.g. "gokarts" -> "%go%karts%", "%gok%arts%", ...
    if search_term.islower() and len(search_term) >= 4:
        search_patterns.extend(
            f"%{search_term[:i]}%{search_term[i:]}%"
            for i in range(2, len(search_term) - 1)
        )

    # dict.fromkeys drops duplicate patterns (e.g. a term without spaces yields
    # the first two patterns twice) while keeping the priority order.
    for pattern in dict.fromkeys(search_patterns):
        row = await pool.fetchrow("""
            SELECT business_id
            FROM pipeline.business_taxonomy_map
            WHERE LOWER(business_id) LIKE LOWER($1)
        """, pattern)
        if row:
            return row["business_id"]

    return None


async def fetch_reviews_for_business(
    pool: asyncpg.Pool,
    business_id: str,
    limit: int = 100,
) -> list[dict]:
    """Return one row per review for a business, read from pipeline.review_spans.

    Each returned dict has ``review_id``, ``business_id`` and ``sample_text``
    (the text of the review's first span by ``id``). Existing spans stand in
    for the full review text here; a production version would read from
    reviews_raw or jobs.reviews_data instead.
    """
    # DISTINCT ON keeps a single span per review_id
    query = """
        SELECT DISTINCT ON (review_id)
            review_id,
            business_id,
            span_text as sample_text
        FROM pipeline.review_spans
        WHERE business_id = $1
        ORDER BY review_id, id
        LIMIT $2
    """
    records = await pool.fetch(query, business_id, limit)
    return list(map(dict, records))


async def fetch_reviews_from_jobs(
    pool: asyncpg.Pool,
    business_pattern: str,
    limit: int = 100,
) -> list[dict]:
    """Return reviews unpacked from the ``reviews_data`` JSON of completed jobs.

    Jobs are selected when their URL or ``metadata.business_name`` contains
    ``business_pattern`` (case-insensitive). Each result dict carries
    ``job_id``, ``review_id``, ``text``, ``rating`` and ``author``; missing
    fields fall back to a positional id, "", 5 and "Anonymous" respectively.
    """
    query = """
        SELECT
            j.job_id,
            j.url,
            jsonb_array_elements(j.reviews_data) as review
        FROM public.jobs j
        WHERE j.reviews_data IS NOT NULL
          AND (LOWER(j.url) LIKE $1 OR LOWER(j.metadata->>'business_name') LIKE $1)
          AND j.status = 'completed'
        LIMIT $2
    """
    like_pattern = f"%{business_pattern.lower()}%"
    records = await pool.fetch(query, like_pattern, limit)

    collected = []
    for position, record in enumerate(records):
        payload = record["review"]
        # The JSON element may arrive already decoded or as a JSON string
        if isinstance(payload, str):
            payload = json.loads(payload)
        entry = {
            "job_id": str(record["job_id"]),
            "review_id": payload.get("review_id", f"rev-{position}"),
            "text": payload.get("text", ""),
            "rating": payload.get("rating", 5),
            "author": payload.get("author", "Anonymous"),
        }
        collected.append(entry)

    return collected


def build_classification_prompt(
    review: dict,
    config: dict,
) -> str:
    """Render CLASSIFICATION_PROMPT for one review using the resolved config.

    Args:
        review: Review dict; the text is read from ``"text"`` (falling back to
            ``"sample_text"``) and the rating from ``"rating"`` ("?" if absent).
        config: Resolved config. Must contain ``business_id``, ``sector_code``,
            ``gbp_path`` and ``enabled_primitives``; ``primitives`` and
            ``weights`` are read for each enabled primitive; ``brief`` is
            optional.

    Returns:
        The fully formatted prompt string.
    """
    # One bullet per enabled primitive: "- NAME: definition (weight: Nx)"
    primitive_lines = []
    for prim in sorted(config["enabled_primitives"]):
        prim_info = config["primitives"].get(prim, {})
        description = prim_info.get("def", prim_info.get("name", prim))
        weight = config["weights"].get(prim)
        weight_str = f" (weight: {weight}x)" if weight else ""
        primitive_lines.append(f"- {prim}: {description}{weight_str}")

    # Sector brief signals. `or {}` / `or []` also cover keys that are present
    # but None, which would otherwise raise on .get() / slicing.
    brief = config.get("brief") or {}
    items = brief.get("what_customers_judge") or []
    if isinstance(items, dict):
        # Some briefs wrap the list as {"items": [...]}
        items = items.get("items") or []

    signal_lines = []
    for item in items[:3]:  # only the first three signals go into the prompt
        if isinstance(item, dict):
            signal_lines.append(f"- {item.get('aspect', item.get('area', ''))}")
        else:
            signal_lines.append(f"- {item}")

    return CLASSIFICATION_PROMPT.format(
        business_id=config["business_id"],
        sector_code=config["sector_code"],
        gbp_path=config["gbp_path"],
        primitives_list="\n".join(primitive_lines),
        brief_signals="\n".join(signal_lines) if signal_lines else "No specific signals",
        rating=review.get("rating", "?"),
        review_text=review.get("text", review.get("sample_text", "")),
    )


async def classify_review_mock(
    review: dict,
    config: dict,
) -> ClassificationResult:
    """
    Mock classification for dry-run testing.

    Returns a synthetic result based on simple heuristics.
    """
    text = review.get("text", review.get("sample_text", ""))
    rating = review.get("rating", 3)

    # Simple heuristic classification
    spans = []

    # Detect some patterns
    if any(word in text.lower() for word in ["friendly", "nice", "helpful", "great staff"]):
        spans.append({
            "text": text[:50] + "..." if len(text) > 50 else text,
            "start": 0,
            "end": min(50, len(text)),
            "primitive": "MANNER",
            "valence": "+",
            "intensity": 2,
            "detail": 2,
            "confidence": 0.75,
        })
    elif any(word in text.lower() for word in ["rude", "unfriendly", "ignored"]):
        spans.append({
            "text": text[:50] + "..." if len(text) > 50 else text,
            "start": 0,
            "end": min(50, len(text)),
            "primitive": "MANNER",
            "valence": "-",
            "intensity": 2,
            "detail": 2,
            "confidence": 0.75,
        })
    elif any(word in text.lower() for word in ["wait", "slow", "fast", "quick"]):
        valence = "+" if rating >= 4 else "-"
        spans.append({
            "text": text[:50] + "..." if len(text) > 50 else text,
            "start": 0,
            "end": min(50, len(text)),
            "primitive": "SPEED",
            "valence": valence,
            "intensity": 2,
            "detail": 2,
            "confidence": 0.70,
        })
    else:
        # Default based on rating
        spans.append({
            "text": text[:50] + "..." if len(text) > 50 else text,
            "start": 0,
            "end": min(50, len(text)),
            "primitive": "VALUE_FOR_MONEY" if rating >= 4 else "UNMAPPED",
            "valence": "+" if rating >= 4 else "-" if rating <= 2 else "0",
            "intensity": 1,
            "detail": 1,
            "confidence": 0.50,
            "unmapped_keywords": ["general"] if rating < 4 else None,
        })

    return ClassificationResult(
        review_id=review.get("review_id", "unknown"),
        business_id=config["business_id"],
        config_version=config["config_version"],
        spans=spans,
        raw_response=json.dumps({"spans": spans}),
        detected_language="en",  # Mock assumes English
        language_confidence=0.5,
    )


@dataclass
class ClassificationResult:
    """Result from classifying a single review."""
    review_id: str
    business_id: str
    config_version: str
    spans: list[dict]
    raw_response: str
    error: str | None = None
    detected_language: str | None = None
    language_confidence: float | None = None


def _scale_llm_intensity(raw_intensity: Any) -> tuple[int, int]:
    """Convert the classifier's 1-5 intensity into (intensity 1-3, detail 1-3).

    A missing, None or non-numeric value is treated as 2, the same default
    used when the key is absent.
    """
    try:
        level = int(raw_intensity)
    except (TypeError, ValueError):
        level = 2

    if level >= 4:
        detail = 3
    elif level >= 2:
        detail = 2
    else:
        detail = 1
    intensity = min(3, max(1, (level + 1) // 2))  # Map 1-5 to 1-3
    return intensity, detail


async def classify_review_llm(
    review: dict,
    config: dict,
    model: str | None = None,
) -> ClassificationResult:
    """
    Classify one review with the external LLM classifier.

    Reviews that is_non_informative() flags are not sent to the LLM; they get
    a single NON_INFORMATIVE span instead. Otherwise llm_classify_review() is
    called (synchronously) and its spans and "unmapped" items are converted
    to this script's span format.

    Args:
        review: Review dict; text from ``"text"`` or ``"sample_text"``, rating
            from ``"rating"``.
        config: Resolved config, passed through to the classifier; also
            supplies ``business_id`` and ``config_version``.
        model: Model name forwarded to the classifier.

    Returns:
        ClassificationResult with converted spans; ``error`` holds the first
        classifier warning, if any.

    Raises:
        RuntimeError: If the llm_classifier module could not be imported.
    """
    if not LLM_AVAILABLE:
        raise RuntimeError("LLM classifier not available. Check OPENAI_API_KEY.")

    text = review.get("text", review.get("sample_text", ""))
    rating = review.get("rating")

    # Check for non-informative reviews (skip LLM, save cost)
    non_informative, reason = is_non_informative(text)
    if non_informative:
        return ClassificationResult(
            review_id=review.get("review_id", "unknown"),
            business_id=config["business_id"],
            config_version=config["config_version"],
            spans=[{
                "text": text[:100] if text else "",
                "start": 0,
                "end": min(100, len(text)) if text else 0,
                "primitive": "NON_INFORMATIVE",
                "valence": "0",
                "intensity": 1,
                "detail": 1,
                "confidence": 1.0,
                "unmapped_keywords": [reason],
            }],
            raw_response=json.dumps({"non_informative": True, "reason": reason}),
            detected_language="unknown",
            language_confidence=0.0,
        )

    # Call the LLM classifier
    result = llm_classify_review(
        review_text=text,
        rating=rating,
        config=config,
        language="auto",  # Auto-detect
        model=model,
    )

    # Classifier valence labels -> symbols stored with each span
    valence_map = {"positive": "+", "negative": "-", "mixed": "±", "neutral": "0"}

    # Convert LLM response to our format
    spans = []
    for span in result.get("spans", []):
        intensity, detail = _scale_llm_intensity(span.get("intensity", 2))
        details = span.get("details") or {}
        spans.append({
            "text": span.get("evidence", text[:100]),
            "start": span.get("start_char"),
            "end": span.get("end_char"),
            "primitive": span.get("primitive", "UNMAPPED"),
            "valence": valence_map.get(span.get("valence", "neutral"), "0"),
            "intensity": intensity,
            "detail": detail,
            "confidence": span.get("confidence", 0.5),
            "entity": details.get("entity"),
            "entity_type": details.get("entity_type"),
        })

    # Add unmapped items as UNMAPPED spans
    for unmapped in result.get("unmapped", []):
        spans.append({
            "text": unmapped.get("evidence", ""),
            "start": None,
            "end": None,
            "primitive": "UNMAPPED",
            "valence": "0",
            "intensity": 1,
            "detail": 1,
            "confidence": unmapped.get("confidence", 0.3),
            "unmapped_keywords": [unmapped.get("label", "unknown")],
        })

    warnings = result.get("warnings")
    return ClassificationResult(
        review_id=review.get("review_id", "unknown"),
        business_id=config["business_id"],
        config_version=config["config_version"],
        spans=spans,
        raw_response=result.get("raw_response", "{}"),
        error=warnings[0] if warnings else None,
        detected_language=result.get("detected_language", "unknown"),
        language_confidence=result.get("language_confidence", 0.0),
    )


async def store_spans(
    pool: asyncpg.Pool,
    result: ClassificationResult,
    config: dict,
    job_id: str | None = None,
    model: str | None = None,
    review_hash: str | None = None,
    language: str | None = None,
    run_id: uuid.UUID | None = None,
) -> int:
    """Insert every span of *result* into pipeline.detected_spans_v2.

    Args:
        pool: Connection pool used for the inserts.
        result: Classification result whose spans are stored.
        config: Resolved config; supplies ``gbp_path`` and ``sector_code``.
        job_id: Source job id, if the review came from a job.
        model: Name of the model (or "mock") that produced the spans.
        review_hash: Hash identifying the review text + config version.
        language: Detected review language.
        run_id: Identifier shared by all spans of one classification run.

    Returns:
        Number of spans inserted.
    """
    # The raw response is the same for every span, so normalise it once.
    # It is round-tripped through json so the stored value is valid JSON; a
    # response that is not valid JSON is wrapped instead of aborting the insert
    # (run_classification likewise tolerates unparseable raw responses).
    raw_json = None
    if result.raw_response:
        try:
            raw_json = json.dumps(json.loads(result.raw_response))
        except ValueError:
            raw_json = json.dumps({"unparsed_response": result.raw_response})

    count = 0
    for span in result.spans:
        await pool.execute("""
            INSERT INTO pipeline.detected_spans_v2 (
                job_id, business_id, review_id, gbp_path, sector_code,
                config_version, primitive, valence, intensity, detail,
                mode, confidence, span_text, span_start, span_end,
                unmapped_keywords, entity, entity_type,
                model, raw_response, review_hash, language, run_id
            ) VALUES (
                $1, $2, $3, $4::ltree, $5,
                $6, $7, $8, $9, $10,
                $11, $12, $13, $14, $15,
                $16, $17, $18,
                $19, $20, $21, $22, $23
            )
        """,
            job_id,
            result.business_id,
            result.review_id,
            config["gbp_path"],
            config["sector_code"],
            result.config_version,
            span["primitive"],
            span["valence"],
            span.get("intensity"),
            span.get("detail"),
            span.get("mode"),
            span["confidence"],
            span["text"],
            span.get("start"),
            span.get("end"),
            span.get("unmapped_keywords"),
            span.get("entity"),
            span.get("entity_type"),
            model,
            raw_json,  # Store as JSONB
            review_hash,
            language,
            run_id,
        )
        count += 1
    return count


async def run_classification(
    business_id: str,
    limit: int = 100,
    dry_run: bool = False,
    use_jobs: bool = True,
    use_llm: bool = False,
    model: str | None = None,
) -> dict[str, Any]:
    """
    Run classification pipeline for a business.

    Resolves the business and its config, fetches reviews, classifies each one
    (LLM or mock), optionally stores the spans, then prints and returns
    summary statistics.

    Args:
        business_id: Business name or URL pattern
        limit: Max reviews to process
        dry_run: If True, don't store results
        use_jobs: If True, fetch from jobs.reviews_data
        use_llm: If True, use real LLM classification (requires OPENAI_API_KEY)
        model: Model to use for LLM classification (default: gpt-4o-mini)

    Returns:
        Summary statistics, or a dict with an "error" key when the classifier
        is unavailable, the business cannot be resolved, or no reviews exist.
    """
    if use_llm and not LLM_AVAILABLE:
        return {"error": "LLM classifier not available. Set OPENAI_API_KEY."}

    model = model or "gpt-4o-mini"
    pool = await asyncpg.create_pool(DB_URL)

    try:
        # Resolve config
        resolver = ConfigResolver()

        # Resolve business_id from partial name via DB lookup
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}

        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # Generate run_id for this classification run
        run_id = uuid.uuid4()

        print(f"\n📋 Resolved config for: {config['business_id']}")
        print(f"   Sector: {config['sector_code']}")
        print(f"   Config version: {config['config_version']}")
        print(f"   Enabled primitives: {len(config['enabled_primitives'])}")
        print(f"   Weights: {len(config['weights'])}")
        print(f"   Run ID: {run_id}")

        # Fetch reviews
        if use_jobs:
            # Use the first two words of the input for better matching.
            # Slicing (rather than indexing words[0]) also copes with a blank
            # input, which has no words at all.
            url_pattern = " ".join(business_id.split()[:2])
            reviews = await fetch_reviews_from_jobs(pool, url_pattern, limit)
        else:
            reviews = await fetch_reviews_for_business(pool, config["business_id"], limit)

        print(f"\n📥 Fetched {len(reviews)} reviews")

        if not reviews:
            return {"error": "No reviews found", "config": config}

        # Classify reviews
        results = []
        total_tokens = {"prompt": 0, "completion": 0}
        classifier_type = "LLM" if use_llm else "MOCK"
        print(f"\n🔄 Classifying with {classifier_type} classifier...")
        if use_llm:
            print(f"   Model: {model}")

        for i, review in enumerate(reviews):
            if i % 20 == 0:
                print(f"   Processing review {i+1}/{len(reviews)}...")

            # Use LLM or mock classifier
            if use_llm:
                result = await classify_review_llm(review, config, model)
                # Track tokens for cost estimation. Best effort: a raw response
                # that is not JSON or lacks the expected shape is skipped. Only
                # data-shape errors are caught, so KeyboardInterrupt and task
                # cancellation still propagate.
                try:
                    raw = json.loads(result.raw_response)
                    if "tokens" in raw:
                        total_tokens["prompt"] += raw["tokens"].get("prompt", 0)
                        total_tokens["completion"] += raw["tokens"].get("completion", 0)
                except (ValueError, TypeError, AttributeError, KeyError):
                    pass
            else:
                result = await classify_review_mock(review, config)

            results.append(result)

            # Store if not dry run
            if not dry_run:
                # Compute review hash for caching
                text = review.get("text", review.get("sample_text", ""))
                review_hash = hashlib.sha256(f"{config['config_version']}:{text}".encode()).hexdigest()[:16]

                await store_spans(
                    pool, result, config,
                    job_id=review.get("job_id"),
                    model=model if use_llm else "mock",
                    review_hash=review_hash,
                    language=result.detected_language or "unknown",
                    run_id=run_id,
                )

        # Calculate stats
        all_spans = [s for r in results for s in r.spans]
        primitive_counts = Counter(s["primitive"] for s in all_spans)
        valence_counts = Counter(s["valence"] for s in all_spans)
        unmapped_count = primitive_counts.get("UNMAPPED", 0)
        non_informative_count = primitive_counts.get("NON_INFORMATIVE", 0)

        # Language distribution (for multilingual tracking), one entry per review
        language_counts = Counter(r.detected_language or "unknown" for r in results)

        # Content spans = total - non-informative
        content_spans = len(all_spans) - non_informative_count
        content_unmapped_rate = unmapped_count / content_spans if content_spans > 0 else 0

        stats = {
            "run_id": str(run_id),
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "config_version": config["config_version"],
            "l2_applied": config.get("l2_applied"),
            "classifier": classifier_type,
            "model": model if use_llm else "mock",
            "reviews_processed": len(reviews),
            "spans_created": len(all_spans),
            "non_informative_count": non_informative_count,
            "content_spans": content_spans,
            "unmapped_count": unmapped_count,
            "raw_unmapped_rate": unmapped_count / len(all_spans) if all_spans else 0,
            "content_unmapped_rate": content_unmapped_rate,
            "top_primitives": primitive_counts.most_common(10),
            "valence_distribution": dict(valence_counts),
            "language_distribution": dict(language_counts),
            "dry_run": dry_run,
        }

        if use_llm:
            stats["tokens"] = total_tokens
            # Rough cost estimate for gpt-4o-mini.
            # NOTE: the same per-million-token rates are applied whatever
            # `model` was requested.
            cost = (total_tokens["prompt"] * 0.15 + total_tokens["completion"] * 0.60) / 1_000_000
            stats["estimated_cost_usd"] = round(cost, 4)

        print(f"\n📊 Results:")
        print(f"   Reviews: {stats['reviews_processed']}")
        print(f"   Total spans: {stats['spans_created']}")
        print(f"   NON_INFORMATIVE: {non_informative_count} ({100*non_informative_count/len(all_spans):.1f}%)" if all_spans else "")
        print(f"   Content spans: {content_spans}")
        print(f"   UNMAPPED (of content): {unmapped_count} ({content_unmapped_rate:.1%})")
        if use_llm and "estimated_cost_usd" in stats:
            print(f"   Estimated cost: ${stats['estimated_cost_usd']:.4f}")

        # Print language distribution
        print(f"\n   🌐 Languages detected:")
        for lang, count in language_counts.most_common(5):
            pct = count / len(results) * 100 if results else 0
            print(f"      {lang}: {count} ({pct:.1f}%)")

        print(f"\n   Top primitives:")
        for prim, count in stats["top_primitives"][:5]:
            print(f"      {prim}: {count}")

        return stats

    finally:
        await pool.close()


async def evaluate_business(business_id: str) -> dict[str, Any]:
    """
    Evaluate classification results for a business.

    Reads spans from pipeline.detected_spans_v2 (only the most recent run when
    run ids exist, otherwise every span for the business), prints a report and
    returns the metrics:
    1. UNMAPPED rate (overall and per language)
    2. Top primitives distribution
    3. Contradiction detection (keyword vs. valence heuristic)
    4. Coverage by config version
    5. Confidence average / low-confidence count

    Args:
        business_id: Business name or pattern, resolved via resolve_business_id().

    Returns:
        Dict of evaluation metrics, or a dict with an "error" key when the
        business cannot be resolved or has no stored spans.
    """
    pool = await asyncpg.create_pool(DB_URL)

    try:
        # Get business mapping
        resolver = ConfigResolver()

        # Resolve business_id from partial name via DB lookup
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}

        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # Get latest run_id for this business (if exists)
        latest_run = await pool.fetchrow("""
            SELECT run_id, MAX(created_at) as latest
            FROM pipeline.detected_spans_v2
            WHERE business_id = $1 AND run_id IS NOT NULL
            GROUP BY run_id
            ORDER BY latest DESC
            LIMIT 1
        """, config["business_id"])
        run_id_value = latest_run["run_id"] if latest_run else None

        # Fetch spans from detected_spans_v2 (include language)
        if run_id_value:
            print(f"📎 Using latest run: {run_id_value}")
            rows = await pool.fetch("""
                SELECT primitive, valence, intensity, confidence, span_text, config_version, language, run_id
                FROM pipeline.detected_spans_v2
                WHERE business_id = $1 AND run_id = $2
                ORDER BY created_at DESC
            """, config["business_id"], run_id_value)
        else:
            # Fallback to all spans if no run_id exists (legacy data)
            rows = await pool.fetch("""
                SELECT primitive, valence, intensity, confidence, span_text, config_version, language, run_id
                FROM pipeline.detected_spans_v2
                WHERE business_id = $1
                ORDER BY created_at DESC
            """, config["business_id"])
            print("⚠️  No run_id found, using all spans (legacy data)")

        if not rows:
            return {"error": "No spans found in detected_spans_v2"}

        spans = [dict(row) for row in rows]

        # 1 + 2. Primitive distribution and overall UNMAPPED rate
        primitive_counts = Counter(s["primitive"] for s in spans)
        unmapped_count = primitive_counts.get("UNMAPPED", 0)
        unmapped_rate = unmapped_count / len(spans)

        # 2b. UNMAPPED rate by language: one pass that counts primitives per
        # language instead of rescanning every span for each language.
        per_language: dict[str, Counter] = {}
        for s in spans:
            lang = s.get("language") or "unknown"
            per_language.setdefault(lang, Counter())[s["primitive"]] += 1

        language_stats = {}
        for lang, counts in per_language.items():
            total = sum(counts.values())
            lang_unmapped = counts.get("UNMAPPED", 0)
            lang_non_info = counts.get("NON_INFORMATIVE", 0)
            # NON_INFORMATIVE spans are excluded from the rate's denominator
            content_spans = total - lang_non_info
            language_stats[lang] = {
                "total": total,
                "unmapped": lang_unmapped,
                "non_informative": lang_non_info,
                "content_spans": content_spans,
                "unmapped_rate": lang_unmapped / content_spans if content_spans > 0 else 0,
            }

        # 3. Contradiction detection (simple heuristics): sentiment keyword
        # found as a substring of the span text, but opposite stored valence.
        contradictions = []
        positive_words = {"great", "amazing", "excellent", "wonderful", "best", "love", "perfect"}
        negative_words = {"terrible", "awful", "worst", "horrible", "hate", "never"}

        for span in spans:
            span_text = span["span_text"] or ""  # tolerate NULL span_text
            text_lower = span_text.lower()
            valence = span["valence"]

            has_positive = any(w in text_lower for w in positive_words)
            has_negative = any(w in text_lower for w in negative_words)

            if has_positive and valence == "-":
                contradictions.append({
                    "type": "positive_text_negative_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })
            elif has_negative and valence == "+":
                contradictions.append({
                    "type": "negative_text_positive_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })

        # 4. Config version coverage
        version_counts = Counter(s["config_version"] for s in spans)

        # 5. Confidence distribution (converted to float once per span)
        confidences = [float(s["confidence"]) for s in spans]
        avg_confidence = sum(confidences) / len(confidences)
        low_confidence = sum(1 for c in confidences if c < 0.5)

        evaluation = {
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "total_spans": len(spans),
            "unmapped_rate": unmapped_rate,
            "unmapped_count": unmapped_count,
            "top_primitives": primitive_counts.most_common(10),
            "contradiction_count": len(contradictions),
            "contradictions_sample": contradictions[:5],
            "config_versions": dict(version_counts),
            "avg_confidence": avg_confidence,
            "low_confidence_count": low_confidence,
            "language_stats": language_stats,
        }

        # Print report
        print("\n" + "=" * 60)
        print(f"EVALUATION: {config['business_id']}")
        print("=" * 60)

        print(f"\n📊 METRICS")
        print(f"   Total spans: {len(spans)}")
        print(f"   UNMAPPED rate: {unmapped_rate:.1%} {'⚠️' if unmapped_rate > 0.2 else '✓'}")
        print(f"   Avg confidence: {avg_confidence:.2f}")
        print(f"   Low confidence (<0.5): {low_confidence}")

        print(f"\n🌐 UNMAPPED BY LANGUAGE")
        for lang, stats in sorted(language_stats.items(), key=lambda x: x[1]["total"], reverse=True):
            rate = stats["unmapped_rate"]
            flag = "⚠️" if rate > 0.15 else "✓"
            print(f"   {lang}: {stats['unmapped']}/{stats['content_spans']} ({rate:.1%}) {flag}")

        print(f"\n🔝 TOP PRIMITIVES")
        for prim, count in primitive_counts.most_common(8):
            pct = count / len(spans) * 100
            print(f"   {prim}: {count} ({pct:.1f}%)")

        print(f"\n⚠️  CONTRADICTIONS: {len(contradictions)}")
        for c in contradictions[:3]:
            print(f"   {c['type']}: \"{c['text']}...\" → {c['valence']}")

        print(f"\n📋 CONFIG VERSIONS")
        for ver, count in version_counts.items():
            print(f"   {ver}: {count} spans")

        print("=" * 60)

        return evaluation

    finally:
        await pool.close()


async def language_analysis(ignore_legacy: bool = False, latest_hours: int | None = None) -> dict[str, Any]:
    """
    Analyze UNMAPPED rates by language across all businesses/sectors.

    Queries pipeline.detected_spans_v2 grouped by (sector_code, language) and
    prints three sections: a per-sector table with raw and content-adjusted
    UNMAPPED rates, a summary of combinations whose content-adjusted rate
    exceeds 15% (with at least 20 content spans), and overall totals.

    "Content" spans are all spans minus those whose primitive is
    NON_INFORMATIVE; the content-adjusted rate is unmapped / content.

    Args:
        ignore_legacy: If True, exclude rows with language IN ('auto', 'unknown') or NULL
        latest_hours: If set (and non-zero), only include spans from the last N hours

    Returns:
        {"rows": [...]} with one dict per (sector_code, language) group, or
        {"error": "..."} when the query matches no rows.

    Raises:
        ValueError / TypeError: if latest_hours is truthy but not numeric.

    This helps determine if multilingual handling needs improvement
    (e.g., translation for non-English reviews).
    """
    pool = await asyncpg.create_pool(DB_URL)

    try:
        # Build WHERE clause based on filters. Only constant SQL fragments are
        # concatenated into the query text; caller-supplied values travel as
        # bound parameters so they can never be interpreted as SQL.
        where_clauses: list[str] = []
        query_args: list[Any] = []
        if ignore_legacy:
            where_clauses.append("language IS NOT NULL AND language NOT IN ('auto', 'unknown')")
        if latest_hours:
            # float() rejects anything non-numeric before it reaches the DB.
            query_args.append(float(latest_hours))
            where_clauses.append(
                f"created_at >= NOW() - (${len(query_args)}::float8 * INTERVAL '1 hour')"
            )

        where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""

        # Query UNMAPPED rates by language and sector
        rows = await pool.fetch(
            f"""
            SELECT
                sector_code,
                COALESCE(language, 'unknown') as language,
                COUNT(*) as total_spans,
                COUNT(*) FILTER (WHERE primitive = 'UNMAPPED') as unmapped_count,
                COUNT(*) FILTER (WHERE primitive = 'NON_INFORMATIVE') as non_informative_count,
                AVG(confidence) as avg_confidence
            FROM pipeline.detected_spans_v2
            {where_sql}
            GROUP BY sector_code, COALESCE(language, 'unknown')
            ORDER BY sector_code, total_spans DESC
            """,
            *query_args,
        )

        if not rows:
            return {"error": "No data in detected_spans_v2"}

        # Build report
        filter_desc = []
        if ignore_legacy:
            filter_desc.append("excluding legacy (auto/unknown)")
        if latest_hours:
            filter_desc.append(f"last {latest_hours}h only")
        filter_str = f" [{', '.join(filter_desc)}]" if filter_desc else ""

        print("\n" + "=" * 80)
        print(f"LANGUAGE ANALYSIS: UNMAPPED Rates by Language & Sector{filter_str}")
        print("=" * 80)
        print(f"{'':14} | {'total':>5} | {'content':>7} | {'unmapped':>8} | {'raw':>6} | {'adj':>6} |")
        print("-" * 80)

        # Rows arrive ordered by sector_code, so a sector header is printed
        # each time the sector changes.
        current_sector = None
        for row in rows:
            sector = row["sector_code"]
            lang = row["language"]
            total = row["total_spans"]
            unmapped = row["unmapped_count"]
            content = total - row["non_informative_count"]
            raw_rate = unmapped / total if total > 0 else 0
            adj_rate = unmapped / content if content > 0 else 0
            # avg_confidence is not shown in the table (it stays available in
            # the returned rows); it is deliberately not converted here because
            # AVG() yields NULL when every confidence in the group is NULL.

            if sector != current_sector:
                current_sector = sector
                print(f"\n📂 {sector}")

            flag = "⚠️" if adj_rate > 0.15 else "✓"
            print(f"   {lang:12} | {total:5} | {content:7} | {unmapped:8} | {raw_rate:5.1%} | {adj_rate:5.1%} | {flag}")

        # Summary
        print("\n" + "-" * 80)
        print("SUMMARY: Languages with high content-adjusted UNMAPPED (>15%, content >= 20)")
        print("-" * 80)

        # Collect (row, content, rate) once per group instead of recomputing
        # the same quantities in the filter and again when printing.
        high_unmapped = []
        for row in rows:
            content = row["total_spans"] - row["non_informative_count"]
            if content < 20:  # Minimum sample size
                continue
            rate = row["unmapped_count"] / content
            if rate > 0.15:
                high_unmapped.append((row, content, rate))

        if high_unmapped:
            for row, content, rate in high_unmapped:
                print(f"   {row['sector_code']}/{row['language']}: {rate:.1%} UNMAPPED ({content} content spans)")
            print("\n💡 Consider: Translation for these language/sector combinations")
        else:
            print("   ✓ No language/sector combinations exceed threshold")

        # Totals
        total_spans = sum(r["total_spans"] for r in rows)
        total_unmapped = sum(r["unmapped_count"] for r in rows)
        total_non_info = sum(r["non_informative_count"] for r in rows)
        total_content = total_spans - total_non_info

        print(f"\n📊 TOTALS: {total_spans} spans, {total_content} content, {total_unmapped} unmapped")
        # Say explicitly when a rate is undefined rather than printing a
        # blank line.
        if total_spans > 0:
            print(f"   Raw UNMAPPED rate: {total_unmapped/total_spans:.1%}")
        else:
            print("   Raw UNMAPPED rate: n/a (no spans)")
        if total_content > 0:
            print(f"   Content-adjusted UNMAPPED rate: {total_unmapped/total_content:.1%}")
        else:
            print("   Content-adjusted UNMAPPED rate: n/a (no content spans)")

        print("=" * 80)

        return {"rows": [dict(r) for r in rows]}

    finally:
        await pool.close()


def main():
    """
    Command-line entry point for the classification harness.

    Parses the CLI flags and runs exactly one mode, checked in this order:
    --language-analysis, then --evaluate, then --business. When none of
    them is given, the argparse help is printed followed by usage examples.
    """
    cli = argparse.ArgumentParser(description="Classification run harness V2")
    add = cli.add_argument
    add("--business", help="Business name or pattern")
    add("--limit", type=int, default=100, help="Max reviews to process")
    add("--dry-run", action="store_true", help="Don't store results")
    add("--evaluate", metavar="BUSINESS", help="Evaluate existing results")
    add("--language-analysis", action="store_true", help="Analyze UNMAPPED by language across all data")
    add("--ignore-legacy-language", action="store_true", help="Exclude rows with language='auto'/'unknown'/NULL")
    add("--latest-hours", type=int, help="Only include spans from last N hours")
    add("--use-existing", action="store_true", help="Use existing spans instead of jobs")
    add("--use-llm", action="store_true", help="Use real LLM classification (requires OPENAI_API_KEY)")
    add("--model", default="gpt-4o-mini", help="Model for LLM classification (default: gpt-4o-mini)")

    opts = cli.parse_args()

    # Cross-business language report takes precedence over everything else.
    if opts.language_analysis:
        asyncio.run(
            language_analysis(
                ignore_legacy=opts.ignore_legacy_language,
                latest_hours=opts.latest_hours,
            )
        )
        return

    # Report on spans that were already stored for one business.
    if opts.evaluate:
        asyncio.run(evaluate_business(opts.evaluate))
        return

    # Run a classification pass; --use-existing flips the job-based source off.
    if opts.business:
        asyncio.run(
            run_classification(
                opts.business,
                limit=opts.limit,
                dry_run=opts.dry_run,
                use_jobs=not opts.use_existing,
                use_llm=opts.use_llm,
                model=opts.model,
            )
        )
        return

    # No mode selected: show help plus a few copy-pasteable invocations.
    cli.print_help()
    example_lines = [
        "\n\nExamples:",
        "  # Mock classification (free, for testing)",
        "  python run_classification_v2.py --business gokarts --limit 50 --dry-run",
        "",
        "  # Real LLM classification",
        "  python run_classification_v2.py --business gokarts --limit 50 --use-llm",
        "",
        "  # Evaluate results",
        "  python run_classification_v2.py --evaluate gokarts",
    ]
    print("\n".join(example_lines))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()