Files
whyrating-engine-legacy/packages/reviewiq-pipeline/scripts/run_classification_v2.py
2026-02-02 18:19:00 +00:00

1103 lines
40 KiB
Python

#!/usr/bin/env python3
"""
Classification Run Harness V2
Runs classification on real reviews using resolved L1 config + sector brief.
Stores results to detected_spans_v2 with full config versioning.
Usage:
python run_classification_v2.py --business "Go Karts Mar Menor" --limit 100
python run_classification_v2.py --business "ClickRent Gran Canaria" --limit 100 --dry-run
python run_classification_v2.py --evaluate "Go Karts Mar Menor"
"""
import argparse
import asyncio
import hashlib
import json
import os
import re
import sys
import unicodedata
import uuid
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
import asyncpg
# Use standalone resolver to avoid package import issues
from config_resolver_standalone import ConfigResolver
# Optional LLM backend. If llm_classifier cannot be imported the harness
# records that fact in LLM_AVAILABLE and callers fall back to the mock path.
try:
    from llm_classifier import classify_review as llm_classify_review
except ImportError:
    LLM_AVAILABLE = False
else:
    LLM_AVAILABLE = True

# Postgres connection string; the DATABASE_URL environment variable wins over
# the built-in default.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://scraper:scraper123@localhost:5437/scraper")
# Non-informative review detection (score-based, conservative)
def _compute_text_stats(text: str) -> dict:
"""Compute character and token statistics for non-informative detection."""
if not text:
return {"empty": True}
text = text.strip()
total_chars = len(text)
if total_chars == 0:
return {"empty": True}
# Character counts by unicode category (reliable, no emoji heuristics)
alpha_chars = sum(1 for c in text if unicodedata.category(c).startswith('L'))
digit_chars = sum(1 for c in text if unicodedata.category(c).startswith('N'))
punct_chars = sum(1 for c in text if unicodedata.category(c).startswith('P'))
# Token stats
tokens = text.split()
token_count = len(tokens)
unique_tokens = len(set(t.lower() for t in tokens))
# Ratios
alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0
punct_ratio = punct_chars / total_chars if total_chars > 0 else 0
# Repetition check (same token repeated)
if tokens:
most_common_count = Counter(t.lower() for t in tokens).most_common(1)[0][1]
repetition_ratio = most_common_count / token_count if token_count > 0 else 0
else:
repetition_ratio = 0
return {
"empty": False,
"total_chars": total_chars,
"alpha_chars": alpha_chars,
"digit_chars": digit_chars,
"punct_chars": punct_chars,
"token_count": token_count,
"unique_tokens": unique_tokens,
"alpha_ratio": alpha_ratio,
"punct_ratio": punct_ratio,
"repetition_ratio": repetition_ratio,
}
# High-confidence detector for strings that carry no content at all.
# Purely structural (no word lists); every alternative is anchored at both
# ends, so it has to account for the whole string.
PURE_JUNK_RE = re.compile(
    "|".join((
        # Only punctuation / whitespace
        r'^[\s\.\!\?\,\-\_\~\*\#\@]+$',
        # Only emoji plus basic punctuation
        r'^[\U0001F300-\U0001F9FF\U0001FA00-\U0001FAFF\U00002600-\U000027BF\s\.\!\?]+$',
        # Translation artifacts
        r'^(translated by google|traducido por google)[\.\s]*$',
    )),
    re.IGNORECASE,
)
def is_non_informative(text: str) -> tuple[bool, str]:
    """Decide, conservatively, whether a review is pure noise.

    The LLM is skipped only when the text is junk with very high certainty;
    anything doubtful is reported as informative so that real content is
    never dropped (keeping noise is preferred over losing content).

    Returns:
        ``(flag, reason)`` where ``reason`` is one of ``"empty"``,
        ``"junk_pattern"``, ``"no_content"``, ``"pure_repetition"`` when the
        flag is True, and ``""`` when it is False.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        return True, "empty"

    stats = _compute_text_stats(cleaned)
    if stats.get("empty"):
        return True, "empty"

    # Rule A: structural junk (emoji-only, punctuation-only, translation artifact)
    if PURE_JUNK_RE.match(cleaned):
        return True, "junk_pattern"

    # Rule B: neither a letter nor a number anywhere in the text
    if not (stats["alpha_chars"] or stats["digit_chars"]):
        return True, "no_content"

    # Rule C: one short token repeated three or more times ("good good good")
    single_token_repeated = stats["unique_tokens"] == 1 and stats["token_count"] >= 3
    if single_token_repeated and stats["alpha_chars"] < 20:
        return True, "pure_repetition"

    # Typo-heavy, very short but meaningful, etc. all go to the LLM.
    return False, ""
# Classification prompt template.
# Rendered with str.format() by build_classification_prompt(), which supplies
# business_id, sector_code, gbp_path, primitives_list, brief_signals, rating
# and review_text. The doubled braces ({{ and }}) are str.format escapes and
# come out as literal braces in the JSON example shown to the model.
CLASSIFICATION_PROMPT = """You are a review classifier using primitive-based analysis.
## TASK
Extract semantic spans from this review and classify each span to exactly ONE primitive.
## BUSINESS CONTEXT
- Business: {business_id}
- Sector: {sector_code}
- Path: {gbp_path}
## ENABLED PRIMITIVES (use ONLY these)
{primitives_list}
## SECTOR SIGNALS (what customers typically judge)
{brief_signals}
## RULES
1. Extract 1-5 spans per review (prefer fewer, larger spans)
2. Each span gets exactly ONE primitive (the most specific match)
3. If nothing fits with confidence ≥ 0.5, use UNMAPPED with keywords
4. Valence: + (positive), - (negative), 0 (neutral), ± (mixed)
5. Intensity: 1 (low), 2 (moderate), 3 (high/extreme)
6. Detail: 1 (vague), 2 (some detail), 3 (specific/actionable)
## OUTPUT FORMAT (JSON only, no markdown)
{{
"spans": [
{{
"text": "exact text from review",
"start": 0,
"end": 25,
"primitive": "MANNER",
"valence": "+",
"intensity": 2,
"detail": 2,
"confidence": 0.85,
"entity": null,
"entity_type": null
}}
]
}}
## REVIEW TO CLASSIFY
Rating: {rating}/5
Text: {review_text}
Return JSON only."""
async def resolve_business_id(pool: asyncpg.Pool, search_term: str) -> str | None:
    """Resolve a partial business name/pattern to a canonical business_id.

    Looks the term up in ``pipeline.business_taxonomy_map``: first as an
    exact match, then through a series of increasingly loose
    case-insensitive LIKE patterns, tried in priority order.

    Args:
        pool: asyncpg pool used for the lookups.
        search_term: Exact id, partial name, or concatenated name
            (e.g. "gokarts", "GoKarts", "go karts").

    Returns:
        The matching ``business_id``, or None when nothing matches. When a
        pattern matches several rows, the alphabetically first business_id
        is returned so repeated runs resolve to the same business.
    """
    exact = await pool.fetchrow(
        "\n"
        "SELECT business_id\n"
        "FROM pipeline.business_taxonomy_map\n"
        "WHERE business_id = $1\n",
        search_term,
    )
    if exact:
        return exact["business_id"]

    # Candidate patterns, loosest last.
    candidates = [
        f"%{search_term}%",                    # plain substring
        f"%{search_term.replace(' ', '%')}%",  # "go karts" -> "%go%karts%"
    ]

    # CamelCase input: put a wildcard at each lower->upper transition,
    # e.g. "GoKarts" -> "%Go%Karts%".
    camel = re.sub(r'([a-z])([A-Z])', r'\1%\2', search_term)
    if camel != search_term:
        candidates.append(f"%{camel}%")

    # All-lowercase concatenations: try every split point that leaves at
    # least two characters on each side, e.g. "gokarts" -> "%go%karts%".
    if search_term.islower() and len(search_term) >= 4:
        candidates.extend(
            f"%{search_term[:cut]}%{search_term[cut:]}%"
            for cut in range(2, len(search_term) - 1)
        )

    # dict.fromkeys drops duplicate patterns (the first two are identical
    # for a term without spaces) while keeping priority order, so no query
    # is sent twice.
    for pattern in dict.fromkeys(candidates):
        row = await pool.fetchrow(
            """
            SELECT business_id
            FROM pipeline.business_taxonomy_map
            WHERE LOWER(business_id) LIKE LOWER($1)
            ORDER BY business_id
            LIMIT 1
            """,
            pattern,
        )
        if row:
            return row["business_id"]
    return None
async def fetch_reviews_for_business(
    pool: asyncpg.Pool,
    business_id: str,
    limit: int = 100,
) -> list[dict]:
    """Fetch one sample row per review from ``pipeline.review_spans``.

    Existing spans stand in for the reviews themselves: for each review_id
    the span with the lowest ``id`` is returned as ``sample_text``. (In
    production this would read from reviews_raw or jobs.reviews_data.)

    Args:
        pool: asyncpg pool used for the query.
        business_id: Canonical business id to filter on.
        limit: Maximum number of reviews returned.

    Returns:
        A list of dicts with ``review_id``, ``business_id`` and
        ``sample_text`` keys.
    """
    sql = (
        "\n"
        "SELECT DISTINCT ON (review_id)\n"
        "review_id,\n"
        "business_id,\n"
        "span_text as sample_text\n"
        "FROM pipeline.review_spans\n"
        "WHERE business_id = $1\n"
        "ORDER BY review_id, id\n"
        "LIMIT $2\n"
    )
    records = await pool.fetch(sql, business_id, limit)
    return list(map(dict, records))
async def fetch_reviews_from_jobs(
    pool: asyncpg.Pool,
    business_pattern: str,
    limit: int = 100,
) -> list[dict]:
    """Fetch reviews unpacked from the ``reviews_data`` JSON of completed jobs.

    A job is selected when its URL or its ``metadata.business_name`` contains
    ``business_pattern`` (case-insensitive).

    Args:
        pool: asyncpg pool used for the query.
        business_pattern: Substring to look for in the job URL / business name.
        limit: Maximum number of review rows returned.

    Returns:
        A list of dicts with ``job_id``, ``review_id``, ``text``, ``rating``
        and ``author``. Fields missing from a review are defaulted to a
        positional id, an empty text, rating 5 and "Anonymous".
    """
    sql = (
        "\n"
        "SELECT\n"
        "j.job_id,\n"
        "j.url,\n"
        "jsonb_array_elements(j.reviews_data) as review\n"
        "FROM public.jobs j\n"
        "WHERE j.reviews_data IS NOT NULL\n"
        "AND (LOWER(j.url) LIKE $1 OR LOWER(j.metadata->>'business_name') LIKE $1)\n"
        "AND j.status = 'completed'\n"
        "LIMIT $2\n"
    )
    like_arg = f"%{business_pattern.lower()}%"
    records = await pool.fetch(sql, like_arg, limit)

    collected: list[dict] = []
    for position, record in enumerate(records):
        payload = record["review"]
        # The element may arrive as a JSON string instead of a decoded dict.
        if isinstance(payload, str):
            payload = json.loads(payload)
        collected.append({
            "job_id": str(record["job_id"]),
            "review_id": payload.get("review_id", f"rev-{position}"),
            "text": payload.get("text", ""),
            "rating": payload.get("rating", 5),
            "author": payload.get("author", "Anonymous"),
        })
    return collected
def build_classification_prompt(
    review: dict,
    config: dict,
) -> str:
    """Render CLASSIFICATION_PROMPT for one review using the resolved config.

    Args:
        review: Review dict; ``text`` (or ``sample_text``) and ``rating`` are read.
        config: Resolved config; ``enabled_primitives``, ``primitives``,
            ``weights``, ``business_id``, ``sector_code``, ``gbp_path`` and the
            optional ``brief`` are read.

    Returns:
        The fully formatted prompt string.
    """

    def describe(prim: str) -> str:
        # One bullet per primitive: its definition (or name, or the code
        # itself) plus the weight when one is configured.
        info = config["primitives"].get(prim, {})
        weight = config["weights"].get(prim)
        suffix = f" (weight: {weight}x)" if weight else ""
        return f"- {prim}: {info.get('def', info.get('name', prim))}{suffix}"

    primitive_lines = [describe(prim) for prim in sorted(config["enabled_primitives"])]

    # Sector brief: at most the first three "what customers judge" entries.
    signal_lines = []
    judged = config.get("brief", {}).get("what_customers_judge")
    if judged:
        if isinstance(judged, dict):
            judged = judged.get("items", [])
        for entry in judged[:3]:
            if isinstance(entry, dict):
                signal_lines.append(f"- {entry.get('aspect', entry.get('area', ''))}")
            else:
                signal_lines.append(f"- {entry}")

    if signal_lines:
        signals_text = "\n".join(signal_lines)
    else:
        signals_text = "No specific signals"

    return CLASSIFICATION_PROMPT.format(
        business_id=config["business_id"],
        sector_code=config["sector_code"],
        gbp_path=config["gbp_path"],
        primitives_list="\n".join(primitive_lines),
        brief_signals=signals_text,
        rating=review.get("rating", "?"),
        review_text=review.get("text", review.get("sample_text", "")),
    )
async def classify_review_mock(
review: dict,
config: dict,
) -> ClassificationResult:
"""
Mock classification for dry-run testing.
Returns a synthetic result based on simple heuristics.
"""
text = review.get("text", review.get("sample_text", ""))
rating = review.get("rating", 3)
# Simple heuristic classification
spans = []
# Detect some patterns
if any(word in text.lower() for word in ["friendly", "nice", "helpful", "great staff"]):
spans.append({
"text": text[:50] + "..." if len(text) > 50 else text,
"start": 0,
"end": min(50, len(text)),
"primitive": "MANNER",
"valence": "+",
"intensity": 2,
"detail": 2,
"confidence": 0.75,
})
elif any(word in text.lower() for word in ["rude", "unfriendly", "ignored"]):
spans.append({
"text": text[:50] + "..." if len(text) > 50 else text,
"start": 0,
"end": min(50, len(text)),
"primitive": "MANNER",
"valence": "-",
"intensity": 2,
"detail": 2,
"confidence": 0.75,
})
elif any(word in text.lower() for word in ["wait", "slow", "fast", "quick"]):
valence = "+" if rating >= 4 else "-"
spans.append({
"text": text[:50] + "..." if len(text) > 50 else text,
"start": 0,
"end": min(50, len(text)),
"primitive": "SPEED",
"valence": valence,
"intensity": 2,
"detail": 2,
"confidence": 0.70,
})
else:
# Default based on rating
spans.append({
"text": text[:50] + "..." if len(text) > 50 else text,
"start": 0,
"end": min(50, len(text)),
"primitive": "VALUE_FOR_MONEY" if rating >= 4 else "UNMAPPED",
"valence": "+" if rating >= 4 else "-" if rating <= 2 else "0",
"intensity": 1,
"detail": 1,
"confidence": 0.50,
"unmapped_keywords": ["general"] if rating < 4 else None,
})
return ClassificationResult(
review_id=review.get("review_id", "unknown"),
business_id=config["business_id"],
config_version=config["config_version"],
spans=spans,
raw_response=json.dumps({"spans": spans}),
detected_language="en", # Mock assumes English
language_confidence=0.5,
)
@dataclass
class ClassificationResult:
"""Result from classifying a single review."""
review_id: str
business_id: str
config_version: str
spans: list[dict]
raw_response: str
error: str | None = None
detected_language: str | None = None
language_confidence: float | None = None
async def classify_review_llm(
    review: dict,
    config: dict,
    model: str | None = None,
) -> ClassificationResult:
    """Classify one review with the real LLM classifier.

    Obviously non-informative reviews are answered locally with a single
    NON_INFORMATIVE span (no LLM call, no cost). Everything else is passed
    to ``llm_classifier.classify_review`` and its response is converted to
    the span format stored by this harness.

    Args:
        review: Review dict; ``text`` (or ``sample_text``), ``rating`` and
            ``review_id`` are read.
        config: Resolved config, forwarded to the classifier.
        model: Model name forwarded to the classifier.

    Returns:
        A ClassificationResult with the converted spans, the raw response
        for the audit trail, and the detected language.

    Raises:
        RuntimeError: If the LLM classifier module could not be imported.
    """
    if not LLM_AVAILABLE:
        raise RuntimeError("LLM classifier not available. Check OPENAI_API_KEY.")

    text = review.get("text", review.get("sample_text", ""))
    rating = review.get("rating")

    # Check for non-informative reviews (skip LLM, save cost)
    non_informative, reason = is_non_informative(text)
    if non_informative:
        return ClassificationResult(
            review_id=review.get("review_id", "unknown"),
            business_id=config["business_id"],
            config_version=config["config_version"],
            spans=[{
                "text": text[:100] if text else "",
                "start": 0,
                "end": min(100, len(text)) if text else 0,
                "primitive": "NON_INFORMATIVE",
                "valence": "0",
                "intensity": 1,
                "detail": 1,
                "confidence": 1.0,
                "unmapped_keywords": [reason],
            }],
            raw_response=json.dumps({"non_informative": True, "reason": reason}),
            detected_language="unknown",
            language_confidence=0.0,
        )

    # NOTE: this call is not awaited, so it runs synchronously inside the
    # coroutine.
    result = llm_classify_review(
        review_text=text,
        rating=rating,
        config=config,
        language="auto",  # Auto-detect
        model=model,
    )

    detected_lang = result.get("detected_language", "unknown")
    lang_confidence = result.get("language_confidence", 0.0)

    # Classifier valence words -> the symbols stored by this harness.
    # Anything unrecognised is stored as neutral.
    valence_symbols = {"positive": "+", "negative": "-", "mixed": "±", "neutral": "0"}

    spans = []
    # ``or []`` tolerates an explicit null in the classifier response.
    for span in result.get("spans") or []:
        valence = valence_symbols.get(span.get("valence", "neutral"), "0")

        # The classifier reports intensity on a 1-5 scale. A null or
        # non-numeric value falls back to the same default as a missing key
        # instead of crashing the whole run.
        try:
            raw_intensity = int(span.get("intensity", 2))
        except (TypeError, ValueError):
            raw_intensity = 2

        # Detail is derived from the raw 1-5 intensity...
        if raw_intensity >= 4:
            detail = 3
        elif raw_intensity >= 2:
            detail = 2
        else:
            detail = 1
        # ...and intensity itself is compressed from 1-5 to 1-3.
        intensity = min(3, max(1, (raw_intensity + 1) // 2))

        details = span.get("details") or {}
        spans.append({
            "text": span.get("evidence", text[:100]),
            "start": span.get("start_char"),
            "end": span.get("end_char"),
            "primitive": span.get("primitive", "UNMAPPED"),
            "valence": valence,
            "intensity": intensity,
            "detail": detail,
            "confidence": span.get("confidence", 0.5),
            "entity": details.get("entity"),
            "entity_type": details.get("entity_type"),
        })

    # Items the classifier could not map become explicit UNMAPPED spans.
    for unmapped in result.get("unmapped") or []:
        spans.append({
            "text": unmapped.get("evidence", ""),
            "start": None,
            "end": None,
            "primitive": "UNMAPPED",
            "valence": "0",
            "intensity": 1,
            "detail": 1,
            "confidence": unmapped.get("confidence", 0.3),
            "unmapped_keywords": [unmapped.get("label", "unknown")],
        })

    # Only the first warning is surfaced as the result's error.
    warnings = result.get("warnings")
    return ClassificationResult(
        review_id=review.get("review_id", "unknown"),
        business_id=config["business_id"],
        config_version=config["config_version"],
        spans=spans,
        raw_response=result.get("raw_response", "{}"),
        error=warnings[0] if warnings else None,
        detected_language=detected_lang,
        language_confidence=lang_confidence,
    )
async def store_spans(
    pool: asyncpg.Pool,
    result: ClassificationResult,
    config: dict,
    job_id: str | None = None,
    model: str | None = None,
    review_hash: str | None = None,
    language: str | None = None,
    run_id: uuid.UUID | None = None,
) -> int:
    """Store classified spans to detected_spans_v2 with full audit trail.

    One row is inserted per span; every row carries the same review-level
    metadata (config version, model, raw response, review hash, language,
    run id).

    Args:
        pool: asyncpg pool used for the inserts.
        result: Classification result whose spans are stored.
        config: Resolved config; ``gbp_path`` and ``sector_code`` are read.
        job_id: Originating job id, if the review came from a job.
        model: Name of the model (or "mock") that produced the spans.
        review_hash: Hash identifying the (config version, review text) pair.
        language: Detected review language.
        run_id: Identifier shared by all spans of one classification run.

    Returns:
        The number of rows inserted.
    """
    insert_sql = (
        "\n"
        "INSERT INTO pipeline.detected_spans_v2 (\n"
        "job_id, business_id, review_id, gbp_path, sector_code,\n"
        "config_version, primitive, valence, intensity, detail,\n"
        "mode, confidence, span_text, span_start, span_end,\n"
        "unmapped_keywords, entity, entity_type,\n"
        "model, raw_response, review_hash, language, run_id\n"
        ") VALUES (\n"
        "$1, $2, $3, $4::ltree, $5,\n"
        "$6, $7, $8, $9, $10,\n"
        "$11, $12, $13, $14, $15,\n"
        "$16, $17, $18,\n"
        "$19, $20, $21, $22, $23\n"
        ")\n"
    )

    # Normalise the raw response to a JSON document once, not once per span.
    # The response is not guaranteed to be valid JSON (run_classification
    # already treats it that way), so an unparseable payload is wrapped
    # under an "unparsed" key rather than aborting the insert and losing
    # the audit trail.
    raw_json = None
    if result.raw_response:
        try:
            raw_json = json.dumps(json.loads(result.raw_response))
        except (TypeError, ValueError):
            raw_json = json.dumps({"unparsed": str(result.raw_response)})

    count = 0
    for span in result.spans:
        await pool.execute(
            insert_sql,
            job_id,
            result.business_id,
            result.review_id,
            config["gbp_path"],
            config["sector_code"],
            result.config_version,
            span["primitive"],
            span["valence"],
            span.get("intensity"),
            span.get("detail"),
            span.get("mode"),
            span["confidence"],
            span["text"],
            span.get("start"),
            span.get("end"),
            span.get("unmapped_keywords"),
            span.get("entity"),
            span.get("entity_type"),
            model,
            raw_json,  # Store as JSONB
            review_hash,
            language,
            run_id,
        )
        count += 1
    return count
async def run_classification(
    business_id: str,
    limit: int = 100,
    dry_run: bool = False,
    use_jobs: bool = True,
    use_llm: bool = False,
    model: str | None = None,
) -> dict[str, Any]:
    """Run the classification pipeline for a business.

    Resolves the business and its config, fetches reviews, classifies each
    one (LLM or mock), stores the spans unless ``dry_run`` is set, and
    prints and returns summary statistics.

    Args:
        business_id: Business name or URL pattern.
        limit: Max reviews to process.
        dry_run: If True, don't store results.
        use_jobs: If True, fetch from jobs.reviews_data; otherwise reuse
            existing spans as review samples.
        use_llm: If True, use real LLM classification (requires OPENAI_API_KEY).
        model: Model to use for LLM classification (default: gpt-4o-mini).

    Returns:
        Summary statistics, or a dict with an ``"error"`` key when the
        classifier is unavailable, the business cannot be resolved or
        mapped, or no reviews are found.
    """
    if use_llm and not LLM_AVAILABLE:
        return {"error": "LLM classifier not available. Set OPENAI_API_KEY."}

    # A blank term would turn into the LIKE pattern "%%", which matches an
    # arbitrary business, so it is rejected before touching the database.
    search_words = business_id.split()
    if not search_words:
        return {"error": f"Business not found matching: {business_id}"}

    model = model or "gpt-4o-mini"
    pool = await asyncpg.create_pool(DB_URL)
    try:
        # Resolve config
        resolver = ConfigResolver()
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}
        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # One run_id ties together every span written by this invocation.
        run_id = uuid.uuid4()
        print(f"\n📋 Resolved config for: {config['business_id']}")
        print(f" Sector: {config['sector_code']}")
        print(f" Config version: {config['config_version']}")
        print(f" Enabled primitives: {len(config['enabled_primitives'])}")
        print(f" Weights: {len(config['weights'])}")
        print(f" Run ID: {run_id}")

        # Fetch reviews
        if use_jobs:
            # Use at most the first two words of the search term for matching.
            url_pattern = " ".join(search_words[:2])
            reviews = await fetch_reviews_from_jobs(pool, url_pattern, limit)
        else:
            reviews = await fetch_reviews_for_business(pool, config["business_id"], limit)
        print(f"\n📥 Fetched {len(reviews)} reviews")
        if not reviews:
            return {"error": "No reviews found", "config": config}

        # Classify reviews
        results = []
        total_tokens = {"prompt": 0, "completion": 0}
        classifier_type = "LLM" if use_llm else "MOCK"
        print(f"\n🔄 Classifying with {classifier_type} classifier...")
        if use_llm:
            print(f" Model: {model}")

        for i, review in enumerate(reviews):
            if i % 20 == 0:
                print(f" Processing review {i+1}/{len(reviews)}...")

            if use_llm:
                result = await classify_review_llm(review, config, model)
                # Token accounting is best-effort: the raw response may be
                # missing, not JSON, or lack usable token counts. Only those
                # expected failures are ignored.
                try:
                    raw = json.loads(result.raw_response)
                    if "tokens" in raw:
                        total_tokens["prompt"] += raw["tokens"].get("prompt", 0)
                        total_tokens["completion"] += raw["tokens"].get("completion", 0)
                except (TypeError, ValueError, AttributeError):
                    pass
            else:
                result = await classify_review_mock(review, config)
            results.append(result)

            # Store if not dry run
            if not dry_run:
                # The hash covers config version + text, so the same review
                # under a different config gets a different hash.
                text = review.get("text", review.get("sample_text", ""))
                review_hash = hashlib.sha256(
                    f"{config['config_version']}:{text}".encode()
                ).hexdigest()[:16]
                detected_lang = result.detected_language or "unknown"
                await store_spans(
                    pool, result, config,
                    job_id=review.get("job_id"),
                    model=model if use_llm else "mock",
                    review_hash=review_hash,
                    language=detected_lang,
                    run_id=run_id,
                )

        # Calculate stats
        all_spans = [s for r in results for s in r.spans]
        primitive_counts = Counter(s["primitive"] for s in all_spans)
        valence_counts = Counter(s["valence"] for s in all_spans)
        unmapped_count = primitive_counts.get("UNMAPPED", 0)
        non_informative_count = primitive_counts.get("NON_INFORMATIVE", 0)

        # Language distribution (for multilingual tracking)
        language_counts = Counter(r.detected_language or 'unknown' for r in results)

        # Content spans = total - non-informative
        content_spans = len(all_spans) - non_informative_count
        content_unmapped_rate = unmapped_count / content_spans if content_spans > 0 else 0

        stats = {
            "run_id": str(run_id),
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "config_version": config["config_version"],
            "l2_applied": config.get("l2_applied"),
            "classifier": classifier_type,
            "model": model if use_llm else "mock",
            "reviews_processed": len(reviews),
            "spans_created": len(all_spans),
            "non_informative_count": non_informative_count,
            "content_spans": content_spans,
            "unmapped_count": unmapped_count,
            "raw_unmapped_rate": unmapped_count / len(all_spans) if all_spans else 0,
            "content_unmapped_rate": content_unmapped_rate,
            "top_primitives": primitive_counts.most_common(10),
            "valence_distribution": dict(valence_counts),
            "language_distribution": dict(language_counts),
            "dry_run": dry_run,
        }
        if use_llm:
            stats["tokens"] = total_tokens
            # Rough cost estimate for gpt-4o-mini (USD per million tokens);
            # the same rates are applied whatever model was requested.
            cost = (total_tokens["prompt"] * 0.15 + total_tokens["completion"] * 0.60) / 1_000_000
            stats["estimated_cost_usd"] = round(cost, 4)

        print(f"\n📊 Results:")
        print(f" Reviews: {stats['reviews_processed']}")
        print(f" Total spans: {stats['spans_created']}")
        print(f" NON_INFORMATIVE: {non_informative_count} ({100*non_informative_count/len(all_spans):.1f}%)" if all_spans else "")
        print(f" Content spans: {content_spans}")
        print(f" UNMAPPED (of content): {unmapped_count} ({content_unmapped_rate:.1%})")
        if use_llm and "estimated_cost_usd" in stats:
            print(f" Estimated cost: ${stats['estimated_cost_usd']:.4f}")

        # Print language distribution
        print(f"\n 🌐 Languages detected:")
        for lang, count in language_counts.most_common(5):
            pct = count / len(results) * 100 if results else 0
            print(f" {lang}: {count} ({pct:.1f}%)")

        print(f"\n Top primitives:")
        for prim, count in stats["top_primitives"][:5]:
            print(f" {prim}: {count}")
        return stats
    finally:
        await pool.close()
async def evaluate_business(business_id: str) -> dict[str, Any]:
    """Evaluate stored classification results for a business.

    Reads spans from ``pipeline.detected_spans_v2`` (only the most recent run
    when run ids exist, otherwise all spans), then computes and prints the
    C4 evaluation metrics:

    1. UNMAPPED rate (overall and per language)
    2. Top primitives distribution
    3. Contradiction detection (keyword heuristic vs. stored valence)
    4. Coverage by config version
    5. Confidence distribution

    Args:
        business_id: Business name or pattern, resolved via the database.

    Returns:
        The evaluation dict, or a dict with an ``"error"`` key when the
        business cannot be resolved or mapped, or has no spans.
    """
    pool = await asyncpg.create_pool(DB_URL)
    try:
        # Get business mapping
        resolver = ConfigResolver()
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}
        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # Get latest run_id for this business (if exists)
        latest_run = await pool.fetchrow(
            "\n"
            "SELECT run_id, MAX(created_at) as latest\n"
            "FROM pipeline.detected_spans_v2\n"
            "WHERE business_id = $1 AND run_id IS NOT NULL\n"
            "GROUP BY run_id\n"
            "ORDER BY latest DESC\n"
            "LIMIT 1\n",
            config["business_id"],
        )
        run_id_value = None
        if latest_run and latest_run["run_id"]:
            run_id_value = latest_run["run_id"]
            print(f"📎 Using latest run: {run_id_value}")

        # Both queries are fixed text; nothing is interpolated into the SQL.
        select_sql = (
            "\n"
            "SELECT primitive, valence, intensity, confidence, span_text, config_version, language, run_id\n"
            "FROM pipeline.detected_spans_v2\n"
        )
        if run_id_value:
            spans = await pool.fetch(
                select_sql
                + "WHERE business_id = $1 AND run_id = $2\n"
                + "ORDER BY created_at DESC\n",
                config["business_id"],
                run_id_value,
            )
        else:
            # Fallback to all spans if no run_id exists (legacy data)
            spans = await pool.fetch(
                select_sql
                + "WHERE business_id = $1\n"
                + "ORDER BY created_at DESC\n",
                config["business_id"],
            )
            print("⚠️ No run_id found, using all spans (legacy data)")
        if not spans:
            return {"error": "No spans found in detected_spans_v2"}
        spans = [dict(s) for s in spans]

        # 1. UNMAPPED rate (overall)
        unmapped = [s for s in spans if s["primitive"] == "UNMAPPED"]
        unmapped_rate = len(unmapped) / len(spans)

        # 2. Top primitives
        primitive_counts = Counter(s["primitive"] for s in spans)

        # 2b. UNMAPPED rate by language, accumulated in a single pass.
        language_stats: dict[str, dict[str, Any]] = {}
        for s in spans:
            bucket = language_stats.setdefault(
                s.get("language") or "unknown",
                {"total": 0, "unmapped": 0, "non_informative": 0},
            )
            bucket["total"] += 1
            if s["primitive"] == "UNMAPPED":
                bucket["unmapped"] += 1
            elif s["primitive"] == "NON_INFORMATIVE":
                bucket["non_informative"] += 1
        for bucket in language_stats.values():
            # NON_INFORMATIVE spans are excluded from the rate's denominator.
            content_spans = bucket["total"] - bucket["non_informative"]
            bucket["content_spans"] = content_spans
            bucket["unmapped_rate"] = bucket["unmapped"] / content_spans if content_spans > 0 else 0

        # 3. Contradiction detection (simple heuristics)
        # Keywords are compared against whole words, not substrings;
        # substring matching would flag e.g. "whatever" (contains "hate")
        # or "glove" (contains "love").
        contradictions = []
        positive_words = {"great", "amazing", "excellent", "wonderful", "best", "love", "perfect"}
        negative_words = {"terrible", "awful", "worst", "horrible", "hate", "never"}
        for span in spans:
            span_text = span["span_text"] or ""  # tolerate NULL span_text
            words = set(re.findall(r"\w+", span_text.lower()))
            valence = span["valence"]
            has_positive = not positive_words.isdisjoint(words)
            has_negative = not negative_words.isdisjoint(words)
            if has_positive and valence == "-":
                contradictions.append({
                    "type": "positive_text_negative_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })
            elif has_negative and valence == "+":
                contradictions.append({
                    "type": "negative_text_positive_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })

        # 4. Config version coverage
        version_counts = Counter(s["config_version"] for s in spans)

        # 5. Confidence distribution
        confidences = [float(s["confidence"]) for s in spans]
        avg_confidence = sum(confidences) / len(confidences)
        low_confidence = sum(1 for c in confidences if c < 0.5)

        evaluation = {
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "total_spans": len(spans),
            "unmapped_rate": unmapped_rate,
            "unmapped_count": len(unmapped),
            "top_primitives": primitive_counts.most_common(10),
            "contradiction_count": len(contradictions),
            "contradictions_sample": contradictions[:5],
            "config_versions": dict(version_counts),
            "avg_confidence": avg_confidence,
            "low_confidence_count": low_confidence,
            "language_stats": language_stats,
        }

        # Print report
        print("\n" + "=" * 60)
        print(f"EVALUATION: {config['business_id']}")
        print("=" * 60)
        print(f"\n📊 METRICS")
        print(f" Total spans: {len(spans)}")
        print(f" UNMAPPED rate: {unmapped_rate:.1%} {'⚠️' if unmapped_rate > 0.2 else ''}")
        print(f" Avg confidence: {avg_confidence:.2f}")
        print(f" Low confidence (<0.5): {low_confidence}")
        print(f"\n🌐 UNMAPPED BY LANGUAGE")
        for lang, stats in sorted(language_stats.items(), key=lambda x: x[1]["total"], reverse=True):
            rate = stats["unmapped_rate"]
            flag = "⚠️" if rate > 0.15 else ""
            print(f" {lang}: {stats['unmapped']}/{stats['content_spans']} ({rate:.1%}) {flag}")
        print(f"\n🔝 TOP PRIMITIVES")
        for prim, count in primitive_counts.most_common(8):
            pct = count / len(spans) * 100
            print(f" {prim}: {count} ({pct:.1f}%)")
        print(f"\n⚠️ CONTRADICTIONS: {len(contradictions)}")
        for c in contradictions[:3]:
            print(f" {c['type']}: \"{c['text']}...\"{c['valence']}")
        print(f"\n📋 CONFIG VERSIONS")
        for ver, count in version_counts.items():
            print(f" {ver}: {count} spans")
        print("=" * 60)
        return evaluation
    finally:
        await pool.close()
async def language_analysis(ignore_legacy: bool = False, latest_hours: int | None = None) -> dict[str, Any]:
    """Analyze UNMAPPED rates by language across all businesses/sectors.

    Helps determine whether multilingual handling needs improvement
    (e.g., translation for non-English reviews). Prints a per-sector table,
    a summary of language/sector pairs above the threshold, and totals.

    Args:
        ignore_legacy: If True, exclude rows with language IN ('auto', 'unknown') or NULL.
        latest_hours: If set (and non-zero), only include spans from the last N hours.

    Returns:
        ``{"rows": [...]}`` with one dict per (sector, language) group, or a
        dict with an ``"error"`` key when the query returns nothing.
    """
    pool = await asyncpg.create_pool(DB_URL)
    try:
        # Build WHERE clause based on filters. The hour count travels as a
        # bound parameter; it is never spliced into the SQL text.
        where_clauses = []
        query_args: list[Any] = []
        if ignore_legacy:
            where_clauses.append("language IS NOT NULL AND language NOT IN ('auto', 'unknown')")
        if latest_hours:
            query_args.append(int(latest_hours))
            where_clauses.append(
                f"created_at >= NOW() - (${len(query_args)}::int * INTERVAL '1 hour')"
            )
        where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""

        # Query UNMAPPED rates by language and sector
        query = (
            "\n"
            "SELECT\n"
            "sector_code,\n"
            "COALESCE(language, 'unknown') as language,\n"
            "COUNT(*) as total_spans,\n"
            "COUNT(*) FILTER (WHERE primitive = 'UNMAPPED') as unmapped_count,\n"
            "COUNT(*) FILTER (WHERE primitive = 'NON_INFORMATIVE') as non_informative_count,\n"
            "AVG(confidence) as avg_confidence\n"
            "FROM pipeline.detected_spans_v2\n"
            + where_sql
            + "\n"
            "GROUP BY sector_code, COALESCE(language, 'unknown')\n"
            "ORDER BY sector_code, total_spans DESC\n"
        )
        rows = await pool.fetch(query, *query_args)
        if not rows:
            return {"error": "No data in detected_spans_v2"}

        # Build report
        filter_desc = []
        if ignore_legacy:
            filter_desc.append("excluding legacy (auto/unknown)")
        if latest_hours:
            filter_desc.append(f"last {latest_hours}h only")
        filter_str = f" [{', '.join(filter_desc)}]" if filter_desc else ""

        print("\n" + "=" * 80)
        print(f"LANGUAGE ANALYSIS: UNMAPPED Rates by Language & Sector{filter_str}")
        print("=" * 80)
        print(f"{'':14} | {'total':>5} | {'content':>7} | {'unmapped':>8} | {'raw':>6} | {'adj':>6} |")
        print("-" * 80)

        current_sector = None
        for row in rows:
            sector = row["sector_code"]
            lang = row["language"]
            total = row["total_spans"]
            unmapped = row["unmapped_count"]
            # "content" = spans that are not NON_INFORMATIVE; the adjusted
            # rate uses it as denominator.
            content = total - row["non_informative_count"]
            raw_rate = unmapped / total if total > 0 else 0
            adj_rate = unmapped / content if content > 0 else 0
            if sector != current_sector:
                current_sector = sector
                print(f"\n📂 {sector}")
            flag = "⚠️" if adj_rate > 0.15 else ""
            print(f" {lang:12} | {total:5} | {content:7} | {unmapped:8} | {raw_rate:5.1%} | {adj_rate:5.1%} | {flag}")

        # Summary
        print("\n" + "-" * 80)
        print("SUMMARY: Languages with high content-adjusted UNMAPPED (>15%, content >= 20)")
        print("-" * 80)
        high_unmapped = [
            row for row in rows
            if (row["total_spans"] - row["non_informative_count"]) >= 20  # Minimum sample size
            and row["unmapped_count"] / (row["total_spans"] - row["non_informative_count"]) > 0.15
        ]
        if high_unmapped:
            for row in high_unmapped:
                content = row["total_spans"] - row["non_informative_count"]
                rate = row["unmapped_count"] / content
                print(f" {row['sector_code']}/{row['language']}: {rate:.1%} UNMAPPED ({content} content spans)")
            print("\n💡 Consider: Translation for these language/sector combinations")
        else:
            print(" ✓ No language/sector combinations exceed threshold")

        # Totals
        total_spans = sum(r["total_spans"] for r in rows)
        total_unmapped = sum(r["unmapped_count"] for r in rows)
        total_non_info = sum(r["non_informative_count"] for r in rows)
        total_content = total_spans - total_non_info
        print(f"\n📊 TOTALS: {total_spans} spans, {total_content} content, {total_unmapped} unmapped")
        print(f" Raw UNMAPPED rate: {total_unmapped/total_spans:.1%}" if total_spans > 0 else "")
        print(f" Content-adjusted UNMAPPED rate: {total_unmapped/total_content:.1%}" if total_content > 0 else "")
        print("=" * 80)
        return {"rows": [dict(r) for r in rows]}
    finally:
        await pool.close()
def main():
    """Parse the command line and dispatch to one of the three modes.

    Precedence is: ``--language-analysis``, then ``--evaluate``, then
    ``--business``. With none of them given, the help text and a few usage
    examples are printed.
    """
    cli = argparse.ArgumentParser(description="Classification run harness V2")
    add = cli.add_argument
    add("--business", help="Business name or pattern")
    add("--limit", type=int, default=100, help="Max reviews to process")
    add("--dry-run", action="store_true", help="Don't store results")
    add("--evaluate", metavar="BUSINESS", help="Evaluate existing results")
    add("--language-analysis", action="store_true", help="Analyze UNMAPPED by language across all data")
    add("--ignore-legacy-language", action="store_true", help="Exclude rows with language='auto'/'unknown'/NULL")
    add("--latest-hours", type=int, help="Only include spans from last N hours")
    add("--use-existing", action="store_true", help="Use existing spans instead of jobs")
    add("--use-llm", action="store_true", help="Use real LLM classification (requires OPENAI_API_KEY)")
    add("--model", default="gpt-4o-mini", help="Model for LLM classification (default: gpt-4o-mini)")
    args = cli.parse_args()

    # Pick the coroutine for the requested mode, if any.
    job = None
    if args.language_analysis:
        job = language_analysis(
            ignore_legacy=args.ignore_legacy_language,
            latest_hours=args.latest_hours,
        )
    elif args.evaluate:
        job = evaluate_business(args.evaluate)
    elif args.business:
        job = run_classification(
            args.business,
            limit=args.limit,
            dry_run=args.dry_run,
            use_jobs=not args.use_existing,
            use_llm=args.use_llm,
            model=args.model,
        )

    if job is not None:
        asyncio.run(job)
        return

    # No mode selected: show help followed by usage examples.
    cli.print_help()
    example_lines = (
        "\n\nExamples:",
        " # Mock classification (free, for testing)",
        " python run_classification_v2.py --business gokarts --limit 50 --dry-run",
        "",
        " # Real LLM classification",
        " python run_classification_v2.py --business gokarts --limit 50 --use-llm",
        "",
        " # Evaluate results",
        " python run_classification_v2.py --evaluate gokarts",
    )
    for line in example_lines:
        print(line)


if __name__ == "__main__":
    main()