#!/usr/bin/env python3
"""
Classification Run Harness V2

Runs classification on real reviews using resolved L1 config + sector brief.
Stores results to detected_spans_v2 with full config versioning.

Usage:
    python run_classification_v2.py --business "Go Karts Mar Menor" --limit 100
    python run_classification_v2.py --business "ClickRent Gran Canaria" --limit 100 --dry-run
    python run_classification_v2.py --evaluate "Go Karts Mar Menor"
"""

import argparse
import asyncio
import hashlib
import json
import os
import re
import sys
import unicodedata
import uuid
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

import asyncpg

# Use standalone resolver to avoid package import issues
from config_resolver_standalone import ConfigResolver

# Import LLM classifier (optional - falls back to mock if unavailable)
try:
    from llm_classifier import classify_review as llm_classify_review
    LLM_AVAILABLE = True
except ImportError:
    LLM_AVAILABLE = False

# NOTE(review): several user-facing literals below (status emoji, the "mixed"
# valence symbol, the ">=" sign in the prompt) look mis-encoded. They are kept
# exactly as found because they are runtime strings; confirm the file encoding
# before normalising them.

# Database URL
# NOTE(review): the fallback embeds credentials; prefer requiring DATABASE_URL
# outside local development.
DB_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://scraper:scraper123@localhost:5437/scraper"
)


# Non-informative review detection (score-based, conservative)
def _compute_text_stats(text: str) -> dict:
    """Compute character and token statistics for non-informative detection.

    Args:
        text: Raw review text (may be empty or None).

    Returns:
        ``{"empty": True}`` when the text is missing or whitespace-only,
        otherwise a dict with character counts per Unicode major category
        (letters, numbers, punctuation), token counts, and derived ratios.
    """
    if not text:
        return {"empty": True}

    text = text.strip()
    total_chars = len(text)
    if total_chars == 0:
        return {"empty": True}

    # Character counts by unicode major category (reliable, no emoji heuristics).
    # One pass: the first letter of the category is the major class (L/N/P/...).
    major_classes = Counter(unicodedata.category(ch)[0] for ch in text)
    alpha_chars = major_classes["L"]
    digit_chars = major_classes["N"]
    punct_chars = major_classes["P"]

    # Token stats (case-insensitive)
    lowered_tokens = [tok.lower() for tok in text.split()]
    token_count = len(lowered_tokens)
    unique_tokens = len(set(lowered_tokens))

    # Repetition check (share of the single most frequent token)
    if lowered_tokens:
        most_common_count = Counter(lowered_tokens).most_common(1)[0][1]
        repetition_ratio = most_common_count / token_count
    else:
        repetition_ratio = 0

    return {
        "empty": False,
        "total_chars": total_chars,
        "alpha_chars": alpha_chars,
        "digit_chars": digit_chars,
        "punct_chars": punct_chars,
        "token_count": token_count,
        "unique_tokens": unique_tokens,
        "alpha_ratio": alpha_chars / total_chars,
        "punct_ratio": punct_chars / total_chars,
        "repetition_ratio": repetition_ratio,
    }


# Safe regex for truly content-free strings (high confidence only)
# No word lists - only structural patterns
PURE_JUNK_RE = re.compile(
    r'^[\s\.\!\?\,\-\_\~\*\#\@]+$'  # Only punctuation/whitespace
    r'|^[\U0001F300-\U0001F9FF\U0001FA00-\U0001FAFF\U00002600-\U000027BF\s\.\!\?]+$'  # Only emoji + punct
    r'|^(translated by google|traducido por google)[\.\s]*$',  # Translation artifacts
    re.IGNORECASE
)


def is_non_informative(text: str) -> tuple[bool, str]:
    """
    Conservative detection of non-informative reviews.

    Goal: Skip LLM only when VERY sure it's junk. Everything else goes to LLM.
    Prefer false negatives (keeping noise) over false positives (dropping content).

    Returns (is_non_informative, reason). ``reason`` is one of "empty",
    "junk_pattern", "no_content", "pure_repetition", or "" when informative.
    """
    if not text:
        return True, "empty"

    text = text.strip()
    if not text:
        return True, "empty"

    stats = _compute_text_stats(text)
    if stats.get("empty"):
        return True, "empty"

    # Rule A: Safe regex (emoji-only, punct-only, translation artifact)
    if PURE_JUNK_RE.match(text):
        return True, "junk_pattern"

    # Rule B: No alphanumeric content at all
    if stats["alpha_chars"] == 0 and stats["digit_chars"] == 0:
        return True, "no_content"

    # Rule C: Pure repetition (e.g., "good good good good")
    if stats["token_count"] >= 3 and stats["unique_tokens"] == 1 and stats["alpha_chars"] < 20:
        return True, "pure_repetition"

    # Everything else passes to LLM (including typo-heavy, short meaningful, etc.)
    return False, ""


# Classification prompt template
CLASSIFICATION_PROMPT = """You are a review classifier using primitive-based analysis.

## TASK
Extract semantic spans from this review and classify each span to exactly ONE primitive.

## BUSINESS CONTEXT
- Business: {business_id}
- Sector: {sector_code}
- Path: {gbp_path}

## ENABLED PRIMITIVES (use ONLY these)
{primitives_list}

## SECTOR SIGNALS (what customers typically judge)
{brief_signals}

## RULES
1. Extract 1-5 spans per review (prefer fewer, larger spans)
2. Each span gets exactly ONE primitive (the most specific match)
3. If nothing fits with confidence ≄ 0.5, use UNMAPPED with keywords
4. Valence: + (positive), - (negative), 0 (neutral), ± (mixed)
5. Intensity: 1 (low), 2 (moderate), 3 (high/extreme)
6. Detail: 1 (vague), 2 (some detail), 3 (specific/actionable)

## OUTPUT FORMAT (JSON only, no markdown)
{{
  "spans": [
    {{
      "text": "exact text from review",
      "start": 0,
      "end": 25,
      "primitive": "MANNER",
      "valence": "+",
      "intensity": 2,
      "detail": 2,
      "confidence": 0.85,
      "entity": null,
      "entity_type": null
    }}
  ]
}}

## REVIEW TO CLASSIFY
Rating: {rating}/5
Text: {review_text}

Return JSON only."""


# Defined before the classifier functions: their return annotations are
# evaluated when each `def` executes, so the class must already exist.
@dataclass
class ClassificationResult:
    """Result from classifying a single review."""
    review_id: str
    business_id: str
    config_version: str
    spans: list[dict]
    raw_response: str
    error: str | None = None
    detected_language: str | None = None
    language_confidence: float | None = None


async def resolve_business_id(pool: asyncpg.Pool, search_term: str) -> str | None:
    """
    Resolve a partial business name/pattern to canonical business_id via DB.

    Tries an exact match on pipeline.business_taxonomy_map.business_id first,
    then progressively looser case-insensitive LIKE patterns.

    Returns the canonical business_id or None if not found.
    """
    # Try exact match first
    row = await pool.fetchrow("""
        SELECT business_id
        FROM pipeline.business_taxonomy_map
        WHERE business_id = $1
    """, search_term)
    if row:
        return row["business_id"]

    # Build search patterns with varying flexibility
    search_patterns = [
        f"%{search_term}%",  # Basic wildcard
        f"%{search_term.replace(' ', '%')}%",  # "go karts" -> "%go%karts%"
    ]

    # Insert % before uppercase letters: "GoKarts" -> "%Go%Karts%"
    camel_pattern = re.sub(r'([a-z])([A-Z])', r'\1%\2', search_term)
    if camel_pattern != search_term:
        search_patterns.append(f"%{camel_pattern}%")

    # For lowercase concatenated words, try every split with both halves >= 2
    # chars: "gokarts" -> "go%karts", "gok%arts", ...
    if search_term.islower() and len(search_term) >= 4:
        search_patterns.extend(
            f"%{search_term[:i]}%{search_term[i:]}%"
            for i in range(2, len(search_term) - 1)
        )

    # dict.fromkeys drops duplicate patterns (e.g. a term without spaces makes
    # the first two identical) while keeping priority order.
    for pattern in dict.fromkeys(search_patterns):
        row = await pool.fetchrow("""
            SELECT business_id
            FROM pipeline.business_taxonomy_map
            WHERE LOWER(business_id) LIKE LOWER($1)
        """, pattern)
        if row:
            return row["business_id"]

    return None


async def fetch_reviews_for_business(
    pool: asyncpg.Pool,
    business_id: str,
    limit: int = 100,
) -> list[dict]:
    """Fetch reviews from review_spans source (existing classified reviews).

    Returns one row per review_id (the first span by id), exposing the span
    text under the key ``sample_text``.
    """
    # Get unique reviews from spans table
    query = """
        SELECT DISTINCT ON (review_id)
            review_id,
            business_id,
            span_text as sample_text
        FROM pipeline.review_spans
        WHERE business_id = $1
        ORDER BY review_id, id
        LIMIT $2
    """
    rows = await pool.fetch(query, business_id, limit)

    # For now, we'll use existing spans as proxy for reviews
    # In production, this would fetch from reviews_raw or jobs.reviews_data
    return [dict(row) for row in rows]


async def fetch_reviews_from_jobs(
    pool: asyncpg.Pool,
    business_pattern: str,
    limit: int = 100,
) -> list[dict]:
    """Fetch reviews from jobs.reviews_data JSON.

    Matches completed jobs whose URL or metadata business_name contains
    ``business_pattern`` (case-insensitive) and flattens their review arrays.
    """
    query = """
        SELECT
            j.job_id,
            j.url,
            jsonb_array_elements(j.reviews_data) as review
        FROM public.jobs j
        WHERE j.reviews_data IS NOT NULL
          AND (LOWER(j.url) LIKE $1 OR LOWER(j.metadata->>'business_name') LIKE $1)
          AND j.status = 'completed'
        LIMIT $2
    """
    rows = await pool.fetch(query, f"%{business_pattern.lower()}%", limit)

    reviews = []
    for row in rows:
        review = row["review"]
        # Handle both dict and JSON string
        if isinstance(review, str):
            review = json.loads(review)
        reviews.append({
            "job_id": str(row["job_id"]),
            # Positional fallback id when the scraped review carries none
            "review_id": review.get("review_id", f"rev-{len(reviews)}"),
            "text": review.get("text", ""),
            "rating": review.get("rating", 5),
            "author": review.get("author", "Anonymous"),
        })

    return reviews


def build_classification_prompt(
    review: dict,
    config: dict,
) -> str:
    """Build the classification prompt with resolved config.

    Args:
        review: Review dict; text is read from "text" or "sample_text".
        config: Resolved config with "enabled_primitives", "primitives",
            "weights", "business_id", "sector_code", "gbp_path" and an
            optional "brief".
    """
    # Build primitives list with weights
    primitives_list = []
    for prim in sorted(config["enabled_primitives"]):
        prim_info = config["primitives"].get(prim, {})
        weight = config["weights"].get(prim)
        weight_str = f" (weight: {weight}x)" if weight else ""
        description = prim_info.get('def', prim_info.get('name', prim))
        primitives_list.append(f"- {prim}: {description}{weight_str}")

    # Build brief signals (at most the first three)
    brief = config.get("brief", {})
    brief_signals = []
    if brief.get("what_customers_judge"):
        items = brief["what_customers_judge"]
        if isinstance(items, dict):
            items = items.get("items", [])
        for item in items[:3]:
            if isinstance(item, dict):
                brief_signals.append(f"- {item.get('aspect', item.get('area', ''))}")
            else:
                brief_signals.append(f"- {item}")

    return CLASSIFICATION_PROMPT.format(
        business_id=config["business_id"],
        sector_code=config["sector_code"],
        gbp_path=config["gbp_path"],
        primitives_list="\n".join(primitives_list),
        brief_signals="\n".join(brief_signals) if brief_signals else "No specific signals",
        rating=review.get("rating", "?"),
        review_text=review.get("text", review.get("sample_text", "")),
    )


async def classify_review_mock(
    review: dict,
    config: dict,
) -> ClassificationResult:
    """
    Mock classification for dry-run testing.

    Returns a synthetic single-span result based on simple keyword heuristics
    (first matching keyword group wins), falling back to the rating.
    """
    text = review.get("text", review.get("sample_text", ""))
    rating = review.get("rating", 3)
    lowered = text.lower()
    excerpt = text[:50] + "..." if len(text) > 50 else text

    def make_span(primitive: str, valence: str, confidence: float,
                  intensity: int = 2, detail: int = 2) -> dict:
        # Key order is kept stable because the spans are serialised verbatim
        # into raw_response.
        return {
            "text": excerpt,
            "start": 0,
            "end": min(50, len(text)),
            "primitive": primitive,
            "valence": valence,
            "intensity": intensity,
            "detail": detail,
            "confidence": confidence,
        }

    def mentions(*words: str) -> bool:
        return any(word in lowered for word in words)

    if mentions("friendly", "nice", "helpful", "great staff"):
        span = make_span("MANNER", "+", 0.75)
    elif mentions("rude", "unfriendly", "ignored"):
        span = make_span("MANNER", "-", 0.75)
    elif mentions("wait", "slow", "fast", "quick"):
        span = make_span("SPEED", "+" if rating >= 4 else "-", 0.70)
    else:
        # Default based on rating
        span = make_span(
            "VALUE_FOR_MONEY" if rating >= 4 else "UNMAPPED",
            "+" if rating >= 4 else "-" if rating <= 2 else "0",
            0.50,
            intensity=1,
            detail=1,
        )
        span["unmapped_keywords"] = ["general"] if rating < 4 else None

    spans = [span]

    return ClassificationResult(
        review_id=review.get("review_id", "unknown"),
        business_id=config["business_id"],
        config_version=config["config_version"],
        spans=spans,
        raw_response=json.dumps({"spans": spans}),
        detected_language="en",  # Mock assumes English
        language_confidence=0.5,
    )


async def classify_review_llm(
    review: dict,
    config: dict,
    model: str | None = None,
) -> ClassificationResult:
    """
    Real LLM classification via the optional ``llm_classifier`` module.

    Content-free reviews are short-circuited to a single NON_INFORMATIVE span
    without calling the LLM. Otherwise the classifier response is converted
    to this module's span format.

    Raises:
        RuntimeError: If the LLM classifier could not be imported.
    """
    if not LLM_AVAILABLE:
        raise RuntimeError("LLM classifier not available. Check OPENAI_API_KEY.")

    text = review.get("text", review.get("sample_text", ""))
    rating = review.get("rating")

    # Check for non-informative reviews (skip LLM, save cost)
    non_informative, reason = is_non_informative(text)
    if non_informative:
        return ClassificationResult(
            review_id=review.get("review_id", "unknown"),
            business_id=config["business_id"],
            config_version=config["config_version"],
            spans=[{
                "text": text[:100] if text else "",
                "start": 0,
                "end": min(100, len(text)) if text else 0,
                "primitive": "NON_INFORMATIVE",
                "valence": "0",
                "intensity": 1,
                "detail": 1,
                "confidence": 1.0,
                "unmapped_keywords": [reason],
            }],
            raw_response=json.dumps({"non_informative": True, "reason": reason}),
            detected_language="unknown",
            language_confidence=0.0,
        )

    # Call the LLM classifier
    # NOTE(review): invoked without await; presumably a synchronous call that
    # blocks the event loop for its duration - confirm against llm_classifier.
    result = llm_classify_review(
        review_text=text,
        rating=rating,
        config=config,
        language="auto",  # Auto-detect
        model=model,
    )

    # Extract language info
    detected_lang = result.get("detected_language", "unknown")
    lang_confidence = result.get("language_confidence", 0.0)

    # Map classifier valence labels to our symbols
    valence_map = {"positive": "+", "negative": "-", "mixed": "±", "neutral": "0"}

    # Convert LLM response to our format
    spans = []
    for span in result.get("spans", []):
        valence = valence_map.get(span.get("valence", "neutral"), "0")

        # An explicit null intensity would break the comparisons below; treat
        # it like a missing value.
        intensity = span.get("intensity", 2)
        if intensity is None:
            intensity = 2

        # Detail is derived from the 1-5 intensity scale
        if intensity >= 4:
            detail = 3
        elif intensity >= 2:
            detail = 2
        else:
            detail = 1
        intensity = min(3, max(1, (intensity + 1) // 2))  # Map 1-5 to 1-3

        details = span.get("details") or {}
        spans.append({
            "text": span.get("evidence", text[:100]),
            "start": span.get("start_char"),
            "end": span.get("end_char"),
            "primitive": span.get("primitive", "UNMAPPED"),
            "valence": valence,
            "intensity": intensity,
            "detail": detail,
            "confidence": span.get("confidence", 0.5),
            "entity": details.get("entity"),
            "entity_type": details.get("entity_type"),
        })

    # Add unmapped items as UNMAPPED spans
    for unmapped in result.get("unmapped", []):
        spans.append({
            "text": unmapped.get("evidence", ""),
            "start": None,
            "end": None,
            "primitive": "UNMAPPED",
            "valence": "0",
            "intensity": 1,
            "detail": 1,
            "confidence": unmapped.get("confidence", 0.3),
            "unmapped_keywords": [unmapped.get("label", "unknown")],
        })

    warnings = result.get("warnings")

    return ClassificationResult(
        review_id=review.get("review_id", "unknown"),
        business_id=config["business_id"],
        config_version=config["config_version"],
        spans=spans,
        raw_response=result.get("raw_response", "{}"),
        error=warnings[0] if warnings else None,  # Only the first warning is kept
        detected_language=detected_lang,
        language_confidence=lang_confidence,
    )


def _raw_response_to_json(raw_response: str | None) -> str | None:
    """Return raw_response as normalised JSON text, or None when empty.

    Text that is not valid JSON is wrapped as ``{"unparsed": "<text>"}`` so a
    malformed model reply is still stored instead of aborting the run.
    """
    if not raw_response:
        return None
    try:
        return json.dumps(json.loads(raw_response))
    except (TypeError, ValueError):
        return json.dumps({"unparsed": str(raw_response)})


async def store_spans(
    pool: asyncpg.Pool,
    result: ClassificationResult,
    config: dict,
    job_id: str | None = None,
    model: str | None = None,
    review_hash: str | None = None,
    language: str | None = None,
    run_id: uuid.UUID | None = None,
) -> int:
    """Store classified spans to detected_spans_v2 with full audit trail.

    Inserts one row per span in ``result.spans``.

    Returns:
        Number of rows inserted.
    """
    # Same for every span of this review, so normalise it once
    raw_json = _raw_response_to_json(result.raw_response)  # Stored as JSONB

    count = 0
    for span in result.spans:
        await pool.execute("""
            INSERT INTO pipeline.detected_spans_v2 (
                job_id, business_id, review_id,
                gbp_path, sector_code, config_version,
                primitive, valence, intensity, detail, mode,
                confidence, span_text, span_start, span_end,
                unmapped_keywords, entity, entity_type,
                model, raw_response, review_hash, language, run_id
            ) VALUES (
                $1, $2, $3,
                $4::ltree, $5, $6,
                $7, $8, $9, $10, $11,
                $12, $13, $14, $15,
                $16, $17, $18,
                $19, $20, $21, $22, $23
            )
        """,
            job_id,
            result.business_id,
            result.review_id,
            config["gbp_path"],
            config["sector_code"],
            result.config_version,
            span["primitive"],
            span["valence"],
            span.get("intensity"),
            span.get("detail"),
            span.get("mode"),
            span["confidence"],
            span["text"],
            span.get("start"),
            span.get("end"),
            span.get("unmapped_keywords"),
            span.get("entity"),
            span.get("entity_type"),
            model,
            raw_json,
            review_hash,
            language,
            run_id,
        )
        count += 1

    return count


def _add_token_usage(total_tokens: dict, raw_response: str) -> None:
    """Add prompt/completion token counts found in raw_response to total_tokens.

    Best effort: a response that is not JSON, or whose "tokens" entry has an
    unexpected shape, is ignored because it only affects the cost estimate.
    """
    try:
        raw = json.loads(raw_response)
        if "tokens" in raw:
            total_tokens["prompt"] += raw["tokens"].get("prompt", 0)
            total_tokens["completion"] += raw["tokens"].get("completion", 0)
    except (ValueError, TypeError, AttributeError):
        pass


async def run_classification(
    business_id: str,
    limit: int = 100,
    dry_run: bool = False,
    use_jobs: bool = True,
    use_llm: bool = False,
    model: str | None = None,
) -> dict[str, Any]:
    """
    Run classification pipeline for a business.

    Args:
        business_id: Business name or URL pattern
        limit: Max reviews to process
        dry_run: If True, don't store results
        use_jobs: If True, fetch from jobs.reviews_data
        use_llm: If True, use real LLM classification (requires OPENAI_API_KEY)
        model: Model to use for LLM classification (default: gpt-4o-mini)

    Returns:
        Summary statistics, or a dict with an "error" key when the business
        cannot be resolved or no reviews are found.
    """
    if use_llm and not LLM_AVAILABLE:
        return {"error": "LLM classifier not available. Set OPENAI_API_KEY."}

    model = model or "gpt-4o-mini"

    pool = await asyncpg.create_pool(DB_URL)
    try:
        # Resolve config
        resolver = ConfigResolver()

        # Resolve business_id from partial name via DB lookup
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}

        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # Generate run_id for this classification run
        run_id = uuid.uuid4()

        print(f"\nšŸ“‹ Resolved config for: {config['business_id']}")
        print(f" Sector: {config['sector_code']}")
        print(f" Config version: {config['config_version']}")
        print(f" Enabled primitives: {len(config['enabled_primitives'])}")
        print(f" Weights: {len(config['weights'])}")
        print(f" Run ID: {run_id}")

        # Fetch reviews
        if use_jobs:
            # Use the first two words of the search term for better matching.
            # Slicing (rather than words[0]) also copes with a blank term.
            url_pattern = " ".join(business_id.split()[:2])
            reviews = await fetch_reviews_from_jobs(pool, url_pattern, limit)
        else:
            reviews = await fetch_reviews_for_business(pool, config["business_id"], limit)

        print(f"\nšŸ“„ Fetched {len(reviews)} reviews")

        if not reviews:
            return {"error": "No reviews found", "config": config}

        # Classify reviews
        results = []
        total_tokens = {"prompt": 0, "completion": 0}
        classifier_type = "LLM" if use_llm else "MOCK"
        stored_model = model if use_llm else "mock"

        print(f"\nšŸ”„ Classifying with {classifier_type} classifier...")
        if use_llm:
            print(f" Model: {model}")

        for i, review in enumerate(reviews):
            if i % 20 == 0:
                print(f" Processing review {i+1}/{len(reviews)}...")

            # Use LLM or mock classifier
            if use_llm:
                result = await classify_review_llm(review, config, model)
                # Track tokens for cost estimation
                _add_token_usage(total_tokens, result.raw_response)
            else:
                result = await classify_review_mock(review, config)

            results.append(result)

            # Store if not dry run
            if not dry_run:
                # Review hash is keyed on config version + text (for caching)
                text = review.get("text", review.get("sample_text", ""))
                review_hash = hashlib.sha256(
                    f"{config['config_version']}:{text}".encode()
                ).hexdigest()[:16]

                await store_spans(
                    pool, result, config,
                    job_id=review.get("job_id"),
                    model=stored_model,
                    review_hash=review_hash,
                    language=result.detected_language or "unknown",
                    run_id=run_id,
                )

        # Calculate stats
        all_spans = [s for r in results for s in r.spans]
        primitive_counts = Counter(s["primitive"] for s in all_spans)
        valence_counts = Counter(s["valence"] for s in all_spans)
        unmapped_count = primitive_counts.get("UNMAPPED", 0)
        non_informative_count = primitive_counts.get("NON_INFORMATIVE", 0)

        # Language distribution (for multilingual tracking)
        language_counts = Counter(r.detected_language or 'unknown' for r in results)

        # Content spans = total - non-informative
        content_spans = len(all_spans) - non_informative_count
        content_unmapped_rate = unmapped_count / content_spans if content_spans > 0 else 0

        stats = {
            "run_id": str(run_id),
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "config_version": config["config_version"],
            "l2_applied": config.get("l2_applied"),
            "classifier": classifier_type,
            "model": stored_model,
            "reviews_processed": len(reviews),
            "spans_created": len(all_spans),
            "non_informative_count": non_informative_count,
            "content_spans": content_spans,
            "unmapped_count": unmapped_count,
            "raw_unmapped_rate": unmapped_count / len(all_spans) if all_spans else 0,
            "content_unmapped_rate": content_unmapped_rate,
            "top_primitives": primitive_counts.most_common(10),
            "valence_distribution": dict(valence_counts),
            "language_distribution": dict(language_counts),
            "dry_run": dry_run,
        }

        if use_llm:
            stats["tokens"] = total_tokens
            # Rough cost estimate for gpt-4o-mini (USD per 1M tokens)
            cost = (total_tokens["prompt"] * 0.15 + total_tokens["completion"] * 0.60) / 1_000_000
            stats["estimated_cost_usd"] = round(cost, 4)

        print(f"\nšŸ“Š Results:")
        print(f" Reviews: {stats['reviews_processed']}")
        print(f" Total spans: {stats['spans_created']}")
        print(f" NON_INFORMATIVE: {non_informative_count} ({100*non_informative_count/len(all_spans):.1f}%)" if all_spans else "")
        print(f" Content spans: {content_spans}")
        print(f" UNMAPPED (of content): {unmapped_count} ({content_unmapped_rate:.1%})")
        if use_llm and "estimated_cost_usd" in stats:
            print(f" Estimated cost: ${stats['estimated_cost_usd']:.4f}")

        # Print language distribution
        print(f"\n 🌐 Languages detected:")
        for lang, count in language_counts.most_common(5):
            pct = count / len(results) * 100 if results else 0
            print(f" {lang}: {count} ({pct:.1f}%)")

        print(f"\n Top primitives:")
        for prim, count in stats["top_primitives"][:5]:
            print(f" {prim}: {count}")

        return stats

    finally:
        await pool.close()


async def evaluate_business(business_id: str) -> dict[str, Any]:
    """
    Evaluate classification results for a business.

    Runs C4 evaluation metrics:
    1. UNMAPPED rate
    2. Top primitives distribution
    3. Contradiction detection
    4. Coverage by config version

    Uses the spans of the most recent run_id when one exists, otherwise all
    spans for the business. Returns the metrics dict, or a dict with an
    "error" key.
    """
    pool = await asyncpg.create_pool(DB_URL)
    try:
        # Get business mapping
        resolver = ConfigResolver()

        # Resolve business_id from partial name via DB lookup
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}

        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # Get latest run_id for this business (if exists)
        latest_run = await pool.fetchrow("""
            SELECT run_id, MAX(created_at) as latest
            FROM pipeline.detected_spans_v2
            WHERE business_id = $1 AND run_id IS NOT NULL
            GROUP BY run_id
            ORDER BY latest DESC
            LIMIT 1
        """, config["business_id"])

        run_id_value = None
        if latest_run and latest_run["run_id"]:
            run_id_value = latest_run["run_id"]
            print(f"šŸ“Ž Using latest run: {run_id_value}")

        # Fetch spans from detected_spans_v2 (include language)
        if run_id_value:
            spans = await pool.fetch("""
                SELECT primitive, valence, intensity, confidence,
                       span_text, config_version, language, run_id
                FROM pipeline.detected_spans_v2
                WHERE business_id = $1 AND run_id = $2
                ORDER BY created_at DESC
            """, config["business_id"], run_id_value)
        else:
            # Fallback to all spans if no run_id exists (legacy data)
            spans = await pool.fetch("""
                SELECT primitive, valence, intensity, confidence,
                       span_text, config_version, language, run_id
                FROM pipeline.detected_spans_v2
                WHERE business_id = $1
                ORDER BY created_at DESC
            """, config["business_id"])
            print("āš ļø No run_id found, using all spans (legacy data)")

        if not spans:
            return {"error": "No spans found in detected_spans_v2"}

        spans = [dict(s) for s in spans]

        # 1. UNMAPPED rate (overall)
        unmapped = [s for s in spans if s["primitive"] == "UNMAPPED"]
        unmapped_rate = len(unmapped) / len(spans)

        # 2. Top primitives
        primitive_counts = Counter(s["primitive"] for s in spans)

        # 2b. UNMAPPED rate by language
        language_stats = {}
        for lang in set(s.get("language") or "unknown" for s in spans):
            lang_spans = [s for s in spans if (s.get("language") or "unknown") == lang]
            lang_unmapped = [s for s in lang_spans if s["primitive"] == "UNMAPPED"]
            lang_non_info = [s for s in lang_spans if s["primitive"] == "NON_INFORMATIVE"]
            content_spans = len(lang_spans) - len(lang_non_info)
            language_stats[lang] = {
                "total": len(lang_spans),
                "unmapped": len(lang_unmapped),
                "non_informative": len(lang_non_info),
                "content_spans": content_spans,
                "unmapped_rate": len(lang_unmapped) / content_spans if content_spans > 0 else 0,
            }

        # 3. Contradiction detection (simple substring heuristics)
        contradictions = []
        positive_words = {"great", "amazing", "excellent", "wonderful", "best", "love", "perfect"}
        negative_words = {"terrible", "awful", "worst", "horrible", "hate", "never"}

        for span in spans:
            # A NULL span_text is treated as empty instead of crashing the report
            span_text = span["span_text"] or ""
            text_lower = span_text.lower()
            valence = span["valence"]

            has_positive = any(w in text_lower for w in positive_words)
            has_negative = any(w in text_lower for w in negative_words)

            if has_positive and valence == "-":
                contradictions.append({
                    "type": "positive_text_negative_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })
            elif has_negative and valence == "+":
                contradictions.append({
                    "type": "negative_text_positive_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })

        # 4. Config version coverage
        version_counts = Counter(s["config_version"] for s in spans)

        # 5. Confidence distribution
        avg_confidence = sum(float(s["confidence"]) for s in spans) / len(spans)
        low_confidence = len([s for s in spans if float(s["confidence"]) < 0.5])

        evaluation = {
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "total_spans": len(spans),
            "unmapped_rate": unmapped_rate,
            "unmapped_count": len(unmapped),
            "top_primitives": primitive_counts.most_common(10),
            "contradiction_count": len(contradictions),
            "contradictions_sample": contradictions[:5],
            "config_versions": dict(version_counts),
            "avg_confidence": avg_confidence,
            "low_confidence_count": low_confidence,
            "language_stats": language_stats,
        }

        # Print report
        print("\n" + "=" * 60)
        print(f"EVALUATION: {config['business_id']}")
        print("=" * 60)

        print(f"\nšŸ“Š METRICS")
        print(f" Total spans: {len(spans)}")
        print(f" UNMAPPED rate: {unmapped_rate:.1%} {'āš ļø' if unmapped_rate > 0.2 else 'āœ“'}")
        print(f" Avg confidence: {avg_confidence:.2f}")
        print(f" Low confidence (<0.5): {low_confidence}")

        print(f"\n🌐 UNMAPPED BY LANGUAGE")
        for lang, stats in sorted(language_stats.items(), key=lambda x: x[1]["total"], reverse=True):
            rate = stats["unmapped_rate"]
            flag = "āš ļø" if rate > 0.15 else "āœ“"
            print(f" {lang}: {stats['unmapped']}/{stats['content_spans']} ({rate:.1%}) {flag}")

        print(f"\nšŸ” TOP PRIMITIVES")
        for prim, count in primitive_counts.most_common(8):
            pct = count / len(spans) * 100
            print(f" {prim}: {count} ({pct:.1f}%)")

        print(f"\nāš ļø CONTRADICTIONS: {len(contradictions)}")
        for c in contradictions[:3]:
            print(f" {c['type']}: \"{c['text']}...\" → {c['valence']}")

        print(f"\nšŸ“‹ CONFIG VERSIONS")
        for ver, count in version_counts.items():
            print(f" {ver}: {count} spans")

        print("=" * 60)

        return evaluation

    finally:
        await pool.close()


async def language_analysis(ignore_legacy: bool = False, latest_hours: int | None = None) -> dict[str, Any]:
    """
    Analyze UNMAPPED rates by language across all businesses/sectors.

    Args:
        ignore_legacy: If True, exclude rows with language IN ('auto', 'unknown') or NULL
        latest_hours: If set, only include spans from the last N hours

    This helps determine if multilingual handling needs improvement
    (e.g., translation for non-English reviews).

    Returns:
        ``{"rows": [...]}`` with one dict per sector/language group, or a dict
        with an "error" key when no rows match.
    """
    pool = await asyncpg.create_pool(DB_URL)
    try:
        # Build WHERE clause based on filters
        where_clauses = []
        if ignore_legacy:
            where_clauses.append("language IS NOT NULL AND language NOT IN ('auto', 'unknown')")
        if latest_hours:
            # int() guarantees only a number is ever interpolated into the SQL
            where_clauses.append(f"created_at >= NOW() - INTERVAL '{int(latest_hours)} hours'")

        where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""

        # Query UNMAPPED rates by language and sector
        rows = await pool.fetch(f"""
            SELECT
                sector_code,
                COALESCE(language, 'unknown') as language,
                COUNT(*) as total_spans,
                COUNT(*) FILTER (WHERE primitive = 'UNMAPPED') as unmapped_count,
                COUNT(*) FILTER (WHERE primitive = 'NON_INFORMATIVE') as non_informative_count,
                AVG(confidence) as avg_confidence
            FROM pipeline.detected_spans_v2
            {where_sql}
            GROUP BY sector_code, COALESCE(language, 'unknown')
            ORDER BY sector_code, total_spans DESC
        """)

        if not rows:
            return {"error": "No data in detected_spans_v2"}

        # Build report
        filter_desc = []
        if ignore_legacy:
            filter_desc.append("excluding legacy (auto/unknown)")
        if latest_hours:
            filter_desc.append(f"last {latest_hours}h only")
        filter_str = f" [{', '.join(filter_desc)}]" if filter_desc else ""

        print("\n" + "=" * 80)
        print(f"LANGUAGE ANALYSIS: UNMAPPED Rates by Language & Sector{filter_str}")
        print("=" * 80)
        print(f"{'':14} | {'total':>5} | {'content':>7} | {'unmapped':>8} | {'raw':>6} | {'adj':>6} |")
        print("-" * 80)

        current_sector = None
        for row in rows:
            sector = row["sector_code"]
            lang = row["language"]
            total = row["total_spans"]
            unmapped = row["unmapped_count"]
            non_info = row["non_informative_count"]
            content = total - non_info
            raw_rate = unmapped / total if total > 0 else 0
            adj_rate = unmapped / content if content > 0 else 0

            # Rows arrive ordered by sector; print a heading on each change
            if sector != current_sector:
                current_sector = sector
                print(f"\nšŸ“‚ {sector}")

            flag = "āš ļø" if adj_rate > 0.15 else "āœ“"
            print(f" {lang:12} | {total:5} | {content:7} | {unmapped:8} | {raw_rate:5.1%} | {adj_rate:5.1%} | {flag}")

        # Summary
        print("\n" + "-" * 80)
        print("SUMMARY: Languages with high content-adjusted UNMAPPED (>15%, content >= 20)")
        print("-" * 80)

        high_unmapped = [
            row for row in rows
            if (row["total_spans"] - row["non_informative_count"]) >= 20  # Minimum sample size
            and row["unmapped_count"] / (row["total_spans"] - row["non_informative_count"]) > 0.15
        ]

        if high_unmapped:
            for row in high_unmapped:
                content = row["total_spans"] - row["non_informative_count"]
                rate = row["unmapped_count"] / content
                print(f" {row['sector_code']}/{row['language']}: {rate:.1%} UNMAPPED ({content} content spans)")
            print("\nšŸ’” Consider: Translation for these language/sector combinations")
        else:
            print(" āœ“ No language/sector combinations exceed threshold")

        # Totals
        total_spans = sum(r["total_spans"] for r in rows)
        total_unmapped = sum(r["unmapped_count"] for r in rows)
        total_non_info = sum(r["non_informative_count"] for r in rows)
        total_content = total_spans - total_non_info
        print(f"\nšŸ“Š TOTALS: {total_spans} spans, {total_content} content, {total_unmapped} unmapped")
        print(f" Raw UNMAPPED rate: {total_unmapped/total_spans:.1%}" if total_spans > 0 else "")
        print(f" Content-adjusted UNMAPPED rate: {total_unmapped/total_content:.1%}" if total_content > 0 else "")

        print("=" * 80)

        return {"rows": [dict(r) for r in rows]}

    finally:
        await pool.close()


def main():
    """Parse CLI arguments and dispatch to analysis, evaluation, or a run.

    Precedence: --language-analysis, then --evaluate, then --business;
    with none of them, prints help and usage examples.
    """
    parser = argparse.ArgumentParser(description="Classification run harness V2")
    parser.add_argument("--business", help="Business name or pattern")
    parser.add_argument("--limit", type=int, default=100, help="Max reviews to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't store results")
    parser.add_argument("--evaluate", metavar="BUSINESS", help="Evaluate existing results")
    parser.add_argument("--language-analysis", action="store_true",
                        help="Analyze UNMAPPED by language across all data")
    parser.add_argument("--ignore-legacy-language", action="store_true",
                        help="Exclude rows with language='auto'/'unknown'/NULL")
    parser.add_argument("--latest-hours", type=int, help="Only include spans from last N hours")
    parser.add_argument("--use-existing", action="store_true", help="Use existing spans instead of jobs")
    parser.add_argument("--use-llm", action="store_true",
                        help="Use real LLM classification (requires OPENAI_API_KEY)")
    parser.add_argument("--model", default="gpt-4o-mini",
                        help="Model for LLM classification (default: gpt-4o-mini)")

    args = parser.parse_args()

    if args.language_analysis:
        asyncio.run(language_analysis(
            ignore_legacy=args.ignore_legacy_language,
            latest_hours=args.latest_hours,
        ))
    elif args.evaluate:
        asyncio.run(evaluate_business(args.evaluate))
    elif args.business:
        asyncio.run(run_classification(
            args.business,
            limit=args.limit,
            dry_run=args.dry_run,
            use_jobs=not args.use_existing,
            use_llm=args.use_llm,
            model=args.model,
        ))
    else:
        parser.print_help()
        print("\n\nExamples:")
        print(" # Mock classification (free, for testing)")
        print(" python run_classification_v2.py --business gokarts --limit 50 --dry-run")
        print("")
        print(" # Real LLM classification")
        print(" python run_classification_v2.py --business gokarts --limit 50 --use-llm")
        print("")
        print(" # Evaluate results")
        print(" python run_classification_v2.py --evaluate gokarts")


if __name__ == "__main__":
    main()