1103 lines
40 KiB
Python
1103 lines
40 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Classification Run Harness V2
|
|
|
|
Runs classification on real reviews using resolved L1 config + sector brief.
|
|
Stores results to detected_spans_v2 with full config versioning.
|
|
|
|
Usage:
|
|
python run_classification_v2.py --business "Go Karts Mar Menor" --limit 100
|
|
python run_classification_v2.py --business "ClickRent Gran Canaria" --limit 100 --dry-run
|
|
python run_classification_v2.py --evaluate "Go Karts Mar Menor"
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import unicodedata
|
|
import uuid
|
|
from collections import Counter
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import asyncpg
|
|
|
|
# Use standalone resolver to avoid package import issues
|
|
from config_resolver_standalone import ConfigResolver
|
|
|
|
# Import LLM classifier (optional - falls back to mock if unavailable)
|
|
try:
|
|
from llm_classifier import classify_review as llm_classify_review
|
|
LLM_AVAILABLE = True
|
|
except ImportError:
|
|
LLM_AVAILABLE = False
|
|
|
|
# Database connection string: taken from the DATABASE_URL environment
# variable when set, otherwise the literal fallback below is used.
# NOTE(review): the fallback embeds a username/password for a localhost
# database — presumably a development default; confirm it is not a real secret.
DB_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://scraper:scraper123@localhost:5437/scraper",
)
|
|
|
|
# Non-informative review detection (score-based, conservative)
|
|
|
|
|
|
def _compute_text_stats(text: str) -> dict:
|
|
"""Compute character and token statistics for non-informative detection."""
|
|
if not text:
|
|
return {"empty": True}
|
|
|
|
text = text.strip()
|
|
total_chars = len(text)
|
|
|
|
if total_chars == 0:
|
|
return {"empty": True}
|
|
|
|
# Character counts by unicode category (reliable, no emoji heuristics)
|
|
alpha_chars = sum(1 for c in text if unicodedata.category(c).startswith('L'))
|
|
digit_chars = sum(1 for c in text if unicodedata.category(c).startswith('N'))
|
|
punct_chars = sum(1 for c in text if unicodedata.category(c).startswith('P'))
|
|
|
|
# Token stats
|
|
tokens = text.split()
|
|
token_count = len(tokens)
|
|
unique_tokens = len(set(t.lower() for t in tokens))
|
|
|
|
# Ratios
|
|
alpha_ratio = alpha_chars / total_chars if total_chars > 0 else 0
|
|
punct_ratio = punct_chars / total_chars if total_chars > 0 else 0
|
|
|
|
# Repetition check (same token repeated)
|
|
if tokens:
|
|
most_common_count = Counter(t.lower() for t in tokens).most_common(1)[0][1]
|
|
repetition_ratio = most_common_count / token_count if token_count > 0 else 0
|
|
else:
|
|
repetition_ratio = 0
|
|
|
|
return {
|
|
"empty": False,
|
|
"total_chars": total_chars,
|
|
"alpha_chars": alpha_chars,
|
|
"digit_chars": digit_chars,
|
|
"punct_chars": punct_chars,
|
|
"token_count": token_count,
|
|
"unique_tokens": unique_tokens,
|
|
"alpha_ratio": alpha_ratio,
|
|
"punct_ratio": punct_ratio,
|
|
"repetition_ratio": repetition_ratio,
|
|
}
|
|
|
|
|
|
# Structural matcher for strings that carry no content at all.
# It contains no word lists; each alternative describes a whole-string shape.
PURE_JUNK_RE = re.compile(
    "|".join((
        # Only punctuation/whitespace
        r'^[\s\.\!\?\,\-\_\~\*\#\@]+$',
        # Only emoji + punct
        r'^[\U0001F300-\U0001F9FF\U0001FA00-\U0001FAFF\U00002600-\U000027BF\s\.\!\?]+$',
        # Translation artifacts
        r'^(translated by google|traducido por google)[\.\s]*$',
    )),
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
def is_non_informative(text: str) -> tuple[bool, str]:
    """
    Decide, conservatively, whether a review carries no usable content.

    A review is flagged only when one of the structural rules below fires;
    anything else is reported as informative so that it still reaches the
    LLM. Keeping some noise is preferred over dropping real content.

    Returns:
        ``(True, reason)`` with reason one of ``"empty"``, ``"junk_pattern"``,
        ``"no_content"`` or ``"pure_repetition"``; otherwise ``(False, "")``.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        return True, "empty"

    stats = _compute_text_stats(cleaned)
    if stats.get("empty"):
        return True, "empty"

    # Rule A: whole string is punctuation, emoji, or a translation artifact.
    if PURE_JUNK_RE.match(cleaned):
        return True, "junk_pattern"

    # Rule B: not a single letter or digit anywhere.
    lacks_alphanumerics = not (stats["alpha_chars"] != 0 or stats["digit_chars"] != 0)
    if lacks_alphanumerics:
        return True, "no_content"

    # Rule C: one short token repeated at least three times ("good good good").
    one_token_repeated = (
        stats["unique_tokens"] == 1
        and stats["token_count"] >= 3
        and stats["alpha_chars"] < 20
    )
    if one_token_repeated:
        return True, "pure_repetition"

    # Typo-heavy or short-but-meaningful text deliberately falls through.
    return False, ""
|
|
|
|
|
|
# Classification prompt template, filled in with str.format() by
# build_classification_prompt. Literal JSON braces are doubled ("{{" / "}}")
# so that str.format() leaves them in place.
#
# Rule 3 tells the model to report keywords for UNMAPPED spans, so the output
# schema names the field ("unmapped_keywords") that store_spans reads.
CLASSIFICATION_PROMPT = """You are a review classifier using primitive-based analysis.

## TASK
Extract semantic spans from this review and classify each span to exactly ONE primitive.

## BUSINESS CONTEXT
- Business: {business_id}
- Sector: {sector_code}
- Path: {gbp_path}

## ENABLED PRIMITIVES (use ONLY these)
{primitives_list}

## SECTOR SIGNALS (what customers typically judge)
{brief_signals}

## RULES
1. Extract 1-5 spans per review (prefer fewer, larger spans)
2. Each span gets exactly ONE primitive (the most specific match)
3. If nothing fits with confidence ≥ 0.5, use UNMAPPED with keywords (list them in "unmapped_keywords")
4. Valence: + (positive), - (negative), 0 (neutral), ± (mixed)
5. Intensity: 1 (low), 2 (moderate), 3 (high/extreme)
6. Detail: 1 (vague), 2 (some detail), 3 (specific/actionable)

## OUTPUT FORMAT (JSON only, no markdown)
{{
  "spans": [
    {{
      "text": "exact text from review",
      "start": 0,
      "end": 25,
      "primitive": "MANNER",
      "valence": "+",
      "intensity": 2,
      "detail": 2,
      "confidence": 0.85,
      "entity": null,
      "entity_type": null,
      "unmapped_keywords": null
    }}
  ]
}}

## REVIEW TO CLASSIFY
Rating: {rating}/5
Text: {review_text}

Return JSON only."""
|
|
|
|
|
|
def _business_search_patterns(search_term: str) -> list[str]:
    """Return de-duplicated LIKE patterns for ``search_term``, strictest first."""
    patterns = [
        f"%{search_term}%",  # Basic wildcard
        f"%{search_term.replace(' ', '%')}%",  # "go karts" -> "%go%karts%"
    ]

    # CamelCase input: put a wildcard at each lower->upper transition,
    # e.g. "GoKarts" -> "%Go%Karts%".
    camel_pattern = re.sub(r'([a-z])([A-Z])', r'\1%\2', search_term)
    if camel_pattern != search_term:
        patterns.append(f"%{camel_pattern}%")

    # All-lowercase input such as "gokarts": try every split point that
    # leaves at least two characters on each side ("go%karts", "gok%arts", ...).
    if search_term.islower() and len(search_term) >= 4:
        patterns.extend(
            f"%{search_term[:i]}%{search_term[i:]}%"
            for i in range(2, len(search_term) - 1)
        )

    # dict.fromkeys keeps first-seen order, so a pattern generated twice
    # (e.g. the first two entries when the term has no space) is queried once.
    return list(dict.fromkeys(patterns))


async def resolve_business_id(pool: asyncpg.Pool, search_term: str) -> str | None:
    """
    Resolve a partial business name/pattern to a canonical business_id via DB.

    Looks in pipeline.business_taxonomy_map: first for an exact match on
    business_id, then with progressively looser case-insensitive LIKE
    patterns (see ``_business_search_patterns``).

    Args:
        pool: asyncpg pool used to run the lookups.
        search_term: Full or partial business identifier. Surrounding
            whitespace is ignored.

    Returns:
        The matching business_id, or None when the term is blank or nothing
        matches. When a fuzzy pattern matches several rows, the
        alphabetically first business_id is returned so results are stable.
    """
    search_term = (search_term or "").strip()
    if not search_term:
        # A blank term would turn into the pattern "%%", which matches every
        # row and would silently select an arbitrary business.
        return None

    # Try exact match first
    row = await pool.fetchrow("""
        SELECT business_id
        FROM pipeline.business_taxonomy_map
        WHERE business_id = $1
    """, search_term)
    if row:
        return row["business_id"]

    for pattern in _business_search_patterns(search_term):
        row = await pool.fetchrow("""
            SELECT business_id
            FROM pipeline.business_taxonomy_map
            WHERE LOWER(business_id) LIKE LOWER($1)
            ORDER BY business_id
            LIMIT 1
        """, pattern)
        if row:
            return row["business_id"]

    return None
|
|
|
|
|
|
async def fetch_reviews_for_business(
    pool: asyncpg.Pool,
    business_id: str,
    limit: int = 100,
) -> list[dict]:
    """Return one row per review_id for ``business_id`` from pipeline.review_spans.

    Each dict has ``review_id``, ``business_id`` and ``sample_text``, where
    ``sample_text`` is the span_text of the lowest-``id`` span of that review.
    At most ``limit`` rows are returned.
    """
    # Existing spans stand in for full reviews here; the full review text is
    # not read by this function.
    records = await pool.fetch(
        """
        SELECT DISTINCT ON (review_id)
            review_id,
            business_id,
            span_text as sample_text
        FROM pipeline.review_spans
        WHERE business_id = $1
        ORDER BY review_id, id
        LIMIT $2
        """,
        business_id,
        limit,
    )
    return list(map(dict, records))
|
|
|
|
|
|
async def fetch_reviews_from_jobs(
    pool: asyncpg.Pool,
    business_pattern: str,
    limit: int = 100,
) -> list[dict]:
    """Fetch reviews from the ``reviews_data`` JSON array of completed jobs.

    A job is selected when its URL or ``metadata->>'business_name'`` contains
    ``business_pattern`` (case-insensitive).

    Args:
        pool: asyncpg pool used to run the query.
        business_pattern: Substring to look for in the job URL / business name.
        limit: Maximum number of review elements to read.

    Returns:
        A list of dicts with ``job_id``, ``review_id``, ``text``, ``rating``
        and ``author``. Missing fields get the defaults visible below; array
        elements that are not JSON objects are skipped.
    """
    # jsonb_array_elements() raises on anything that is not an array (for
    # example a JSON null, which still passes "IS NOT NULL"), so such rows
    # are filtered out up front with jsonb_typeof().
    query = """
        SELECT
            j.job_id,
            j.url,
            jsonb_array_elements(j.reviews_data) as review
        FROM public.jobs j
        WHERE j.reviews_data IS NOT NULL
          AND jsonb_typeof(j.reviews_data) = 'array'
          AND (LOWER(j.url) LIKE $1 OR LOWER(j.metadata->>'business_name') LIKE $1)
          AND j.status = 'completed'
        LIMIT $2
    """
    rows = await pool.fetch(query, f"%{business_pattern.lower()}%", limit)

    reviews: list[dict] = []
    for row in rows:
        review = row["review"]
        # Handle both dict and JSON string
        if isinstance(review, str):
            review = json.loads(review)
        if not isinstance(review, dict):
            # A scalar or nested-array element has no review fields to read.
            continue
        reviews.append({
            "job_id": str(row["job_id"]),
            # Positional fallback id; only unique within this call.
            "review_id": review.get("review_id", f"rev-{len(reviews)}"),
            "text": review.get("text", ""),
            "rating": review.get("rating", 5),
            "author": review.get("author", "Anonymous"),
        })

    return reviews
|
|
|
|
|
|
def build_classification_prompt(
    review: dict,
    config: dict,
) -> str:
    """Render CLASSIFICATION_PROMPT for one review using the resolved config.

    Reads ``enabled_primitives``, ``primitives``, ``weights``, ``business_id``,
    ``sector_code``, ``gbp_path`` and the optional ``brief`` from ``config``,
    and ``rating`` plus ``text`` (falling back to ``sample_text``) from
    ``review``.
    """
    # One bullet per enabled primitive, alphabetical, with its weight if truthy.
    primitive_lines = []
    for name in sorted(config["enabled_primitives"]):
        info = config["primitives"].get(name, {})
        weight = config["weights"].get(name)
        description = info.get('def', info.get('name', name))
        suffix = f" (weight: {weight}x)" if weight else ""
        primitive_lines.append(f"- {name}: {description}{suffix}")

    # Up to three sector signals taken from the brief.
    signal_lines = []
    judged = config.get("brief", {}).get("what_customers_judge")
    if judged:
        if isinstance(judged, dict):
            judged = judged.get("items", [])
        for entry in judged[:3]:
            if isinstance(entry, dict):
                label = entry.get('aspect', entry.get('area', ''))
            else:
                label = entry
            signal_lines.append(f"- {label}")

    if signal_lines:
        signals_text = "\n".join(signal_lines)
    else:
        signals_text = "No specific signals"

    return CLASSIFICATION_PROMPT.format(
        business_id=config["business_id"],
        sector_code=config["sector_code"],
        gbp_path=config["gbp_path"],
        primitives_list="\n".join(primitive_lines),
        brief_signals=signals_text,
        rating=review.get("rating", "?"),
        review_text=review.get("text", review.get("sample_text", "")),
    )
|
|
|
|
|
|
async def classify_review_mock(
    review: dict,
    config: dict,
) -> "ClassificationResult":
    """
    Mock classification for dry-run testing.

    Produces exactly one synthetic span from keyword and rating heuristics;
    no model is called.

    Args:
        review: Review dict; reads ``text`` (falling back to ``sample_text``),
            ``rating`` (treated as 3 when missing or None) and ``review_id``.
        config: Resolved config; reads ``business_id`` and ``config_version``.

    Returns:
        A ClassificationResult with one span, language fixed to "en".
    """
    # The return annotation is a string because ClassificationResult is
    # defined further down the module; an eagerly evaluated annotation would
    # raise NameError while the module is being imported.

    # Tolerate explicit None values, which review JSON may contain.
    text = review.get("text", review.get("sample_text", "")) or ""
    rating = review.get("rating", 3)
    if rating is None:
        rating = 3

    lowered = text.lower()
    snippet = text[:50] + "..." if len(text) > 50 else text

    def make_span(primitive: str, valence: str, level: int, confidence: float) -> dict:
        """Build the span dict shared by all branches (key order is stable)."""
        return {
            "text": snippet,
            "start": 0,
            "end": min(50, len(text)),
            "primitive": primitive,
            "valence": valence,
            "intensity": level,
            "detail": level,
            "confidence": confidence,
        }

    def mentions(*needles: str) -> bool:
        """True when any needle occurs as a substring of the lowered text."""
        return any(needle in lowered for needle in needles)

    # First matching rule wins.
    if mentions("friendly", "nice", "helpful", "great staff"):
        span = make_span("MANNER", "+", 2, 0.75)
    elif mentions("rude", "unfriendly", "ignored"):
        span = make_span("MANNER", "-", 2, 0.75)
    elif mentions("wait", "slow", "fast", "quick"):
        span = make_span("SPEED", "+" if rating >= 4 else "-", 2, 0.70)
    else:
        # Default based on rating
        span = make_span(
            "VALUE_FOR_MONEY" if rating >= 4 else "UNMAPPED",
            "+" if rating >= 4 else "-" if rating <= 2 else "0",
            1,
            0.50,
        )
        span["unmapped_keywords"] = ["general"] if rating < 4 else None

    spans = [span]

    return ClassificationResult(
        review_id=review.get("review_id", "unknown"),
        business_id=config["business_id"],
        config_version=config["config_version"],
        spans=spans,
        raw_response=json.dumps({"spans": spans}),
        detected_language="en",  # Mock assumes English
        language_confidence=0.5,
    )
|
|
|
|
|
|
@dataclass
|
|
class ClassificationResult:
|
|
"""Result from classifying a single review."""
|
|
review_id: str
|
|
business_id: str
|
|
config_version: str
|
|
spans: list[dict]
|
|
raw_response: str
|
|
error: str | None = None
|
|
detected_language: str | None = None
|
|
language_confidence: float | None = None
|
|
|
|
|
|
# Valence labels as returned by the LLM classifier -> stored symbols.
_LLM_VALENCE_SYMBOLS = {"positive": "+", "negative": "-", "mixed": "±", "neutral": "0"}


def _convert_llm_span(span: dict, fallback_text: str) -> dict:
    """Translate one span from the LLM classifier into the stored span format."""
    raw_valence = span.get("valence", "neutral")
    if isinstance(raw_valence, str) and raw_valence in _LLM_VALENCE_SYMBOLS.values():
        # Already one of our symbols: keep it instead of flattening it to "0".
        valence = raw_valence
    else:
        valence = _LLM_VALENCE_SYMBOLS.get(raw_valence, "0")

    # The model's JSON may carry null, a float or a numeric string here;
    # anything that cannot be read as an int falls back to the same default
    # that is used when the key is absent.
    try:
        raw_intensity = int(span.get("intensity", 2))
    except (TypeError, ValueError):
        raw_intensity = 2

    # Detail is derived from the raw 1-5 intensity before it is rescaled.
    if raw_intensity >= 4:
        detail = 3
    elif raw_intensity >= 2:
        detail = 2
    else:
        detail = 1
    intensity = min(3, max(1, (raw_intensity + 1) // 2))  # Map 1-5 to 1-3

    details = span.get("details")
    if not isinstance(details, dict):
        details = {}

    return {
        "text": span.get("evidence", fallback_text),
        "start": span.get("start_char"),
        "end": span.get("end_char"),
        "primitive": span.get("primitive", "UNMAPPED"),
        "valence": valence,
        "intensity": intensity,
        "detail": detail,
        "confidence": span.get("confidence", 0.5),
        "entity": details.get("entity"),
        "entity_type": details.get("entity_type"),
    }


async def classify_review_llm(
    review: dict,
    config: dict,
    model: str | None = None,
) -> ClassificationResult:
    """
    Classify one review with the real LLM classifier.

    Reviews that ``is_non_informative`` flags are not sent to the model; they
    get a single NON_INFORMATIVE span instead. Otherwise the imported
    ``llm_classify_review`` is called and its ``spans`` and ``unmapped``
    entries are converted into the stored span format.

    Args:
        review: Review dict; reads ``text`` (falling back to ``sample_text``),
            ``rating`` and ``review_id``.
        config: Resolved config, passed through to the classifier; also
            supplies ``business_id`` and ``config_version``.
        model: Model name forwarded to the classifier.

    Returns:
        ClassificationResult holding the converted spans, the classifier's
        raw response, its first warning (as ``error``) and the detected
        language.

    Raises:
        RuntimeError: If the LLM classifier module could not be imported.
    """
    if not LLM_AVAILABLE:
        raise RuntimeError("LLM classifier not available. Check OPENAI_API_KEY.")

    text = review.get("text", review.get("sample_text", ""))
    rating = review.get("rating")

    # Check for non-informative reviews (skip LLM, save cost)
    non_informative, reason = is_non_informative(text)
    if non_informative:
        return ClassificationResult(
            review_id=review.get("review_id", "unknown"),
            business_id=config["business_id"],
            config_version=config["config_version"],
            spans=[{
                "text": text[:100] if text else "",
                "start": 0,
                "end": min(100, len(text)) if text else 0,
                "primitive": "NON_INFORMATIVE",
                "valence": "0",
                "intensity": 1,
                "detail": 1,
                "confidence": 1.0,
                "unmapped_keywords": [reason],
            }],
            raw_response=json.dumps({"non_informative": True, "reason": reason}),
            detected_language="unknown",
            language_confidence=0.0,
        )

    # NOTE(review): this call is not awaited, so it presumably blocks the
    # event loop for the duration of the request — confirm that is acceptable.
    result = llm_classify_review(
        review_text=text,
        rating=rating,
        config=config,
        language="auto",  # Auto-detect
        model=model,
    )

    fallback_text = text[:100]
    spans = [_convert_llm_span(span, fallback_text) for span in result.get("spans", [])]

    # Add unmapped items as UNMAPPED spans
    for unmapped in result.get("unmapped", []):
        spans.append({
            "text": unmapped.get("evidence", ""),
            "start": None,
            "end": None,
            "primitive": "UNMAPPED",
            "valence": "0",
            "intensity": 1,
            "detail": 1,
            "confidence": unmapped.get("confidence", 0.3),
            "unmapped_keywords": [unmapped.get("label", "unknown")],
        })

    warnings = result.get("warnings")

    return ClassificationResult(
        review_id=review.get("review_id", "unknown"),
        business_id=config["business_id"],
        config_version=config["config_version"],
        spans=spans,
        raw_response=result.get("raw_response", "{}"),
        error=warnings[0] if warnings else None,
        detected_language=result.get("detected_language", "unknown"),
        language_confidence=result.get("language_confidence", 0.0),
    )
|
|
|
|
|
|
_INSERT_SPAN_SQL = """
    INSERT INTO pipeline.detected_spans_v2 (
        job_id, business_id, review_id, gbp_path, sector_code,
        config_version, primitive, valence, intensity, detail,
        mode, confidence, span_text, span_start, span_end,
        unmapped_keywords, entity, entity_type,
        model, raw_response, review_hash, language, run_id
    ) VALUES (
        $1, $2, $3, $4::ltree, $5,
        $6, $7, $8, $9, $10,
        $11, $12, $13, $14, $15,
        $16, $17, $18,
        $19, $20, $21, $22, $23
    )
"""


def _raw_response_as_json(raw_response: str | None) -> str | None:
    """Return ``raw_response`` re-serialised as JSON text, or None when empty."""
    if not raw_response:
        return None
    try:
        return json.dumps(json.loads(raw_response))
    except (TypeError, ValueError):
        # The audit payload is not valid JSON; wrap it rather than letting a
        # malformed payload prevent the spans from being stored.
        return json.dumps({"unparsed": str(raw_response)})


async def store_spans(
    pool: asyncpg.Pool,
    result: ClassificationResult,
    config: dict,
    job_id: str | None = None,
    model: str | None = None,
    review_hash: str | None = None,
    language: str | None = None,
    run_id: uuid.UUID | None = None,
) -> int:
    """Store classified spans to detected_spans_v2 with full audit trail.

    Args:
        pool: asyncpg pool used for the insert.
        result: Classification result whose ``spans`` are stored. Each span
            must have ``primitive``, ``valence``, ``confidence`` and ``text``;
            the other span keys are optional.
        config: Resolved config; supplies ``gbp_path`` and ``sector_code``.
        job_id, model, review_hash, language, run_id: Audit columns written
            unchanged on every row.

    Returns:
        Number of spans inserted (0 when the result has no spans).
    """
    if not result.spans:
        return 0

    # Identical for every span of the review, so serialise it once.
    raw_json = _raw_response_as_json(result.raw_response)

    rows = [
        (
            job_id,
            result.business_id,
            result.review_id,
            config["gbp_path"],
            config["sector_code"],
            result.config_version,
            span["primitive"],
            span["valence"],
            span.get("intensity"),
            span.get("detail"),
            span.get("mode"),
            span["confidence"],
            span["text"],
            span.get("start"),
            span.get("end"),
            span.get("unmapped_keywords"),
            span.get("entity"),
            span.get("entity_type"),
            model,
            raw_json,
            review_hash,
            language,
            run_id,
        )
        for span in result.spans
    ]

    # One batched call instead of one round trip per span.
    # NOTE(review): recent asyncpg releases document executemany() as atomic,
    # which would also keep a review from being half-stored — confirm for the
    # installed version.
    await pool.executemany(_INSERT_SPAN_SQL, rows)
    return len(rows)
|
|
|
|
|
|
def _add_token_usage(raw_response: str | None, totals: dict[str, int]) -> None:
    """Add the prompt/completion token counts found in ``raw_response`` to ``totals``."""
    try:
        raw = json.loads(raw_response)
        if "tokens" in raw:
            totals["prompt"] += raw["tokens"].get("prompt", 0)
            totals["completion"] += raw["tokens"].get("completion", 0)
    except (TypeError, ValueError, AttributeError):
        # Token tracking is best-effort: a missing or oddly shaped payload
        # only affects the cost estimate. Only payload-shape errors are
        # ignored, so cancellation and KeyboardInterrupt still propagate.
        pass


async def run_classification(
    business_id: str,
    limit: int = 100,
    dry_run: bool = False,
    use_jobs: bool = True,
    use_llm: bool = False,
    model: str | None = None,
) -> dict[str, Any]:
    """
    Run classification pipeline for a business.

    Args:
        business_id: Business name or URL pattern
        limit: Max reviews to process
        dry_run: If True, don't store results
        use_jobs: If True, fetch from jobs.reviews_data
        use_llm: If True, use real LLM classification (requires OPENAI_API_KEY)
        model: Model to use for LLM classification (default: gpt-4o-mini)

    Returns:
        Summary statistics, or a dict with an ``"error"`` key when the LLM
        is requested but unavailable, the business name is blank, the
        business cannot be resolved/mapped, or no reviews are found.
    """
    if use_llm and not LLM_AVAILABLE:
        return {"error": "LLM classifier not available. Set OPENAI_API_KEY."}

    # A blank name has no words to build the job search pattern from.
    if not business_id or not business_id.strip():
        return {"error": "Business name must not be empty"}

    model = model or "gpt-4o-mini"
    model_label = model if use_llm else "mock"
    pool = await asyncpg.create_pool(DB_URL)

    try:
        # Resolve config
        resolver = ConfigResolver()

        # Resolve business_id from partial name via DB lookup
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}

        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # Generate run_id for this classification run
        run_id = uuid.uuid4()

        print(f"\n📋 Resolved config for: {config['business_id']}")
        print(f" Sector: {config['sector_code']}")
        print(f" Config version: {config['config_version']}")
        print(f" Enabled primitives: {len(config['enabled_primitives'])}")
        print(f" Weights: {len(config['weights'])}")
        print(f" Run ID: {run_id}")

        # Fetch reviews
        if use_jobs:
            # Search jobs by the first two words of the name as typed
            # (a single-word name is used as is).
            url_pattern = " ".join(business_id.split()[:2])
            reviews = await fetch_reviews_from_jobs(pool, url_pattern, limit)
        else:
            reviews = await fetch_reviews_for_business(pool, config["business_id"], limit)

        print(f"\n📥 Fetched {len(reviews)} reviews")

        if not reviews:
            return {"error": "No reviews found", "config": config}

        # Classify reviews
        results = []
        total_tokens = {"prompt": 0, "completion": 0}
        classifier_type = "LLM" if use_llm else "MOCK"
        print(f"\n🔄 Classifying with {classifier_type} classifier...")
        if use_llm:
            print(f" Model: {model}")

        for i, review in enumerate(reviews):
            if i % 20 == 0:
                print(f" Processing review {i+1}/{len(reviews)}...")

            # Use LLM or mock classifier
            if use_llm:
                result = await classify_review_llm(review, config, model)
                # Track tokens for cost estimation
                _add_token_usage(result.raw_response, total_tokens)
            else:
                result = await classify_review_mock(review, config)

            results.append(result)

            # Store if not dry run
            if not dry_run:
                # Hash of config version + text, stored alongside the spans
                text = review.get("text", review.get("sample_text", ""))
                review_hash = hashlib.sha256(
                    f"{config['config_version']}:{text}".encode()
                ).hexdigest()[:16]

                await store_spans(
                    pool, result, config,
                    job_id=review.get("job_id"),
                    model=model_label,
                    review_hash=review_hash,
                    language=result.detected_language or "unknown",
                    run_id=run_id,
                )

        # Calculate stats
        all_spans = [s for r in results for s in r.spans]
        primitive_counts = Counter(s["primitive"] for s in all_spans)
        valence_counts = Counter(s["valence"] for s in all_spans)
        unmapped_count = primitive_counts.get("UNMAPPED", 0)
        non_informative_count = primitive_counts.get("NON_INFORMATIVE", 0)

        # Language distribution (for multilingual tracking)
        language_counts = Counter(r.detected_language or 'unknown' for r in results)

        # Content spans = total - non-informative
        content_spans = len(all_spans) - non_informative_count
        content_unmapped_rate = unmapped_count / content_spans if content_spans > 0 else 0

        stats = {
            "run_id": str(run_id),
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "config_version": config["config_version"],
            "l2_applied": config.get("l2_applied"),
            "classifier": classifier_type,
            "model": model_label,
            "reviews_processed": len(reviews),
            "spans_created": len(all_spans),
            "non_informative_count": non_informative_count,
            "content_spans": content_spans,
            "unmapped_count": unmapped_count,
            "raw_unmapped_rate": unmapped_count / len(all_spans) if all_spans else 0,
            "content_unmapped_rate": content_unmapped_rate,
            "top_primitives": primitive_counts.most_common(10),
            "valence_distribution": dict(valence_counts),
            "language_distribution": dict(language_counts),
            "dry_run": dry_run,
        }

        if use_llm:
            stats["tokens"] = total_tokens
            # Rough cost estimate using gpt-4o-mini per-million-token prices;
            # it is applied regardless of which model was actually used.
            cost = (total_tokens["prompt"] * 0.15 + total_tokens["completion"] * 0.60) / 1_000_000
            stats["estimated_cost_usd"] = round(cost, 4)

        print("\n📊 Results:")
        print(f" Reviews: {stats['reviews_processed']}")
        print(f" Total spans: {stats['spans_created']}")
        if all_spans:
            print(f" NON_INFORMATIVE: {non_informative_count} ({100*non_informative_count/len(all_spans):.1f}%)")
        print(f" Content spans: {content_spans}")
        print(f" UNMAPPED (of content): {unmapped_count} ({content_unmapped_rate:.1%})")
        if use_llm and "estimated_cost_usd" in stats:
            print(f" Estimated cost: ${stats['estimated_cost_usd']:.4f}")

        # Print language distribution
        print("\n 🌐 Languages detected:")
        for lang, count in language_counts.most_common(5):
            pct = count / len(results) * 100 if results else 0
            print(f" {lang}: {count} ({pct:.1f}%)")

        print("\n Top primitives:")
        for prim, count in stats["top_primitives"][:5]:
            print(f" {prim}: {count}")

        return stats

    finally:
        await pool.close()
|
|
|
|
|
|
# Sentiment cue words for the contradiction heuristic. They are matched at
# the START of a word only, so "hate" matches "hated" but no longer the
# inside of "whatever", and "love" no longer matches "glove".
_POSITIVE_CUE_RE = re.compile(r"\b(?:great|amazing|excellent|wonderful|best|love|perfect)")
_NEGATIVE_CUE_RE = re.compile(r"\b(?:terrible|awful|worst|horrible|hate|never)")


async def evaluate_business(business_id: str) -> dict[str, Any]:
    """
    Evaluate classification results for a business.

    Reads spans from pipeline.detected_spans_v2 — only those of the most
    recent run when any row has a run_id, otherwise all spans — prints a
    report and returns the same figures as a dict.

    Runs C4 evaluation metrics:
    1. UNMAPPED rate (overall, and per language over content spans)
    2. Top primitives distribution
    3. Contradiction detection (cue words vs. stored valence)
    4. Coverage by config version
    5. Confidence distribution (rows with NULL confidence are left out)

    Args:
        business_id: Full or partial business identifier.

    Returns:
        The evaluation dict, or a dict with an ``"error"`` key when the
        business cannot be resolved/mapped or has no stored spans.
    """
    pool = await asyncpg.create_pool(DB_URL)

    try:
        # Get business mapping
        resolver = ConfigResolver()

        # Resolve business_id from partial name via DB lookup
        business_lookup = await resolve_business_id(pool, business_id)
        if not business_lookup:
            return {"error": f"Business not found matching: {business_id}"}

        config = await resolver.resolve(business_lookup, pool)
        if not config:
            return {"error": f"Business not mapped: {business_lookup}"}

        # Get latest run_id for this business (if exists)
        latest_run = await pool.fetchrow("""
            SELECT run_id, MAX(created_at) as latest
            FROM pipeline.detected_spans_v2
            WHERE business_id = $1 AND run_id IS NOT NULL
            GROUP BY run_id
            ORDER BY latest DESC
            LIMIT 1
        """, config["business_id"])

        run_id_value = latest_run["run_id"] if latest_run else None

        # Fetch spans from detected_spans_v2 (include language)
        if run_id_value:
            print(f"📎 Using latest run: {run_id_value}")
            spans = await pool.fetch("""
                SELECT primitive, valence, intensity, confidence, span_text, config_version, language, run_id
                FROM pipeline.detected_spans_v2
                WHERE business_id = $1 AND run_id = $2
                ORDER BY created_at DESC
            """, config["business_id"], run_id_value)
        else:
            # Fallback to all spans if no run_id exists (legacy data)
            spans = await pool.fetch("""
                SELECT primitive, valence, intensity, confidence, span_text, config_version, language, run_id
                FROM pipeline.detected_spans_v2
                WHERE business_id = $1
                ORDER BY created_at DESC
            """, config["business_id"])
            print("⚠️ No run_id found, using all spans (legacy data)")

        if not spans:
            return {"error": "No spans found in detected_spans_v2"}

        spans = [dict(s) for s in spans]

        # 1. UNMAPPED rate (overall)
        unmapped = [s for s in spans if s["primitive"] == "UNMAPPED"]
        unmapped_rate = len(unmapped) / len(spans)

        # 2. Top primitives
        primitive_counts = Counter(s["primitive"] for s in spans)

        # 2b. UNMAPPED rate by language, tallied in a single pass
        language_tallies: dict[str, Counter] = {}
        for s in spans:
            lang = s.get("language") or "unknown"
            tally = language_tallies.setdefault(lang, Counter())
            tally["total"] += 1
            if s["primitive"] == "UNMAPPED":
                tally["unmapped"] += 1
            elif s["primitive"] == "NON_INFORMATIVE":
                tally["non_informative"] += 1

        language_stats = {}
        for lang, tally in language_tallies.items():
            content_spans = tally["total"] - tally["non_informative"]
            language_stats[lang] = {
                "total": tally["total"],
                "unmapped": tally["unmapped"],
                "non_informative": tally["non_informative"],
                "content_spans": content_spans,
                "unmapped_rate": tally["unmapped"] / content_spans if content_spans > 0 else 0,
            }

        # 3. Contradiction detection (simple heuristics)
        contradictions = []
        for span in spans:
            span_text = span["span_text"] or ""  # tolerate NULL span_text
            text_lower = span_text.lower()
            valence = span["valence"]

            has_positive = _POSITIVE_CUE_RE.search(text_lower) is not None
            has_negative = _NEGATIVE_CUE_RE.search(text_lower) is not None

            if has_positive and valence == "-":
                contradictions.append({
                    "type": "positive_text_negative_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })
            elif has_negative and valence == "+":
                contradictions.append({
                    "type": "negative_text_positive_valence",
                    "text": span_text[:50],
                    "valence": valence,
                })

        # 4. Config version coverage
        version_counts = Counter(s["config_version"] for s in spans)

        # 5. Confidence distribution (NULL confidences are skipped)
        confidences = [float(s["confidence"]) for s in spans if s["confidence"] is not None]
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
        low_confidence = sum(1 for value in confidences if value < 0.5)

        evaluation = {
            "business_id": config["business_id"],
            "sector_code": config["sector_code"],
            "total_spans": len(spans),
            "unmapped_rate": unmapped_rate,
            "unmapped_count": len(unmapped),
            "top_primitives": primitive_counts.most_common(10),
            "contradiction_count": len(contradictions),
            "contradictions_sample": contradictions[:5],
            "config_versions": dict(version_counts),
            "avg_confidence": avg_confidence,
            "low_confidence_count": low_confidence,
            "language_stats": language_stats,
        }

        # Print report
        print("\n" + "=" * 60)
        print(f"EVALUATION: {config['business_id']}")
        print("=" * 60)

        print("\n📊 METRICS")
        print(f" Total spans: {len(spans)}")
        print(f" UNMAPPED rate: {unmapped_rate:.1%} {'⚠️' if unmapped_rate > 0.2 else '✓'}")
        print(f" Avg confidence: {avg_confidence:.2f}")
        print(f" Low confidence (<0.5): {low_confidence}")

        print("\n🌐 UNMAPPED BY LANGUAGE")
        for lang, stats in sorted(language_stats.items(), key=lambda x: x[1]["total"], reverse=True):
            rate = stats["unmapped_rate"]
            flag = "⚠️" if rate > 0.15 else "✓"
            print(f" {lang}: {stats['unmapped']}/{stats['content_spans']} ({rate:.1%}) {flag}")

        print("\n🔝 TOP PRIMITIVES")
        for prim, count in primitive_counts.most_common(8):
            pct = count / len(spans) * 100
            print(f" {prim}: {count} ({pct:.1f}%)")

        print(f"\n⚠️ CONTRADICTIONS: {len(contradictions)}")
        for c in contradictions[:3]:
            print(f" {c['type']}: \"{c['text']}...\" → {c['valence']}")

        print("\n📋 CONFIG VERSIONS")
        for ver, count in version_counts.items():
            print(f" {ver}: {count} spans")

        print("=" * 60)

        return evaluation

    finally:
        await pool.close()
|
|
|
|
|
|
async def language_analysis(ignore_legacy: bool = False, latest_hours: int | None = None) -> dict[str, Any]:
    """
    Analyze UNMAPPED rates by language across all businesses/sectors.

    Queries pipeline.detected_spans_v2 grouped by (sector_code, language),
    prints a report to stdout and returns the grouped rows.

    Args:
        ignore_legacy: If True, exclude rows with language IN ('auto', 'unknown') or NULL
        latest_hours: If set (and non-zero), only include spans from the last N hours

    Returns:
        {"rows": [...]} with one dict per (sector_code, language) group, or
        {"error": "No data in detected_spans_v2"} when the query matches no
        rows (nothing is printed in that case).

    Raises:
        ValueError, TypeError: If latest_hours cannot be converted to an int.

    This helps determine if multilingual handling needs improvement
    (e.g., translation for non-English reviews).
    """
    # Reporting thresholds: a language/sector pair is flagged when its
    # content-adjusted UNMAPPED rate exceeds 15%; the summary additionally
    # requires at least 20 content spans so tiny samples are not reported.
    adj_rate_threshold = 0.15
    min_content_spans = 20

    pool = await asyncpg.create_pool(DB_URL)

    try:
        # Build WHERE clause based on filters. Caller-supplied values are sent
        # as bind parameters instead of being formatted into the SQL text.
        where_clauses: list[str] = []
        query_args: list[Any] = []
        if ignore_legacy:
            where_clauses.append("language IS NOT NULL AND language NOT IN ('auto', 'unknown')")
        if latest_hours:
            query_args.append(int(latest_hours))
            where_clauses.append(
                f"created_at >= NOW() - (${len(query_args)}::int * INTERVAL '1 hour')"
            )

        where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""

        # Query UNMAPPED rates by language and sector
        rows = await pool.fetch(f"""
            SELECT
                sector_code,
                COALESCE(language, 'unknown') as language,
                COUNT(*) as total_spans,
                COUNT(*) FILTER (WHERE primitive = 'UNMAPPED') as unmapped_count,
                COUNT(*) FILTER (WHERE primitive = 'NON_INFORMATIVE') as non_informative_count,
                AVG(confidence) as avg_confidence
            FROM pipeline.detected_spans_v2
            {where_sql}
            GROUP BY sector_code, COALESCE(language, 'unknown')
            ORDER BY sector_code, total_spans DESC
        """, *query_args)

        if not rows:
            return {"error": "No data in detected_spans_v2"}

        # Build report
        filter_desc = []
        if ignore_legacy:
            filter_desc.append("excluding legacy (auto/unknown)")
        if latest_hours:
            filter_desc.append(f"last {latest_hours}h only")
        filter_str = f" [{', '.join(filter_desc)}]" if filter_desc else ""

        print("\n" + "=" * 80)
        print(f"LANGUAGE ANALYSIS: UNMAPPED Rates by Language & Sector{filter_str}")
        print("=" * 80)
        print(f"{'':14} | {'total':>5} | {'content':>7} | {'unmapped':>8} | {'raw':>6} | {'adj':>6} |")
        print("-" * 80)

        # Private sentinel so that a NULL sector_code group still gets a header.
        current_sector: Any = object()
        high_unmapped: list[tuple[Any, str, float, int]] = []
        total_spans = total_unmapped = total_non_info = 0

        for row in rows:
            sector = row["sector_code"]
            lang = row["language"]
            total = row["total_spans"]
            unmapped = row["unmapped_count"]
            non_info = row["non_informative_count"]
            # "Content" spans are everything not classified NON_INFORMATIVE.
            content = total - non_info
            raw_rate = unmapped / total if total > 0 else 0.0
            adj_rate = unmapped / content if content > 0 else 0.0

            total_spans += total
            total_unmapped += unmapped
            total_non_info += non_info

            # Rows arrive ordered by sector, so a change marks a new section.
            if sector != current_sector:
                current_sector = sector
                print(f"\n📂 {sector}")

            flag = "⚠️" if adj_rate > adj_rate_threshold else "✓"
            print(f" {lang:12} | {total:5} | {content:7} | {unmapped:8} | {raw_rate:5.1%} | {adj_rate:5.1%} | {flag}")

            if content >= min_content_spans and adj_rate > adj_rate_threshold:
                high_unmapped.append((sector, lang, adj_rate, content))

        # Summary
        print("\n" + "-" * 80)
        print("SUMMARY: Languages with high content-adjusted UNMAPPED (>15%, content >= 20)")
        print("-" * 80)

        if high_unmapped:
            for sector, lang, rate, content in high_unmapped:
                print(f" {sector}/{lang}: {rate:.1%} UNMAPPED ({content} content spans)")
            print("\n💡 Consider: Translation for these language/sector combinations")
        else:
            print(" ✓ No language/sector combinations exceed threshold")

        # Totals
        total_content = total_spans - total_non_info

        print(f"\n📊 TOTALS: {total_spans} spans, {total_content} content, {total_unmapped} unmapped")
        # A rate is only printed when its denominator is non-zero.
        if total_spans > 0:
            print(f" Raw UNMAPPED rate: {total_unmapped/total_spans:.1%}")
        if total_content > 0:
            print(f" Content-adjusted UNMAPPED rate: {total_unmapped/total_content:.1%}")

        print("=" * 80)

        return {"rows": [dict(r) for r in rows]}

    finally:
        await pool.close()
|
|
|
|
|
|
def _positive_int(value: str) -> int:
    """argparse ``type=`` callable: parse *value* as an int and require it to be >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def main():
    """
    Command-line entry point.

    Parses the arguments and runs exactly one mode, checked in this order:
    --language-analysis, --evaluate, --business. With none of them given,
    prints the help text followed by usage examples.
    """
    parser = argparse.ArgumentParser(description="Classification run harness V2")
    parser.add_argument("--business", help="Business name or pattern")
    parser.add_argument("--limit", type=int, default=100, help="Max reviews to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't store results")
    parser.add_argument("--evaluate", metavar="BUSINESS", help="Evaluate existing results")
    parser.add_argument("--language-analysis", action="store_true", help="Analyze UNMAPPED by language across all data")
    parser.add_argument("--ignore-legacy-language", action="store_true", help="Exclude rows with language='auto'/'unknown'/NULL")
    # Zero would silently disable the time filter and a negative value would
    # ask for a window in the future, so both are rejected at parse time.
    parser.add_argument("--latest-hours", type=_positive_int, help="Only include spans from last N hours")
    parser.add_argument("--use-existing", action="store_true", help="Use existing spans instead of jobs")
    parser.add_argument("--use-llm", action="store_true", help="Use real LLM classification (requires OPENAI_API_KEY)")
    parser.add_argument("--model", default="gpt-4o-mini", help="Model for LLM classification (default: gpt-4o-mini)")

    args = parser.parse_args()

    if args.language_analysis:
        result = asyncio.run(language_analysis(
            ignore_legacy=args.ignore_legacy_language,
            latest_hours=args.latest_hours,
        ))
        # language_analysis() signals "no matching rows" only through its
        # return value and prints nothing, so surface it here instead of
        # exiting silently.
        if result.get("error"):
            print(f"Error: {result['error']}", file=sys.stderr)
    elif args.evaluate:
        asyncio.run(evaluate_business(args.evaluate))
    elif args.business:
        asyncio.run(run_classification(
            args.business,
            limit=args.limit,
            dry_run=args.dry_run,
            use_jobs=not args.use_existing,
            use_llm=args.use_llm,
            model=args.model,
        ))
    else:
        parser.print_help()
        print("\n\nExamples:")
        print(" # Mock classification (free, for testing)")
        print(" python run_classification_v2.py --business gokarts --limit 50 --dry-run")
        print("")
        print(" # Real LLM classification")
        print(" python run_classification_v2.py --business gokarts --limit 50 --use-llm")
        print("")
        print(" # Evaluate results")
        print(" python run_classification_v2.py --evaluate gokarts")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script;
# importing the module must not trigger a classification run.
if __name__ == "__main__":
    main()
|