diff --git a/.artifacts/LLM-Classification-Contract-v1.md b/.artifacts/LLM-Classification-Contract-v1.md new file mode 100644 index 0000000..239de46 --- /dev/null +++ b/.artifacts/LLM-Classification-Contract-v1.md @@ -0,0 +1,754 @@ +# LLM Classification Contract v1.0 + +**Purpose**: Define the prompt, output schema, and validation rules for span-level URT classification. +**Target Model**: Claude 3.5 Sonnet / GPT-4o (structured output mode) +**Date**: 2026-01-24 + +--- + +## 1. Overview + +The LLM receives a single review text and returns an array of **spans** — semantically distinct units of feedback. Each span is independently classified using URT v5.1. + +**Pipeline position**: +``` +reviews_raw.text → LLM → spans[] → review_spans table +``` + +--- + +## 2. System Prompt + +``` +You are a review classification system using URT (Universal Review Taxonomy) v5.1. + +Your task is to extract semantic spans from customer reviews and classify each span independently. + +## SPAN EXTRACTION RULES + +1. **Split on contrasting conjunctions**: but, however, although, despite, yet, though +2. **Split on topic/target change**: food → service → bathroom = 3 spans +3. **Split on valence change**: positive → negative = split +4. **Split on domain change**: O (Offering) → J (Journey) → E (Environment) = split +5. 
**Keep together**: cause→effect within same feedback unit ("X because Y" = 1 span) + +**Guardrails**: +- Max 3 spans per sentence (if 4+, re-check for over-splitting) +- Min 1 span per review (even single-word reviews) +- Spans must be non-overlapping and cover meaningful content + +## URT DOMAINS (Tier-3 codes: X#.##) + +| Domain | Code | Description | +|--------|------|-------------| +| Offering | O1-O4 | Product/service quality, features, variety | +| Price | P1-P4 | Value, pricing, promotions, payment | +| Journey | J1-J4 | Timing, process, convenience, accessibility | +| Environment | E1-E4 | Physical space, ambiance, cleanliness, digital UX | +| Attitude | A1-A4 | Staff behavior, helpfulness, professionalism | +| Voice | V1-V4 | Brand, communication, marketing, transparency | +| Relationship | R1-R4 | Loyalty, trust, consistency, personalization | + +## DIMENSION CODES + +### Valence +- V+ : Positive sentiment +- V- : Negative sentiment +- V0 : Neutral/factual +- V± : Mixed within the span + +### Intensity +- I1 : Low ("okay", "fine", "decent") +- I2 : Moderate ("good", "bad", "slow") +- I3 : High ("amazing", "terrible", "unacceptable") + +### Specificity +- S1 : Vague ("it was bad") +- S2 : Some detail ("the food was cold") +- S3 : Precise ("waited 45 minutes for appetizers") + +### Actionability +- A1 : No clear action possible +- A2 : Possible actions, unclear which +- A3 : Clear, specific action ("train staff on X", "fix Y") + +### Temporal +- TC : Current visit (default when no markers) +- TR : Recent pattern ("lately", "recently", "again") +- TH : Historical ("for years", "always", "used to") +- TF : Future ("won't return", "next time", "I expect") + +### Evidence +- ES : Stated explicitly in text (default) +- EI : Inferred logically (not stated, but entailed) +- EC : Contextual (depends on surrounding text) + +### Comparative +- CR-N : No comparison (default) +- CR-B : Better than alternatives +- CR-W : Worse than alternatives +- CR-S : Same as 
alternatives + +## PRIMARY SPAN SELECTION + +Mark exactly ONE span as is_primary=true using this order: +1. Highest intensity (I3 > I2 > I1) +2. Tie-break: negative over positive (V- > V± > V0 > V+) +3. Tie-break: earliest span_index + +## USN (URT String Notation) + +Generate a USN string for each span: +~~~ +URT:S:{primary}[+{sec1}][+{sec2}]:{valence_sign}{intensity_num}:{S#}{A#}{temporal}.{evidence}.{CR_suffix} +~~~ + +Examples: +- `URT:S:J1.03:-2:22TC.ES.N` (J1.03, V-, I2, S2, A2, TC, ES, CR-N) +- `URT:S:P1.01+O2.03:+3:33TR.ES.B` (P1.01 primary, O2.03 secondary, V+, I3, S3, A3, TR, ES, CR-B) + +Valence encoding: + for V+, - for V-, 0 for V0, ± for V± +CR suffix: N=CR-N, B=CR-B, W=CR-W, S=CR-S + +## OUTPUT FORMAT + +Return valid JSON matching the schema exactly. No markdown, no explanations. +``` + +--- + +## 3. Output JSON Schema + +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "URT Span Extraction Response", + "type": "object", + "required": ["spans", "review_summary"], + "additionalProperties": false, + "properties": { + "spans": { + "type": "array", + "minItems": 1, + "maxItems": 15, + "items": { + "type": "object", + "required": [ + "span_index", + "span_text", + "span_start", + "span_end", + "urt_primary", + "urt_secondary", + "valence", + "intensity", + "specificity", + "actionability", + "temporal", + "evidence", + "comparative", + "is_primary", + "usn" + ], + "additionalProperties": false, + "properties": { + "span_index": { + "type": "integer", + "minimum": 0, + "description": "0-based position in review" + }, + "span_text": { + "type": "string", + "minLength": 1, + "description": "Exact text extracted from review" + }, + "span_start": { + "type": "integer", + "minimum": 0, + "description": "Character offset start (0-indexed)" + }, + "span_end": { + "type": "integer", + "minimum": 1, + "description": "Character offset end (exclusive)" + }, + "urt_primary": { + "type": "string", + "pattern": 
"^[OPJEAVR][1-4]\\.[0-9]{2}$", + "description": "Primary URT Tier-3 code" + }, + "urt_secondary": { + "type": "array", + "maxItems": 2, + "items": { + "type": "string", + "pattern": "^[OPJEAVR][1-4]\\.[0-9]{2}$" + }, + "description": "Secondary codes (max 2, different domains preferred)" + }, + "valence": { + "type": "string", + "enum": ["V+", "V-", "V0", "V±"] + }, + "intensity": { + "type": "string", + "enum": ["I1", "I2", "I3"] + }, + "specificity": { + "type": "string", + "enum": ["S1", "S2", "S3"] + }, + "actionability": { + "type": "string", + "enum": ["A1", "A2", "A3"] + }, + "temporal": { + "type": "string", + "enum": ["TC", "TR", "TH", "TF"] + }, + "evidence": { + "type": "string", + "enum": ["ES", "EI", "EC"] + }, + "comparative": { + "type": "string", + "enum": ["CR-N", "CR-B", "CR-W", "CR-S"] + }, + "is_primary": { + "type": "boolean", + "description": "True for exactly one span per review" + }, + "confidence": { + "type": "string", + "enum": ["high", "medium", "low"], + "default": "medium" + }, + "entity": { + "type": ["string", "null"], + "description": "Named entity if present (staff name, product, location)" + }, + "entity_type": { + "type": ["string", "null"], + "enum": ["location", "staff", "product", "process", "time", "other", null] + }, + "relation_type": { + "type": ["string", "null"], + "enum": ["cause_of", "effect_of", "contrast", "resolution", null], + "description": "Relationship to another span in this review" + }, + "related_span_index": { + "type": ["integer", "null"], + "minimum": 0, + "description": "Index of related span (must be different from this span)" + }, + "usn": { + "type": "string", + "pattern": "^URT:S:[OPJEAVR][1-4]\\.[0-9]{2}", + "description": "URT String Notation for audit" + } + } + } + }, + "review_summary": { + "type": "object", + "required": ["dominant_valence", "dominant_domain", "span_count"], + "properties": { + "dominant_valence": { + "type": "string", + "enum": ["V+", "V-", "V0", "V±"] + }, + "dominant_domain": 
{ + "type": "string", + "pattern": "^[OPJEAVR]$" + }, + "span_count": { + "type": "integer", + "minimum": 1 + }, + "has_comparative": { + "type": "boolean" + }, + "has_entity": { + "type": "boolean" + } + } + } + } +} +``` + +--- + +## 4. Few-Shot Examples + +### Example 1: Multi-span review (3 spans) + +**Input**: +```json +{ + "review_text": "The food was incredible, best steak I've ever had. But we waited 45 minutes for a table even with a reservation. The manager comped our drinks which was nice.", + "profile": "standard" +} +``` + +**Output**: +```json +{ + "spans": [ + { + "span_index": 0, + "span_text": "The food was incredible, best steak I've ever had.", + "span_start": 0, + "span_end": 50, + "urt_primary": "O1.01", + "urt_secondary": [], + "valence": "V+", + "intensity": "I3", + "specificity": "S2", + "actionability": "A1", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-B", + "is_primary": false, + "confidence": "high", + "entity": "steak", + "entity_type": "product", + "relation_type": null, + "related_span_index": null, + "usn": "URT:S:O1.01:+3:21TC.ES.B" + }, + { + "span_index": 1, + "span_text": "But we waited 45 minutes for a table even with a reservation.", + "span_start": 51, + "span_end": 112, + "urt_primary": "J1.02", + "urt_secondary": [], + "valence": "V-", + "intensity": "I3", + "specificity": "S3", + "actionability": "A3", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-N", + "is_primary": true, + "confidence": "high", + "entity": null, + "entity_type": null, + "relation_type": null, + "related_span_index": null, + "usn": "URT:S:J1.02:-3:33TC.ES.N" + }, + { + "span_index": 2, + "span_text": "The manager comped our drinks which was nice.", + "span_start": 113, + "span_end": 158, + "urt_primary": "A2.01", + "urt_secondary": [], + "valence": "V+", + "intensity": "I2", + "specificity": "S2", + "actionability": "A1", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-N", + "is_primary": false, + "confidence": 
"high", + "entity": "manager", + "entity_type": "staff", + "relation_type": "resolution", + "related_span_index": 1, + "usn": "URT:S:A2.01:+2:21TC.ES.N" + } + ], + "review_summary": { + "dominant_valence": "V±", + "dominant_domain": "J", + "span_count": 3, + "has_comparative": true, + "has_entity": true + } +} +``` + +**Why this classification**: +- Span 0: Food quality (O1.01), superlative language (I3), "best ever" is comparative (CR-B) +- Span 1: Wait time (J1.02), specific time (S3), actionable (A3), negative + I3 = primary +- Span 2: Staff action (A2.01), resolution of span 1's problem + +--- + +### Example 2: Single-span review + +**Input**: +```json +{ + "review_text": "Great place!", + "profile": "standard" +} +``` + +**Output**: +```json +{ + "spans": [ + { + "span_index": 0, + "span_text": "Great place!", + "span_start": 0, + "span_end": 12, + "urt_primary": "O1.01", + "urt_secondary": [], + "valence": "V+", + "intensity": "I2", + "specificity": "S1", + "actionability": "A1", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-N", + "is_primary": true, + "confidence": "medium", + "entity": null, + "entity_type": null, + "relation_type": null, + "related_span_index": null, + "usn": "URT:S:O1.01:+2:11TC.ES.N" + } + ], + "review_summary": { + "dominant_valence": "V+", + "dominant_domain": "O", + "span_count": 1, + "has_comparative": false, + "has_entity": false + } +} +``` + +**Why this classification**: +- Vague positive (S1, A1), default to O1.01 for general positive +- Lower confidence due to lack of specificity + +--- + +### Example 3: Historical + future temporal markers + +**Input**: +```json +{ + "review_text": "I've been coming here for years and the quality has always been consistent. 
Won't be back after today though - they've really gone downhill.", + "profile": "standard" +} +``` + +**Output**: +```json +{ + "spans": [ + { + "span_index": 0, + "span_text": "I've been coming here for years and the quality has always been consistent.", + "span_start": 0, + "span_end": 75, + "urt_primary": "R1.01", + "urt_secondary": ["O1.01"], + "valence": "V+", + "intensity": "I2", + "specificity": "S2", + "actionability": "A1", + "temporal": "TH", + "evidence": "ES", + "comparative": "CR-N", + "is_primary": false, + "confidence": "high", + "entity": null, + "entity_type": null, + "relation_type": "contrast", + "related_span_index": 1, + "usn": "URT:S:R1.01+O1.01:+2:21TH.ES.N" + }, + { + "span_index": 1, + "span_text": "Won't be back after today though - they've really gone downhill.", + "span_start": 76, + "span_end": 140, + "urt_primary": "R1.02", + "urt_secondary": [], + "valence": "V-", + "intensity": "I3", + "specificity": "S1", + "actionability": "A1", + "temporal": "TF", + "evidence": "ES", + "comparative": "CR-N", + "is_primary": true, + "confidence": "high", + "entity": null, + "entity_type": null, + "relation_type": "contrast", + "related_span_index": 0, + "usn": "URT:S:R1.02:-3:11TF.ES.N" + } + ], + "review_summary": { + "dominant_valence": "V-", + "dominant_domain": "R", + "span_count": 2, + "has_comparative": false, + "has_entity": false + } +} +``` + +**Why this classification**: +- Span 0: Historical loyalty (TH), secondary O1.01 for quality mention +- Span 1: Future intent (TF), I3 due to "really" + finality of "won't be back" +- Mutual contrast relationship + +--- + +### Example 4: Comparative review + +**Input**: +```json +{ + "review_text": "Way better than the other coffee shops in the area. 
Their lattes are smoother and the prices are actually reasonable.", + "profile": "standard" +} +``` + +**Output**: +```json +{ + "spans": [ + { + "span_index": 0, + "span_text": "Way better than the other coffee shops in the area.", + "span_start": 0, + "span_end": 51, + "urt_primary": "O1.01", + "urt_secondary": [], + "valence": "V+", + "intensity": "I3", + "specificity": "S1", + "actionability": "A1", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-B", + "is_primary": true, + "confidence": "high", + "entity": null, + "entity_type": null, + "relation_type": null, + "related_span_index": null, + "usn": "URT:S:O1.01:+3:11TC.ES.B" + }, + { + "span_index": 1, + "span_text": "Their lattes are smoother", + "span_start": 52, + "span_end": 77, + "urt_primary": "O1.02", + "urt_secondary": [], + "valence": "V+", + "intensity": "I2", + "specificity": "S2", + "actionability": "A1", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-B", + "is_primary": false, + "confidence": "high", + "entity": "lattes", + "entity_type": "product", + "relation_type": null, + "related_span_index": null, + "usn": "URT:S:O1.02:+2:21TC.ES.B" + }, + { + "span_index": 2, + "span_text": "and the prices are actually reasonable.", + "span_start": 78, + "span_end": 117, + "urt_primary": "P1.01", + "urt_secondary": [], + "valence": "V+", + "intensity": "I2", + "specificity": "S2", + "actionability": "A1", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-B", + "is_primary": false, + "confidence": "high", + "entity": null, + "entity_type": null, + "relation_type": null, + "related_span_index": null, + "usn": "URT:S:P1.01:+2:21TC.ES.B" + } + ], + "review_summary": { + "dominant_valence": "V+", + "dominant_domain": "O", + "span_count": 3, + "has_comparative": true, + "has_entity": true + } +} +``` + +--- + +## 5. 
Validation Rules + +### 5.1 Structural Validation (pre-insert) + +| Rule | Check | Error | +|------|-------|-------| +| Span count | `1 <= spans.length <= 15` | INVALID_SPAN_COUNT | +| Exactly one primary | `spans.filter(s => s.is_primary).length === 1` | INVALID_PRIMARY_COUNT | +| Contiguous indices | `spans[i].span_index === i` for all i | NON_CONTIGUOUS_INDEX | +| Non-overlapping | `spans[i].span_end <= spans[i+1].span_start` | OVERLAPPING_SPANS | +| Valid offsets | `span_end > span_start && span_start >= 0` | INVALID_OFFSETS | +| Text matches | `review_text.slice(span_start, span_end) ~= span_text` | TEXT_MISMATCH | +| USN format | Matches regex for profile | INVALID_USN | +| Self-reference | `related_span_index !== span_index` (when not null) | SELF_REFERENCE | +| Related exists | `related_span_index < spans.length` (when not null) | INVALID_RELATION | + +### 5.2 Semantic Validation (warnings, not errors) + +| Rule | Check | Warning | +|------|-------|---------| +| Secondary domain | Secondary codes should differ from primary domain | SAME_DOMAIN_SECONDARY | +| Over-splitting | More than 3 spans per sentence | POSSIBLE_OVERSPLIT | +| Intensity/valence match | I3 + V0 is unusual | UNUSUAL_INTENSITY_VALENCE | +| Specificity/actionability | S1 + A3 is rare | UNUSUAL_SPEC_ACTION | + +### 5.3 Text Matching Rules + +Allow normalization: +- Whitespace collapse: any run of whitespace (spaces, tabs, newlines) → single space +- Trim: leading/trailing whitespace +- Case: must match exactly (no case normalization) + +```python +def text_matches(review_text: str, span: dict) -> bool: + expected = review_text[span['span_start']:span['span_end']] + actual = span['span_text'] + + # Normalize whitespace + expected_norm = ' '.join(expected.split()) + actual_norm = ' '.join(actual.split()) + + return expected_norm == actual_norm +``` + +--- + +## 6. 
Error Handling + +### 6.1 Retry Strategy + +| Error Type | Action | +|------------|--------| +| JSON parse error | Retry with "Return ONLY valid JSON" appended | +| Schema validation error | Retry with specific field errors in prompt | +| Offset mismatch | Retry with "Offsets must match exactly" warning | +| No primary span | Auto-select using primary selection rules | +| Multiple primary spans | Keep first by selection rules, unset others | + +### 6.2 Fallback Behavior + +If after 3 retries the LLM still fails: + +```python +def fallback_single_span(review_text: str) -> dict: + """Create minimal valid response for failed classification.""" + return { + "spans": [{ + "span_index": 0, + "span_text": review_text, + "span_start": 0, + "span_end": len(review_text), + "urt_primary": "O1.01", # Default: general offering + "urt_secondary": [], + "valence": "V0", # Neutral - we don't know + "intensity": "I1", + "specificity": "S1", + "actionability": "A1", + "temporal": "TC", + "evidence": "ES", + "comparative": "CR-N", + "is_primary": True, + "confidence": "low", + "entity": None, + "entity_type": None, + "relation_type": None, + "related_span_index": None, + "usn": "URT:S:O1.01:01:11TC.ES.N" + }], + "review_summary": { + "dominant_valence": "V0", + "dominant_domain": "O", + "span_count": 1, + "has_comparative": False, + "has_entity": False + }, + "_fallback": True, + "_error": "Classification failed after 3 retries" + } +``` + +--- + +## 7. 
Performance Considerations + +### 7.1 Prompt Token Budget + +| Component | Tokens (approx) | +|-----------|-----------------| +| System prompt | ~800 | +| Schema | ~400 | +| 4 few-shot examples | ~1,200 | +| Average review input | ~100 | +| **Total input** | ~2,500 | +| Average output | ~300-800 | + +### 7.2 Batching + +For high-volume processing, consider: +- Batch 5-10 short reviews per request +- Use `review_id` field in input/output for correlation (note: the v1.0 schema in Section 3 is single-review and sets `additionalProperties: false`, so batching needs a wrapper keyed by `review_id`) +- Validate each review's spans independently + +### 7.3 Caching + +Cache key: `sha256(review_text + model_version + prompt_version)` + +Invalidate on: +- Model version change +- Prompt version change +- URT code taxonomy change + +--- + +## 8. Version History + +| Version | Date | Changes | +|---------|------|---------| +| 1.0 | 2026-01-24 | Initial contract for URT-Standard profile | + +--- + +## 9. Future Extensions (v2.0) + +- **Full profile support**: Add `causal_chain` to output schema +- **Confidence calibration**: Train confidence based on validation results +- **Entity linking**: Link entities across reviews for trend detection +- **Multi-language**: Add language detection and localized prompts + +--- + +*End of LLM Classification Contract v1.0*