feat(reviewiq): Add AI synthesis support to dashboard components

Frontend:
- Add Synthesis type with action plan, insights, annotations
- ExecutiveSummary: Accept synthesis prop for AI narrative
- SentimentPie: Accept insight prop for contextual explanation
- IntensityHeatmap: Accept insight + highlightDomain props
- TimelineChart: Accept insight + annotations props
- All components gracefully degrade when synthesis is null

Backend:
- Add Stage 4: Synthesize for generating AI narratives
- Gathers context from classified spans
- Generates executive narrative, section insights, action plan
- Produces timeline annotations and marketing angles
- Stores synthesis in pipeline.executions table

Components show AI insights with purple gradient styling when available and fall back to existing behavior when synthesis is not yet generated.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 02:59:47 +00:00
parent 8f9dd136cd
commit c8ecb4b98f
21 changed files with 3959 additions and 90 deletions
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_synthesize.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage4_synthesize.py
@@ -0,0 +1,477 @@
+"""
+Stage 4: Synthesize - Generate AI narratives and action plans.
+
+This stage runs after classification and routing to produce:
+- Executive narrative (business-specific story)
+- Section insights (sentiment, category, timeline)
+- Action plan with prioritized recommendations
+- Timeline annotations for key events
+- Marketing angles from strengths
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import asdict, dataclass, field
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    import asyncpg
+
+from reviewiq_pipeline.services.llm_client import LLMClientBase
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ActionItem:
+    """One recommended action in the synthesis action plan.
+
+    Built by ``SynthesisStage._generate_synthesis`` from one entry of the LLM
+    reply's ``action_plan`` array; the field meanings below follow the wording
+    of ``SYNTHESIS_SYSTEM_PROMPT``. The class itself performs no validation.
+    """
+    # Identifier of the action; "action_<index>" is used when the LLM omits it.
+    id: str
+    # Short title of the action.
+    title: str
+    # Root cause, as drawn from the reviews.
+    why: str
+    # Specific steps to take.
+    what: str
+    # Department or role responsible.
+    who: str
+    # Expected outcome.
+    impact: str
+    # Supporting review quotes.
+    evidence: list[str]
+    # Estimated rating improvement (the prompt's example value is 0.3);
+    # None when the LLM does not provide one.
+    estimated_rating_lift: float | None
+    complexity: str  # 'quick' | 'medium' | 'complex'
+    priority: str    # 'critical' | 'high' | 'medium' | 'low'
+    # Time frame; the prompt suggests "This week", "This month" or "This quarter".
+    timeline: str
+    # Subcode the action relates to (e.g. "V1.03"); empty string when absent.
+    related_subcode: str
+
+
+@dataclass
+class TimelineAnnotation:
+    """A labelled marker for a key event on the review timeline.
+
+    Built by ``SynthesisStage._generate_synthesis`` from one entry of the LLM
+    reply's ``timeline_annotations`` array. The class performs no validation.
+    """
+    # Date of the event as a string; the prompt's example is "2024-01-15".
+    date: str
+    # Short label for the marker.
+    label: str
+    # Longer description of what happened.
+    description: str
+    type: str  # 'positive' | 'negative' | 'neutral' | 'event'
+
+
+@dataclass
+class Synthesis:
+    """Complete synthesis output from Stage 4.
+
+    Returned by ``SynthesisStage.run`` and serialized to JSON by
+    ``SynthesisStage._store_synthesis`` with one key per field below.
+    """
+    # Multi-paragraph story of the business situation (prompt asks for 2-3 paragraphs).
+    executive_narrative: str
+    # Short explanation of why sentiment is distributed the way it is.
+    sentiment_insight: str
+    # Short note on the pattern across categories/domains.
+    category_insight: str
+    # Short note on trends over time.
+    timeline_insight: str
+    # Single-letter domain code needing most attention, or None.
+    priority_domain: str | None
+    # Subcode (e.g. "V1.03") that should be fixed first, or None.
+    priority_issue: str | None
+    # Recommended actions, in the order the LLM returned them.
+    action_plan: list[ActionItem]
+    # NOTE(review): always an empty dict in this file; the intended key/value
+    # meaning is not established here — confirm before relying on it.
+    issue_actions: dict[str, str]
+    # Markers for key events on the timeline.
+    timeline_annotations: list[TimelineAnnotation]
+    # Suggestions for promoting the business's strengths.
+    marketing_angles: list[str]
+    # Comparison with industry/competitors, or None when unknown.
+    competitor_context: str | None
+    # ISO-8601 timestamp string recording when this synthesis was created.
+    generated_at: str
+
+
+# System prompt for the Stage 4 LLM call. It fixes the JSON shape that
+# SynthesisStage._generate_synthesis parses: each top-level key requested here
+# is read back by name from the parsed reply, so renaming a key in this text
+# silently turns the corresponding Synthesis field into its default value.
+# NOTE(review): the prompt requests dated "timeline_annotations", but the user
+# prompt assembled in _generate_synthesis carries no per-date data — confirm
+# that the model is not expected to invent these dates.
+SYNTHESIS_SYSTEM_PROMPT = """You are an expert business analyst specializing in customer experience and review analysis.
+
+Your task is to analyze classified review data and generate actionable business insights.
+
+You will receive:
+1. Summary statistics (total reviews, rating, sentiment distribution)
+2. Top issues by category with example quotes
+3. Top strengths with example quotes
+4. Domain breakdown (what customers talk about most)
+
+Generate a JSON response with these fields:
+
+{
+  "executive_narrative": "2-3 paragraph story explaining the business situation, key problems, and path forward. Be specific with numbers and examples.",
+
+  "sentiment_insight": "1-2 sentences explaining WHY sentiment is distributed this way. Connect to specific issues.",
+
+  "category_insight": "1-2 sentences about the pattern in categories. Which domain needs most attention and why?",
+
+  "timeline_insight": "1-2 sentences about trends if data shows changes over time.",
+
+  "priority_domain": "Single letter code (P/V/J/O/A/E/R) for the domain needing most attention, or null",
+
+  "priority_issue": "The subcode (e.g., 'V1.03') that should be fixed first, or null",
+
+  "action_plan": [
+    {
+      "id": "action_1",
+      "title": "Clear action title",
+      "why": "Root cause from the reviews",
+      "what": "Specific steps to take",
+      "who": "Department or role responsible",
+      "impact": "Expected outcome",
+      "evidence": ["Quote 1", "Quote 2"],
+      "estimated_rating_lift": 0.3,
+      "complexity": "quick|medium|complex",
+      "priority": "critical|high|medium|low",
+      "timeline": "This week|This month|This quarter",
+      "related_subcode": "V1.03"
+    }
+  ],
+
+  "timeline_annotations": [
+    {
+      "date": "2024-01-15",
+      "label": "Short label",
+      "description": "What happened",
+      "type": "positive|negative|neutral|event"
+    }
+  ],
+
+  "marketing_angles": [
+    "Way to promote strength 1",
+    "Way to promote strength 2"
+  ],
+
+  "competitor_context": "How this compares to industry/competitors, or null if unknown"
+}
+
+Be specific, actionable, and business-focused. Use actual numbers and quotes from the data.
+Prioritize actions by impact and feasibility.
+"""
+
+
+class SynthesisStage:
+    """
+    Stage 4: Generate AI synthesis from classified review data.
+
+    This stage:
+    1. Aggregates classification results
+    2. Identifies patterns and priorities
+    3. Generates narrative insights via LLM
+    4. Produces actionable recommendations
+
+    The LLM step is best-effort: any failure while calling the model or
+    interpreting its reply produces a minimal fallback synthesis instead of
+    raising. Database errors are not caught and propagate to the caller.
+    """
+
+    # Human-readable labels for the valence codes stored on spans.
+    _VALENCE_LABELS = {"V+": "Positive", "V-": "Negative", "V0": "Neutral", "V±": "Mixed"}
+
+    # Closed vocabularies requested in SYNTHESIS_SYSTEM_PROMPT. Replies outside
+    # these sets are replaced by a default so consumers only see known values.
+    _COMPLEXITIES = ("quick", "medium", "complex")
+    _PRIORITIES = ("critical", "high", "medium", "low")
+    _ANNOTATION_TYPES = ("positive", "negative", "neutral", "event")
+
+    # Limits that keep example quotes from bloating the prompt.
+    _QUOTES_PER_ENTRY = 2
+    _MAX_QUOTE_CHARS = 100
+
+    def __init__(self, pool: asyncpg.Pool, llm_client: LLMClientBase):
+        """
+        Args:
+            pool: Connection pool used for all queries and the final update.
+            llm_client: Client whose ``generate`` coroutine produces the reply.
+        """
+        self.pool = pool
+        self.llm_client = llm_client
+
+    async def run(self, job_id: str, execution_id: str) -> Synthesis:
+        """
+        Generate synthesis for a completed pipeline execution.
+
+        Args:
+            job_id: The scraping job ID
+            execution_id: The pipeline execution ID
+
+        Returns:
+            Synthesis object with all generated insights. When the LLM step
+            fails this is the fallback synthesis, which is stored as well.
+        """
+        logger.info("Stage 4: Generating synthesis for job %s", job_id)
+
+        context = await self._gather_context(job_id)
+        synthesis = await self._generate_synthesis(context)
+        await self._store_synthesis(execution_id, synthesis)
+
+        logger.info(
+            "Stage 4: Synthesis complete - %d actions generated",
+            len(synthesis.action_plan),
+        )
+        return synthesis
+
+    async def _gather_context(self, job_id: str) -> dict[str, Any]:
+        """Gather all context needed for synthesis.
+
+        Returns:
+            Dict with keys ``business_name``, ``overview``, ``sentiment``,
+            ``top_issues``, ``top_strengths`` and ``domains``; query rows are
+            converted to plain dicts.
+        """
+
+        # Overview stats. The average rating is computed over reviews alone:
+        # averaging across a reviews-to-spans join would count each review once
+        # per span and skew the result towards reviews with many spans.
+        overview = await self.pool.fetchrow("""
+            WITH job_reviews AS (
+                SELECT review_id, rating
+                FROM reviews
+                WHERE job_id = $1
+            )
+            SELECT
+                (SELECT COUNT(DISTINCT review_id) FROM job_reviews) as total_reviews,
+                (SELECT AVG(rating) FROM job_reviews) as avg_rating,
+                (
+                    SELECT COUNT(s.span_id)
+                    FROM job_reviews r
+                    JOIN pipeline.spans s ON s.source_review_id = r.review_id
+                ) as total_spans
+        """, job_id)
+
+        # Sentiment distribution
+        sentiment = await self.pool.fetch("""
+            SELECT
+                valence,
+                COUNT(*) as count,
+                COUNT(DISTINCT source_review_id) as review_count
+            FROM pipeline.spans
+            WHERE job_id = $1 AND valence IS NOT NULL
+            GROUP BY valence
+            ORDER BY count DESC
+        """, job_id)
+
+        # Top issues (weaknesses)
+        top_issues = await self.pool.fetch("""
+            SELECT
+                s.urt_primary as subcode,
+                sc.name as subcode_name,
+                sc.definition,
+                d.code as domain,
+                d.name as domain_name,
+                COUNT(*) as span_count,
+                COUNT(*) FILTER (WHERE s.valence = 'V-') as negative_count,
+                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V-') as example_quotes
+            FROM pipeline.spans s
+            JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
+            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
+            WHERE s.job_id = $1 AND s.valence = 'V-'
+            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
+            ORDER BY negative_count DESC
+            LIMIT 10
+        """, job_id)
+
+        # Top strengths
+        top_strengths = await self.pool.fetch("""
+            SELECT
+                s.urt_primary as subcode,
+                sc.name as subcode_name,
+                sc.definition,
+                d.code as domain,
+                d.name as domain_name,
+                COUNT(*) as span_count,
+                COUNT(*) FILTER (WHERE s.valence = 'V+') as positive_count,
+                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V+') as example_quotes
+            FROM pipeline.spans s
+            JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
+            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
+            WHERE s.job_id = $1 AND s.valence = 'V+'
+            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
+            ORDER BY positive_count DESC
+            LIMIT 5
+        """, job_id)
+
+        # Domain distribution
+        domains = await self.pool.fetch("""
+            SELECT
+                SUBSTRING(urt_primary, 1, 1) as domain,
+                d.name as domain_name,
+                COUNT(*) as total_count,
+                COUNT(*) FILTER (WHERE valence = 'V+') as positive_count,
+                COUNT(*) FILTER (WHERE valence = 'V-') as negative_count
+            FROM pipeline.spans s
+            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
+            WHERE s.job_id = $1
+            GROUP BY SUBSTRING(urt_primary, 1, 1), d.name
+            ORDER BY total_count DESC
+        """, job_id)
+
+        # Business name if available
+        business = await self.pool.fetchrow("""
+            SELECT DISTINCT business_name
+            FROM reviews
+            WHERE job_id = $1 AND business_name IS NOT NULL
+            LIMIT 1
+        """, job_id)
+
+        return {
+            "business_name": business["business_name"] if business else "This business",
+            "overview": dict(overview) if overview else {},
+            "sentiment": [dict(r) for r in sentiment],
+            "top_issues": [dict(r) for r in top_issues],
+            "top_strengths": [dict(r) for r in top_strengths],
+            "domains": [dict(r) for r in domains],
+        }
+
+    async def _generate_synthesis(self, context: dict[str, Any]) -> Synthesis:
+        """Generate synthesis using LLM.
+
+        Args:
+            context: The dict produced by ``_gather_context``.
+
+        Returns:
+            The parsed synthesis, or the fallback synthesis when the LLM call
+            fails or its reply cannot be interpreted as a JSON object.
+        """
+        user_prompt = self._build_user_prompt(context)
+
+        try:
+            response = await self.llm_client.generate(
+                system_prompt=SYNTHESIS_SYSTEM_PROMPT,
+                user_prompt=user_prompt,
+                temperature=0.7,  # Allow some creativity
+                max_tokens=4000,
+            )
+            result = self._parse_llm_json(response)
+            if not isinstance(result, dict):
+                raise ValueError(
+                    f"expected a JSON object, got {type(result).__name__}"
+                )
+            return self._synthesis_from_result(result)
+        except json.JSONDecodeError as e:
+            logger.error("Failed to parse LLM response: %s", e)
+            return self._create_fallback_synthesis()
+        except Exception:
+            # Deliberate best-effort boundary: the pipeline should still finish
+            # with a fallback synthesis. Log the traceback so the cause is kept.
+            logger.exception("Synthesis generation failed")
+            return self._create_fallback_synthesis()
+
+    def _build_user_prompt(self, context: dict[str, Any]) -> str:
+        """Render the gathered context as the user prompt sent to the LLM."""
+        overview = context["overview"]
+
+        # The key is present with a NULL value when a job has no reviews, so a
+        # .get() default would never apply; test for None explicitly.
+        avg_rating = overview.get("avg_rating")
+        average_rating = f"{float(avg_rating):.2f}" if avg_rating is not None else "N/A"
+
+        sections = [
+            f"Analyze this review data for {context['business_name']}:",
+            "",
+            "## Overview",
+            f"- Total Reviews: {overview.get('total_reviews') or 0}",
+            f"- Average Rating: {average_rating}",
+            f"- Total Insights Extracted: {overview.get('total_spans') or 0}",
+            "",
+            "## Sentiment Distribution",
+            self._format_sentiment(context["sentiment"]),
+            "",
+            "## Top Issues (Problems)",
+            self._format_issues(context["top_issues"]),
+            "",
+            "## Top Strengths",
+            self._format_strengths(context["top_strengths"]),
+            "",
+            "## Domain Breakdown",
+            self._format_domains(context["domains"]),
+            "",
+            "Generate a complete synthesis with actionable insights.",
+            "",
+        ]
+        return "\n".join(sections)
+
+    @staticmethod
+    def _parse_llm_json(response: Any) -> Any:
+        """Parse the LLM reply as JSON, tolerating code fences or surrounding prose.
+
+        The reply is parsed as-is first. Only if that fails is the text between
+        the first ``{`` and the last ``}`` tried.
+
+        Raises:
+            json.JSONDecodeError: If no parseable JSON can be found.
+        """
+        text = response.strip() if isinstance(response, str) else response
+        try:
+            return json.loads(text)
+        except json.JSONDecodeError:
+            if not isinstance(text, str):
+                raise
+            start, end = text.find("{"), text.rfind("}")
+            if start == -1 or end <= start:
+                raise
+            return json.loads(text[start:end + 1])
+
+    def _synthesis_from_result(self, result: dict[str, Any]) -> Synthesis:
+        """Convert the parsed LLM reply into a ``Synthesis``, defaulting bad fields."""
+        action_plan = [
+            ActionItem(
+                id=self._text(a.get("id"), f"action_{i}"),
+                title=self._text(a.get("title")),
+                why=self._text(a.get("why")),
+                what=self._text(a.get("what")),
+                who=self._text(a.get("who")),
+                impact=self._text(a.get("impact")),
+                evidence=self._as_str_list(a.get("evidence")),
+                estimated_rating_lift=self._as_float(a.get("estimated_rating_lift")),
+                complexity=self._choice(a.get("complexity"), self._COMPLEXITIES, "medium"),
+                priority=self._choice(a.get("priority"), self._PRIORITIES, "medium"),
+                timeline=self._text(a.get("timeline"), "This month"),
+                related_subcode=self._text(a.get("related_subcode")),
+            )
+            for i, a in enumerate(self._dict_items(result.get("action_plan")))
+        ]
+        timeline_annotations = [
+            TimelineAnnotation(
+                date=self._text(t.get("date")),
+                label=self._text(t.get("label")),
+                description=self._text(t.get("description")),
+                type=self._choice(t.get("type"), self._ANNOTATION_TYPES, "neutral"),
+            )
+            for t in self._dict_items(result.get("timeline_annotations"))
+        ]
+        return Synthesis(
+            executive_narrative=self._text(result.get("executive_narrative")),
+            sentiment_insight=self._text(result.get("sentiment_insight")),
+            category_insight=self._text(result.get("category_insight")),
+            timeline_insight=self._text(result.get("timeline_insight")),
+            priority_domain=result.get("priority_domain"),
+            priority_issue=result.get("priority_issue"),
+            action_plan=action_plan,
+            issue_actions={},  # Can be populated from action_plan
+            timeline_annotations=timeline_annotations,
+            marketing_angles=self._as_str_list(result.get("marketing_angles")),
+            competitor_context=result.get("competitor_context"),
+            generated_at=self._utc_timestamp(),
+        )
+
+    @staticmethod
+    def _utc_timestamp() -> str:
+        """Return the current time as a timezone-aware ISO-8601 UTC string."""
+        return datetime.now(timezone.utc).isoformat()
+
+    @staticmethod
+    def _text(value: Any, default: str = "") -> str:
+        """Return ``value`` as a string, or ``default`` when it is missing/null."""
+        if value is None:
+            return default
+        return value if isinstance(value, str) else str(value)
+
+    @staticmethod
+    def _as_float(value: Any) -> float | None:
+        """Coerce a JSON number (or numeric string) to float; None if not numeric."""
+        if value is None or isinstance(value, bool):
+            return None
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return None
+
+    @staticmethod
+    def _as_str_list(value: Any) -> list[str]:
+        """Coerce a JSON value to a list of strings, dropping null entries."""
+        if isinstance(value, str):
+            return [value] if value else []
+        if isinstance(value, (list, tuple)):
+            return [str(item) for item in value if item is not None]
+        return []
+
+    @staticmethod
+    def _dict_items(value: Any) -> list[dict[str, Any]]:
+        """Return the dict entries of a JSON array; anything else yields ``[]``."""
+        if not isinstance(value, list):
+            return []
+        return [item for item in value if isinstance(item, dict)]
+
+    @staticmethod
+    def _choice(value: Any, allowed: tuple[str, ...], default: str) -> str:
+        """Normalize ``value`` to one of ``allowed`` (case-insensitive) or ``default``."""
+        if isinstance(value, str):
+            normalized = value.strip().lower()
+            if normalized in allowed:
+                return normalized
+        return default
+
+    def _format_sentiment(self, sentiment: list[dict]) -> str:
+        """Format sentiment data for prompt."""
+        lines = []
+        for row in sentiment:
+            valence = row.get("valence", "Unknown")
+            label = self._VALENCE_LABELS.get(valence, valence)
+            lines.append(
+                f"- {label}: {row.get('count', 0)} mentions "
+                f"({row.get('review_count', 0)} reviews)"
+            )
+        return "\n".join(lines) or "No sentiment data"
+
+    def _format_ranked(
+        self,
+        entries: list[dict],
+        *,
+        limit: int,
+        count_key: str,
+        polarity: str,
+    ) -> str:
+        """Render the first ``limit`` subcode rows as a numbered list with quotes.
+
+        Returns an empty string when there is nothing to render.
+        """
+        lines = []
+        for rank, entry in enumerate(entries[:limit], 1):
+            subcode = entry.get("subcode", "")
+            name = entry.get("subcode_name", "")
+            domain = entry.get("domain_name", "")
+            lines.append(f"{rank}. [{subcode}] {name} ({domain})")
+            lines.append(f"   - {entry.get(count_key, 0)} {polarity} mentions")
+            # The aggregated quote array can be NULL, and may contain NULLs.
+            quotes = entry.get("example_quotes") or []
+            for quote in quotes[:self._QUOTES_PER_ENTRY]:
+                if not quote:
+                    continue
+                if len(quote) > self._MAX_QUOTE_CHARS:
+                    quote = quote[:self._MAX_QUOTE_CHARS] + "..."
+                lines.append(f'   - Example: "{quote}"')
+        return "\n".join(lines)
+
+    def _format_issues(self, issues: list[dict]) -> str:
+        """Format issues for prompt."""
+        formatted = self._format_ranked(
+            issues, limit=5, count_key="negative_count", polarity="negative"
+        )
+        return formatted or "No issues found"
+
+    def _format_strengths(self, strengths: list[dict]) -> str:
+        """Format strengths for prompt."""
+        formatted = self._format_ranked(
+            strengths, limit=3, count_key="positive_count", polarity="positive"
+        )
+        return formatted or "No strengths found"
+
+    def _format_domains(self, domains: list[dict]) -> str:
+        """Format domain distribution for prompt."""
+        lines = [
+            f"- {d.get('domain', '')} ({d.get('domain_name', '')}): "
+            f"{d.get('total_count', 0)} total "
+            f"({d.get('positive_count', 0)} positive, {d.get('negative_count', 0)} negative)"
+            for d in domains
+        ]
+        return "\n".join(lines) or "No domain data"
+
+    def _create_fallback_synthesis(self) -> Synthesis:
+        """Create a minimal synthesis when LLM fails."""
+        return Synthesis(
+            executive_narrative="Unable to generate detailed analysis. Please review the data manually.",
+            sentiment_insight="",
+            category_insight="",
+            timeline_insight="",
+            priority_domain=None,
+            priority_issue=None,
+            action_plan=[],
+            issue_actions={},
+            timeline_annotations=[],
+            marketing_angles=[],
+            competitor_context=None,
+            generated_at=self._utc_timestamp(),
+        )
+
+    async def _store_synthesis(self, execution_id: str, synthesis: Synthesis) -> None:
+        """Store synthesis in database.
+
+        The synthesis is serialized with one JSON key per dataclass field
+        (nested action items and annotations included) and written to the
+        ``synthesis`` column of the matching ``pipeline.executions`` row.
+        """
+        payload = json.dumps(asdict(synthesis))
+        status = await self.pool.execute("""
+            UPDATE pipeline.executions
+            SET
+                synthesis = $2,
+                updated_at = NOW()
+            WHERE execution_id = $1
+        """, execution_id, payload)
+
+        # An UPDATE that matches no row succeeds silently; surface it.
+        if isinstance(status, str) and status.strip().upper() == "UPDATE 0":
+            logger.warning(
+                "Stage 4: no pipeline.executions row matched execution_id %s; "
+                "synthesis was not stored",
+                execution_id,
+            )