feat(reviewiq): Add AI synthesis support to dashboard components

Frontend:
- Add Synthesis type with action plan, insights, annotations
- ExecutiveSummary: Accept synthesis prop for AI narrative
- SentimentPie: Accept insight prop for contextual explanation
- IntensityHeatmap: Accept insight + highlightDomain props
- TimelineChart: Accept insight + annotations props
- All components gracefully degrade when synthesis is null

Backend:
- Add Stage 4: Synthesize for generating AI narratives
- Gathers context from classified spans
- Generates executive narrative, section insights, action plan
- Produces timeline annotations and marketing angles
- Stores synthesis in pipeline.executions table

Components show AI insights with purple gradient styling when available,
fall back to existing behavior when synthesis is not yet generated.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-29 02:59:47 +00:00
parent 8f9dd136cd
commit c8ecb4b98f
21 changed files with 3959 additions and 90 deletions

View File

@@ -0,0 +1,477 @@
"""
Stage 4: Synthesize - Generate AI narratives and action plans.
This stage runs after classification and routing to produce:
- Executive narrative (business-specific story)
- Section insights (sentiment, category, timeline)
- Action plan with prioritized recommendations
- Timeline annotations for key events
- Marketing angles from strengths
"""
from __future__ import annotations

import json
import logging
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    import asyncpg

    from reviewiq_pipeline.services.llm_client import LLMClientBase
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class ActionItem:
    """One recommended action in the synthesis action plan.

    Attributes:
        id: Identifier of the action (e.g. ``"action_1"``).
        title: Short title of the action.
        why: Root cause, as derived from the reviews.
        what: Specific steps to take.
        who: Department or role responsible.
        impact: Expected outcome.
        evidence: Supporting quotes.
        estimated_rating_lift: Estimated rating improvement, or ``None``
            when no estimate is available.
        complexity: One of ``'quick'``, ``'medium'`` or ``'complex'``.
        priority: One of ``'critical'``, ``'high'``, ``'medium'`` or ``'low'``.
        timeline: Time frame for the action (e.g. ``"This month"``).
        related_subcode: Subcode the action relates to (e.g. ``"V1.03"``).
    """

    # --- identity and narrative -------------------------------------------
    id: str
    title: str
    why: str
    what: str
    who: str
    impact: str

    # --- supporting data ---------------------------------------------------
    evidence: list[str]
    estimated_rating_lift: float | None

    # --- planning metadata -------------------------------------------------
    complexity: str  # 'quick' | 'medium' | 'complex'
    priority: str  # 'critical' | 'high' | 'medium' | 'low'
    timeline: str
    related_subcode: str
@dataclass
class TimelineAnnotation:
    """Marker for a key event on the review timeline.

    Attributes:
        date: Date of the event as a string (e.g. ``"2024-01-15"``).
        label: Short label for the marker.
        description: What happened.
        type: One of ``'positive'``, ``'negative'``, ``'neutral'`` or ``'event'``.
    """

    date: str
    label: str
    description: str
    type: str  # 'positive' | 'negative' | 'neutral' | 'event'
@dataclass
class Synthesis:
    """Full result produced by Stage 4 (synthesis).

    Attributes:
        executive_narrative: Narrative describing the business situation.
        sentiment_insight: Explanation of the sentiment distribution.
        category_insight: Explanation of the pattern across categories.
        timeline_insight: Explanation of trends over time.
        priority_domain: Domain code needing most attention, or ``None``.
        priority_issue: Subcode that should be fixed first, or ``None``.
        action_plan: Recommended actions.
        issue_actions: String-to-string mapping associated with issues.
        timeline_annotations: Key events to mark on the timeline.
        marketing_angles: Ways to promote the identified strengths.
        competitor_context: Comparison with industry/competitors, or ``None``.
        generated_at: Timestamp string recording when the synthesis was built.
    """

    # --- narrative and per-section insights --------------------------------
    executive_narrative: str
    sentiment_insight: str
    category_insight: str
    timeline_insight: str

    # --- priorities --------------------------------------------------------
    priority_domain: str | None
    priority_issue: str | None

    # --- recommendations and annotations -----------------------------------
    action_plan: list[ActionItem]
    issue_actions: dict[str, str]
    timeline_annotations: list[TimelineAnnotation]
    marketing_angles: list[str]
    competitor_context: str | None

    # --- bookkeeping -------------------------------------------------------
    generated_at: str
# System prompt sent to the LLM for Stage 4. It describes the inputs the model
# receives and the exact JSON shape the response must have; the field names
# below are the keys read back when the response is parsed.
#
# The two closing instructions exist because:
# - the response is fed straight to a JSON parser, so any markdown fence or
#   commentary around the object makes the whole synthesis unusable;
# - the inputs listed under "You will receive" contain no dated information,
#   so timeline output must not be fabricated.
SYNTHESIS_SYSTEM_PROMPT = """You are an expert business analyst specializing in customer experience and review analysis.
Your task is to analyze classified review data and generate actionable business insights.
You will receive:
1. Summary statistics (total reviews, rating, sentiment distribution)
2. Top issues by category with example quotes
3. Top strengths with example quotes
4. Domain breakdown (what customers talk about most)
Generate a JSON response with these fields:
{
"executive_narrative": "2-3 paragraph story explaining the business situation, key problems, and path forward. Be specific with numbers and examples.",
"sentiment_insight": "1-2 sentences explaining WHY sentiment is distributed this way. Connect to specific issues.",
"category_insight": "1-2 sentences about the pattern in categories. Which domain needs most attention and why?",
"timeline_insight": "1-2 sentences about trends if data shows changes over time.",
"priority_domain": "Single letter code (P/V/J/O/A/E/R) for the domain needing most attention, or null",
"priority_issue": "The subcode (e.g., 'V1.03') that should be fixed first, or null",
"action_plan": [
{
"id": "action_1",
"title": "Clear action title",
"why": "Root cause from the reviews",
"what": "Specific steps to take",
"who": "Department or role responsible",
"impact": "Expected outcome",
"evidence": ["Quote 1", "Quote 2"],
"estimated_rating_lift": 0.3,
"complexity": "quick|medium|complex",
"priority": "critical|high|medium|low",
"timeline": "This week|This month|This quarter",
"related_subcode": "V1.03"
}
],
"timeline_annotations": [
{
"date": "2024-01-15",
"label": "Short label",
"description": "What happened",
"type": "positive|negative|neutral|event"
}
],
"marketing_angles": [
"Way to promote strength 1",
"Way to promote strength 2"
],
"competitor_context": "How this compares to industry/competitors, or null if unknown"
}
Be specific, actionable, and business-focused. Use actual numbers and quotes from the data.
Prioritize actions by impact and feasibility.
Respond with the JSON object only: no markdown code fences and no text before or after it.
Never invent dates. If the data you receive contains no dated information, return an empty string for "timeline_insight" and an empty list for "timeline_annotations".
"""
class SynthesisStage:
    """
    Stage 4: Generate AI synthesis from classified review data.

    This stage:
    1. Aggregates classification results
    2. Identifies patterns and priorities
    3. Generates narrative insights via LLM
    4. Produces actionable recommendations
    """

    # Display labels for the valence codes found on spans; unknown codes are
    # shown verbatim.
    _VALENCE_LABELS = {"V+": "Positive", "V-": "Negative", "V0": "Neutral", "": "Mixed"}

    def __init__(self, pool: asyncpg.Pool, llm_client: LLMClientBase):
        """
        Args:
            pool: Database pool used for all reads and for storing the result.
            llm_client: Client whose ``generate`` coroutine produces the
                synthesis text.
        """
        self.pool = pool
        self.llm_client = llm_client

    async def run(self, job_id: str, execution_id: str) -> Synthesis:
        """
        Generate synthesis for a completed pipeline execution.

        Args:
            job_id: The scraping job ID
            execution_id: The pipeline execution ID

        Returns:
            Synthesis object with all generated insights. When the LLM call or
            the parsing of its response fails, this is the minimal fallback
            synthesis (which is still stored).
        """
        logger.info("Stage 4: Generating synthesis for job %s", job_id)

        context = await self._gather_context(job_id)
        synthesis = await self._generate_synthesis(context)
        await self._store_synthesis(execution_id, synthesis)

        logger.info(
            "Stage 4: Synthesis complete - %d actions generated",
            len(synthesis.action_plan),
        )
        return synthesis

    async def _gather_context(self, job_id: str) -> dict[str, Any]:
        """Gather all context needed for synthesis.

        Returns:
            Dict with keys ``business_name``, ``overview``, ``sentiment``,
            ``top_issues``, ``top_strengths`` and ``domains``; row results are
            converted to plain dicts.
        """
        # Overview stats. The span count is taken in a scalar subquery instead
        # of joining spans onto reviews: with the join, AVG(r.rating) ran over
        # one row per (review, span) pair, so every review was weighted by the
        # number of spans it produced.
        # NOTE(review): assumes spans.job_id identifies the same spans as the
        # former join on source_review_id (the queries below already filter
        # spans by job_id) - confirm.
        overview = await self.pool.fetchrow("""
            SELECT
                COUNT(DISTINCT r.review_id) as total_reviews,
                AVG(r.rating) as avg_rating,
                (
                    SELECT COUNT(*)
                    FROM pipeline.spans s
                    WHERE s.job_id = $1
                ) as total_spans
            FROM reviews r
            WHERE r.job_id = $1
        """, job_id)

        # Sentiment distribution
        sentiment = await self.pool.fetch("""
            SELECT
                valence,
                COUNT(*) as count,
                COUNT(DISTINCT source_review_id) as review_count
            FROM pipeline.spans
            WHERE job_id = $1 AND valence IS NOT NULL
            GROUP BY valence
            ORDER BY count DESC
        """, job_id)

        # Top issues (weaknesses)
        top_issues = await self.pool.fetch("""
            SELECT
                s.urt_primary as subcode,
                sc.name as subcode_name,
                sc.definition,
                d.code as domain,
                d.name as domain_name,
                COUNT(*) as span_count,
                COUNT(*) FILTER (WHERE s.valence = 'V-') as negative_count,
                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V-') as example_quotes
            FROM pipeline.spans s
            JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1 AND s.valence = 'V-'
            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
            ORDER BY negative_count DESC
            LIMIT 10
        """, job_id)

        # Top strengths
        top_strengths = await self.pool.fetch("""
            SELECT
                s.urt_primary as subcode,
                sc.name as subcode_name,
                sc.definition,
                d.code as domain,
                d.name as domain_name,
                COUNT(*) as span_count,
                COUNT(*) FILTER (WHERE s.valence = 'V+') as positive_count,
                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V+') as example_quotes
            FROM pipeline.spans s
            JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1 AND s.valence = 'V+'
            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
            ORDER BY positive_count DESC
            LIMIT 5
        """, job_id)

        # Domain distribution
        domains = await self.pool.fetch("""
            SELECT
                SUBSTRING(urt_primary, 1, 1) as domain,
                d.name as domain_name,
                COUNT(*) as total_count,
                COUNT(*) FILTER (WHERE valence = 'V+') as positive_count,
                COUNT(*) FILTER (WHERE valence = 'V-') as negative_count
            FROM pipeline.spans s
            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1
            GROUP BY SUBSTRING(urt_primary, 1, 1), d.name
            ORDER BY total_count DESC
        """, job_id)

        # Business name, if any review of the job carries one
        business = await self.pool.fetchrow("""
            SELECT DISTINCT business_name
            FROM reviews
            WHERE job_id = $1 AND business_name IS NOT NULL
            LIMIT 1
        """, job_id)

        return {
            "business_name": business["business_name"] if business else "This business",
            "overview": dict(overview) if overview else {},
            "sentiment": [dict(r) for r in sentiment],
            "top_issues": [dict(r) for r in top_issues],
            "top_strengths": [dict(r) for r in top_strengths],
            "domains": [dict(r) for r in domains],
        }

    async def _generate_synthesis(self, context: dict[str, Any]) -> Synthesis:
        """Generate synthesis using LLM.

        Any failure of the LLM call, of parsing its response, or of converting
        it is logged and answered with the fallback synthesis; nothing is
        raised to the caller for those cases.
        """
        user_prompt = self._build_user_prompt(context)

        try:
            response = await self.llm_client.generate(
                system_prompt=SYNTHESIS_SYSTEM_PROMPT,
                user_prompt=user_prompt,
                temperature=0.7,  # Allow some creativity
                max_tokens=4000,
            )
            result = self._parse_llm_json(response)
            return self._build_synthesis(result)
        except json.JSONDecodeError as e:
            logger.error("Failed to parse LLM response: %s", e)
            return self._create_fallback_synthesis()
        except Exception as e:
            # Deliberate best-effort boundary: the stage must still yield a
            # synthesis. logger.exception keeps the traceback for diagnosis.
            logger.exception("Synthesis generation failed: %s", e)
            return self._create_fallback_synthesis()

    def _build_user_prompt(self, context: dict[str, Any]) -> str:
        """Render the gathered context as the user prompt sent to the LLM."""
        overview = context["overview"]
        sections = [
            f"Analyze this review data for {context['business_name']}:",
            "## Overview",
            f"- Total Reviews: {overview.get('total_reviews', 0)}",
            f"- Average Rating: {self._format_rating(overview.get('avg_rating'))}",
            f"- Total Insights Extracted: {overview.get('total_spans', 0)}",
            "## Sentiment Distribution",
            self._format_sentiment(context["sentiment"]),
            "## Top Issues (Problems)",
            self._format_issues(context["top_issues"]),
            "## Top Strengths",
            self._format_strengths(context["top_strengths"]),
            "## Domain Breakdown",
            self._format_domains(context["domains"]),
            "Generate a complete synthesis with actionable insights.",
        ]
        return "\n".join(sections) + "\n"

    @staticmethod
    def _format_rating(avg_rating: Any) -> str:
        """Format the average rating, or ``"N/A"`` when there is none.

        The aggregate row always contains the ``avg_rating`` key, so a job
        without ratings yields ``None`` rather than a missing key; a
        ``dict.get`` default alone would print ``"None"``.
        """
        if avg_rating is None:
            return "N/A"
        try:
            return f"{avg_rating:.2f}"
        except (TypeError, ValueError):
            # Not a number-like value; show it as-is rather than fail.
            return str(avg_rating)

    @staticmethod
    def _parse_llm_json(response: Any) -> dict[str, Any]:
        """Parse the LLM response into a dict.

        The response is parsed as-is first. If that fails and the response is
        a string, the text between the first ``{`` and the last ``}`` is
        parsed instead, which recovers objects wrapped in markdown fences or
        surrounded by commentary.

        Raises:
            json.JSONDecodeError: If no JSON could be parsed.
            ValueError: If the parsed JSON is not an object.
        """
        try:
            parsed = json.loads(response)
        except json.JSONDecodeError:
            if not isinstance(response, str):
                raise
            start = response.find("{")
            end = response.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(response[start:end + 1])

        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object from the LLM, got {type(parsed).__name__}"
            )
        return parsed

    def _build_synthesis(self, result: dict[str, Any]) -> Synthesis:
        """Convert the parsed LLM response into a Synthesis object.

        Missing or null fields fall back to empty values, and entries of
        ``action_plan`` / ``timeline_annotations`` that are not JSON objects
        are skipped, so one malformed element does not discard the rest.
        """
        raw_actions = [
            a for a in (result.get("action_plan") or []) if isinstance(a, dict)
        ]
        raw_annotations = [
            t for t in (result.get("timeline_annotations") or []) if isinstance(t, dict)
        ]

        action_plan = [
            ActionItem(
                id=a.get("id", f"action_{i}"),
                title=a.get("title", ""),
                why=a.get("why", ""),
                what=a.get("what", ""),
                who=a.get("who", ""),
                impact=a.get("impact", ""),
                evidence=a.get("evidence") or [],
                estimated_rating_lift=a.get("estimated_rating_lift"),
                complexity=a.get("complexity", "medium"),
                priority=a.get("priority", "medium"),
                timeline=a.get("timeline", "This month"),
                related_subcode=a.get("related_subcode", ""),
            )
            for i, a in enumerate(raw_actions)
        ]
        timeline_annotations = [
            TimelineAnnotation(
                date=t.get("date", ""),
                label=t.get("label", ""),
                description=t.get("description", ""),
                type=t.get("type", "neutral"),
            )
            for t in raw_annotations
        ]

        return Synthesis(
            executive_narrative=result.get("executive_narrative") or "",
            sentiment_insight=result.get("sentiment_insight") or "",
            category_insight=result.get("category_insight") or "",
            timeline_insight=result.get("timeline_insight") or "",
            priority_domain=result.get("priority_domain"),
            priority_issue=result.get("priority_issue"),
            action_plan=action_plan,
            issue_actions={},  # Can be populated from action_plan
            timeline_annotations=timeline_annotations,
            marketing_angles=result.get("marketing_angles") or [],
            competitor_context=result.get("competitor_context"),
            generated_at=self._utc_timestamp(),
        )

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a timezone-aware ISO-8601 string."""
        # datetime.utcnow() is deprecated and returns a naive value; an aware
        # timestamp carries its "+00:00" offset explicitly.
        return datetime.now(timezone.utc).isoformat()

    def _format_sentiment(self, sentiment: list[dict]) -> str:
        """Format sentiment data for prompt."""
        lines = []
        for s in sentiment:
            valence = s.get("valence", "Unknown")
            if valence is None:
                valence = "Unknown"
            count = s.get("count", 0)
            reviews = s.get("review_count", 0)
            label = self._VALENCE_LABELS.get(valence, valence)
            lines.append(f"- {label}: {count} mentions ({reviews} reviews)")
        return "\n".join(lines) or "No sentiment data"

    @staticmethod
    def _format_ranked(
        items: list[dict],
        *,
        limit: int,
        count_key: str,
        polarity: str,
        empty_text: str,
    ) -> str:
        """Format a ranked list of subcodes with up to two example quotes each.

        Args:
            items: Rows with ``subcode``, ``subcode_name``, ``domain_name``,
                the count under ``count_key`` and ``example_quotes``.
            limit: Maximum number of rows to include.
            count_key: Key holding the mention count for a row.
            polarity: Word inserted before "mentions".
            empty_text: Text returned when nothing was formatted.
        """
        lines = []
        for rank, item in enumerate(items[:limit], 1):
            subcode = item.get("subcode", "")
            name = item.get("subcode_name", "")
            domain = item.get("domain_name", "")
            count = item.get(count_key, 0)
            lines.append(f"{rank}. [{subcode}] {name} ({domain})")
            lines.append(f" - {count} {polarity} mentions")
            # The quotes value may be None (NULL aggregate) and may contain
            # None/empty entries; both are tolerated.
            for quote in (item.get("example_quotes") or [])[:2]:
                if not quote:
                    continue
                shown = f"{quote[:100]}..." if len(quote) > 100 else quote
                lines.append(f' - Example: "{shown}"')
        return "\n".join(lines) or empty_text

    def _format_issues(self, issues: list[dict]) -> str:
        """Format issues for prompt (top 5, by negative mentions)."""
        return self._format_ranked(
            issues,
            limit=5,
            count_key="negative_count",
            polarity="negative",
            empty_text="No issues found",
        )

    def _format_strengths(self, strengths: list[dict]) -> str:
        """Format strengths for prompt (top 3, by positive mentions)."""
        return self._format_ranked(
            strengths,
            limit=3,
            count_key="positive_count",
            polarity="positive",
            empty_text="No strengths found",
        )

    def _format_domains(self, domains: list[dict]) -> str:
        """Format domain distribution for prompt."""
        lines = []
        for d in domains:
            domain = d.get("domain", "")
            name = d.get("domain_name", "")
            total = d.get("total_count", 0)
            positive = d.get("positive_count", 0)
            negative = d.get("negative_count", 0)
            lines.append(
                f"- {domain} ({name}): {total} total ({positive} positive, {negative} negative)"
            )
        return "\n".join(lines) or "No domain data"

    def _create_fallback_synthesis(self) -> Synthesis:
        """Create a minimal synthesis when LLM fails."""
        return Synthesis(
            executive_narrative="Unable to generate detailed analysis. Please review the data manually.",
            sentiment_insight="",
            category_insight="",
            timeline_insight="",
            priority_domain=None,
            priority_issue=None,
            action_plan=[],
            issue_actions={},
            timeline_annotations=[],
            marketing_angles=[],
            competitor_context=None,
            generated_at=self._utc_timestamp(),
        )

    async def _store_synthesis(self, execution_id: str, synthesis: Synthesis) -> None:
        """Store synthesis in database.

        The synthesis is serialized to a JSON string and written to the
        ``synthesis`` column of the matching ``pipeline.executions`` row. A
        warning is logged when no row matched.
        """
        # asdict() recurses into the nested ActionItem / TimelineAnnotation
        # dataclasses and yields the fields in declaration order.
        payload = json.dumps(asdict(synthesis))

        status = await self.pool.execute("""
            UPDATE pipeline.executions
            SET
                synthesis = $2,
                updated_at = NOW()
            WHERE execution_id = $1
        """, execution_id, payload)

        # The status string has the form "UPDATE <row count>"; zero rows
        # means the synthesis was silently dropped.
        if status == "UPDATE 0":
            logger.warning(
                "Stage 4: No pipeline.executions row matched execution %s; synthesis not stored",
                execution_id,
            )