feat(reviewiq): Add AI synthesis support to dashboard components
Frontend:
- Add Synthesis type with action plan, insights, annotations
- ExecutiveSummary: Accept synthesis prop for AI narrative
- SentimentPie: Accept insight prop for contextual explanation
- IntensityHeatmap: Accept insight + highlightDomain props
- TimelineChart: Accept insight + annotations props
- All components gracefully degrade when synthesis is null

Backend:
- Add Stage 4: Synthesize for generating AI narratives
- Gathers context from classified spans
- Generates executive narrative, section insights, action plan
- Produces timeline annotations and marketing angles
- Stores synthesis in pipeline.executions table

Components show AI insights with purple gradient styling when available, and fall back to existing behavior when synthesis is not yet generated.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,477 @@
|
||||
"""
|
||||
Stage 4: Synthesize - Generate AI narratives and action plans.
|
||||
|
||||
This stage runs after classification and routing to produce:
|
||||
- Executive narrative (business-specific story)
|
||||
- Section insights (sentiment, category, timeline)
|
||||
- Action plan with prioritized recommendations
|
||||
- Timeline annotations for key events
|
||||
- Marketing angles from strengths
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import asyncpg
|
||||
|
||||
from reviewiq_pipeline.services.llm_client import LLMClientBase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ActionItem:
    """One recommended action inside the synthesis action plan.

    The fields mirror the per-action JSON object requested from the LLM in
    ``SYNTHESIS_SYSTEM_PROMPT``; their order is the positional constructor
    order.

    Attributes:
        id: Identifier of the action (the prompt's example is ``"action_1"``).
        title: Short action title.
        why: Root cause taken from the reviews.
        what: Specific steps to take.
        who: Department or role responsible.
        impact: Expected outcome.
        evidence: Supporting quotes.
        estimated_rating_lift: Estimated rating improvement, or ``None``.
        complexity: One of ``'quick'``, ``'medium'`` or ``'complex'``.
        priority: One of ``'critical'``, ``'high'``, ``'medium'`` or ``'low'``.
        timeline: Time frame, e.g. ``"This week"`` or ``"This month"``.
        related_subcode: Subcode the action relates to, e.g. ``"V1.03"``.
    """

    id: str
    title: str
    why: str
    what: str
    who: str
    impact: str
    evidence: list[str]
    estimated_rating_lift: float | None
    complexity: str
    priority: str
    timeline: str
    related_subcode: str
|
||||
|
||||
|
||||
@dataclass
class TimelineAnnotation:
    """Marker for a key event shown on the review timeline.

    Attributes:
        date: Date of the event as a string (the prompt's example is
            ``"2024-01-15"``).
        label: Short label for the marker.
        description: What happened.
        type: One of ``'positive'``, ``'negative'``, ``'neutral'`` or
            ``'event'``.
    """

    date: str
    label: str
    description: str
    type: str
|
||||
|
||||
|
||||
@dataclass
class Synthesis:
    """Everything Stage 4 produces for one pipeline execution.

    Attributes:
        executive_narrative: Narrative explaining the business situation.
        sentiment_insight: Explanation of the sentiment distribution.
        category_insight: Explanation of the pattern across categories.
        timeline_insight: Explanation of trends over time.
        priority_domain: Code of the domain needing most attention, or
            ``None``.
        priority_issue: Subcode that should be fixed first, or ``None``.
        action_plan: Recommended actions.
        issue_actions: String-to-string mapping of actions per issue.
        timeline_annotations: Key events to mark on the timeline.
        marketing_angles: Ways to promote the business's strengths.
        competitor_context: Comparison with industry/competitors, or ``None``.
        generated_at: Timestamp string recording when the synthesis was
            created.
    """

    executive_narrative: str
    sentiment_insight: str
    category_insight: str
    timeline_insight: str
    priority_domain: str | None
    priority_issue: str | None
    action_plan: list[ActionItem]
    issue_actions: dict[str, str]
    timeline_annotations: list[TimelineAnnotation]
    marketing_angles: list[str]
    competitor_context: str | None
    generated_at: str
|
||||
|
||||
|
||||
# System prompt for the Stage 4 LLM call. It describes the input sections and
# the JSON object the model must return. The reply is parsed as JSON, so the
# closing rules ask for the bare object only, and they forbid invented dates
# because the request built in this file contains no dated information.
SYNTHESIS_SYSTEM_PROMPT = """You are an expert business analyst specializing in customer experience and review analysis.

Your task is to analyze classified review data and generate actionable business insights.

You will receive:
1. Summary statistics (total reviews, rating, sentiment distribution)
2. Top issues by category with example quotes
3. Top strengths with example quotes
4. Domain breakdown (what customers talk about most)

Generate a JSON response with these fields:

{
"executive_narrative": "2-3 paragraph story explaining the business situation, key problems, and path forward. Be specific with numbers and examples.",

"sentiment_insight": "1-2 sentences explaining WHY sentiment is distributed this way. Connect to specific issues.",

"category_insight": "1-2 sentences about the pattern in categories. Which domain needs most attention and why?",

"timeline_insight": "1-2 sentences about trends if data shows changes over time.",

"priority_domain": "Single letter code (P/V/J/O/A/E/R) for the domain needing most attention, or null",

"priority_issue": "The subcode (e.g., 'V1.03') that should be fixed first, or null",

"action_plan": [
{
"id": "action_1",
"title": "Clear action title",
"why": "Root cause from the reviews",
"what": "Specific steps to take",
"who": "Department or role responsible",
"impact": "Expected outcome",
"evidence": ["Quote 1", "Quote 2"],
"estimated_rating_lift": 0.3,
"complexity": "quick|medium|complex",
"priority": "critical|high|medium|low",
"timeline": "This week|This month|This quarter",
"related_subcode": "V1.03"
}
],

"timeline_annotations": [
{
"date": "2024-01-15",
"label": "Short label",
"description": "What happened",
"type": "positive|negative|neutral|event"
}
],

"marketing_angles": [
"Way to promote strength 1",
"Way to promote strength 2"
],

"competitor_context": "How this compares to industry/competitors, or null if unknown"
}

Be specific, actionable, and business-focused. Use actual numbers and quotes from the data.
Prioritize actions by impact and feasibility.

Output rules:
- Respond with the JSON object only: no markdown code fences and no text before or after it.
- Only add timeline_annotations for dates that appear in the data you were given. If the data contains no dates, return an empty list for timeline_annotations and do not invent trends in timeline_insight.
"""
|
||||
|
||||
|
||||
class SynthesisStage:
    """
    Stage 4: Generate AI synthesis from classified review data.

    This stage:
    1. Aggregates classification results
    2. Identifies patterns and priorities
    3. Generates narrative insights via LLM
    4. Produces actionable recommendations

    Failures while calling the LLM or interpreting its reply are logged and
    replaced by a minimal fallback synthesis instead of being raised.
    Database errors are not caught here.
    """

    # Display names for the valence codes found on spans; unknown codes are
    # shown unchanged.
    _VALENCE_LABELS = {"V+": "Positive", "V-": "Negative", "V0": "Neutral", "V±": "Mixed"}

    def __init__(self, pool: asyncpg.Pool, llm_client: LLMClientBase):
        """Keep the database pool and the LLM client used by the stage."""
        self.pool = pool
        self.llm_client = llm_client

    async def run(self, job_id: str, execution_id: str) -> Synthesis:
        """
        Generate synthesis for a completed pipeline execution.

        Args:
            job_id: The scraping job ID
            execution_id: The pipeline execution ID

        Returns:
            Synthesis object with all generated insights
        """
        logger.info("Stage 4: Generating synthesis for job %s", job_id)

        # Gather all the data we need
        context = await self._gather_context(job_id)

        # Generate synthesis via LLM (falls back to a stub on failure)
        synthesis = await self._generate_synthesis(context)

        # Store synthesis in database
        await self._store_synthesis(execution_id, synthesis)

        logger.info(
            "Stage 4: Synthesis complete - %d actions generated",
            len(synthesis.action_plan),
        )
        return synthesis

    async def _gather_context(self, job_id: str) -> dict[str, Any]:
        """Gather all context needed for synthesis.

        Returns:
            A dict with the keys ``business_name``, ``overview``,
            ``sentiment``, ``top_issues``, ``top_strengths`` and ``domains``.
            Each query row is converted to a plain dict.
        """
        # Overview stats. Spans are counted in a scalar subquery rather than
        # joined onto reviews: joining repeats each review once per span,
        # which would weight AVG(rating) by the number of spans per review.
        overview = await self.pool.fetchrow("""
            SELECT
                COUNT(DISTINCT r.review_id) AS total_reviews,
                AVG(r.rating) AS avg_rating,
                (
                    SELECT COUNT(s.span_id)
                    FROM pipeline.spans s
                    WHERE s.source_review_id IN (
                        SELECT review_id FROM reviews WHERE job_id = $1
                    )
                ) AS total_spans
            FROM reviews r
            WHERE r.job_id = $1
        """, job_id)

        # Sentiment distribution
        sentiment = await self.pool.fetch("""
            SELECT
                valence,
                COUNT(*) as count,
                COUNT(DISTINCT source_review_id) as review_count
            FROM pipeline.spans
            WHERE job_id = $1 AND valence IS NOT NULL
            GROUP BY valence
            ORDER BY count DESC
        """, job_id)

        # Top issues (weaknesses), with quotes ordered by intensity
        top_issues = await self.pool.fetch("""
            SELECT
                s.urt_primary as subcode,
                sc.name as subcode_name,
                sc.definition,
                d.code as domain,
                d.name as domain_name,
                COUNT(*) as span_count,
                COUNT(*) FILTER (WHERE s.valence = 'V-') as negative_count,
                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V-') as example_quotes
            FROM pipeline.spans s
            JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1 AND s.valence = 'V-'
            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
            ORDER BY negative_count DESC
            LIMIT 10
        """, job_id)

        # Top strengths, with quotes ordered by intensity
        top_strengths = await self.pool.fetch("""
            SELECT
                s.urt_primary as subcode,
                sc.name as subcode_name,
                sc.definition,
                d.code as domain,
                d.name as domain_name,
                COUNT(*) as span_count,
                COUNT(*) FILTER (WHERE s.valence = 'V+') as positive_count,
                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V+') as example_quotes
            FROM pipeline.spans s
            JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1 AND s.valence = 'V+'
            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
            ORDER BY positive_count DESC
            LIMIT 5
        """, job_id)

        # Domain distribution (the domain is the first character of the subcode)
        domains = await self.pool.fetch("""
            SELECT
                SUBSTRING(urt_primary, 1, 1) as domain,
                d.name as domain_name,
                COUNT(*) as total_count,
                COUNT(*) FILTER (WHERE valence = 'V+') as positive_count,
                COUNT(*) FILTER (WHERE valence = 'V-') as negative_count
            FROM pipeline.spans s
            JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1
            GROUP BY SUBSTRING(urt_primary, 1, 1), d.name
            ORDER BY total_count DESC
        """, job_id)

        # Business name if available
        business = await self.pool.fetchrow("""
            SELECT DISTINCT business_name
            FROM reviews
            WHERE job_id = $1 AND business_name IS NOT NULL
            LIMIT 1
        """, job_id)

        return {
            "business_name": business["business_name"] if business else "This business",
            "overview": dict(overview) if overview else {},
            "sentiment": [dict(r) for r in sentiment],
            "top_issues": [dict(r) for r in top_issues],
            "top_strengths": [dict(r) for r in top_strengths],
            "domains": [dict(r) for r in domains],
        }

    async def _generate_synthesis(self, context: dict[str, Any]) -> Synthesis:
        """Generate synthesis using LLM.

        Args:
            context: The dict produced by ``_gather_context``.

        Returns:
            The parsed synthesis, or the fallback synthesis when the LLM call
            fails or its reply cannot be interpreted.
        """
        user_prompt = self._build_user_prompt(context)

        try:
            response = await self.llm_client.generate(
                system_prompt=SYNTHESIS_SYSTEM_PROMPT,
                user_prompt=user_prompt,
                temperature=0.7,  # Allow some creativity
                max_tokens=4000,
            )

            try:
                result = self._parse_llm_json(response)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                logger.exception("Failed to parse LLM response")
                return self._create_fallback_synthesis()

            return self._build_synthesis(result)
        except Exception:
            # Best-effort stage: keep the traceback in the log and degrade to
            # the fallback rather than failing the pipeline.
            logger.exception("Synthesis generation failed")
            return self._create_fallback_synthesis()

    def _build_user_prompt(self, context: dict[str, Any]) -> str:
        """Render the gathered context as the user prompt sent to the LLM."""
        overview = context["overview"]
        sections = [
            f"Analyze this review data for {context['business_name']}:",
            "",
            "## Overview",
            f"- Total Reviews: {overview.get('total_reviews', 0)}",
            f"- Average Rating: {self._format_rating(overview.get('avg_rating'))}",
            f"- Total Insights Extracted: {overview.get('total_spans', 0)}",
            "",
            "## Sentiment Distribution",
            self._format_sentiment(context["sentiment"]),
            "",
            "## Top Issues (Problems)",
            self._format_issues(context["top_issues"]),
            "",
            "## Top Strengths",
            self._format_strengths(context["top_strengths"]),
            "",
            "## Domain Breakdown",
            self._format_domains(context["domains"]),
            "",
            "Generate a complete synthesis with actionable insights.",
            "",
        ]
        return "\n".join(sections)

    @staticmethod
    def _format_rating(value: Any) -> str:
        """Format an average rating to two decimals; ``None`` becomes ``N/A``."""
        if value is None:
            # AVG() over zero rows yields NULL, so the key exists but is None.
            return "N/A"
        try:
            return f"{float(value):.2f}"
        except (TypeError, ValueError):
            return str(value)

    @staticmethod
    def _parse_llm_json(response: Any) -> dict[str, Any]:
        """Decode the LLM reply into a JSON object.

        Accepts a bare JSON document, or one wrapped in markdown fences or
        surrounding prose (the outermost ``{...}`` span is then decoded).

        Raises:
            ValueError: If no JSON object can be decoded from the reply
                (``json.JSONDecodeError`` is a subclass).
        """
        if isinstance(response, (bytes, bytearray)):
            response = response.decode("utf-8")
        text = response.strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("LLM response is not a JSON object")
        return parsed

    @staticmethod
    def _dict_items(value: Any) -> list[dict]:
        """Return the dict elements of *value* if it is a list, else ``[]``."""
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    def _build_synthesis(self, result: dict[str, Any]) -> Synthesis:
        """Convert the decoded LLM reply into a ``Synthesis``.

        Missing or ``null`` fields get the same defaults as absent keys, so a
        partially filled reply still yields a usable synthesis.
        """
        action_plan = [
            ActionItem(
                # 1-based default, matching the "action_1" example in the prompt.
                id=a.get("id") or f"action_{position}",
                title=a.get("title") or "",
                why=a.get("why") or "",
                what=a.get("what") or "",
                who=a.get("who") or "",
                impact=a.get("impact") or "",
                evidence=a.get("evidence") or [],
                estimated_rating_lift=a.get("estimated_rating_lift"),
                complexity=a.get("complexity") or "medium",
                priority=a.get("priority") or "medium",
                timeline=a.get("timeline") or "This month",
                related_subcode=a.get("related_subcode") or "",
            )
            for position, a in enumerate(self._dict_items(result.get("action_plan")), 1)
        ]
        timeline_annotations = [
            TimelineAnnotation(
                date=t.get("date") or "",
                label=t.get("label") or "",
                description=t.get("description") or "",
                type=t.get("type") or "neutral",
            )
            for t in self._dict_items(result.get("timeline_annotations"))
        ]
        return Synthesis(
            executive_narrative=result.get("executive_narrative") or "",
            sentiment_insight=result.get("sentiment_insight") or "",
            category_insight=result.get("category_insight") or "",
            timeline_insight=result.get("timeline_insight") or "",
            priority_domain=result.get("priority_domain"),
            priority_issue=result.get("priority_issue"),
            action_plan=action_plan,
            issue_actions={},  # Can be populated from action_plan
            timeline_annotations=timeline_annotations,
            marketing_angles=result.get("marketing_angles") or [],
            competitor_context=result.get("competitor_context"),
            generated_at=datetime.utcnow().isoformat(),
        )

    def _format_sentiment(self, sentiment: list[dict]) -> str:
        """Format sentiment data for prompt."""
        lines = []
        for row in sentiment:
            valence = row.get("valence", "Unknown")
            label = self._VALENCE_LABELS.get(valence, valence)
            lines.append(
                f"- {label}: {row.get('count', 0)} mentions ({row.get('review_count', 0)} reviews)"
            )
        return "\n".join(lines) or "No sentiment data"

    def _format_ranked(
        self,
        rows: list[dict],
        *,
        limit: int,
        count_key: str,
        count_label: str,
    ) -> str:
        """Render up to *limit* subcode rows as a numbered list with quotes.

        Each entry shows the subcode, its name and domain, the mention count
        read from *count_key*, and up to two example quotes. Empty quotes are
        skipped before the two are chosen, whitespace inside a quote is
        collapsed so it stays on one prompt line, and quotes longer than 100
        characters are truncated with an ellipsis.

        Returns:
            The rendered lines joined by newlines, or ``""`` for no rows.
        """
        lines = []
        for rank, row in enumerate(rows[:limit], 1):
            subcode = row.get("subcode", "")
            name = row.get("subcode_name", "")
            domain = row.get("domain_name", "")
            lines.append(f"{rank}. [{subcode}] {name} ({domain})")
            lines.append(f" - {row.get(count_key, 0)} {count_label} mentions")

            # example_quotes may be None (NULL array) or contain NULL entries.
            cleaned = (" ".join(str(q).split()) for q in (row.get("example_quotes") or []) if q)
            quotes = [q for q in cleaned if q][:2]
            for quote in quotes:
                shown = f"{quote[:100]}..." if len(quote) > 100 else quote
                lines.append(f' - Example: "{shown}"')
        return "\n".join(lines)

    def _format_issues(self, issues: list[dict]) -> str:
        """Format the top five issues for prompt."""
        rendered = self._format_ranked(
            issues, limit=5, count_key="negative_count", count_label="negative"
        )
        return rendered or "No issues found"

    def _format_strengths(self, strengths: list[dict]) -> str:
        """Format the top three strengths for prompt."""
        rendered = self._format_ranked(
            strengths, limit=3, count_key="positive_count", count_label="positive"
        )
        return rendered or "No strengths found"

    def _format_domains(self, domains: list[dict]) -> str:
        """Format domain distribution for prompt."""
        lines = [
            f"- {d.get('domain', '')} ({d.get('domain_name', '')}): "
            f"{d.get('total_count', 0)} total "
            f"({d.get('positive_count', 0)} positive, {d.get('negative_count', 0)} negative)"
            for d in domains
        ]
        return "\n".join(lines) or "No domain data"

    def _create_fallback_synthesis(self) -> Synthesis:
        """Create a minimal synthesis when LLM fails."""
        return Synthesis(
            executive_narrative="Unable to generate detailed analysis. Please review the data manually.",
            sentiment_insight="",
            category_insight="",
            timeline_insight="",
            priority_domain=None,
            priority_issue=None,
            action_plan=[],
            issue_actions={},
            timeline_annotations=[],
            marketing_angles=[],
            competitor_context=None,
            generated_at=datetime.utcnow().isoformat(),
        )

    async def _store_synthesis(self, execution_id: str, synthesis: Synthesis) -> None:
        """Store synthesis in database.

        The synthesis is serialized to a JSON string and written to
        ``pipeline.executions.synthesis`` for the given execution. A warning
        is logged when the update matches no row.
        """
        # Local import: the module only imports `dataclass` and `field`.
        from dataclasses import asdict

        # asdict() recurses into the nested ActionItem / TimelineAnnotation
        # lists and keeps field order, so the stored keys follow the dataclass
        # definitions.
        payload = json.dumps(asdict(synthesis))

        status = await self.pool.execute("""
            UPDATE pipeline.executions
            SET
                synthesis = $2,
                updated_at = NOW()
            WHERE execution_id = $1
        """, execution_id, payload)

        # NOTE(review): asyncpg returns the command status string (e.g.
        # "UPDATE 1"); confirm the pool in use behaves the same.
        if status == "UPDATE 0":
            logger.warning(
                "Stage 4: No execution row found for %s; synthesis was not stored",
                execution_id,
            )
|
||||
Reference in New Issue
Block a user