feat(pipeline): Add Stage 5 Synthesis for AI-generated narratives

- Add Stage5Synthesizer class that generates AI narratives and action plans
- Add generate() method to LLMClient for synthesis generation
- Integrate Stage 5 into pipeline runner after route stage
- Add synthesis JSONB column to pipeline.executions table
- Update reviewiq_analytics API to return synthesis data
- Synthesis includes: executive narrative, sentiment/category/timeline insights,
  action plan, marketing angles, and priority recommendations

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-29 03:12:53 +00:00
parent c8ecb4b98f
commit 9b667e69a7
5 changed files with 3129 additions and 67 deletions

View File

@@ -7,9 +7,11 @@ the BasePipeline interface for the extensible pipeline system.
from __future__ import annotations
import json
import logging
import re
import time
from datetime import date
from datetime import date, datetime, timedelta
from typing import TYPE_CHECKING, Any
from pipeline_core import (
@@ -51,6 +53,8 @@ from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
from reviewiq_pipeline.stages.stage3_route import Stage3Router
from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
from reviewiq_pipeline.stages.stage5_synthesize import Stage5Synthesizer
from reviewiq_pipeline.services.llm_client import LLMClient
from reviewiq_pipeline.validation.validators import (
validate_stage1_output,
validate_stage2_output,
@@ -64,9 +68,65 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
# Stage name to number mapping
STAGE_NAMES = ["normalize", "classify", "route", "aggregate"]
STAGE_NAME_TO_NUM = {"normalize": 1, "classify": 2, "route": 3, "aggregate": 4}
STAGE_NUM_TO_NAME = {1: "normalize", 2: "classify", 3: "route", 4: "aggregate"}
# Ordered stage names; a stage's number is its 1-based position in this list.
STAGE_NAMES = ["normalize", "classify", "route", "aggregate", "synthesize"]
# Both lookup tables are derived from STAGE_NAMES so the three cannot drift apart.
STAGE_NAME_TO_NUM = {stage: number for number, stage in enumerate(STAGE_NAMES, start=1)}
STAGE_NUM_TO_NAME = {number: stage for stage, number in STAGE_NAME_TO_NUM.items()}
def _parse_relative_date(date_str: str | None, default_to_now: bool = True) -> datetime | None:
    """Parse an ISO or relative ("10 months ago") date value into a datetime.

    Args:
        date_str: An ISO-8601 string, a relative phrase such as "2 weeks ago",
            "a day ago" or "Edited 3 months ago", an already-parsed datetime
            (returned unchanged), or None/empty.
        default_to_now: If True, return the current datetime when the value is
            missing or cannot be parsed; if False, return None instead.

    Returns:
        The parsed datetime, or the fallback described above.

    Note:
        Relative results and the fallback come from ``datetime.now()`` and are
        therefore naive, while an ISO string carrying an offset (or a trailing
        "Z") yields an aware datetime. Months and years are approximated as
        30 and 365 days.
    """
    now = datetime.now()
    fallback = now if default_to_now else None

    if not date_str:
        return fallback

    # Upstream data may already hold a parsed timestamp; pass it through
    # instead of failing on datetime.replace('Z', ...).
    if isinstance(date_str, datetime):
        return date_str

    if isinstance(date_str, str):
        # Try to parse as ISO date first ('Z' is not accepted by older
        # fromisoformat implementations, so spell it as an explicit offset).
        try:
            return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except ValueError:
            pass

        unit_spans = {
            'second': timedelta(seconds=1),
            'minute': timedelta(minutes=1),
            'hour': timedelta(hours=1),
            'day': timedelta(days=1),
            'week': timedelta(weeks=1),
            'month': timedelta(days=30),   # approximation
            'year': timedelta(days=365),   # approximation
        }

        # Handle "a/an" as 1, then extract "<number> <unit>(s) ago".
        phrase = re.sub(r'\b(a|an)\s+', '1 ', date_str.lower().strip())
        # search (not match) so prefixed phrases like "Edited 2 weeks ago" work;
        # anything that matched at the start before still matches identically.
        found = re.search(r'(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago', phrase)
        if found:
            try:
                return now - int(found.group(1)) * unit_spans[found.group(2)]
            except OverflowError:
                # Absurdly large amounts cannot be represented; treat as unparseable.
                pass

    # If we can't parse it, return now or None. Log the value as received so
    # the message can be matched against the raw data.
    logger.warning(f"Could not parse relative date: {date_str}")
    return fallback
class PipelineResult:
@@ -228,8 +288,11 @@ class ReviewIQPipeline(BasePipeline):
stages_run: list[str] = []
stage_results: dict[str, StageResult] = {}
# Convert input to ScraperOutput if needed
scraper_output = self._ensure_scraper_output(input_data)
# Convert input to ScraperOutput if needed (may fetch from DB)
scraper_output = await self._ensure_scraper_output(input_data)
# Extract job_id for linking issues to pipeline executions
job_id = scraper_output.get("job_id")
# Track intermediate results for stage dependencies
stage1_result: Stage1Output | None = None
@@ -270,6 +333,20 @@ class ReviewIQPipeline(BasePipeline):
)
# Stage 2: Classify
# If classify is requested but we don't have stage1_result, try to fetch from DB
if "classify" in stages and not stage1_result and job_id:
logger.info("No stage1_result, fetching existing normalized reviews from database")
stage1_result = await self._fetch_normalized_reviews_from_db(job_id)
if stage1_result:
logger.info(f"Loaded {len(stage1_result.get('reviews_normalized', []))} reviews from DB for reclassification")
# Clean up old spans and issues before reclassification
if self._span_repo:
deactivated = await self._span_repo.deactivate_spans_for_job(job_id)
logger.info(f"Deactivated {deactivated} existing spans for job {job_id}")
if self._issue_repo:
deleted = await self._issue_repo.delete_issues_for_job(job_id)
logger.info(f"Deleted {deleted} existing issues for job {job_id}")
if "classify" in stages and stage1_result:
start = time.time()
logger.info("Running Stage 2: Classification")
@@ -308,7 +385,7 @@ class ReviewIQPipeline(BasePipeline):
logger.info("Running Stage 3: Issue Routing")
try:
stage3_result = await self._run_route(stage2_result)
stage3_result = await self._run_route(stage2_result, job_id=job_id)
duration_ms = int((time.time() - start) * 1000)
stages_run.append("route")
stage_results["route"] = StageResult(
@@ -371,6 +448,43 @@ class ReviewIQPipeline(BasePipeline):
error=f"aggregate failed: {e}",
)
# Stage 5: Synthesize (AI-generated narratives)
# Requires job_id and execution_id from pipeline execution tracking
if "synthesize" in stages and job_id:
start = time.time()
logger.info("Running Stage 5: Synthesis")
try:
# Get the execution_id for this pipeline run
execution_id = input_data.get("execution_id")
if execution_id:
stage5_result = await self._run_synthesize(job_id, execution_id)
duration_ms = int((time.time() - start) * 1000)
stages_run.append("synthesize")
stage_results["synthesize"] = StageResult(
stage="synthesize",
success=True,
data={
"actions_generated": len(stage5_result.action_plan) if stage5_result else 0,
"has_narrative": bool(stage5_result and stage5_result.executive_narrative),
},
error=None,
duration_ms=duration_ms,
)
else:
logger.warning("No execution_id provided, skipping synthesis")
except Exception as e:
logger.exception("Stage 5 failed")
stage_results["synthesize"] = StageResult(
stage="synthesize",
success=False,
data={},
error=str(e),
duration_ms=int((time.time() - start) * 1000),
)
# Synthesis failure is non-fatal - pipeline still succeeds
logger.warning(f"Synthesis failed but continuing: {e}")
return BasePipelineResult(
pipeline_id="reviewiq",
stages_run=stages_run,
@@ -558,6 +672,34 @@ class ReviewIQPipeline(BasePipeline):
],
collapsed=False,
),
DashboardSection(
id="classified_reviews",
title="Classified Reviews",
description="All reviews with URT classification codes and human-readable meanings",
widgets=[
WidgetConfig(
id="classified_reviews_table",
type="table",
title="Reviews with URT Codes",
grid={"x": 0, "y": 0, "w": 12, "h": 3},
config={
"columns": [
{"key": "span_text", "header": "Review Excerpt", "width": 300},
{"key": "urt_code", "header": "Code", "width": 80},
{"key": "code_name", "header": "Category", "width": 150},
{"key": "domain_name", "header": "Domain", "width": 100},
{"key": "valence", "header": "Sentiment", "width": 80},
{"key": "intensity", "header": "Intensity", "width": 80},
{"key": "rating", "header": "Stars", "width": 60, "align": "center"},
],
"row_key": "span_id",
"page_size": 15,
"sortable": True,
},
),
],
collapsed=False,
),
],
default_time_range="30d",
refresh_interval=300,
@@ -573,7 +715,7 @@ class ReviewIQPipeline(BasePipeline):
Args:
widget_id: Widget identifier
params: Query parameters (business_id, time_range, etc.)
params: Query parameters (business_id, job_id, time_range, etc.)
Returns:
Widget data dictionary
@@ -581,36 +723,41 @@ class ReviewIQPipeline(BasePipeline):
await self.initialize()
business_id = params.get("business_id")
job_id = params.get("job_id")
time_range = params.get("time_range", "30d")
match widget_id:
# Overview stats
case "total_reviews":
return await self._get_review_count(business_id)
return await self._get_review_count(business_id, job_id)
case "reviews_processed":
return await self._get_processed_count(business_id, time_range)
return await self._get_processed_count(business_id, job_id, time_range)
case "issues_found":
return await self._get_issues_count(business_id)
return await self._get_issues_count(business_id, job_id)
case "avg_rating":
return await self._get_avg_rating(business_id, time_range)
return await self._get_avg_rating(business_id, job_id, time_range)
# Sentiment
case "sentiment_distribution":
return await self._get_sentiment_distribution(business_id)
return await self._get_sentiment_distribution(business_id, job_id)
case "sentiment_trend":
return await self._get_sentiment_trend(business_id, time_range)
return await self._get_sentiment_trend(business_id, job_id, time_range)
# Classification
case "urt_distribution":
return await self._get_urt_distribution(business_id)
return await self._get_urt_distribution(business_id, job_id)
case "intensity_heatmap":
return await self._get_intensity_heatmap(business_id)
return await self._get_intensity_heatmap(business_id, job_id)
# Issues
case "issues_table":
return await self._get_issues_table(business_id, params)
return await self._get_issues_table(business_id, job_id, params)
case "issues_by_domain":
return await self._get_issues_by_domain(business_id)
return await self._get_issues_by_domain(business_id, job_id)
# Classified Reviews
case "classified_reviews_table":
return await self._get_classified_reviews(business_id, job_id, params)
case _:
logger.warning(f"Unknown widget: {widget_id}")
@@ -643,6 +790,9 @@ class ReviewIQPipeline(BasePipeline):
result = PipelineResult()
validation_results: dict[str, ValidationResult] = {}
# Extract job_id for linking issues
job_id = scraper_output.get("job_id")
# Stage 1: Normalize
if 1 in stages:
logger.info("Running Stage 1: Normalization")
@@ -668,7 +818,7 @@ class ReviewIQPipeline(BasePipeline):
# Stage 3: Route
if 3 in stages and result.stage2:
logger.info("Running Stage 3: Issue Routing")
result.stage3 = await self._run_route(result.stage2)
result.stage3 = await self._run_route(result.stage2, job_id=job_id)
if validate:
validation_results["stage3"] = await validate_stage3_output(
@@ -700,10 +850,10 @@ class ReviewIQPipeline(BasePipeline):
await self.initialize()
return await self._run_classify(stage1_output)
async def route(self, stage2_output: Stage2Output) -> Stage3Output:
async def route(self, stage2_output: Stage2Output, job_id: str | None = None) -> Stage3Output:
"""Run Stage 3: Issue Routing (legacy method)."""
await self.initialize()
return await self._run_route(stage2_output)
return await self._run_route(stage2_output, job_id=job_id)
async def aggregate(
self,
@@ -719,14 +869,91 @@ class ReviewIQPipeline(BasePipeline):
# Internal Stage Implementations
# =========================================================================
def _ensure_scraper_output(self, input_data: dict[str, Any]) -> ScraperOutput:
"""Ensure input data is in ScraperOutput format."""
async def _ensure_scraper_output(self, input_data: dict[str, Any]) -> ScraperOutput:
"""Ensure input data is in ScraperOutput format.
If only job_id is provided, fetches job data from the database.
"""
# If it has all required fields, use as-is
required = ["job_id", "business_id", "place_id", "reviews"]
if all(k in input_data for k in required):
return input_data # type: ignore
# Otherwise, wrap it
# If we have a job_id but missing reviews, fetch from database
job_id = input_data.get("job_id")
if job_id and not input_data.get("reviews") and self._db:
logger.info(f"Fetching job data from database for job_id: {job_id}")
async with self._db.pool.acquire() as conn:
row = await conn.fetchrow(
"""
SELECT job_id, status, reviews_data, reviews_count,
metadata->>'business_name' as business_name,
metadata->>'place_id' as place_id,
metadata->>'address' as address,
metadata->>'category' as category,
metadata->>'total_reviews' as total_reviews,
metadata->>'average_rating' as average_rating,
scraper_version
FROM public.jobs
WHERE job_id = $1::uuid
""",
str(job_id),
)
if row and row["reviews_data"]:
reviews_data = row["reviews_data"]
# asyncpg may return JSONB as a string - parse it if needed
if isinstance(reviews_data, str):
logger.info("Parsing reviews_data JSON string")
reviews_data = json.loads(reviews_data)
# Convert reviews_data to RawReview format
# Handle both API format (review_id, author, rating) and scraper format (reviewId, name, stars)
reviews = []
for i, review in enumerate(reviews_data):
if isinstance(review, str):
# Skip if review is somehow a string
logger.warning(f"Skipping review {i}: got string instead of dict")
continue
# Parse the review time (may be relative like "10 months ago")
raw_time = review.get("timestamp") or review.get("publishedAtDate") or ""
parsed_time = _parse_relative_date(raw_time)
reviews.append({
"review_id": review.get("review_id") or review.get("reviewId") or f"review_{i}",
"author_name": review.get("author") or review.get("name") or "Anonymous",
"author_id": review.get("reviewerId"),
"rating": review.get("rating") or review.get("stars") or 0,
"text": review.get("text"),
"review_time": parsed_time,
"response_text": review.get("responseFromOwner", {}).get("text") if review.get("responseFromOwner") else None,
"response_time": review.get("responseFromOwner", {}).get("publishedAtDate") if review.get("responseFromOwner") else None,
"photos": review.get("reviewImageUrls"),
"raw_payload": review,
})
logger.info(f"Loaded {len(reviews)} reviews from job {job_id}")
return ScraperOutput(
job_id=str(row["job_id"]),
status=row["status"] or "completed",
business_id=row["business_name"] or "unknown",
place_id=row["place_id"] or "unknown",
business_info={
"name": row["business_name"] or "",
"address": row["address"] or "",
"category": row["category"] or "",
"total_reviews": int(row["total_reviews"]) if row["total_reviews"] else 0,
"average_rating": float(row["average_rating"]) if row["average_rating"] else 0.0,
},
reviews=reviews,
scrape_time_ms=0,
reviews_scraped=len(reviews),
scraper_version=row["scraper_version"] or "unknown",
)
else:
logger.warning(f"No reviews found in database for job_id: {job_id}")
# Otherwise, wrap it with empty/default values
return ScraperOutput(
job_id=input_data.get("job_id", "unknown"),
status=input_data.get("status", "completed"),
@@ -739,6 +966,70 @@ class ReviewIQPipeline(BasePipeline):
scraper_version=input_data.get("scraper_version", "unknown"),
)
async def _fetch_normalized_reviews_from_db(self, job_id: str) -> Stage1Output | None:
    """Rebuild a Stage 1 output from rows already stored for ``job_id``.

    Lets the classify stage run on its own (without normalize) by reading the
    latest version of each review from ``pipeline.reviews_enriched``.

    Args:
        job_id: Job whose stored reviews should be loaded.

    Returns:
        A Stage1Output wrapping the stored reviews, or None when there is no
        database handle or no matching rows.
    """
    if not self._db:
        return None

    async with self._db.pool.acquire() as conn:
        records = await conn.fetch(
            """
            SELECT
            source,
            review_id,
            review_version,
            business_id,
            place_id,
            text,
            text_normalized,
            rating,
            review_time
            FROM pipeline.reviews_enriched
            WHERE job_id = $1::uuid
            AND is_latest = TRUE
            ORDER BY review_time DESC
            """,
            job_id,
        )

    if not records:
        logger.warning(f"No normalized reviews found in DB for job_id: {job_id}")
        return None

    # One NormalizedReview per row, keeping the query's newest-first order.
    normalized = []
    for record in records:
        normalized.append(
            NormalizedReview(
                source=record["source"],
                review_id=record["review_id"],
                review_version=record["review_version"],
                business_id=record["business_id"],
                place_id=record["place_id"],
                text=record["text"],
                text_normalized=record["text_normalized"],
                rating=record["rating"],
                review_time=record["review_time"],
            )
        )

    loaded = len(normalized)
    logger.info(f"Fetched {loaded} normalized reviews from DB for job {job_id}")

    # Nothing was skipped or de-duplicated here; "from_db" marks the origin.
    stats = {
        "total_input": loaded,
        "processed": loaded,
        "skipped": 0,
        "duplicates": 0,
        "from_db": True,
    }
    return Stage1Output(
        job_id=job_id,
        reviews_normalized=normalized,
        reviews_skipped=[],
        duplicates_found=[],
        stats=stats,
    )
async def _run_normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
"""Run normalization stage."""
stage1 = Stage1Normalizer(
@@ -788,6 +1079,7 @@ class ReviewIQPipeline(BasePipeline):
taxonomy_version=self._config.taxonomy_version,
profile=self._config.classification_profile,
max_spans_per_review=self._config.max_spans_per_review,
job_id=stage1_output.get("job_id"),
),
)
@@ -796,7 +1088,7 @@ class ReviewIQPipeline(BasePipeline):
finally:
await stage2.close()
async def _run_route(self, stage2_output: Stage2Output) -> Stage3Output:
async def _run_route(self, stage2_output: Stage2Output, job_id: str | None = None) -> Stage3Output:
"""Run issue routing stage."""
stage3 = Stage3Router(
self._config,
@@ -806,9 +1098,12 @@ class ReviewIQPipeline(BasePipeline):
)
spans_to_route = []
now = datetime.now()
for review in stage2_output["reviews_classified"]:
for span in review.get("spans", []):
if span["valence"] in ("V-", ""):
# Use current datetime as fallback for missing review_time
review_time = review.get("review_time") or now
spans_to_route.append(
SpanToRoute(
span_id=span["span_id"],
@@ -818,13 +1113,13 @@ class ReviewIQPipeline(BasePipeline):
valence=span["valence"],
intensity=span["intensity"],
entity_normalized=span.get("entity_normalized"),
review_time=review.get("review_time", ""),
review_time=review_time,
confidence=span.get("confidence", "medium"),
trust_score=review.get("trust_score", 0.5),
)
)
return await stage3.process(Stage3Input(spans=spans_to_route))
return await stage3.process(Stage3Input(spans=spans_to_route, job_id=job_id))
async def _run_aggregate(
self,
@@ -848,17 +1143,39 @@ class ReviewIQPipeline(BasePipeline):
return await stage4.process(input_data)
async def _run_synthesize(self, job_id: str, execution_id: str):
    """Run the AI synthesis stage to generate narratives and action plans.

    Args:
        job_id: Job whose results are synthesized.
        execution_id: Pipeline execution the synthesis is produced for.

    Returns:
        The value returned by ``Stage5Synthesizer.run``.

    Raises:
        RuntimeError: If no database handle is configured. The stage needs the
            connection pool, so this fails before an LLM client is created.
    """
    if not self._db:
        raise RuntimeError("Stage 5 synthesis requires a database connection")

    # Create LLM client for synthesis; always closed, even when the run fails.
    llm_client = LLMClient.create(self._config)
    try:
        synthesizer = Stage5Synthesizer(
            pool=self._db.pool,
            llm_client=llm_client,
        )
        return await synthesizer.run(job_id, execution_id)
    finally:
        await llm_client.close()
# =========================================================================
# Widget Data Methods
# =========================================================================
async def _get_review_count(self, business_id: str | None) -> dict[str, Any]:
async def _get_review_count(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get total review count."""
if not self._db:
return {"total_reviews": 0}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
count = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
job_id,
)
elif business_id:
count = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.reviews_raw WHERE business_id = $1",
business_id,
@@ -871,7 +1188,7 @@ class ReviewIQPipeline(BasePipeline):
return {"total_reviews": count or 0}
async def _get_processed_count(
self, business_id: str | None, time_range: str
self, business_id: str | None, job_id: str | None, time_range: str
) -> dict[str, Any]:
"""Get processed review count with trend."""
if not self._db:
@@ -881,7 +1198,14 @@ class ReviewIQPipeline(BasePipeline):
days = self._parse_time_range(time_range)
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
# When filtering by job_id, just return count for that job
current = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
job_id,
)
return {"reviews_processed": current or 0, "processed_change": 0}
elif business_id:
current = await conn.fetchval(
"""
SELECT COUNT(*) FROM pipeline.reviews_enriched
@@ -929,13 +1253,21 @@ class ReviewIQPipeline(BasePipeline):
"processed_change": round(change, 1),
}
async def _get_issues_count(self, business_id: str | None) -> dict[str, Any]:
async def _get_issues_count(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get open issues count."""
if not self._db:
return {"issues_count": 0}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
count = await conn.fetchval(
"""
SELECT COUNT(*) FROM pipeline.issues
WHERE job_id = $1::uuid AND state = 'open'
""",
job_id,
)
elif business_id:
count = await conn.fetchval(
"""
SELECT COUNT(*) FROM pipeline.issues
@@ -951,7 +1283,7 @@ class ReviewIQPipeline(BasePipeline):
return {"issues_count": count or 0}
async def _get_avg_rating(
self, business_id: str | None, time_range: str
self, business_id: str | None, job_id: str | None, time_range: str
) -> dict[str, Any]:
"""Get average rating with trend."""
if not self._db:
@@ -960,7 +1292,13 @@ class ReviewIQPipeline(BasePipeline):
days = self._parse_time_range(time_range)
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
current = await conn.fetchval(
"SELECT AVG(rating) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
job_id,
)
return {"avg_rating": round(float(current), 2) if current else 0, "rating_change": 0}
elif business_id:
current = await conn.fetchval(
"""
SELECT AVG(rating) FROM pipeline.reviews_enriched
@@ -1009,14 +1347,26 @@ class ReviewIQPipeline(BasePipeline):
}
async def _get_sentiment_distribution(
self, business_id: str | None
self, business_id: str | None, job_id: str | None = None
) -> dict[str, Any]:
"""Get sentiment distribution for pie chart."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
valence,
COUNT(*) as count
FROM pipeline.review_spans
WHERE job_id = $1::uuid AND is_active = TRUE
GROUP BY valence
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1059,7 +1409,7 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_sentiment_trend(
self, business_id: str | None, time_range: str
self, business_id: str | None, job_id: str | None, time_range: str
) -> dict[str, Any]:
"""Get sentiment trend over time for line chart."""
if not self._db:
@@ -1068,7 +1418,23 @@ class ReviewIQPipeline(BasePipeline):
days = self._parse_time_range(time_range)
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
DATE(review_time) as date,
COUNT(*) FILTER (WHERE valence = 'V+') as positive,
COUNT(*) FILTER (WHERE valence = 'V-') as negative,
COUNT(*) FILTER (WHERE valence = 'V0') as neutral
FROM pipeline.review_spans
WHERE job_id = $1::uuid
AND is_active = TRUE
GROUP BY DATE(review_time)
ORDER BY date
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1115,13 +1481,26 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_urt_distribution(self, business_id: str | None) -> dict[str, Any]:
async def _get_urt_distribution(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get URT domain distribution for bar chart."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
SUBSTRING(urt_primary, 1, 1) as domain,
COUNT(*) as count
FROM pipeline.review_spans
WHERE job_id = $1::uuid AND is_active = TRUE
GROUP BY SUBSTRING(urt_primary, 1, 1)
ORDER BY count DESC
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1168,13 +1547,26 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_intensity_heatmap(self, business_id: str | None) -> dict[str, Any]:
async def _get_intensity_heatmap(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get domain x intensity heatmap data."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
SUBSTRING(urt_primary, 1, 1) as domain,
intensity,
COUNT(*) as count
FROM pipeline.review_spans
WHERE job_id = $1::uuid AND is_active = TRUE
GROUP BY SUBSTRING(urt_primary, 1, 1), intensity
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1222,7 +1614,7 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_issues_table(
self, business_id: str | None, params: dict[str, Any]
self, business_id: str | None, job_id: str | None, params: dict[str, Any]
) -> dict[str, Any]:
"""Get issues table data."""
if not self._db:
@@ -1233,7 +1625,30 @@ class ReviewIQPipeline(BasePipeline):
offset = (page - 1) * page_size
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
issue_id,
domain,
primary_subcode as subcode,
span_count,
max_intensity,
state
FROM pipeline.issues
WHERE job_id = $1::uuid
ORDER BY span_count DESC, created_at DESC
LIMIT $2 OFFSET $3
""",
job_id,
page_size,
offset,
)
total = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.issues WHERE job_id = $1::uuid",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1279,13 +1694,24 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data, "total": total or 0}
async def _get_issues_by_domain(self, business_id: str | None) -> dict[str, Any]:
async def _get_issues_by_domain(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get issues grouped by domain for pie chart."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT domain, COUNT(*) as count
FROM pipeline.issues
WHERE job_id = $1::uuid
GROUP BY domain
ORDER BY count DESC
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT domain, COUNT(*) as count
@@ -1310,6 +1736,89 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_classified_reviews(
    self, business_id: str | None, job_id: str | None, params: dict[str, Any]
) -> dict[str, Any]:
    """Get classified reviews with URT codes and human-readable names.

    Args:
        business_id: Restrict to this business (used only when job_id is not given).
        job_id: Restrict to this job; takes precedence over business_id.
        params: Query parameters; reads "page" (default 1) and "page_size"
            (default 15). Values that are not integers >= 1 fall back to the
            defaults.

    Returns:
        ``{"data": [...], "total": int}`` where each data item carries span_id,
        span_text, urt_code, code_name, code_definition, domain_name, valence,
        intensity, rating and review_time (ISO string or None).
    """
    if not self._db:
        return {"data": [], "total": 0}

    def _positive_int(value: Any, default: int) -> int:
        # params come from the caller's query string; never trust their type.
        try:
            number = int(value)
        except (TypeError, ValueError):
            return default
        return number if number >= 1 else default

    page = _positive_int(params.get("page", 1), 1)
    page_size = _positive_int(params.get("page_size", 15), 15)
    offset = (page - 1) * page_size

    # JOINs resolve the URT code to human-readable subcode/category/domain names.
    base_query = """
        SELECT
        s.span_id,
        s.span_text,
        s.urt_primary as urt_code,
        COALESCE(sub.name, cat.name, dom.name) as code_name,
        COALESCE(sub.definition, dom.description) as code_definition,
        dom.name as domain_name,
        CASE s.valence
        WHEN 'V+' THEN 'Positive'
        WHEN 'V-' THEN 'Negative'
        WHEN 'V0' THEN 'Neutral'
        WHEN '' THEN 'Mixed'
        ELSE s.valence
        END as valence,
        CASE s.intensity
        WHEN 'I1' THEN 'Mild'
        WHEN 'I2' THEN 'Moderate'
        WHEN 'I3' THEN 'Strong'
        ELSE s.intensity
        END as intensity,
        e.rating,
        s.review_time
        FROM pipeline.review_spans s
        LEFT JOIN pipeline.reviews_enriched e ON s.review_id = e.review_id AND s.review_version = e.review_version
        LEFT JOIN pipeline.urt_domains dom ON SUBSTRING(s.urt_primary, 1, 1) = dom.code
        LEFT JOIN pipeline.urt_categories cat ON SUBSTRING(s.urt_primary, 1, 2) = cat.code
        LEFT JOIN pipeline.urt_subcodes sub ON s.urt_primary = sub.code
        WHERE s.is_active = TRUE
        """
    count_query = """
        SELECT COUNT(*) FROM pipeline.review_spans s
        WHERE s.is_active = TRUE
        """

    # Optional scope filter; job_id wins over business_id.
    filter_sql = ""
    filter_args: list[Any] = []
    if job_id:
        filter_sql = " AND s.job_id = $1::uuid"
        filter_args = [job_id]
    elif business_id:
        filter_sql = " AND s.business_id = $1"
        filter_args = [business_id]

    # LIMIT/OFFSET are bound as parameters (never interpolated) so paging
    # values cannot inject SQL; only the placeholder numbers are formatted in.
    limit_pos = len(filter_args) + 1
    page_query = (
        base_query
        + filter_sql
        + f" ORDER BY s.review_time DESC LIMIT ${limit_pos} OFFSET ${limit_pos + 1}"
    )

    async with self._db._pool.acquire() as conn:
        rows = await conn.fetch(page_query, *filter_args, page_size, offset)
        total = await conn.fetchval(count_query + filter_sql, *filter_args)

    data = [
        {
            "span_id": row["span_id"],
            "span_text": row["span_text"],
            "urt_code": row["urt_code"],
            "code_name": row["code_name"] or "Unknown",
            "code_definition": row["code_definition"] or "",
            "domain_name": row["domain_name"] or "Unknown",
            "valence": row["valence"],
            "intensity": row["intensity"],
            "rating": row["rating"],
            "review_time": row["review_time"].isoformat() if row["review_time"] else None,
        }
        for row in rows
    ]

    return {"data": data, "total": total or 0}
def _parse_time_range(self, time_range: str) -> int:
"""Parse time range string to days."""
if time_range.endswith("d"):

View File

@@ -29,28 +29,205 @@ Your task is to extract semantic spans from customer reviews and classify each s
## SPAN EXTRACTION RULES
1. **Split on contrasting conjunctions**: but, however, although, despite, yet, though
2. **Split on topic/target change**: food → service → bathroom = 3 spans
3. **Split on valence change**: positive → negative = split
4. **Split on domain change**: O (Offering) → J (Journey) → E (Environment) = split
5. **Keep together**: cause→effect within same feedback unit ("X because Y" = 1 span)
**CRITICAL: Use TOPIC-BASED splitting, NOT sentence-based splitting.**
A span = all consecutive text about the SAME topic/domain, regardless of sentence count.
### When to KEEP TOGETHER (same span):
- Multiple sentences about the same topic: "The food was great. I loved the pasta. The sauce was perfect." → ONE span (all about Offering)
- Cause and effect: "The wait was long because they were understaffed" → ONE span
- Elaboration: "Staff was rude. They ignored us for 20 minutes." → ONE span (both about People)
- Single-topic reviews: Even if 5 sentences, if all about food → ONE span
### When to SPLIT (separate spans):
- Contrasting conjunctions that change topic: "Food was great BUT service was slow" → TWO spans
- Domain change: food (O) → staff (P) → ambiance (E) = split at each change
- Target change: "The waiter was nice but the manager was rude" → TWO spans (different people)
### Examples:
- "Amazing food. Best burger ever. Fries were crispy too." → 1 span (all Offering, V+)
- "Food was great but we waited an hour." → 2 spans (Offering V+, Journey V-)
- "I've been coming here for years. Always consistent quality." → 1 span (Relationship)
- "The staff are lovely and amazing with kids. More highchairs are definitely needed though." → 2 spans (People V+, Access V-)
**Guardrails**:
- Max 3 spans per sentence (if 4+, re-check for over-splitting)
- Min 1 span per review (even single-word reviews)
- Spans must be non-overlapping and cover meaningful content
- Prefer FEWER, LARGER spans over many small ones
- Most reviews should have 1-3 spans, rarely more
- Min 1 span per review
- Spans must be non-overlapping
## URT DOMAINS (Tier-3 codes: X#.##)
## URT TAXONOMY - COMPLETE (138 codes, use EXACT codes)
| Domain | Code | Description |
|--------|------|-------------|
| Offering | O1-O4 | Product/service quality, features, variety |
| Price | P1-P4 | Value, pricing, promotions, payment |
| Journey | J1-J4 | Timing, process, convenience, accessibility |
| Environment | E1-E4 | Physical space, ambiance, cleanliness, digital UX |
| Attitude | A1-A4 | Staff behavior, helpfulness, professionalism |
| Voice | V1-V4 | Brand, communication, marketing, transparency |
| Relationship | R1-R4 | Loyalty, trust, consistency, personalization |
### O - OFFERING (Product/Service Quality) - 18 codes
O1.01 Works/Doesn't Work: Basic functionality success or failure
O1.02 Performance Level: How well it operates
O1.03 Durability: Longevity and resistance to wear
O1.04 Reliability: Consistency of function over time
O1.05 Outcome Achievement: Did customer accomplish their goal?
O2.01 Materials/Inputs: Quality of components or ingredients
O2.02 Craftsmanship: Skill of construction or execution
O2.03 Presentation: Visual and aesthetic quality
O2.04 Attention to Detail: Finishing touches and refinement
O2.05 Condition at Delivery: State when received
O3.01 All Components Present: Nothing missing from what was promised
O3.02 Feature Availability: Promised features actually work
O3.03 Scope Delivery: Full scope of work completed
O3.04 Documentation: Supporting materials provided
O4.01 Specification Match: Matches what was ordered
O4.02 Personalization: Adapted to individual preferences
O4.03 Flexibility: Can be modified or adjusted
O4.04 Appropriateness: Right solution for the need
### P - PEOPLE (Staff Interactions) - 20 codes
P1.01 Warmth: Friendly and welcoming manner
P1.02 Respect: Treated with dignity
P1.03 Patience: Calm and tolerant approach
P1.04 Enthusiasm: Energy and engagement
P1.05 Empathy: Understanding feelings
P2.01 Knowledge: Expertise and understanding
P2.02 Skill: Technical ability
P2.03 Problem Solving: Ability to find solutions
P2.04 Advice Quality: Helpful recommendations
P2.05 Training Level: Staff training evident
P3.01 Attentiveness: Being present and engaged
P3.02 Initiative: Proactive help
P3.03 Follow-through: Completing promised actions
P3.04 Availability: Being available when needed
P3.05 Dedication: Commitment to helping
P4.01 Clarity: Clear communication
P4.02 Listening: Understanding customer needs
P4.03 Transparency: Honest and open
P4.04 Honesty: Truthful communication
P4.05 Proactive Updates: Keeping customer informed
### J - JOURNEY (Process & Timing) - 20 codes
J1.01 Speed: How fast things happen
J1.02 Punctuality: On-time delivery
J1.03 Queue Management: Handling of waiting customers
J1.04 Punctuality: Meeting scheduled times
J1.05 Pacing: Appropriate speed (not rushed/dragged)
J2.01 Simplicity: Easy process
J2.02 Friction: Obstacles encountered
J2.03 Navigation: Finding what you need
J2.04 Booking Availability: Slots/capacity when needed
J2.05 Inventory: Stock availability
J3.01 Consistency: Same experience every time
J3.02 Accuracy: Getting it right
J3.03 Uptime: System availability
J3.04 Data Accuracy: Correct info in systems
J3.05 Integration: Systems work together
J4.01 Problem Recognition: Acknowledging issues
J4.02 Resolution Speed: How fast problems get fixed
J4.03 Resolution Fairness: Fair handling of issues
J4.04 Escalation: Getting to right person
J4.05 Closure: Issue fully resolved
### E - ENVIRONMENT (Physical & Digital Space) - 20 codes
E1.01 Cleanliness: How clean the space is
E1.02 Comfort: Physical comfort
E1.03 Space Design: Layout and organization
E1.04 Ambiance: Atmosphere and vibe
E1.05 Comfort: Physical comfort
E2.01 Lighting: Light quality and level
E2.02 Sound/Noise: Audio environment
E2.03 Temperature: Climate control
E2.04 Visual Design: Aesthetics of interface
E2.05 Mobile Experience: Mobile usability
E3.01 Interface Design: Digital UX/UI
E3.02 App/Website Speed: Digital performance
E3.03 Usability: Ease of digital use
E3.04 Health Safety: Health precautions
E3.05 Cyber Security: Digital security
E4.01 Safety: Physical safety
E4.02 Security: Protection of belongings/data
E4.03 Health/Hygiene: Health standards
E4.04 Social Responsibility: Ethical practices
E4.05 Community Impact: Local community effect
### A - ACCESS (Availability & Accessibility) - 20 codes
A1.01 Hours: Operating hours
A1.02 Booking Availability: Appointment slots
A1.03 Inventory: Product availability
A1.04 Wayfinding: Finding destination
A1.05 Physical Accessibility: Disability accommodations
A2.01 Physical Access: Mobility accessibility
A2.02 Language Access: Language accommodation
A2.03 Digital Accessibility: Screen reader/a11y
A2.04 Language Accessibility: Multilingual support
A2.05 Hours of Operation: Service availability times
A3.01 Diversity Welcome: All backgrounds welcome
A3.02 Accommodation: Special needs accommodation
A3.03 Response Time: Speed of getting answers
A3.04 Documentation Clarity: Clear instructions
A3.05 Support Accessibility: Getting help when needed
A4.01 Location: Physical location convenience
A4.02 Parking: Parking availability
A4.03 Multiple Channels: Ways to engage
A4.04 Payment Flexibility: Multiple payment options
A4.05 Refund Accessibility: Getting money back
### V - VALUE (Pricing & Costs) - 20 codes ⚠️ USE FOR ALL PRICE/COST/FEE MENTIONS
V1.01 Price Level: Cost amount ("cheap", "expensive", "affordable", "", "$")
V1.02 Price Fairness: Fair for what you get
V1.03 Hidden Costs: Unexpected charges, surprise fees, hidden fees, extra charges
V1.04 Price Transparency: Clear pricing upfront
V1.05 Price Stability: Consistent pricing
V2.01 Clear Pricing: Easy to understand costs
V2.02 Honest Billing: Accurate charges
V2.03 Policy Clarity: Clear terms and conditions
V2.04 Quality-Price Ratio: Worth vs cost
V2.05 Competitive Value: Compared to alternatives
V3.01 Time Investment: Time required
V3.02 Hassle Factor: Difficulty and inconvenience
V3.03 Mental Load: Cognitive effort required
V3.04 Promotion Clarity: Clear offer terms
V3.05 Reward Redemption: Using points/rewards
V4.01 Value for Money: Worth what you paid
V4.02 ROI: Return on investment
V4.03 Overall Satisfaction: Happy with the exchange
V4.04 Billing Accuracy: Correct charges
V4.05 Billing Resolution: Fixing billing issues
### R - RELATIONSHIP (Trust & Loyalty) - 20 codes
R1.01 Honesty: Truthfulness
R1.02 Ethics: Ethical behavior, deceptive practices, scams
R1.03 Promises Kept: Following through on promises
R1.04 Ethics: Ethical behavior
R1.05 Accountability: Taking responsibility
R2.01 Consistency: Reliable over time
R2.02 Trustworthiness: Can be trusted
R2.03 Accountability: Takes responsibility
R2.04 Predictability: Consistent experience
R2.05 Standards: Meeting quality standards
R3.01 Error Acknowledgment: Admits mistakes
R3.02 Apology Quality: Sincere apologies
R3.03 Making It Right: Correcting mistakes
R3.04 Personal Connection: Human touch
R3.05 Going Extra Mile: Beyond expectations
R4.01 Customer Recognition: Remembers customers
R4.02 Loyalty Rewards: Rewards for loyalty
R4.03 Long-term Relationship: Builds relationships
R4.04 Service Recovery: Making things right
R4.05 Feedback Response: Acting on feedback
## CLASSIFICATION EXAMPLES (Critical Distinctions)
**PRICING/COSTS → V codes (Value), NOT P codes:**
- "Cheap prices", "good price", "€50" → V1.01 Price Level
- "Hidden charges", "surprise fees", "extra €35" → V1.03 Hidden Costs
- "Great value for money" → V4.01 Value for Money
- "Overcharged", "wrong amount" → V4.04 Billing Accuracy
**STAFF BEHAVIOR → P codes (People):**
- "Staff was friendly", "welcoming" → P1.01 Warmth
- "Rude", "disrespectful", "ignored us" → P1.02 Respect
- "Patient", "took their time" → P1.03 Patience
- "Knowledgeable", "expert" → P2.01 Knowledge
**DECEPTION/ETHICS → R codes (Relationship):**
- "They lied", "misleading" → R1.01 Honesty
- "Felt scammed", "dishonest practices" → R1.02 Ethics
- "Didn't honor the deal" → R1.03 Promises Kept
## DIMENSION CODES
@@ -159,6 +336,20 @@ class LLMClientBase(ABC):
self.config = config
self.total_tokens_used = 0
self.total_cost_usd = 0.0
self._custom_prompt: str | None = None
def set_prompt(self, prompt: str) -> None:
    """Install a custom system prompt on this client.

    Intended for prompts assembled at runtime (e.g. built dynamically
    from the database) that should be used instead of the default one.

    Args:
        prompt: System prompt text to use for classification.
    """
    # Stored on the instance; get_prompt() reads this attribute.
    self._custom_prompt = prompt
def get_prompt(self) -> str:
    """Return the active system prompt.

    Returns:
        The custom prompt when one has been set to a non-empty value,
        otherwise the module-level ``SYSTEM_PROMPT``.
    """
    custom = self._custom_prompt
    if custom:
        return custom
    # None or an empty string both fall back to the default prompt.
    return SYSTEM_PROMPT
@abstractmethod
async def classify(
@@ -178,6 +369,28 @@ class LLMClientBase(ABC):
"""
pass
@abstractmethod
async def generate(
    self,
    system_prompt: str,
    user_prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 4000,
) -> str:
    """Produce free-form text from the LLM (synthesis, narratives, etc.).

    Concrete clients must override this coroutine.

    Args:
        system_prompt: System instructions for the model.
        user_prompt: User content/context to respond to.
        temperature: Creativity level (0-1).
        max_tokens: Maximum response length.

    Returns:
        The generated text response.
    """
    ...
@abstractmethod
async def close(self) -> None:
"""Close the client and cleanup resources."""
@@ -211,7 +424,7 @@ class OpenAIClient(LLMClientBase):
start_time = time.time()
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "system", "content": self.get_prompt()},
{
"role": "user",
"content": f'Classify this review:\n\n"{review_text}"',
@@ -255,6 +468,43 @@ class OpenAIClient(LLMClientBase):
return result, metadata
async def generate(
    self,
    system_prompt: str,
    user_prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 4000,
) -> str:
    """Generate text using OpenAI chat completions.

    The request is sent with ``response_format={"type": "json_object"}``,
    so the returned text is expected to be a JSON object serialized as a
    string.

    Args:
        system_prompt: System instructions.
        user_prompt: User content/context.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum completion length forwarded to the API.

    Returns:
        The message content of the first choice.

    Raises:
        ValueError: If the response has no choices or the first choice
            has empty content.
    """
    response = await self.client.chat.completions.create(
        model=self.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_object"},
        timeout=self.config.llm_timeout_seconds,
    )

    # Record usage BEFORE validating the content: tokens reported for an
    # empty completion must still show up in the running totals.
    usage = response.usage
    if usage:
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        # Prices are per million tokens; unknown models use the fallback rates.
        pricing = self.PRICING.get(self.model, {"input": 0.15, "output": 0.60})
        cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
        self.total_tokens_used += input_tokens + output_tokens
        self.total_cost_usd += cost

    # An empty choices list is treated like empty content rather than
    # surfacing as an IndexError.
    content = response.choices[0].message.content if response.choices else None
    if not content:
        raise ValueError("Empty response from OpenAI")

    return content
async def close(self) -> None:
    """Shut down this client by awaiting the wrapped OpenAI client's ``close()``."""
    underlying = self.client
    await underlying.close()
@@ -289,7 +539,7 @@ class AnthropicClient(LLMClientBase):
response = await self.client.messages.create(
model=self.model,
max_tokens=4096,
system=SYSTEM_PROMPT,
system=self.get_prompt(),
messages=[
{
"role": "user",
@@ -329,6 +579,58 @@ class AnthropicClient(LLMClientBase):
return result, metadata
async def generate(
    self,
    system_prompt: str,
    user_prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 4000,
) -> str:
    """Generate text using Anthropic messages.

    Args:
        system_prompt: System instructions.
        user_prompt: User content/context.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum response length forwarded to the API.

    Returns:
        The text of the first content block after it has been passed
        through ``_extract_json_string`` (which unwraps markdown code
        fences or surrounding prose around a JSON object).

    Raises:
        ValueError: If the response contains no content or empty text.
    """
    response = await self.client.messages.create(
        model=self.model,
        max_tokens=max_tokens,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        temperature=temperature,
    )

    # Record usage BEFORE validating the content so tokens reported for an
    # empty response are still counted in the running totals.
    usage = response.usage
    if usage:
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens
        # Prices are per million tokens; unknown models use the fallback rates.
        pricing = self.PRICING.get(self.model, {"input": 3.0, "output": 15.0})
        cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
        self.total_tokens_used += input_tokens + output_tokens
        self.total_cost_usd += cost

    content = response.content[0].text if response.content else ""
    if not content:
        raise ValueError("Empty response from Anthropic")

    # Extract JSON from response (handles code blocks)
    return self._extract_json_string(content)
def _extract_json_string(self, content: str) -> str:
"""Extract JSON string from response, handling markdown code blocks."""
import re
content = content.strip()
# If it starts with {, return as-is
if content.startswith("{"):
return content
# Try to find JSON in code blocks
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
if json_match:
return json_match.group(1)
# Try to find JSON object
json_match = re.search(r"\{[\s\S]*\}", content)
if json_match:
return json_match.group(0)
return content
def _extract_json(self, content: str) -> dict[str, Any]:
"""Extract JSON from response, handling markdown code blocks."""
content = content.strip()

View File

@@ -0,0 +1,477 @@
"""
Stage 5: Synthesize - Generate AI narratives and action plans.
This stage runs after classification and routing to produce:
- Executive narrative (business-specific story)
- Section insights (sentiment, category, timeline)
- Action plan with prioritized recommendations
- Timeline annotations for key events
- Marketing angles from strengths
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
import asyncpg
from reviewiq_pipeline.services.llm_client import LLMClientBase
logger = logging.getLogger(__name__)
@dataclass
class ActionItem:
    """One recommendation from the synthesis ``action_plan`` list.

    Field meanings mirror the JSON shape requested from the LLM in the
    synthesis system prompt.
    """

    id: str  # identifier such as "action_1"
    title: str  # short action title
    why: str  # root cause drawn from the reviews
    what: str  # specific steps to take
    who: str  # department or role responsible
    impact: str  # expected outcome
    evidence: list[str]  # supporting review quotes
    estimated_rating_lift: float | None  # e.g. 0.3; None when not provided
    complexity: str  # 'quick' | 'medium' | 'complex'
    priority: str  # 'critical' | 'high' | 'medium' | 'low'
    timeline: str  # e.g. "This week", "This month", "This quarter"
    related_subcode: str  # subcode the action addresses, e.g. "V1.03"
@dataclass
class TimelineAnnotation:
    """A marker for a key event on the review timeline."""

    date: str  # date string, e.g. "2024-01-15"
    label: str  # short label for the marker
    description: str  # what happened
    type: str  # 'positive' | 'negative' | 'neutral' | 'event'
@dataclass
class Synthesis:
    """Full Stage 5 result: narratives, priorities, actions and annotations."""

    executive_narrative: str  # multi-paragraph business story
    sentiment_insight: str  # explanation of the sentiment distribution
    category_insight: str  # pattern observed across categories
    timeline_insight: str  # trend commentary
    priority_domain: str | None  # single-letter domain code, or None
    priority_issue: str | None  # subcode to fix first (e.g. "V1.03"), or None
    action_plan: list[ActionItem]  # prioritized recommendations
    issue_actions: dict[str, str]  # NOTE(review): key/value meaning not shown here - confirm with consumers
    timeline_annotations: list[TimelineAnnotation]  # key events for the timeline
    marketing_angles: list[str]  # ways to promote strengths
    competitor_context: str | None  # industry/competitor comparison, or None
    generated_at: str  # ISO-8601 timestamp of generation
# System prompt for Stage 5 synthesis. It instructs the model to return a
# single JSON object; Stage5Synthesizer._generate_synthesis parses that object
# with json.loads and reads the keys listed below via .get(), so key names
# here must stay in sync with that parser.
# NOTE(review): the prompt asks for "timeline_insight" and dated
# "timeline_annotations", but the user prompt assembled in
# _generate_synthesis carries no dates - confirm whether time-series context
# should be supplied, since the model has nothing to anchor dates to.
SYNTHESIS_SYSTEM_PROMPT = """You are an expert business analyst specializing in customer experience and review analysis.
Your task is to analyze classified review data and generate actionable business insights.
You will receive:
1. Summary statistics (total reviews, rating, sentiment distribution)
2. Top issues by category with example quotes
3. Top strengths with example quotes
4. Domain breakdown (what customers talk about most)
Generate a JSON response with these fields:
{
"executive_narrative": "2-3 paragraph story explaining the business situation, key problems, and path forward. Be specific with numbers and examples.",
"sentiment_insight": "1-2 sentences explaining WHY sentiment is distributed this way. Connect to specific issues.",
"category_insight": "1-2 sentences about the pattern in categories. Which domain needs most attention and why?",
"timeline_insight": "1-2 sentences about trends if data shows changes over time.",
"priority_domain": "Single letter code (P/V/J/O/A/E/R) for the domain needing most attention, or null",
"priority_issue": "The subcode (e.g., 'V1.03') that should be fixed first, or null",
"action_plan": [
{
"id": "action_1",
"title": "Clear action title",
"why": "Root cause from the reviews",
"what": "Specific steps to take",
"who": "Department or role responsible",
"impact": "Expected outcome",
"evidence": ["Quote 1", "Quote 2"],
"estimated_rating_lift": 0.3,
"complexity": "quick|medium|complex",
"priority": "critical|high|medium|low",
"timeline": "This week|This month|This quarter",
"related_subcode": "V1.03"
}
],
"timeline_annotations": [
{
"date": "2024-01-15",
"label": "Short label",
"description": "What happened",
"type": "positive|negative|neutral|event"
}
],
"marketing_angles": [
"Way to promote strength 1",
"Way to promote strength 2"
],
"competitor_context": "How this compares to industry/competitors, or null if unknown"
}
Be specific, actionable, and business-focused. Use actual numbers and quotes from the data.
Prioritize actions by impact and feasibility.
"""
class Stage5Synthesizer:
    """
    Stage 5: Generate AI synthesis from classified review data.

    This stage:
    1. Aggregates classification results
    2. Identifies patterns and priorities
    3. Generates narrative insights via LLM
    4. Produces actionable recommendations
    """

    # Limits on how much ranked material is placed in the LLM prompt.
    MAX_ISSUES_IN_PROMPT = 5
    MAX_STRENGTHS_IN_PROMPT = 3
    MAX_QUOTES_PER_ITEM = 2
    MAX_QUOTE_CHARS = 100

    # Human-readable labels for stored valence codes.
    # NOTE(review): the empty-string code is labelled "Mixed" exactly as in
    # the original mapping - confirm against the stored valence vocabulary.
    _VALENCE_LABELS = {"V+": "Positive", "V-": "Negative", "V0": "Neutral", "": "Mixed"}

    def __init__(self, pool: asyncpg.Pool, llm_client: LLMClientBase):
        """Keep the database pool and LLM client used by every step.

        Args:
            pool: Connection pool used for all queries and the final UPDATE.
            llm_client: Client whose ``generate()`` produces the synthesis JSON.
        """
        self.pool = pool
        self.llm_client = llm_client

    async def run(self, job_id: str, execution_id: str) -> Synthesis:
        """
        Generate synthesis for a completed pipeline execution.

        Args:
            job_id: The scraping job ID
            execution_id: The pipeline execution ID

        Returns:
            Synthesis object with all generated insights (a minimal
            fallback synthesis when the LLM call or parsing fails)
        """
        logger.info("Stage 5: Generating synthesis for job %s", job_id)

        # Gather all the data we need
        context = await self._gather_context(job_id)

        # Generate synthesis via LLM
        synthesis = await self._generate_synthesis(context)

        # Store synthesis in database
        await self._store_synthesis(execution_id, synthesis)

        logger.info(
            "Stage 5: Synthesis complete - %d actions generated",
            len(synthesis.action_plan),
        )
        return synthesis

    async def _gather_context(self, job_id: str) -> dict[str, Any]:
        """Query the aggregates the synthesis prompt is built from.

        Args:
            job_id: The scraping job ID (cast to ``uuid`` in SQL).

        Returns:
            Dict with keys ``business_name``, ``overview``, ``sentiment``,
            ``top_issues``, ``top_strengths`` and ``domains``; row sets are
            converted to plain dicts.
        """
        # Overview stats. Each figure comes from its own scalar subquery:
        # joining reviews to spans and averaging over the joined rows would
        # weight every review by its number of spans and skew avg_rating.
        # total_spans counts active spans of this job, matching the filters
        # used by the other queries below.
        overview = await self.pool.fetchrow(
            """
            SELECT
                (SELECT COUNT(DISTINCT review_id)
                   FROM pipeline.reviews_enriched
                  WHERE job_id = $1::uuid) AS total_reviews,
                (SELECT AVG(rating)
                   FROM pipeline.reviews_enriched
                  WHERE job_id = $1::uuid) AS avg_rating,
                (SELECT COUNT(*)
                   FROM pipeline.review_spans
                  WHERE job_id = $1::uuid AND is_active = TRUE) AS total_spans
            """,
            job_id,
        )

        # Sentiment distribution
        sentiment = await self.pool.fetch(
            """
            SELECT
                valence,
                COUNT(*) as count,
                COUNT(DISTINCT review_id) as review_count
            FROM pipeline.review_spans
            WHERE job_id = $1::uuid AND valence IS NOT NULL AND is_active = TRUE
            GROUP BY valence
            ORDER BY count DESC
            """,
            job_id,
        )

        # Top issues (weaknesses). NULLS LAST keeps spans without an
        # intensity from being sorted ahead of the strongest quotes.
        top_issues = await self.pool.fetch(
            """
            SELECT
                s.urt_primary as subcode,
                sc.name as subcode_name,
                sc.definition,
                d.code as domain,
                d.name as domain_name,
                COUNT(*) as span_count,
                COUNT(*) FILTER (WHERE s.valence = 'V-') as negative_count,
                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC NULLS LAST)
                    FILTER (WHERE s.valence = 'V-') as example_quotes
            FROM pipeline.review_spans s
            LEFT JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
            LEFT JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1::uuid AND s.valence = 'V-' AND s.is_active = TRUE
            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
            ORDER BY negative_count DESC
            LIMIT 10
            """,
            job_id,
        )

        # Top strengths
        top_strengths = await self.pool.fetch(
            """
            SELECT
                s.urt_primary as subcode,
                sc.name as subcode_name,
                sc.definition,
                d.code as domain,
                d.name as domain_name,
                COUNT(*) as span_count,
                COUNT(*) FILTER (WHERE s.valence = 'V+') as positive_count,
                ARRAY_AGG(s.span_text ORDER BY s.intensity DESC NULLS LAST)
                    FILTER (WHERE s.valence = 'V+') as example_quotes
            FROM pipeline.review_spans s
            LEFT JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
            LEFT JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1::uuid AND s.valence = 'V+' AND s.is_active = TRUE
            GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
            ORDER BY positive_count DESC
            LIMIT 5
            """,
            job_id,
        )

        # Domain distribution
        domains = await self.pool.fetch(
            """
            SELECT
                SUBSTRING(urt_primary, 1, 1) as domain,
                d.name as domain_name,
                COUNT(*) as total_count,
                COUNT(*) FILTER (WHERE valence = 'V+') as positive_count,
                COUNT(*) FILTER (WHERE valence = 'V-') as negative_count
            FROM pipeline.review_spans s
            LEFT JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
            WHERE s.job_id = $1::uuid AND s.is_active = TRUE
            GROUP BY SUBSTRING(urt_primary, 1, 1), d.name
            ORDER BY total_count DESC
            """,
            job_id,
        )

        # The business_id column is used as the display name when present.
        business = await self.pool.fetchrow(
            """
            SELECT DISTINCT business_id as business_name
            FROM pipeline.reviews_enriched
            WHERE job_id = $1::uuid AND business_id IS NOT NULL
            LIMIT 1
            """,
            job_id,
        )

        return {
            "business_name": business["business_name"] if business else "This business",
            "overview": dict(overview) if overview else {},
            "sentiment": [dict(r) for r in sentiment],
            "top_issues": [dict(r) for r in top_issues],
            "top_strengths": [dict(r) for r in top_strengths],
            "domains": [dict(r) for r in domains],
        }

    async def _generate_synthesis(self, context: dict[str, Any]) -> Synthesis:
        """Ask the LLM for a synthesis and parse it.

        Args:
            context: Output of ``_gather_context``.

        Returns:
            The parsed Synthesis, or the fallback synthesis when the LLM
            call fails or its response is not a JSON object.
        """
        user_prompt = self._build_user_prompt(context)

        try:
            response = await self.llm_client.generate(
                system_prompt=SYNTHESIS_SYSTEM_PROMPT,
                user_prompt=user_prompt,
                temperature=0.7,  # Allow some creativity
                max_tokens=4000,
            )
            result = json.loads(response)
            if not isinstance(result, dict):
                raise ValueError(
                    f"Expected a JSON object from the LLM, got {type(result).__name__}"
                )
            return self._build_synthesis(result)
        except json.JSONDecodeError as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception("Failed to parse LLM response: %s", e)
            return self._create_fallback_synthesis()
        except Exception as e:
            # Deliberate best-effort boundary: a failed synthesis must not
            # fail the pipeline, so any error degrades to the fallback.
            logger.exception("Synthesis generation failed: %s", e)
            return self._create_fallback_synthesis()

    def _build_user_prompt(self, context: dict[str, Any]) -> str:
        """Render the gathered context into the user prompt text."""
        overview = context["overview"]
        sections = [
            f"Analyze this review data for {context['business_name']}:",
            "## Overview",
            f"- Total Reviews: {overview.get('total_reviews') or 0}",
            f"- Average Rating: {self._format_rating(overview.get('avg_rating'))}",
            f"- Total Insights Extracted: {overview.get('total_spans') or 0}",
            "## Sentiment Distribution",
            self._format_sentiment(context["sentiment"]),
            "## Top Issues (Problems)",
            self._format_issues(context["top_issues"]),
            "## Top Strengths",
            self._format_strengths(context["top_strengths"]),
            "## Domain Breakdown",
            self._format_domains(context["domains"]),
            "Generate a complete synthesis with actionable insights.",
        ]
        return "\n".join(sections) + "\n"

    @staticmethod
    def _format_rating(value: Any) -> str:
        """Render an average rating with two decimals, or 'N/A' when NULL."""
        if value is None:
            return "N/A"
        return f"{float(value):.2f}"

    def _build_synthesis(self, result: dict[str, Any]) -> Synthesis:
        """Convert the decoded LLM JSON object into a Synthesis."""
        # `or []` also covers an explicit JSON null; non-dict entries are
        # skipped so one malformed item does not discard the whole result.
        raw_actions = result.get("action_plan") or []
        raw_annotations = result.get("timeline_annotations") or []

        action_plan = [
            ActionItem(
                id=a.get("id", f"action_{i}"),
                title=a.get("title", ""),
                why=a.get("why", ""),
                what=a.get("what", ""),
                who=a.get("who", ""),
                impact=a.get("impact", ""),
                evidence=a.get("evidence") or [],
                estimated_rating_lift=a.get("estimated_rating_lift"),
                complexity=a.get("complexity", "medium"),
                priority=a.get("priority", "medium"),
                timeline=a.get("timeline", "This month"),
                related_subcode=a.get("related_subcode", ""),
            )
            for i, a in enumerate(raw_actions)
            if isinstance(a, dict)
        ]
        timeline_annotations = [
            TimelineAnnotation(
                date=t.get("date", ""),
                label=t.get("label", ""),
                description=t.get("description", ""),
                type=t.get("type", "neutral"),
            )
            for t in raw_annotations
            if isinstance(t, dict)
        ]

        return Synthesis(
            executive_narrative=result.get("executive_narrative", ""),
            sentiment_insight=result.get("sentiment_insight", ""),
            category_insight=result.get("category_insight", ""),
            timeline_insight=result.get("timeline_insight", ""),
            priority_domain=result.get("priority_domain"),
            priority_issue=result.get("priority_issue"),
            action_plan=action_plan,
            issue_actions={},  # Can be populated from action_plan
            timeline_annotations=timeline_annotations,
            marketing_angles=result.get("marketing_angles") or [],
            competitor_context=result.get("competitor_context"),
            generated_at=datetime.utcnow().isoformat(),
        )

    def _format_sentiment(self, sentiment: list[dict]) -> str:
        """Format sentiment rows as one bullet per valence.

        Returns "No sentiment data" for an empty list.
        """
        lines = []
        for row in sentiment:
            valence = row.get("valence", "Unknown")
            label = self._VALENCE_LABELS.get(valence, valence) or "Unknown"
            count = row.get("count") or 0
            reviews = row.get("review_count") or 0
            lines.append(f"- {label}: {count} mentions ({reviews} reviews)")
        return "\n".join(lines) or "No sentiment data"

    def _format_ranked(
        self,
        items: list[dict],
        limit: int,
        count_key: str,
        count_label: str,
    ) -> str:
        """Format ranked subcode rows with counts and truncated example quotes.

        Names come from LEFT JOINs and may be NULL, and the quote array
        may be NULL as well; both are handled instead of printing "None"
        or raising.
        """
        lines = []
        for rank, item in enumerate(items[:limit], 1):
            subcode = item.get("subcode") or ""
            name = item.get("subcode_name") or "Unknown"
            domain = item.get("domain_name") or "Unknown"
            count = item.get(count_key) or 0
            lines.append(f"{rank}. [{subcode}] {name} ({domain})")
            lines.append(f" - {count} {count_label} mentions")
            quotes = (item.get("example_quotes") or [])[: self.MAX_QUOTES_PER_ITEM]
            for quote in quotes:
                if not quote:
                    continue
                if len(quote) > self.MAX_QUOTE_CHARS:
                    quote = f"{quote[:self.MAX_QUOTE_CHARS]}..."
                lines.append(f' - Example: "{quote}"')
        return "\n".join(lines)

    def _format_issues(self, issues: list[dict]) -> str:
        """Format the top issues for the prompt ("No issues found" if empty)."""
        formatted = self._format_ranked(
            issues, self.MAX_ISSUES_IN_PROMPT, "negative_count", "negative"
        )
        return formatted or "No issues found"

    def _format_strengths(self, strengths: list[dict]) -> str:
        """Format the top strengths for the prompt ("No strengths found" if empty)."""
        formatted = self._format_ranked(
            strengths, self.MAX_STRENGTHS_IN_PROMPT, "positive_count", "positive"
        )
        return formatted or "No strengths found"

    def _format_domains(self, domains: list[dict]) -> str:
        """Format the per-domain totals for the prompt ("No domain data" if empty)."""
        lines = []
        for row in domains:
            # Both values can be NULL when a span has no recognised code.
            domain = row.get("domain") or "Unknown"
            name = row.get("domain_name") or "Unknown"
            total = row.get("total_count") or 0
            positive = row.get("positive_count") or 0
            negative = row.get("negative_count") or 0
            lines.append(
                f"- {domain} ({name}): {total} total ({positive} positive, {negative} negative)"
            )
        return "\n".join(lines) or "No domain data"

    def _create_fallback_synthesis(self) -> Synthesis:
        """Create a minimal synthesis when LLM fails."""
        return Synthesis(
            executive_narrative="Unable to generate detailed analysis. Please review the data manually.",
            sentiment_insight="",
            category_insight="",
            timeline_insight="",
            priority_domain=None,
            priority_issue=None,
            action_plan=[],
            issue_actions={},
            timeline_annotations=[],
            marketing_angles=[],
            competitor_context=None,
            generated_at=datetime.utcnow().isoformat(),
        )

    async def _store_synthesis(self, execution_id: str, synthesis: Synthesis) -> None:
        """Persist the synthesis as JSON on the pipeline execution row.

        Args:
            execution_id: The pipeline execution ID (cast to ``uuid`` in SQL).
            synthesis: The synthesis to serialize and store.
        """
        # Local import: the module-level import only brings in `dataclass`.
        from dataclasses import asdict

        # asdict() walks the nested ActionItem / TimelineAnnotation
        # dataclasses in field order, yielding the same keys as a
        # hand-written dict without drifting when fields are added.
        payload = json.dumps(asdict(synthesis))

        status = await self.pool.execute(
            """
            UPDATE pipeline.executions
            SET
                synthesis = $2,
                updated_at = NOW()
            WHERE execution_id = $1::uuid
            """,
            execution_id,
            payload,
        )
        if status == "UPDATE 0":
            # Nothing matched: report it instead of silently dropping the result.
            logger.warning(
                "Stage 5: no pipeline.executions row matched execution %s; synthesis was not stored",
                execution_id,
            )