feat(pipeline): Add Stage 5 Synthesis for AI-generated narratives
- Add Stage5Synthesizer class that generates AI narratives and action plans
- Add generate() method to LLMClient for synthesis generation
- Integrate Stage 5 into pipeline runner after route stage
- Add synthesis JSONB column to pipeline.executions table
- Update reviewiq_analytics API to return synthesis data
- Synthesis includes: executive narrative, sentiment/category/timeline insights, action plan, marketing angles, and priority recommendations

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -7,9 +7,11 @@ the BasePipeline interface for the extensible pipeline system.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from datetime import date
|
||||
from datetime import date, datetime, timedelta
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from pipeline_core import (
|
||||
@@ -51,6 +53,8 @@ from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
|
||||
from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
|
||||
from reviewiq_pipeline.stages.stage3_route import Stage3Router
|
||||
from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
|
||||
from reviewiq_pipeline.stages.stage5_synthesize import Stage5Synthesizer
|
||||
from reviewiq_pipeline.services.llm_client import LLMClient
|
||||
from reviewiq_pipeline.validation.validators import (
|
||||
validate_stage1_output,
|
||||
validate_stage2_output,
|
||||
@@ -64,9 +68,65 @@ if TYPE_CHECKING:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Stage name to number mapping
|
||||
STAGE_NAMES = ["normalize", "classify", "route", "aggregate"]
|
||||
STAGE_NAME_TO_NUM = {"normalize": 1, "classify": 2, "route": 3, "aggregate": 4}
|
||||
STAGE_NUM_TO_NAME = {1: "normalize", 2: "classify", 3: "route", 4: "aggregate"}
|
||||
STAGE_NAMES = ["normalize", "classify", "route", "aggregate", "synthesize"]
|
||||
STAGE_NAME_TO_NUM = {"normalize": 1, "classify": 2, "route": 3, "aggregate": 4, "synthesize": 5}
|
||||
STAGE_NUM_TO_NAME = {1: "normalize", 2: "classify", 3: "route", 4: "aggregate", 5: "synthesize"}
|
||||
|
||||
|
||||
def _parse_relative_date(date_str: str | None, default_to_now: bool = True) -> datetime | None:
|
||||
"""Parse relative date strings like '10 months ago' into datetime objects.
|
||||
|
||||
Args:
|
||||
date_str: A relative date string (e.g., "10 months ago", "2 weeks ago")
|
||||
or an ISO date string, or None.
|
||||
default_to_now: If True, returns current datetime when parsing fails.
|
||||
|
||||
Returns:
|
||||
A datetime object, or None if parsing fails and default_to_now is False.
|
||||
"""
|
||||
now = datetime.now()
|
||||
|
||||
if not date_str:
|
||||
return now if default_to_now else None
|
||||
|
||||
# Try to parse as ISO date first
|
||||
try:
|
||||
return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
|
||||
except (ValueError, AttributeError):
|
||||
pass
|
||||
|
||||
# Parse relative dates like "10 months ago", "2 weeks ago", "a day ago"
|
||||
date_str = date_str.lower().strip()
|
||||
|
||||
# Handle "a/an" as 1
|
||||
date_str = re.sub(r'\b(a|an)\s+', '1 ', date_str)
|
||||
|
||||
# Extract number and unit
|
||||
match = re.match(r'(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago', date_str)
|
||||
if match:
|
||||
amount = int(match.group(1))
|
||||
unit = match.group(2)
|
||||
|
||||
if unit == 'second':
|
||||
return now - timedelta(seconds=amount)
|
||||
elif unit == 'minute':
|
||||
return now - timedelta(minutes=amount)
|
||||
elif unit == 'hour':
|
||||
return now - timedelta(hours=amount)
|
||||
elif unit == 'day':
|
||||
return now - timedelta(days=amount)
|
||||
elif unit == 'week':
|
||||
return now - timedelta(weeks=amount)
|
||||
elif unit == 'month':
|
||||
# Approximate months as 30 days
|
||||
return now - timedelta(days=amount * 30)
|
||||
elif unit == 'year':
|
||||
# Approximate years as 365 days
|
||||
return now - timedelta(days=amount * 365)
|
||||
|
||||
# If we can't parse it, return now or None
|
||||
logger.warning(f"Could not parse relative date: {date_str}")
|
||||
return now if default_to_now else None
|
||||
|
||||
|
||||
class PipelineResult:
|
||||
@@ -228,8 +288,11 @@ class ReviewIQPipeline(BasePipeline):
|
||||
stages_run: list[str] = []
|
||||
stage_results: dict[str, StageResult] = {}
|
||||
|
||||
# Convert input to ScraperOutput if needed
|
||||
scraper_output = self._ensure_scraper_output(input_data)
|
||||
# Convert input to ScraperOutput if needed (may fetch from DB)
|
||||
scraper_output = await self._ensure_scraper_output(input_data)
|
||||
|
||||
# Extract job_id for linking issues to pipeline executions
|
||||
job_id = scraper_output.get("job_id")
|
||||
|
||||
# Track intermediate results for stage dependencies
|
||||
stage1_result: Stage1Output | None = None
|
||||
@@ -270,6 +333,20 @@ class ReviewIQPipeline(BasePipeline):
|
||||
)
|
||||
|
||||
# Stage 2: Classify
|
||||
# If classify is requested but we don't have stage1_result, try to fetch from DB
|
||||
if "classify" in stages and not stage1_result and job_id:
|
||||
logger.info("No stage1_result, fetching existing normalized reviews from database")
|
||||
stage1_result = await self._fetch_normalized_reviews_from_db(job_id)
|
||||
if stage1_result:
|
||||
logger.info(f"Loaded {len(stage1_result.get('reviews_normalized', []))} reviews from DB for reclassification")
|
||||
# Clean up old spans and issues before reclassification
|
||||
if self._span_repo:
|
||||
deactivated = await self._span_repo.deactivate_spans_for_job(job_id)
|
||||
logger.info(f"Deactivated {deactivated} existing spans for job {job_id}")
|
||||
if self._issue_repo:
|
||||
deleted = await self._issue_repo.delete_issues_for_job(job_id)
|
||||
logger.info(f"Deleted {deleted} existing issues for job {job_id}")
|
||||
|
||||
if "classify" in stages and stage1_result:
|
||||
start = time.time()
|
||||
logger.info("Running Stage 2: Classification")
|
||||
@@ -308,7 +385,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
logger.info("Running Stage 3: Issue Routing")
|
||||
|
||||
try:
|
||||
stage3_result = await self._run_route(stage2_result)
|
||||
stage3_result = await self._run_route(stage2_result, job_id=job_id)
|
||||
duration_ms = int((time.time() - start) * 1000)
|
||||
stages_run.append("route")
|
||||
stage_results["route"] = StageResult(
|
||||
@@ -371,6 +448,43 @@ class ReviewIQPipeline(BasePipeline):
|
||||
error=f"aggregate failed: {e}",
|
||||
)
|
||||
|
||||
# Stage 5: Synthesize (AI-generated narratives)
|
||||
# Requires job_id and execution_id from pipeline execution tracking
|
||||
if "synthesize" in stages and job_id:
|
||||
start = time.time()
|
||||
logger.info("Running Stage 5: Synthesis")
|
||||
|
||||
try:
|
||||
# Get the execution_id for this pipeline run
|
||||
execution_id = input_data.get("execution_id")
|
||||
if execution_id:
|
||||
stage5_result = await self._run_synthesize(job_id, execution_id)
|
||||
duration_ms = int((time.time() - start) * 1000)
|
||||
stages_run.append("synthesize")
|
||||
stage_results["synthesize"] = StageResult(
|
||||
stage="synthesize",
|
||||
success=True,
|
||||
data={
|
||||
"actions_generated": len(stage5_result.action_plan) if stage5_result else 0,
|
||||
"has_narrative": bool(stage5_result and stage5_result.executive_narrative),
|
||||
},
|
||||
error=None,
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
else:
|
||||
logger.warning("No execution_id provided, skipping synthesis")
|
||||
except Exception as e:
|
||||
logger.exception("Stage 5 failed")
|
||||
stage_results["synthesize"] = StageResult(
|
||||
stage="synthesize",
|
||||
success=False,
|
||||
data={},
|
||||
error=str(e),
|
||||
duration_ms=int((time.time() - start) * 1000),
|
||||
)
|
||||
# Synthesis failure is non-fatal - pipeline still succeeds
|
||||
logger.warning(f"Synthesis failed but continuing: {e}")
|
||||
|
||||
return BasePipelineResult(
|
||||
pipeline_id="reviewiq",
|
||||
stages_run=stages_run,
|
||||
@@ -558,6 +672,34 @@ class ReviewIQPipeline(BasePipeline):
|
||||
],
|
||||
collapsed=False,
|
||||
),
|
||||
DashboardSection(
|
||||
id="classified_reviews",
|
||||
title="Classified Reviews",
|
||||
description="All reviews with URT classification codes and human-readable meanings",
|
||||
widgets=[
|
||||
WidgetConfig(
|
||||
id="classified_reviews_table",
|
||||
type="table",
|
||||
title="Reviews with URT Codes",
|
||||
grid={"x": 0, "y": 0, "w": 12, "h": 3},
|
||||
config={
|
||||
"columns": [
|
||||
{"key": "span_text", "header": "Review Excerpt", "width": 300},
|
||||
{"key": "urt_code", "header": "Code", "width": 80},
|
||||
{"key": "code_name", "header": "Category", "width": 150},
|
||||
{"key": "domain_name", "header": "Domain", "width": 100},
|
||||
{"key": "valence", "header": "Sentiment", "width": 80},
|
||||
{"key": "intensity", "header": "Intensity", "width": 80},
|
||||
{"key": "rating", "header": "Stars", "width": 60, "align": "center"},
|
||||
],
|
||||
"row_key": "span_id",
|
||||
"page_size": 15,
|
||||
"sortable": True,
|
||||
},
|
||||
),
|
||||
],
|
||||
collapsed=False,
|
||||
),
|
||||
],
|
||||
default_time_range="30d",
|
||||
refresh_interval=300,
|
||||
@@ -573,7 +715,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
|
||||
Args:
|
||||
widget_id: Widget identifier
|
||||
params: Query parameters (business_id, time_range, etc.)
|
||||
params: Query parameters (business_id, job_id, time_range, etc.)
|
||||
|
||||
Returns:
|
||||
Widget data dictionary
|
||||
@@ -581,36 +723,41 @@ class ReviewIQPipeline(BasePipeline):
|
||||
await self.initialize()
|
||||
|
||||
business_id = params.get("business_id")
|
||||
job_id = params.get("job_id")
|
||||
time_range = params.get("time_range", "30d")
|
||||
|
||||
match widget_id:
|
||||
# Overview stats
|
||||
case "total_reviews":
|
||||
return await self._get_review_count(business_id)
|
||||
return await self._get_review_count(business_id, job_id)
|
||||
case "reviews_processed":
|
||||
return await self._get_processed_count(business_id, time_range)
|
||||
return await self._get_processed_count(business_id, job_id, time_range)
|
||||
case "issues_found":
|
||||
return await self._get_issues_count(business_id)
|
||||
return await self._get_issues_count(business_id, job_id)
|
||||
case "avg_rating":
|
||||
return await self._get_avg_rating(business_id, time_range)
|
||||
return await self._get_avg_rating(business_id, job_id, time_range)
|
||||
|
||||
# Sentiment
|
||||
case "sentiment_distribution":
|
||||
return await self._get_sentiment_distribution(business_id)
|
||||
return await self._get_sentiment_distribution(business_id, job_id)
|
||||
case "sentiment_trend":
|
||||
return await self._get_sentiment_trend(business_id, time_range)
|
||||
return await self._get_sentiment_trend(business_id, job_id, time_range)
|
||||
|
||||
# Classification
|
||||
case "urt_distribution":
|
||||
return await self._get_urt_distribution(business_id)
|
||||
return await self._get_urt_distribution(business_id, job_id)
|
||||
case "intensity_heatmap":
|
||||
return await self._get_intensity_heatmap(business_id)
|
||||
return await self._get_intensity_heatmap(business_id, job_id)
|
||||
|
||||
# Issues
|
||||
case "issues_table":
|
||||
return await self._get_issues_table(business_id, params)
|
||||
return await self._get_issues_table(business_id, job_id, params)
|
||||
case "issues_by_domain":
|
||||
return await self._get_issues_by_domain(business_id)
|
||||
return await self._get_issues_by_domain(business_id, job_id)
|
||||
|
||||
# Classified Reviews
|
||||
case "classified_reviews_table":
|
||||
return await self._get_classified_reviews(business_id, job_id, params)
|
||||
|
||||
case _:
|
||||
logger.warning(f"Unknown widget: {widget_id}")
|
||||
@@ -643,6 +790,9 @@ class ReviewIQPipeline(BasePipeline):
|
||||
result = PipelineResult()
|
||||
validation_results: dict[str, ValidationResult] = {}
|
||||
|
||||
# Extract job_id for linking issues
|
||||
job_id = scraper_output.get("job_id")
|
||||
|
||||
# Stage 1: Normalize
|
||||
if 1 in stages:
|
||||
logger.info("Running Stage 1: Normalization")
|
||||
@@ -668,7 +818,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
# Stage 3: Route
|
||||
if 3 in stages and result.stage2:
|
||||
logger.info("Running Stage 3: Issue Routing")
|
||||
result.stage3 = await self._run_route(result.stage2)
|
||||
result.stage3 = await self._run_route(result.stage2, job_id=job_id)
|
||||
|
||||
if validate:
|
||||
validation_results["stage3"] = await validate_stage3_output(
|
||||
@@ -700,10 +850,10 @@ class ReviewIQPipeline(BasePipeline):
|
||||
await self.initialize()
|
||||
return await self._run_classify(stage1_output)
|
||||
|
||||
async def route(self, stage2_output: Stage2Output) -> Stage3Output:
|
||||
async def route(self, stage2_output: Stage2Output, job_id: str | None = None) -> Stage3Output:
|
||||
"""Run Stage 3: Issue Routing (legacy method)."""
|
||||
await self.initialize()
|
||||
return await self._run_route(stage2_output)
|
||||
return await self._run_route(stage2_output, job_id=job_id)
|
||||
|
||||
async def aggregate(
|
||||
self,
|
||||
@@ -719,14 +869,91 @@ class ReviewIQPipeline(BasePipeline):
|
||||
# Internal Stage Implementations
|
||||
# =========================================================================
|
||||
|
||||
def _ensure_scraper_output(self, input_data: dict[str, Any]) -> ScraperOutput:
|
||||
"""Ensure input data is in ScraperOutput format."""
|
||||
async def _ensure_scraper_output(self, input_data: dict[str, Any]) -> ScraperOutput:
|
||||
"""Ensure input data is in ScraperOutput format.
|
||||
|
||||
If only job_id is provided, fetches job data from the database.
|
||||
"""
|
||||
# If it has all required fields, use as-is
|
||||
required = ["job_id", "business_id", "place_id", "reviews"]
|
||||
if all(k in input_data for k in required):
|
||||
return input_data # type: ignore
|
||||
|
||||
# Otherwise, wrap it
|
||||
# If we have a job_id but missing reviews, fetch from database
|
||||
job_id = input_data.get("job_id")
|
||||
if job_id and not input_data.get("reviews") and self._db:
|
||||
logger.info(f"Fetching job data from database for job_id: {job_id}")
|
||||
async with self._db.pool.acquire() as conn:
|
||||
row = await conn.fetchrow(
|
||||
"""
|
||||
SELECT job_id, status, reviews_data, reviews_count,
|
||||
metadata->>'business_name' as business_name,
|
||||
metadata->>'place_id' as place_id,
|
||||
metadata->>'address' as address,
|
||||
metadata->>'category' as category,
|
||||
metadata->>'total_reviews' as total_reviews,
|
||||
metadata->>'average_rating' as average_rating,
|
||||
scraper_version
|
||||
FROM public.jobs
|
||||
WHERE job_id = $1::uuid
|
||||
""",
|
||||
str(job_id),
|
||||
)
|
||||
|
||||
if row and row["reviews_data"]:
|
||||
reviews_data = row["reviews_data"]
|
||||
# asyncpg may return JSONB as a string - parse it if needed
|
||||
if isinstance(reviews_data, str):
|
||||
logger.info("Parsing reviews_data JSON string")
|
||||
reviews_data = json.loads(reviews_data)
|
||||
# Convert reviews_data to RawReview format
|
||||
# Handle both API format (review_id, author, rating) and scraper format (reviewId, name, stars)
|
||||
reviews = []
|
||||
for i, review in enumerate(reviews_data):
|
||||
if isinstance(review, str):
|
||||
# Skip if review is somehow a string
|
||||
logger.warning(f"Skipping review {i}: got string instead of dict")
|
||||
continue
|
||||
# Parse the review time (may be relative like "10 months ago")
|
||||
raw_time = review.get("timestamp") or review.get("publishedAtDate") or ""
|
||||
parsed_time = _parse_relative_date(raw_time)
|
||||
|
||||
reviews.append({
|
||||
"review_id": review.get("review_id") or review.get("reviewId") or f"review_{i}",
|
||||
"author_name": review.get("author") or review.get("name") or "Anonymous",
|
||||
"author_id": review.get("reviewerId"),
|
||||
"rating": review.get("rating") or review.get("stars") or 0,
|
||||
"text": review.get("text"),
|
||||
"review_time": parsed_time,
|
||||
"response_text": review.get("responseFromOwner", {}).get("text") if review.get("responseFromOwner") else None,
|
||||
"response_time": review.get("responseFromOwner", {}).get("publishedAtDate") if review.get("responseFromOwner") else None,
|
||||
"photos": review.get("reviewImageUrls"),
|
||||
"raw_payload": review,
|
||||
})
|
||||
|
||||
logger.info(f"Loaded {len(reviews)} reviews from job {job_id}")
|
||||
|
||||
return ScraperOutput(
|
||||
job_id=str(row["job_id"]),
|
||||
status=row["status"] or "completed",
|
||||
business_id=row["business_name"] or "unknown",
|
||||
place_id=row["place_id"] or "unknown",
|
||||
business_info={
|
||||
"name": row["business_name"] or "",
|
||||
"address": row["address"] or "",
|
||||
"category": row["category"] or "",
|
||||
"total_reviews": int(row["total_reviews"]) if row["total_reviews"] else 0,
|
||||
"average_rating": float(row["average_rating"]) if row["average_rating"] else 0.0,
|
||||
},
|
||||
reviews=reviews,
|
||||
scrape_time_ms=0,
|
||||
reviews_scraped=len(reviews),
|
||||
scraper_version=row["scraper_version"] or "unknown",
|
||||
)
|
||||
else:
|
||||
logger.warning(f"No reviews found in database for job_id: {job_id}")
|
||||
|
||||
# Otherwise, wrap it with empty/default values
|
||||
return ScraperOutput(
|
||||
job_id=input_data.get("job_id", "unknown"),
|
||||
status=input_data.get("status", "completed"),
|
||||
@@ -739,6 +966,70 @@ class ReviewIQPipeline(BasePipeline):
|
||||
scraper_version=input_data.get("scraper_version", "unknown"),
|
||||
)
|
||||
|
||||
async def _fetch_normalized_reviews_from_db(self, job_id: str) -> Stage1Output | None:
|
||||
"""Fetch existing normalized reviews from DB for reclassification.
|
||||
|
||||
Used when running classify stage standalone without normalize.
|
||||
"""
|
||||
if not self._db:
|
||||
return None
|
||||
|
||||
async with self._db.pool.acquire() as conn:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
source,
|
||||
review_id,
|
||||
review_version,
|
||||
business_id,
|
||||
place_id,
|
||||
text,
|
||||
text_normalized,
|
||||
rating,
|
||||
review_time
|
||||
FROM pipeline.reviews_enriched
|
||||
WHERE job_id = $1::uuid
|
||||
AND is_latest = TRUE
|
||||
ORDER BY review_time DESC
|
||||
""",
|
||||
job_id,
|
||||
)
|
||||
|
||||
if not rows:
|
||||
logger.warning(f"No normalized reviews found in DB for job_id: {job_id}")
|
||||
return None
|
||||
|
||||
reviews_normalized = [
|
||||
NormalizedReview(
|
||||
source=row["source"],
|
||||
review_id=row["review_id"],
|
||||
review_version=row["review_version"],
|
||||
business_id=row["business_id"],
|
||||
place_id=row["place_id"],
|
||||
text=row["text"],
|
||||
text_normalized=row["text_normalized"],
|
||||
rating=row["rating"],
|
||||
review_time=row["review_time"],
|
||||
)
|
||||
for row in rows
|
||||
]
|
||||
|
||||
logger.info(f"Fetched {len(reviews_normalized)} normalized reviews from DB for job {job_id}")
|
||||
|
||||
return Stage1Output(
|
||||
job_id=job_id,
|
||||
reviews_normalized=reviews_normalized,
|
||||
reviews_skipped=[],
|
||||
duplicates_found=[],
|
||||
stats={
|
||||
"total_input": len(reviews_normalized),
|
||||
"processed": len(reviews_normalized),
|
||||
"skipped": 0,
|
||||
"duplicates": 0,
|
||||
"from_db": True,
|
||||
},
|
||||
)
|
||||
|
||||
async def _run_normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
|
||||
"""Run normalization stage."""
|
||||
stage1 = Stage1Normalizer(
|
||||
@@ -788,6 +1079,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
taxonomy_version=self._config.taxonomy_version,
|
||||
profile=self._config.classification_profile,
|
||||
max_spans_per_review=self._config.max_spans_per_review,
|
||||
job_id=stage1_output.get("job_id"),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -796,7 +1088,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
finally:
|
||||
await stage2.close()
|
||||
|
||||
async def _run_route(self, stage2_output: Stage2Output) -> Stage3Output:
|
||||
async def _run_route(self, stage2_output: Stage2Output, job_id: str | None = None) -> Stage3Output:
|
||||
"""Run issue routing stage."""
|
||||
stage3 = Stage3Router(
|
||||
self._config,
|
||||
@@ -806,9 +1098,12 @@ class ReviewIQPipeline(BasePipeline):
|
||||
)
|
||||
|
||||
spans_to_route = []
|
||||
now = datetime.now()
|
||||
for review in stage2_output["reviews_classified"]:
|
||||
for span in review.get("spans", []):
|
||||
if span["valence"] in ("V-", "V±"):
|
||||
# Use current datetime as fallback for missing review_time
|
||||
review_time = review.get("review_time") or now
|
||||
spans_to_route.append(
|
||||
SpanToRoute(
|
||||
span_id=span["span_id"],
|
||||
@@ -818,13 +1113,13 @@ class ReviewIQPipeline(BasePipeline):
|
||||
valence=span["valence"],
|
||||
intensity=span["intensity"],
|
||||
entity_normalized=span.get("entity_normalized"),
|
||||
review_time=review.get("review_time", ""),
|
||||
review_time=review_time,
|
||||
confidence=span.get("confidence", "medium"),
|
||||
trust_score=review.get("trust_score", 0.5),
|
||||
)
|
||||
)
|
||||
|
||||
return await stage3.process(Stage3Input(spans=spans_to_route))
|
||||
return await stage3.process(Stage3Input(spans=spans_to_route, job_id=job_id))
|
||||
|
||||
async def _run_aggregate(
|
||||
self,
|
||||
@@ -848,17 +1143,39 @@ class ReviewIQPipeline(BasePipeline):
|
||||
|
||||
return await stage4.process(input_data)
|
||||
|
||||
async def _run_synthesize(self, job_id: str, execution_id: str):
|
||||
"""Run AI synthesis stage to generate narratives and action plans."""
|
||||
from reviewiq_pipeline.stages.stage5_synthesize import Synthesis
|
||||
|
||||
# Create LLM client for synthesis
|
||||
llm_client = LLMClient.create(self._config)
|
||||
|
||||
try:
|
||||
stage5 = Stage5Synthesizer(
|
||||
pool=self._db.pool,
|
||||
llm_client=llm_client,
|
||||
)
|
||||
|
||||
return await stage5.run(job_id, execution_id)
|
||||
finally:
|
||||
await llm_client.close()
|
||||
|
||||
# =========================================================================
|
||||
# Widget Data Methods
|
||||
# =========================================================================
|
||||
|
||||
async def _get_review_count(self, business_id: str | None) -> dict[str, Any]:
|
||||
async def _get_review_count(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
|
||||
"""Get total review count."""
|
||||
if not self._db:
|
||||
return {"total_reviews": 0}
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
count = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
count = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM pipeline.reviews_raw WHERE business_id = $1",
|
||||
business_id,
|
||||
@@ -871,7 +1188,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
return {"total_reviews": count or 0}
|
||||
|
||||
async def _get_processed_count(
|
||||
self, business_id: str | None, time_range: str
|
||||
self, business_id: str | None, job_id: str | None, time_range: str
|
||||
) -> dict[str, Any]:
|
||||
"""Get processed review count with trend."""
|
||||
if not self._db:
|
||||
@@ -881,7 +1198,14 @@ class ReviewIQPipeline(BasePipeline):
|
||||
days = self._parse_time_range(time_range)
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
# When filtering by job_id, just return count for that job
|
||||
current = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
|
||||
job_id,
|
||||
)
|
||||
return {"reviews_processed": current or 0, "processed_change": 0}
|
||||
elif business_id:
|
||||
current = await conn.fetchval(
|
||||
"""
|
||||
SELECT COUNT(*) FROM pipeline.reviews_enriched
|
||||
@@ -929,13 +1253,21 @@ class ReviewIQPipeline(BasePipeline):
|
||||
"processed_change": round(change, 1),
|
||||
}
|
||||
|
||||
async def _get_issues_count(self, business_id: str | None) -> dict[str, Any]:
|
||||
async def _get_issues_count(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
|
||||
"""Get open issues count."""
|
||||
if not self._db:
|
||||
return {"issues_count": 0}
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
count = await conn.fetchval(
|
||||
"""
|
||||
SELECT COUNT(*) FROM pipeline.issues
|
||||
WHERE job_id = $1::uuid AND state = 'open'
|
||||
""",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
count = await conn.fetchval(
|
||||
"""
|
||||
SELECT COUNT(*) FROM pipeline.issues
|
||||
@@ -951,7 +1283,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
return {"issues_count": count or 0}
|
||||
|
||||
async def _get_avg_rating(
|
||||
self, business_id: str | None, time_range: str
|
||||
self, business_id: str | None, job_id: str | None, time_range: str
|
||||
) -> dict[str, Any]:
|
||||
"""Get average rating with trend."""
|
||||
if not self._db:
|
||||
@@ -960,7 +1292,13 @@ class ReviewIQPipeline(BasePipeline):
|
||||
days = self._parse_time_range(time_range)
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
current = await conn.fetchval(
|
||||
"SELECT AVG(rating) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
|
||||
job_id,
|
||||
)
|
||||
return {"avg_rating": round(float(current), 2) if current else 0, "rating_change": 0}
|
||||
elif business_id:
|
||||
current = await conn.fetchval(
|
||||
"""
|
||||
SELECT AVG(rating) FROM pipeline.reviews_enriched
|
||||
@@ -1009,14 +1347,26 @@ class ReviewIQPipeline(BasePipeline):
|
||||
}
|
||||
|
||||
async def _get_sentiment_distribution(
|
||||
self, business_id: str | None
|
||||
self, business_id: str | None, job_id: str | None = None
|
||||
) -> dict[str, Any]:
|
||||
"""Get sentiment distribution for pie chart."""
|
||||
if not self._db:
|
||||
return {"data": []}
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
valence,
|
||||
COUNT(*) as count
|
||||
FROM pipeline.review_spans
|
||||
WHERE job_id = $1::uuid AND is_active = TRUE
|
||||
GROUP BY valence
|
||||
""",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
@@ -1059,7 +1409,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
return {"data": data}
|
||||
|
||||
async def _get_sentiment_trend(
|
||||
self, business_id: str | None, time_range: str
|
||||
self, business_id: str | None, job_id: str | None, time_range: str
|
||||
) -> dict[str, Any]:
|
||||
"""Get sentiment trend over time for line chart."""
|
||||
if not self._db:
|
||||
@@ -1068,7 +1418,23 @@ class ReviewIQPipeline(BasePipeline):
|
||||
days = self._parse_time_range(time_range)
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
DATE(review_time) as date,
|
||||
COUNT(*) FILTER (WHERE valence = 'V+') as positive,
|
||||
COUNT(*) FILTER (WHERE valence = 'V-') as negative,
|
||||
COUNT(*) FILTER (WHERE valence = 'V0') as neutral
|
||||
FROM pipeline.review_spans
|
||||
WHERE job_id = $1::uuid
|
||||
AND is_active = TRUE
|
||||
GROUP BY DATE(review_time)
|
||||
ORDER BY date
|
||||
""",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
@@ -1115,13 +1481,26 @@ class ReviewIQPipeline(BasePipeline):
|
||||
|
||||
return {"data": data}
|
||||
|
||||
async def _get_urt_distribution(self, business_id: str | None) -> dict[str, Any]:
|
||||
async def _get_urt_distribution(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
|
||||
"""Get URT domain distribution for bar chart."""
|
||||
if not self._db:
|
||||
return {"data": []}
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
SUBSTRING(urt_primary, 1, 1) as domain,
|
||||
COUNT(*) as count
|
||||
FROM pipeline.review_spans
|
||||
WHERE job_id = $1::uuid AND is_active = TRUE
|
||||
GROUP BY SUBSTRING(urt_primary, 1, 1)
|
||||
ORDER BY count DESC
|
||||
""",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
@@ -1168,13 +1547,26 @@ class ReviewIQPipeline(BasePipeline):
|
||||
|
||||
return {"data": data}
|
||||
|
||||
async def _get_intensity_heatmap(self, business_id: str | None) -> dict[str, Any]:
|
||||
async def _get_intensity_heatmap(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
|
||||
"""Get domain x intensity heatmap data."""
|
||||
if not self._db:
|
||||
return {"data": []}
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
SUBSTRING(urt_primary, 1, 1) as domain,
|
||||
intensity,
|
||||
COUNT(*) as count
|
||||
FROM pipeline.review_spans
|
||||
WHERE job_id = $1::uuid AND is_active = TRUE
|
||||
GROUP BY SUBSTRING(urt_primary, 1, 1), intensity
|
||||
""",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
@@ -1222,7 +1614,7 @@ class ReviewIQPipeline(BasePipeline):
|
||||
return {"data": data}
|
||||
|
||||
async def _get_issues_table(
|
||||
self, business_id: str | None, params: dict[str, Any]
|
||||
self, business_id: str | None, job_id: str | None, params: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""Get issues table data."""
|
||||
if not self._db:
|
||||
@@ -1233,7 +1625,30 @@ class ReviewIQPipeline(BasePipeline):
|
||||
offset = (page - 1) * page_size
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
issue_id,
|
||||
domain,
|
||||
primary_subcode as subcode,
|
||||
span_count,
|
||||
max_intensity,
|
||||
state
|
||||
FROM pipeline.issues
|
||||
WHERE job_id = $1::uuid
|
||||
ORDER BY span_count DESC, created_at DESC
|
||||
LIMIT $2 OFFSET $3
|
||||
""",
|
||||
job_id,
|
||||
page_size,
|
||||
offset,
|
||||
)
|
||||
total = await conn.fetchval(
|
||||
"SELECT COUNT(*) FROM pipeline.issues WHERE job_id = $1::uuid",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT
|
||||
@@ -1279,13 +1694,24 @@ class ReviewIQPipeline(BasePipeline):
|
||||
|
||||
return {"data": data, "total": total or 0}
|
||||
|
||||
async def _get_issues_by_domain(self, business_id: str | None) -> dict[str, Any]:
|
||||
async def _get_issues_by_domain(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
|
||||
"""Get issues grouped by domain for pie chart."""
|
||||
if not self._db:
|
||||
return {"data": []}
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
if business_id:
|
||||
if job_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT domain, COUNT(*) as count
|
||||
FROM pipeline.issues
|
||||
WHERE job_id = $1::uuid
|
||||
GROUP BY domain
|
||||
ORDER BY count DESC
|
||||
""",
|
||||
job_id,
|
||||
)
|
||||
elif business_id:
|
||||
rows = await conn.fetch(
|
||||
"""
|
||||
SELECT domain, COUNT(*) as count
|
||||
@@ -1310,6 +1736,89 @@ class ReviewIQPipeline(BasePipeline):
|
||||
|
||||
return {"data": data}
|
||||
|
||||
async def _get_classified_reviews(
|
||||
self, business_id: str | None, job_id: str | None, params: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""Get classified reviews with URT codes and human-readable names."""
|
||||
if not self._db:
|
||||
return {"data": [], "total": 0}
|
||||
|
||||
page = params.get("page", 1)
|
||||
page_size = params.get("page_size", 15)
|
||||
offset = (page - 1) * page_size
|
||||
|
||||
async with self._db._pool.acquire() as conn:
|
||||
# Build the query with JOINs to get human-readable code names
|
||||
base_query = """
|
||||
SELECT
|
||||
s.span_id,
|
||||
s.span_text,
|
||||
s.urt_primary as urt_code,
|
||||
COALESCE(sub.name, cat.name, dom.name) as code_name,
|
||||
COALESCE(sub.definition, dom.description) as code_definition,
|
||||
dom.name as domain_name,
|
||||
CASE s.valence
|
||||
WHEN 'V+' THEN 'Positive'
|
||||
WHEN 'V-' THEN 'Negative'
|
||||
WHEN 'V0' THEN 'Neutral'
|
||||
WHEN 'V±' THEN 'Mixed'
|
||||
ELSE s.valence
|
||||
END as valence,
|
||||
CASE s.intensity
|
||||
WHEN 'I1' THEN 'Mild'
|
||||
WHEN 'I2' THEN 'Moderate'
|
||||
WHEN 'I3' THEN 'Strong'
|
||||
ELSE s.intensity
|
||||
END as intensity,
|
||||
e.rating,
|
||||
s.review_time
|
||||
FROM pipeline.review_spans s
|
||||
LEFT JOIN pipeline.reviews_enriched e ON s.review_id = e.review_id AND s.review_version = e.review_version
|
||||
LEFT JOIN pipeline.urt_domains dom ON SUBSTRING(s.urt_primary, 1, 1) = dom.code
|
||||
LEFT JOIN pipeline.urt_categories cat ON SUBSTRING(s.urt_primary, 1, 2) = cat.code
|
||||
LEFT JOIN pipeline.urt_subcodes sub ON s.urt_primary = sub.code
|
||||
WHERE s.is_active = TRUE
|
||||
"""
|
||||
count_query = """
|
||||
SELECT COUNT(*) FROM pipeline.review_spans s
|
||||
WHERE s.is_active = TRUE
|
||||
"""
|
||||
|
||||
if job_id:
|
||||
base_query += " AND s.job_id = $1::uuid"
|
||||
count_query += " AND s.job_id = $1::uuid"
|
||||
base_query += f" ORDER BY s.review_time DESC LIMIT {page_size} OFFSET {offset}"
|
||||
rows = await conn.fetch(base_query, job_id)
|
||||
total = await conn.fetchval(count_query, job_id)
|
||||
elif business_id:
|
||||
base_query += " AND s.business_id = $1"
|
||||
count_query += " AND s.business_id = $1"
|
||||
base_query += f" ORDER BY s.review_time DESC LIMIT {page_size} OFFSET {offset}"
|
||||
rows = await conn.fetch(base_query, business_id)
|
||||
total = await conn.fetchval(count_query, business_id)
|
||||
else:
|
||||
base_query += f" ORDER BY s.review_time DESC LIMIT {page_size} OFFSET {offset}"
|
||||
rows = await conn.fetch(base_query)
|
||||
total = await conn.fetchval(count_query)
|
||||
|
||||
data = [
|
||||
{
|
||||
"span_id": row["span_id"],
|
||||
"span_text": row["span_text"],
|
||||
"urt_code": row["urt_code"],
|
||||
"code_name": row["code_name"] or "Unknown",
|
||||
"code_definition": row["code_definition"] or "",
|
||||
"domain_name": row["domain_name"] or "Unknown",
|
||||
"valence": row["valence"],
|
||||
"intensity": row["intensity"],
|
||||
"rating": row["rating"],
|
||||
"review_time": row["review_time"].isoformat() if row["review_time"] else None,
|
||||
}
|
||||
for row in rows
|
||||
]
|
||||
|
||||
return {"data": data, "total": total or 0}
|
||||
|
||||
def _parse_time_range(self, time_range: str) -> int:
|
||||
"""Parse time range string to days."""
|
||||
if time_range.endswith("d"):
|
||||
|
||||
@@ -29,28 +29,205 @@ Your task is to extract semantic spans from customer reviews and classify each s
|
||||
|
||||
## SPAN EXTRACTION RULES
|
||||
|
||||
1. **Split on contrasting conjunctions**: but, however, although, despite, yet, though
|
||||
2. **Split on topic/target change**: food → service → bathroom = 3 spans
|
||||
3. **Split on valence change**: positive → negative = split
|
||||
4. **Split on domain change**: O (Offering) → J (Journey) → E (Environment) = split
|
||||
5. **Keep together**: cause→effect within same feedback unit ("X because Y" = 1 span)
|
||||
**CRITICAL: Use TOPIC-BASED splitting, NOT sentence-based splitting.**
|
||||
|
||||
A span = all consecutive text about the SAME topic/domain, regardless of sentence count.
|
||||
|
||||
### When to KEEP TOGETHER (same span):
|
||||
- Multiple sentences about the same topic: "The food was great. I loved the pasta. The sauce was perfect." → ONE span (all about Offering)
|
||||
- Cause and effect: "The wait was long because they were understaffed" → ONE span
|
||||
- Elaboration: "Staff was rude. They ignored us for 20 minutes." → ONE span (both about People)
|
||||
- Single-topic reviews: Even if 5 sentences, if all about food → ONE span
|
||||
|
||||
### When to SPLIT (separate spans):
|
||||
- Contrasting conjunctions that change topic: "Food was great BUT service was slow" → TWO spans
|
||||
- Domain change: food (O) → staff (P) → ambiance (E) = split at each change
|
||||
- Target change: "The waiter was nice but the manager was rude" → TWO spans (different people)
|
||||
|
||||
### Examples:
|
||||
- "Amazing food. Best burger ever. Fries were crispy too." → 1 span (all Offering, V+)
|
||||
- "Food was great but we waited an hour." → 2 spans (Offering V+, Journey V-)
|
||||
- "I've been coming here for years. Always consistent quality." → 1 span (Relationship)
|
||||
- "The staff are lovely and amazing with kids. More highchairs are definitely needed though." → 2 spans (People V+, Access V-)
|
||||
|
||||
**Guardrails**:
|
||||
- Max 3 spans per sentence (if 4+, re-check for over-splitting)
|
||||
- Min 1 span per review (even single-word reviews)
|
||||
- Spans must be non-overlapping and cover meaningful content
|
||||
- Prefer FEWER, LARGER spans over many small ones
|
||||
- Most reviews should have 1-3 spans, rarely more
|
||||
- Min 1 span per review
|
||||
- Spans must be non-overlapping
|
||||
|
||||
## URT DOMAINS (Tier-3 codes: X#.##)
|
||||
## URT TAXONOMY - COMPLETE (138 codes, use EXACT codes)
|
||||
|
||||
| Domain | Code | Description |
|
||||
|--------|------|-------------|
|
||||
| Offering | O1-O4 | Product/service quality, features, variety |
|
||||
| Price | P1-P4 | Value, pricing, promotions, payment |
|
||||
| Journey | J1-J4 | Timing, process, convenience, accessibility |
|
||||
| Environment | E1-E4 | Physical space, ambiance, cleanliness, digital UX |
|
||||
| Attitude | A1-A4 | Staff behavior, helpfulness, professionalism |
|
||||
| Voice | V1-V4 | Brand, communication, marketing, transparency |
|
||||
| Relationship | R1-R4 | Loyalty, trust, consistency, personalization |
|
||||
### O - OFFERING (Product/Service Quality) - 18 codes
|
||||
O1.01 Works/Doesn't Work: Basic functionality success or failure
|
||||
O1.02 Performance Level: How well it operates
|
||||
O1.03 Durability: Longevity and resistance to wear
|
||||
O1.04 Reliability: Consistency of function over time
|
||||
O1.05 Outcome Achievement: Did customer accomplish their goal?
|
||||
O2.01 Materials/Inputs: Quality of components or ingredients
|
||||
O2.02 Craftsmanship: Skill of construction or execution
|
||||
O2.03 Presentation: Visual and aesthetic quality
|
||||
O2.04 Attention to Detail: Finishing touches and refinement
|
||||
O2.05 Condition at Delivery: State when received
|
||||
O3.01 All Components Present: Nothing missing from what was promised
|
||||
O3.02 Feature Availability: Promised features actually work
|
||||
O3.03 Scope Delivery: Full scope of work completed
|
||||
O3.04 Documentation: Supporting materials provided
|
||||
O4.01 Specification Match: Matches what was ordered
|
||||
O4.02 Personalization: Adapted to individual preferences
|
||||
O4.03 Flexibility: Can be modified or adjusted
|
||||
O4.04 Appropriateness: Right solution for the need
|
||||
|
||||
### P - PEOPLE (Staff Interactions) - 20 codes
|
||||
P1.01 Warmth: Friendly and welcoming manner
|
||||
P1.02 Respect: Treated with dignity
|
||||
P1.03 Patience: Calm and tolerant approach
|
||||
P1.04 Enthusiasm: Energy and engagement
|
||||
P1.05 Empathy: Understanding feelings
|
||||
P2.01 Knowledge: Expertise and understanding
|
||||
P2.02 Skill: Technical ability
|
||||
P2.03 Problem Solving: Ability to find solutions
|
||||
P2.04 Advice Quality: Helpful recommendations
|
||||
P2.05 Training Level: Staff training evident
|
||||
P3.01 Attentiveness: Being present and engaged
|
||||
P3.02 Initiative: Proactive help
|
||||
P3.03 Follow-through: Completing promised actions
|
||||
P3.04 Availability: Being available when needed
|
||||
P3.05 Dedication: Commitment to helping
|
||||
P4.01 Clarity: Clear communication
|
||||
P4.02 Listening: Understanding customer needs
|
||||
P4.03 Transparency: Honest and open
|
||||
P4.04 Honesty: Truthful communication
|
||||
P4.05 Proactive Updates: Keeping customer informed
|
||||
|
||||
### J - JOURNEY (Process & Timing) - 20 codes
|
||||
J1.01 Speed: How fast things happen
|
||||
J1.02 Punctuality: On-time delivery
|
||||
J1.03 Queue Management: Handling of waiting customers
|
||||
J1.04 Punctuality: Meeting scheduled times
|
||||
J1.05 Pacing: Appropriate speed (not rushed/dragged)
|
||||
J2.01 Simplicity: Easy process
|
||||
J2.02 Friction: Obstacles encountered
|
||||
J2.03 Navigation: Finding what you need
|
||||
J2.04 Booking Availability: Slots/capacity when needed
|
||||
J2.05 Inventory: Stock availability
|
||||
J3.01 Consistency: Same experience every time
|
||||
J3.02 Accuracy: Getting it right
|
||||
J3.03 Uptime: System availability
|
||||
J3.04 Data Accuracy: Correct info in systems
|
||||
J3.05 Integration: Systems work together
|
||||
J4.01 Problem Recognition: Acknowledging issues
|
||||
J4.02 Resolution Speed: How fast problems get fixed
|
||||
J4.03 Resolution Fairness: Fair handling of issues
|
||||
J4.04 Escalation: Getting to right person
|
||||
J4.05 Closure: Issue fully resolved
|
||||
|
||||
### E - ENVIRONMENT (Physical & Digital Space) - 20 codes
|
||||
E1.01 Cleanliness: How clean the space is
|
||||
E1.02 Comfort: Physical comfort
|
||||
E1.03 Space Design: Layout and organization
|
||||
E1.04 Ambiance: Atmosphere and vibe
|
||||
E1.05 Comfort: Physical comfort
|
||||
E2.01 Lighting: Light quality and level
|
||||
E2.02 Sound/Noise: Audio environment
|
||||
E2.03 Temperature: Climate control
|
||||
E2.04 Visual Design: Aesthetics of interface
|
||||
E2.05 Mobile Experience: Mobile usability
|
||||
E3.01 Interface Design: Digital UX/UI
|
||||
E3.02 App/Website Speed: Digital performance
|
||||
E3.03 Usability: Ease of digital use
|
||||
E3.04 Health Safety: Health precautions
|
||||
E3.05 Cyber Security: Digital security
|
||||
E4.01 Safety: Physical safety
|
||||
E4.02 Security: Protection of belongings/data
|
||||
E4.03 Health/Hygiene: Health standards
|
||||
E4.04 Social Responsibility: Ethical practices
|
||||
E4.05 Community Impact: Local community effect
|
||||
|
||||
### A - ACCESS (Availability & Accessibility) - 20 codes
|
||||
A1.01 Hours: Operating hours
|
||||
A1.02 Booking Availability: Appointment slots
|
||||
A1.03 Inventory: Product availability
|
||||
A1.04 Wayfinding: Finding destination
|
||||
A1.05 Physical Accessibility: Disability accommodations
|
||||
A2.01 Physical Access: Mobility accessibility
|
||||
A2.02 Language Access: Language accommodation
|
||||
A2.03 Digital Accessibility: Screen reader/a11y
|
||||
A2.04 Language Accessibility: Multilingual support
|
||||
A2.05 Hours of Operation: Service availability times
|
||||
A3.01 Diversity Welcome: All backgrounds welcome
|
||||
A3.02 Accommodation: Special needs accommodation
|
||||
A3.03 Response Time: Speed of getting answers
|
||||
A3.04 Documentation Clarity: Clear instructions
|
||||
A3.05 Support Accessibility: Getting help when needed
|
||||
A4.01 Location: Physical location convenience
|
||||
A4.02 Parking: Parking availability
|
||||
A4.03 Multiple Channels: Ways to engage
|
||||
A4.04 Payment Flexibility: Multiple payment options
|
||||
A4.05 Refund Accessibility: Getting money back
|
||||
|
||||
### V - VALUE (Pricing & Costs) - 20 codes ⚠️ USE FOR ALL PRICE/COST/FEE MENTIONS
|
||||
V1.01 Price Level: Cost amount ("cheap", "expensive", "affordable", "€", "$")
|
||||
V1.02 Price Fairness: Fair for what you get
|
||||
V1.03 Hidden Costs: Unexpected charges, surprise fees, hidden fees, extra charges
|
||||
V1.04 Price Transparency: Clear pricing upfront
|
||||
V1.05 Price Stability: Consistent pricing
|
||||
V2.01 Clear Pricing: Easy to understand costs
|
||||
V2.02 Honest Billing: Accurate charges
|
||||
V2.03 Policy Clarity: Clear terms and conditions
|
||||
V2.04 Quality-Price Ratio: Worth vs cost
|
||||
V2.05 Competitive Value: Compared to alternatives
|
||||
V3.01 Time Investment: Time required
|
||||
V3.02 Hassle Factor: Difficulty and inconvenience
|
||||
V3.03 Mental Load: Cognitive effort required
|
||||
V3.04 Promotion Clarity: Clear offer terms
|
||||
V3.05 Reward Redemption: Using points/rewards
|
||||
V4.01 Value for Money: Worth what you paid
|
||||
V4.02 ROI: Return on investment
|
||||
V4.03 Overall Satisfaction: Happy with the exchange
|
||||
V4.04 Billing Accuracy: Correct charges
|
||||
V4.05 Billing Resolution: Fixing billing issues
|
||||
|
||||
### R - RELATIONSHIP (Trust & Loyalty) - 20 codes
|
||||
R1.01 Honesty: Truthfulness
|
||||
R1.02 Ethics: Ethical behavior, deceptive practices, scams
|
||||
R1.03 Promises Kept: Following through on promises
|
||||
R1.04 Ethics: Ethical behavior
|
||||
R1.05 Accountability: Taking responsibility
|
||||
R2.01 Consistency: Reliable over time
|
||||
R2.02 Trustworthiness: Can be trusted
|
||||
R2.03 Accountability: Takes responsibility
|
||||
R2.04 Predictability: Consistent experience
|
||||
R2.05 Standards: Meeting quality standards
|
||||
R3.01 Error Acknowledgment: Admits mistakes
|
||||
R3.02 Apology Quality: Sincere apologies
|
||||
R3.03 Making It Right: Correcting mistakes
|
||||
R3.04 Personal Connection: Human touch
|
||||
R3.05 Going Extra Mile: Beyond expectations
|
||||
R4.01 Customer Recognition: Remembers customers
|
||||
R4.02 Loyalty Rewards: Rewards for loyalty
|
||||
R4.03 Long-term Relationship: Builds relationships
|
||||
R4.04 Service Recovery: Making things right
|
||||
R4.05 Feedback Response: Acting on feedback
|
||||
|
||||
## CLASSIFICATION EXAMPLES (Critical Distinctions)
|
||||
|
||||
**PRICING/COSTS → V codes (Value), NOT P codes:**
|
||||
- "Cheap prices", "good price", "€50" → V1.01 Price Level
|
||||
- "Hidden charges", "surprise fees", "extra €35" → V1.03 Hidden Costs
|
||||
- "Great value for money" → V4.01 Value for Money
|
||||
- "Overcharged", "wrong amount" → V4.04 Billing Accuracy
|
||||
|
||||
**STAFF BEHAVIOR → P codes (People):**
|
||||
- "Staff was friendly", "welcoming" → P1.01 Warmth
|
||||
- "Rude", "disrespectful", "ignored us" → P1.02 Respect
|
||||
- "Patient", "took their time" → P1.03 Patience
|
||||
- "Knowledgeable", "expert" → P2.01 Knowledge
|
||||
|
||||
**DECEPTION/ETHICS → R codes (Relationship):**
|
||||
- "They lied", "misleading" → R1.01 Honesty
|
||||
- "Felt scammed", "dishonest practices" → R1.02 Ethics
|
||||
- "Didn't honor the deal" → R1.03 Promises Kept
|
||||
|
||||
## DIMENSION CODES
|
||||
|
||||
@@ -159,6 +336,20 @@ class LLMClientBase(ABC):
|
||||
self.config = config
|
||||
self.total_tokens_used = 0
|
||||
self.total_cost_usd = 0.0
|
||||
self._custom_prompt: str | None = None
|
||||
|
||||
def set_prompt(self, prompt: str) -> None:
    """Install a custom system prompt, e.g. one built dynamically from the database.

    Once set, ``get_prompt()`` returns this prompt in place of the
    module-level default.

    Args:
        prompt: System prompt text to use for classification.
    """
    self._custom_prompt = prompt
|
||||
|
||||
def get_prompt(self) -> str:
    """Return the active system prompt.

    A non-empty custom prompt wins; otherwise (unset or empty) the
    module-level ``SYSTEM_PROMPT`` is returned.
    """
    custom = self._custom_prompt
    if custom:
        return custom
    return SYSTEM_PROMPT
|
||||
|
||||
@abstractmethod
|
||||
async def classify(
|
||||
@@ -178,6 +369,28 @@ class LLMClientBase(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
async def generate(
    self,
    system_prompt: str,
    user_prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 4000,
) -> str:
    """Produce free-form text from the LLM (synthesis, narratives, etc.).

    Concrete clients must override this.

    Args:
        system_prompt: Instructions placed in the system role.
        user_prompt: The content/context the model should work on.
        temperature: Creativity level (0-1).
        max_tokens: Upper bound on the response length.

    Returns:
        The generated text.
    """
    ...
|
||||
|
||||
@abstractmethod
|
||||
async def close(self) -> None:
|
||||
"""Close the client and cleanup resources."""
|
||||
@@ -211,7 +424,7 @@ class OpenAIClient(LLMClientBase):
|
||||
start_time = time.time()
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "system", "content": self.get_prompt()},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f'Classify this review:\n\n"{review_text}"',
|
||||
@@ -255,6 +468,43 @@ class OpenAIClient(LLMClientBase):
|
||||
|
||||
return result, metadata
|
||||
|
||||
async def generate(
    self,
    system_prompt: str,
    user_prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 4000,
) -> str:
    """Generate text using OpenAI.

    The request always asks for ``response_format={"type": "json_object"}``,
    so the returned string is expected to be a JSON document.
    NOTE(review): OpenAI's JSON mode requires the prompts to mention JSON;
    confirm every caller's system prompt does.

    Args:
        system_prompt: System instructions.
        user_prompt: User content/context.
        temperature: Sampling temperature passed to the API.
        max_tokens: Maximum completion length passed to the API.

    Returns:
        The message content of the first choice.

    Raises:
        ValueError: If the API returns no choices or empty content.
    """
    response = await self.client.chat.completions.create(
        model=self.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_object"},
        timeout=self.config.llm_timeout_seconds,
    )

    # Account for usage BEFORE validating the content: the request consumed
    # tokens even if the model returned nothing usable.
    usage = response.usage
    if usage:
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        pricing = self.PRICING.get(self.model, {"input": 0.15, "output": 0.60})
        # Pricing values are applied per 1M tokens.
        cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
        self.total_tokens_used += input_tokens + output_tokens
        self.total_cost_usd += cost

    # An empty choices list is reported the same way as empty content
    # instead of surfacing as an IndexError.
    content = response.choices[0].message.content if response.choices else None
    if not content:
        raise ValueError("Empty response from OpenAI")

    return content
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close the OpenAI client."""
|
||||
await self.client.close()
|
||||
@@ -289,7 +539,7 @@ class AnthropicClient(LLMClientBase):
|
||||
response = await self.client.messages.create(
|
||||
model=self.model,
|
||||
max_tokens=4096,
|
||||
system=SYSTEM_PROMPT,
|
||||
system=self.get_prompt(),
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -329,6 +579,58 @@ class AnthropicClient(LLMClientBase):
|
||||
|
||||
return result, metadata
|
||||
|
||||
async def generate(
    self,
    system_prompt: str,
    user_prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 4000,
) -> str:
    """Generate text using Anthropic.

    Only the first content block of the response is read. The text is passed
    through ``_extract_json_string`` before being returned, so surrounding
    prose or markdown fences are stripped.

    Args:
        system_prompt: System instructions.
        user_prompt: User content/context.
        temperature: Sampling temperature passed to the API.
        max_tokens: Maximum response length passed to the API.

    Returns:
        The JSON string extracted from the response text.

    Raises:
        ValueError: If the response has no content or empty text.
    """
    response = await self.client.messages.create(
        model=self.model,
        max_tokens=max_tokens,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        temperature=temperature,
    )

    # Account for usage BEFORE validating the content: the request consumed
    # tokens even if the model returned nothing usable.
    input_tokens = response.usage.input_tokens
    output_tokens = response.usage.output_tokens
    pricing = self.PRICING.get(self.model, {"input": 3.0, "output": 15.0})
    # Pricing values are applied per 1M tokens.
    cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
    self.total_tokens_used += input_tokens + output_tokens
    self.total_cost_usd += cost

    content = response.content[0].text if response.content else ""
    if not content:
        raise ValueError("Empty response from Anthropic")

    # Extract JSON from response (handles code blocks)
    return self._extract_json_string(content)
|
||||
|
||||
def _extract_json_string(self, content: str) -> str:
|
||||
"""Extract JSON string from response, handling markdown code blocks."""
|
||||
import re
|
||||
content = content.strip()
|
||||
|
||||
# If it starts with {, return as-is
|
||||
if content.startswith("{"):
|
||||
return content
|
||||
|
||||
# Try to find JSON in code blocks
|
||||
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
|
||||
if json_match:
|
||||
return json_match.group(1)
|
||||
|
||||
# Try to find JSON object
|
||||
json_match = re.search(r"\{[\s\S]*\}", content)
|
||||
if json_match:
|
||||
return json_match.group(0)
|
||||
|
||||
return content
|
||||
|
||||
def _extract_json(self, content: str) -> dict[str, Any]:
|
||||
"""Extract JSON from response, handling markdown code blocks."""
|
||||
content = content.strip()
|
||||
|
||||
@@ -0,0 +1,477 @@
|
||||
"""
|
||||
Stage 5: Synthesize - Generate AI narratives and action plans.
|
||||
|
||||
This stage runs after classification and routing to produce:
|
||||
- Executive narrative (business-specific story)
|
||||
- Section insights (sentiment, category, timeline)
|
||||
- Action plan with prioritized recommendations
|
||||
- Timeline annotations for key events
|
||||
- Marketing angles from strengths
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import asyncpg
|
||||
|
||||
from reviewiq_pipeline.services.llm_client import LLMClientBase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ActionItem:
    """One recommended action, as produced by the synthesis LLM."""

    # Identifier within the plan (the prompt's example uses "action_1").
    id: str
    # Short headline for the action.
    title: str
    # Root cause drawn from the reviews.
    why: str
    # Specific steps to take.
    what: str
    # Department or role responsible.
    who: str
    # Expected outcome.
    impact: str
    # Supporting review quotes.
    evidence: list[str]
    # Estimated rating improvement, or None when not provided.
    estimated_rating_lift: float | None
    complexity: str  # 'quick' | 'medium' | 'complex'
    priority: str  # 'critical' | 'high' | 'medium' | 'low'
    # Suggested time frame for the action.
    timeline: str
    # URT subcode this action relates to (the prompt's example uses "V1.03").
    related_subcode: str
|
||||
|
||||
|
||||
@dataclass
class TimelineAnnotation:
    """A marker for a notable event on the review timeline."""

    # Date of the event as a string (the prompt's example uses "2024-01-15").
    date: str
    # Short label for the annotation.
    label: str
    # What happened.
    description: str
    type: str  # 'positive' | 'negative' | 'neutral' | 'event'
|
||||
|
||||
|
||||
@dataclass
class Synthesis:
    """Everything Stage 5 produces for one pipeline execution."""

    # Multi-paragraph narrative of the business situation.
    executive_narrative: str
    # Short explanation of the sentiment distribution.
    sentiment_insight: str
    # Short explanation of the category pattern.
    category_insight: str
    # Short note on trends over time.
    timeline_insight: str
    # Domain code needing most attention, or None.
    priority_domain: str | None
    # Subcode that should be fixed first, or None.
    priority_issue: str | None
    # Recommended actions.
    action_plan: list[ActionItem]
    # String-to-string mapping of per-issue actions.
    issue_actions: dict[str, str]
    # Key events to mark on the timeline.
    timeline_annotations: list[TimelineAnnotation]
    # Ways to promote the identified strengths.
    marketing_angles: list[str]
    # Comparison to industry/competitors, or None if unknown.
    competitor_context: str | None
    # Timestamp string recorded when the synthesis was generated.
    generated_at: str
|
||||
|
||||
|
||||
SYNTHESIS_SYSTEM_PROMPT = """You are an expert business analyst specializing in customer experience and review analysis.
|
||||
|
||||
Your task is to analyze classified review data and generate actionable business insights.
|
||||
|
||||
You will receive:
|
||||
1. Summary statistics (total reviews, rating, sentiment distribution)
|
||||
2. Top issues by category with example quotes
|
||||
3. Top strengths with example quotes
|
||||
4. Domain breakdown (what customers talk about most)
|
||||
|
||||
Generate a JSON response with these fields:
|
||||
|
||||
{
|
||||
"executive_narrative": "2-3 paragraph story explaining the business situation, key problems, and path forward. Be specific with numbers and examples.",
|
||||
|
||||
"sentiment_insight": "1-2 sentences explaining WHY sentiment is distributed this way. Connect to specific issues.",
|
||||
|
||||
"category_insight": "1-2 sentences about the pattern in categories. Which domain needs most attention and why?",
|
||||
|
||||
"timeline_insight": "1-2 sentences about trends if data shows changes over time.",
|
||||
|
||||
"priority_domain": "Single letter code (P/V/J/O/A/E/R) for the domain needing most attention, or null",
|
||||
|
||||
"priority_issue": "The subcode (e.g., 'V1.03') that should be fixed first, or null",
|
||||
|
||||
"action_plan": [
|
||||
{
|
||||
"id": "action_1",
|
||||
"title": "Clear action title",
|
||||
"why": "Root cause from the reviews",
|
||||
"what": "Specific steps to take",
|
||||
"who": "Department or role responsible",
|
||||
"impact": "Expected outcome",
|
||||
"evidence": ["Quote 1", "Quote 2"],
|
||||
"estimated_rating_lift": 0.3,
|
||||
"complexity": "quick|medium|complex",
|
||||
"priority": "critical|high|medium|low",
|
||||
"timeline": "This week|This month|This quarter",
|
||||
"related_subcode": "V1.03"
|
||||
}
|
||||
],
|
||||
|
||||
"timeline_annotations": [
|
||||
{
|
||||
"date": "2024-01-15",
|
||||
"label": "Short label",
|
||||
"description": "What happened",
|
||||
"type": "positive|negative|neutral|event"
|
||||
}
|
||||
],
|
||||
|
||||
"marketing_angles": [
|
||||
"Way to promote strength 1",
|
||||
"Way to promote strength 2"
|
||||
],
|
||||
|
||||
"competitor_context": "How this compares to industry/competitors, or null if unknown"
|
||||
}
|
||||
|
||||
Be specific, actionable, and business-focused. Use actual numbers and quotes from the data.
|
||||
Prioritize actions by impact and feasibility.
|
||||
"""
|
||||
|
||||
|
||||
class Stage5Synthesizer:
|
||||
"""
|
||||
Stage 5: Generate AI synthesis from classified review data.
|
||||
|
||||
This stage:
|
||||
1. Aggregates classification results
|
||||
2. Identifies patterns and priorities
|
||||
3. Generates narrative insights via LLM
|
||||
4. Produces actionable recommendations
|
||||
"""
|
||||
|
||||
def __init__(self, pool: asyncpg.Pool, llm_client: LLMClientBase):
|
||||
self.pool = pool
|
||||
self.llm_client = llm_client
|
||||
|
||||
async def run(self, job_id: str, execution_id: str) -> Synthesis:
    """Produce and persist the synthesis for a finished pipeline execution.

    Steps: gather context for the job, generate the synthesis from it, then
    store the result against the execution.

    Args:
        job_id: The scraping job ID.
        execution_id: The pipeline execution ID.

    Returns:
        The generated Synthesis.
    """
    logger.info(f"Stage 5: Generating synthesis for job {job_id}")

    # Context gathered for the job is fed straight into generation.
    synthesis = await self._generate_synthesis(await self._gather_context(job_id))

    # Persist against the execution before reporting completion.
    await self._store_synthesis(execution_id, synthesis)

    action_count = len(synthesis.action_plan)
    logger.info(f"Stage 5: Synthesis complete - {action_count} actions generated")
    return synthesis
|
||||
|
||||
async def _gather_context(self, job_id: str) -> dict[str, Any]:
|
||||
"""Gather all context needed for synthesis."""
|
||||
|
||||
# Get overview stats
|
||||
overview = await self.pool.fetchrow("""
|
||||
SELECT
|
||||
COUNT(DISTINCT r.review_id) as total_reviews,
|
||||
AVG(r.rating) as avg_rating,
|
||||
COUNT(s.span_id) as total_spans
|
||||
FROM pipeline.reviews_enriched r
|
||||
LEFT JOIN pipeline.review_spans s ON s.review_id = r.review_id
|
||||
WHERE r.job_id = $1::uuid
|
||||
""", job_id)
|
||||
|
||||
# Get sentiment distribution
|
||||
sentiment = await self.pool.fetch("""
|
||||
SELECT
|
||||
valence,
|
||||
COUNT(*) as count,
|
||||
COUNT(DISTINCT review_id) as review_count
|
||||
FROM pipeline.review_spans
|
||||
WHERE job_id = $1::uuid AND valence IS NOT NULL AND is_active = TRUE
|
||||
GROUP BY valence
|
||||
ORDER BY count DESC
|
||||
""", job_id)
|
||||
|
||||
# Get top issues (weaknesses)
|
||||
top_issues = await self.pool.fetch("""
|
||||
SELECT
|
||||
s.urt_primary as subcode,
|
||||
sc.name as subcode_name,
|
||||
sc.definition,
|
||||
d.code as domain,
|
||||
d.name as domain_name,
|
||||
COUNT(*) as span_count,
|
||||
COUNT(*) FILTER (WHERE s.valence = 'V-') as negative_count,
|
||||
ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V-') as example_quotes
|
||||
FROM pipeline.review_spans s
|
||||
LEFT JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
|
||||
LEFT JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
|
||||
WHERE s.job_id = $1::uuid AND s.valence = 'V-' AND s.is_active = TRUE
|
||||
GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
|
||||
ORDER BY negative_count DESC
|
||||
LIMIT 10
|
||||
""", job_id)
|
||||
|
||||
# Get top strengths
|
||||
top_strengths = await self.pool.fetch("""
|
||||
SELECT
|
||||
s.urt_primary as subcode,
|
||||
sc.name as subcode_name,
|
||||
sc.definition,
|
||||
d.code as domain,
|
||||
d.name as domain_name,
|
||||
COUNT(*) as span_count,
|
||||
COUNT(*) FILTER (WHERE s.valence = 'V+') as positive_count,
|
||||
ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V+') as example_quotes
|
||||
FROM pipeline.review_spans s
|
||||
LEFT JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
|
||||
LEFT JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
|
||||
WHERE s.job_id = $1::uuid AND s.valence = 'V+' AND s.is_active = TRUE
|
||||
GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
|
||||
ORDER BY positive_count DESC
|
||||
LIMIT 5
|
||||
""", job_id)
|
||||
|
||||
# Get domain distribution
|
||||
domains = await self.pool.fetch("""
|
||||
SELECT
|
||||
SUBSTRING(urt_primary, 1, 1) as domain,
|
||||
d.name as domain_name,
|
||||
COUNT(*) as total_count,
|
||||
COUNT(*) FILTER (WHERE valence = 'V+') as positive_count,
|
||||
COUNT(*) FILTER (WHERE valence = 'V-') as negative_count
|
||||
FROM pipeline.review_spans s
|
||||
LEFT JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
|
||||
WHERE s.job_id = $1::uuid AND s.is_active = TRUE
|
||||
GROUP BY SUBSTRING(urt_primary, 1, 1), d.name
|
||||
ORDER BY total_count DESC
|
||||
""", job_id)
|
||||
|
||||
# Get business name if available
|
||||
business = await self.pool.fetchrow("""
|
||||
SELECT DISTINCT business_id as business_name
|
||||
FROM pipeline.reviews_enriched
|
||||
WHERE job_id = $1::uuid AND business_id IS NOT NULL
|
||||
LIMIT 1
|
||||
""", job_id)
|
||||
|
||||
return {
|
||||
"business_name": business["business_name"] if business else "This business",
|
||||
"overview": dict(overview) if overview else {},
|
||||
"sentiment": [dict(r) for r in sentiment],
|
||||
"top_issues": [dict(r) for r in top_issues],
|
||||
"top_strengths": [dict(r) for r in top_strengths],
|
||||
"domains": [dict(r) for r in domains],
|
||||
}
|
||||
|
||||
async def _generate_synthesis(self, context: dict[str, Any]) -> Synthesis:
    """Generate synthesis using LLM.

    Builds a markdown-style prompt from ``context``, asks the LLM for a JSON
    document, and maps that document onto a ``Synthesis``.

    Args:
        context: Dict with ``business_name``, ``overview``, ``sentiment``,
            ``top_issues``, ``top_strengths`` and ``domains`` entries.

    Returns:
        The parsed Synthesis, or the fallback synthesis when the LLM call or
        the parsing fails (failures are logged, never raised).
    """
    from datetime import timezone  # local: only `datetime` is imported at file level

    overview = context["overview"]
    # AVG() yields NULL with no reviews and a long-tailed Decimal otherwise;
    # show a short, readable number to the model.
    avg_rating = overview.get("avg_rating")
    avg_rating_text = f"{float(avg_rating):.2f}" if avg_rating is not None else "N/A"

    # Build the user prompt with context
    user_prompt = f"""Analyze this review data for {context['business_name']}:

## Overview
- Total Reviews: {overview.get('total_reviews', 0)}
- Average Rating: {avg_rating_text}
- Total Insights Extracted: {overview.get('total_spans', 0)}

## Sentiment Distribution
{self._format_sentiment(context['sentiment'])}

## Top Issues (Problems)
{self._format_issues(context['top_issues'])}

## Top Strengths
{self._format_strengths(context['top_strengths'])}

## Domain Breakdown
{self._format_domains(context['domains'])}

Generate a complete synthesis with actionable insights.
"""

    try:
        response = await self.llm_client.generate(
            system_prompt=SYNTHESIS_SYSTEM_PROMPT,
            user_prompt=user_prompt,
            temperature=0.7,  # Allow some creativity
            max_tokens=4000,
        )

        # Parse JSON response
        result = json.loads(response)

        # The model may emit `null` (or omit) list fields; treat both as
        # empty rather than discarding the whole synthesis. Non-object list
        # entries are skipped for the same reason.
        raw_actions = result.get("action_plan") or []
        raw_annotations = result.get("timeline_annotations") or []

        action_plan = [
            ActionItem(
                id=a.get("id", f"action_{i}"),
                title=a.get("title", ""),
                why=a.get("why", ""),
                what=a.get("what", ""),
                who=a.get("who", ""),
                impact=a.get("impact", ""),
                evidence=a.get("evidence") or [],
                estimated_rating_lift=a.get("estimated_rating_lift"),
                complexity=a.get("complexity", "medium"),
                priority=a.get("priority", "medium"),
                timeline=a.get("timeline", "This month"),
                related_subcode=a.get("related_subcode", ""),
            )
            for i, a in enumerate(raw_actions)
            if isinstance(a, dict)
        ]
        timeline_annotations = [
            TimelineAnnotation(
                date=t.get("date", ""),
                label=t.get("label", ""),
                description=t.get("description", ""),
                type=t.get("type", "neutral"),
            )
            for t in raw_annotations
            if isinstance(t, dict)
        ]

        return Synthesis(
            executive_narrative=result.get("executive_narrative", ""),
            sentiment_insight=result.get("sentiment_insight", ""),
            category_insight=result.get("category_insight", ""),
            timeline_insight=result.get("timeline_insight", ""),
            priority_domain=result.get("priority_domain"),
            priority_issue=result.get("priority_issue"),
            action_plan=action_plan,
            issue_actions={},  # Can be populated from action_plan
            timeline_annotations=timeline_annotations,
            marketing_angles=result.get("marketing_angles") or [],
            competitor_context=result.get("competitor_context"),
            # Same naive-UTC ISO string as before, without the deprecated
            # datetime.utcnow().
            generated_at=datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        )

    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse LLM response: {e}")
        return self._create_fallback_synthesis()
    except Exception as e:
        # Best-effort stage: never fail the pipeline, but keep the traceback.
        logger.exception(f"Synthesis generation failed: {e}")
        return self._create_fallback_synthesis()
|
||||
|
||||
def _format_sentiment(self, sentiment: list[dict]) -> str:
|
||||
"""Format sentiment data for prompt."""
|
||||
lines = []
|
||||
for s in sentiment:
|
||||
valence = s.get("valence", "Unknown")
|
||||
count = s.get("count", 0)
|
||||
reviews = s.get("review_count", 0)
|
||||
label = {"V+": "Positive", "V-": "Negative", "V0": "Neutral", "V±": "Mixed"}.get(valence, valence)
|
||||
lines.append(f"- {label}: {count} mentions ({reviews} reviews)")
|
||||
return "\n".join(lines) or "No sentiment data"
|
||||
|
||||
def _format_issues(self, issues: list[dict]) -> str:
|
||||
"""Format issues for prompt."""
|
||||
lines = []
|
||||
for i, issue in enumerate(issues[:5], 1):
|
||||
subcode = issue.get("subcode", "")
|
||||
name = issue.get("subcode_name", "")
|
||||
domain = issue.get("domain_name", "")
|
||||
count = issue.get("negative_count", 0)
|
||||
quotes = issue.get("example_quotes", [])[:2]
|
||||
|
||||
lines.append(f"{i}. [{subcode}] {name} ({domain})")
|
||||
lines.append(f" - {count} negative mentions")
|
||||
for q in quotes:
|
||||
if q:
|
||||
lines.append(f' - Example: "{q[:100]}..."' if len(q) > 100 else f' - Example: "{q}"')
|
||||
return "\n".join(lines) or "No issues found"
|
||||
|
||||
def _format_strengths(self, strengths: list[dict]) -> str:
|
||||
"""Format strengths for prompt."""
|
||||
lines = []
|
||||
for i, strength in enumerate(strengths[:3], 1):
|
||||
subcode = strength.get("subcode", "")
|
||||
name = strength.get("subcode_name", "")
|
||||
domain = strength.get("domain_name", "")
|
||||
count = strength.get("positive_count", 0)
|
||||
quotes = strength.get("example_quotes", [])[:2]
|
||||
|
||||
lines.append(f"{i}. [{subcode}] {name} ({domain})")
|
||||
lines.append(f" - {count} positive mentions")
|
||||
for q in quotes:
|
||||
if q:
|
||||
lines.append(f' - Example: "{q[:100]}..."' if len(q) > 100 else f' - Example: "{q}"')
|
||||
return "\n".join(lines) or "No strengths found"
|
||||
|
||||
def _format_domains(self, domains: list[dict]) -> str:
|
||||
"""Format domain distribution for prompt."""
|
||||
lines = []
|
||||
for d in domains:
|
||||
domain = d.get("domain", "")
|
||||
name = d.get("domain_name", "")
|
||||
total = d.get("total_count", 0)
|
||||
positive = d.get("positive_count", 0)
|
||||
negative = d.get("negative_count", 0)
|
||||
lines.append(f"- {domain} ({name}): {total} total ({positive} positive, {negative} negative)")
|
||||
return "\n".join(lines) or "No domain data"
|
||||
|
||||
def _create_fallback_synthesis(self) -> Synthesis:
    """Create a minimal synthesis when LLM fails.

    Returns:
        A ``Synthesis`` whose executive narrative is a fixed "unable to
        generate" message, with every other insight empty, no actions,
        annotations or marketing angles, and ``generated_at`` set to the
        current UTC time as a naive ISO-8601 string.
    """
    # Function-scope import so the block does not depend on the module's
    # import list already exposing `timezone`.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Taking an aware
    # "now" and dropping tzinfo yields the same naive-UTC ISO string the
    # original produced (no "+00:00" suffix), so the stored format is
    # unchanged.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return Synthesis(
        executive_narrative="Unable to generate detailed analysis. Please review the data manually.",
        sentiment_insight="",
        category_insight="",
        timeline_insight="",
        priority_domain=None,
        priority_issue=None,
        action_plan=[],
        issue_actions={},
        timeline_annotations=[],
        marketing_angles=[],
        competitor_context=None,
        generated_at=generated_at,
    )
|
||||
|
||||
async def _store_synthesis(self, execution_id: str, synthesis: Synthesis) -> None:
    """Persist a synthesis onto its pipeline execution row.

    The synthesis is flattened into a plain dict (action items and
    timeline annotations become lists of dicts), serialized with
    ``json.dumps`` and written to the ``synthesis`` column of
    ``pipeline.executions`` for the row matching ``execution_id``;
    ``updated_at`` is set to ``NOW()`` in the same statement.
    """
    action_rows = [
        {
            "id": item.id,
            "title": item.title,
            "why": item.why,
            "what": item.what,
            "who": item.who,
            "impact": item.impact,
            "evidence": item.evidence,
            "estimated_rating_lift": item.estimated_rating_lift,
            "complexity": item.complexity,
            "priority": item.priority,
            "timeline": item.timeline,
            "related_subcode": item.related_subcode,
        }
        for item in synthesis.action_plan
    ]
    annotation_rows = [
        {
            "date": note.date,
            "label": note.label,
            "description": note.description,
            "type": note.type,
        }
        for note in synthesis.timeline_annotations
    ]

    # Key order mirrors the Synthesis field order so the stored JSON text
    # stays the same as before.
    payload = {
        "executive_narrative": synthesis.executive_narrative,
        "sentiment_insight": synthesis.sentiment_insight,
        "category_insight": synthesis.category_insight,
        "timeline_insight": synthesis.timeline_insight,
        "priority_domain": synthesis.priority_domain,
        "priority_issue": synthesis.priority_issue,
        "action_plan": action_rows,
        "issue_actions": synthesis.issue_actions,
        "timeline_annotations": annotation_rows,
        "marketing_angles": synthesis.marketing_angles,
        "competitor_context": synthesis.competitor_context,
        "generated_at": synthesis.generated_at,
    }

    await self.pool.execute(
        """
        UPDATE pipeline.executions
        SET
            synthesis = $2,
            updated_at = NOW()
        WHERE execution_id = $1::uuid
        """,
        execution_id,
        json.dumps(payload),
    )
|
||||
Reference in New Issue
Block a user