feat(pipeline): Add Stage 5 Synthesis for AI-generated narratives

- Add Stage5Synthesizer class that generates AI narratives and action plans
- Add generate() method to LLMClient for synthesis generation
- Integrate Stage 5 into pipeline runner after route stage
- Add synthesis JSONB column to pipeline.executions table
- Update reviewiq_analytics API to return synthesis data
- Synthesis includes: executive narrative, sentiment/category/timeline insights,
  action plan, marketing angles, and priority recommendations

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-29 03:12:53 +00:00
parent c8ecb4b98f
commit 9b667e69a7
5 changed files with 3129 additions and 67 deletions

View File

@@ -7,9 +7,11 @@ the BasePipeline interface for the extensible pipeline system.
from __future__ import annotations
import json
import logging
import re
import time
from datetime import date
from datetime import date, datetime, timedelta
from typing import TYPE_CHECKING, Any
from pipeline_core import (
@@ -51,6 +53,8 @@ from reviewiq_pipeline.stages.stage1_normalize import Stage1Normalizer
from reviewiq_pipeline.stages.stage2_classify import Stage2Classifier
from reviewiq_pipeline.stages.stage3_route import Stage3Router
from reviewiq_pipeline.stages.stage4_aggregate import Stage4Aggregator
from reviewiq_pipeline.stages.stage5_synthesize import Stage5Synthesizer
from reviewiq_pipeline.services.llm_client import LLMClient
from reviewiq_pipeline.validation.validators import (
validate_stage1_output,
validate_stage2_output,
@@ -64,9 +68,65 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
# Stage name to number mapping
STAGE_NAMES = ["normalize", "classify", "route", "aggregate"]
STAGE_NAME_TO_NUM = {"normalize": 1, "classify": 2, "route": 3, "aggregate": 4}
STAGE_NUM_TO_NAME = {1: "normalize", 2: "classify", 3: "route", 4: "aggregate"}
STAGE_NAMES = ["normalize", "classify", "route", "aggregate", "synthesize"]
STAGE_NAME_TO_NUM = {"normalize": 1, "classify": 2, "route": 3, "aggregate": 4, "synthesize": 5}
STAGE_NUM_TO_NAME = {1: "normalize", 2: "classify", 3: "route", 4: "aggregate", 5: "synthesize"}
def _parse_relative_date(date_str: str | None, default_to_now: bool = True) -> datetime | None:
"""Parse relative date strings like '10 months ago' into datetime objects.
Args:
date_str: A relative date string (e.g., "10 months ago", "2 weeks ago")
or an ISO date string, or None.
default_to_now: If True, returns current datetime when parsing fails.
Returns:
A datetime object, or None if parsing fails and default_to_now is False.
"""
now = datetime.now()
if not date_str:
return now if default_to_now else None
# Try to parse as ISO date first
try:
return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
except (ValueError, AttributeError):
pass
# Parse relative dates like "10 months ago", "2 weeks ago", "a day ago"
date_str = date_str.lower().strip()
# Handle "a/an" as 1
date_str = re.sub(r'\b(a|an)\s+', '1 ', date_str)
# Extract number and unit
match = re.match(r'(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago', date_str)
if match:
amount = int(match.group(1))
unit = match.group(2)
if unit == 'second':
return now - timedelta(seconds=amount)
elif unit == 'minute':
return now - timedelta(minutes=amount)
elif unit == 'hour':
return now - timedelta(hours=amount)
elif unit == 'day':
return now - timedelta(days=amount)
elif unit == 'week':
return now - timedelta(weeks=amount)
elif unit == 'month':
# Approximate months as 30 days
return now - timedelta(days=amount * 30)
elif unit == 'year':
# Approximate years as 365 days
return now - timedelta(days=amount * 365)
# If we can't parse it, return now or None
logger.warning(f"Could not parse relative date: {date_str}")
return now if default_to_now else None
class PipelineResult:
@@ -228,8 +288,11 @@ class ReviewIQPipeline(BasePipeline):
stages_run: list[str] = []
stage_results: dict[str, StageResult] = {}
# Convert input to ScraperOutput if needed
scraper_output = self._ensure_scraper_output(input_data)
# Convert input to ScraperOutput if needed (may fetch from DB)
scraper_output = await self._ensure_scraper_output(input_data)
# Extract job_id for linking issues to pipeline executions
job_id = scraper_output.get("job_id")
# Track intermediate results for stage dependencies
stage1_result: Stage1Output | None = None
@@ -270,6 +333,20 @@ class ReviewIQPipeline(BasePipeline):
)
# Stage 2: Classify
# If classify is requested but we don't have stage1_result, try to fetch from DB
if "classify" in stages and not stage1_result and job_id:
logger.info("No stage1_result, fetching existing normalized reviews from database")
stage1_result = await self._fetch_normalized_reviews_from_db(job_id)
if stage1_result:
logger.info(f"Loaded {len(stage1_result.get('reviews_normalized', []))} reviews from DB for reclassification")
# Clean up old spans and issues before reclassification
if self._span_repo:
deactivated = await self._span_repo.deactivate_spans_for_job(job_id)
logger.info(f"Deactivated {deactivated} existing spans for job {job_id}")
if self._issue_repo:
deleted = await self._issue_repo.delete_issues_for_job(job_id)
logger.info(f"Deleted {deleted} existing issues for job {job_id}")
if "classify" in stages and stage1_result:
start = time.time()
logger.info("Running Stage 2: Classification")
@@ -308,7 +385,7 @@ class ReviewIQPipeline(BasePipeline):
logger.info("Running Stage 3: Issue Routing")
try:
stage3_result = await self._run_route(stage2_result)
stage3_result = await self._run_route(stage2_result, job_id=job_id)
duration_ms = int((time.time() - start) * 1000)
stages_run.append("route")
stage_results["route"] = StageResult(
@@ -371,6 +448,43 @@ class ReviewIQPipeline(BasePipeline):
error=f"aggregate failed: {e}",
)
# Stage 5: Synthesize (AI-generated narratives)
# Requires job_id and execution_id from pipeline execution tracking
if "synthesize" in stages and job_id:
start = time.time()
logger.info("Running Stage 5: Synthesis")
try:
# Get the execution_id for this pipeline run
execution_id = input_data.get("execution_id")
if execution_id:
stage5_result = await self._run_synthesize(job_id, execution_id)
duration_ms = int((time.time() - start) * 1000)
stages_run.append("synthesize")
stage_results["synthesize"] = StageResult(
stage="synthesize",
success=True,
data={
"actions_generated": len(stage5_result.action_plan) if stage5_result else 0,
"has_narrative": bool(stage5_result and stage5_result.executive_narrative),
},
error=None,
duration_ms=duration_ms,
)
else:
logger.warning("No execution_id provided, skipping synthesis")
except Exception as e:
logger.exception("Stage 5 failed")
stage_results["synthesize"] = StageResult(
stage="synthesize",
success=False,
data={},
error=str(e),
duration_ms=int((time.time() - start) * 1000),
)
# Synthesis failure is non-fatal - pipeline still succeeds
logger.warning(f"Synthesis failed but continuing: {e}")
return BasePipelineResult(
pipeline_id="reviewiq",
stages_run=stages_run,
@@ -558,6 +672,34 @@ class ReviewIQPipeline(BasePipeline):
],
collapsed=False,
),
DashboardSection(
id="classified_reviews",
title="Classified Reviews",
description="All reviews with URT classification codes and human-readable meanings",
widgets=[
WidgetConfig(
id="classified_reviews_table",
type="table",
title="Reviews with URT Codes",
grid={"x": 0, "y": 0, "w": 12, "h": 3},
config={
"columns": [
{"key": "span_text", "header": "Review Excerpt", "width": 300},
{"key": "urt_code", "header": "Code", "width": 80},
{"key": "code_name", "header": "Category", "width": 150},
{"key": "domain_name", "header": "Domain", "width": 100},
{"key": "valence", "header": "Sentiment", "width": 80},
{"key": "intensity", "header": "Intensity", "width": 80},
{"key": "rating", "header": "Stars", "width": 60, "align": "center"},
],
"row_key": "span_id",
"page_size": 15,
"sortable": True,
},
),
],
collapsed=False,
),
],
default_time_range="30d",
refresh_interval=300,
@@ -573,7 +715,7 @@ class ReviewIQPipeline(BasePipeline):
Args:
widget_id: Widget identifier
params: Query parameters (business_id, time_range, etc.)
params: Query parameters (business_id, job_id, time_range, etc.)
Returns:
Widget data dictionary
@@ -581,36 +723,41 @@ class ReviewIQPipeline(BasePipeline):
await self.initialize()
business_id = params.get("business_id")
job_id = params.get("job_id")
time_range = params.get("time_range", "30d")
match widget_id:
# Overview stats
case "total_reviews":
return await self._get_review_count(business_id)
return await self._get_review_count(business_id, job_id)
case "reviews_processed":
return await self._get_processed_count(business_id, time_range)
return await self._get_processed_count(business_id, job_id, time_range)
case "issues_found":
return await self._get_issues_count(business_id)
return await self._get_issues_count(business_id, job_id)
case "avg_rating":
return await self._get_avg_rating(business_id, time_range)
return await self._get_avg_rating(business_id, job_id, time_range)
# Sentiment
case "sentiment_distribution":
return await self._get_sentiment_distribution(business_id)
return await self._get_sentiment_distribution(business_id, job_id)
case "sentiment_trend":
return await self._get_sentiment_trend(business_id, time_range)
return await self._get_sentiment_trend(business_id, job_id, time_range)
# Classification
case "urt_distribution":
return await self._get_urt_distribution(business_id)
return await self._get_urt_distribution(business_id, job_id)
case "intensity_heatmap":
return await self._get_intensity_heatmap(business_id)
return await self._get_intensity_heatmap(business_id, job_id)
# Issues
case "issues_table":
return await self._get_issues_table(business_id, params)
return await self._get_issues_table(business_id, job_id, params)
case "issues_by_domain":
return await self._get_issues_by_domain(business_id)
return await self._get_issues_by_domain(business_id, job_id)
# Classified Reviews
case "classified_reviews_table":
return await self._get_classified_reviews(business_id, job_id, params)
case _:
logger.warning(f"Unknown widget: {widget_id}")
@@ -643,6 +790,9 @@ class ReviewIQPipeline(BasePipeline):
result = PipelineResult()
validation_results: dict[str, ValidationResult] = {}
# Extract job_id for linking issues
job_id = scraper_output.get("job_id")
# Stage 1: Normalize
if 1 in stages:
logger.info("Running Stage 1: Normalization")
@@ -668,7 +818,7 @@ class ReviewIQPipeline(BasePipeline):
# Stage 3: Route
if 3 in stages and result.stage2:
logger.info("Running Stage 3: Issue Routing")
result.stage3 = await self._run_route(result.stage2)
result.stage3 = await self._run_route(result.stage2, job_id=job_id)
if validate:
validation_results["stage3"] = await validate_stage3_output(
@@ -700,10 +850,10 @@ class ReviewIQPipeline(BasePipeline):
await self.initialize()
return await self._run_classify(stage1_output)
async def route(self, stage2_output: Stage2Output) -> Stage3Output:
async def route(self, stage2_output: Stage2Output, job_id: str | None = None) -> Stage3Output:
"""Run Stage 3: Issue Routing (legacy method)."""
await self.initialize()
return await self._run_route(stage2_output)
return await self._run_route(stage2_output, job_id=job_id)
async def aggregate(
self,
@@ -719,14 +869,91 @@ class ReviewIQPipeline(BasePipeline):
# Internal Stage Implementations
# =========================================================================
def _ensure_scraper_output(self, input_data: dict[str, Any]) -> ScraperOutput:
"""Ensure input data is in ScraperOutput format."""
async def _ensure_scraper_output(self, input_data: dict[str, Any]) -> ScraperOutput:
"""Ensure input data is in ScraperOutput format.
If only job_id is provided, fetches job data from the database.
"""
# If it has all required fields, use as-is
required = ["job_id", "business_id", "place_id", "reviews"]
if all(k in input_data for k in required):
return input_data # type: ignore
# Otherwise, wrap it
# If we have a job_id but missing reviews, fetch from database
job_id = input_data.get("job_id")
if job_id and not input_data.get("reviews") and self._db:
logger.info(f"Fetching job data from database for job_id: {job_id}")
async with self._db.pool.acquire() as conn:
row = await conn.fetchrow(
"""
SELECT job_id, status, reviews_data, reviews_count,
metadata->>'business_name' as business_name,
metadata->>'place_id' as place_id,
metadata->>'address' as address,
metadata->>'category' as category,
metadata->>'total_reviews' as total_reviews,
metadata->>'average_rating' as average_rating,
scraper_version
FROM public.jobs
WHERE job_id = $1::uuid
""",
str(job_id),
)
if row and row["reviews_data"]:
reviews_data = row["reviews_data"]
# asyncpg may return JSONB as a string - parse it if needed
if isinstance(reviews_data, str):
logger.info("Parsing reviews_data JSON string")
reviews_data = json.loads(reviews_data)
# Convert reviews_data to RawReview format
# Handle both API format (review_id, author, rating) and scraper format (reviewId, name, stars)
reviews = []
for i, review in enumerate(reviews_data):
if isinstance(review, str):
# Skip if review is somehow a string
logger.warning(f"Skipping review {i}: got string instead of dict")
continue
# Parse the review time (may be relative like "10 months ago")
raw_time = review.get("timestamp") or review.get("publishedAtDate") or ""
parsed_time = _parse_relative_date(raw_time)
reviews.append({
"review_id": review.get("review_id") or review.get("reviewId") or f"review_{i}",
"author_name": review.get("author") or review.get("name") or "Anonymous",
"author_id": review.get("reviewerId"),
"rating": review.get("rating") or review.get("stars") or 0,
"text": review.get("text"),
"review_time": parsed_time,
"response_text": review.get("responseFromOwner", {}).get("text") if review.get("responseFromOwner") else None,
"response_time": review.get("responseFromOwner", {}).get("publishedAtDate") if review.get("responseFromOwner") else None,
"photos": review.get("reviewImageUrls"),
"raw_payload": review,
})
logger.info(f"Loaded {len(reviews)} reviews from job {job_id}")
return ScraperOutput(
job_id=str(row["job_id"]),
status=row["status"] or "completed",
business_id=row["business_name"] or "unknown",
place_id=row["place_id"] or "unknown",
business_info={
"name": row["business_name"] or "",
"address": row["address"] or "",
"category": row["category"] or "",
"total_reviews": int(row["total_reviews"]) if row["total_reviews"] else 0,
"average_rating": float(row["average_rating"]) if row["average_rating"] else 0.0,
},
reviews=reviews,
scrape_time_ms=0,
reviews_scraped=len(reviews),
scraper_version=row["scraper_version"] or "unknown",
)
else:
logger.warning(f"No reviews found in database for job_id: {job_id}")
# Otherwise, wrap it with empty/default values
return ScraperOutput(
job_id=input_data.get("job_id", "unknown"),
status=input_data.get("status", "completed"),
@@ -739,6 +966,70 @@ class ReviewIQPipeline(BasePipeline):
scraper_version=input_data.get("scraper_version", "unknown"),
)
async def _fetch_normalized_reviews_from_db(self, job_id: str) -> Stage1Output | None:
"""Fetch existing normalized reviews from DB for reclassification.
Used when running classify stage standalone without normalize.
"""
if not self._db:
return None
async with self._db.pool.acquire() as conn:
rows = await conn.fetch(
"""
SELECT
source,
review_id,
review_version,
business_id,
place_id,
text,
text_normalized,
rating,
review_time
FROM pipeline.reviews_enriched
WHERE job_id = $1::uuid
AND is_latest = TRUE
ORDER BY review_time DESC
""",
job_id,
)
if not rows:
logger.warning(f"No normalized reviews found in DB for job_id: {job_id}")
return None
reviews_normalized = [
NormalizedReview(
source=row["source"],
review_id=row["review_id"],
review_version=row["review_version"],
business_id=row["business_id"],
place_id=row["place_id"],
text=row["text"],
text_normalized=row["text_normalized"],
rating=row["rating"],
review_time=row["review_time"],
)
for row in rows
]
logger.info(f"Fetched {len(reviews_normalized)} normalized reviews from DB for job {job_id}")
return Stage1Output(
job_id=job_id,
reviews_normalized=reviews_normalized,
reviews_skipped=[],
duplicates_found=[],
stats={
"total_input": len(reviews_normalized),
"processed": len(reviews_normalized),
"skipped": 0,
"duplicates": 0,
"from_db": True,
},
)
async def _run_normalize(self, scraper_output: ScraperOutput) -> Stage1Output:
"""Run normalization stage."""
stage1 = Stage1Normalizer(
@@ -788,6 +1079,7 @@ class ReviewIQPipeline(BasePipeline):
taxonomy_version=self._config.taxonomy_version,
profile=self._config.classification_profile,
max_spans_per_review=self._config.max_spans_per_review,
job_id=stage1_output.get("job_id"),
),
)
@@ -796,7 +1088,7 @@ class ReviewIQPipeline(BasePipeline):
finally:
await stage2.close()
async def _run_route(self, stage2_output: Stage2Output) -> Stage3Output:
async def _run_route(self, stage2_output: Stage2Output, job_id: str | None = None) -> Stage3Output:
"""Run issue routing stage."""
stage3 = Stage3Router(
self._config,
@@ -806,9 +1098,12 @@ class ReviewIQPipeline(BasePipeline):
)
spans_to_route = []
now = datetime.now()
for review in stage2_output["reviews_classified"]:
for span in review.get("spans", []):
if span["valence"] in ("V-", ""):
# Use current datetime as fallback for missing review_time
review_time = review.get("review_time") or now
spans_to_route.append(
SpanToRoute(
span_id=span["span_id"],
@@ -818,13 +1113,13 @@ class ReviewIQPipeline(BasePipeline):
valence=span["valence"],
intensity=span["intensity"],
entity_normalized=span.get("entity_normalized"),
review_time=review.get("review_time", ""),
review_time=review_time,
confidence=span.get("confidence", "medium"),
trust_score=review.get("trust_score", 0.5),
)
)
return await stage3.process(Stage3Input(spans=spans_to_route))
return await stage3.process(Stage3Input(spans=spans_to_route, job_id=job_id))
async def _run_aggregate(
self,
@@ -848,17 +1143,39 @@ class ReviewIQPipeline(BasePipeline):
return await stage4.process(input_data)
async def _run_synthesize(self, job_id: str, execution_id: str):
"""Run AI synthesis stage to generate narratives and action plans."""
from reviewiq_pipeline.stages.stage5_synthesize import Synthesis
# Create LLM client for synthesis
llm_client = LLMClient.create(self._config)
try:
stage5 = Stage5Synthesizer(
pool=self._db.pool,
llm_client=llm_client,
)
return await stage5.run(job_id, execution_id)
finally:
await llm_client.close()
# =========================================================================
# Widget Data Methods
# =========================================================================
async def _get_review_count(self, business_id: str | None) -> dict[str, Any]:
async def _get_review_count(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get total review count."""
if not self._db:
return {"total_reviews": 0}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
count = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
job_id,
)
elif business_id:
count = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.reviews_raw WHERE business_id = $1",
business_id,
@@ -871,7 +1188,7 @@ class ReviewIQPipeline(BasePipeline):
return {"total_reviews": count or 0}
async def _get_processed_count(
self, business_id: str | None, time_range: str
self, business_id: str | None, job_id: str | None, time_range: str
) -> dict[str, Any]:
"""Get processed review count with trend."""
if not self._db:
@@ -881,7 +1198,14 @@ class ReviewIQPipeline(BasePipeline):
days = self._parse_time_range(time_range)
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
# When filtering by job_id, just return count for that job
current = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
job_id,
)
return {"reviews_processed": current or 0, "processed_change": 0}
elif business_id:
current = await conn.fetchval(
"""
SELECT COUNT(*) FROM pipeline.reviews_enriched
@@ -929,13 +1253,21 @@ class ReviewIQPipeline(BasePipeline):
"processed_change": round(change, 1),
}
async def _get_issues_count(self, business_id: str | None) -> dict[str, Any]:
async def _get_issues_count(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get open issues count."""
if not self._db:
return {"issues_count": 0}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
count = await conn.fetchval(
"""
SELECT COUNT(*) FROM pipeline.issues
WHERE job_id = $1::uuid AND state = 'open'
""",
job_id,
)
elif business_id:
count = await conn.fetchval(
"""
SELECT COUNT(*) FROM pipeline.issues
@@ -951,7 +1283,7 @@ class ReviewIQPipeline(BasePipeline):
return {"issues_count": count or 0}
async def _get_avg_rating(
self, business_id: str | None, time_range: str
self, business_id: str | None, job_id: str | None, time_range: str
) -> dict[str, Any]:
"""Get average rating with trend."""
if not self._db:
@@ -960,7 +1292,13 @@ class ReviewIQPipeline(BasePipeline):
days = self._parse_time_range(time_range)
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
current = await conn.fetchval(
"SELECT AVG(rating) FROM pipeline.reviews_enriched WHERE job_id = $1::uuid",
job_id,
)
return {"avg_rating": round(float(current), 2) if current else 0, "rating_change": 0}
elif business_id:
current = await conn.fetchval(
"""
SELECT AVG(rating) FROM pipeline.reviews_enriched
@@ -1009,14 +1347,26 @@ class ReviewIQPipeline(BasePipeline):
}
async def _get_sentiment_distribution(
self, business_id: str | None
self, business_id: str | None, job_id: str | None = None
) -> dict[str, Any]:
"""Get sentiment distribution for pie chart."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
valence,
COUNT(*) as count
FROM pipeline.review_spans
WHERE job_id = $1::uuid AND is_active = TRUE
GROUP BY valence
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1059,7 +1409,7 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_sentiment_trend(
self, business_id: str | None, time_range: str
self, business_id: str | None, job_id: str | None, time_range: str
) -> dict[str, Any]:
"""Get sentiment trend over time for line chart."""
if not self._db:
@@ -1068,7 +1418,23 @@ class ReviewIQPipeline(BasePipeline):
days = self._parse_time_range(time_range)
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
DATE(review_time) as date,
COUNT(*) FILTER (WHERE valence = 'V+') as positive,
COUNT(*) FILTER (WHERE valence = 'V-') as negative,
COUNT(*) FILTER (WHERE valence = 'V0') as neutral
FROM pipeline.review_spans
WHERE job_id = $1::uuid
AND is_active = TRUE
GROUP BY DATE(review_time)
ORDER BY date
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1115,13 +1481,26 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_urt_distribution(self, business_id: str | None) -> dict[str, Any]:
async def _get_urt_distribution(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get URT domain distribution for bar chart."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
SUBSTRING(urt_primary, 1, 1) as domain,
COUNT(*) as count
FROM pipeline.review_spans
WHERE job_id = $1::uuid AND is_active = TRUE
GROUP BY SUBSTRING(urt_primary, 1, 1)
ORDER BY count DESC
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1168,13 +1547,26 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_intensity_heatmap(self, business_id: str | None) -> dict[str, Any]:
async def _get_intensity_heatmap(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get domain x intensity heatmap data."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
SUBSTRING(urt_primary, 1, 1) as domain,
intensity,
COUNT(*) as count
FROM pipeline.review_spans
WHERE job_id = $1::uuid AND is_active = TRUE
GROUP BY SUBSTRING(urt_primary, 1, 1), intensity
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1222,7 +1614,7 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_issues_table(
self, business_id: str | None, params: dict[str, Any]
self, business_id: str | None, job_id: str | None, params: dict[str, Any]
) -> dict[str, Any]:
"""Get issues table data."""
if not self._db:
@@ -1233,7 +1625,30 @@ class ReviewIQPipeline(BasePipeline):
offset = (page - 1) * page_size
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT
issue_id,
domain,
primary_subcode as subcode,
span_count,
max_intensity,
state
FROM pipeline.issues
WHERE job_id = $1::uuid
ORDER BY span_count DESC, created_at DESC
LIMIT $2 OFFSET $3
""",
job_id,
page_size,
offset,
)
total = await conn.fetchval(
"SELECT COUNT(*) FROM pipeline.issues WHERE job_id = $1::uuid",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT
@@ -1279,13 +1694,24 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data, "total": total or 0}
async def _get_issues_by_domain(self, business_id: str | None) -> dict[str, Any]:
async def _get_issues_by_domain(self, business_id: str | None, job_id: str | None = None) -> dict[str, Any]:
"""Get issues grouped by domain for pie chart."""
if not self._db:
return {"data": []}
async with self._db._pool.acquire() as conn:
if business_id:
if job_id:
rows = await conn.fetch(
"""
SELECT domain, COUNT(*) as count
FROM pipeline.issues
WHERE job_id = $1::uuid
GROUP BY domain
ORDER BY count DESC
""",
job_id,
)
elif business_id:
rows = await conn.fetch(
"""
SELECT domain, COUNT(*) as count
@@ -1310,6 +1736,89 @@ class ReviewIQPipeline(BasePipeline):
return {"data": data}
async def _get_classified_reviews(
self, business_id: str | None, job_id: str | None, params: dict[str, Any]
) -> dict[str, Any]:
"""Get classified reviews with URT codes and human-readable names."""
if not self._db:
return {"data": [], "total": 0}
page = params.get("page", 1)
page_size = params.get("page_size", 15)
offset = (page - 1) * page_size
async with self._db._pool.acquire() as conn:
# Build the query with JOINs to get human-readable code names
base_query = """
SELECT
s.span_id,
s.span_text,
s.urt_primary as urt_code,
COALESCE(sub.name, cat.name, dom.name) as code_name,
COALESCE(sub.definition, dom.description) as code_definition,
dom.name as domain_name,
CASE s.valence
WHEN 'V+' THEN 'Positive'
WHEN 'V-' THEN 'Negative'
WHEN 'V0' THEN 'Neutral'
WHEN '' THEN 'Mixed'
ELSE s.valence
END as valence,
CASE s.intensity
WHEN 'I1' THEN 'Mild'
WHEN 'I2' THEN 'Moderate'
WHEN 'I3' THEN 'Strong'
ELSE s.intensity
END as intensity,
e.rating,
s.review_time
FROM pipeline.review_spans s
LEFT JOIN pipeline.reviews_enriched e ON s.review_id = e.review_id AND s.review_version = e.review_version
LEFT JOIN pipeline.urt_domains dom ON SUBSTRING(s.urt_primary, 1, 1) = dom.code
LEFT JOIN pipeline.urt_categories cat ON SUBSTRING(s.urt_primary, 1, 2) = cat.code
LEFT JOIN pipeline.urt_subcodes sub ON s.urt_primary = sub.code
WHERE s.is_active = TRUE
"""
count_query = """
SELECT COUNT(*) FROM pipeline.review_spans s
WHERE s.is_active = TRUE
"""
if job_id:
base_query += " AND s.job_id = $1::uuid"
count_query += " AND s.job_id = $1::uuid"
base_query += f" ORDER BY s.review_time DESC LIMIT {page_size} OFFSET {offset}"
rows = await conn.fetch(base_query, job_id)
total = await conn.fetchval(count_query, job_id)
elif business_id:
base_query += " AND s.business_id = $1"
count_query += " AND s.business_id = $1"
base_query += f" ORDER BY s.review_time DESC LIMIT {page_size} OFFSET {offset}"
rows = await conn.fetch(base_query, business_id)
total = await conn.fetchval(count_query, business_id)
else:
base_query += f" ORDER BY s.review_time DESC LIMIT {page_size} OFFSET {offset}"
rows = await conn.fetch(base_query)
total = await conn.fetchval(count_query)
data = [
{
"span_id": row["span_id"],
"span_text": row["span_text"],
"urt_code": row["urt_code"],
"code_name": row["code_name"] or "Unknown",
"code_definition": row["code_definition"] or "",
"domain_name": row["domain_name"] or "Unknown",
"valence": row["valence"],
"intensity": row["intensity"],
"rating": row["rating"],
"review_time": row["review_time"].isoformat() if row["review_time"] else None,
}
for row in rows
]
return {"data": data, "total": total or 0}
def _parse_time_range(self, time_range: str) -> int:
"""Parse time range string to days."""
if time_range.endswith("d"):