From 2a292e0754d2d679411e37ee04d8d781e874d9a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Fri, 30 Jan 2026 15:28:02 +0000 Subject: [PATCH] fix(synthesis): Select most common business_id to handle data leakage Changed the business name query to GROUP BY business_id and ORDER BY COUNT(*) DESC before LIMIT 1, instead of taking an arbitrary DISTINCT row, ensuring the correct business is identified even when trace amounts of other business data leak into a job. Co-Authored-By: Claude Opus 4.5 --- .../src/reviewiq_pipeline/stages/stage5_synthesize.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage5_synthesize.py b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage5_synthesize.py index e5874c2..c652d71 100644 --- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage5_synthesize.py +++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/stages/stage5_synthesize.py @@ -486,10 +486,13 @@ class Stage5Synthesizer: ORDER BY negative DESC """, job_id) - # Business name + # Business name - get the most common one (in case of data leakage) business = await self.pool.fetchval(""" - SELECT DISTINCT business_id FROM pipeline.reviews_enriched - WHERE job_id = $1::uuid LIMIT 1 + SELECT business_id FROM pipeline.reviews_enriched + WHERE job_id = $1::uuid + GROUP BY business_id + ORDER BY COUNT(*) DESC + LIMIT 1 """, job_id) # MOMENTUM: Calculate from data (not LLM guess)