feat: Add decoupled pipeline schema with separate PostgreSQL namespace

- Create consolidated migration (005_create_pipeline_schema.sql) with
  'pipeline' schema for all classification tables
- Update pipeline repositories to use schema prefix (pipeline.*)
- Add run_migrations() method to DatabaseManager
- Add CLI tool for running versioned migrations

Tables created in pipeline schema:
- reviews_raw, reviews_enriched (Stage 1)
- review_spans (Stage 2)
- issues, issue_spans, issue_events (Stage 3)
- fact_timeseries (Stage 4)
- urt_domains, urt_categories (taxonomy lookup)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 18:17:20 +00:00
parent 7d720f5378
commit 03ed7029e2
4 changed files with 710 additions and 23 deletions

View File

@@ -1,4 +1,8 @@
"""Data access layer for pipeline operations."""
"""Data access layer for pipeline operations.
All tables live in the 'pipeline' schema, keeping them decoupled from the
main scraper schema while sharing the same database.
"""
from __future__ import annotations
@@ -20,6 +24,9 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
# Schema prefix for all pipeline tables
SCHEMA = "pipeline"
class ReviewRepository:
"""Repository for review data operations."""
@@ -35,7 +42,7 @@ class ReviewRepository:
) -> int:
"""Insert a raw review and return its ID."""
query = """
INSERT INTO reviews_raw (
INSERT INTO pipeline.reviews_raw (
source, review_id, place_id, raw_payload,
review_text, rating, review_time, reviewer_name, reviewer_id,
review_version, pulled_at
@@ -66,7 +73,7 @@ class ReviewRepository:
) -> int:
"""Insert an enriched review stub (pre-classification)."""
query = """
INSERT INTO reviews_enriched (
INSERT INTO pipeline.reviews_enriched (
source, review_id, review_version, is_latest, raw_id,
business_id, place_id, text, text_normalized, rating, review_time,
language, taxonomy_version
@@ -101,7 +108,7 @@ class ReviewRepository:
) -> None:
"""Update an enriched review with classification results."""
query = """
UPDATE reviews_enriched SET
UPDATE pipeline.reviews_enriched SET
urt_primary = $1,
urt_secondary = $2,
valence = $3,
@@ -147,7 +154,7 @@ class ReviewRepository:
SELECT
source, review_id, review_version, business_id, place_id,
text, text_normalized, rating, review_time
FROM reviews_enriched
FROM pipeline.reviews_enriched
WHERE urt_primary IS NULL
AND is_latest = TRUE
ORDER BY review_time DESC
@@ -164,7 +171,7 @@ class ReviewRepository:
) -> dict[str, Any] | None:
"""Get a specific review by its composite key."""
query = """
SELECT * FROM reviews_enriched
SELECT * FROM pipeline.reviews_enriched
WHERE source = $1 AND review_id = $2 AND review_version = $3
"""
row = await self.db.fetchrow(query, source, review_id, review_version)
@@ -179,7 +186,7 @@ class ReviewRepository:
# For now, we check by querying the first occurrence
# A proper dedup table would be better for production
query = """
SELECT review_id FROM reviews_enriched
SELECT review_id FROM pipeline.reviews_enriched
WHERE business_id = $1
AND text_normalized IS NOT NULL
LIMIT 1
@@ -209,7 +216,7 @@ class SpanRepository:
) -> None:
"""Insert a span into the database."""
query = """
INSERT INTO review_spans (
INSERT INTO pipeline.review_spans (
span_id, business_id, place_id, source, review_id, review_version,
span_index, span_text, span_start, span_end,
profile, urt_primary, urt_secondary, valence, intensity, comparative,
@@ -282,8 +289,8 @@ class SpanRepository:
rs.urt_primary, rs.valence, rs.intensity,
rs.entity_normalized, rs.review_time, rs.confidence,
re.trust_score
FROM review_spans rs
JOIN reviews_enriched re ON (
FROM pipeline.review_spans rs
JOIN pipeline.reviews_enriched re ON (
re.source = rs.source
AND re.review_id = rs.review_id
AND re.review_version = rs.review_version
@@ -291,7 +298,7 @@ class SpanRepository:
WHERE rs.is_active = TRUE
AND rs.valence IN ('V-', '')
AND NOT EXISTS (
SELECT 1 FROM issue_spans iss WHERE iss.span_id = rs.span_id
SELECT 1 FROM pipeline.issue_spans iss WHERE iss.span_id = rs.span_id
)
ORDER BY rs.review_time DESC
LIMIT $1
@@ -301,7 +308,7 @@ class SpanRepository:
async def get_span_by_id(self, span_id: str) -> dict[str, Any] | None:
"""Get a span by its ID."""
query = "SELECT * FROM review_spans WHERE span_id = $1"
query = "SELECT * FROM pipeline.review_spans WHERE span_id = $1"
row = await self.db.fetchrow(query, span_id)
return dict(row) if row else None
@@ -326,7 +333,7 @@ class IssueRepository:
"""Create or update an issue. Returns True if newly created."""
# First check if exists
existing = await self.db.fetchval(
"SELECT 1 FROM issues WHERE issue_id = $1",
"SELECT 1 FROM pipeline.issues WHERE issue_id = $1",
issue_id,
)
@@ -334,7 +341,7 @@ class IssueRepository:
# Update
await self.db.execute(
"""
UPDATE issues SET
UPDATE pipeline.issues SET
span_count = span_count + 1,
max_intensity = CASE
WHEN $1 = 'I3' THEN 'I3'
@@ -353,7 +360,7 @@ class IssueRepository:
domain = primary_subcode[0] if primary_subcode else "O"
await self.db.execute(
"""
INSERT INTO issues (
INSERT INTO pipeline.issues (
issue_id, business_id, place_id, primary_subcode, domain,
state, priority_score, confidence_score, span_count, max_intensity,
entity, entity_normalized, taxonomy_version
@@ -388,7 +395,7 @@ class IssueRepository:
"""Link a span to an issue."""
await self.db.execute(
"""
INSERT INTO issue_spans (
INSERT INTO pipeline.issue_spans (
issue_id, span_id, source, review_id, review_version,
is_primary_match, intensity, review_time
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
@@ -416,7 +423,7 @@ class IssueRepository:
"""Log an issue event for audit trail."""
await self.db.execute(
"""
INSERT INTO issue_events (
INSERT INTO pipeline.issue_events (
issue_id, event_type, span_id, old_value, new_value, metadata
) VALUES ($1, $2, $3, $4, $5, $6)
""",
@@ -430,14 +437,14 @@ class IssueRepository:
async def get_issue_by_id(self, issue_id: str) -> dict[str, Any] | None:
"""Get an issue by its ID."""
query = "SELECT * FROM issues WHERE issue_id = $1"
query = "SELECT * FROM pipeline.issues WHERE issue_id = $1"
row = await self.db.fetchrow(query, issue_id)
return dict(row) if row else None
async def check_span_already_linked(self, span_id: str) -> str | None:
"""Check if a span is already linked to an issue."""
return await self.db.fetchval(
"SELECT issue_id FROM issue_spans WHERE span_id = $1",
"SELECT issue_id FROM pipeline.issue_spans WHERE span_id = $1",
span_id,
)
@@ -452,7 +459,7 @@ class FactRepository:
"""Insert or update a fact record."""
await self.db.execute(
"""
INSERT INTO fact_timeseries (
INSERT INTO pipeline.fact_timeseries (
business_id, place_id, period_date, bucket_type,
subject_type, subject_id, taxonomy_version,
review_count, span_count, negative_count, positive_count,
@@ -534,8 +541,8 @@ class FactRepository:
rs.comparative,
re.trust_score,
re.rating
FROM review_spans rs
JOIN reviews_enriched re ON (
FROM pipeline.review_spans rs
JOIN pipeline.reviews_enriched re ON (
re.source = rs.source
AND re.review_id = rs.review_id
AND re.review_version = rs.review_version
@@ -554,7 +561,7 @@ class FactRepository:
"""Get all place IDs for a business."""
rows = await self.db.fetch(
"""
SELECT DISTINCT place_id FROM reviews_enriched
SELECT DISTINCT place_id FROM pipeline.reviews_enriched
WHERE business_id = $1
""",
business_id,