feat: Add decoupled pipeline schema with separate PostgreSQL namespace
- Create consolidated migration (005_create_pipeline_schema.sql) with 'pipeline' schema for all classification tables - Update pipeline repositories to use schema prefix (pipeline.*) - Add run_migrations() method to DatabaseManager - Add CLI tool for running versioned migrations Tables created in pipeline schema: - reviews_raw, reviews_enriched (Stage 1) - review_spans (Stage 2) - issues, issue_spans, issue_events (Stage 3) - fact_timeseries (Stage 4) - urt_domains, urt_categories (taxonomy lookup) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,8 @@
|
||||
"""Data access layer for pipeline operations."""
|
||||
"""Data access layer for pipeline operations.
|
||||
|
||||
All tables live in the 'pipeline' schema, keeping them decoupled from the
|
||||
main scraper schema while sharing the same database.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -20,6 +24,9 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Schema prefix for all pipeline tables
|
||||
SCHEMA = "pipeline"
|
||||
|
||||
|
||||
class ReviewRepository:
|
||||
"""Repository for review data operations."""
|
||||
@@ -35,7 +42,7 @@ class ReviewRepository:
|
||||
) -> int:
|
||||
"""Insert a raw review and return its ID."""
|
||||
query = """
|
||||
INSERT INTO reviews_raw (
|
||||
INSERT INTO pipeline.reviews_raw (
|
||||
source, review_id, place_id, raw_payload,
|
||||
review_text, rating, review_time, reviewer_name, reviewer_id,
|
||||
review_version, pulled_at
|
||||
@@ -66,7 +73,7 @@ class ReviewRepository:
|
||||
) -> int:
|
||||
"""Insert an enriched review stub (pre-classification)."""
|
||||
query = """
|
||||
INSERT INTO reviews_enriched (
|
||||
INSERT INTO pipeline.reviews_enriched (
|
||||
source, review_id, review_version, is_latest, raw_id,
|
||||
business_id, place_id, text, text_normalized, rating, review_time,
|
||||
language, taxonomy_version
|
||||
@@ -101,7 +108,7 @@ class ReviewRepository:
|
||||
) -> None:
|
||||
"""Update an enriched review with classification results."""
|
||||
query = """
|
||||
UPDATE reviews_enriched SET
|
||||
UPDATE pipeline.reviews_enriched SET
|
||||
urt_primary = $1,
|
||||
urt_secondary = $2,
|
||||
valence = $3,
|
||||
@@ -147,7 +154,7 @@ class ReviewRepository:
|
||||
SELECT
|
||||
source, review_id, review_version, business_id, place_id,
|
||||
text, text_normalized, rating, review_time
|
||||
FROM reviews_enriched
|
||||
FROM pipeline.reviews_enriched
|
||||
WHERE urt_primary IS NULL
|
||||
AND is_latest = TRUE
|
||||
ORDER BY review_time DESC
|
||||
@@ -164,7 +171,7 @@ class ReviewRepository:
|
||||
) -> dict[str, Any] | None:
|
||||
"""Get a specific review by its composite key."""
|
||||
query = """
|
||||
SELECT * FROM reviews_enriched
|
||||
SELECT * FROM pipeline.reviews_enriched
|
||||
WHERE source = $1 AND review_id = $2 AND review_version = $3
|
||||
"""
|
||||
row = await self.db.fetchrow(query, source, review_id, review_version)
|
||||
@@ -179,7 +186,7 @@ class ReviewRepository:
|
||||
# For now, we check by querying the first occurrence
|
||||
# A proper dedup table would be better for production
|
||||
query = """
|
||||
SELECT review_id FROM reviews_enriched
|
||||
SELECT review_id FROM pipeline.reviews_enriched
|
||||
WHERE business_id = $1
|
||||
AND text_normalized IS NOT NULL
|
||||
LIMIT 1
|
||||
@@ -209,7 +216,7 @@ class SpanRepository:
|
||||
) -> None:
|
||||
"""Insert a span into the database."""
|
||||
query = """
|
||||
INSERT INTO review_spans (
|
||||
INSERT INTO pipeline.review_spans (
|
||||
span_id, business_id, place_id, source, review_id, review_version,
|
||||
span_index, span_text, span_start, span_end,
|
||||
profile, urt_primary, urt_secondary, valence, intensity, comparative,
|
||||
@@ -282,8 +289,8 @@ class SpanRepository:
|
||||
rs.urt_primary, rs.valence, rs.intensity,
|
||||
rs.entity_normalized, rs.review_time, rs.confidence,
|
||||
re.trust_score
|
||||
FROM review_spans rs
|
||||
JOIN reviews_enriched re ON (
|
||||
FROM pipeline.review_spans rs
|
||||
JOIN pipeline.reviews_enriched re ON (
|
||||
re.source = rs.source
|
||||
AND re.review_id = rs.review_id
|
||||
AND re.review_version = rs.review_version
|
||||
@@ -291,7 +298,7 @@ class SpanRepository:
|
||||
WHERE rs.is_active = TRUE
|
||||
AND rs.valence IN ('V-', 'V±')
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM issue_spans iss WHERE iss.span_id = rs.span_id
|
||||
SELECT 1 FROM pipeline.issue_spans iss WHERE iss.span_id = rs.span_id
|
||||
)
|
||||
ORDER BY rs.review_time DESC
|
||||
LIMIT $1
|
||||
@@ -301,7 +308,7 @@ class SpanRepository:
|
||||
|
||||
async def get_span_by_id(self, span_id: str) -> dict[str, Any] | None:
|
||||
"""Get a span by its ID."""
|
||||
query = "SELECT * FROM review_spans WHERE span_id = $1"
|
||||
query = "SELECT * FROM pipeline.review_spans WHERE span_id = $1"
|
||||
row = await self.db.fetchrow(query, span_id)
|
||||
return dict(row) if row else None
|
||||
|
||||
@@ -326,7 +333,7 @@ class IssueRepository:
|
||||
"""Create or update an issue. Returns True if newly created."""
|
||||
# First check if exists
|
||||
existing = await self.db.fetchval(
|
||||
"SELECT 1 FROM issues WHERE issue_id = $1",
|
||||
"SELECT 1 FROM pipeline.issues WHERE issue_id = $1",
|
||||
issue_id,
|
||||
)
|
||||
|
||||
@@ -334,7 +341,7 @@ class IssueRepository:
|
||||
# Update
|
||||
await self.db.execute(
|
||||
"""
|
||||
UPDATE issues SET
|
||||
UPDATE pipeline.issues SET
|
||||
span_count = span_count + 1,
|
||||
max_intensity = CASE
|
||||
WHEN $1 = 'I3' THEN 'I3'
|
||||
@@ -353,7 +360,7 @@ class IssueRepository:
|
||||
domain = primary_subcode[0] if primary_subcode else "O"
|
||||
await self.db.execute(
|
||||
"""
|
||||
INSERT INTO issues (
|
||||
INSERT INTO pipeline.issues (
|
||||
issue_id, business_id, place_id, primary_subcode, domain,
|
||||
state, priority_score, confidence_score, span_count, max_intensity,
|
||||
entity, entity_normalized, taxonomy_version
|
||||
@@ -388,7 +395,7 @@ class IssueRepository:
|
||||
"""Link a span to an issue."""
|
||||
await self.db.execute(
|
||||
"""
|
||||
INSERT INTO issue_spans (
|
||||
INSERT INTO pipeline.issue_spans (
|
||||
issue_id, span_id, source, review_id, review_version,
|
||||
is_primary_match, intensity, review_time
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
@@ -416,7 +423,7 @@ class IssueRepository:
|
||||
"""Log an issue event for audit trail."""
|
||||
await self.db.execute(
|
||||
"""
|
||||
INSERT INTO issue_events (
|
||||
INSERT INTO pipeline.issue_events (
|
||||
issue_id, event_type, span_id, old_value, new_value, metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6)
|
||||
""",
|
||||
@@ -430,14 +437,14 @@ class IssueRepository:
|
||||
|
||||
async def get_issue_by_id(self, issue_id: str) -> dict[str, Any] | None:
|
||||
"""Get an issue by its ID."""
|
||||
query = "SELECT * FROM issues WHERE issue_id = $1"
|
||||
query = "SELECT * FROM pipeline.issues WHERE issue_id = $1"
|
||||
row = await self.db.fetchrow(query, issue_id)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def check_span_already_linked(self, span_id: str) -> str | None:
|
||||
"""Check if a span is already linked to an issue."""
|
||||
return await self.db.fetchval(
|
||||
"SELECT issue_id FROM issue_spans WHERE span_id = $1",
|
||||
"SELECT issue_id FROM pipeline.issue_spans WHERE span_id = $1",
|
||||
span_id,
|
||||
)
|
||||
|
||||
@@ -452,7 +459,7 @@ class FactRepository:
|
||||
"""Insert or update a fact record."""
|
||||
await self.db.execute(
|
||||
"""
|
||||
INSERT INTO fact_timeseries (
|
||||
INSERT INTO pipeline.fact_timeseries (
|
||||
business_id, place_id, period_date, bucket_type,
|
||||
subject_type, subject_id, taxonomy_version,
|
||||
review_count, span_count, negative_count, positive_count,
|
||||
@@ -534,8 +541,8 @@ class FactRepository:
|
||||
rs.comparative,
|
||||
re.trust_score,
|
||||
re.rating
|
||||
FROM review_spans rs
|
||||
JOIN reviews_enriched re ON (
|
||||
FROM pipeline.review_spans rs
|
||||
JOIN pipeline.reviews_enriched re ON (
|
||||
re.source = rs.source
|
||||
AND re.review_id = rs.review_id
|
||||
AND re.review_version = rs.review_version
|
||||
@@ -554,7 +561,7 @@ class FactRepository:
|
||||
"""Get all place IDs for a business."""
|
||||
rows = await self.db.fetch(
|
||||
"""
|
||||
SELECT DISTINCT place_id FROM reviews_enriched
|
||||
SELECT DISTINCT place_id FROM pipeline.reviews_enriched
|
||||
WHERE business_id = $1
|
||||
""",
|
||||
business_id,
|
||||
|
||||
Reference in New Issue
Block a user