feat: Add decoupled pipeline schema with separate PostgreSQL namespace

- Create consolidated migration (005_create_pipeline_schema.sql) with 'pipeline' schema for all classification tables - Update pipeline repositories to use schema prefix (pipeline.*) - Add run_migrations() method to DatabaseManager - Add CLI tool for running versioned migrations Tables created in pipeline schema: - reviews_raw, reviews_enriched (Stage 1) - review_spans (Stage 2) - issues, issue_spans, issue_events (Stage 3) - fact_timeseries (Stage 4) - urt_domains, urt_categories (taxonomy lookup) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:17:20 +00:00
parent 7d720f5378
commit 03ed7029e2
4 changed files with 710 additions and 23 deletions
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/repositories.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/db/repositories.py
@@ -1,4 +1,8 @@
-"""Data access layer for pipeline operations."""
+"""Data access layer for pipeline operations.
+
+All tables live in the 'pipeline' schema, keeping them decoupled from the
+main scraper schema while sharing the same database.
+"""

 from __future__ import annotations

@@ -20,6 +24,9 @@ if TYPE_CHECKING:

 logger = logging.getLogger(__name__)

+# Schema prefix for all pipeline tables
+SCHEMA = "pipeline"
+

 class ReviewRepository:
    """Repository for review data operations."""
@@ -35,7 +42,7 @@ class ReviewRepository:
    ) -> int:
        """Insert a raw review and return its ID."""
        query = """
-            INSERT INTO reviews_raw (
+            INSERT INTO pipeline.reviews_raw (
                source, review_id, place_id, raw_payload,
                review_text, rating, review_time, reviewer_name, reviewer_id,
                review_version, pulled_at
@@ -66,7 +73,7 @@ class ReviewRepository:
    ) -> int:
        """Insert an enriched review stub (pre-classification)."""
        query = """
-            INSERT INTO reviews_enriched (
+            INSERT INTO pipeline.reviews_enriched (
                source, review_id, review_version, is_latest, raw_id,
                business_id, place_id, text, text_normalized, rating, review_time,
                language, taxonomy_version
@@ -101,7 +108,7 @@ class ReviewRepository:
    ) -> None:
        """Update an enriched review with classification results."""
        query = """
-            UPDATE reviews_enriched SET
+            UPDATE pipeline.reviews_enriched SET
                urt_primary = $1,
                urt_secondary = $2,
                valence = $3,
@@ -147,7 +154,7 @@ class ReviewRepository:
            SELECT
                source, review_id, review_version, business_id, place_id,
                text, text_normalized, rating, review_time
-            FROM reviews_enriched
+            FROM pipeline.reviews_enriched
            WHERE urt_primary IS NULL
              AND is_latest = TRUE
            ORDER BY review_time DESC
@@ -164,7 +171,7 @@ class ReviewRepository:
    ) -> dict[str, Any] | None:
        """Get a specific review by its composite key."""
        query = """
-            SELECT * FROM reviews_enriched
+            SELECT * FROM pipeline.reviews_enriched
            WHERE source = $1 AND review_id = $2 AND review_version = $3
        """
        row = await self.db.fetchrow(query, source, review_id, review_version)
@@ -179,7 +186,7 @@ class ReviewRepository:
        # For now, we check by querying the first occurrence
        # A proper dedup table would be better for production
        query = """
-            SELECT review_id FROM reviews_enriched
+            SELECT review_id FROM pipeline.reviews_enriched
            WHERE business_id = $1
              AND text_normalized IS NOT NULL
            LIMIT 1
@@ -209,7 +216,7 @@ class SpanRepository:
    ) -> None:
        """Insert a span into the database."""
        query = """
-            INSERT INTO review_spans (
+            INSERT INTO pipeline.review_spans (
                span_id, business_id, place_id, source, review_id, review_version,
                span_index, span_text, span_start, span_end,
                profile, urt_primary, urt_secondary, valence, intensity, comparative,
@@ -282,8 +289,8 @@ class SpanRepository:
                rs.urt_primary, rs.valence, rs.intensity,
                rs.entity_normalized, rs.review_time, rs.confidence,
                re.trust_score
-            FROM review_spans rs
-            JOIN reviews_enriched re ON (
+            FROM pipeline.review_spans rs
+            JOIN pipeline.reviews_enriched re ON (
                re.source = rs.source
                AND re.review_id = rs.review_id
                AND re.review_version = rs.review_version
@@ -291,7 +298,7 @@ class SpanRepository:
            WHERE rs.is_active = TRUE
              AND rs.valence IN ('V-', 'V±')
              AND NOT EXISTS (
-                SELECT 1 FROM issue_spans iss WHERE iss.span_id = rs.span_id
+                SELECT 1 FROM pipeline.issue_spans iss WHERE iss.span_id = rs.span_id
              )
            ORDER BY rs.review_time DESC
            LIMIT $1
@@ -301,7 +308,7 @@ class SpanRepository:

    async def get_span_by_id(self, span_id: str) -> dict[str, Any] | None:
        """Get a span by its ID."""
-        query = "SELECT * FROM review_spans WHERE span_id = $1"
+        query = "SELECT * FROM pipeline.review_spans WHERE span_id = $1"
        row = await self.db.fetchrow(query, span_id)
        return dict(row) if row else None

@@ -326,7 +333,7 @@ class IssueRepository:
        """Create or update an issue. Returns True if newly created."""
        # First check if exists
        existing = await self.db.fetchval(
-            "SELECT 1 FROM issues WHERE issue_id = $1",
+            "SELECT 1 FROM pipeline.issues WHERE issue_id = $1",
            issue_id,
        )

@@ -334,7 +341,7 @@ class IssueRepository:
            # Update
            await self.db.execute(
                """
-                UPDATE issues SET
+                UPDATE pipeline.issues SET
                    span_count = span_count + 1,
                    max_intensity = CASE
                        WHEN $1 = 'I3' THEN 'I3'
@@ -353,7 +360,7 @@ class IssueRepository:
            domain = primary_subcode[0] if primary_subcode else "O"
            await self.db.execute(
                """
-                INSERT INTO issues (
+                INSERT INTO pipeline.issues (
                    issue_id, business_id, place_id, primary_subcode, domain,
                    state, priority_score, confidence_score, span_count, max_intensity,
                    entity, entity_normalized, taxonomy_version
@@ -388,7 +395,7 @@ class IssueRepository:
        """Link a span to an issue."""
        await self.db.execute(
            """
-            INSERT INTO issue_spans (
+            INSERT INTO pipeline.issue_spans (
                issue_id, span_id, source, review_id, review_version,
                is_primary_match, intensity, review_time
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
@@ -416,7 +423,7 @@ class IssueRepository:
        """Log an issue event for audit trail."""
        await self.db.execute(
            """
-            INSERT INTO issue_events (
+            INSERT INTO pipeline.issue_events (
                issue_id, event_type, span_id, old_value, new_value, metadata
            ) VALUES ($1, $2, $3, $4, $5, $6)
            """,
@@ -430,14 +437,14 @@ class IssueRepository:

    async def get_issue_by_id(self, issue_id: str) -> dict[str, Any] | None:
        """Get an issue by its ID."""
-        query = "SELECT * FROM issues WHERE issue_id = $1"
+        query = "SELECT * FROM pipeline.issues WHERE issue_id = $1"
        row = await self.db.fetchrow(query, issue_id)
        return dict(row) if row else None

    async def check_span_already_linked(self, span_id: str) -> str | None:
        """Check if a span is already linked to an issue."""
        return await self.db.fetchval(
-            "SELECT issue_id FROM issue_spans WHERE span_id = $1",
+            "SELECT issue_id FROM pipeline.issue_spans WHERE span_id = $1",
            span_id,
        )

@@ -452,7 +459,7 @@ class FactRepository:
        """Insert or update a fact record."""
        await self.db.execute(
            """
-            INSERT INTO fact_timeseries (
+            INSERT INTO pipeline.fact_timeseries (
                business_id, place_id, period_date, bucket_type,
                subject_type, subject_id, taxonomy_version,
                review_count, span_count, negative_count, positive_count,
@@ -534,8 +541,8 @@ class FactRepository:
                rs.comparative,
                re.trust_score,
                re.rating
-            FROM review_spans rs
-            JOIN reviews_enriched re ON (
+            FROM pipeline.review_spans rs
+            JOIN pipeline.reviews_enriched re ON (
                re.source = rs.source
                AND re.review_id = rs.review_id
                AND re.review_version = rs.review_version
@@ -554,7 +561,7 @@ class FactRepository:
        """Get all place IDs for a business."""
        rows = await self.db.fetch(
            """
-            SELECT DISTINCT place_id FROM reviews_enriched
+            SELECT DISTINCT place_id FROM pipeline.reviews_enriched
            WHERE business_id = $1
            """,
            business_id,