feat: Add decoupled pipeline schema with separate PostgreSQL namespace

- Create consolidated migration (005_create_pipeline_schema.sql) with
  'pipeline' schema for all classification tables
- Update pipeline repositories to use schema prefix (pipeline.*)
- Add run_migrations() method to DatabaseManager
- Add CLI tool for running versioned migrations

Tables created in pipeline schema:
- reviews_raw, reviews_enriched (Stage 1)
- review_spans (Stage 2)
- issues, issue_spans, issue_events (Stage 3)
- fact_timeseries (Stage 4)
- urt_domains, urt_categories (taxonomy lookup)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 18:17:20 +00:00
parent 7d720f5378
commit 03ed7029e2
4 changed files with 710 additions and 23 deletions

76
tools/run_migrations.py Normal file
View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
CLI tool to run database migrations.

Connects to the database, initializes the base schema, and then applies the
versioned migrations from the migrations directory.

Usage:
python tools/run_migrations.py --database-url $DATABASE_URL
# Or with environment variable
export DATABASE_URL=postgresql://user:pass@localhost/db
python tools/run_migrations.py
# Or with a non-default migrations directory (default: migrations/versions)
python tools/run_migrations.py --migrations-dir path/to/migrations

Exits with status 1 when no database URL is supplied, or when connecting,
schema initialization, or running the migrations raises an exception.
"""
import asyncio
import os
import sys
import argparse
import logging
# Put the project root (the parent of this file's directory) at the front of
# sys.path so the `core` package can be imported when this script is run
# directly. This has to happen before the `core.database` import below.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.database import DatabaseManager
# Configure root logging at INFO level; each record carries a timestamp, the
# logger name and the level name ahead of the message.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
async def main(database_url: str, migrations_dir: str = "migrations/versions"):
    """Initialize the base schema and apply versioned migrations.

    Progress is reported on stdout. If any step raises, the error message is
    written to stderr and the process exits with status 1. The database
    manager is disconnected in every case.

    Args:
        database_url: Connection string handed to ``DatabaseManager``.
        migrations_dir: Directory passed to ``DatabaseManager.run_migrations``.
    """
    manager = DatabaseManager(database_url)
    try:
        await manager.connect()

        # Base schema first (jobs table, etc.), before any versioned migration.
        print("Initializing base schema...")
        await manager.initialize_schema()

        # run_migrations reports how many migrations it applied.
        print(f"\nRunning migrations from {migrations_dir}...")
        applied = await manager.run_migrations(migrations_dir)

        summary = (
            f"\n✓ Applied {applied} migration(s)"
            if applied > 0
            else "\n✓ No pending migrations"
        )
        print(summary)
    except Exception as exc:
        # Top-level CLI boundary: report the failure and signal it via exit code.
        print(f"\n✗ Migration failed: {exc}", file=sys.stderr)
        sys.exit(1)
    finally:
        await manager.disconnect()
if __name__ == "__main__":
    # Command-line entry point: parse options, validate the URL, run main().
    cli = argparse.ArgumentParser(description="Run database migrations")
    cli.add_argument(
        "--database-url",
        default=os.environ.get("DATABASE_URL"),
        help="PostgreSQL connection string (default: $DATABASE_URL)",
    )
    cli.add_argument(
        "--migrations-dir",
        default="migrations/versions",
        help="Directory containing .sql migration files",
    )
    options = cli.parse_args()

    db_url = options.database_url
    if db_url:
        asyncio.run(main(db_url, options.migrations_dir))
    else:
        # Neither the flag nor $DATABASE_URL supplied a connection string.
        print("Error: --database-url required or set DATABASE_URL environment variable", file=sys.stderr)
        sys.exit(1)