feat: Add extensible multi-pipeline integration system

This commit implements a plugin-like pipeline architecture with:

Pipeline Core Package (packages/pipeline-core/):
- BasePipeline abstract class all pipelines implement
- PipelineRegistry for database-backed discovery/management
- PipelineRunner for execution with status tracking
- DashboardConfig contracts for dynamic widget definitions

Database Migration (006_pipeline_registry.sql):
- pipeline.registry table for registered pipelines
- pipeline.executions table for execution history
- Views for execution stats and monitoring

ReviewIQ Pipeline Refactor:
- Implements BasePipeline interface
- Adds get_dashboard_config() with widget definitions
- Adds get_widget_data() methods for all dashboard widgets
- Maintains backward compatibility with Pipeline alias

Generic Pipeline API (api/routes/pipelines.py):
- GET /api/pipelines - List all registered pipelines
- GET /api/pipelines/{id} - Pipeline details
- POST /api/pipelines/{id}/execute - Execute pipeline
- GET /api/pipelines/{id}/dashboard - Dashboard config
- GET /api/pipelines/{id}/widgets/{w} - Widget data
- GET /api/pipelines/{id}/executions - Execution history

Frontend Dynamic Dashboard System:
- DynamicDashboard component renders from config
- WidgetRegistry maps types to components
- Widget components: StatCard, LineChart, BarChart, PieChart, DataTable, Heatmap
- Pipeline API client library

Frontend Pipeline Pages:
- /pipelines - List all registered pipelines
- /pipelines/[id] - Dynamic dashboard for pipeline
- /pipelines/[id]/executions - Execution history
- Pipelines nav item in Sidebar

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 19:05:38 +00:00
parent d64f06ba9e
commit 824634aa76
30 changed files with 5697 additions and 95 deletions
--- a/api/routes/__init__.py
+++ b/api/routes/__init__.py
@@ -6,6 +6,7 @@ This module exports all route modules for easy import into the main server.
 from api.routes.batches import router as batches_router, set_database as set_batches_db
 from api.routes.dashboard import router as dashboard_router, set_database as set_dashboard_db
 from api.routes.admin import router as admin_router, set_database as set_admin_db
+from api.routes.pipelines import router as pipelines_router, set_database as set_pipelines_db

 __all__ = [
    'batches_router',
@@ -14,4 +15,6 @@ __all__ = [
    'set_dashboard_db',
    'admin_router',
    'set_admin_db',
+    'pipelines_router',
+    'set_pipelines_db',
 ]
--- a/api/routes/pipelines.py
+++ b/api/routes/pipelines.py
@@ -0,0 +1,560 @@
+#!/usr/bin/env python3
+"""
+Generic Pipeline API endpoints.
+
+Provides a unified API for all registered pipelines:
+- List available pipelines
+- Get pipeline details and metadata
+- Execute pipelines
+- Get dashboard configuration
+- Get widget data
+- List execution history
+"""
+
+import logging
+from typing import Any
+
+import asyncpg
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel, Field
+
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+
+# Router for every endpoint in this module; all paths are served under
+# /api/pipelines and grouped under the "pipelines" tag.
+router = APIRouter(prefix="/api/pipelines", tags=["pipelines"])
+
+# Database pool (set by main server through set_database()). Stays None until
+# then; endpoints that require it answer 503 while it is unset.
+_pool: asyncpg.Pool | None = None
+
+# Initialized pipeline instances keyed by pipeline_id. Filled lazily by
+# _get_pipeline_instance() and evicted per pipeline by the enable/disable
+# endpoints.
+_pipeline_instances: dict[str, Any] = {}
+
+
+def set_database(pool: asyncpg.Pool) -> None:
+    """
+    Set the database pool for pipeline operations.
+
+    Rebinds the module-level ``_pool`` that the endpoints in this module read.
+    While ``_pool`` is unset, the endpoints that check it respond with
+    HTTP 503 ("Database not initialized").
+
+    Args:
+        pool: Connection pool used for all subsequent queries.
+    """
+    global _pool
+    _pool = pool
+
+
+# ==================== Pydantic Models ====================
+
+
+class PipelineInfo(BaseModel):
+    """
+    Summary information about a pipeline.
+
+    Built from one ``pipeline.registry`` row. The endpoints in this module
+    replace NULL ``description``, ``stages`` and ``input_type`` columns with
+    ``""``, ``[]`` and ``"dict"`` respectively before constructing the model,
+    so those fields are never null in a response.
+    """
+
+    # Value of the registry's pipeline_id column.
+    id: str = Field(..., description="Pipeline identifier")
+    name: str = Field(..., description="Display name")
+    description: str = Field(..., description="Human-readable description")
+    version: str = Field(..., description="Semantic version")
+    is_enabled: bool = Field(..., description="Whether pipeline is enabled")
+    stages: list[str] = Field(..., description="Available stages")
+    input_type: str = Field(..., description="Expected input type")
+
+
+class PipelineDetail(PipelineInfo):
+    """
+    Detailed pipeline information.
+
+    Extends :class:`PipelineInfo` with the registry's ``module_path``,
+    ``config`` and timestamps. The timestamps are strings produced by calling
+    ``.isoformat()`` on the row values, or ``None`` when the column is NULL.
+    """
+
+    # Split on the last ":" into module name and class name when the pipeline
+    # is loaded.
+    module_path: str = Field(..., description="Python module path")
+    config: dict[str, Any] | None = Field(None, description="Pipeline configuration")
+    created_at: str | None = Field(None, description="Registration timestamp")
+    updated_at: str | None = Field(None, description="Last update timestamp")
+
+
+class ExecuteRequest(BaseModel):
+    """
+    Request to execute a pipeline.
+
+    Every field is optional. When present, ``job_id`` and ``business_id`` are
+    copied into the input data handed to the pipeline under the keys
+    ``"job_id"`` and ``"business_id"``, and are stored on the execution record.
+    """
+
+    # Converted with uuid.UUID() before being stored, so it must be a UUID string.
+    job_id: str | None = Field(None, description="Job ID to process")
+    business_id: str | None = Field(None, description="Business identifier")
+    # An empty dict is used when this is omitted.
+    input_data: dict[str, Any] | None = Field(None, description="Direct input data")
+    # When omitted, the pipeline's get_stage_names() result is used.
+    stages: list[str] | None = Field(None, description="Stages to run (default: all)")
+    # NOTE(review): not read by the execute endpoint in this module — confirm
+    # whether it is meant to be forwarded to the pipeline.
+    options: dict[str, Any] | None = Field(None, description="Pipeline-specific options")
+
+
+class ExecuteResponse(BaseModel):
+    """
+    Response from pipeline execution.
+
+    ``success``, ``stages_run`` and ``error`` are copied from the attributes of
+    the same names on the object returned by the pipeline's ``process()``.
+    """
+
+    # String form of the UUID4 generated for this run; also the id of the
+    # pipeline.executions row.
+    execution_id: str = Field(..., description="Execution identifier")
+    pipeline_id: str = Field(..., description="Pipeline that was executed")
+    success: bool = Field(..., description="Whether execution succeeded")
+    stages_run: list[str] = Field(..., description="Stages that were run")
+    error: str | None = Field(None, description="Error message if failed")
+
+
+class ExecutionSummary(BaseModel):
+    """
+    Summary of a pipeline execution.
+
+    Built from one ``pipeline.executions`` row. ``id`` and ``job_id`` are the
+    ``str()`` of the row values, NULL stage columns become ``[]``, and the
+    timestamps are ``.isoformat()`` strings (``None`` when the column is NULL).
+    """
+
+    id: str = Field(..., description="Execution identifier")
+    pipeline_id: str = Field(..., description="Pipeline identifier")
+    job_id: str | None = Field(None, description="Associated job ID")
+    business_id: str | None = Field(None, description="Business identifier")
+    # The execute endpoint writes 'running', 'completed' or 'failed'.
+    status: str = Field(..., description="Execution status")
+    stages_requested: list[str] = Field(..., description="Stages requested")
+    stages_completed: list[str] = Field(..., description="Stages completed")
+    error_message: str | None = Field(None, description="Error if failed")
+    started_at: str | None = Field(None, description="Start timestamp")
+    completed_at: str | None = Field(None, description="Completion timestamp")
+    created_at: str | None = Field(None, description="Creation timestamp")
+
+
+class DashboardSectionModel(BaseModel):
+    """
+    Dashboard section configuration.
+
+    Populated from one entry of the ``"sections"`` list returned by a
+    pipeline's ``get_dashboard_config()``. ``id``, ``title`` and ``widgets``
+    are read as required keys; ``description`` and ``collapsed`` are optional.
+    """
+
+    id: str
+    title: str
+    description: str | None = None
+    # Passed through as plain dicts; their structure is not validated here.
+    widgets: list[dict[str, Any]]
+    collapsed: bool | None = None
+
+
+class DashboardConfigModel(BaseModel):
+    """
+    Dashboard configuration for a pipeline.
+
+    Populated from the dict returned by a pipeline's
+    ``get_dashboard_config()``. ``pipeline_id``, ``title`` and ``sections`` are
+    read as required keys; the remaining fields are optional.
+    """
+
+    pipeline_id: str
+    title: str
+    description: str | None = None
+    sections: list[DashboardSectionModel]
+    # NOTE(review): presumably the same "7d"/"30d"/"90d" notation the widget
+    # endpoint accepts for time_range — confirm against the pipeline contract.
+    default_time_range: str | None = None
+    # NOTE(review): the unit is not established in this module — confirm.
+    refresh_interval: int | None = None
+
+
+# ==================== Helper Functions ====================
+
+
+async def _get_pipeline_instance(pipeline_id: str) -> Any:
+    """
+    Get or create the initialized pipeline instance for ``pipeline_id``.
+
+    Instances are cached in ``_pipeline_instances``; a cache hit is returned
+    without touching the database. On a miss the pipeline is looked up in
+    ``pipeline.registry``, its ``module_path`` (``"package.module:ClassName"``)
+    is imported, the class is instantiated with no arguments and
+    ``await instance.initialize()`` is run before the instance is cached.
+
+    Args:
+        pipeline_id: Registry identifier of the pipeline.
+
+    Returns:
+        The cached or newly created pipeline instance.
+
+    Raises:
+        HTTPException: 503 if the database pool is not set, 404 if the
+            pipeline is not registered, 400 if it is disabled, 500 if the
+            import, instantiation or initialization fails.
+    """
+    if pipeline_id in _pipeline_instances:
+        return _pipeline_instances[pipeline_id]
+
+    if not _pool:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    # Look up pipeline in registry
+    async with _pool.acquire() as conn:
+        row = await conn.fetchrow(
+            """
+            SELECT pipeline_id, module_path, is_enabled
+            FROM pipeline.registry
+            WHERE pipeline_id = $1
+            """,
+            pipeline_id,
+        )
+
+    if not row:
+        raise HTTPException(status_code=404, detail=f"Pipeline not found: {pipeline_id}")
+
+    if not row["is_enabled"]:
+        raise HTTPException(status_code=400, detail=f"Pipeline is disabled: {pipeline_id}")
+
+    # Import, instantiate and initialize. Only these steps sit inside the try,
+    # so the cache is written only after all of them succeeded.
+    try:
+        import importlib
+
+        module_path = row["module_path"]
+        # rpartition lets a missing ":" be reported with a clear message
+        # instead of an opaque tuple-unpacking error.
+        module_name, separator, class_name = module_path.rpartition(":")
+        if not separator or not module_name or not class_name:
+            raise ValueError(
+                "module_path must look like 'package.module:ClassName', "
+                f"got {module_path!r}"
+            )
+
+        module = importlib.import_module(module_name)
+        cls = getattr(module, class_name)
+        instance = cls()
+
+        # Initialize the pipeline
+        await instance.initialize()
+
+    except Exception as e:
+        log.exception("Failed to load pipeline %s", pipeline_id)
+        # Chain the cause so the original traceback stays attached.
+        raise HTTPException(
+            status_code=500, detail=f"Failed to load pipeline: {e}"
+        ) from e
+
+    _pipeline_instances[pipeline_id] = instance
+    return instance
+
+
+# ==================== API Endpoints ====================
+
+
+@router.get("/", response_model=list[PipelineInfo])
+async def list_pipelines(
+    enabled_only: bool = Query(True, description="Only return enabled pipelines"),
+) -> list[PipelineInfo]:
+    """
+    List the pipelines stored in ``pipeline.registry``, ordered by name.
+
+    With ``enabled_only`` (the default) only rows whose ``is_enabled`` flag is
+    set are returned; otherwise every registered pipeline is returned. Each
+    entry carries name, version, available stages and enabled status.
+
+    Raises:
+        HTTPException: 503 when the database pool has not been set.
+    """
+    if not _pool:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    # Choose the statement up front; the two variants differ only in the
+    # WHERE clause.
+    if enabled_only:
+        query = """
+                SELECT pipeline_id, name, description, version,
+                       is_enabled, stages, input_type
+                FROM pipeline.registry
+                WHERE is_enabled = TRUE
+                ORDER BY name
+                """
+    else:
+        query = """
+                SELECT pipeline_id, name, description, version,
+                       is_enabled, stages, input_type
+                FROM pipeline.registry
+                ORDER BY name
+                """
+
+    async with _pool.acquire() as conn:
+        records = await conn.fetch(query)
+
+    summaries: list[PipelineInfo] = []
+    for record in records:
+        summaries.append(
+            PipelineInfo(
+                id=record["pipeline_id"],
+                name=record["name"],
+                # NULL columns fall back to neutral defaults.
+                description=record["description"] or "",
+                version=record["version"],
+                is_enabled=record["is_enabled"],
+                stages=record["stages"] or [],
+                input_type=record["input_type"] or "dict",
+            )
+        )
+    return summaries
+
+
+@router.get("/{pipeline_id}", response_model=PipelineDetail)
+async def get_pipeline(pipeline_id: str) -> PipelineDetail:
+    """
+    Get detailed information about a pipeline.
+
+    Includes metadata, configuration, and registration timestamps, all read
+    from the pipeline's ``pipeline.registry`` row.
+
+    Args:
+        pipeline_id: Registry identifier of the pipeline.
+
+    Raises:
+        HTTPException: 503 when the database pool has not been set, 404 when
+            no registry row matches ``pipeline_id``.
+    """
+    if not _pool:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    async with _pool.acquire() as conn:
+        row = await conn.fetchrow(
+            """
+            SELECT pipeline_id, name, description, version, module_path,
+                   is_enabled, stages, input_type, config,
+                   created_at, updated_at
+            FROM pipeline.registry
+            WHERE pipeline_id = $1
+            """,
+            pipeline_id,
+        )
+
+    if not row:
+        raise HTTPException(status_code=404, detail=f"Pipeline not found: {pipeline_id}")
+
+    config = row["config"]
+    if isinstance(config, str):
+        # asyncpg hands json/jsonb values back as text unless a type codec is
+        # registered on the connection; decode so the value fits the model's
+        # dict-typed field. Already-decoded values pass through untouched.
+        # NOTE(review): assumes config is a json/jsonb column — confirm
+        # against the migration.
+        import json
+
+        config = json.loads(config)
+
+    return PipelineDetail(
+        id=row["pipeline_id"],
+        name=row["name"],
+        description=row["description"] or "",
+        version=row["version"],
+        is_enabled=row["is_enabled"],
+        stages=row["stages"] or [],
+        input_type=row["input_type"] or "dict",
+        module_path=row["module_path"],
+        config=config,
+        created_at=row["created_at"].isoformat() if row["created_at"] else None,
+        updated_at=row["updated_at"].isoformat() if row["updated_at"] else None,
+    )
+
+
+@router.post("/{pipeline_id}/execute", response_model=ExecuteResponse)
+async def execute_pipeline(
+    pipeline_id: str,
+    request: ExecuteRequest,
+) -> ExecuteResponse:
+    """
+    Execute a pipeline and record the run in ``pipeline.executions``.
+
+    The pipeline can be executed with:
+    - A job_id to process an existing scraper job
+    - Direct input_data for testing
+    - Specific stages to run (default: all)
+
+    An execution row is inserted with status ``'running'`` before the pipeline
+    is called and updated to ``'completed'`` or ``'failed'`` afterwards.
+
+    Args:
+        pipeline_id: Registry identifier of the pipeline to run.
+        request: Execution parameters.
+
+    Returns:
+        The execution id together with the pipeline's reported outcome.
+
+    Raises:
+        HTTPException: 422 when ``job_id`` is not a valid UUID, 500 when the
+            pipeline raises, plus whatever ``_get_pipeline_instance`` raises
+            (503/404/400/500).
+    """
+    import uuid
+
+    pipeline = await _get_pipeline_instance(pipeline_id)
+
+    # Parse job_id up front: a malformed value is a client error and must not
+    # surface as an unhandled ValueError (HTTP 500) while building the INSERT.
+    job_uuid = None
+    if request.job_id:
+        try:
+            job_uuid = uuid.UUID(request.job_id)
+        except ValueError as e:
+            raise HTTPException(
+                status_code=422,
+                detail=f"Invalid job_id (expected a UUID): {request.job_id}",
+            ) from e
+
+    # Prepare input data. Work on a copy so the request model's own dict is
+    # not mutated.
+    input_data = dict(request.input_data or {})
+    if request.job_id:
+        input_data["job_id"] = request.job_id
+    if request.business_id:
+        input_data["business_id"] = request.business_id
+
+    # Create execution record
+    execution_uuid = uuid.uuid4()
+    execution_id = str(execution_uuid)
+    stages = request.stages or pipeline.get_stage_names()
+
+    # NOTE(review): started_at is not written here although the executions
+    # listing reads it — confirm the column has a database default.
+    if _pool:
+        async with _pool.acquire() as conn:
+            await conn.execute(
+                """
+                INSERT INTO pipeline.executions (
+                    id, pipeline_id, job_id, business_id,
+                    status, stages_requested, created_at
+                )
+                VALUES ($1, $2, $3, $4, 'running', $5, NOW())
+                """,
+                execution_uuid,
+                pipeline_id,
+                job_uuid,
+                request.business_id,
+                stages,
+            )
+
+    try:
+        # Execute pipeline
+        result = await pipeline.process(input_data, stages=stages)
+
+        # Update execution status
+        if _pool:
+            async with _pool.acquire() as conn:
+                await conn.execute(
+                    """
+                    UPDATE pipeline.executions
+                    SET status = $2, stages_completed = $3, error_message = $4,
+                        completed_at = NOW()
+                    WHERE id = $1
+                    """,
+                    execution_uuid,
+                    "completed" if result.success else "failed",
+                    result.stages_run,
+                    result.error,
+                )
+
+        return ExecuteResponse(
+            execution_id=execution_id,
+            pipeline_id=pipeline_id,
+            success=result.success,
+            stages_run=result.stages_run,
+            error=result.error,
+        )
+
+    except Exception as e:
+        log.exception(f"Pipeline execution failed: {e}")
+
+        # Update execution status
+        if _pool:
+            async with _pool.acquire() as conn:
+                await conn.execute(
+                    """
+                    UPDATE pipeline.executions
+                    SET status = 'failed', error_message = $2, completed_at = NOW()
+                    WHERE id = $1
+                    """,
+                    execution_uuid,
+                    str(e),
+                )
+
+        # Chain the cause so the original traceback stays attached.
+        raise HTTPException(status_code=500, detail=f"Execution failed: {e}") from e
+
+
+@router.get("/{pipeline_id}/executions", response_model=list[ExecutionSummary])
+async def list_executions(
+    pipeline_id: str,
+    status: str | None = Query(None, description="Filter by status"),
+    limit: int = Query(50, ge=1, le=200, description="Max results"),
+    offset: int = Query(0, ge=0, description="Offset for pagination"),
+) -> list[ExecutionSummary]:
+    """
+    Return a pipeline's execution history, newest first.
+
+    Rows come from ``pipeline.executions`` ordered by ``created_at``
+    descending. ``status`` narrows the result when it is a non-empty string;
+    ``limit`` and ``offset`` paginate it.
+
+    Raises:
+        HTTPException: 503 when the database pool has not been set.
+    """
+    if not _pool:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    # Positional arguments and the predicates that refer to them grow together,
+    # so every placeholder number is simply the argument's 1-based position.
+    args: list[Any] = [pipeline_id]
+    filters = ["pipeline_id = $1"]
+    if status:
+        args.append(status)
+        filters.append(f"status = ${len(args)}")
+
+    predicate = " AND ".join(filters)
+    limit_pos = len(args) + 1
+    offset_pos = limit_pos + 1
+    query = f"""
+            SELECT id, pipeline_id, job_id, business_id, status,
+                   stages_requested, stages_completed, error_message,
+                   started_at, completed_at, created_at
+            FROM pipeline.executions
+            WHERE {predicate}
+            ORDER BY created_at DESC
+            LIMIT ${limit_pos} OFFSET ${offset_pos}
+            """
+
+    async with _pool.acquire() as conn:
+        records = await conn.fetch(query, *args, limit, offset)
+
+    def _iso(value: Any) -> str | None:
+        # Missing timestamps stay None instead of failing on .isoformat().
+        return value.isoformat() if value else None
+
+    history: list[ExecutionSummary] = []
+    for record in records:
+        history.append(
+            ExecutionSummary(
+                id=str(record["id"]),
+                pipeline_id=record["pipeline_id"],
+                job_id=str(record["job_id"]) if record["job_id"] else None,
+                business_id=record["business_id"],
+                status=record["status"],
+                stages_requested=record["stages_requested"] or [],
+                stages_completed=record["stages_completed"] or [],
+                error_message=record["error_message"],
+                started_at=_iso(record["started_at"]),
+                completed_at=_iso(record["completed_at"]),
+                created_at=_iso(record["created_at"]),
+            )
+        )
+    return history
+
+
+@router.get("/{pipeline_id}/dashboard", response_model=DashboardConfigModel)
+async def get_dashboard_config(pipeline_id: str) -> DashboardConfigModel:
+    """
+    Return the dashboard layout a pipeline declares.
+
+    The pipeline's ``get_dashboard_config()`` dict is converted into the
+    response model that the frontend uses to render the pipeline's dynamic
+    dashboard.
+
+    Raises:
+        HTTPException: 500 when the pipeline's config cannot be obtained or
+            converted, plus whatever ``_get_pipeline_instance`` raises.
+    """
+    pipeline = await _get_pipeline_instance(pipeline_id)
+
+    try:
+        raw = pipeline.get_dashboard_config()
+
+        # Keys are read in a fixed order so that the first missing key is the
+        # one reported in the error detail.
+        dashboard_id = raw["pipeline_id"]
+        dashboard_title = raw["title"]
+        dashboard_description = raw.get("description")
+
+        section_models = []
+        for section in raw["sections"]:
+            section_models.append(
+                DashboardSectionModel(
+                    id=section["id"],
+                    title=section["title"],
+                    description=section.get("description"),
+                    widgets=section["widgets"],
+                    collapsed=section.get("collapsed"),
+                )
+            )
+
+        return DashboardConfigModel(
+            pipeline_id=dashboard_id,
+            title=dashboard_title,
+            description=dashboard_description,
+            sections=section_models,
+            default_time_range=raw.get("default_time_range"),
+            refresh_interval=raw.get("refresh_interval"),
+        )
+
+    except Exception as e:
+        log.exception(f"Failed to get dashboard config: {e}")
+        raise HTTPException(
+            status_code=500, detail=f"Failed to get dashboard config: {e}"
+        )
+
+
+@router.get("/{pipeline_id}/widgets/{widget_id}")
+async def get_widget_data(
+    pipeline_id: str,
+    widget_id: str,
+    business_id: str | None = Query(None, description="Filter by business"),
+    time_range: str = Query("30d", description="Time range (e.g., 7d, 30d, 90d)"),
+    page: int = Query(1, ge=1, description="Page number for paginated widgets"),
+    page_size: int = Query(10, ge=1, le=100, description="Items per page"),
+) -> dict[str, Any]:
+    """
+    Fetch the data behind one dashboard widget.
+
+    The query parameters are bundled into a dict and handed, together with
+    ``widget_id``, to the pipeline's ``get_widget_data()``; its result is
+    returned unchanged. The response format depends on the widget type.
+    Common formats:
+    - stat_card: {value, trend, ...}
+    - chart: {data: [{x, y, ...}, ...]}
+    - table: {data: [...], total: n}
+
+    Raises:
+        HTTPException: 500 when the pipeline raises, plus whatever
+            ``_get_pipeline_instance`` raises.
+    """
+    pipeline = await _get_pipeline_instance(pipeline_id)
+
+    try:
+        widget_params = dict(
+            business_id=business_id,
+            time_range=time_range,
+            page=page,
+            page_size=page_size,
+        )
+        return await pipeline.get_widget_data(widget_id, widget_params)
+
+    except Exception as e:
+        log.exception(f"Failed to get widget data: {e}")
+        raise HTTPException(
+            status_code=500, detail=f"Failed to get widget data: {e}"
+        )
+
+
+@router.post("/{pipeline_id}/enable")
+async def enable_pipeline(pipeline_id: str) -> dict[str, str]:
+    """
+    Enable a disabled pipeline.
+
+    Sets ``is_enabled`` to TRUE and refreshes ``updated_at`` on the registry
+    row, then evicts any cached instance so the next request loads it again.
+
+    Raises:
+        HTTPException: 503 when the database pool has not been set, 404 when
+            no registry row matches ``pipeline_id``.
+    """
+    if not _pool:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    statement = """
+            UPDATE pipeline.registry
+            SET is_enabled = TRUE, updated_at = NOW()
+            WHERE pipeline_id = $1
+            """
+    async with _pool.acquire() as conn:
+        status_tag = await conn.execute(statement, pipeline_id)
+
+    # The last token of the command status (e.g. "UPDATE 1") is the number of
+    # rows that were touched.
+    rows_affected = status_tag.split()[-1]
+    if rows_affected == "0":
+        raise HTTPException(status_code=404, detail=f"Pipeline not found: {pipeline_id}")
+
+    # Clear cached instance
+    _pipeline_instances.pop(pipeline_id, None)
+
+    return {"status": "enabled", "pipeline_id": pipeline_id}
+
+
+@router.post("/{pipeline_id}/disable")
+async def disable_pipeline(pipeline_id: str) -> dict[str, str]:
+    """
+    Disable a pipeline.
+
+    Sets ``is_enabled`` to FALSE and refreshes ``updated_at`` on the registry
+    row, then evicts any cached instance so it is no longer served from the
+    cache.
+
+    Raises:
+        HTTPException: 503 when the database pool has not been set, 404 when
+            no registry row matches ``pipeline_id``.
+    """
+    if not _pool:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    statement = """
+            UPDATE pipeline.registry
+            SET is_enabled = FALSE, updated_at = NOW()
+            WHERE pipeline_id = $1
+            """
+    async with _pool.acquire() as conn:
+        status_tag = await conn.execute(statement, pipeline_id)
+
+    # The last token of the command status (e.g. "UPDATE 1") is the number of
+    # rows that were touched.
+    rows_affected = status_tag.split()[-1]
+    if rows_affected == "0":
+        raise HTTPException(status_code=404, detail=f"Pipeline not found: {pipeline_id}")
+
+    # Clear cached instance
+    _pipeline_instances.pop(pipeline_id, None)
+
+    return {"status": "disabled", "pipeline_id": pipeline_id}
+
+
+@router.get("/{pipeline_id}/health")
+async def pipeline_health(pipeline_id: str) -> dict[str, Any]:
+    """
+    Check pipeline health.
+
+    Returns the dict produced by the pipeline's ``health_check()`` with
+    ``pipeline_id`` added. If the health check raises, the failure is logged
+    and reported in the response body as ``healthy: False`` with the error
+    text instead of an HTTP error.
+
+    Args:
+        pipeline_id: Registry identifier of the pipeline.
+
+    Raises:
+        HTTPException: Whatever ``_get_pipeline_instance`` raises
+            (503/404/400/500); failures of the check itself are not raised.
+    """
+    pipeline = await _get_pipeline_instance(pipeline_id)
+
+    try:
+        health = await pipeline.health_check()
+        return {
+            "pipeline_id": pipeline_id,
+            **health,
+        }
+    except Exception as e:
+        # Still answer with a payload (best-effort endpoint), but leave a
+        # traceback in the logs like the other handlers in this module do.
+        log.exception("Health check failed for pipeline %s", pipeline_id)
+        return {
+            "pipeline_id": pipeline_id,
+            "healthy": False,
+            "error": str(e),
+        }