whyrating-engine-legacy/packages/reviewiq-pipeline/run_classification.py

#!/usr/bin/env python
"""
Run classification pipeline for a scraping job.

Usage:
    python run_classification.py 22c747a6-b913-4ae4-82bc-14b4195008b6
"""

import asyncio
import logging
import os
import sys
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("run_classification")


async def run_pipeline(job_id: str):
    """Run the classification pipeline for a job."""
    from reviewiq_pipeline import Pipeline
    from reviewiq_pipeline.config import Config

    # Get database URL from environment or use default
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql://scraper:scraper123@localhost:5437/scraper"
    )

    logger.info(f"Processing job {job_id}")

    # Initialize pipeline
    config = Config(
        database_url=database_url,
        llm_provider="anthropic",
        llm_model="claude-sonnet-4-5-20250929",
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        anthropic_api_key="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",
        classification_batch_size=25,
        classification_max_concurrent=5,
        classification_target_utilization=0.70,
    )

    pipeline = Pipeline(config)

    try:
        await pipeline.initialize()
        logger.info("Pipeline initialized")

        # Run all stages (normalize, classify, route, aggregate)
        # Just pass job_id - pipeline will fetch and transform reviews from database
        logger.info("Starting pipeline execution...")
        start_time = datetime.now()

        result = await pipeline.process(
            {"job_id": job_id},
            stages=["normalize", "classify", "route", "aggregate"],
        )

        elapsed = (datetime.now() - start_time).total_seconds()

        # Print results
        if result.success:
            logger.info(f"Pipeline completed successfully in {elapsed:.1f}s")
        else:
            logger.warning(f"Pipeline completed with errors in {elapsed:.1f}s")
            if result.error:
                logger.error(f"Error: {result.error}")

        # Stage summaries
        for stage_name, stage_result in result.stage_results.items():
            # Handle both object and dict access
            success = getattr(stage_result, 'success', None) or stage_result.get('success', False)
            data = getattr(stage_result, 'data', None) or stage_result.get('data', {})
            error = getattr(stage_result, 'error', None) or stage_result.get('error')
            duration_ms = getattr(stage_result, 'duration_ms', None) or stage_result.get('duration_ms', 0)

            if success:
                stats = data.get("stats", {}) if data else {}

                if stage_name == "normalize":
                    logger.info(f"  Stage 1 (Normalize): {stats.get('output_count', '?')} reviews")
                elif stage_name == "classify":
                    logger.info(
                        f"  Stage 2 (Classify): {stats.get('success_count', '?')} reviews, "
                        f"{stats.get('total_spans', '?')} spans, "
                        f"${stats.get('llm_cost_usd', 0):.4f} LLM cost"
                    )
                elif stage_name == "route":
                    logger.info(
                        f"  Stage 3 (Route): {stats.get('spans_routed', '?')} spans, "
                        f"{stats.get('issues_created', '?')} issues"
                    )
                elif stage_name == "aggregate":
                    logger.info(f"  Stage 4 (Aggregate): {stats.get('facts_upserted', '?')} facts")

                logger.info(f"    Duration: {duration_ms}ms")
            else:
                logger.error(f"  {stage_name}: FAILED - {error}")

        return result

    except Exception as e:
        logger.exception(f"Pipeline failed: {e}")
        raise
    finally:
        await pipeline.close()


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python run_classification.py <job_id>")
        sys.exit(1)

    job_id = sys.argv[1]

    # Validate UUID format
    import uuid
    try:
        uuid.UUID(job_id)
    except ValueError:
        print(f"Invalid job ID format: {job_id}")
        sys.exit(1)

    result = asyncio.run(run_pipeline(job_id))

    if result and not result.success:
        sys.exit(1)