whyrating-engine-legacy/packages/reviewiq-pipeline/scripts/generate_sector_briefs.py

#!/usr/bin/env python3
"""
Wave 0: Sector Brief Generator

Generates alignment context briefs for each sector.
These briefs inform Wave 1 and Wave 2 primitive config generation.

Usage:
    python generate_sector_briefs.py                       # Generate all sectors
    python generate_sector_briefs.py --sector FOOD_DINING  # Generate one sector
    python generate_sector_briefs.py --dry-run             # Show what would be generated
    python generate_sector_briefs.py --validate            # Validate existing briefs
"""

import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path

try:
    from openai import OpenAI
except ImportError:
    print("ERROR: openai package required. Install with: pip install openai")
    sys.exit(1)


# Prompt sent as the user message when generating a sector brief.
# generate_sector_brief() fills it via str.format() with four fields:
# sector_code, sector_name, description and business_types.
# Literal JSON braces in the example structure are doubled ({{ and }}) so
# that str.format() emits them as single braces instead of treating them as
# replacement fields.
PROMPT_TEMPLATE = '''You are an expert in customer experience analysis across industries.

Your task: Generate a **sector brief** for the "{sector_name}" sector.

This brief will be used to align classification agents with industry-specific context.
It describes what customers care about — NOT how to classify, NOT what primitives to use.

## Sector Information

- **Code**: {sector_code}
- **Name**: {sector_name}
- **Description**: {description}
- **Sample Business Types**: {business_types}

## Output Requirements

Generate a JSON object with this exact structure:

```json
{{
  "sector_code": "{sector_code}",
  "sector_name": "{sector_name}",
  "generated_at": "<ISO timestamp>",
  "version": "1.0",

  "what_customers_judge": {{
    "description": "The primary dimensions customers evaluate in this sector",
    "items": [
      {{
        "aspect": "string (2-5 words)",
        "importance": "critical | high | moderate",
        "why_it_matters": "string (1 sentence)"
      }}
    ]
  }},

  "critical_pain_points": {{
    "description": "What damages reputation most severely",
    "items": [
      {{
        "pain_point": "string (2-5 words)",
        "typical_language": ["phrases customers actually use in reviews"],
        "reputation_impact": "severe | significant | moderate"
      }}
    ]
  }},

  "common_praise": {{
    "description": "What earns customer loyalty and positive reviews",
    "items": [
      {{
        "praise_area": "string (2-5 words)",
        "typical_language": ["phrases customers actually use in reviews"],
        "loyalty_impact": "high | moderate"
      }}
    ]
  }},

  "industry_terminology": {{
    "description": "Domain-specific vocabulary",
    "staff_terms": ["terms for staff roles in this sector"],
    "product_terms": ["terms for products/services"],
    "process_terms": ["terms for processes/interactions"],
    "quality_terms": ["positive quality descriptors"],
    "problem_terms": ["negative quality descriptors"]
  }},

  "mode_specific_concerns": {{
    "description": "Different service modes have different priorities",
    "modes": [
      {{
        "mode": "string (e.g., 'In-person', 'Online', 'Phone')",
        "primary_concerns": ["top concerns for this mode"],
        "unique_pain_points": ["pain points specific to this mode"]
      }}
    ]
  }},

  "what_is_actionable": {{
    "description": "Feedback businesses can act on",
    "actionable_examples": [
      {{
        "feedback_type": "string",
        "example": "string (realistic review excerpt)",
        "action_owner": "role/team that can fix it"
      }}
    ],
    "not_actionable_examples": [
      {{
        "feedback_type": "string",
        "example": "string (realistic review excerpt)",
        "why_not_actionable": "string"
      }}
    ]
  }},

  "sector_specific_signals": {{
    "description": "Signals with sector-specific meaning",
    "examples": [
      {{
        "signal": "string (word or phrase)",
        "meaning_in_this_sector": "string",
        "contrast_with": "how it differs in other sectors"
      }}
    ]
  }}
}}
```

## Critical Rules

1. **Use realistic review language** in `typical_language` arrays - actual phrases customers write
2. **Include 4-8 items** per array (not too few, not excessive)
3. **Be sector-specific** - don't use generic phrases that apply to all businesses
4. **Include appropriate modes** - only modes that actually exist in this sector
5. **NO primitive codes, priorities, weights, or solutions**
6. **Focus on WHAT customers care about**, not HOW to classify it

Return ONLY the JSON object, no markdown formatting or explanation.'''


def load_sectors(data_path: Path) -> list[dict]:
    """Load sector definitions from a JSON file.

    Args:
        data_path: Path to a JSON document whose top-level object has a
            ``"sectors"`` key.

    Returns:
        The value stored under the ``"sectors"`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the document has no ``"sectors"`` key.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII sector names load the same way everywhere.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)
    return data["sectors"]


def generate_sector_brief(client: OpenAI, sector: dict, model: str) -> dict:
    """Generate a sector brief for one sector via the chat completions API.

    Args:
        client: OpenAI client used to issue the request.
        sector: Sector definition; must provide ``sector_code``,
            ``sector_name``, ``description`` and ``sample_business_types``
            (an iterable of strings, joined with ", " for the prompt).
        model: Name of the model to request.

    Returns:
        The parsed brief. ``sector_code``, ``sector_name``, ``generated_at``
        and ``version`` are always overwritten locally, so they never depend
        on what the model echoed back.

    Raises:
        KeyError: If ``sector`` lacks one of the required fields.
        ValueError: If the model returns no content, invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or JSON
            whose top-level value is not an object.
    """
    # Local import: the file header only imports the ``datetime`` class.
    from datetime import timezone

    prompt = PROMPT_TEMPLATE.format(
        sector_code=sector["sector_code"],
        sector_name=sector["sector_name"],
        description=sector["description"],
        business_types=", ".join(sector["sample_business_types"]),
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert customer experience analyst. Return only valid JSON, no markdown."
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
        max_tokens=4000,
        response_format={"type": "json_object"},
    )

    choice = response.choices[0]
    content = choice.message.content
    # The message content may be absent (e.g. a refusal); fail with a clear
    # message instead of an AttributeError on ``None.strip()``.
    if content is None or not content.strip():
        finish_reason = getattr(choice, "finish_reason", None)
        raise ValueError(
            f"Model returned no content (finish_reason={finish_reason!r})"
        )

    # Parse JSON
    brief = json.loads(content.strip())
    if not isinstance(brief, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(brief).__name__}"
        )

    # Ensure required fields
    brief["sector_code"] = sector["sector_code"]
    brief["sector_name"] = sector["sector_name"]
    # datetime.utcnow() is deprecated; build an aware UTC timestamp and keep
    # the original "...Z" suffix format.
    brief["generated_at"] = (
        datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    )
    brief["version"] = "1.0"

    return brief


def validate_brief(brief: dict) -> list[str]:
    """Validate a sector brief and return a list of human-readable issues.

    Checks performed, in order:
      1. All required top-level sections are present.
      2. Item counts: ``what_customers_judge`` needs 3-10 items;
         ``critical_pain_points`` and ``common_praise`` need at least 3.
      3. The serialized brief does not contain any forbidden term
         (case-insensitive substring match).

    Malformed structure (a section that is not an object, or an ``items``
    value that is not a list) is reported as an issue rather than raising,
    so one bad brief cannot abort a batch validation run.

    Args:
        brief: Parsed brief, normally a dict loaded from JSON.

    Returns:
        A list of issue descriptions; empty when the brief is valid.
    """
    if not isinstance(brief, dict):
        return ["Brief must be a JSON object"]

    required_keys = [
        "what_customers_judge",
        "critical_pain_points",
        "common_praise",
        "industry_terminology",
        "mode_specific_concerns",
        "what_is_actionable",
        "sector_specific_signals",
    ]
    issues = [
        f"Missing required key: {key}"
        for key in required_keys
        if key not in brief
    ]

    # (section, minimum item count, maximum item count or None for no cap)
    item_limits = [
        ("what_customers_judge", 3, 10),
        ("critical_pain_points", 3, None),
        ("common_praise", 3, None),
    ]
    for key, minimum, maximum in item_limits:
        if key not in brief:
            continue  # already reported as missing above
        section = brief[key]
        # A missing "items" key counts as zero items; any non-list value is
        # reported as malformed instead of crashing on len().
        items = section.get("items", []) if isinstance(section, dict) else None
        if not isinstance(items, list):
            issues.append(f"{key} must be an object with an 'items' list")
            continue
        count = len(items)
        if count < minimum:
            issues.append(f"{key} has only {count} items (need {minimum}+)")
        if maximum is not None and count > maximum:
            issues.append(f"{key} has {count} items (max {maximum})")

    # Check for forbidden content. "solution" is deliberately not listed:
    # it can legitimately appear in context, so it is never flagged.
    text = json.dumps(brief).lower()
    forbidden = ["priority", "weight", "primitive", "enabled", "disabled"]
    issues.extend(
        f"Contains potentially forbidden term: {word}"
        for word in forbidden
        if word in text
    )

    return issues


def save_brief(brief: dict, output_dir: Path) -> Path:
    """Write *brief* as indented JSON inside *output_dir*.

    The directory (including missing parents) is created on demand. The file
    is named after the lower-cased ``sector_code`` with a ``_brief.json``
    suffix.

    Returns:
        Path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    code = brief["sector_code"].lower()
    destination = output_dir.joinpath(code + "_brief.json")

    with destination.open("w") as handle:
        json.dump(brief, handle, indent=2)

    return destination


def validate_existing_briefs(output_dir: Path) -> None:
    """Validate every ``*_brief.json`` file in *output_dir* and print a report.

    Each file is listed with a pass/fail mark followed by its issues. A file
    that cannot be read or parsed is reported as a failing brief instead of
    aborting the run, so the remaining files are still checked.

    Args:
        output_dir: Directory expected to contain the brief files.
    """
    if not output_dir.exists():
        print(f"Output directory does not exist: {output_dir}")
        return

    files = sorted(output_dir.glob("*_brief.json"))
    if not files:
        print("No brief files found")
        return

    print(f"Validating {len(files)} brief files...\n")

    all_valid = True
    for filepath in files:
        try:
            # Decode explicitly as UTF-8 instead of the locale default.
            with open(filepath, encoding="utf-8") as f:
                brief = json.load(f)
        except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
            issues = [f"Could not read brief: {exc}"]
        else:
            if isinstance(brief, dict):
                issues = validate_brief(brief)
            else:
                issues = ["Top-level JSON value is not an object"]

        status = "✓" if not issues else "✗"
        print(f"{status} {filepath.name}")

        if issues:
            all_valid = False
            for issue in issues:
                print(f"    - {issue}")

    print()
    if all_valid:
        print("All briefs valid!")
    else:
        print("Some briefs have issues.")


def main():
    """Command-line entry point for the sector brief generator.

    Modes, checked in this order:
      * ``--validate``: validate existing briefs in the output directory.
      * ``--dry-run``: list the sectors that would be generated.
      * default: generate, validate and save a brief for each selected
        sector.

    Exits with status 1 when the sector definitions cannot be loaded, the
    requested sector is unknown, ``OPENAI_API_KEY`` is not set, or at least
    one sector failed to generate.
    """
    parser = argparse.ArgumentParser(description="Generate sector briefs for Wave 0")
    parser.add_argument("--sector", help="Generate only this sector code")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--validate", action="store_true", help="Validate existing briefs")
    parser.add_argument("--output-dir", default="data/sector_briefs", help="Output directory")
    parser.add_argument("--model", default="gpt-4o", help="OpenAI model to use")
    args = parser.parse_args()

    # Paths are resolved against the package directory (the parent of this
    # script's directory), not the current working directory.
    script_dir = Path(__file__).parent
    package_dir = script_dir.parent
    data_path = package_dir / "data" / "sectors.json"
    output_dir = package_dir / args.output_dir

    # Validate mode
    if args.validate:
        validate_existing_briefs(output_dir)
        return

    # Load sectors; report a missing or malformed definitions file the same
    # way as the other fatal errors instead of dumping a traceback.
    try:
        sectors = load_sectors(data_path)
    except (OSError, json.JSONDecodeError, KeyError, TypeError) as exc:
        print(f"ERROR: Could not load sectors from {data_path}: {exc}")
        sys.exit(1)
    print(f"Loaded {len(sectors)} sectors")

    # Filter to single sector if specified
    if args.sector:
        sectors = [s for s in sectors if s["sector_code"] == args.sector]
        if not sectors:
            print(f"ERROR: Sector '{args.sector}' not found")
            sys.exit(1)

    if args.dry_run:
        print("\n[DRY RUN] Would generate briefs for:")
        for sector in sectors:
            print(f"  - {sector['sector_code']}: {sector['sector_name']}")
        print(f"\nOutput directory: {output_dir}")
        return

    # Check API key
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: OPENAI_API_KEY environment variable required")
        sys.exit(1)

    # Initialize client
    client = OpenAI(api_key=api_key)
    print(f"Using model: {args.model}")

    # Generate briefs
    results = {"success": [], "failed": []}

    for i, sector in enumerate(sectors, 1):
        print(f"\n[{i}/{len(sectors)}] Generating brief for: {sector['sector_name']}")

        try:
            brief = generate_sector_brief(client, sector, args.model)

            # Validation issues are warnings only: the brief is still saved.
            issues = validate_brief(brief)
            if issues:
                print("  Warnings:")
                for issue in issues:
                    print(f"    - {issue}")

            # Save
            output_path = save_brief(brief, output_dir)
            print(f"  ✓ Saved to: {output_path}")
            results["success"].append(sector["sector_code"])

        except Exception as e:
            # Top-level boundary: record the failure and continue with the
            # remaining sectors; the exit status reflects it below.
            print(f"  ✗ FAILED: {e}")
            results["failed"].append(sector["sector_code"])

    # Summary
    separator = "=" * 60
    print(f"\n{separator}")
    print("SUMMARY")
    print(separator)
    print(f"Success: {len(results['success'])}")
    print(f"Failed:  {len(results['failed'])}")

    if results["failed"]:
        print(f"\nFailed sectors: {', '.join(results['failed'])}")
        sys.exit(1)


if __name__ == "__main__":
    main()