Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
+++ b/packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python3
+"""
+Wave 0: Sector Brief Generator
+
+Generates alignment context briefs for each sector.
+These briefs inform Wave 1 and Wave 2 primitive config generation.
+
+Usage:
+    python generate_sector_briefs.py                       # Generate all sectors
+    python generate_sector_briefs.py --sector FOOD_DINING  # Generate one sector
+    python generate_sector_briefs.py --dry-run             # Show what would be generated
+    python generate_sector_briefs.py --validate            # Validate existing briefs
+"""
+
+import argparse
+import json
+import os
+import sys
+from datetime import datetime
+from pathlib import Path
+
+try:
+    from openai import OpenAI
+except ImportError:
+    print("ERROR: openai package required. Install with: pip install openai")
+    sys.exit(1)
+
+
+# User-message prompt for one sector brief.
+# generate_sector_brief() renders it with str.format(), filling the
+# {sector_code}, {sector_name}, {description} and {business_types}
+# placeholders. Every literal JSON brace in the schema example is doubled
+# ({{ / }}) so that str.format() emits it as a single brace instead of
+# treating it as a replacement field.
+PROMPT_TEMPLATE = '''You are an expert in customer experience analysis across industries.
+
+Your task: Generate a **sector brief** for the "{sector_name}" sector.
+
+This brief will be used to align classification agents with industry-specific context.
+It describes what customers care about — NOT how to classify, NOT what primitives to use.
+
+## Sector Information
+
+- **Code**: {sector_code}
+- **Name**: {sector_name}
+- **Description**: {description}
+- **Sample Business Types**: {business_types}
+
+## Output Requirements
+
+Generate a JSON object with this exact structure:
+
+```json
+{{
+  "sector_code": "{sector_code}",
+  "sector_name": "{sector_name}",
+  "generated_at": "<ISO timestamp>",
+  "version": "1.0",
+
+  "what_customers_judge": {{
+    "description": "The primary dimensions customers evaluate in this sector",
+    "items": [
+      {{
+        "aspect": "string (2-5 words)",
+        "importance": "critical | high | moderate",
+        "why_it_matters": "string (1 sentence)"
+      }}
+    ]
+  }},
+
+  "critical_pain_points": {{
+    "description": "What damages reputation most severely",
+    "items": [
+      {{
+        "pain_point": "string (2-5 words)",
+        "typical_language": ["phrases customers actually use in reviews"],
+        "reputation_impact": "severe | significant | moderate"
+      }}
+    ]
+  }},
+
+  "common_praise": {{
+    "description": "What earns customer loyalty and positive reviews",
+    "items": [
+      {{
+        "praise_area": "string (2-5 words)",
+        "typical_language": ["phrases customers actually use in reviews"],
+        "loyalty_impact": "high | moderate"
+      }}
+    ]
+  }},
+
+  "industry_terminology": {{
+    "description": "Domain-specific vocabulary",
+    "staff_terms": ["terms for staff roles in this sector"],
+    "product_terms": ["terms for products/services"],
+    "process_terms": ["terms for processes/interactions"],
+    "quality_terms": ["positive quality descriptors"],
+    "problem_terms": ["negative quality descriptors"]
+  }},
+
+  "mode_specific_concerns": {{
+    "description": "Different service modes have different priorities",
+    "modes": [
+      {{
+        "mode": "string (e.g., 'In-person', 'Online', 'Phone')",
+        "primary_concerns": ["top concerns for this mode"],
+        "unique_pain_points": ["pain points specific to this mode"]
+      }}
+    ]
+  }},
+
+  "what_is_actionable": {{
+    "description": "Feedback businesses can act on",
+    "actionable_examples": [
+      {{
+        "feedback_type": "string",
+        "example": "string (realistic review excerpt)",
+        "action_owner": "role/team that can fix it"
+      }}
+    ],
+    "not_actionable_examples": [
+      {{
+        "feedback_type": "string",
+        "example": "string (realistic review excerpt)",
+        "why_not_actionable": "string"
+      }}
+    ]
+  }},
+
+  "sector_specific_signals": {{
+    "description": "Signals with sector-specific meaning",
+    "examples": [
+      {{
+        "signal": "string (word or phrase)",
+        "meaning_in_this_sector": "string",
+        "contrast_with": "how it differs in other sectors"
+      }}
+    ]
+  }}
+}}
+```
+
+## Critical Rules
+
+1. **Use realistic review language** in `typical_language` arrays - actual phrases customers write
+2. **Include 4-8 items** per array (not too few, not excessive)
+3. **Be sector-specific** - don't use generic phrases that apply to all businesses
+4. **Include appropriate modes** - only modes that actually exist in this sector
+5. **NO primitive codes, priorities, weights, or solutions**
+6. **Focus on WHAT customers care about**, not HOW to classify it
+
+Return ONLY the JSON object, no markdown formatting or explanation.'''
+
+
+def load_sectors(data_path: Path) -> list[dict]:
+    """Load sector definitions from a JSON file.
+
+    Args:
+        data_path: Path to a JSON document whose top-level object has a
+            ``"sectors"`` key.
+
+    Returns:
+        The value stored under the ``"sectors"`` key.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If the file does not contain valid JSON.
+        KeyError: If the document has no ``"sectors"`` key.
+    """
+    # Decode explicitly as UTF-8 instead of relying on the platform locale,
+    # so non-ASCII sector names load identically on every system.
+    with open(data_path, encoding="utf-8") as f:
+        data = json.load(f)
+    return data["sectors"]
+
+
+def generate_sector_brief(client: OpenAI, sector: dict, model: str) -> dict:
+    """Generate a sector brief using an LLM.
+
+    Args:
+        client: OpenAI client used for the chat-completions request.
+        sector: Sector definition; must provide ``sector_code``,
+            ``sector_name``, ``description`` and ``sample_business_types``.
+        model: Name of the model to query.
+
+    Returns:
+        The parsed brief. ``sector_code``, ``sector_name``, ``generated_at``
+        and ``version`` are overwritten locally, so they never depend on
+        what the model returned.
+
+    Raises:
+        KeyError: If ``sector`` lacks one of the required fields.
+        ValueError: If the response was cut off at the token limit, is
+            empty, is not valid JSON (``json.JSONDecodeError``), or does not
+            decode to a JSON object.
+    """
+    # Function-scope import: only this function needs ``timezone``.
+    from datetime import timezone
+
+    prompt = PROMPT_TEMPLATE.format(
+        sector_code=sector["sector_code"],
+        sector_name=sector["sector_name"],
+        description=sector["description"],
+        business_types=", ".join(sector["sample_business_types"])
+    )
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "system",
+                "content": "You are an expert customer experience analyst. Return only valid JSON, no markdown."
+            },
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0.3,
+        max_tokens=4000,
+        response_format={"type": "json_object"}
+    )
+
+    choice = response.choices[0]
+
+    # A completion that stopped at max_tokens is truncated JSON; report that
+    # cause directly rather than as an opaque parse error.
+    if choice.finish_reason == "length":
+        raise ValueError(
+            f"Response for sector {sector['sector_code']} was truncated at the token limit"
+        )
+
+    # message.content is optional in the API response; guard before .strip().
+    content = choice.message.content
+    if not content:
+        raise ValueError(
+            f"Model returned no content for sector {sector['sector_code']}"
+        )
+
+    # Parse JSON
+    brief = json.loads(content.strip())
+    if not isinstance(brief, dict):
+        raise ValueError(
+            f"Expected a JSON object for sector {sector['sector_code']}, "
+            f"got {type(brief).__name__}"
+        )
+
+    # Ensure required fields
+    brief["sector_code"] = sector["sector_code"]
+    brief["sector_name"] = sector["sector_name"]
+    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
+    # "+00:00" offset is rewritten to "Z" to keep the original output format.
+    brief["generated_at"] = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+    brief["version"] = "1.0"
+
+    return brief
+
+
+def validate_brief(brief: dict) -> list[str]:
+    """Validate a sector brief and return a list of human-readable issues.
+
+    Three kinds of checks are performed:
+
+    * every required top-level section is present;
+    * the ``items`` arrays of selected sections respect their size limits;
+    * the serialized brief does not contain terms the prompt forbids.
+
+    Malformed sections (a section that is not an object, or an ``items``
+    value that is not a list) are reported as issues instead of raising.
+
+    Args:
+        brief: Parsed brief to check.
+
+    Returns:
+        Issue descriptions in check order; an empty list means no issue
+        was found.
+    """
+    issues = []
+
+    required_keys = [
+        "what_customers_judge",
+        "critical_pain_points",
+        "common_praise",
+        "industry_terminology",
+        "mode_specific_concerns",
+        "what_is_actionable",
+        "sector_specific_signals"
+    ]
+
+    for key in required_keys:
+        if key not in brief:
+            issues.append(f"Missing required key: {key}")
+
+    # (section, minimum item count, maximum item count or None for no cap)
+    item_limits = [
+        ("what_customers_judge", 3, 10),
+        ("critical_pain_points", 3, None),
+        ("common_praise", 3, None),
+    ]
+
+    for key, minimum, maximum in item_limits:
+        if key not in brief:
+            continue  # already reported as a missing key
+
+        section = brief[key]
+        if not isinstance(section, dict):
+            issues.append(f"{key} must be an object, got {type(section).__name__}")
+            continue
+
+        items = section.get("items", [])
+        if not isinstance(items, list):
+            issues.append(f"{key}.items must be a list, got {type(items).__name__}")
+            continue
+
+        if len(items) < minimum:
+            issues.append(f"{key} has only {len(items)} items (need {minimum}+)")
+        if maximum is not None and len(items) > maximum:
+            issues.append(f"{key} has {len(items)} items (max {maximum})")
+
+    # Check for forbidden content. This is a case-insensitive substring scan
+    # over the whole serialized brief, so it can also hit longer words such
+    # as "lightweight" -- hence "potentially" in the message. "solution" is
+    # deliberately not listed: it may legitimately appear in context.
+    text = json.dumps(brief).lower()
+    forbidden = ["priority", "weight", "primitive", "enabled", "disabled"]
+    for word in forbidden:
+        if word in text:
+            issues.append(f"Contains potentially forbidden term: {word}")
+
+    return issues
+
+
+def save_brief(brief: dict, output_dir: Path) -> Path:
+    """Write ``brief`` as indented JSON into ``output_dir`` and return the file path.
+
+    The directory, including any missing parents, is created first. The
+    file name is the lower-cased ``sector_code`` followed by ``_brief.json``.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sector_slug = brief["sector_code"].lower()
+    target = output_dir / f"{sector_slug}_brief.json"
+
+    with target.open("w") as handle:
+        json.dump(brief, handle, indent=2)
+
+    return target
+
+
+def validate_existing_briefs(output_dir: Path) -> None:
+    """Validate every ``*_brief.json`` file in ``output_dir`` and print a report.
+
+    Each file is listed with a check mark or a cross, followed by its
+    issues. A file that cannot be read, is not valid JSON, or does not hold
+    a JSON object is reported as failing and the run continues with the
+    next file.
+
+    Args:
+        output_dir: Directory that holds the generated brief files.
+    """
+    if not output_dir.exists():
+        print(f"Output directory does not exist: {output_dir}")
+        return
+
+    files = list(output_dir.glob("*_brief.json"))
+    if not files:
+        print("No brief files found")
+        return
+
+    print(f"Validating {len(files)} brief files...\n")
+
+    all_valid = True
+    for filepath in sorted(files):
+        try:
+            with open(filepath, encoding="utf-8") as f:
+                brief = json.load(f)
+        except (OSError, ValueError) as exc:
+            # ValueError covers both json.JSONDecodeError and
+            # UnicodeDecodeError; one bad file must not abort the report.
+            issues = [f"Could not load file: {exc}"]
+        else:
+            if isinstance(brief, dict):
+                issues = validate_brief(brief)
+            else:
+                issues = [
+                    f"Top-level JSON value must be an object, got {type(brief).__name__}"
+                ]
+
+        status = "✓" if not issues else "✗"
+        print(f"{status} {filepath.name}")
+
+        if issues:
+            all_valid = False
+            for issue in issues:
+                print(f"    - {issue}")
+
+    print()
+    if all_valid:
+        print("All briefs valid!")
+    else:
+        print("Some briefs have issues.")
+
+
+def main():
+    """Command-line entry point: validate, preview, or generate sector briefs."""
+    cli = argparse.ArgumentParser(description="Generate sector briefs for Wave 0")
+    cli.add_argument("--sector", help="Generate only this sector code")
+    cli.add_argument("--dry-run", action="store_true", help="Show what would be generated")
+    cli.add_argument("--validate", action="store_true", help="Validate existing briefs")
+    cli.add_argument("--output-dir", default="data/sector_briefs", help="Output directory")
+    cli.add_argument("--model", default="gpt-4o", help="OpenAI model to use")
+    args = cli.parse_args()
+
+    # Input and output locations are resolved against the package root,
+    # i.e. the parent of the directory that contains this script.
+    package_dir = Path(__file__).parent.parent
+    data_path = package_dir / "data" / "sectors.json"
+    output_dir = package_dir / args.output_dir
+
+    # --validate only inspects files that already exist.
+    if args.validate:
+        validate_existing_briefs(output_dir)
+        return
+
+    sectors = load_sectors(data_path)
+    print(f"Loaded {len(sectors)} sectors")
+
+    # Narrow the work list to the requested sector, if one was given.
+    if args.sector:
+        sectors = [entry for entry in sectors if entry["sector_code"] == args.sector]
+        if not sectors:
+            print(f"ERROR: Sector '{args.sector}' not found")
+            sys.exit(1)
+
+    # --dry-run lists the work without needing an API key.
+    if args.dry_run:
+        print("\n[DRY RUN] Would generate briefs for:")
+        for entry in sectors:
+            print(f"  - {entry['sector_code']}: {entry['sector_name']}")
+        print(f"\nOutput directory: {output_dir}")
+        return
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        print("ERROR: OPENAI_API_KEY environment variable required")
+        sys.exit(1)
+
+    client = OpenAI(api_key=api_key)
+    print(f"Using model: {args.model}")
+
+    succeeded = []
+    failed = []
+    total = len(sectors)
+
+    for position, entry in enumerate(sectors, start=1):
+        print(f"\n[{position}/{total}] Generating brief for: {entry['sector_name']}")
+
+        # A failure in one sector is recorded and the batch carries on.
+        try:
+            brief = generate_sector_brief(client, entry, args.model)
+
+            # Validation problems are printed as warnings; the brief is
+            # saved regardless.
+            problems = validate_brief(brief)
+            if problems:
+                print("  Warnings:")
+                for problem in problems:
+                    print(f"    - {problem}")
+
+            saved_to = save_brief(brief, output_dir)
+            print(f"  ✓ Saved to: {saved_to}")
+            succeeded.append(entry["sector_code"])
+
+        except Exception as exc:
+            print(f"  ✗ FAILED: {exc}")
+            failed.append(entry["sector_code"])
+
+    rule = "=" * 60
+    print(f"\n{rule}")
+    print("SUMMARY")
+    print(rule)
+    print(f"Success: {len(succeeded)}")
+    print(f"Failed:  {len(failed)}")
+
+    # A non-zero exit status signals that at least one sector failed.
+    if failed:
+        print(f"\nFailed sectors: {', '.join(failed)}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()