Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
372
packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
Normal file
372
packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
Normal file
@@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Wave 0: Sector Brief Generator
|
||||
|
||||
Generates alignment context briefs for each sector.
|
||||
These briefs inform Wave 1 and Wave 2 primitive config generation.
|
||||
|
||||
Usage:
|
||||
python generate_sector_briefs.py # Generate all sectors
|
||||
python generate_sector_briefs.py --sector FOOD_DINING # Generate one sector
|
||||
python generate_sector_briefs.py --dry-run # Show what would be generated
|
||||
python generate_sector_briefs.py --validate # Validate existing briefs
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
print("ERROR: openai package required. Install with: pip install openai")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
PROMPT_TEMPLATE = '''You are an expert in customer experience analysis across industries.
|
||||
|
||||
Your task: Generate a **sector brief** for the "{sector_name}" sector.
|
||||
|
||||
This brief will be used to align classification agents with industry-specific context.
|
||||
It describes what customers care about — NOT how to classify, NOT what primitives to use.
|
||||
|
||||
## Sector Information
|
||||
|
||||
- **Code**: {sector_code}
|
||||
- **Name**: {sector_name}
|
||||
- **Description**: {description}
|
||||
- **Sample Business Types**: {business_types}
|
||||
|
||||
## Output Requirements
|
||||
|
||||
Generate a JSON object with this exact structure:
|
||||
|
||||
```json
|
||||
{{
|
||||
"sector_code": "{sector_code}",
|
||||
"sector_name": "{sector_name}",
|
||||
"generated_at": "<ISO timestamp>",
|
||||
"version": "1.0",
|
||||
|
||||
"what_customers_judge": {{
|
||||
"description": "The primary dimensions customers evaluate in this sector",
|
||||
"items": [
|
||||
{{
|
||||
"aspect": "string (2-5 words)",
|
||||
"importance": "critical | high | moderate",
|
||||
"why_it_matters": "string (1 sentence)"
|
||||
}}
|
||||
]
|
||||
}},
|
||||
|
||||
"critical_pain_points": {{
|
||||
"description": "What damages reputation most severely",
|
||||
"items": [
|
||||
{{
|
||||
"pain_point": "string (2-5 words)",
|
||||
"typical_language": ["phrases customers actually use in reviews"],
|
||||
"reputation_impact": "severe | significant | moderate"
|
||||
}}
|
||||
]
|
||||
}},
|
||||
|
||||
"common_praise": {{
|
||||
"description": "What earns customer loyalty and positive reviews",
|
||||
"items": [
|
||||
{{
|
||||
"praise_area": "string (2-5 words)",
|
||||
"typical_language": ["phrases customers actually use in reviews"],
|
||||
"loyalty_impact": "high | moderate"
|
||||
}}
|
||||
]
|
||||
}},
|
||||
|
||||
"industry_terminology": {{
|
||||
"description": "Domain-specific vocabulary",
|
||||
"staff_terms": ["terms for staff roles in this sector"],
|
||||
"product_terms": ["terms for products/services"],
|
||||
"process_terms": ["terms for processes/interactions"],
|
||||
"quality_terms": ["positive quality descriptors"],
|
||||
"problem_terms": ["negative quality descriptors"]
|
||||
}},
|
||||
|
||||
"mode_specific_concerns": {{
|
||||
"description": "Different service modes have different priorities",
|
||||
"modes": [
|
||||
{{
|
||||
"mode": "string (e.g., 'In-person', 'Online', 'Phone')",
|
||||
"primary_concerns": ["top concerns for this mode"],
|
||||
"unique_pain_points": ["pain points specific to this mode"]
|
||||
}}
|
||||
]
|
||||
}},
|
||||
|
||||
"what_is_actionable": {{
|
||||
"description": "Feedback businesses can act on",
|
||||
"actionable_examples": [
|
||||
{{
|
||||
"feedback_type": "string",
|
||||
"example": "string (realistic review excerpt)",
|
||||
"action_owner": "role/team that can fix it"
|
||||
}}
|
||||
],
|
||||
"not_actionable_examples": [
|
||||
{{
|
||||
"feedback_type": "string",
|
||||
"example": "string (realistic review excerpt)",
|
||||
"why_not_actionable": "string"
|
||||
}}
|
||||
]
|
||||
}},
|
||||
|
||||
"sector_specific_signals": {{
|
||||
"description": "Signals with sector-specific meaning",
|
||||
"examples": [
|
||||
{{
|
||||
"signal": "string (word or phrase)",
|
||||
"meaning_in_this_sector": "string",
|
||||
"contrast_with": "how it differs in other sectors"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
}}
|
||||
```
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. **Use realistic review language** in `typical_language` arrays - actual phrases customers write
|
||||
2. **Include 4-8 items** per array (not too few, not excessive)
|
||||
3. **Be sector-specific** - don't use generic phrases that apply to all businesses
|
||||
4. **Include appropriate modes** - only modes that actually exist in this sector
|
||||
5. **NO primitive codes, priorities, weights, or solutions**
|
||||
6. **Focus on WHAT customers care about**, not HOW to classify it
|
||||
|
||||
Return ONLY the JSON object, no markdown formatting or explanation.'''
|
||||
|
||||
|
||||
def load_sectors(data_path: Path) -> list[dict]:
    """Load sector definitions from a JSON file.

    Args:
        data_path: Path to a JSON document whose top-level object has a
            ``"sectors"`` key.

    Returns:
        The value stored under ``"sectors"``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no ``"sectors"`` key.
    """
    # Read as UTF-8 explicitly so non-ASCII sector text does not depend on
    # the platform's default encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)
    return data["sectors"]
|
||||
|
||||
|
||||
def generate_sector_brief(client: OpenAI, sector: dict, model: str) -> dict:
    """Generate a sector brief for one sector via a chat-completion call.

    Args:
        client: OpenAI client used to issue the chat completion request.
        sector: Sector definition; must provide ``sector_code``,
            ``sector_name``, ``description`` and ``sample_business_types``.
        model: Name of the model to request.

    Returns:
        The parsed brief, with ``sector_code``, ``sector_name``,
        ``generated_at`` and ``version`` overwritten locally so they never
        depend on what the model returned.

    Raises:
        KeyError: If ``sector`` lacks one of the required fields.
        ValueError: If the model returns no content, content that is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``), or
            JSON whose top level is not an object.
    """
    # Local import: the file's top-level import only brings in ``datetime``.
    from datetime import timezone

    prompt = PROMPT_TEMPLATE.format(
        sector_code=sector["sector_code"],
        sector_name=sector["sector_name"],
        description=sector["description"],
        business_types=", ".join(sector["sample_business_types"]),
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert customer experience analyst. Return only valid JSON, no markdown."
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
        max_tokens=4000,
        response_format={"type": "json_object"},
    )

    # The message content can be None (e.g. a refusal); fail with a clear
    # message instead of an AttributeError on ``.strip()``.
    content = response.choices[0].message.content
    text = content.strip() if content else ""
    if not text:
        raise ValueError(
            f"Model returned no content for sector {sector['sector_code']}"
        )

    # Parse JSON
    brief = json.loads(text)
    if not isinstance(brief, dict):
        raise ValueError(
            f"Expected a JSON object for sector {sector['sector_code']}, "
            f"got {type(brief).__name__}"
        )

    # Ensure required fields
    brief["sector_code"] = sector["sector_code"]
    brief["sector_name"] = sector["sector_name"]
    # Aware clock instead of the deprecated ``utcnow()``; dropping tzinfo
    # before formatting keeps the original ``...Z`` timestamp shape.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    brief["generated_at"] = now_utc.isoformat() + "Z"
    brief["version"] = "1.0"

    return brief
|
||||
|
||||
|
||||
def validate_brief(brief: dict) -> list[str]:
    """Validate a sector brief and return a list of human-readable issues.

    Checks performed, in order:

    1. Every required top-level section is present.
    2. ``what_customers_judge``, ``critical_pain_points`` and
       ``common_praise`` each hold at least 3 ``items``;
       ``what_customers_judge`` additionally holds at most 10.
    3. The serialized brief does not contain any forbidden term
       (case-insensitive substring match).

    Malformed sections (a section that is not an object, or ``items`` that
    is not a list) are reported as issues rather than raising.

    Args:
        brief: Parsed brief to check.

    Returns:
        A list of issue descriptions; empty when the brief passes.
    """
    issues = []

    required_keys = [
        "what_customers_judge",
        "critical_pain_points",
        "common_praise",
        "industry_terminology",
        "mode_specific_concerns",
        "what_is_actionable",
        "sector_specific_signals",
    ]

    for key in required_keys:
        if key not in brief:
            issues.append(f"Missing required key: {key}")

    # Check array lengths: (section key, minimum items, maximum items or None)
    item_limits = [
        ("what_customers_judge", 3, 10),
        ("critical_pain_points", 3, None),
        ("common_praise", 3, None),
    ]
    for key, minimum, maximum in item_limits:
        if key not in brief:
            continue  # already reported as missing above

        section = brief[key]
        if not isinstance(section, dict):
            issues.append(
                f"{key} must be an object, got {type(section).__name__}"
            )
            continue

        items = section.get("items", [])
        if not isinstance(items, list):
            issues.append(
                f"{key}.items must be a list, got {type(items).__name__}"
            )
            continue

        if len(items) < minimum:
            issues.append(f"{key} has only {len(items)} items (need {minimum}+)")
        if maximum is not None and len(items) > maximum:
            issues.append(f"{key} has {len(items)} items (max {maximum})")

    # Check for forbidden content. "solution" is deliberately not in the list:
    # it can legitimately appear in context.
    text = json.dumps(brief).lower()
    forbidden = ["priority", "weight", "primitive", "enabled", "disabled"]
    for word in forbidden:
        if word in text:
            issues.append(f"Contains potentially forbidden term: {word}")

    return issues
|
||||
|
||||
|
||||
def save_brief(brief: dict, output_dir: Path) -> Path:
    """Write a brief to ``<output_dir>/<sector_code lowercased>_brief.json``.

    The output directory (including missing parents) is created first.
    The brief is serialized as JSON with a two-space indent.

    Args:
        brief: Brief to persist; its ``sector_code`` determines the filename.
        output_dir: Directory that receives the file.

    Returns:
        Path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    code = brief["sector_code"].lower()
    target = output_dir / f"{code}_brief.json"

    with target.open("w") as handle:
        json.dump(brief, handle, indent=2)

    return target
|
||||
|
||||
|
||||
def validate_existing_briefs(output_dir: Path) -> None:
    """Validate every ``*_brief.json`` file in a directory and print a report.

    For each file (in sorted order) a status line is printed, followed by
    one line per issue. A file that cannot be read or parsed is reported as
    an issue for that file instead of aborting the whole run.

    Args:
        output_dir: Directory to scan for brief files.
    """
    if not output_dir.exists():
        print(f"Output directory does not exist: {output_dir}")
        return

    files = list(output_dir.glob("*_brief.json"))
    if not files:
        print("No brief files found")
        return

    print(f"Validating {len(files)} brief files...\n")

    all_valid = True
    for filepath in sorted(files):
        try:
            with open(filepath, encoding="utf-8") as f:
                brief = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            issues = [f"Could not read brief: {exc}"]
        else:
            issues = validate_brief(brief)

        status = "✓" if not issues else "✗"
        print(f"{status} {filepath.name}")

        if issues:
            all_valid = False
            for issue in issues:
                print(f" - {issue}")

    print()
    if all_valid:
        print("All briefs valid!")
    else:
        print("Some briefs have issues.")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the Wave 0 sector brief generator.

    Depending on the flags, this validates the briefs already on disk
    (``--validate``), lists what would be generated (``--dry-run``), or
    generates, validates and saves a brief for each selected sector.
    Exits with status 1 when the requested sector is unknown, when
    ``OPENAI_API_KEY`` is not set, or when any sector fails to generate.
    """
    cli = argparse.ArgumentParser(description="Generate sector briefs for Wave 0")
    cli.add_argument("--sector", help="Generate only this sector code")
    cli.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    cli.add_argument("--validate", action="store_true", help="Validate existing briefs")
    cli.add_argument("--output-dir", default="data/sector_briefs", help="Output directory")
    cli.add_argument("--model", default="gpt-4o", help="OpenAI model to use")
    args = cli.parse_args()

    # Paths are resolved relative to the package directory (parent of scripts/).
    package_dir = Path(__file__).parent.parent
    data_path = package_dir / "data" / "sectors.json"
    output_dir = package_dir / args.output_dir

    # Validate mode: only inspect what is already on disk.
    if args.validate:
        validate_existing_briefs(output_dir)
        return

    sectors = load_sectors(data_path)
    print(f"Loaded {len(sectors)} sectors")

    # Optionally narrow the run to a single sector.
    if args.sector:
        sectors = [entry for entry in sectors if entry["sector_code"] == args.sector]
        if not sectors:
            print(f"ERROR: Sector '{args.sector}' not found")
            sys.exit(1)

    if args.dry_run:
        print("\n[DRY RUN] Would generate briefs for:")
        for entry in sectors:
            print(f" - {entry['sector_code']}: {entry['sector_name']}")
        print(f"\nOutput directory: {output_dir}")
        return

    # The API key is only needed once real generation is about to start.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: OPENAI_API_KEY environment variable required")
        sys.exit(1)

    client = OpenAI(api_key=api_key)
    print(f"Using model: {args.model}")

    succeeded = []
    failed = []
    total = len(sectors)

    for position, entry in enumerate(sectors, 1):
        print(f"\n[{position}/{total}] Generating brief for: {entry['sector_name']}")

        try:
            brief = generate_sector_brief(client, entry, args.model)

            # Validation problems are warnings only; the brief is still saved.
            warnings = validate_brief(brief)
            if warnings:
                print(" Warnings:")
                for warning in warnings:
                    print(f" - {warning}")

            saved_to = save_brief(brief, output_dir)
            print(f" ✓ Saved to: {saved_to}")
            succeeded.append(entry["sector_code"])

        except Exception as e:
            # Best effort: one failing sector must not stop the others.
            print(f" ✗ FAILED: {e}")
            failed.append(entry["sector_code"])

    # Summary
    rule = "=" * 60
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f"Success: {len(succeeded)}")
    print(f"Failed: {len(failed)}")

    if failed:
        print(f"\nFailed sectors: {', '.join(failed)}")
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user