Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
21
.env.nuc
Normal file
21
.env.nuc
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# NUC Production Environment Variables
# Use this to connect to NUC-hosted database
# Copy to .env: cp .env.nuc .env
#
# SECURITY: this file is tracked in git, so it must only contain placeholders.
# Put the real values in the untracked .env after copying. Any credential that
# was ever committed here must be treated as compromised and rotated.

# Database (NUC PostgreSQL on port 5437)
# Keep the password in DATABASE_URL in sync with DB_PASSWORD.
DB_PASSWORD=change-me
DATABASE_URL=postgresql://scraper:change-me@192.168.1.3:5437/scraper

# API Configuration
API_BASE_URL=http://localhost:8001
PORT=8001

# Job Concurrency
MAX_CONCURRENT_JOBS=5

# Canary Test Configuration
CANARY_TEST_URL=https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/

# LLM API Keys (for ReviewIQ pipeline)
# Optional; fill these in only in your local .env.
OPENAI_API_KEY=
ANTHROPIC_API_KEY=
|
||||||
142
CLAUDE.md
Normal file
142
CLAUDE.md
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
# Google Reviews Scraper Pro - Claude Code Instructions
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Run with NUC Database (Recommended)
|
||||||
|
The PostgreSQL database is hosted on the NUC server. Only the API runs locally.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Use NUC database config
|
||||||
|
cp .env.nuc .env
|
||||||
|
|
||||||
|
# Start API only (connects to NUC database)
|
||||||
|
docker compose -f docker-compose.production.yml -f docker-compose.nuc.yml up -d
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
docker compose -f docker-compose.production.yml logs -f api
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run Fully Local (Legacy)
|
||||||
|
Runs both PostgreSQL and API locally.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Use local database config
|
||||||
|
cp .env.example .env
|
||||||
|
# Edit .env with your settings
|
||||||
|
|
||||||
|
# Start all services
|
||||||
|
docker compose -f docker-compose.production.yml up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
## NUC Database Connection
|
||||||
|
|
||||||
|
| Property | Value |
|
||||||
|
|----------|-------|
|
||||||
|
| Host | 192.168.1.3 |
|
||||||
|
| Port | 5437 |
|
||||||
|
| Database | scraper |
|
||||||
|
| User | scraper |
|
||||||
|
| Password | see `DB_PASSWORD` in your local `.env` (do not record it here) |
|
||||||
|
| Coolify UUID | g4s8w4csk8s8ocswg48kkogo |
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Direct connection
|
||||||
|
psql "postgresql://scraper:${DB_PASSWORD}@192.168.1.3:5437/scraper"
|
||||||
|
|
||||||
|
# Via SSH tunnel (if needed)
|
||||||
|
ssh -L 5437:localhost:5437 nuc
|
||||||
|
```
|
||||||
|
|
||||||
|
## Service URLs
|
||||||
|
|
||||||
|
| Service | URL |
|
||||||
|
|---------|-----|
|
||||||
|
| API | http://localhost:8001 |
|
||||||
|
| API Docs | http://localhost:8001/docs |
|
||||||
|
| VNC (browser debugging) | http://localhost:6080 |
|
||||||
|
| VNC (client) | vnc://localhost:5900 |
|
||||||
|
|
||||||
|
## Common Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Start services
|
||||||
|
docker compose -f docker-compose.production.yml -f docker-compose.nuc.yml up -d
|
||||||
|
|
||||||
|
# Stop services
|
||||||
|
docker compose -f docker-compose.production.yml -f docker-compose.nuc.yml down
|
||||||
|
|
||||||
|
# View API logs
|
||||||
|
docker logs -f scraper-api
|
||||||
|
|
||||||
|
# Rebuild API after code changes
|
||||||
|
docker compose -f docker-compose.production.yml -f docker-compose.nuc.yml up -d --build api
|
||||||
|
|
||||||
|
# Run a scrape job (example)
|
||||||
|
curl -X POST http://localhost:8001/api/jobs \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"url": "https://www.google.com/maps/place/..."}'
|
||||||
|
|
||||||
|
# Check job status
|
||||||
|
curl http://localhost:8001/api/jobs/{job_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database Management
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Connect to NUC database
|
||||||
|
docker run --rm -it postgres:15-alpine psql "postgresql://scraper:${DB_PASSWORD}@192.168.1.3:5437/scraper"
|
||||||
|
|
||||||
|
# Backup database
|
||||||
|
ssh nuc "docker exec postgres-g4s8w4csk8s8ocswg48kkogo pg_dump -U scraper scraper" > backup.sql
|
||||||
|
|
||||||
|
# Restore database
|
||||||
|
cat backup.sql | ssh nuc "docker exec -i postgres-g4s8w4csk8s8ocswg48kkogo psql -U scraper scraper"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
├── api/ # FastAPI backend
|
||||||
|
├── packages/
|
||||||
|
│ ├── pipeline-core/ # Shared pipeline utilities
|
||||||
|
│ └── reviewiq-pipeline/ # Review analysis pipeline
|
||||||
|
├── web/ # Next.js frontend (optional)
|
||||||
|
├── db/init/ # Database initialization scripts
|
||||||
|
├── docker-compose.production.yml # Main compose file
|
||||||
|
├── docker-compose.nuc.yml # NUC database override
|
||||||
|
├── .env.nuc # NUC environment config
|
||||||
|
└── Dockerfile # API container build
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### API can't connect to NUC database
|
||||||
|
```bash
|
||||||
|
# Check NUC is reachable
|
||||||
|
nc -zv 192.168.1.3 5437
|
||||||
|
|
||||||
|
# Check database is running
|
||||||
|
ssh nuc "docker ps | grep postgres-g4s8w4csk8s8ocswg48kkogo"
|
||||||
|
|
||||||
|
# Restart database on NUC
|
||||||
|
ssh nuc "docker restart postgres-g4s8w4csk8s8ocswg48kkogo"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Chrome/Scraping issues
|
||||||
|
```bash
|
||||||
|
# Check VNC for visual debugging
|
||||||
|
open http://localhost:6080
|
||||||
|
|
||||||
|
# Increase shared memory if crashes
|
||||||
|
# Edit docker-compose: shm_size: 4gb
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
| Variable | Description | Default |
|
||||||
|
|----------|-------------|---------|
|
||||||
|
| DATABASE_URL | PostgreSQL connection string | (required) |
|
||||||
|
| API_BASE_URL | Public API URL | http://localhost:8001 |
|
||||||
|
| MAX_CONCURRENT_JOBS | Parallel scrape jobs | 5 |
|
||||||
|
| OPENAI_API_KEY | For ReviewIQ analysis | (optional) |
|
||||||
|
| ANTHROPIC_API_KEY | For ReviewIQ analysis | (optional) |
|
||||||
@@ -64,6 +64,10 @@ COPY workers/ ./workers/
|
|||||||
COPY api_server_production.py .
|
COPY api_server_production.py .
|
||||||
COPY config.yaml .
|
COPY config.yaml .
|
||||||
|
|
||||||
|
# Copy and install pipeline packages
|
||||||
|
COPY packages/ ./packages/
|
||||||
|
RUN pip install --no-cache-dir -e ./packages/pipeline-core -e ./packages/reviewiq-pipeline
|
||||||
|
|
||||||
# Create startup script for Xvfb + VNC + API server
|
# Create startup script for Xvfb + VNC + API server
|
||||||
RUN echo '#!/bin/bash\n\
|
RUN echo '#!/bin/bash\n\
|
||||||
# Start Xvfb (virtual display) in background\n\
|
# Start Xvfb (virtual display) in background\n\
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from api.routes.batches import router as batches_router, set_database as set_bat
|
|||||||
from api.routes.dashboard import router as dashboard_router, set_database as set_dashboard_db
|
from api.routes.dashboard import router as dashboard_router, set_database as set_dashboard_db
|
||||||
from api.routes.admin import router as admin_router, set_database as set_admin_db
|
from api.routes.admin import router as admin_router, set_database as set_admin_db
|
||||||
from api.routes.pipelines import router as pipelines_router, set_database as set_pipelines_db
|
from api.routes.pipelines import router as pipelines_router, set_database as set_pipelines_db
|
||||||
|
from api.routes.reviewiq_analytics import router as reviewiq_analytics_router, set_database as set_reviewiq_analytics_db
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'batches_router',
|
'batches_router',
|
||||||
@@ -17,4 +18,6 @@ __all__ = [
|
|||||||
'set_admin_db',
|
'set_admin_db',
|
||||||
'pipelines_router',
|
'pipelines_router',
|
||||||
'set_pipelines_db',
|
'set_pipelines_db',
|
||||||
|
'reviewiq_analytics_router',
|
||||||
|
'set_reviewiq_analytics_db',
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -277,15 +277,17 @@ async def execute_pipeline(
|
|||||||
|
|
||||||
pipeline = await _get_pipeline_instance(pipeline_id)
|
pipeline = await _get_pipeline_instance(pipeline_id)
|
||||||
|
|
||||||
|
# Create execution record
|
||||||
|
execution_id = str(uuid.uuid4())
|
||||||
|
|
||||||
# Prepare input data
|
# Prepare input data
|
||||||
input_data = request.input_data or {}
|
input_data = request.input_data or {}
|
||||||
if request.job_id:
|
if request.job_id:
|
||||||
input_data["job_id"] = request.job_id
|
input_data["job_id"] = request.job_id
|
||||||
if request.business_id:
|
if request.business_id:
|
||||||
input_data["business_id"] = request.business_id
|
input_data["business_id"] = request.business_id
|
||||||
|
# Pass execution_id so Stage 5 synthesis can store results
|
||||||
# Create execution record
|
input_data["execution_id"] = execution_id
|
||||||
execution_id = str(uuid.uuid4())
|
|
||||||
stages = request.stages or pipeline.get_stage_names()
|
stages = request.stages or pipeline.get_stage_names()
|
||||||
|
|
||||||
# Prepare input summary for storage
|
# Prepare input summary for storage
|
||||||
@@ -604,6 +606,7 @@ async def get_widget_data(
|
|||||||
pipeline_id: str,
|
pipeline_id: str,
|
||||||
widget_id: str,
|
widget_id: str,
|
||||||
business_id: str | None = Query(None, description="Filter by business"),
|
business_id: str | None = Query(None, description="Filter by business"),
|
||||||
|
job_id: str | None = Query(None, description="Filter by job ID"),
|
||||||
time_range: str = Query("30d", description="Time range (e.g., 7d, 30d, 90d)"),
|
time_range: str = Query("30d", description="Time range (e.g., 7d, 30d, 90d)"),
|
||||||
page: int = Query(1, ge=1, description="Page number for paginated widgets"),
|
page: int = Query(1, ge=1, description="Page number for paginated widgets"),
|
||||||
page_size: int = Query(10, ge=1, le=100, description="Items per page"),
|
page_size: int = Query(10, ge=1, le=100, description="Items per page"),
|
||||||
@@ -621,6 +624,7 @@ async def get_widget_data(
|
|||||||
try:
|
try:
|
||||||
params = {
|
params = {
|
||||||
"business_id": business_id,
|
"business_id": business_id,
|
||||||
|
"job_id": job_id,
|
||||||
"time_range": time_range,
|
"time_range": time_range,
|
||||||
"page": page,
|
"page": page,
|
||||||
"page_size": page_size,
|
"page_size": page_size,
|
||||||
|
|||||||
300
api/routes/sessions.py
Normal file
300
api/routes/sessions.py
Normal file
@@ -0,0 +1,300 @@
|
|||||||
|
"""
|
||||||
|
Session Routes for Google Reviews Scraper API
|
||||||
|
|
||||||
|
Provides session handoff endpoints for efficient validation → scraping workflow.
|
||||||
|
Uses scraper v1.2.0 with session support.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
POST /sessions/validate - Validate URL, keep browser alive, return session_id
|
||||||
|
POST /sessions/scrape - Scrape using existing session (skips navigation)
|
||||||
|
GET /sessions - List active sessions
|
||||||
|
GET /sessions/{id} - Get session status
|
||||||
|
DELETE /sessions/{id} - Release session manually
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
1. POST /sessions/validate with URL → returns session_id
|
||||||
|
2. Frontend shows business info to user for confirmation
|
||||||
|
3. POST /sessions/scrape with session_id → scrapes using existing browser
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from pydantic import BaseModel, HttpUrl, Field
|
||||||
|
|
||||||
|
# Import v1.2.0 scraper with session support
|
||||||
|
from scrapers.google_reviews.v1_2_0 import (
|
||||||
|
validate_with_session,
|
||||||
|
scrape_with_session,
|
||||||
|
LogCapture
|
||||||
|
)
|
||||||
|
from scrapers.google_reviews.session_manager import get_session_manager
|
||||||
|
|
||||||
|
log = logging.getLogger("api_sessions")
|
||||||
|
|
||||||
|
# Create router
|
||||||
|
router = APIRouter(prefix="/sessions", tags=["sessions"])
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Request/Response Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Latitude/longitude pair; the session endpoint serialises it as
# {"lat": ..., "lng": ...} when building the browser fingerprint.
class GeoLocation(BaseModel):
    lat: float  # latitude
    lng: float  # longitude
|
||||||
|
|
||||||
|
# Browser viewport dimensions carried inside a BrowserFingerprint.
# NOTE(review): presumably pixels — confirm against the scraper.
class Viewport(BaseModel):
    width: int
    height: int
|
||||||
|
|
||||||
|
# Optional client browser characteristics supplied with a validation request.
# Every field defaults to None, so callers may send any subset.
class BrowserFingerprint(BaseModel):
    # Plain string attributes (camelCase name kept as the wire format).
    userAgent: Optional[str] = None
    timezone: Optional[str] = None
    language: Optional[str] = None
    platform: Optional[str] = None

    # Nested structures.
    viewport: Optional[Viewport] = None
    geolocation: Optional[GeoLocation] = None
|
||||||
|
|
||||||
|
|
||||||
|
class ValidateRequest(BaseModel):
    """Request body for session validation."""

    # Required: the place page to open.
    url: HttpUrl = Field(default=..., description="Google Maps URL to validate")

    # Optional browser characteristics / location to emulate.
    browser_fingerprint: Optional[BrowserFingerprint] = None
    geolocation: Optional[GeoLocation] = None

    # Lifetime of the kept-alive browser session, bounded to 60..900 seconds.
    session_ttl: int = Field(
        default=300,
        description="Session TTL in seconds (default: 5 min)",
        ge=60,
        le=900,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ValidateResponse(BaseModel):
    """Response from session validation."""

    # Handle to pass to /sessions/scrape; absent when validation failed.
    session_id: Optional[str] = Field(
        default=None,
        description="Session ID for scraping (None if validation failed)",
    )

    # Data reported back by the validator.
    business_info: Dict[str, Any] = Field(default_factory=dict)
    total_reviews: Optional[int] = None

    # Outcome: `success` is the only required field; `error` carries the message on failure.
    success: bool
    error: Optional[str] = None

    expires_in: Optional[int] = Field(
        default=None,
        description="Seconds until session expires",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapeWithSessionRequest(BaseModel):
    """Request body for scraping with an existing session."""

    # Required: identifies the kept-alive browser created by /sessions/validate.
    session_id: str = Field(default=..., description="Session ID from validation")

    # Optional cap on collected reviews, bounded to 1..50000 when given.
    max_reviews: Optional[int] = Field(
        default=None,
        description="Max reviews to collect (None = unlimited)",
        ge=1,
        le=50000,
    )

    # Sorting controls; both are forwarded unchanged to the scraper.
    sort_strategy: str = Field(
        default="auto",
        description="Sort strategy: auto, multi, newest, lowest, highest, relevant",
    )
    initial_sort: Optional[str] = Field(
        default=None,
        description="Initial sort order for first pass",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapeWithSessionResponse(BaseModel):
    """Response from session-based scraping."""

    # Scrape payload and counters; each defaults to an "empty" value so an
    # error response can be built from `success` and `error` alone.
    reviews: list = Field(default_factory=list)
    count: int = 0
    total_reviews: int = 0

    # Outcome: `success` is the only required field.
    success: bool
    error: Optional[str] = None

    # Duration as reported by the scraper.
    # NOTE(review): presumably seconds — confirm against the scraper.
    time: float = 0

    session_reused: bool = Field(
        default=True,
        description="Indicates session was reused from validation",
    )
    business_info: Dict[str, Any] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class SessionInfo(BaseModel):
    """Information about an active session."""

    # Identity and what the session points at.
    session_id: str
    business: str
    state: str
    total_reviews: int

    # Timing, in whole seconds.
    age_seconds: int
    ttl_remaining: int
|
||||||
|
|
||||||
|
|
||||||
|
class SessionListResponse(BaseModel):
    """Response listing all active sessions."""

    total_sessions: int
    # Untyped list: entries are passed through as returned by the session
    # manager's get_stats().
    sessions: list
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.post("/validate", response_model=ValidateResponse, summary="Validate URL and Create Session")
async def validate_and_create_session(request: ValidateRequest):
    """
    Validate a Google Maps URL and keep the browser session alive for scraping.

    This endpoint:
    1. Creates a Chrome browser
    2. Navigates to the Google Maps URL
    3. Extracts business information
    4. Keeps the browser ALIVE and returns a session_id

    The session can then be used with /sessions/scrape to continue scraping
    without re-navigating (saves 4-16 seconds per job).

    Session expires after TTL (default: 5 minutes).

    Geolocation may be given inside browser_fingerprint or at the top level
    of the request; the value inside browser_fingerprint takes precedence.

    Failures are reported in the response body (success=false, error=...)
    rather than as an HTTP error status.
    """
    try:
        url = str(request.url)
        log.info(f"Validating URL with session: {url[:80]}...")

        # Build the plain-dict fingerprint handed to the scraper.
        fingerprint = None
        if request.browser_fingerprint:
            fp = request.browser_fingerprint
            fingerprint = {
                "userAgent": fp.userAgent,
                "timezone": fp.timezone,
                "language": fp.language,
                "platform": fp.platform,
            }
            if fp.viewport:
                fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}

            # Prefer the fingerprint's own geolocation; otherwise fall back to
            # the top-level request.geolocation, which used to be silently
            # dropped whenever a browser_fingerprint was supplied.
            geo = fp.geolocation if fp.geolocation is not None else request.geolocation
            if geo is not None:
                fingerprint["geolocation"] = {"lat": geo.lat, "lng": geo.lng}
        elif request.geolocation:
            fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}

        # validate_with_session drives Chrome synchronously, so run it in a
        # worker thread to keep the event loop free.
        result = await asyncio.to_thread(
            validate_with_session,
            url=url,
            headless=False,  # Headed Chrome with Xvfb
            browser_fingerprint=fingerprint,
            session_ttl=request.session_ttl
        )

        return ValidateResponse(
            session_id=result.get("session_id"),
            business_info=result.get("business_info", {}),
            total_reviews=result.get("total_reviews"),
            success=result.get("success", False),
            error=result.get("error"),
            expires_in=result.get("expires_in")
        )

    except Exception as e:
        # log.exception keeps the traceback, which log.error discarded.
        log.exception(f"Session validation error: {e}")
        return ValidateResponse(
            session_id=None,
            success=False,
            error=str(e)
        )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/scrape", response_model=ScrapeWithSessionResponse, summary="Scrape Using Existing Session")
async def scrape_using_session(request: ScrapeWithSessionRequest):
    """
    Scrape reviews using an existing validated session.

    This endpoint:
    1. Retrieves the browser from the session (already on Google Maps page)
    2. Skips navigation and consent handling (already done)
    3. Clicks Reviews tab and starts scraping
    4. Releases the session when done

    Saves 4-16 seconds compared to starting fresh.

    Failures are reported in the response body (success=false, error=...)
    rather than as an HTTP error status.
    """
    try:
        log.info(f"Scraping with session {request.session_id}...")

        # scrape_with_session is a synchronous call; run it in a worker thread
        # so the event loop is not blocked for the duration of the scrape.
        result = await asyncio.to_thread(
            scrape_with_session,
            session_id=request.session_id,
            max_reviews=request.max_reviews,
            sort_strategy=request.sort_strategy,
            initial_sort=request.initial_sort
        )

        return ScrapeWithSessionResponse(
            reviews=result.get("reviews", []),
            count=result.get("count", 0),
            total_reviews=result.get("total_reviews", 0),
            success=result.get("success", False),
            error=result.get("error"),
            time=result.get("time", 0),
            session_reused=result.get("session_reused", True),
            business_info=result.get("business_info", {})
        )

    except Exception as e:
        # log.exception keeps the traceback, which log.error discarded; without
        # it failures inside the scraper thread cannot be located.
        log.exception(f"Session scraping error: {e}")
        return ScrapeWithSessionResponse(
            success=False,
            error=str(e)
        )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=SessionListResponse, summary="List Active Sessions")
async def list_sessions():
    """
    List all active browser sessions.

    Returns information about each session including:
    - Business name
    - State (validated, scraping)
    - Time until expiration
    """
    # Ask the session manager for its stats and project the two keys the
    # response model exposes, defaulting when a key is absent.
    snapshot = get_session_manager().get_stats()
    session_count = snapshot.get("total_sessions", 0)
    session_entries = snapshot.get("sessions", [])
    return SessionListResponse(total_sessions=session_count, sessions=session_entries)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{session_id}", summary="Get Session Status")
async def get_session_status(session_id: str):
    """
    Get the status of a specific session.
    """
    found = get_session_manager().get_session(session_id)

    # Guard clause: a falsy lookup result is reported as 404.
    if not found:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found or expired")

    import time
    current_ts = time.time()

    # Descriptive fields first, then the two timings derived from the clock.
    status = {
        "session_id": found.session_id,
        "business": found.business_info.get("name", "unknown"),
        "state": found.state,
        "total_reviews": found.total_reviews,
        "url": found.url,
    }
    status["age_seconds"] = int(current_ts - found.created_at)
    status["ttl_remaining"] = int(found.expires_at - current_ts)
    return status
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/{session_id}", summary="Release Session")
async def release_session(session_id: str):
    """
    Manually release a session and close its browser.

    Use this if the user cancels before scraping.
    """
    manager = get_session_manager()

    # Only release sessions the manager still knows about; otherwise 404.
    if not manager.get_session(session_id):
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found or expired")

    manager.release_session(session_id, reason="manual_release")

    confirmation = f"Session {session_id} released"
    return {"success": True, "message": confirmation}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Helper to register router with main app
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def register_session_routes(app):
    """
    Attach this module's session router to a FastAPI application.

    Args:
        app: The application object; only its include_router() method is used.
    """
    app.include_router(router)
    # The router is declared with prefix="/sessions".
    log.info("Session routes registered at /sessions")
|
||||||
@@ -61,7 +61,9 @@ from api.routes import (
|
|||||||
dashboard_router, set_dashboard_db,
|
dashboard_router, set_dashboard_db,
|
||||||
admin_router, set_admin_db,
|
admin_router, set_admin_db,
|
||||||
pipelines_router, set_pipelines_db,
|
pipelines_router, set_pipelines_db,
|
||||||
|
reviewiq_analytics_router, set_reviewiq_analytics_db,
|
||||||
)
|
)
|
||||||
|
from api.routes.sessions import router as sessions_router
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@@ -110,6 +112,7 @@ async def lifespan(app: FastAPI):
|
|||||||
set_dashboard_db(db)
|
set_dashboard_db(db)
|
||||||
set_admin_db(db)
|
set_admin_db(db)
|
||||||
set_pipelines_db(db.pool) # Pipeline router uses raw asyncpg pool
|
set_pipelines_db(db.pool) # Pipeline router uses raw asyncpg pool
|
||||||
|
set_reviewiq_analytics_db(db.pool) # ReviewIQ analytics uses raw asyncpg pool
|
||||||
|
|
||||||
# Initialize health check system with canary monitoring
|
# Initialize health check system with canary monitoring
|
||||||
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
|
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
|
||||||
@@ -124,12 +127,15 @@ async def lifespan(app: FastAPI):
|
|||||||
|
|
||||||
# Start Chrome worker pools (1 for validation, 2 for scraping)
|
# Start Chrome worker pools (1 for validation, 2 for scraping)
|
||||||
# These pre-warm Chrome instances for instant availability
|
# These pre-warm Chrome instances for instant availability
|
||||||
# headless=False because Docker uses Xvfb virtual display for better compatibility
|
# In Docker: headless=False with Xvfb virtual display for better compatibility
|
||||||
|
# Locally: use CHROME_HEADLESS env var to control (default: headed for scraping)
|
||||||
|
is_docker = os.path.exists("/.dockerenv") or os.environ.get("DOCKER_CONTAINER", "false").lower() == "true"
|
||||||
|
chrome_headless = os.environ.get("CHROME_HEADLESS", "false").lower() == "true"
|
||||||
await asyncio.to_thread(
|
await asyncio.to_thread(
|
||||||
start_worker_pools,
|
start_worker_pools,
|
||||||
validation_size=1,
|
validation_size=1,
|
||||||
scraping_size=2,
|
scraping_size=2,
|
||||||
headless=False
|
headless=chrome_headless if not is_docker else False
|
||||||
)
|
)
|
||||||
log.info("Chrome worker pools started (1 validation + 2 scraping)")
|
log.info("Chrome worker pools started (1 validation + 2 scraping)")
|
||||||
|
|
||||||
@@ -172,6 +178,8 @@ app.include_router(batches_router)
|
|||||||
app.include_router(dashboard_router)
|
app.include_router(dashboard_router)
|
||||||
app.include_router(admin_router)
|
app.include_router(admin_router)
|
||||||
app.include_router(pipelines_router)
|
app.include_router(pipelines_router)
|
||||||
|
app.include_router(reviewiq_analytics_router)
|
||||||
|
app.include_router(sessions_router) # Session handoff for validation → scraping
|
||||||
|
|
||||||
|
|
||||||
# ==================== Request/Response Models ====================
|
# ==================== Request/Response Models ====================
|
||||||
@@ -220,6 +228,10 @@ class ScrapeRequest(BaseModel):
|
|||||||
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
|
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
|
||||||
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
|
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
|
||||||
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
|
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
|
||||||
|
# Testing options
|
||||||
|
max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
|
||||||
|
# Session handoff (v1.2.0) - reuse browser from validation
|
||||||
|
session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")
|
||||||
|
|
||||||
|
|
||||||
class GoogleReviewsScrapeRequest(BaseModel):
|
class GoogleReviewsScrapeRequest(BaseModel):
|
||||||
@@ -236,6 +248,10 @@ class GoogleReviewsScrapeRequest(BaseModel):
|
|||||||
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
|
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
|
||||||
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
|
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
|
||||||
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
|
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
|
||||||
|
# Testing options
|
||||||
|
max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
|
||||||
|
# Session handoff (v1.2.0) - reuse browser from validation
|
||||||
|
session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")
|
||||||
|
|
||||||
|
|
||||||
class JobResponse(BaseModel):
|
class JobResponse(BaseModel):
|
||||||
@@ -548,16 +564,21 @@ async def get_job(job_id: UUID):
|
|||||||
except:
|
except:
|
||||||
review_topics = None
|
review_topics = None
|
||||||
|
|
||||||
# Extract business info from metadata if available
|
# Read business info from dedicated columns (with fallback to metadata for older jobs)
|
||||||
metadata = job.get('metadata')
|
business_name = job.get('business_name')
|
||||||
if isinstance(metadata, str):
|
business_category = job.get('business_category')
|
||||||
try:
|
|
||||||
metadata = json.loads(metadata)
|
|
||||||
except:
|
|
||||||
metadata = None
|
|
||||||
|
|
||||||
business_name = metadata.get('business_name') if metadata else None
|
# Fallback to metadata for jobs created before migration
|
||||||
business_category = metadata.get('business_category') if metadata else None
|
if not business_name or not business_category:
|
||||||
|
metadata = job.get('metadata')
|
||||||
|
if isinstance(metadata, str):
|
||||||
|
try:
|
||||||
|
metadata = json.loads(metadata)
|
||||||
|
except:
|
||||||
|
metadata = None
|
||||||
|
if metadata:
|
||||||
|
business_name = business_name or metadata.get('business_name')
|
||||||
|
# Note: business_category was not previously stored in metadata
|
||||||
|
|
||||||
return JobResponse(
|
return JobResponse(
|
||||||
job_id=str(job['job_id']),
|
job_id=str(job['job_id']),
|
||||||
@@ -1051,17 +1072,22 @@ async def list_jobs(
|
|||||||
|
|
||||||
result = []
|
result = []
|
||||||
for job in jobs:
|
for job in jobs:
|
||||||
# Extract business info from metadata if available
|
# Read business info from dedicated columns (with fallback to metadata for older jobs)
|
||||||
metadata = job.get('metadata')
|
business_name = job.get('business_name')
|
||||||
if isinstance(metadata, str):
|
business_address = job.get('business_address')
|
||||||
try:
|
business_category = job.get('business_category')
|
||||||
metadata = json.loads(metadata)
|
|
||||||
except:
|
|
||||||
metadata = None
|
|
||||||
|
|
||||||
business_name = metadata.get('business_name') if metadata else None
|
# Fallback to metadata for jobs created before migration
|
||||||
business_address = metadata.get('business_address') if metadata else None
|
if not business_name:
|
||||||
business_category = metadata.get('business_category') if metadata else None
|
metadata = job.get('metadata')
|
||||||
|
if isinstance(metadata, str):
|
||||||
|
try:
|
||||||
|
metadata = json.loads(metadata)
|
||||||
|
except:
|
||||||
|
metadata = None
|
||||||
|
if metadata:
|
||||||
|
business_name = business_name or metadata.get('business_name')
|
||||||
|
business_address = business_address or metadata.get('business_address')
|
||||||
|
|
||||||
# Parse review_topics if it's a string
|
# Parse review_topics if it's a string
|
||||||
review_topics = job.get('review_topics')
|
review_topics = job.get('review_topics')
|
||||||
@@ -1191,6 +1217,193 @@ async def get_stats():
|
|||||||
return StatsResponse(**stats)
|
return StatsResponse(**stats)
|
||||||
|
|
||||||
|
|
||||||
|
# ==================== GBP Categories Endpoints ====================
|
||||||
|
|
||||||
|
@app.get("/categories", summary="Get GBP Categories")
async def get_categories(
    search: Optional[str] = Query(None, description="Search term for category name"),
    parent: Optional[str] = Query(None, description="Parent path (ltree) to filter children"),
    level: Optional[int] = Query(None, description="Category level (1-4)", ge=1, le=4),
    limit: int = Query(5000, description="Maximum number of results", ge=1, le=10000),
    offset: int = Query(0, description="Offset for pagination", ge=0),
):
    """
    List Google Business Profile categories with optional filters and paging.

    All filters are optional and are combined with AND:
    - search: case-insensitive substring match against the category name
    - parent: ltree path; rows under that path are returned, the path itself
      is excluded
    - level: hierarchy level (1=Sector, 2=Business Type, 3=Sub-category, 4=Category)

    Returns a dict holding the requested page of ``categories`` (ordered by
    path), the ``total`` count of rows matching the filters, and the
    ``limit`` / ``offset`` that were applied.

    Raises HTTPException(500) when the database pool is not available.
    """
    if not db or not db.pool:
        raise HTTPException(status_code=500, detail="Database not initialized")

    async with db.pool.acquire() as conn:
        filters = []
        args = []

        def bind(value):
            # Register a bound value and hand back its positional placeholder ($1, $2, ...).
            args.append(value)
            return f"${len(args)}"

        if search:
            pattern = f"%{search}%"
            filters.append(f"name ILIKE {bind(pattern)}")

        if parent:
            # The same placeholder is used twice: once for the subtree test,
            # once to exclude the parent row itself.
            parent_ph = bind(parent)
            filters.append(f"path <@ {parent_ph}::ltree AND path != {parent_ph}::ltree")

        if level:
            filters.append(f"level = {bind(level)}")

        where_clause = " AND ".join(filters) or "TRUE"

        # The count must run before limit/offset are bound: it takes only the filter values.
        total = await conn.fetchval(
            f"SELECT COUNT(*) FROM gbp_categories WHERE {where_clause}",
            *args,
        )

        limit_ph = bind(limit)
        offset_ph = bind(offset)
        page_query = f"""
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE {where_clause}
            ORDER BY path
            LIMIT {limit_ph} OFFSET {offset_ph}
        """
        records = await conn.fetch(page_query, *args)

        return {
            "categories": [dict(record) for record in records],
            "total": total,
            "limit": limit,
            "offset": offset,
        }
|
||||||
|
|
||||||
|
|
||||||
|
def _build_category_tree(categories, root=None, max_depth=4):
    """
    Nest a flat, path-ordered list of category dicts into a tree.

    Args:
        categories: Dicts that each carry a dotted ``path`` string.
        root: Path the listing was restricted to, or None for the whole
            table. Tree construction starts at the depth of this path, so a
            multi-segment root yields that node as the single top-level entry.
        max_depth: Number of levels to emit, counted from the top of the
            returned tree (top-level nodes are depth 1).

    Returns:
        A list of top-level nodes. Each node is the category dict plus a
        ``children`` key holding the nested list, or None when it has no
        children within ``max_depth``.
    """
    top_depth = len(root.split('.')) if root else 1

    # Index each category under its parent path once, instead of rescanning
    # the full list for every node. Input order is preserved per parent.
    top_level = []
    children_of = {}
    for cat in categories:
        parts = cat['path'].split('.')
        if len(parts) == top_depth:
            top_level.append(cat)
        elif len(parts) > top_depth:
            children_of.setdefault('.'.join(parts[:-1]), []).append(cat)

    def attach(nodes, depth):
        if depth > max_depth:
            return []
        nested = []
        for cat in nodes:
            children = attach(children_of.get(cat['path'], []), depth + 1)
            nested.append({**cat, 'children': children if children else None})
        return nested

    return attach(top_level, 1)


@app.get("/categories/tree", summary="Get GBP Categories Tree")
async def get_categories_tree(
    root: Optional[str] = Query(None, description="Root path to start the tree from"),
    max_depth: int = Query(4, description="Maximum depth of the tree", ge=1, le=4),
):
    """
    Get categories as a hierarchical tree structure.

    Returns nested categories starting from ``root`` (or from every
    single-segment path when ``root`` is not given). ``max_depth`` is counted
    from the top of the returned tree. ``total`` is the number of rows
    fetched, including rows deeper than ``max_depth``.

    Previously a multi-segment ``root`` produced an empty tree because the
    builder only looked for single-segment paths at the top level; the tree
    is now anchored at the depth of ``root``.

    Raises HTTPException(500) when the database pool is not available.
    """
    if not db or not db.pool:
        raise HTTPException(status_code=500, detail="Database not initialized")

    async with db.pool.acquire() as conn:
        if root:
            # Get subtree starting from root (root row itself included)
            query = """
                SELECT id, name, slug, path::text as path, level, parent_id, category_count
                FROM gbp_categories
                WHERE path <@ $1::ltree
                ORDER BY path
            """
            rows = await conn.fetch(query, root)
        else:
            # Get all categories
            query = """
                SELECT id, name, slug, path::text as path, level, parent_id, category_count
                FROM gbp_categories
                ORDER BY path
            """
            rows = await conn.fetch(query)

        categories = [dict(row) for row in rows]
        tree = _build_category_tree(categories, root, max_depth)

        return {
            "tree": tree,
            "total": len(categories),
        }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/categories/{path:path}", summary="Get Category by Path")
async def get_category_by_path(path: str):
    """
    Look up one category by its ltree path.

    The response bundles the matching ``category`` with its ``ancestors``
    (ordered by path, the category itself excluded) and its direct
    ``children`` (ordered by name).

    Raises HTTPException(500) when the database pool is not available and
    HTTPException(404) when no category has the given path.
    """
    if not (db and db.pool):
        raise HTTPException(status_code=500, detail="Database not initialized")

    async with db.pool.acquire() as conn:
        # The category itself
        match = await conn.fetchrow("""
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE path = $1::ltree
        """, path)

        if not match:
            raise HTTPException(status_code=404, detail="Category not found")

        # Every row whose path contains the requested one, minus the row itself
        ancestor_rows = await conn.fetch("""
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE path @> $1::ltree AND path != $1::ltree
            ORDER BY path
        """, path)

        # Exactly one extra label below the requested path
        child_rows = await conn.fetch("""
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE path ~ ($1 || '.*{1}')::lquery
            ORDER BY name
        """, path)

        response = {"category": dict(match)}
        response["ancestors"] = list(map(dict, ancestor_rows))
        response["children"] = list(map(dict, child_rows))
        return response
|
||||||
|
|
||||||
|
|
||||||
@app.get("/pool-stats", summary="Get Worker Pool Statistics")
|
@app.get("/pool-stats", summary="Get Worker Pool Statistics")
|
||||||
async def pool_stats():
|
async def pool_stats():
|
||||||
"""Get Chrome worker pool statistics"""
|
"""Get Chrome worker pool statistics"""
|
||||||
@@ -1331,10 +1544,82 @@ async def get_crash_report(job_id: UUID):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Available sort orders for retry strategy
SORT_ORDERS = ["newest", "lowest", "highest", "relevant"]

# Fingerprint rotation for retry - realistic browser profiles to avoid bot detection
import random  # NOTE(review): not referenced in the visible code - confirm before removing

FINGERPRINT_PROFILES = [
    {
        "platform": "MacIntel",
        "timezone": "Europe/Madrid",
        "language": "es-ES",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "viewport": {"width": 1440, "height": 900}
    },
    {
        "platform": "Win32",
        "timezone": "Europe/London",
        "language": "en-GB",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "viewport": {"width": 1920, "height": 1080}
    },
    {
        "platform": "MacIntel",
        "timezone": "America/New_York",
        "language": "en-US",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "viewport": {"width": 1680, "height": 1050}
    },
    {
        "platform": "Win32",
        "timezone": "Europe/Paris",
        "language": "fr-FR",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "viewport": {"width": 1366, "height": 768}
    },
    {
        "platform": "MacIntel",
        "timezone": "Europe/Berlin",
        "language": "de-DE",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "viewport": {"width": 1512, "height": 982}
    },
]


def get_rotated_fingerprint(retry_attempt: int = 0, previous_fingerprints: list = None) -> dict:
    """
    Get a fingerprint profile for retry, avoiding previously used ones.

    Profiles are excluded by their ``platform`` value. The table above only
    has two distinct platforms, so once both appear in
    ``previous_fingerprints`` the full table is used again.

    Args:
        retry_attempt: Which retry attempt this is (0-indexed); used modulo
            the number of candidate profiles to pick one deterministically.
        previous_fingerprints: List of previously used fingerprint platforms.
            None is treated as an empty list.

    Returns:
        An independent deep copy of the chosen profile, so callers may mutate
        it (including the nested ``viewport`` dict) without altering
        FINGERPRINT_PROFILES.
    """
    import copy

    previous_fingerprints = previous_fingerprints or []

    # Filter out previously used profiles
    available = [fp for fp in FINGERPRINT_PROFILES
                 if fp["platform"] not in previous_fingerprints]

    # If all used, cycle back
    if not available:
        available = FINGERPRINT_PROFILES

    # Select based on retry attempt (deterministic but varied)
    selected = available[retry_attempt % len(available)]

    # A shallow .copy() would still share the nested viewport dict with the
    # module-level table; deep-copy so the table cannot be corrupted.
    return copy.deepcopy(selected)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/jobs/{job_id}/retry", response_model=RetryJobResponse, summary="Retry Failed Job")
|
@app.post("/jobs/{job_id}/retry", response_model=RetryJobResponse, summary="Retry Failed Job")
|
||||||
async def retry_job(
|
async def retry_job(
|
||||||
job_id: UUID,
|
job_id: UUID,
|
||||||
apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis")
|
apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis"),
|
||||||
|
next_sort: bool = Query(False, description="Use a different sort order than the original job (for partial jobs)")
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Retry a failed or partial job, optionally applying auto-fix parameters.
|
Retry a failed or partial job, optionally applying auto-fix parameters.
|
||||||
@@ -1344,6 +1629,11 @@ async def retry_job(
|
|||||||
- Applies recommended parameter adjustments (e.g., reduced batch size for memory issues)
|
- Applies recommended parameter adjustments (e.g., reduced batch size for memory issues)
|
||||||
- Creates a new job with the adjusted parameters
|
- Creates a new job with the adjusted parameters
|
||||||
|
|
||||||
|
When next_sort=true:
|
||||||
|
- Uses a different sort order than previously attempted
|
||||||
|
- Helps get different reviews when stuck at ~1000 limit
|
||||||
|
- Tracks sort_orders_attempted for review merging
|
||||||
|
|
||||||
Returns the new job ID for tracking.
|
Returns the new job ID for tracking.
|
||||||
"""
|
"""
|
||||||
if not db:
|
if not db:
|
||||||
@@ -1418,6 +1708,72 @@ async def retry_job(
|
|||||||
applied_fixes = analysis.auto_fix_params
|
applied_fixes = analysis.auto_fix_params
|
||||||
log.info(f"Applying auto-fix for pattern '{analysis.pattern}': {applied_fixes}")
|
log.info(f"Applying auto-fix for pattern '{analysis.pattern}': {applied_fixes}")
|
||||||
|
|
||||||
|
# Handle next_sort: use a different sort order than previously attempted
|
||||||
|
selected_sort = None
|
||||||
|
if next_sort:
|
||||||
|
# Get previously attempted sort orders
|
||||||
|
sort_orders_attempted = original_metadata.get('sort_orders_attempted', [])
|
||||||
|
|
||||||
|
# If no sort was tracked, assume "newest" was used (default)
|
||||||
|
if not sort_orders_attempted:
|
||||||
|
initial_sort_used = original_metadata.get('initial_sort_used', 'newest')
|
||||||
|
sort_orders_attempted = [initial_sort_used]
|
||||||
|
|
||||||
|
# Find next unused sort order
|
||||||
|
for sort_order in SORT_ORDERS:
|
||||||
|
if sort_order not in sort_orders_attempted:
|
||||||
|
selected_sort = sort_order
|
||||||
|
break
|
||||||
|
|
||||||
|
if selected_sort:
|
||||||
|
# Set the new sort strategy
|
||||||
|
original_metadata['initial_sort'] = selected_sort
|
||||||
|
original_metadata['sort_strategy'] = 'single' # Don't auto-trigger multi-sort
|
||||||
|
|
||||||
|
# Track all attempted sorts (including this one)
|
||||||
|
original_metadata['sort_orders_attempted'] = sort_orders_attempted + [selected_sort]
|
||||||
|
|
||||||
|
# Track retry chain for review merging
|
||||||
|
if 'retry_chain' not in original_metadata:
|
||||||
|
original_metadata['retry_chain'] = [str(job_id)]
|
||||||
|
else:
|
||||||
|
original_metadata['retry_chain'].append(str(job_id))
|
||||||
|
|
||||||
|
original_metadata['retry_info'] = original_metadata.get('retry_info', {})
|
||||||
|
original_metadata['retry_info']['original_job_id'] = str(job_id)
|
||||||
|
original_metadata['retry_info']['retry_reason'] = 'next_sort'
|
||||||
|
original_metadata['retry_info']['selected_sort'] = selected_sort
|
||||||
|
|
||||||
|
log.info(f"Retry with next_sort: using '{selected_sort}' (previously tried: {sort_orders_attempted})")
|
||||||
|
else:
|
||||||
|
log.warn(f"All sort orders already attempted: {sort_orders_attempted}")
|
||||||
|
|
||||||
|
# Fingerprint rotation: if bot was detected, use a different fingerprint
|
||||||
|
selected_fingerprint = None
|
||||||
|
if next_sort and original_metadata.get('bot_detected', False):
|
||||||
|
# Get previously used fingerprints
|
||||||
|
previous_fingerprints = original_metadata.get('fingerprints_used', [])
|
||||||
|
retry_count = len(original_metadata.get('retry_chain', []))
|
||||||
|
|
||||||
|
# Get a rotated fingerprint
|
||||||
|
selected_fingerprint = get_rotated_fingerprint(retry_count, previous_fingerprints)
|
||||||
|
|
||||||
|
# Store the fingerprint in metadata
|
||||||
|
original_metadata['browser_fingerprint'] = selected_fingerprint
|
||||||
|
|
||||||
|
# Track used fingerprints
|
||||||
|
if 'fingerprints_used' not in original_metadata:
|
||||||
|
original_metadata['fingerprints_used'] = []
|
||||||
|
original_metadata['fingerprints_used'].append(selected_fingerprint['platform'])
|
||||||
|
|
||||||
|
original_metadata['retry_info']['fingerprint_rotated'] = True
|
||||||
|
original_metadata['retry_info']['new_fingerprint'] = {
|
||||||
|
'platform': selected_fingerprint['platform'],
|
||||||
|
'timezone': selected_fingerprint['timezone']
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info(f"Fingerprint rotated for retry: {selected_fingerprint['platform']}, {selected_fingerprint['timezone']}")
|
||||||
|
|
||||||
# Create new job with same URL and (possibly modified) metadata
|
# Create new job with same URL and (possibly modified) metadata
|
||||||
new_job_id = await db.create_job(
|
new_job_id = await db.create_job(
|
||||||
url=original_job['url'],
|
url=original_job['url'],
|
||||||
@@ -1431,11 +1787,28 @@ async def retry_job(
|
|||||||
|
|
||||||
log.info(f"Created retry job {new_job_id} for original job {job_id}")
|
log.info(f"Created retry job {new_job_id} for original job {job_id}")
|
||||||
|
|
||||||
|
# Build response message
|
||||||
|
message = f"Retry job created from original job {job_id}"
|
||||||
|
if selected_sort:
|
||||||
|
message += f" (using sort: {selected_sort})"
|
||||||
|
if selected_fingerprint:
|
||||||
|
message += f" (fingerprint: {selected_fingerprint['platform']}/{selected_fingerprint['timezone']})"
|
||||||
|
|
||||||
|
# Build applied_fixes response
|
||||||
|
retry_fixes = {}
|
||||||
|
if selected_sort:
|
||||||
|
retry_fixes["selected_sort"] = selected_sort
|
||||||
|
if selected_fingerprint:
|
||||||
|
retry_fixes["fingerprint"] = {
|
||||||
|
"platform": selected_fingerprint["platform"],
|
||||||
|
"timezone": selected_fingerprint["timezone"]
|
||||||
|
}
|
||||||
|
|
||||||
return RetryJobResponse(
|
return RetryJobResponse(
|
||||||
job_id=str(new_job_id),
|
job_id=str(new_job_id),
|
||||||
status="started",
|
status="started",
|
||||||
message=f"Retry job created from original job {job_id}",
|
message=message,
|
||||||
applied_fixes=applied_fixes
|
applied_fixes=applied_fixes if applied_fixes else (retry_fixes if retry_fixes else None)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -1529,8 +1902,9 @@ async def liveness():
|
|||||||
|
|
||||||
Use this for Kubernetes liveness probe - restart container if fails.
|
Use this for Kubernetes liveness probe - restart container if fails.
|
||||||
"""
|
"""
|
||||||
|
# If health system is disabled, just return healthy (server is alive)
|
||||||
if not health_system:
|
if not health_system:
|
||||||
raise HTTPException(status_code=503, detail="Health system not initialized")
|
return {"status": "healthy", "message": "Server is alive (health system disabled)"}
|
||||||
|
|
||||||
return await health_system.check_liveness()
|
return await health_system.check_liveness()
|
||||||
|
|
||||||
@@ -1542,8 +1916,12 @@ async def readiness():
|
|||||||
|
|
||||||
Use this for Kubernetes readiness probe - remove from load balancer if fails.
|
Use this for Kubernetes readiness probe - remove from load balancer if fails.
|
||||||
"""
|
"""
|
||||||
|
# If health system is disabled, check if DB is connected
|
||||||
if not health_system:
|
if not health_system:
|
||||||
raise HTTPException(status_code=503, detail="Health system not initialized")
|
if db and db.pool:
|
||||||
|
return {"status": "ready", "message": "Server is ready (health system disabled)"}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=503, detail="Database not connected")
|
||||||
|
|
||||||
result = await health_system.check_readiness()
|
result = await health_system.check_readiness()
|
||||||
|
|
||||||
@@ -1728,17 +2106,67 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
scraper_func, actual_version = get_scraper_for_version(requested_version)
|
scraper_func, actual_version = get_scraper_for_version(requested_version)
|
||||||
log.info(f"Using scraper version {actual_version} for job {job_id}")
|
log.info(f"Using scraper version {actual_version} for job {job_id}")
|
||||||
|
|
||||||
# Run scraping with progress callback and shared log capture
|
# Get sort strategy parameters from metadata (for retry with different sort)
|
||||||
# headless=False because Docker uses Xvfb virtual display
|
initial_sort = metadata.get('initial_sort') if metadata else None
|
||||||
result = await asyncio.to_thread(
|
sort_strategy = metadata.get('sort_strategy', 'auto') if metadata else 'auto'
|
||||||
scraper_func,
|
max_reviews = metadata.get('max_reviews') if metadata else None
|
||||||
url=url,
|
session_id = metadata.get('session_id') if metadata else None
|
||||||
headless=False,
|
if initial_sort:
|
||||||
progress_callback=progress_callback,
|
log.info(f"Using initial_sort={initial_sort}, sort_strategy={sort_strategy} for job {job_id}")
|
||||||
log_capture=log_capture,
|
if max_reviews:
|
||||||
flush_callback=flush_callback,
|
log.info(f"Using max_reviews={max_reviews} limit for job {job_id} (testing mode)")
|
||||||
browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
|
|
||||||
)
|
# Check if we have a session_id for browser reuse (session handoff from validation)
|
||||||
|
if session_id:
|
||||||
|
log.info(f"Using session handoff (session_id={session_id}) for job {job_id} - skipping navigation")
|
||||||
|
from scrapers.google_reviews.v1_2_0 import scrape_with_session
|
||||||
|
result = await asyncio.to_thread(
|
||||||
|
scrape_with_session,
|
||||||
|
session_id=session_id,
|
||||||
|
max_reviews=max_reviews,
|
||||||
|
progress_callback=progress_callback,
|
||||||
|
flush_callback=flush_callback,
|
||||||
|
sort_strategy=sort_strategy,
|
||||||
|
initial_sort=initial_sort
|
||||||
|
)
|
||||||
|
# Add logs from session scraping
|
||||||
|
if 'logs' in result:
|
||||||
|
for log_entry in result.get('logs', []):
|
||||||
|
log_capture.entries.append(log_entry)
|
||||||
|
else:
|
||||||
|
# Run scraping with progress callback and shared log capture
|
||||||
|
# headless=False because Docker uses Xvfb virtual display
|
||||||
|
result = await asyncio.to_thread(
|
||||||
|
scraper_func,
|
||||||
|
url=url,
|
||||||
|
headless=False,
|
||||||
|
progress_callback=progress_callback,
|
||||||
|
log_capture=log_capture,
|
||||||
|
flush_callback=flush_callback,
|
||||||
|
browser_fingerprint=browser_fingerprint, # Pass user's browser fingerprint
|
||||||
|
initial_sort=initial_sort, # Sort order for retry strategy
|
||||||
|
sort_strategy=sort_strategy, # Sort strategy (auto, multi, single)
|
||||||
|
max_reviews=max_reviews # Optional limit for testing
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update job metadata with tracking info from scraper result
|
||||||
|
tracking_metadata = {
|
||||||
|
'bot_detected': result.get('bot_detected', False),
|
||||||
|
'initial_sort_used': result.get('initial_sort_used', 'newest'),
|
||||||
|
'multi_sort': result.get('multi_sort', {}),
|
||||||
|
}
|
||||||
|
# Preserve existing sort_orders_attempted and add current sort
|
||||||
|
existing_sorts = metadata.get('sort_orders_attempted', []) if metadata else []
|
||||||
|
current_sort = result.get('initial_sort_used', 'newest')
|
||||||
|
if current_sort not in existing_sorts:
|
||||||
|
tracking_metadata['sort_orders_attempted'] = existing_sorts + [current_sort]
|
||||||
|
else:
|
||||||
|
tracking_metadata['sort_orders_attempted'] = existing_sorts
|
||||||
|
|
||||||
|
# Update metadata in database
|
||||||
|
await db.update_job_metadata(job_id, tracking_metadata)
|
||||||
|
if result.get('bot_detected'):
|
||||||
|
log.warn(f"Bot detection flagged for job {job_id} - sort button was hidden")
|
||||||
|
|
||||||
if result['success']:
|
if result['success']:
|
||||||
# Save session fingerprint if captured
|
# Save session fingerprint if captured
|
||||||
@@ -1746,6 +2174,18 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
await db.update_session_fingerprint(job_id, result['session_fingerprint'])
|
await db.update_session_fingerprint(job_id, result['session_fingerprint'])
|
||||||
log.info(f"Saved session fingerprint for job {job_id}")
|
log.info(f"Saved session fingerprint for job {job_id}")
|
||||||
|
|
||||||
|
# Save business info to dedicated columns (queryable/indexable)
|
||||||
|
business_info = result.get('business_info', {})
|
||||||
|
if business_info:
|
||||||
|
await db.update_business_info(
|
||||||
|
job_id=job_id,
|
||||||
|
business_name=business_info.get('name'),
|
||||||
|
business_category=business_info.get('category'),
|
||||||
|
business_address=business_info.get('address'),
|
||||||
|
business_rating=business_info.get('rating')
|
||||||
|
)
|
||||||
|
log.info(f"Saved business info for job {job_id}: {business_info.get('name')} ({business_info.get('category')})")
|
||||||
|
|
||||||
# Save results to database (including scraper logs and review topics)
|
# Save results to database (including scraper logs and review topics)
|
||||||
await db.save_job_result(
|
await db.save_job_result(
|
||||||
job_id=job_id,
|
job_id=job_id,
|
||||||
|
|||||||
@@ -354,7 +354,11 @@ class DatabaseManager:
|
|||||||
callback_status,
|
callback_status,
|
||||||
callback_attempts,
|
callback_attempts,
|
||||||
scraper_version,
|
scraper_version,
|
||||||
scraper_variant
|
scraper_variant,
|
||||||
|
business_name,
|
||||||
|
business_category,
|
||||||
|
business_address,
|
||||||
|
business_rating
|
||||||
FROM jobs
|
FROM jobs
|
||||||
WHERE job_id = $1
|
WHERE job_id = $1
|
||||||
""", job_id)
|
""", job_id)
|
||||||
@@ -575,6 +579,69 @@ class DatabaseManager:
|
|||||||
|
|
||||||
log.debug(f"Updated session fingerprint for job {job_id}")
|
log.debug(f"Updated session fingerprint for job {job_id}")
|
||||||
|
|
||||||
|
async def update_job_metadata(
    self,
    job_id: UUID,
    metadata_updates: Dict[str, Any]
):
    """
    Merge fields into a job's JSONB metadata column.

    The merge is a JSONB concatenation: keys supplied in
    ``metadata_updates`` are added or replace same-named top-level keys,
    and every other existing key is kept. A NULL metadata column is treated
    as an empty object. ``updated_at`` is refreshed as part of the same
    statement.

    Args:
        job_id: Job UUID
        metadata_updates: Dictionary of metadata fields to update/add, e.g.
            - bot_detected: True if sort button was hidden (bot detection)
            - initial_sort_used: Sort order used for scraping
            - sort_orders_attempted: List of all sort orders tried
            - multi_sort: Multi-sort completion info
    """
    merge_statement = """
        UPDATE jobs
        SET
            metadata = COALESCE(metadata, '{}'::jsonb) || $2::jsonb,
            updated_at = NOW()
        WHERE job_id = $1
    """

    async with self.pool.acquire() as conn:
        # The updates are sent as a JSON string and cast to jsonb by the statement.
        payload = json.dumps(metadata_updates)
        await conn.execute(merge_statement, job_id, payload)

    touched_keys = list(metadata_updates.keys())
    log.debug(f"Updated job metadata for job {job_id}: {touched_keys}")
|
||||||
|
|
||||||
|
async def update_business_info(
    self,
    job_id: UUID,
    business_name: Optional[str] = None,
    business_category: Optional[str] = None,
    business_address: Optional[str] = None,
    business_rating: Optional[float] = None
):
    """
    Write business details into a job's dedicated business columns.

    These are plain columns (not JSONB) so the business data captured from
    the Google Maps page during scraping can be queried directly. Each
    column is only overwritten when a non-None value is passed; a None
    argument keeps whatever is already stored.

    Args:
        job_id: Job UUID
        business_name: Business name from Google Maps
        business_category: Business category (e.g., "Restaurant", "Toy store")
        business_address: Full address from Google Maps
        business_rating: Aggregate rating at time of scrape (e.g., 4.5)
    """
    statement = """
        UPDATE jobs
        SET
            business_name = COALESCE($2, business_name),
            business_category = COALESCE($3, business_category),
            business_address = COALESCE($4, business_address),
            business_rating = COALESCE($5, business_rating),
            updated_at = NOW()
        WHERE job_id = $1
    """
    # Positional order must line up with $1..$5 in the statement above.
    bound_values = (
        job_id,
        business_name,
        business_category,
        business_address,
        business_rating,
    )

    async with self.pool.acquire() as conn:
        await conn.execute(statement, *bound_values)

    log.debug(f"Updated business info for job {job_id}: name={business_name}, category={business_category}")
|
||||||
|
|
||||||
async def mark_job_partial(
|
async def mark_job_partial(
|
||||||
self,
|
self,
|
||||||
job_id: UUID,
|
job_id: UUID,
|
||||||
@@ -674,7 +741,11 @@ class DatabaseManager:
|
|||||||
callback_status,
|
callback_status,
|
||||||
callback_attempts,
|
callback_attempts,
|
||||||
scraper_version,
|
scraper_version,
|
||||||
scraper_variant
|
scraper_variant,
|
||||||
|
business_name,
|
||||||
|
business_category,
|
||||||
|
business_address,
|
||||||
|
business_rating
|
||||||
FROM jobs
|
FROM jobs
|
||||||
{where_clause}
|
{where_clause}
|
||||||
ORDER BY created_at DESC
|
ORDER BY created_at DESC
|
||||||
|
|||||||
202
db/apply_recategorization.py
Normal file
202
db/apply_recategorization.py
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Apply the hierarchical recategorization to the database.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Gets all items currently in Other.Uncategorized
|
||||||
|
2. Applies the categorization rules
|
||||||
|
3. Updates the database with new paths
|
||||||
|
4. Creates new level 2/3 categories as needed
|
||||||
|
5. Updates category counts
|
||||||
|
"""
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Import categorization functions.
# The sibling module is located relative to this file rather than through a
# hard-coded absolute path, so the script works from any checkout location.
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from recategorize_hierarchical import get_sector_for_item, get_business_type_for_item

# NOTE(review): connection string (including credentials) is embedded in the
# script - consider reading it from the environment.
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||||
|
|
||||||
|
def slugify(text):
    """
    Turn free text into an underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped; each run of whitespace and/or hyphens becomes a single
    underscore; leading and trailing underscores are removed.
    """
    cleaned = re.sub(r'[^\w\s-]', '', text)
    return re.sub(r'[-\s]+', '_', cleaned).strip('_')
|
||||||
|
|
||||||
|
def _insert_category(cursor, name, slug, path, level, parent_path):
    """Insert one category row beneath ``parent_path`` and return its new id.

    Returns None when no row was inserted: either ``path`` already exists
    (ON CONFLICT DO NOTHING) or no row matches ``parent_path``.
    """
    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))
    row = cursor.fetchone()
    return row[0] if row else None


def main():
    """Interactively move level-4 items out of Other.Uncategorized.

    Steps:
      1. Load the level-4 rows under Other.Uncategorized and all existing paths.
      2. Classify each item with get_sector_for_item / get_business_type_for_item
         and print the per-sector tally.
      3. After the user types "yes", move every item whose sector is not 'Other'
         to ``<sector>.<business type>.General.<slug>``, creating the level 2/3
         rows when they are missing.
      4. Recompute category_count, commit, and print a summary.

    Each item is processed inside its own savepoint: a failing statement would
    otherwise abort the whole transaction and make every later statement fail.
    The connection is always closed, including on errors and on abort.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cursor = conn.cursor()

        # Get all items in Other.Uncategorized
        cursor.execute("""
            SELECT id, name, slug
            FROM gbp_categories
            WHERE path ~ 'Other.Uncategorized.*' AND level = 4
            ORDER BY name
        """)
        other_items = cursor.fetchall()
        print(f"Found {len(other_items)} items in Other.Uncategorized")

        # Get existing paths (path text -> row id)
        cursor.execute("SELECT path::text, id FROM gbp_categories")
        existing_paths = {row[0]: row[1] for row in cursor.fetchall()}
        print(f"Found {len(existing_paths)} existing paths")

        # Categorize items
        moves = []  # (item_id, item_name, item_slug, new_sector, new_btype)
        stats = defaultdict(int)

        for item_id, name, slug in other_items:
            sector = get_sector_for_item(name)
            btype = get_business_type_for_item(name, sector)

            if sector != 'Other':
                moves.append((item_id, name, slug, sector, btype))
                stats[sector] += 1
            else:
                stats['Still_Other'] += 1

        print(f"\nCategorization results:")
        for sector, count in sorted(stats.items(), key=lambda x: -x[1]):
            print(f" {sector}: {count}")

        print(f"\nTotal to move: {len(moves)}")
        print(f"Remaining in Other: {stats.get('Still_Other', 0)}")

        # Ask for confirmation; anything other than "yes" leaves the DB untouched
        response = input("\nProceed with database updates? (yes/no): ")
        if response.lower() != 'yes':
            print("Aborted.")
            return

        # Process moves
        created_paths = set()
        updated = 0
        errors = []

        for item_id, name, slug, sector, btype in moves:
            # Rows inserted for this item: (path, id, label). They are published to
            # existing_paths/created_paths only once the item's savepoint is released,
            # so a rolled-back insert is never remembered as existing.
            pending = []
            cursor.execute("SAVEPOINT recategorize_item")
            try:
                sector_slug = slugify(sector)
                btype_slug = slugify(btype)

                # Check if sector exists (sectors are never created here)
                sector_path = sector_slug
                if sector_path not in existing_paths:
                    print(f" [ERROR] Sector not found: {sector_path} for '{name}'")
                    errors.append((name, f"Sector not found: {sector_path}"))
                    cursor.execute("RELEASE SAVEPOINT recategorize_item")
                    continue

                # Check/create business type (level 2)
                btype_path = f"{sector_path}.{btype_slug}"
                if btype_path not in existing_paths and btype_path not in created_paths:
                    new_id = _insert_category(cursor, btype, btype_slug, btype_path, 2, sector_path)
                    if new_id is not None:
                        pending.append((btype_path, new_id, "business type"))

                # Check/create sub-category (level 3) - use "General" as default
                subcat = "General"
                subcat_slug = "General"
                subcat_path = f"{btype_path}.{subcat_slug}"
                if subcat_path not in existing_paths and subcat_path not in created_paths:
                    new_id = _insert_category(cursor, subcat, subcat_slug, subcat_path, 3, btype_path)
                    if new_id is not None:
                        pending.append((subcat_path, new_id, "sub-category"))

                # Update the item's path and re-parent it under the sub-category
                new_path = f"{subcat_path}.{slug}"
                cursor.execute("""
                    UPDATE gbp_categories
                    SET path = %s::ltree,
                        parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
                    WHERE id = %s
                """, (new_path, subcat_path, item_id))
                cursor.execute("RELEASE SAVEPOINT recategorize_item")
            except Exception as e:
                # Undo only this item's statements so the transaction stays usable.
                cursor.execute("ROLLBACK TO SAVEPOINT recategorize_item")
                cursor.execute("RELEASE SAVEPOINT recategorize_item")
                errors.append((name, str(e)))
                print(f" [ERROR] {name}: {e}")
                continue

            for path, new_id, label in pending:
                existing_paths[path] = new_id
                created_paths.add(path)
                print(f" [NEW] Created {label}: {path}")
            updated += 1

        # Update category counts (number of direct children per parent)
        print("\nUpdating category counts...")
        cursor.execute("""
            WITH counts AS (
                SELECT
                    parent_id,
                    COUNT(*) as cnt
                FROM gbp_categories
                WHERE parent_id IS NOT NULL
                GROUP BY parent_id
            )
            UPDATE gbp_categories g
            SET category_count = COALESCE(c.cnt, 0)
            FROM counts c
            WHERE g.id = c.parent_id
        """)

        # Also reset counts for categories that no longer have children
        cursor.execute("""
            UPDATE gbp_categories
            SET category_count = 0
            WHERE id NOT IN (
                SELECT DISTINCT parent_id FROM gbp_categories WHERE parent_id IS NOT NULL
            )
            AND level < 4
        """)

        conn.commit()

        print(f"\n{'='*60}")
        print(f"SUMMARY")
        print(f"{'='*60}")
        print(f"Items moved: {updated}")
        print(f"New paths created: {len(created_paths)}")
        print(f"Errors: {len(errors)}")

        if errors:
            print("\nErrors:")
            # Only the first ten errors are listed to keep the output readable
            for name, err in errors[:10]:
                print(f" - {name}: {err}")
            if len(errors) > 10:
                print(f" ... and {len(errors) - 10} more")

        # Show final stats
        cursor.execute("""
            SELECT
                SPLIT_PART(path::text, '.', 1) as sector,
                COUNT(*) as count
            FROM gbp_categories
            WHERE level = 4
            GROUP BY sector
            ORDER BY count DESC
        """)
        print("\nFinal category distribution:")
        for sector, count in cursor.fetchall():
            print(f" {sector}: {count}")
    finally:
        # Closing without a commit discards any uncommitted work.
        conn.close()
|
||||||
|
|
||||||
|
# Run the interactive recategorization only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()
|
||||||
977
db/import_categories.py
Normal file
977
db/import_categories.py
Normal file
@@ -0,0 +1,977 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Import Google Business Profile categories into PostgreSQL with ltree hierarchy.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python import_categories.py [--csv-path PATH] [--db-url URL]
|
||||||
|
|
||||||
|
Example:
|
||||||
|
python import_categories.py --csv-path ./categories.csv --db-url postgresql://scraper:scraper123@localhost:5437/scraper
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import execute_values
|
||||||
|
HAS_PSYCOPG2 = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_PSYCOPG2 = False
|
||||||
|
|
||||||
|
# Default paths
|
||||||
|
DEFAULT_CSV_PATH = os.path.expanduser("~/Downloads/Google Business Profile Categories (2025 List) - Category List (English).csv")
|
||||||
|
DEFAULT_DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(text: str) -> str:
    """Build an ltree-safe label from arbitrary text.

    Every run of characters outside ``[a-zA-Z0-9]`` collapses to a single
    underscore and edge underscores are trimmed. An empty result becomes
    ``'unknown'``; a result that does not begin with a letter is prefixed
    with ``'cat_'``.
    """
    label = re.sub(r'[^a-zA-Z0-9]+', '_', text).strip('_')
    if not label:
        return 'unknown'
    if label[0].isalpha():
        return label
    # Labels must start with a letter, so prefix the ones that do not.
    return 'cat_' + label
|
||||||
|
|
||||||
|
|
||||||
|
def categorize_category(cat: str) -> tuple:
|
||||||
|
"""
|
||||||
|
Categorize a GBP category into 4-level hierarchy.
|
||||||
|
Returns: (level1, level2, level3, level4)
|
||||||
|
"""
|
||||||
|
c = cat.lower()
|
||||||
|
|
||||||
|
# === FOOD & DINING ===
|
||||||
|
if 'restaurant' in c:
|
||||||
|
if any(x in c for x in ['fast food', 'drive-in', 'takeaway', 'takeout', 'quick service']):
|
||||||
|
return ("Food & Dining", "Restaurants", "Fast Food & Quick Service", cat)
|
||||||
|
# Cuisine types
|
||||||
|
return ("Food & Dining", "Restaurants", "By Cuisine", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['cafe', 'coffee shop', 'tea house', 'tea room', 'espresso bar']):
|
||||||
|
return ("Food & Dining", "Cafes & Coffee", "Coffee Shops", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['bar', 'pub', 'nightclub', 'night club', 'cocktail', 'wine bar', 'beer', 'lounge']):
|
||||||
|
if 'gay' in c or 'lesbian' in c:
|
||||||
|
return ("Food & Dining", "Bars & Nightlife", "LGBTQ+ Venues", cat)
|
||||||
|
if 'karaoke' in c:
|
||||||
|
return ("Food & Dining", "Bars & Nightlife", "Karaoke", cat)
|
||||||
|
return ("Food & Dining", "Bars & Nightlife", "Bars & Pubs", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['bakery', 'pastry', 'cake', 'donut', 'dessert', 'ice cream', 'frozen yogurt', 'candy', 'chocolate', 'confection']):
|
||||||
|
return ("Food & Dining", "Bakeries & Desserts", "Sweet Shops", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['caterer', 'catering']):
|
||||||
|
return ("Food & Dining", "Food Services", "Catering", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['brewery', 'winery', 'distillery', 'vineyard']):
|
||||||
|
return ("Food & Dining", "Beverage Production", "Producers", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['food truck', 'food stand', 'food stall', 'food court']):
|
||||||
|
return ("Food & Dining", "Quick Service", "Street Food", cat)
|
||||||
|
|
||||||
|
# === RETAIL & SHOPPING ===
|
||||||
|
if 'store' in c or 'shop' in c:
|
||||||
|
if any(x in c for x in ['clothing', 'fashion', 'shoe', 'dress', 'apparel', 'wear', 'boutique', 'tailor']):
|
||||||
|
return ("Retail & Shopping", "Clothing & Fashion", "Apparel Stores", cat)
|
||||||
|
if any(x in c for x in ['electronic', 'computer', 'phone', 'appliance', 'tv', 'audio', 'video game']):
|
||||||
|
return ("Retail & Shopping", "Electronics", "Electronics Stores", cat)
|
||||||
|
if any(x in c for x in ['furniture', 'home decor', 'kitchen', 'bed', 'mattress', 'carpet', 'curtain', 'lighting']):
|
||||||
|
return ("Retail & Shopping", "Home & Garden", "Home Furnishings", cat)
|
||||||
|
if any(x in c for x in ['grocery', 'supermarket', 'food', 'beverage', 'wine', 'liquor', 'butcher', 'fish', 'fruit', 'vegetable']):
|
||||||
|
return ("Retail & Shopping", "Food & Grocery", "Grocery Stores", cat)
|
||||||
|
if any(x in c for x in ['book', 'stationery', 'office supply', 'paper']):
|
||||||
|
return ("Retail & Shopping", "Books & Office", "Book Stores", cat)
|
||||||
|
if any(x in c for x in ['pet', 'animal']):
|
||||||
|
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
|
||||||
|
if any(x in c for x in ['toy', 'game', 'hobby']):
|
||||||
|
return ("Retail & Shopping", "Toys & Hobbies", "Toy Stores", cat)
|
||||||
|
if any(x in c for x in ['jewelry', 'watch', 'gold', 'diamond']):
|
||||||
|
return ("Retail & Shopping", "Jewelry & Watches", "Jewelry Stores", cat)
|
||||||
|
if any(x in c for x in ['sport', 'athletic', 'fitness', 'outdoor', 'camping', 'fishing', 'hunting']):
|
||||||
|
return ("Retail & Shopping", "Sports & Outdoors", "Sporting Goods", cat)
|
||||||
|
if any(x in c for x in ['music', 'instrument', 'record', 'vinyl']):
|
||||||
|
return ("Retail & Shopping", "Music & Entertainment", "Music Stores", cat)
|
||||||
|
if any(x in c for x in ['art', 'craft', 'fabric', 'sewing', 'yarn', 'knitting']):
|
||||||
|
return ("Retail & Shopping", "Arts & Crafts", "Art Supply Stores", cat)
|
||||||
|
if any(x in c for x in ['beauty', 'cosmetic', 'perfume', 'makeup']):
|
||||||
|
return ("Retail & Shopping", "Beauty & Cosmetics", "Beauty Stores", cat)
|
||||||
|
if any(x in c for x in ['pharmacy', 'drug', 'medicine', 'health']):
|
||||||
|
return ("Retail & Shopping", "Health & Pharmacy", "Pharmacies", cat)
|
||||||
|
if any(x in c for x in ['garden', 'plant', 'flower', 'nursery', 'landscap']):
|
||||||
|
return ("Retail & Shopping", "Home & Garden", "Garden Centers", cat)
|
||||||
|
if any(x in c for x in ['hardware', 'tool', 'building', 'lumber', 'paint']):
|
||||||
|
return ("Retail & Shopping", "Hardware & Building", "Hardware Stores", cat)
|
||||||
|
if any(x in c for x in ['antique', 'vintage', 'thrift', 'consignment', 'second hand', 'used']):
|
||||||
|
return ("Retail & Shopping", "Secondhand & Vintage", "Thrift Stores", cat)
|
||||||
|
return ("Retail & Shopping", "Specialty Retail", "Other Stores", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['supplier', 'wholesaler', 'distributor', 'exporter', 'importer']):
|
||||||
|
if any(x in c for x in ['food', 'beverage', 'meat', 'seafood', 'produce']):
|
||||||
|
return ("Retail & Shopping", "Wholesale & Distribution", "Food Wholesale", cat)
|
||||||
|
if any(x in c for x in ['building', 'construction', 'lumber', 'concrete', 'steel']):
|
||||||
|
return ("Retail & Shopping", "Wholesale & Distribution", "Building Materials", cat)
|
||||||
|
if any(x in c for x in ['industrial', 'machinery', 'equipment']):
|
||||||
|
return ("Retail & Shopping", "Wholesale & Distribution", "Industrial Supplies", cat)
|
||||||
|
return ("Retail & Shopping", "Wholesale & Distribution", "General Wholesale", cat)
|
||||||
|
|
||||||
|
if 'market' in c and 'marketing' not in c:
|
||||||
|
if 'flea' in c or 'antique' in c:
|
||||||
|
return ("Retail & Shopping", "Markets", "Flea Markets", cat)
|
||||||
|
if 'farmer' in c:
|
||||||
|
return ("Retail & Shopping", "Markets", "Farmers Markets", cat)
|
||||||
|
return ("Retail & Shopping", "Markets", "General Markets", cat)
|
||||||
|
|
||||||
|
# === AUTOMOTIVE ===
|
||||||
|
if 'dealer' in c:
|
||||||
|
car_brands = ['abarth', 'acura', 'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti',
|
||||||
|
'buick', 'cadillac', 'chevrolet', 'chrysler', 'citroen', 'cupra', 'dacia', 'daihatsu',
|
||||||
|
'dodge', 'ferrari', 'fiat', 'ford', 'genesis', 'gmc', 'honda', 'hummer', 'hyundai',
|
||||||
|
'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lancia', 'land rover',
|
||||||
|
'lexus', 'lincoln', 'lotus', 'maserati', 'mazda', 'mclaren', 'mercedes', 'mini',
|
||||||
|
'mitsubishi', 'nissan', 'opel', 'peugeot', 'porsche', 'ram', 'renault', 'rolls-royce',
|
||||||
|
'saab', 'seat', 'skoda', 'smart', 'subaru', 'suzuki', 'tesla', 'toyota', 'volkswagen',
|
||||||
|
'volvo', 'yamaha', 'harley', 'ducati', 'kawasaki', 'triumph', 'vespa', 'piaggio']
|
||||||
|
if any(b in c for b in car_brands):
|
||||||
|
if 'motorcycle' in c or any(x in c for x in ['harley', 'ducati', 'kawasaki', 'triumph', 'vespa']):
|
||||||
|
return ("Automotive", "Dealers", "Motorcycle Brands", cat)
|
||||||
|
return ("Automotive", "Dealers", "Car Brands", cat)
|
||||||
|
if any(x in c for x in ['motorcycle', 'scooter', 'moped']):
|
||||||
|
return ("Automotive", "Dealers", "Motorcycle Dealers", cat)
|
||||||
|
if any(x in c for x in ['truck', 'commercial vehicle', 'trailer']):
|
||||||
|
return ("Automotive", "Dealers", "Truck & Commercial", cat)
|
||||||
|
if any(x in c for x in ['boat', 'yacht', 'marine', 'jet ski']):
|
||||||
|
return ("Automotive", "Dealers", "Marine & Boats", cat)
|
||||||
|
if any(x in c for x in ['rv', 'camper', 'motorhome', 'caravan']):
|
||||||
|
return ("Automotive", "Dealers", "RV & Campers", cat)
|
||||||
|
if any(x in c for x in ['atv', 'quad', 'off-road', 'utv']):
|
||||||
|
return ("Automotive", "Dealers", "ATV & Off-Road", cat)
|
||||||
|
if 'used' in c or 'pre-owned' in c:
|
||||||
|
return ("Automotive", "Dealers", "Used Vehicles", cat)
|
||||||
|
return ("Automotive", "Dealers", "Other Dealers", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['car wash', 'auto detailing', 'car detailing']):
|
||||||
|
return ("Automotive", "Vehicle Care", "Cleaning & Detailing", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['car rental', 'auto rental', 'vehicle rental', 'truck rental']):
|
||||||
|
return ("Automotive", "Rental Services", "Vehicle Rental", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['car repair', 'auto repair', 'mechanic', 'garage', 'auto body', 'collision']):
|
||||||
|
return ("Automotive", "Repair & Maintenance", "Auto Repair", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['tire', 'tyre', 'wheel']):
|
||||||
|
return ("Automotive", "Parts & Accessories", "Tires & Wheels", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['auto part', 'car part', 'auto accessories']):
|
||||||
|
return ("Automotive", "Parts & Accessories", "Auto Parts", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['driving school', 'driving instruction']):
|
||||||
|
return ("Automotive", "Training", "Driving Schools", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['parking', 'car park', 'garage']):
|
||||||
|
if 'repair' not in c and 'mechanic' not in c:
|
||||||
|
return ("Automotive", "Parking", "Parking Facilities", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['gas station', 'petrol', 'fuel', 'charging station', 'ev charging']):
|
||||||
|
return ("Automotive", "Fuel & Charging", "Fuel Stations", cat)
|
||||||
|
|
||||||
|
# === HEALTHCARE ===
|
||||||
|
if any(x in c for x in ['hospital']):
|
||||||
|
if 'animal' in c or 'veterinar' in c:
|
||||||
|
return ("Healthcare", "Veterinary", "Animal Hospitals", cat)
|
||||||
|
if 'children' in c or 'pediatric' in c:
|
||||||
|
return ("Healthcare", "Hospitals", "Pediatric Hospitals", cat)
|
||||||
|
if 'mental' in c or 'psychiatric' in c:
|
||||||
|
return ("Healthcare", "Mental Health", "Psychiatric Hospitals", cat)
|
||||||
|
return ("Healthcare", "Hospitals", "General Hospitals", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['clinic']):
|
||||||
|
if 'dental' in c:
|
||||||
|
return ("Healthcare", "Dental", "Dental Clinics", cat)
|
||||||
|
if 'eye' in c or 'vision' in c or 'optical' in c:
|
||||||
|
return ("Healthcare", "Vision Care", "Eye Clinics", cat)
|
||||||
|
if 'fertility' in c or 'ivf' in c:
|
||||||
|
return ("Healthcare", "Specialty Care", "Fertility Clinics", cat)
|
||||||
|
if 'skin' in c or 'dermatol' in c:
|
||||||
|
return ("Healthcare", "Specialty Care", "Dermatology", cat)
|
||||||
|
if 'physical therapy' in c or 'physiotherapy' in c or 'rehab' in c:
|
||||||
|
return ("Healthcare", "Rehabilitation", "Physical Therapy", cat)
|
||||||
|
return ("Healthcare", "Clinics", "Medical Clinics", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['doctor', 'physician']):
|
||||||
|
return ("Healthcare", "Medical Practitioners", "Doctors", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['dentist', 'dental', 'orthodont', 'endodont', 'periodont']):
|
||||||
|
return ("Healthcare", "Dental", "Dental Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['surgeon', 'surgery']):
|
||||||
|
if 'plastic' in c or 'cosmetic' in c:
|
||||||
|
return ("Healthcare", "Specialty Care", "Cosmetic Surgery", cat)
|
||||||
|
return ("Healthcare", "Medical Practitioners", "Surgeons", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['psycholog', 'psychiatr', 'mental health', 'counselor', 'therapist']):
|
||||||
|
if 'marriage' in c or 'family' in c:
|
||||||
|
return ("Healthcare", "Mental Health", "Family Counseling", cat)
|
||||||
|
if 'addiction' in c or 'substance' in c:
|
||||||
|
return ("Healthcare", "Mental Health", "Addiction Treatment", cat)
|
||||||
|
return ("Healthcare", "Mental Health", "Mental Health Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['chiropract']):
|
||||||
|
return ("Healthcare", "Alternative Medicine", "Chiropractic", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['acupuncture', 'acupuncturist']):
|
||||||
|
return ("Healthcare", "Alternative Medicine", "Acupuncture", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['naturopath', 'homeopath', 'ayurved', 'holistic']):
|
||||||
|
return ("Healthcare", "Alternative Medicine", "Natural Medicine", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['optometrist', 'optician', 'eye doctor', 'ophthalmol']):
|
||||||
|
return ("Healthcare", "Vision Care", "Eye Care", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['pharmacy', 'drugstore', 'apothecary']):
|
||||||
|
return ("Healthcare", "Pharmacies", "Retail Pharmacies", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['veterinar', 'vet ', 'animal clinic', 'pet clinic']):
|
||||||
|
return ("Healthcare", "Veterinary", "Veterinary Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['nursing home', 'assisted living', 'senior care', 'elder care', 'retirement home']):
|
||||||
|
return ("Healthcare", "Senior Care", "Senior Living", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['lab', 'laboratory', 'diagnostic', 'imaging', 'x-ray', 'mri', 'radiology']):
|
||||||
|
return ("Healthcare", "Diagnostics", "Medical Labs", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['ambulance', 'emergency', 'urgent care']):
|
||||||
|
return ("Healthcare", "Emergency Services", "Emergency Care", cat)
|
||||||
|
|
||||||
|
# === EDUCATION ===
|
||||||
|
if 'school' in c or 'academy' in c:
|
||||||
|
if any(x in c for x in ['preschool', 'kindergarten', 'nursery', 'daycare', 'pre-school']):
|
||||||
|
return ("Education", "Early Childhood", "Preschools", cat)
|
||||||
|
if any(x in c for x in ['elementary', 'primary']):
|
||||||
|
return ("Education", "K-12 Schools", "Elementary Schools", cat)
|
||||||
|
if any(x in c for x in ['middle', 'junior high']):
|
||||||
|
return ("Education", "K-12 Schools", "Middle Schools", cat)
|
||||||
|
if any(x in c for x in ['high school', 'secondary']):
|
||||||
|
return ("Education", "K-12 Schools", "High Schools", cat)
|
||||||
|
if any(x in c for x in ['boarding']):
|
||||||
|
return ("Education", "K-12 Schools", "Boarding Schools", cat)
|
||||||
|
if any(x in c for x in ['driving']):
|
||||||
|
return ("Automotive", "Training", "Driving Schools", cat)
|
||||||
|
if any(x in c for x in ['language', 'english', 'spanish', 'french', 'german', 'chinese', 'japanese']):
|
||||||
|
return ("Education", "Language Learning", "Language Schools", cat)
|
||||||
|
if any(x in c for x in ['art', 'music', 'dance', 'drama', 'theater', 'acting']):
|
||||||
|
return ("Education", "Arts Education", "Arts Schools", cat)
|
||||||
|
if any(x in c for x in ['martial art', 'karate', 'judo', 'taekwondo', 'kung fu', 'aikido', 'boxing']):
|
||||||
|
return ("Education", "Sports Training", "Martial Arts Schools", cat)
|
||||||
|
if any(x in c for x in ['beauty', 'cosmetology', 'barber']):
|
||||||
|
return ("Education", "Vocational Training", "Beauty Schools", cat)
|
||||||
|
if any(x in c for x in ['cooking', 'culinary', 'chef']):
|
||||||
|
return ("Education", "Vocational Training", "Culinary Schools", cat)
|
||||||
|
if any(x in c for x in ['business', 'mba']):
|
||||||
|
return ("Education", "Higher Education", "Business Schools", cat)
|
||||||
|
if any(x in c for x in ['medical', 'nursing', 'dental']):
|
||||||
|
return ("Education", "Higher Education", "Medical Schools", cat)
|
||||||
|
if any(x in c for x in ['law']):
|
||||||
|
return ("Education", "Higher Education", "Law Schools", cat)
|
||||||
|
if any(x in c for x in ['flight', 'aviation', 'pilot']):
|
||||||
|
return ("Education", "Vocational Training", "Aviation Schools", cat)
|
||||||
|
if any(x in c for x in ['computer', 'it ', 'coding', 'programming', 'software']):
|
||||||
|
return ("Education", "Technology Training", "Computer Schools", cat)
|
||||||
|
if any(x in c for x in ['trade', 'technical', 'vocational']):
|
||||||
|
return ("Education", "Vocational Training", "Trade Schools", cat)
|
||||||
|
return ("Education", "Specialty Schools", "Other Schools", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['university', 'college']):
|
||||||
|
if 'community' in c:
|
||||||
|
return ("Education", "Higher Education", "Community Colleges", cat)
|
||||||
|
return ("Education", "Higher Education", "Universities", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['tutor', 'tutoring']):
|
||||||
|
return ("Education", "Tutoring", "Private Tutoring", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['training center', 'training program', 'training institute']):
|
||||||
|
return ("Education", "Professional Training", "Training Centers", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['library']):
|
||||||
|
return ("Education", "Libraries", "Public Libraries", cat)
|
||||||
|
|
||||||
|
# === PROFESSIONAL SERVICES ===
|
||||||
|
if any(x in c for x in ['lawyer', 'attorney', 'law firm', 'legal']):
|
||||||
|
if any(x in c for x in ['immigration']):
|
||||||
|
return ("Professional Services", "Legal", "Immigration Law", cat)
|
||||||
|
if any(x in c for x in ['criminal', 'defense']):
|
||||||
|
return ("Professional Services", "Legal", "Criminal Law", cat)
|
||||||
|
if any(x in c for x in ['family', 'divorce']):
|
||||||
|
return ("Professional Services", "Legal", "Family Law", cat)
|
||||||
|
if any(x in c for x in ['personal injury', 'accident']):
|
||||||
|
return ("Professional Services", "Legal", "Personal Injury", cat)
|
||||||
|
if any(x in c for x in ['real estate', 'property']):
|
||||||
|
return ("Professional Services", "Legal", "Real Estate Law", cat)
|
||||||
|
if any(x in c for x in ['business', 'corporate', 'commercial']):
|
||||||
|
return ("Professional Services", "Legal", "Business Law", cat)
|
||||||
|
return ("Professional Services", "Legal", "General Legal", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['accountant', 'accounting', 'bookkeep', 'tax']):
|
||||||
|
return ("Professional Services", "Financial Services", "Accounting", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['consultant', 'consulting', 'advisor']):
|
||||||
|
if any(x in c for x in ['business', 'management']):
|
||||||
|
return ("Professional Services", "Consulting", "Business Consulting", cat)
|
||||||
|
if any(x in c for x in ['it ', 'technology', 'computer']):
|
||||||
|
return ("Professional Services", "Consulting", "IT Consulting", cat)
|
||||||
|
if any(x in c for x in ['marketing', 'advertising']):
|
||||||
|
return ("Professional Services", "Consulting", "Marketing Consulting", cat)
|
||||||
|
return ("Professional Services", "Consulting", "General Consulting", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['notary', 'notarial']):
|
||||||
|
return ("Professional Services", "Legal", "Notary Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['architect', 'architecture']):
|
||||||
|
return ("Professional Services", "Design", "Architecture", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['engineer', 'engineering']):
|
||||||
|
if 'civil' in c:
|
||||||
|
return ("Professional Services", "Engineering", "Civil Engineering", cat)
|
||||||
|
if 'structural' in c:
|
||||||
|
return ("Professional Services", "Engineering", "Structural Engineering", cat)
|
||||||
|
if 'mechanical' in c:
|
||||||
|
return ("Professional Services", "Engineering", "Mechanical Engineering", cat)
|
||||||
|
if 'electrical' in c:
|
||||||
|
return ("Professional Services", "Engineering", "Electrical Engineering", cat)
|
||||||
|
return ("Professional Services", "Engineering", "General Engineering", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['agency']):
|
||||||
|
if any(x in c for x in ['advertising', 'marketing', 'creative', 'digital']):
|
||||||
|
return ("Professional Services", "Marketing & Advertising", "Agencies", cat)
|
||||||
|
if any(x in c for x in ['real estate', 'property']):
|
||||||
|
return ("Real Estate", "Agencies", "Real Estate Agencies", cat)
|
||||||
|
if any(x in c for x in ['insurance']):
|
||||||
|
return ("Finance & Insurance", "Insurance", "Insurance Agencies", cat)
|
||||||
|
if any(x in c for x in ['travel', 'tour']):
|
||||||
|
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
|
||||||
|
if any(x in c for x in ['employment', 'staffing', 'recruitment', 'temp']):
|
||||||
|
return ("Professional Services", "HR Services", "Staffing Agencies", cat)
|
||||||
|
return ("Professional Services", "Agencies", "Other Agencies", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['photographer', 'photography', 'photo studio']):
|
||||||
|
return ("Professional Services", "Creative Services", "Photography", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['graphic design', 'web design', 'design studio']):
|
||||||
|
return ("Professional Services", "Creative Services", "Design Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['translator', 'translation', 'interpreter']):
|
||||||
|
return ("Professional Services", "Language Services", "Translation", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['printing', 'print shop', 'copy']):
|
||||||
|
return ("Professional Services", "Business Services", "Printing Services", cat)
|
||||||
|
|
||||||
|
# === HOME SERVICES ===
|
||||||
|
if any(x in c for x in ['plumber', 'plumbing']):
|
||||||
|
return ("Home Services", "Plumbing", "Plumbers", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['electrician', 'electrical']):
|
||||||
|
if 'contractor' in c or 'service' in c:
|
||||||
|
return ("Home Services", "Electrical", "Electricians", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['hvac', 'air conditioning', 'heating', 'furnace']):
|
||||||
|
return ("Home Services", "HVAC", "Heating & Cooling", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['roofing', 'roofer']):
|
||||||
|
return ("Home Services", "Roofing", "Roofing Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['painter', 'painting']):
|
||||||
|
if 'house' in c or 'residential' in c or 'contractor' in c:
|
||||||
|
return ("Home Services", "Painting", "House Painters", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['landscap', 'lawn', 'garden']):
|
||||||
|
if 'service' in c or 'company' in c or 'contractor' in c:
|
||||||
|
return ("Home Services", "Landscaping", "Landscaping Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['cleaning service', 'maid', 'housekeep', 'janitorial']):
|
||||||
|
return ("Home Services", "Cleaning", "Cleaning Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['pest control', 'exterminator']):
|
||||||
|
return ("Home Services", "Pest Control", "Exterminators", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['locksmith']):
|
||||||
|
return ("Home Services", "Security", "Locksmiths", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['moving company', 'mover', 'relocation']):
|
||||||
|
return ("Home Services", "Moving", "Moving Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['contractor']):
|
||||||
|
if 'general' in c:
|
||||||
|
return ("Home Services", "Construction", "General Contractors", cat)
|
||||||
|
return ("Home Services", "Construction", "Contractors", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['carpenter', 'carpentry']):
|
||||||
|
return ("Home Services", "Construction", "Carpenters", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['flooring', 'floor']):
|
||||||
|
if 'service' in c or 'contractor' in c or 'installation' in c:
|
||||||
|
return ("Home Services", "Flooring", "Floor Installation", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['window', 'glass']):
|
||||||
|
if 'repair' in c or 'installation' in c or 'service' in c:
|
||||||
|
return ("Home Services", "Windows & Doors", "Window Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['pool', 'spa']):
|
||||||
|
if 'service' in c or 'cleaning' in c or 'maintenance' in c:
|
||||||
|
return ("Home Services", "Pool & Spa", "Pool Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['appliance repair', 'appliance service']):
|
||||||
|
return ("Home Services", "Appliance Repair", "Appliance Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['handyman']):
|
||||||
|
return ("Home Services", "General Repair", "Handyman Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['interior design', 'decorator']):
|
||||||
|
return ("Home Services", "Design", "Interior Design", cat)
|
||||||
|
|
||||||
|
# === PERSONAL SERVICES ===
|
||||||
|
if any(x in c for x in ['salon', 'hair', 'hairdress', 'stylist']):
|
||||||
|
return ("Personal Services", "Hair Care", "Hair Salons", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['barber']):
|
||||||
|
if 'shop' in c or not 'school' in c:
|
||||||
|
return ("Personal Services", "Hair Care", "Barber Shops", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['nail', 'manicure', 'pedicure']):
|
||||||
|
return ("Personal Services", "Nail Care", "Nail Salons", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['spa']):
|
||||||
|
if 'day spa' in c or 'medical spa' in c or ('service' not in c and 'pool' not in c):
|
||||||
|
return ("Personal Services", "Spa & Wellness", "Day Spas", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['massage']):
|
||||||
|
return ("Personal Services", "Massage", "Massage Therapy", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['beauty']):
|
||||||
|
if 'salon' in c or 'parlor' in c:
|
||||||
|
return ("Personal Services", "Beauty", "Beauty Salons", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['tattoo']):
|
||||||
|
return ("Personal Services", "Body Art", "Tattoo Shops", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['piercing']):
|
||||||
|
return ("Personal Services", "Body Art", "Piercing Studios", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['tanning']):
|
||||||
|
return ("Personal Services", "Tanning", "Tanning Salons", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['tailor', 'alteration', 'seamstress']):
|
||||||
|
return ("Personal Services", "Clothing Care", "Tailoring", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['dry clean', 'laundry', 'laundromat']):
|
||||||
|
return ("Personal Services", "Laundry", "Laundry Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['personal trainer', 'fitness trainer']):
|
||||||
|
return ("Personal Services", "Fitness", "Personal Training", cat)
|
||||||
|
|
||||||
|
# === ENTERTAINMENT & RECREATION ===
|
||||||
|
if any(x in c for x in ['movie theater', 'cinema', 'multiplex']):
|
||||||
|
return ("Entertainment", "Movies", "Movie Theaters", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['theater', 'theatre']):
|
||||||
|
if 'movie' not in c:
|
||||||
|
return ("Entertainment", "Performing Arts", "Theaters", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['museum']):
|
||||||
|
if 'art' in c:
|
||||||
|
return ("Entertainment", "Museums", "Art Museums", cat)
|
||||||
|
if 'history' in c or 'historical' in c:
|
||||||
|
return ("Entertainment", "Museums", "History Museums", cat)
|
||||||
|
if 'science' in c or 'natural' in c:
|
||||||
|
return ("Entertainment", "Museums", "Science Museums", cat)
|
||||||
|
if 'children' in c or 'kid' in c:
|
||||||
|
return ("Entertainment", "Museums", "Children's Museums", cat)
|
||||||
|
return ("Entertainment", "Museums", "General Museums", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['art gallery', 'gallery']):
|
||||||
|
return ("Entertainment", "Arts", "Art Galleries", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['amusement park', 'theme park', 'water park']):
|
||||||
|
return ("Entertainment", "Amusement", "Theme Parks", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['zoo', 'aquarium', 'wildlife']):
|
||||||
|
return ("Entertainment", "Wildlife", "Zoos & Aquariums", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['bowling']):
|
||||||
|
return ("Entertainment", "Games & Recreation", "Bowling", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['arcade', 'video game']):
|
||||||
|
return ("Entertainment", "Games & Recreation", "Arcades", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['escape room']):
|
||||||
|
return ("Entertainment", "Games & Recreation", "Escape Rooms", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['casino', 'gambling']):
|
||||||
|
return ("Entertainment", "Gambling", "Casinos", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['concert', 'music venue', 'live music']):
|
||||||
|
return ("Entertainment", "Music Venues", "Concert Halls", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['gym', 'fitness center', 'health club']):
|
||||||
|
return ("Entertainment", "Fitness", "Gyms", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['yoga']):
|
||||||
|
if 'studio' in c or 'center' in c:
|
||||||
|
return ("Entertainment", "Fitness", "Yoga Studios", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['pilates']):
|
||||||
|
return ("Entertainment", "Fitness", "Pilates Studios", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['swimming pool', 'swim']):
|
||||||
|
return ("Entertainment", "Sports", "Swimming Pools", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['golf']):
|
||||||
|
if 'course' in c or 'club' in c:
|
||||||
|
return ("Entertainment", "Sports", "Golf Courses", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['tennis']):
|
||||||
|
return ("Entertainment", "Sports", "Tennis Courts", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['stadium', 'arena', 'sports complex']):
|
||||||
|
return ("Entertainment", "Venues", "Sports Venues", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['park']):
|
||||||
|
if 'amusement' not in c and 'theme' not in c:
|
||||||
|
if 'national' in c or 'state' in c:
|
||||||
|
return ("Entertainment", "Parks", "National Parks", cat)
|
||||||
|
if 'dog' in c:
|
||||||
|
return ("Entertainment", "Parks", "Dog Parks", cat)
|
||||||
|
return ("Entertainment", "Parks", "Public Parks", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['recreation center', 'community center']):
|
||||||
|
return ("Entertainment", "Recreation", "Community Centers", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['club']):
|
||||||
|
if 'night' in c:
|
||||||
|
return ("Food & Dining", "Bars & Nightlife", "Night Clubs", cat)
|
||||||
|
if 'country' in c:
|
||||||
|
return ("Entertainment", "Sports", "Country Clubs", cat)
|
||||||
|
if 'sport' in c or 'athletic' in c:
|
||||||
|
return ("Entertainment", "Sports", "Sports Clubs", cat)
|
||||||
|
if 'social' in c:
|
||||||
|
return ("Entertainment", "Social", "Social Clubs", cat)
|
||||||
|
|
||||||
|
# === HOSPITALITY & TRAVEL ===
|
||||||
|
if any(x in c for x in ['hotel', 'motel', 'inn']):
|
||||||
|
if 'boutique' in c:
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Boutique Hotels", cat)
|
||||||
|
if 'resort' in c:
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
|
||||||
|
if 'budget' in c or 'economy' in c:
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Budget Hotels", cat)
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Hotels", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['hostel']):
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Hostels", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['bed and breakfast', 'b&b', 'bnb']):
|
||||||
|
return ("Hospitality & Travel", "Lodging", "B&Bs", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['resort']):
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['vacation rental', 'holiday rental']):
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Vacation Rentals", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['campground', 'camping', 'rv park']):
|
||||||
|
return ("Hospitality & Travel", "Lodging", "Campgrounds", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['travel agency', 'tour operator', 'travel agent']):
|
||||||
|
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['airline', 'airport']):
|
||||||
|
return ("Hospitality & Travel", "Transportation", "Airlines & Airports", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['cruise']):
|
||||||
|
return ("Hospitality & Travel", "Travel Services", "Cruises", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['tourist', 'attraction', 'sightseeing']):
|
||||||
|
return ("Hospitality & Travel", "Attractions", "Tourist Attractions", cat)
|
||||||
|
|
||||||
|
# === FINANCE & INSURANCE ===
|
||||||
|
if any(x in c for x in ['bank', 'banking', 'credit union']):
|
||||||
|
return ("Finance & Insurance", "Banking", "Banks", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['atm', 'cash machine']):
|
||||||
|
return ("Finance & Insurance", "Banking", "ATMs", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['insurance']):
|
||||||
|
if 'health' in c or 'medical' in c:
|
||||||
|
return ("Finance & Insurance", "Insurance", "Health Insurance", cat)
|
||||||
|
if 'auto' in c or 'car' in c:
|
||||||
|
return ("Finance & Insurance", "Insurance", "Auto Insurance", cat)
|
||||||
|
if 'home' in c or 'property' in c:
|
||||||
|
return ("Finance & Insurance", "Insurance", "Home Insurance", cat)
|
||||||
|
if 'life' in c:
|
||||||
|
return ("Finance & Insurance", "Insurance", "Life Insurance", cat)
|
||||||
|
return ("Finance & Insurance", "Insurance", "Insurance Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['loan', 'mortgage', 'lending']):
|
||||||
|
return ("Finance & Insurance", "Lending", "Loans", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['investment', 'financial advisor', 'wealth management', 'financial planner']):
|
||||||
|
return ("Finance & Insurance", "Investment", "Financial Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['currency exchange', 'money transfer', 'wire transfer']):
|
||||||
|
return ("Finance & Insurance", "Money Services", "Currency Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['pawn']):
|
||||||
|
return ("Finance & Insurance", "Money Services", "Pawn Shops", cat)
|
||||||
|
|
||||||
|
# === REAL ESTATE ===
|
||||||
|
if any(x in c for x in ['real estate', 'property', 'realty', 'realtor']):
|
||||||
|
if 'agent' in c or 'agency' in c or 'broker' in c:
|
||||||
|
return ("Real Estate", "Agencies", "Real Estate Agents", cat)
|
||||||
|
if 'developer' in c or 'development' in c:
|
||||||
|
return ("Real Estate", "Development", "Developers", cat)
|
||||||
|
if 'management' in c:
|
||||||
|
return ("Real Estate", "Management", "Property Management", cat)
|
||||||
|
if 'commercial' in c:
|
||||||
|
return ("Real Estate", "Commercial", "Commercial Real Estate", cat)
|
||||||
|
return ("Real Estate", "Services", "Real Estate Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['apartment', 'condo', 'rental']):
|
||||||
|
if 'complex' in c or 'building' in c:
|
||||||
|
return ("Real Estate", "Residential", "Apartment Complexes", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['storage', 'self storage', 'warehouse']):
|
||||||
|
if 'self' in c or 'mini' in c:
|
||||||
|
return ("Real Estate", "Storage", "Self Storage", cat)
|
||||||
|
|
||||||
|
# === RELIGIOUS ===
|
||||||
|
if any(x in c for x in ['church']):
|
||||||
|
if 'catholic' in c:
|
||||||
|
return ("Religious", "Christian", "Catholic Churches", cat)
|
||||||
|
if 'baptist' in c:
|
||||||
|
return ("Religious", "Christian", "Baptist Churches", cat)
|
||||||
|
if 'methodist' in c:
|
||||||
|
return ("Religious", "Christian", "Methodist Churches", cat)
|
||||||
|
if 'lutheran' in c:
|
||||||
|
return ("Religious", "Christian", "Lutheran Churches", cat)
|
||||||
|
if 'orthodox' in c:
|
||||||
|
return ("Religious", "Christian", "Orthodox Churches", cat)
|
||||||
|
if 'pentecostal' in c:
|
||||||
|
return ("Religious", "Christian", "Pentecostal Churches", cat)
|
||||||
|
return ("Religious", "Christian", "Churches", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['mosque', 'islamic', 'muslim']):
|
||||||
|
return ("Religious", "Islam", "Mosques", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['synagogue', 'jewish', 'temple']):
|
||||||
|
if 'jewish' in c or 'synagogue' in c:
|
||||||
|
return ("Religious", "Judaism", "Synagogues", cat)
|
||||||
|
if 'hindu' in c:
|
||||||
|
return ("Religious", "Hinduism", "Hindu Temples", cat)
|
||||||
|
if 'buddhist' in c:
|
||||||
|
return ("Religious", "Buddhism", "Buddhist Temples", cat)
|
||||||
|
return ("Religious", "Other", "Temples", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['abbey', 'monastery', 'convent']):
|
||||||
|
return ("Religious", "Christian", "Monasteries", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['gurdwara', 'sikh']):
|
||||||
|
return ("Religious", "Sikhism", "Gurdwaras", cat)
|
||||||
|
|
||||||
|
# === GOVERNMENT & PUBLIC SERVICES ===
|
||||||
|
if any(x in c for x in ['government', 'city hall', 'town hall', 'municipal']):
|
||||||
|
return ("Government", "Local Government", "Government Offices", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['court', 'courthouse']):
|
||||||
|
return ("Government", "Legal", "Courts", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['police', 'sheriff']):
|
||||||
|
return ("Government", "Public Safety", "Police", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['fire station', 'fire department']):
|
||||||
|
return ("Government", "Public Safety", "Fire Departments", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['post office', 'postal']):
|
||||||
|
return ("Government", "Postal", "Post Offices", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['embassy', 'consulate']):
|
||||||
|
return ("Government", "International", "Embassies", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['dmv', 'motor vehicle', 'driver license']):
|
||||||
|
return ("Government", "Transportation", "DMV", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['social security', 'welfare', 'social services']):
|
||||||
|
return ("Government", "Social Services", "Social Services", cat)
|
||||||
|
|
||||||
|
# === INDUSTRIAL & MANUFACTURING ===
|
||||||
|
if any(x in c for x in ['manufacturer', 'manufacturing', 'factory', 'plant']):
|
||||||
|
if any(x in c for x in ['food', 'beverage', 'bakery']):
|
||||||
|
return ("Industrial", "Manufacturing", "Food Manufacturing", cat)
|
||||||
|
if any(x in c for x in ['textile', 'clothing', 'garment']):
|
||||||
|
return ("Industrial", "Manufacturing", "Textile Manufacturing", cat)
|
||||||
|
if any(x in c for x in ['electronics', 'computer', 'semiconductor']):
|
||||||
|
return ("Industrial", "Manufacturing", "Electronics Manufacturing", cat)
|
||||||
|
if any(x in c for x in ['auto', 'car', 'vehicle']):
|
||||||
|
return ("Industrial", "Manufacturing", "Auto Manufacturing", cat)
|
||||||
|
if any(x in c for x in ['chemical', 'pharmaceutical']):
|
||||||
|
return ("Industrial", "Manufacturing", "Chemical Manufacturing", cat)
|
||||||
|
if any(x in c for x in ['metal', 'steel', 'iron']):
|
||||||
|
return ("Industrial", "Manufacturing", "Metal Manufacturing", cat)
|
||||||
|
if any(x in c for x in ['plastic', 'rubber']):
|
||||||
|
return ("Industrial", "Manufacturing", "Plastics Manufacturing", cat)
|
||||||
|
if any(x in c for x in ['furniture', 'wood']):
|
||||||
|
return ("Industrial", "Manufacturing", "Furniture Manufacturing", cat)
|
||||||
|
return ("Industrial", "Manufacturing", "General Manufacturing", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['mining', 'quarry']):
|
||||||
|
return ("Industrial", "Mining", "Mining Operations", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['construction company', 'builder']):
|
||||||
|
return ("Industrial", "Construction", "Construction Companies", cat)
|
||||||
|
|
||||||
|
# === TECHNOLOGY ===
|
||||||
|
if any(x in c for x in ['software', 'app developer', 'web developer']):
|
||||||
|
return ("Technology", "Software", "Software Development", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['it service', 'computer service', 'tech support']):
|
||||||
|
return ("Technology", "IT Services", "IT Support", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['data center', 'hosting', 'cloud']):
|
||||||
|
return ("Technology", "Infrastructure", "Data Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['telecommunication', 'telecom', 'internet service']):
|
||||||
|
return ("Technology", "Telecommunications", "Telecom Services", cat)
|
||||||
|
|
||||||
|
# === TRANSPORTATION & LOGISTICS ===
|
||||||
|
if any(x in c for x in ['shipping', 'freight', 'cargo', 'logistics']):
|
||||||
|
return ("Transportation", "Logistics", "Shipping & Freight", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['courier', 'delivery', 'express']):
|
||||||
|
return ("Transportation", "Delivery", "Courier Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['taxi', 'cab', 'ride', 'limo', 'chauffeur']):
|
||||||
|
return ("Transportation", "Passenger", "Taxi & Ride Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['bus', 'coach', 'shuttle']):
|
||||||
|
if 'station' in c or 'terminal' in c or 'stop' in c:
|
||||||
|
return ("Transportation", "Public Transit", "Bus Stations", cat)
|
||||||
|
return ("Transportation", "Passenger", "Bus Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['train', 'rail', 'subway', 'metro']):
|
||||||
|
if 'station' in c or 'terminal' in c:
|
||||||
|
return ("Transportation", "Public Transit", "Train Stations", cat)
|
||||||
|
return ("Transportation", "Public Transit", "Rail Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['towing', 'tow truck']):
|
||||||
|
return ("Transportation", "Vehicle Services", "Towing", cat)
|
||||||
|
|
||||||
|
# === AGRICULTURE ===
|
||||||
|
if any(x in c for x in ['farm', 'ranch', 'orchard', 'vineyard']):
|
||||||
|
return ("Agriculture", "Farming", "Farms", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['agricultural', 'agri']):
|
||||||
|
return ("Agriculture", "Services", "Agricultural Services", cat)
|
||||||
|
|
||||||
|
# === PETS & ANIMALS ===
|
||||||
|
if any(x in c for x in ['pet', 'dog', 'cat']):
|
||||||
|
if 'grooming' in c or 'groomer' in c:
|
||||||
|
return ("Pets & Animals", "Pet Services", "Pet Grooming", cat)
|
||||||
|
if 'boarding' in c or 'kennel' in c or 'sitting' in c or 'daycare' in c:
|
||||||
|
return ("Pets & Animals", "Pet Services", "Pet Boarding", cat)
|
||||||
|
if 'training' in c or 'trainer' in c:
|
||||||
|
return ("Pets & Animals", "Pet Services", "Pet Training", cat)
|
||||||
|
if 'adoption' in c or 'shelter' in c or 'rescue' in c:
|
||||||
|
return ("Pets & Animals", "Animal Welfare", "Shelters", cat)
|
||||||
|
if 'store' in c or 'shop' in c:
|
||||||
|
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
|
||||||
|
|
||||||
|
# === EVENTS & WEDDINGS ===
|
||||||
|
if any(x in c for x in ['wedding', 'bridal']):
|
||||||
|
if 'venue' in c or 'hall' in c:
|
||||||
|
return ("Events & Weddings", "Venues", "Wedding Venues", cat)
|
||||||
|
if 'planner' in c:
|
||||||
|
return ("Events & Weddings", "Planning", "Wedding Planners", cat)
|
||||||
|
if 'dress' in c or 'gown' in c:
|
||||||
|
return ("Events & Weddings", "Attire", "Bridal Shops", cat)
|
||||||
|
return ("Events & Weddings", "Services", "Wedding Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['event', 'party', 'banquet']):
|
||||||
|
if 'venue' in c or 'hall' in c or 'center' in c:
|
||||||
|
return ("Events & Weddings", "Venues", "Event Venues", cat)
|
||||||
|
if 'planner' in c or 'planning' in c:
|
||||||
|
return ("Events & Weddings", "Planning", "Event Planners", cat)
|
||||||
|
if 'rental' in c or 'supply' in c:
|
||||||
|
return ("Events & Weddings", "Rentals", "Event Rentals", cat)
|
||||||
|
return ("Events & Weddings", "Services", "Event Services", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['florist', 'flower']):
|
||||||
|
if 'shop' in c or 'store' not in c:
|
||||||
|
return ("Events & Weddings", "Florists", "Flower Shops", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['funeral', 'mortuary', 'cremation', 'cemetery']):
|
||||||
|
return ("Events & Weddings", "Memorial", "Funeral Services", cat)
|
||||||
|
|
||||||
|
# === NON-PROFIT & COMMUNITY ===
|
||||||
|
if any(x in c for x in ['non-profit', 'nonprofit', 'charity', 'foundation']):
|
||||||
|
return ("Non-Profit", "Charities", "Non-Profit Organizations", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['community', 'civic', 'volunteer']):
|
||||||
|
if 'center' in c:
|
||||||
|
return ("Non-Profit", "Community", "Community Centers", cat)
|
||||||
|
return ("Non-Profit", "Community", "Community Organizations", cat)
|
||||||
|
|
||||||
|
if any(x in c for x in ['association', 'organization', 'society']):
|
||||||
|
if 'professional' in c or 'trade' in c or 'business' in c:
|
||||||
|
return ("Non-Profit", "Professional", "Professional Associations", cat)
|
||||||
|
return ("Non-Profit", "General", "Organizations", cat)
|
||||||
|
|
||||||
|
# Default fallback
|
||||||
|
return ("Other", "Uncategorized", "General", cat)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Import the GBP category CSV into the ``gbp_categories`` ltree table.

    Steps:
      1. Read category names from the first column of ``--csv-path``
         (header row skipped, blank cells ignored).
      2. Classify each name with ``categorize_category`` into a 4-level
         (sector, business type, sub-category, category) tuple and build a
         dict of dotted paths -> ``(name, level, parent_path)``.
      3. Print per-level statistics.
      4. With ``--dry-run``: print a preview of the first nodes and stop.
         Otherwise: run the init SQL (if present), truncate the table,
         insert every node parents-first, refresh ``category_count`` and
         print the resulting statistics.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    parser = argparse.ArgumentParser(description='Import GBP categories into PostgreSQL with ltree')
    parser.add_argument('--csv-path', default=DEFAULT_CSV_PATH, help='Path to categories CSV')
    parser.add_argument('--db-url', default=DEFAULT_DB_URL, help='PostgreSQL connection URL')
    parser.add_argument('--dry-run', action='store_true', help='Print categories without importing')
    args = parser.parse_args()

    # Read categories
    print(f"Reading categories from: {args.csv_path}")
    with open(args.csv_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip header. The default keeps an empty file from raising StopIteration.
        next(reader, None)
        categories = [row[0].strip() for row in reader if row and row[0].strip()]

    print(f"Found {len(categories)} categories")

    # Build tree structure
    tree = {}  # path -> (name, level, parent_path)

    for cat in categories:
        # Unpacking enforces that the classifier returned exactly four levels.
        l1, l2, l3, l4 = categorize_category(cat)

        # Walk sector -> business type -> sub-category -> category, extending
        # the dotted path one slug at a time. The first occurrence of a path
        # wins; later duplicates are ignored.
        parent_path = None
        slugs = []
        for level, label in enumerate((l1, l2, l3, l4), start=1):
            slugs.append(slugify(label))
            path = '.'.join(slugs)
            if path not in tree:
                tree[path] = (label, level, parent_path)
            parent_path = path

    # Print statistics
    level_counts = {1: 0, 2: 0, 3: 0, 4: 0}
    for _name, level, _parent in tree.values():
        level_counts[level] += 1

    print("\nTree structure:")
    print(f" Level 1 (Sectors): {level_counts[1]}")
    print(f" Level 2 (Business Types): {level_counts[2]}")
    print(f" Level 3 (Sub-categories): {level_counts[3]}")
    print(f" Level 4 (Categories): {level_counts[4]}")
    print(f" Total nodes: {len(tree)}")

    if args.dry_run:
        preview_limit = 20
        print("\n[DRY RUN] Would insert these nodes:")
        for path in sorted(tree)[:preview_limit]:
            name, level, _parent = tree[path]
            print(f" {' ' * (level-1)}{name} ({path})")
        # Only mention a remainder when there is one; a small tree used to
        # print a negative count here.
        remaining = len(tree) - preview_limit
        if remaining > 0:
            print(f" ... and {remaining} more")
        return

    # Check for psycopg2
    if not HAS_PSYCOPG2:
        print("\nERROR: psycopg2 is required for database import.")
        print("Install it with: pip install psycopg2-binary")
        return

    # Connect to database
    print("\nConnecting to database...")
    conn = psycopg2.connect(args.db_url)
    try:
        cur = conn.cursor()

        # Run init SQL first
        init_sql_path = os.path.join(os.path.dirname(__file__), 'init', '01_create_categories.sql')
        if os.path.exists(init_sql_path):
            print(f"Running init SQL: {init_sql_path}")
            with open(init_sql_path, 'r', encoding='utf-8') as f:
                cur.execute(f.read())
            conn.commit()

        # Clear existing data. TRUNCATE, the inserts and the count update are
        # committed together below, so a failure part-way leaves the previous
        # table contents untouched once the connection is closed.
        print("Clearing existing categories...")
        cur.execute("TRUNCATE TABLE gbp_categories RESTART IDENTITY CASCADE")

        # Insert nodes in order (parents first)
        print("Inserting categories...")
        path_to_id = {}

        # Sort by level to ensure parents are inserted first
        sorted_items = sorted(tree.items(), key=lambda item: item[1][1])

        for path, (name, level, parent_path) in sorted_items:
            parent_id = path_to_id.get(parent_path) if parent_path else None
            slug = path.split('.')[-1]

            cur.execute("""
                INSERT INTO gbp_categories (name, slug, path, level, parent_id)
                VALUES (%s, %s, %s, %s, %s)
                RETURNING id
            """, (name, slug, path, level, parent_id))

            path_to_id[path] = cur.fetchone()[0]

        # Update category counts (number of descendants of each node)
        print("Updating category counts...")
        cur.execute("""
            UPDATE gbp_categories p
            SET category_count = (
                SELECT COUNT(*) FROM gbp_categories c
                WHERE c.path <@ p.path AND c.path != p.path
            )
        """)

        conn.commit()

        # Verify
        cur.execute("SELECT COUNT(*) FROM gbp_categories")
        count = cur.fetchone()[0]
        print(f"\nSuccess! Inserted {count} nodes into gbp_categories table")

        # Show tree stats
        cur.execute("SELECT * FROM category_tree_stats")
        print("\nTree statistics:")
        for row in cur.fetchall():
            print(f" Level {row[0]}: {row[1]} nodes")

        cur.close()
    finally:
        # Always release the connection, including when a statement fails.
        conn.close()

    print("\nDone!")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the import only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||||
120
db/init/01_create_categories.sql
Normal file
120
db/init/01_create_categories.sql
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
-- Enable ltree extension for hierarchical data
CREATE EXTENSION IF NOT EXISTS ltree;

-- Categories tree table.
-- One row per tree node; `path` is the full dotted ltree path of the node and
-- is unique, `parent_id` points at the parent row (NULL for top-level nodes).
CREATE TABLE IF NOT EXISTS gbp_categories (
    id SERIAL PRIMARY KEY,
    name TEXT NOT NULL,
    slug TEXT NOT NULL,
    path ltree NOT NULL,
    level INT NOT NULL DEFAULT 1,
    parent_id INT REFERENCES gbp_categories(id),
    category_count INT DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(path)
);
|
||||||
|
|
||||||
|
-- Indexes for fast hierarchical queries
-- GiST index on path for the ltree ancestor/descendant operators (<@, @>).
CREATE INDEX IF NOT EXISTS idx_gbp_categories_path ON gbp_categories USING GIST (path);
-- NOTE(review): UNIQUE(path) on the table already creates a btree index on
-- path, so this one looks redundant - confirm before dropping it.
CREATE INDEX IF NOT EXISTS idx_gbp_categories_path_btree ON gbp_categories USING BTREE (path);
-- Plain lookup indexes on the remaining filter columns.
CREATE INDEX IF NOT EXISTS idx_gbp_categories_name ON gbp_categories (name);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_slug ON gbp_categories (slug);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_level ON gbp_categories (level);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_parent ON gbp_categories (parent_id);
|
||||||
|
|
||||||
|
-- Enable trigram extension for fuzzy search.
-- This must run BEFORE the index below: the gin_trgm_ops operator class is
-- provided by pg_trgm, so on a fresh database creating the index first fails
-- because the operator class does not exist yet.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Trigram index on name (used for ILIKE '%term%' and similarity() searches)
CREATE INDEX IF NOT EXISTS idx_gbp_categories_name_trgm ON gbp_categories USING GIN (name gin_trgm_ops);
|
||||||
|
|
||||||
|
-- Function to update timestamp.
-- Trigger function: overwrites updated_at on the row being written with the
-- current timestamp and returns the modified row.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ language 'plpgsql';
|
||||||
|
|
||||||
|
-- Trigger for auto-updating timestamp.
-- Dropped first so the script can be re-run without a "trigger already
-- exists" error; fires before every row UPDATE on gbp_categories.
DROP TRIGGER IF EXISTS update_gbp_categories_updated_at ON gbp_categories;
CREATE TRIGGER update_gbp_categories_updated_at
    BEFORE UPDATE ON gbp_categories
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();
|
||||||
|
|
||||||
|
-- Helper function: Get all children of a category.
-- Returns every node below parent_path (all descendants, not only direct
-- children), excluding the node at parent_path itself, ordered by path.
CREATE OR REPLACE FUNCTION get_category_children(parent_path ltree)
RETURNS TABLE (
    id INT,
    name TEXT,
    slug TEXT,
    path ltree,
    level INT
) AS $$
BEGIN
    RETURN QUERY
    SELECT c.id, c.name, c.slug, c.path, c.level
    FROM gbp_categories c
    -- <@ also matches the node itself, hence the explicit inequality.
    WHERE c.path <@ parent_path AND c.path != parent_path
    ORDER BY c.path;
END;
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
-- Helper function: Get ancestors of a category.
-- Returns every node whose path is a prefix of category_path, excluding the
-- node at category_path itself, ordered by level (top-level node first).
CREATE OR REPLACE FUNCTION get_category_ancestors(category_path ltree)
RETURNS TABLE (
    id INT,
    name TEXT,
    slug TEXT,
    path ltree,
    level INT
) AS $$
BEGIN
    RETURN QUERY
    SELECT c.id, c.name, c.slug, c.path, c.level
    FROM gbp_categories c
    -- category_path <@ c.path: c.path is an ancestor of (or equal to) the input.
    WHERE category_path <@ c.path AND c.path != category_path
    ORDER BY c.level;
END;
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
-- Helper function: Search categories by name (fuzzy).
-- Matches rows whose name contains search_term (case-insensitive) or whose
-- trigram similarity to search_term exceeds 0.3. Results are ordered by
-- similarity (best first), then level, then name, and capped at limit_count
-- rows (default 20).
-- NOTE(review): search_term is spliced into the ILIKE pattern unescaped, so
-- any % or _ it contains acts as a wildcard - confirm that is intended.
CREATE OR REPLACE FUNCTION search_categories(search_term TEXT, limit_count INT DEFAULT 20)
RETURNS TABLE (
    id INT,
    name TEXT,
    path ltree,
    level INT,
    similarity REAL
) AS $$
BEGIN
    RETURN QUERY
    SELECT c.id, c.name, c.path, c.level,
           similarity(c.name, search_term) as sim
    FROM gbp_categories c
    WHERE c.name ILIKE '%' || search_term || '%'
       OR similarity(c.name, search_term) > 0.3
    ORDER BY sim DESC, c.level, c.name
    LIMIT limit_count;
END;
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
-- View for tree statistics.
-- One row per level with the number of nodes at that level. Because rows are
-- grouped by level, in each row only the FILTER column matching that row's
-- level can be non-zero (it then equals `count`).
CREATE OR REPLACE VIEW category_tree_stats AS
SELECT
    level,
    COUNT(*) as count,
    COUNT(*) FILTER (WHERE level = 1) as sectors,
    COUNT(*) FILTER (WHERE level = 2) as business_types,
    COUNT(*) FILTER (WHERE level = 3) as sub_categories,
    COUNT(*) FILTER (WHERE level = 4) as leaf_categories
FROM gbp_categories
GROUP BY level
ORDER BY level;
|
||||||
|
|
||||||
|
-- Catalog comments describing the table and its key columns.
COMMENT ON TABLE gbp_categories IS 'Google Business Profile categories organized in a 4-level hierarchy using ltree';
COMMENT ON COLUMN gbp_categories.path IS 'Hierarchical path using ltree (e.g., Food_Dining.Restaurants.By_Cuisine.Afghan_restaurant)';
COMMENT ON COLUMN gbp_categories.level IS '1=Sector, 2=Business Type, 3=Sub-category, 4=Specific Category';
|
||||||
293
db/recategorize_hierarchical.py
Normal file
293
db/recategorize_hierarchical.py
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Hierarchical re-categorization of items currently filed under "Other".
|
||||||
|
|
||||||
|
APPROACH:
|
||||||
|
1. First pass: Assign to Level 1 (Sector) - items that don't match go to sector's "Other" business type
|
||||||
|
2. Second pass: Within each sector, refine Level 2 (Business Type)
|
||||||
|
3. Third pass: Within each business type, refine Level 3 (Sub-category)
|
||||||
|
|
||||||
|
This creates:
|
||||||
|
- Sector.Other.Uncategorized for sector-level unknowns
|
||||||
|
- Sector.BusinessType.Other for business-type-level unknowns
|
||||||
|
|
||||||
|
EXISTING SECTORS (21 + Other):
|
||||||
|
Agriculture, Automotive, Education, Entertainment, Events_Weddings, Finance_Insurance,
|
||||||
|
Food_Dining, Government, Healthcare, Home_Services, Hospitality_Travel, Industrial,
|
||||||
|
Non_Profit, Personal_Services, Pets_Animals, Professional_Services, Real_Estate,
|
||||||
|
Religious, Retail_Shopping, Technology, Transportation, Other
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
# ==================== LEVEL 1: SECTOR ASSIGNMENT ====================
|
||||||
|
# Maps keyword patterns to sectors. Order matters - first match wins.
|
||||||
|
# These are broad patterns to catch as much as possible at sector level.
|
||||||
|
|
||||||
|
SECTOR_PATTERNS = [
|
||||||
|
# HEALTHCARE - Medical professionals, facilities, services
|
||||||
|
(r'(doctor|clinic|hospital|medical|health\s|dental|dentist|therapy|therapist|psycho|chiropract|optom|optician|pharmacy|pharmacist|nurse|surgeon|physician|cardiolog|dermatol|pediatr|orthoped|neurolog|oncolog|urolog|allergist|anesthesiol|audiolog|blood\sbank|blood\sdonat|blood\stest|dialysis|fertility|hospice|rehab|physiother|acupunct|naturopath|homeopath|osteopath|midwife|birth\scenter|prenatal|maternity|wellness\s(clinic|center)|diagnostic|x-ray|mri|ultrasound|laboratory|patholog|radiolog|pulmonolog|gastroenter|endocrin|rheumatol|immunolog|geriatr|podiatr|ophthalmolog|otolaryng|hematolog|nephrolog|proctolog|physiatrist|diabetolog|toxicolog|epidemiolog|oncology|assisted\sliving|nursing\shome|senior\scare|aged\scare|elder\scare|ambulance|emergency\sroom|urgent\scare|first\said|denture|diabetes\scenter|eye\scare|hiv\stest|perinatal|physical\sexam|pregnancy\scare|surgical\scenter|mammograph|std\stest|drug\stest|lactation|doula|bonesetting|hearing\said|prosthetic|orthotic|oxygen|ostomy|sleep\sclinic|sleep\slab|fertility|ivf|sperm\sbank|stem\scell|general\spractitioner|gynecolog|obstetrician|hepatolog|intensivist|internist|neurophysiol|orthoptist|prosthodontist|sexolog|venereolog|nutritionist|dietitian|endoscopist|kinesiolog|pedorthist|seitai|foot\scare|internal\smedicine|family\smedic|family\sdoctor|gp\s|medical\sward)', 'Healthcare'),
|
||||||
|
|
||||||
|
# EDUCATION - Schools, training, learning
|
||||||
|
(r'(school|university|college|academy|training\scenter|training\sschool|lesson|instructor|tutor|education|library|kindergarten|preschool|pre-?school|daycare|day\scare|learning\scenter|vocational|apprentice|faculty|campus|institute|seminary|boarding\sschool|private\sschool|public\sschool|elementary|middle\sschool|high\sschool|montessori|waldorf|charter\sschool|language\sschool|driving\sschool|flight\sschool|cooking\sclass|art\sclass|music\sclass|dance\sclass|acting\sclass|drama\sclass|conservatory|music\sacademy|ballet\sacademy|film\sschool|design\sschool|fashion\sschool|culinary|bartending|beauty\sschool|cosmetology|esthetician|barber\sschool|massage\sschool|yoga\steacher|yoga\straining|meditation\sclass|self-?defense\sclass|swimming\slesson|tennis\slesson|golf\slesson|ski\sschool|surf\sschool|scuba|sailing\sschool|studying\scenter|test\sprep|sat\sprep|gre\sprep|cram\sschool|juku|hagwon|coaching\scenter|head\sstart|early\shead|childminder|assistante\smaternelle|au\spair|nanny\sagency|student\sdormitor|student\shousing|student\scareer|career\scounseling|english\slanguage\scamp|language\scamp|summer\scamp|science\scamp|coding\scamp|academic\sdepartment)', 'Education'),
|
||||||
|
|
||||||
|
# AUTOMOTIVE - Vehicles, parts, services
|
||||||
|
(r'(auto\s|car\s|vehicle|motor\s|tire\s|tyre\s|mechanic|garage(?!\sdoor)|parking\s(lot|garage|facility)|driving|truck\s|motorcycle|motorbike|scooter\s|atv\s|automotive|car\swash|car\sdetail|car\sdealer|car\srental|car\slease|car\sinspect|car\sauction|smog\scheck|oil\schange|brake\s|transmission|radiator|exhaust|muffler|auto\sbody|collision|windshield|car\sstorage|towing|roadside)', 'Automotive'),
|
||||||
|
|
||||||
|
# TRANSPORTATION - Moving people/goods
|
||||||
|
(r'(airport|airline|aviation(?!\sschool)|aircraft|airplane|airfield|airstrip|heliport|seaplane|ferry|cruise|port\sauthority|port\soperating|harbor|dock\s|pier\s|marina|shipping|freight|cargo|trucking|logistics|warehouse|courier|messenger|delivery\sservice|taxi|cab\sservice|limo|chauffeur|bus\sstation|bus\sterminal|train\sstation|rail|metro|subway|transit|rickshaw|bicycle\srental|boat\srental|bike\sshare|car\sshare)', 'Transportation'),
|
||||||
|
|
||||||
|
# GOVERNMENT - Public administration, military, legal system
|
||||||
|
(r'(government|military|army\s|navy\s|naval\sbase|air\sforce|marine\s|coast\sguard|national\sguard|police|sheriff|law\senforce|fire\sstation|fire\sdepartment|courthouse|court\s|embassy|consulate|city\shall|municipal|county\s|district\soffice|passport|immigration|citizenship|dmv|tax\soffice|social\ssecurity|border|customs|post\soffice|postal|public\srecord|voter|election|legislature|parliament|congress|senate|mayor|governor|council|permit|license\s(office|bureau)|civil\sdefense|emergency\smanagement|public\ssafety|prison|jail|detention|correctional|probation|parole|aadhaar|agenzia\sentrate|anganwadi|asylum\scenter|city\sclerk|environment\soffice|land\sregistry|patent\soffice|pension\soffice|registration\soffice|registry\soffice|unemployment|employment\scenter|citizen\sinformation|consumer\sadvice|state\sarchive|national\sarchive|public\sarchive|guardia\scivil|highway\spatrol|department\sof|ministry\sof|bureau\sof|board\sof\seducation|public\sworks|sanitation|water\sauthority|housing\sauthority|port\sauthority|transit\sauthority)', 'Government'),
|
||||||
|
|
||||||
|
# RELIGIOUS - Places of worship, spiritual
|
||||||
|
(r'(church|temple|mosque|masjid|synagogue|chapel|cathedral|basilica|parish|religious|spiritual|ashram|monastery|convent|abbey|priory|buddhist|hindu|christian|catholic|protestant|orthodox|baptist|methodist|lutheran|presbyterian|pentecostal|evangelical|muslim|islamic|jewish|judai|sikh|gurdwara|gurudwara|baha.?i|shinto|taoist|quaker|mennonite|amish|latter-?day|jehovah|scientolog|meditation\scenter|retreat\scenter|pilgrimage|shrine|pagoda|wat\s|vihara|mission(?!\scontrol)|musalla|place\sof\sworship|rectory|yeshiva|marae|congregation|spiritist|priest|mohel|botanica)', 'Religious'),
|
||||||
|
|
||||||
|
# ENTERTAINMENT - Fun, recreation, sports, arts, culture
|
||||||
|
(r'(sports\s|sport\s|club(?!\shouse)|field$|court\s|gym\s|gymnasium|fitness|athletic|stadium|arena|pool\s|swimming|track\s|golf\s|tennis|soccer|football|basketball|baseball|hockey|volleyball|badminton|squash|racquetball|bowling|billiard|snooker|boxing|martial\sart|karate|judo|taekwondo|aikido|wrestling|fencing|archery|shooting\srange|gun\sclub|yoga\s|pilates|crossfit|cycling|skating|skateboard|skiing|snowboard|surfing|diving|climbing|bouldering|trampoline|gymnastics|dance\s|ballet|museum|theater|theatre|cinema|movie|art\sgallery|art\scenter|art\sstudio|gallery|music\svenue|concert|entertainment|amusement|theme\spark|water\spark|zoo|aquarium|wildlife|safari|botanical|arboretum|casino|gambling|betting|arcade|game\scenter|escape\sroom|laser\stag|paintball|go-?kart|mini\sgolf|comedy\sclub|jazz\sclub|blues\sclub|karaoke|nightclub|disco|rave|circus|carnival|fair\s|rodeo|bullring|race\strack|racecourse|hippodrome|velodrome|skate\spark|bmx|motocross|off-?road|aquatic\scenter|batting\scage|bungee|hang\sglid|paraglid|skydiv|indoor\ssnow|leisure\scenter|recreation\scenter|cultural\scenter|exhibit|festival|philharmon|opera\shouse|opera\scompany|symphony|orchestra|planetarium|observatory|science\scenter|discovery\scenter|children.*amusement|funfair|bouncy\scastle|inflatab|playground|adventure\spark|treetop|zipline|zip\sline|ropes\scourse|obstacle\scourse|ninja\swarrior|canoeing|kayaking|rafting|fishing\spond|fishing\sarea|bird\swatch|nature\sreserve|nature\scenter|hiking\strail|walking\strail|hiking\sarea|beach\spavil|beach\sresort|waterfront|promenade|pier\s(?!fishing)|boardwalk|scenic\spoint|scenic\sspot|lookout|viewpoint|observation|monument|landmark|castle|palace|fortress|historic\ssite|heritage|ruins|amphitheater|bandstand|gazebo|pavilion|curling\shall|scout\shall|scout\shome|village\shall|community\shall|social\shall|civic\scenter|convention\scenter|exhibition\scenter|artist$|band$|choir|musician|entertainer|magician|pyrotechnician|performing\sarts|stage$|sculpture|statuar
y|painting$|roller\scoaster|haunted\shouse|fairground|ghost\stown|lido|rugby|rugby\sfield|softball\sfield|little\sleague\sfield|water\spolo|cricket\sground|rowing\sarea|weightlifting|off\sroading|prawn\sfishing|raft\strip|mountaineering|summer\stoboggan|pumpkin\spatch|picnic\sground|national\sforest|national\sreserve|national\spark|nature\spreserve|protected\sarea|reenactment|sambodrome|pachinko|mahjong\shouse|children\shall|children.*camp|outdoor\sactivity|outdoor\sbath|onsen|thermal\sbath|day-?use\sonsen|foot\sbath)', 'Entertainment'),
|
||||||
|
|
||||||
|
# FOOD & DINING - Restaurants, bars, food production
|
||||||
|
(r'(restaurant|cafe(?!\steria)|café|coffee\s|espresso|bar\s(?!association)|pub\s|tavern|lounge|brewery|taproom|brewpub|winery|distillery|bakery|patisserie|pastry|dessert|ice\scream|gelato|frozen\syogurt|pizzeria|pizza\s|taco|burrito|sushi|ramen|noodle|dim\ssum|dumpling|steakhouse|steak\shouse|seafood|grill|bbq|barbecue|diner|bistro|brasserie|eatery|canteen|cafeteria|food\scourt|food\struck|food\scart|catering|caterer|buffet|brunch|breakfast|lunch|dinner|takeout|take-?away|delivery\sfood|meal|kitchen(?!\scabinet)|chef\s|cook\s|juice\sbar|smoothie|tea\shouse|traditional\steahouse|bubble\stea|boba|wine\sbar|wine\scellar|cocktail|speakeasy|gastropub|chophouse|crab\shouse|fish\s&\schips|curry|indian\srestaurant|chinese\srestaurant|chinese\stakeaway|italian\srestaurant|mexican\srestaurant|thai\srestaurant|japanese\srestaurant|korean\srestaurant|vietnamese|french\srestaurant|greek\srestaurant|mediterranean|middle\seastern|african\srestaurant|caribbean|latin\samerican|american\srestaurant|fast\sfood|quick\sservice|drive-?thru|dhaba|tiffin|hawker|churreria|creperie|crepe|pastelaria|pasteleria|tapas|izakaya|yakiniku|okonomiyaki|tempura|udon|soba|tonkatsu|kaiseki|robatayaki|teppanyaki|kushiyaki|yakitori|gyudon|poke\sbowl|acai|falafel|shawarma|kebab|gyro|pita|hummus|mezze|tagine|injera|pho|banh\smi|bibimbap|bulgogi|kimchi|hotpot|fondue|raclette|schnitzel|bratwurst|currywurst|pierogi|borscht|blini|pelmeni|empanada|arepa|pupusa|ceviche|asado|churrasco|rodizio|feijoada|moqueca|acaraje|jerk|oxtail|doubles|roti|samosa|biryani|tandoori|masala|tikka|naan|dosa|idli|vada|chaat|thali|satay|laksa|rendang|nasi\sgoreng|pad\sthai|som\stam|tom\syum|green\scurry|massaman|poutine|smoked\smeat|lobster\sroll|clam\schowder|po.?boy|gumbo|jambalaya|soul\sfood|southern\sfood|cajun|creole|carvery|dairy$|frituur|fruit\sparlor|meyhane|sugar\shack|yakatabune|olive\soil\scooperative|soy\ssauce)', 'Food_Dining'),
|
||||||
|
|
||||||
|
# HOME SERVICES - Home improvement, maintenance, repair
|
||||||
|
(r'(plumb|electrician|electrical\scontract|hvac|heating|air\scondition|cooling|roof|landscap|lawn\s|garden\sservice|gardener|arborist|tree\sservice|clean\s(service|company)|cleaning\sservice|cleaners$|pest\scontrol|exterminator|paint\scontract|painter(?!\sartist)|paint\sstrip|carpent|cabinet\smaker|flooring|tile\sinstall|hardwood|carpet\sinstall|repair\sservice|contractor|remodel|renovation|handyman|locksmith\sservice|moving\scompany|mover\s|moving\sand\sstorage|piano\smoving|appliance\srepair|garage\sdoor|gutter|chimney|window\sinstall|door\sinstall|double\sglazing|glass\srepair|fence\s|deck\sbuild|patio|drywall|insulation|siding|masonry|brick|concrete|paving|asphalt|pool\sservice|pool\scleaning|spa\sservice|septic|sewer|drain|water\sheater|well\sdrill|solar\sinstall|solar\spanel\smaintenance|security\ssystem|alarm\sinstall|home\sinspect|building\sinspect|surveyor|interior\sdesign|home\sstaging|pressure\swash|graffiti\sremoval|debris\sremoval|junk\sremoval|house\sclearance|snow\sremoval|antenna\sservice|satellite\sinstall|gasfitter|gas\sinstall|height\sworks|impermeabilization|wallpaper\sinstall|airbrushing|home\shelp|stall\sinstall)', 'Home_Services'),
|
||||||
|
|
||||||
|
# RETAIL & SHOPPING - Stores, shops, markets
|
||||||
|
(r'(store\s|shop\s(?!service)|retail|boutique|market(?!ing)|mall\s|outlet|dealer(?!ship)|supplier|wholesale|distributor|supermarket|grocery|convenience|department\sstore|discount|thrift|consignment|pawn|antique|vintage|secondhand|used\s|book\sstore|stationery|office\ssupply|toy\sstore|game\sstore|hobby|craft\sstore|art\ssupply|music\sstore|record\sstore|electronics|computer\sstore|phone\sstore|appliance\sstore|furniture\sstore|home\sdecor|bedding|mattress|kitchenware|hardware|tool\sstore|building\ssupply|lumber|garden\scenter|plant\snursery|florist|flower\sshop|pet\sstore|pet\ssupply|clothing|fashion|apparel|shoe\sstore|jewelry|watch\sstore|cosmetic|beauty\ssupply|pharmacy|drugstore|health\sstore|vitamin|supplement|sporting\sgoods|outdoor\sstore|bicycle\sshop|gun\sshop|hunting|fishing\sstore|camping|liquor|wine\sshop|beer\sstore|tobacco|cigar|vape|smoke\sshop|candy|chocolate|confection|bakery\sshop|cheese\sshop|spice|tea\sshop|coffee\sshop(?!\scafe)|newsstand|kiosk|vending|bazar|bazaar|hawker\scenter|flea\smarket|farmers\smarket|night\smarket|food\shall|food\scourt|deli(?!very)|delicatessen|charcuterie|butcher|fishmonger|greengrocer|produce|fruit\sstand|flower\sstand|fabric|textile\sshop|yarn|knitting|sewing\sshop|craft\ssuppl|frame\sshop|framing|trophy|engraving|gift\sshop|souvenir|duty\sfree|airport\sshop|convenience|corner\sstore|general\sstore|variety|dollar\sstore|pound\sshop|euro\sshop|99\scent|surplus|closeout|liquidat|outlet\small|factory\soutlet|warehouse\sstore|membership\sclub|costco|sam.*club)', 'Retail_Shopping'),
|
||||||
|
|
||||||
|
# PROFESSIONAL SERVICES - Business services, consulting, legal, creative
|
||||||
|
(r'(lawyer|attorney|law\sfirm|legal\sservice|accountant|accounting|bookkeep|cpa\s|tax\s(prepar|service|consult)|consultant|consulting|architect(?!ure)|engineer(?!ing\sschool)|survey\scompany|land\ssurvey|topograph|agency(?!\sgovernment)|staffing|recruiting|recruiter|employment\sagency|hr\sservice|marketing|advertis|pr\sfirm|public\srelations|graphic\sdesign|web\sdesign|website\sdesign|photography|photographer|videograph|film\sproduction|animation\sstudio|recording\sstudio|rehearsal\sstudio|production\sstudio|portrait\sstudio|model\sportfolio\sstudio|painting\sstudio|translation|interpret|transcription|notary|commissioner\sfor\soaths|private\sinvestigat|detective|appraiser|appraisal|estate\sappraiser|auditor|financial\saudit|actuary|financial\splanner|wealth\smanag|investment\sadvis|business\sconsult|management\sconsult|it\sconsult|media\scompany|media\shouse|record\scompany|scenograph|model\sdesign|telemarket|direct\smail|copywriter|editor|proofreader|technical\swriter|ghostwriter|literary\sagent|talent\sagent|booking\sagent|casting|modeling\sagent|artist\smanage|court\sreport|patent\sagent|trademark|intellectual\sproperty|customs\sbroker|freight\sforward|import\sexport|export\scompany|geological\sresearch|geological\sservice|environmental\sconsult|safety\sconsult|quality\sconsult|process\sserv|skip\strac|bail\senforce|collection\sagent|factoring|mezzanine\sfinance|conveyancer|executor|genealogist|gemologist|loss\sadjuster|foreclosure|insolvency|judicial\sscrivener|commercial\sagent|executive\ssearch|payroll\sservice|resume\sservice|typing\sservice|fax\sservice|mailing\sservice|shredding\sservice|blueprint|drafting|mapping\sservice|research\sand\sproduct|information\sservice|news\sservice|music\smanagement|yacht\sbroker|finance\sbroker|food\sbroker)', 'Professional_Services'),
|
||||||
|
|
||||||
|
# INDUSTRIAL - Manufacturing, construction, mining, utilities, trades
|
||||||
|
(r'(factory|plant(?!\snursery)|mill$|mill\s|manufactur|industrial|mining|mine\s|quarry|production|foundry|forge|smelter|refinery|chemical\s|pharmaceutical\scompan|textile|garment\sfactory|food\sprocessing|cannery|bottling|assembly|fabricat|machine\sshop|metal\swork|metal\sprocess|metallurg|welding|welder|steel|iron\sworks|aluminum|plastic|rubber|paper\smill|lumber\smill|sawmill|saw\smill|print\sshop|commercial\sprint|digital\sprint|packaging|recycling|waste\smanagement|construction\scompany|general\scontractor|building\scompany|building\sfirm|developer|civil\sengineering|demolition|excavat|crane\sservice|scaffold|heavy\sequipment|blacksmith|coppersmith|goldsmith|silversmith|horseshoe|locksmith(?!\sservice)|tinsmith|gunsmith|bladesmith|knifesmith|boilermaker|machinist|millwright|pipefitter|rigger|sheet\smetal|ironwork|structural\ssteel|precast|concrete\splant|asphalt\splant|gravel|aggregate|sand\s&\sgravel|earth\sworks|anodizing|electroplat|galvaniz|powder\scoat|metal\spolish|metal\sfinish|sandblast|shot\sblast|heat\streat|tempering|hardening|casting|die\scast|injection\smold|blow\smold|extrusion|stamping|forging|cnc|lathe|milling\smachine|grinding|boring|drilling|water\sutility|electric\sutility|gas\scompany|power\sstation|power\splant|nuclear\spower|solar\senergy|wind\sfarm|hydroelectric|substation|transformer|utility\scompany|water\spurification|sewage|wastewater|biotechnolog|shipbuilding|ship\srepair|shipyard|dry\sdock|boatyard|marine\sengine|propeller|cotton\smill|flour\smill|rice\smill|jute\smill|water\smill|weaving\smill|cider\smill|slaughterhouse|tannery|dyeworks|meat\spacker|meat\sprocessor|fruit.*processing|glass\sindustry|sewing\scompany|turnery|toolroom|machine\sconstruct|stone\scutter|stone\scarving|joiner|woodworker|plasterer|glazier|plating\sservice|embossing|lamination|laser\scutting|water\sjet|salvage\syard|junkyard|garbage\sdump|waste\stransfer|coalfield|oilfield)', 'Industrial'),
|
||||||
|
|
||||||
|
# HOSPITALITY & TRAVEL - Lodging, tourism
|
||||||
|
(r'(hotel|motel|inn\s|resort|hostel|lodge\s|bed\s&\sbreakfast|bed\sand\sbreakfast|b&b|guesthouse|guest\shouse|vacation\srental|holiday\s(rental|apartment|home)|cabin\srental|cottage\srental|cottage(?!\sindustry)|chalet|airbnb|vrbo|travel\sagent|travel\sagency|tour\soperator|tour\sguide|tourist\s(information|office|attraction)|sightseeing|excursion|cruise|camping|campground|caravan\spark|rv\spark|glamping|youth\shostel|retreat\scenter(?!\sreligious)|boarding\shouse|rooming\shouse|dormitory(?!\sstudent)|rest\sstop|rest\sarea|truck\sstop|service\sarea|visitor\scenter|welcome\scenter|country\shouse|manor\shouse|estate\shouse|villa\srental|apartment\shotel|extended\sstay|residence\sinn|suite\shotel|capsule\shotel|love\shotel|ryokan|minshuku|pension\s|agriturismo|pousada|parador|paradores)', 'Hospitality_Travel'),
|
||||||
|
|
||||||
|
# PERSONAL SERVICES - Beauty, wellness, personal care
|
||||||
|
(r'(salon\s|spa\s(?!automotive)|massage(?!\schair)|tattoo|piercing|body\sart|barber|beauty\s(?!supply|store)|nail\s|manicure|pedicure|hair\s(salon|stylist|dresser|cut)|waxing|threading|lash|brow|eyelash|makeup\sartist|esthetician|cosmetolog|tanning|sunbed|sauna|steam\sroom|bathhouse|hammam|laundry|laundromat|dry\sclean|tailor|alteration|seamstress|shoe\srepair|cobbler|watch\srepair|key\scutting|weight\sloss|diet\scenter|personal\strainer|life\scoach|dating\sservice|matchmak)', 'Personal_Services'),
|
||||||
|
|
||||||
|
# FINANCE & INSURANCE - Banks, financial services
|
||||||
|
(r'(bank(?!\sfood)|credit\sunion|savings\s&\sloan|atm\s|insurance\s(agent|agency|company|broker)|mortgage|loan\s(company|officer|broker)|lending|finance\scompany|financial\sservic|investment\s(firm|company|bank)|stock\sbroker|wealth\smanage|money\stransfer|remittance|currency\sexchange|forex|check\scash|payday\sloan|pawn(?!shop)|bail\sbond|credit\srepair|debt\scollect|factoring|leasing\scompany)', 'Finance_Insurance'),
|
||||||
|
|
||||||
|
# REAL ESTATE - Property, housing, storage
|
||||||
|
(r'(real\sestate|realtor|property\s(agent|management|company)|apartment\s(complex|building|rental)|condo|condominium|housing|home\sbuilder|land\sdeveloper|commercial\sreal|office\sspace|coworking|business\scenter|storage\s(facility|unit)|self.?storage|mini\sstorage|warehouse\sspace|parking\sspace|mobile\shome\spark|trailer\spark)', 'Real_Estate'),
|
||||||
|
|
||||||
|
# EVENTS & WEDDINGS - Event services, funeral
|
||||||
|
(r'(funeral|mortuary|cremation|crematorium|cemetery|memorial\s|casket|burial|wedding\s(planner|venue|dress|photographer)|event\s(planner|venue|center)|party\s(planner|supply|rental)|banquet\shall|reception\shall|conference\scenter|convention|meeting\sroom|catering\shall|dj\sservice|disc\sjockey|band\sfor\shire|balloon|decoration\sservice|tent\srental|photo\sbooth|florist(?!\sshop))', 'Events_Weddings'),
|
||||||
|
|
||||||
|
# NON-PROFIT - Charities, community organizations, social services
|
||||||
|
(r'(charity|charitable|non-?profit|ngo\s|foundation(?!\srepair)|community\scenter|community\sorganiz|civic\s|volunteer|food\sbank|soup\skitchen|homeless\s(shelter|service)|social\sservice|social\sworker|welfare\soffice|crisis\scenter|hotline|support\sgroup|self-?help|aa\s|alcoholics|narcotics\sanonymous|veteran|vfw|american\slegion|rotary|lions\sclub|kiwanis|elks|freemason|masonic|fraternal|chamber\sof\scommerce|chamber\sof\shandicrafts|trade\sassociation|professional\sassociation|labor\sunion|tenant.*union|indigenous|aboriginal|tribal|youth\scenter|youth\scare|youth\sgroup|senior\scitizen\scenter|women.s\s(shelter|center|protection)|domestic\sviolence|battered|abuse\s(shelter|center)|halfway\shouse|sober\sliving|addiction\s(center|service)|recovery\scenter|rehab\scenter(?!ilitation)|detox|mental\shealth\sadvocacy|disability\s(service|advocacy)|deaf\sservice|blind\sservice|immigrant\s(service|aid)|refugee\s(service|aid|camp)|legal\said|pro\sbono|family\sservice|family\splanning|birth\scontrol|child\swelfare|foster\scare|adoption\sagency|big\sbrothers|big\ssisters|boys\s&\sgirls|ymca|ywca|jewish\scommunity|jcc|salvation\sarmy|goodwill|habitat\sfor\shumanity|red\scross|united\sway|make-?a-?wish|special\solympics|donations\scenter|thrift(?!\sstore)|donation\sdrop|orphanage|children.*home|group\shome|shelter$|scouting|literacy\sprogram|crime\svictim|mediation\sservice|special\seducator|playgroup|student\sunion)', 'Non_Profit'),
|
||||||
|
|
||||||
|
# TECHNOLOGY - IT, software, telecom
|
||||||
|
(r'(software|app\sdevelop|web\sdevelop|it\sservice|it\ssupport|computer\sservice|computer\srepair|computer\ssecurity|computer\snetwork|tech\ssupport|data\scenter|data\srecovery|data\sentry|database|server\s(farm|hosting)|cloud\sservice|internet\sservice|isp\s|broadband|telecom|telephone\scompany|mobile\s(operator|network)|cell\sphone\sservice|fiber\soptic|satellite\s(communication|service)|cable\sprovider|cybersecurity|network|systems\sintegrat|bpo|call\scenter|outsourc|automation\scompany|home\sautomation|robotics|ai\scompany|machine\slearning|e-?commerce|digital\smarketing|seo|web\shost|domain\sregist|ssl|vpn|managed\sservice|msp|helpdesk|remote\ssupport|pc\srepair)', 'Technology'),
|
||||||
|
|
||||||
|
# AGRICULTURE - Farming, ranching
|
||||||
|
(r'(farm(?!acy|er.s\smarket)|ranch|agriculture|livestock|cattle|poultry|dairy\sfarm|pig\sfarm|sheep|goat|horse\sfarm|stable(?!\sservice)|equestrian\scenter|riding\sschool|crop|orchard|vineyard(?!\swinery)|plantation|greenhouse|horticulture|nursery(?!school)|floricult|aquaculture|fish\sfarm|beekeep|apiary|agronomy|fertilizer|seed\scompany|farm\sequipment|tractor|irrigation|grain|silo|feed\sstore|livestock\sauction|veterinari.*(large|farm|livestock))', 'Agriculture'),
|
||||||
|
|
||||||
|
# PETS & ANIMALS - Pet services, animal welfare
|
||||||
|
(r'(pet\s(?!rol)|animal\s(?!hospital|clinic)|dog\s(?!hot)|cat\s|bird\s(?!watch)|fish\s(?!market|restaurant)|reptile|aquarium\sstore|vet(?!eran)|veterinar(?!.*large|.*farm)|kennel|doggy\sdaycare|pet\sgrooming|pet\sboarding|pet\ssitting|dog\swalk|pet\strain|animal\sshelter|animal\srescue|animal\scontrol|humane\ssociety|spca|aspca|wildlife\srehab|sanctuary|cattery|aviary|breeder|stud\sservice|horse\sboarding|stable(?!\sindustry)|equine|farrier|horse\sshoe)', 'Pets_Animals'),
|
||||||
|
|
||||||
|
# FINANCE & INSURANCE (second pass) - repeats most of the earlier Finance terms and adds investment, retirement, trading and fintech terms
|
||||||
|
(r'(bank(?!\sfood)|credit\sunion|savings\s&\sloan|atm\s|insurance\s(agent|agency|company|broker)|mortgage|loan\s(company|officer|broker)|lending|finance\scompany|financial\sservic|investment\s(firm|company|bank)|stock\sbroker|wealth\smanage|money\stransfer|remittance|currency\sexchange|forex|check\scash|payday\sloan|bail\sbond|credit\srepair|debt\scollect|factoring|leasing\scompany|venture\scapital|private\sequity|hedge\sfund|asset\smanag|trust\scompany|escrow|title\scompany|credit\scounseling|financial\splanning|retirement\splanning|pension\sfund|401k|ira|annuity|securities|commodities|futures|options|trading|brokerage|fintech|mobile\smoney|digital\swallet|cryptocurrency|bitcoin|blockchain)', 'Finance_Insurance'),
|
||||||
|
|
||||||
|
# Catch more rentals and specialized services
|
||||||
|
(r'(equipment\srental|tool\srental|party\srental|tent\srental|chair\srental|table\srental|linen\srental|costume\srental|tuxedo\srental|dress\srental|appliance\srental|furniture\srental|office\sequipment\srental|audiovisual.*rental|av\srental|musical\sinstrument\srental|ski\srental|snowboard\srental|snowmobile\srental|jet\sski\srental|boat\srental|kayak\srental|canoe\srental|bicycle\srental|scooter\srental|segway|atv\srental|motorcycle\srental|rv\srental|camper\srental|trailer\srental|truck\srental|van\srental|car\srental|forklift\srental|crane\srental|scaffolding\srental|construction.*rental|dumpster\srental|portable\stoilet|porta.*potty)', 'Retail_Shopping'),
|
||||||
|
|
||||||
|
# Specialized restoration and repair services
|
||||||
|
(r'(restoration\sservice|furniture\srestoration|antique\srestoration|art\srestoration|photo\srestoration|document\srestoration|clock\srepair|watch\srepair|jewelry\srepair|shoe\srepair|luggage\srepair|leather\srepair|upholstery\srepair|musical\sinstrument\srepair|piano\stuning|guitar\srepair|violin\srepair|camera\srepair|electronics\srepair|phone\srepair|screen\srepair|computer\srepair|printer\srepair|copier\srepair|typewriter|sewing\smachine\srepair|vacuum\srepair|small\sengine\srepair|lawn\smower\srepair|chainsaw|power\stool\srepair|fire\sextinguisher\sservice|scale\srepair|calibration|water\sdamage\srestoration|fire\sdamage|smoke\sdamage|mold\sremediation|biohazard|crime\sscene\sclean|hoarding\sclean)', 'Home_Services'),
|
||||||
|
|
||||||
|
# Specialized trades and craftspeople
|
||||||
|
(r'(clock\smaker|watch\smaker|furniture\smaker|cabinet\smaker|instrument\smaker|stringed\sinstrument\smaker|piano\smaker|organ\sbuilder|luthier|bookbinder|print\smaker|engraver|etcher|lithograph|screen\sprint|sign\smaker|sign\spainter|glass\sblower|stained\sglass|ceramic|pottery|potter|sculptor|woodcarver|wood\sturner|basket\smaker|weaver|spinner|knitter|quilter|longarm|embroidery|monogram|tailor|seamstress|dressmaker|milliner|cobbler|saddle|harness|leather\scraft|upholster|framemaker|gilder|conservator|taxiderm|model\smaker|prop\smaker|costume\smaker|wig\smaker|prosthetic|mask\smaker|puppet|doll\smaker|toy\smaker)', 'Industrial'),
|
||||||
|
|
||||||
|
# Specialized testing and inspection services
|
||||||
|
(r'(testing\sservice|inspection\sservice|asbestos\stest|lead\stest|radon\stest|water\stest|soil\stest|air\squality|environmental\stest|mold\stest|home\sinspect|building\sinspect|property\sinspect|roof\sinspect|termite\sinspect|pest\sinspect|pool\sinspect|chimney\sinspect|septic\sinspect|well\sinspect|electrical\sinspect|plumbing\sinspect|hvac\sinspect|fire\sinspect|safety\sinspect|code\senforcement|energy\saudit|blower\sdoor|duct\stest|infrared|thermal\simag)', 'Professional_Services'),
|
||||||
|
|
||||||
|
# Personal and lifestyle services
|
||||||
|
(r'(psychic|astrologer|fortune\steller|fortune\stelling|palm\sread|tarot|medium|spiritual\sadvis|feng\shui|numerolog|grapholog|hypnotherap|hypnosis|past\slife|akashic|aura|chakra|reiki|energy\shealing|crystal\shealing|sound\shealing|aromatherap|reflexolog|iridolog|kinesiology|craniosacral|rolfing|alexander\stechnique|feldenkrais|pilates\sinstructor|yoga\sinstructor|meditation\sinstructor|breathwork|pranayama|ayurved|traditional\schinese|tcm|herbalist|naturopath|homeopath|beautician|esthetician|esthetics|body\sshaping|boot\scamp|loctician|mehandi|mehndi|teeth\swhitening|wellness\sprogram|alternative\smedicine\spractitioner)', 'Personal_Services'),
|
||||||
|
|
||||||
|
# More Government patterns
|
||||||
|
(r'(archive$|birth\scertificate|city\semployment|state\semployment|company\sregistry|district\sjustice|justice\sdepartment|land\splanning|urban\splanning|toll\sstation|traffic\sofficer|weigh\sstation|sanitary\sinspect|smog\sinspect|superfund|water\sworks|weather\sforecast|ground\sself\sdefense|united\sstates\sarmed|radio\sbroadcaster|television\sstation|closed\scircuit|communications\stower)', 'Government'),
|
||||||
|
|
||||||
|
# More Transportation patterns
|
||||||
|
(r'(boat\sramp|container\sterminal|helicopter\scharter|river\sport|transportation\sservice|transportation\sescort|fixed-?base\soperator|handicapped\stransportation|carpooling)', 'Transportation'),
|
||||||
|
|
||||||
|
# More Finance patterns
|
||||||
|
(r'(diamond\sbuyer|financial\sinstitution|holding\scompany|leasing\sservice|stock\sexchange|money\sorder|payment\sterminal)', 'Finance_Insurance'),
|
||||||
|
|
||||||
|
# More Real Estate patterns
|
||||||
|
(r'(corporate\soffice|display\shome|townhouse\scomplex|villa$|serviced\s(accommodation|apartment)|function\sroom|virtual\soffice)', 'Real_Estate'),
|
||||||
|
|
||||||
|
# More Entertainment/Sports patterns
|
||||||
|
(r'(fishing\s(camp|charter|pier)|horseback\sriding|horse\srental|equestrian\sfacility|outdoor\sequestrian|salsa\sclass|wood\sworking\sclass|stitching\sclass|childbirth\sclass|mehandi\sclass)', 'Entertainment'),
|
||||||
|
|
||||||
|
# More Industrial/Repair patterns
|
||||||
|
(r'(engine\srebuilding|machine\smaintenance|saw\ssharpening|skate\ssharpening|sharpening\sservice|lpg\sconversion|cng\sfitment|boat\sdetailing|rv\sdetailing|rv\srepair|bike\swash|fire\sprotection|elevator\sservice|drone\sservice)', 'Industrial'),
|
||||||
|
|
||||||
|
# More Retail patterns
|
||||||
|
(r'(haberdashery|jeweler$|lapidary|glass\smerchant|furniture\saccessories|showroom$|tesla\sshowroom|bottle.*redemption|coin\soperated)', 'Retail_Shopping'),
|
||||||
|
|
||||||
|
# More Professional Services patterns
|
||||||
|
(r'(building\sdesigner|polygraph|professional\sorganizer|video\s(conferencing|duplication|editing)|meeting\splanning|personal\sconcierge|house\ssitter|marriage\scelebrant|singing\stelegram|roommate\sreferral)', 'Professional_Services'),
|
||||||
|
|
||||||
|
# Miscellaneous remaining - catch-all for specific items
|
||||||
|
(r'(agistment|auction\shouse|appliances\scustomer|bicycle\srack|bridge$|building\sequipment\shire|container\sservice|distribution\sservice|diaper\sservice|divorce\sservice|drinking\swater\sfountain|energy\sequipment|environment\srenewable|forestry\sservice|fur\sservice|garbage\scollection|garden$|handicraft|hiking\sguide|homekill|judicial\sauction|key\sduplication|land\sallotment|line\smark|livery\scompany|lodge$|lodging$|lyceum|mailbox\srental|marquee\shire|memorial$|mercantile|mineral\swater\scompany|mold\smaker|office\srefurbish|oil\sand\sgas\sexploration|orchid\sgrower|package\slocker|pedestrian\szone|road\ssafety\stown|sacem|sailmaker|seating\ssystems|security\s(guard|service)|shoe\sshining|societe|staple\sfood|tenant\sownership|ticket\soffice|weir|wi-?fi\sspot)', 'Other'),
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_sector_for_item(name):
    """
    Map an item name to its Level 1 sector.

    The name is lower-cased and tested against each ``(pattern, sector)``
    pair in ``SECTOR_PATTERNS`` in order; the sector of the first pattern
    that matches anywhere in the name is returned.

    Returns the sector slug, or 'Other' when no pattern matches.
    """
    lowered = name.lower()

    # Generator keeps the scan lazy, so evaluation stops at the first hit.
    hits = (
        slug
        for regex, slug in SECTOR_PATTERNS
        if re.search(regex, lowered, re.IGNORECASE)
    )
    return next(hits, 'Other')
|
||||||
|
|
||||||
|
|
||||||
|
# ==================== LEVEL 2: BUSINESS TYPE PATTERNS ====================
|
||||||
|
# These are more specific patterns within each sector
|
||||||
|
|
||||||
|
# Maps a sector slug to an ordered list of (regex, business_type) rules.
# Rules are evaluated top to bottom and the first regex that matches wins,
# so more specific rules must be listed before broader ones.
# Keywords that must not match as a prefix of a longer word end in \b
# (not \s) so they still match when they are the last word of a name.
BUSINESS_TYPE_PATTERNS = {
    'Entertainment': [
        (r'(fitness|gym|workout|crossfit|pilates|yoga|aerobic|exercise|weight\s(room|training)|spin\sclass|bootcamp)', 'Fitness'),
        (r'(sports\b|athletic|stadium|arena|field\b|court\b|track\b|league|team\b)', 'Sports'),
        (r'(museum|exhibit|gallery|art\s(center|gallery)|sculpture)', 'Museums'),
        (r'(theater|theatre|playhouse|opera|ballet|symphony|orchestra|concert|performance|show)', 'Performing Arts'),
        (r'(cinema|movie|film|drive-?in)', 'Movies'),
        # (?!ing) keeps "parking" from being classified as a park.
        (r'(park(?!ing)|playground|recreation|picnic|garden|botanical|arboretum|nature|trail)', 'Parks'),
        (r'(amusement|theme\spark|water\spark|carnival|fair|ride|attraction)', 'Amusement'),
        (r'(arcade|game|escape\sroom|laser|paintball|go.?kart|bowling|billiard|mini\sgolf)', 'Games & Recreation'),
        (r'(casino|gambling|betting|poker|slot)', 'Gambling'),
        (r'(club|nightclub|disco|bar|lounge)', 'Social'),
        (r'(zoo|aquarium|wildlife|safari|sanctuary)', 'Wildlife'),
        (r'(music|concert|jazz|blues|rock|karaoke)', 'Music Venues'),
    ],
    'Healthcare': [
        # Veterinary comes first: "animal hospital" / "veterinary clinic" would
        # otherwise be captured by the generic Hospitals / Clinics rules below.
        (r'(veterinar|\bvet\b|animal\s(hospital|clinic))', 'Veterinary'),
        (r'(hospital|medical\scenter|health\scenter)', 'Hospitals'),
        (r'(clinic|office|practice|urgent\scare)', 'Clinics'),
        (r'(dentist|dental|orthodont|oral\ssurg|periodont|endodont)', 'Dental'),
        (r'(eye|vision|optom|optician|ophthalmolog)', 'Vision Care'),
        (r'(mental|psych|counsel|therapist|psychiatr)', 'Mental Health'),
        (r'(chiropract|acupunct|naturopath|homeopath|osteopath|alternative|holistic)', 'Alternative Medicine'),
        (r'(physical\stherap|occupational|speech|rehab)', 'Rehabilitation'),
        (r'(lab|diagnostic|patholog|radiology|x-?ray|imaging|blood\stest)', 'Diagnostics'),
        (r'(pharmacy|drugstore|prescription)', 'Pharmacies'),
        (r'(senior|aged|elder|nursing\shome|assisted)', 'Senior Care'),
        (r'(emergency|ambulance|paramedic|first\said|urgent)', 'Emergency Services'),
        (r'(doctor|physician|surgeon|specialist|practitioner)', 'Medical Practitioners'),
    ],
    'Food_Dining': [
        (r'(restaurant|eatery|dining|bistro|brasserie|grill|steakhouse)', 'Restaurants'),
        (r'(cafe|café|coffee|espresso|tea\shouse)', 'Cafes & Coffee'),
        (r'(bar\b|pub|tavern|brewery|taproom|lounge|cocktail|wine\sbar)', 'Bars & Nightlife'),
        (r'(bakery|patisserie|pastry|bread|donut|bagel)', 'Bakeries & Desserts'),
        (r'(ice\scream|gelato|dessert|frozen\syogurt|candy|chocolate)', 'Bakeries & Desserts'),
        (r'(fast\sfood|quick\sservice|drive.?thru|takeout|take.?away)', 'Quick Service'),
        (r'(caterer|catering|food\sservice|meal\sprep)', 'Food Services'),
        (r'(winery|distillery|vineyard)', 'Beverage Production'),
    ],
    'Home_Services': [
        (r'(plumb|pipe|drain|sewer|septic)', 'Plumbing'),
        (r'(electric|wiring|panel|outlet)', 'Electrical'),
        (r'(hvac|heat|cool|air\scondition|furnace)', 'HVAC'),
        (r'(roof|gutter|shingle)', 'Roofing'),
        (r'(landscap|lawn|garden|tree|arbor)', 'Landscaping'),
        (r'(clean|maid|janitor|housekeep)', 'Cleaning'),
        (r'(pest|exterminator|termite)', 'Pest Control'),
        (r'(paint|drywall|plaster|wallpaper)', 'Construction'),
        (r'(floor|carpet|tile|hardwood)', 'Flooring'),
        (r'(window|door|glass)', 'Windows & Doors'),
        (r'(pool|spa|hot\stub)', 'Pool & Spa'),
        (r'(security|alarm|lock|safe)', 'Security'),
        (r'(appliance|washer|dryer|refrigerator)', 'Appliance Repair'),
        (r'(handyman|repair|fix|maintenance)', 'General Repair'),
        (r'(construct|build|remodel|renovation|contractor)', 'Construction'),
        (r'(mov(er|ing)|relocat)', 'Moving'),
        (r'(interior|decor|design|stag)', 'Design'),
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_business_type_for_item(name, sector):
    """
    Map an item name to its Level 2 business type within ``sector``.

    Only sectors that have an entry in ``BUSINESS_TYPE_PATTERNS`` are
    refined; for those, the lower-cased name is tested against the sector's
    ``(pattern, business_type)`` rules in order and the first match wins.

    Returns the business type, or 'Other' when the sector has no rules or
    none of its rules match.
    """
    if sector in BUSINESS_TYPE_PATTERNS:
        lowered = name.lower()
        for regex, label in BUSINESS_TYPE_PATTERNS[sector]:
            if re.search(regex, lowered, re.IGNORECASE):
                return label

    # Unknown sector or no rule matched.
    return 'Other'
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Categorize a list of item names and print a report.

    Item names are read one per line from the file named by the first
    command-line argument, or from stdin when no argument is given. Blank
    lines are ignored and surrounding whitespace is stripped. Each item is
    assigned a (sector, business type) pair, then a per-sector summary and a
    detailed breakdown are printed to stdout.
    """
    import sys

    # Read items from stdin or file. The file is decoded as UTF-8 explicitly
    # so non-ASCII names do not depend on the platform's default encoding.
    if len(sys.argv) > 1:
        with open(sys.argv[1], encoding='utf-8') as f:
            items = [line.strip() for line in f if line.strip()]
    else:
        items = [line.strip() for line in sys.stdin if line.strip()]

    # Categorize: sector -> business type -> names, built in a single pass.
    by_sector = {}
    for name in items:
        sector = get_sector_for_item(name)
        btype = get_business_type_for_item(name, sector)
        by_sector.setdefault(sector, {}).setdefault(btype, []).append(name)

    # Print summary
    print(f"Total items: {len(items)}\n")

    # Print sector summary
    print("=" * 60)
    print("SECTOR SUMMARY")
    print("=" * 60)
    for sector in sorted(by_sector.keys()):
        total = sum(len(names) for names in by_sector[sector].values())
        other_count = len(by_sector[sector].get('Other', []))
        print(f"{sector}: {total} items ({other_count} in Other)")

    print("\n" + "=" * 60)
    print("DETAILED BREAKDOWN")
    print("=" * 60)

    for sector in sorted(by_sector.keys()):
        print(f"\n### {sector} ###")
        for btype in sorted(by_sector[sector].keys()):
            names = by_sector[sector][btype]
            ordered = sorted(names)
            print(f" {btype}: {len(names)}")
            if len(names) <= 10:
                for name in ordered:
                    print(f" - {name}")
            else:
                # Large groups: show only the first five names.
                for name in ordered[:5]:
                    print(f" - {name}")
                print(f" ... and {len(names) - 5} more")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the categorization report only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
555
db/recategorize_other.py
Normal file
555
db/recategorize_other.py
Normal file
@@ -0,0 +1,555 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Recategorize items from Other.Uncategorized into appropriate existing categories.
|
||||||
|
|
||||||
|
RULES:
|
||||||
|
1. NEVER create new Level 1 (Sector) categories
|
||||||
|
2. Only create new Level 2 (Business Type) if >10 items would use it
|
||||||
|
3. Only create new Level 3 (Sub-category) if >5 items would use it
|
||||||
|
4. Prefer matching to existing categories at all times
|
||||||
|
5. If uncertain, leave in Other
|
||||||
|
|
||||||
|
EXISTING SECTORS (21 non-Other):
|
||||||
|
- Agriculture: Farming, Services
|
||||||
|
- Automotive: Dealers, Fuel & Charging, Parking, Parts & Accessories, Rental Services, Repair & Maintenance, Training, Vehicle Care
|
||||||
|
- Education: Arts Education, Early Childhood, Higher Education, K-12 Schools, Language Learning, Libraries, Professional Training, Specialty Schools, Sports Training, Technology Training, Tutoring, Vocational Training
|
||||||
|
- Entertainment: Amusement, Arts, Fitness, Gambling, Games & Recreation, Movies, Museums, Music Venues, Parks, Performing Arts, Recreation, Social, Sports, Venues, Wildlife
|
||||||
|
- Events_Weddings: Attire, Florists, Memorial, Planning, Rentals, Services, Venues
|
||||||
|
- Finance_Insurance: Banking, Insurance, Investment, Lending, Money Services
|
||||||
|
- Food_Dining: Bakeries & Desserts, Bars & Nightlife, Beverage Production, Cafes & Coffee, Food Services, Quick Service, Restaurants
|
||||||
|
- Government: International, Legal, Local Government, Postal, Public Safety, Social Services, Transportation
|
||||||
|
- Healthcare: Alternative Medicine, Clinics, Dental, Diagnostics, Emergency Services, Hospitals, Medical Practitioners, Mental Health, Pharmacies, Rehabilitation, Senior Care, Specialty Care, Veterinary, Vision Care
|
||||||
|
- Home_Services: Appliance Repair, Cleaning, Construction, Design, Electrical, Flooring, General Repair, HVAC, Landscaping, Moving, Pest Control, Plumbing, Pool & Spa, Roofing, Security, Windows & Doors
|
||||||
|
- Hospitality_Travel: Attractions, Lodging, Transportation, Travel Services
|
||||||
|
- Industrial: Construction, Manufacturing, Mining
|
||||||
|
- Non_Profit: Charities, Community, General, Professional
|
||||||
|
- Personal_Services: Body Art, Clothing Care, Fitness, Hair Care, Laundry, Massage, Spa & Wellness
|
||||||
|
- Pets_Animals: Animal Welfare, Pet Services
|
||||||
|
- Professional_Services: Agencies, Business Services, Consulting, Creative Services, Design, Engineering, Financial Services, HR Services, Language Services, Legal, Marketing & Advertising
|
||||||
|
- Real_Estate: Agencies, Commercial, Development, Management, Residential, Services, Storage
|
||||||
|
- Religious: Buddhism, Christian, Hinduism, Islam, Judaism, Other
|
||||||
|
- Retail_Shopping: Arts & Crafts, Beauty & Cosmetics, Books & Office, Clothing & Fashion, Electronics, Food & Grocery, Hardware & Building, Health & Pharmacy, Home & Garden, Jewelry & Watches, Markets, Music & Entertainment, Pet Supplies, Secondhand & Vintage, Specialty Retail, Sports & Outdoors, Toys & Hobbies, Wholesale & Distribution
|
||||||
|
- Technology: IT Services, Infrastructure, Software, Telecommunications
|
||||||
|
- Transportation: Delivery, Logistics, Passenger, Public Transit, Vehicle Services
|
||||||
|
"""
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Database connection
|
||||||
|
# Connection URL for the PostgreSQL database this script operates on.
# NOTE(review): the credentials are hard-coded in source; consider loading the
# URL from the environment instead — confirm how this script is deployed.
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||||
|
|
||||||
|
def slugify(text):
    """
    Convert text to slug format.

    Characters other than word characters, whitespace and hyphens are
    dropped, every run of hyphens/whitespace becomes a single underscore,
    and leading/trailing underscores are removed. Case is preserved.
    """
    cleaned = re.sub(r'[^\w\s-]', '', text)
    return re.sub(r'[-\s]+', '_', cleaned).strip('_')
|
||||||
|
|
||||||
|
# ==================== CATEGORIZATION RULES ====================
|
||||||
|
# Format: (keyword_pattern, sector, business_type, sub_category)
|
||||||
|
# Use regex patterns for flexibility
|
||||||
|
|
||||||
|
CATEGORIZATION_RULES = [
|
||||||
|
# ==================== SPORTS & FITNESS (→ Entertainment.Sports or Entertainment.Fitness) ====================
|
||||||
|
# Sports clubs and facilities
|
||||||
|
(r'\b(basketball|baseball|football|soccer|tennis|golf|hockey|rugby|cricket|volleyball|badminton|squash|racquetball)\b.*(club|court|field|ground|stadium|arena|complex)', 'Entertainment', 'Sports', 'Facilities'),
|
||||||
|
(r'\b(swimming|diving|aquatic|pool)\b.*(club|center|pool|facility)', 'Entertainment', 'Sports', 'Aquatic'),
|
||||||
|
(r'\b(gym|fitness|workout|crossfit|aerobic|pilates|yoga|zumba)\b.*(center|studio|club|class)', 'Entertainment', 'Fitness', 'Studios'),
|
||||||
|
(r'\b(martial arts|karate|judo|taekwondo|aikido|boxing|kickboxing|mma|wrestling|fencing)\b.*(club|school|academy|dojo|studio)', 'Entertainment', 'Sports', 'Martial_Arts'),
|
||||||
|
(r'\b(archery|shooting|rifle|gun)\b.*(range|club|center)', 'Entertainment', 'Sports', 'Shooting'),
|
||||||
|
(r'\b(skateboard|skate park|bmx|cycling|bicycle)\b.*(park|venue|club|center)', 'Entertainment', 'Sports', 'Cycling_Skating'),
|
||||||
|
(r'\b(climbing|bouldering|rock climbing)\b.*(gym|wall|center|club)', 'Entertainment', 'Fitness', 'Climbing'),
|
||||||
|
(r'\b(dance|ballet|ballroom|salsa|tango)\b.*(studio|school|class|instructor)', 'Entertainment', 'Performing Arts', 'Dance'),
|
||||||
|
(r'\bsports\b.*(center|complex|facility|club)', 'Entertainment', 'Sports', 'General'),
|
||||||
|
(r'\bathletic\b.*(field|track|club|center)', 'Entertainment', 'Sports', 'Facilities'),
|
||||||
|
(r'\b(rowing|canoeing|kayaking|sailing|boat)\b.*(club|center|school)', 'Entertainment', 'Sports', 'Water_Sports'),
|
||||||
|
(r'\b(equestrian|horse|polo|riding)\b.*(club|center|school|stable|arena)', 'Entertainment', 'Sports', 'Equestrian'),
|
||||||
|
(r'\b(ski|snowboard|ice skating|ice rink)\b.*(resort|center|club|rink)', 'Entertainment', 'Sports', 'Winter_Sports'),
|
||||||
|
|
||||||
|
# Instructors and trainers
|
||||||
|
(r'\b(fitness|personal|sports|athletic)\b.*\b(trainer|instructor|coach)\b', 'Entertainment', 'Fitness', 'Trainers'),
|
||||||
|
(r'\baerobic.*instructor\b', 'Entertainment', 'Fitness', 'Trainers'),
|
||||||
|
|
||||||
|
# ==================== HEALTHCARE (various) ====================
|
||||||
|
# Medical specialists
|
||||||
|
(r'\b(allergist|anesthesiologist|cardiologist|dermatologist|endocrinologist|gastroenterologist|geriatrician|hematologist|immunologist|nephrologist|neurologist|oncologist|ophthalmologist|orthopedist|otolaryngologist|pathologist|pediatrician|physiatrist|podiatrist|proctologist|pulmonologist|radiologist|rheumatologist|urologist)\b', 'Healthcare', 'Medical Practitioners', 'Specialists'),
|
||||||
|
(r'\b(audiologist|speech therapist|occupational therapist|physical therapist)\b', 'Healthcare', 'Rehabilitation', 'Therapists'),
|
||||||
|
(r'\b(psychologist|psychiatrist|counselor|therapist)\b(?!.*massage)', 'Healthcare', 'Mental Health', 'Practitioners'),
|
||||||
|
(r'\b(chiropractor|osteopath|naturopath|homeopath|acupuncturist|herbalist)\b', 'Healthcare', 'Alternative Medicine', 'Practitioners'),
|
||||||
|
(r'\b(optometrist|optician)\b', 'Healthcare', 'Vision Care', 'Practitioners'),
|
||||||
|
(r'\b(medical|health)\b.*(center|clinic|office|practice)', 'Healthcare', 'Clinics', 'General'),
|
||||||
|
(r'\b(aged care|elder care|senior care|nursing home|assisted living|retirement)\b', 'Healthcare', 'Senior Care', 'Facilities'),
|
||||||
|
(r'\b(blood bank|blood donation|plasma)\b', 'Healthcare', 'Diagnostics', 'Blood_Services'),
|
||||||
|
(r'\b(dialysis|kidney)\b.*(center|clinic)', 'Healthcare', 'Specialty Care', 'Dialysis'),
|
||||||
|
(r'\b(fertility|ivf|reproductive)\b.*(clinic|center)', 'Healthcare', 'Specialty Care', 'Fertility'),
|
||||||
|
(r'\b(hospice|palliative)\b', 'Healthcare', 'Senior Care', 'Hospice'),
|
||||||
|
(r'\b(medical lab|laboratory|pathology|diagnostic)\b.*(center|lab)', 'Healthcare', 'Diagnostics', 'Labs'),
|
||||||
|
(r'\b(ambulance|emergency|paramedic|first aid)\b', 'Healthcare', 'Emergency Services', 'EMS'),
|
||||||
|
|
||||||
|
# ==================== AUTOMOTIVE (various) ====================
|
||||||
|
(r'\bauto\b.*(body|paint|dent|collision|restoration|upholster)', 'Automotive', 'Repair & Maintenance', 'Body_Work'),
|
||||||
|
(r'\bauto\b.*(repair|mechanic|service|tune.?up|brake|transmission|radiator)', 'Automotive', 'Repair & Maintenance', 'Mechanical'),
|
||||||
|
(r'\bauto\b.*(auction|broker|dealer)', 'Automotive', 'Dealers', 'Used_Vehicles'),
|
||||||
|
(r'\bauto\b.*(wrecker|salvage|junk|dismantl)', 'Automotive', 'Parts & Accessories', 'Salvage'),
|
||||||
|
(r'\b(car|vehicle|auto)\b.*(wash|detail|clean|wax)', 'Automotive', 'Vehicle Care', 'Cleaning'),
|
||||||
|
(r'\b(car|vehicle|auto)\b.*(rental|hire|lease)', 'Automotive', 'Rental Services', 'Vehicles'),
|
||||||
|
(r'\b(car|vehicle|auto)\b.*(storage|parking)', 'Automotive', 'Parking', 'Storage'),
|
||||||
|
(r'\b(motorcycle|motorbike|scooter|atv|quad)\b.*(dealer|shop|rental|repair)', 'Automotive', 'Dealers', 'Motorcycles'),
|
||||||
|
(r'\b(tire|tyre|wheel)\b.*(shop|store|service|dealer)', 'Automotive', 'Parts & Accessories', 'Tires'),
|
||||||
|
(r'\b(driving|driver)\b.*(school|training|instructor|lesson)', 'Automotive', 'Training', 'Driving_Schools'),
|
||||||
|
(r'\btruck\b.*(stop|dealer|rental|repair)', 'Automotive', 'Dealers', 'Trucks'),
|
||||||
|
(r'\b(rickshaw|auto rickshaw)\b', 'Transportation', 'Passenger', 'Local'),
|
||||||
|
|
||||||
|
# ==================== GOVERNMENT & MILITARY ====================
|
||||||
|
(r'\b(air force|army|navy|military|armed forces)\b.*(base|facility|office|recruitment)', 'Government', 'Public Safety', 'Military'),
|
||||||
|
(r'\b(police|sheriff|law enforcement)\b.*(station|department|office)', 'Government', 'Public Safety', 'Police'),
|
||||||
|
(r'\b(fire|firefighter)\b.*(station|department)', 'Government', 'Public Safety', 'Fire'),
|
||||||
|
(r'\b(court|courthouse|tribunal|judiciary)\b', 'Government', 'Legal', 'Courts'),
|
||||||
|
(r'\b(embassy|consulate|visa)\b.*(office|center)', 'Government', 'International', 'Diplomatic'),
|
||||||
|
(r'\b(city|town|municipal|county|district|borough)\b.*(hall|office|government|administration)', 'Government', 'Local Government', 'Offices'),
|
||||||
|
(r'\b(social services|welfare|unemployment|disability)\b.*(office|center)', 'Government', 'Social Services', 'Welfare'),
|
||||||
|
(r'\b(dmv|driver.*license|vehicle registration|motor vehicle)\b', 'Government', 'Transportation', 'DMV'),
|
||||||
|
(r'\b(passport|immigration|citizenship)\b.*(office|center)', 'Government', 'International', 'Immigration'),
|
||||||
|
(r'\b(aadhaar|agenzia entrate|tax)\b.*(office|center)', 'Government', 'Local Government', 'Tax'),
|
||||||
|
(r'\b(asylum|refugee)\b.*(center|office)', 'Government', 'Social Services', 'Refugee'),
|
||||||
|
|
||||||
|
# ==================== PETS & ANIMALS ====================
|
||||||
|
(r'\b(animal|pet)\b.*(shelter|rescue|adoption|welfare|pound|sanctuary)', 'Pets_Animals', 'Animal Welfare', 'Shelters'),
|
||||||
|
(r'\b(animal|pet)\b.*(hospital|clinic|vet|veterinary)', 'Healthcare', 'Veterinary', 'Clinics'),
|
||||||
|
(r'\b(animal|pet)\b.*(grooming|boarding|kennel|daycare|sitting|walking)', 'Pets_Animals', 'Pet Services', 'Care'),
|
||||||
|
(r'\b(animal|pet)\b.*(training|obedience|behavior)', 'Pets_Animals', 'Pet Services', 'Training'),
|
||||||
|
(r'\b(dog|cat|bird|fish|reptile|aquarium)\b.*(breeder|shop|store)', 'Retail_Shopping', 'Pet Supplies', 'Breeders'),
|
||||||
|
(r'\bzoo\b|aquarium|wildlife.*park|safari', 'Entertainment', 'Wildlife', 'Zoos'),
|
||||||
|
|
||||||
|
# ==================== RELIGIOUS ====================
|
||||||
|
(r'\b(church|chapel|cathedral|basilica|parish)\b', 'Religious', 'Christian', 'Churches'),
|
||||||
|
(r'\b(temple|mandir|hindu)\b', 'Religious', 'Hinduism', 'Temples'),
|
||||||
|
(r'\b(mosque|masjid|islamic)\b', 'Religious', 'Islam', 'Mosques'),
|
||||||
|
(r'\b(synagogue|jewish|judaism)\b', 'Religious', 'Judaism', 'Synagogues'),
|
||||||
|
(r'\b(buddhist|buddha|monastery|zen|meditation center)\b', 'Religious', 'Buddhism', 'Temples'),
|
||||||
|
(r'\b(ashram|spiritual|guru)\b', 'Religious', 'Other', 'Spiritual'),
|
||||||
|
(r'\b(baha.*i|sikh|gurdwara|shinto)\b', 'Religious', 'Other', 'Houses_of_Worship'),
|
||||||
|
|
||||||
|
# ==================== EDUCATION ====================
|
||||||
|
(r'\b(university|college|faculty|academic department)\b', 'Education', 'Higher Education', 'Universities'),
|
||||||
|
(r'\b(preschool|kindergarten|nursery|daycare|child.*care|creche)\b(?!.*animal)', 'Education', 'Early Childhood', 'Preschools'),
|
||||||
|
(r'\b(school|academy)\b(?!.*driving|.*martial|.*dance|.*music|.*art|.*beauty|.*cooking|.*flight)', 'Education', 'K-12 Schools', 'General'),
|
||||||
|
(r'\b(language|esl|english)\b.*(school|class|course|learning)', 'Education', 'Language Learning', 'Schools'),
|
||||||
|
(r'\b(art|drawing|painting)\b.*(school|class|studio)', 'Education', 'Arts Education', 'Visual_Arts'),
|
||||||
|
(r'\b(music|piano|guitar|violin|drum)\b.*(school|lesson|instructor|teacher)', 'Education', 'Arts Education', 'Music'),
|
||||||
|
(r'\b(acting|theater|drama)\b.*(school|class|academy)', 'Education', 'Arts Education', 'Performing'),
|
||||||
|
(r'\b(tutoring|tutor|coaching)\b.*(center|service)', 'Education', 'Tutoring', 'General'),
|
||||||
|
(r'\b(library|public library)\b', 'Education', 'Libraries', 'Public'),
|
||||||
|
(r'\b(archive|historical|museum)\b.*library', 'Education', 'Libraries', 'Special'),
|
||||||
|
(r'\b(vocational|trade|technical)\b.*(school|training|institute)', 'Education', 'Vocational Training', 'General'),
|
||||||
|
(r'\b(apprentice|internship)\b', 'Education', 'Vocational Training', 'Apprenticeships'),
|
||||||
|
(r'\b(flight|aviation|pilot)\b.*(school|training|academy)', 'Education', 'Specialty Schools', 'Aviation'),
|
||||||
|
(r'\b(cooking|culinary|chef)\b.*(school|class|academy)', 'Education', 'Specialty Schools', 'Culinary'),
|
||||||
|
(r'\b(beauty|cosmetology|esthetician)\b.*(school|academy)', 'Education', 'Specialty Schools', 'Beauty'),
|
||||||
|
|
||||||
|
# ==================== HOME SERVICES ====================
|
||||||
|
(r'\b(bathroom|kitchen)\b.*(remodel|renovation|contractor)', 'Home_Services', 'Construction', 'Remodeling'),
|
||||||
|
(r'\b(general|home)\b.*contractor', 'Home_Services', 'Construction', 'General'),
|
||||||
|
(r'\b(painter|painting)\b.*(contractor|service|company)(?!.*auto)', 'Home_Services', 'Construction', 'Painting'),
|
||||||
|
(r'\b(carpenter|carpentry|cabinet|woodwork)\b', 'Home_Services', 'Construction', 'Carpentry'),
|
||||||
|
(r'\b(mason|masonry|brick|concrete|stone)\b.*(contractor|service|company)', 'Home_Services', 'Construction', 'Masonry'),
|
||||||
|
(r'\b(electrician|electrical)\b.*(contractor|service|company)', 'Home_Services', 'Electrical', 'Contractors'),
|
||||||
|
(r'\b(plumber|plumbing)\b.*(contractor|service|company)', 'Home_Services', 'Plumbing', 'Contractors'),
|
||||||
|
(r'\b(hvac|heating|air conditioning|furnace)\b.*(contractor|service|company)', 'Home_Services', 'HVAC', 'Contractors'),
|
||||||
|
(r'\b(roofer|roofing)\b.*(contractor|service|company)', 'Home_Services', 'Roofing', 'Contractors'),
|
||||||
|
(r'\b(landscap|lawn|garden)\b.*(service|company|contractor)(?!.*store|.*center)', 'Home_Services', 'Landscaping', 'Services'),
|
||||||
|
(r'\b(pool|spa)\b.*(service|cleaning|maintenance|contractor)', 'Home_Services', 'Pool & Spa', 'Services'),
|
||||||
|
(r'\b(pest|exterminator|termite)\b.*(control|service)', 'Home_Services', 'Pest Control', 'Services'),
|
||||||
|
(r'\b(cleaning|maid|janitorial|housekeeping)\b.*(service|company)', 'Home_Services', 'Cleaning', 'Services'),
|
||||||
|
(r'\b(window)\b.*(cleaning|wash)', 'Home_Services', 'Cleaning', 'Window'),
|
||||||
|
(r'\b(appliance)\b.*(repair|service)', 'Home_Services', 'Appliance Repair', 'Services'),
|
||||||
|
(r'\b(handyman|odd job|home repair)\b', 'Home_Services', 'General Repair', 'Handyman'),
|
||||||
|
(r'\b(moving|movers|relocation)\b.*(company|service)', 'Home_Services', 'Moving', 'Services'),
|
||||||
|
(r'\b(locksmith)\b', 'Home_Services', 'Security', 'Locksmith'),
|
||||||
|
(r'\b(alarm|security system)\b.*(company|service|installer)', 'Home_Services', 'Security', 'Systems'),
|
||||||
|
(r'\b(arborist|tree)\b.*(service|removal|trimming)', 'Home_Services', 'Landscaping', 'Tree_Service'),
|
||||||
|
(r'\b(fence)\b.*(contractor|company|install)', 'Home_Services', 'Construction', 'Fencing'),
|
||||||
|
(r'\b(garage door)\b.*(service|repair|install)', 'Home_Services', 'General Repair', 'Garage_Doors'),
|
||||||
|
(r'\b(gutter)\b.*(cleaning|service|install)', 'Home_Services', 'Construction', 'Gutters'),
|
||||||
|
(r'\b(insulation)\b.*(contractor|company)', 'Home_Services', 'Construction', 'Insulation'),
|
||||||
|
(r'\b(deck|patio)\b.*(builder|contractor)', 'Home_Services', 'Construction', 'Outdoor'),
|
||||||
|
(r'\b(drywall|sheetrock)\b', 'Home_Services', 'Construction', 'Drywall'),
|
||||||
|
(r'\b(flooring|carpet|tile|hardwood)\b.*(install|contractor|company)(?!.*store)', 'Home_Services', 'Flooring', 'Installation'),
|
||||||
|
(r'\b(window|door)\b.*(install|replacement|contractor)', 'Home_Services', 'Windows & Doors', 'Installation'),
|
||||||
|
(r'\b(glass)\b.*(repair|replacement|company)(?!.*auto)', 'Home_Services', 'Windows & Doors', 'Glass'),
|
||||||
|
(r'\b(chimney)\b.*(sweep|cleaning|repair)', 'Home_Services', 'General Repair', 'Chimney'),
|
||||||
|
(r'\b(septic|sewer)\b.*(service|pumping|cleaning)', 'Home_Services', 'Plumbing', 'Septic'),
|
||||||
|
(r'\b(well)\b.*(drilling|service|pump)', 'Home_Services', 'Plumbing', 'Wells'),
|
||||||
|
(r'\b(solar)\b.*(install|contractor|company)', 'Home_Services', 'Electrical', 'Solar'),
|
||||||
|
|
||||||
|
# ==================== RETAIL & SHOPPING ====================
|
||||||
|
(r'\b(antique|vintage|secondhand|thrift|consignment|pawn)\b.*(shop|store)', 'Retail_Shopping', 'Secondhand & Vintage', 'Stores'),
|
||||||
|
(r'\b(auction)\b.*(house|company)', 'Retail_Shopping', 'Secondhand & Vintage', 'Auctions'),
|
||||||
|
(r'\b(art|craft|hobby)\b.*(supply|store|shop)', 'Retail_Shopping', 'Arts & Crafts', 'Supplies'),
|
||||||
|
(r'\b(toy|game|hobby)\b.*(store|shop)', 'Retail_Shopping', 'Toys & Hobbies', 'Stores'),
|
||||||
|
(r'\b(book|stationery|office supply)\b.*(store|shop)', 'Retail_Shopping', 'Books & Office', 'Stores'),
|
||||||
|
(r'\b(music|instrument|record|vinyl)\b.*(store|shop)', 'Retail_Shopping', 'Music & Entertainment', 'Stores'),
|
||||||
|
(r'\b(sporting|sports|outdoor|camping|fishing|hunting)\b.*(goods|store|shop)', 'Retail_Shopping', 'Sports & Outdoors', 'Stores'),
|
||||||
|
(r'\b(electronics|computer|phone|appliance)\b.*(store|shop|retailer)', 'Retail_Shopping', 'Electronics', 'Stores'),
|
||||||
|
(r'\b(furniture|home decor|bedding|mattress)\b.*(store|shop)', 'Retail_Shopping', 'Home & Garden', 'Stores'),
|
||||||
|
(r'\b(clothing|fashion|apparel|boutique|shoe)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Stores'),
|
||||||
|
(r'\b(jewelry|watch|gem)\b.*(store|shop)', 'Retail_Shopping', 'Jewelry & Watches', 'Stores'),
|
||||||
|
(r'\b(hardware|tool|building supply|lumber)\b.*(store|shop)', 'Retail_Shopping', 'Hardware & Building', 'Stores'),
|
||||||
|
(r'\b(garden|nursery|plant)\b.*(center|store|shop)', 'Retail_Shopping', 'Home & Garden', 'Garden_Centers'),
|
||||||
|
(r'\b(pharmacy|drugstore)\b', 'Retail_Shopping', 'Health & Pharmacy', 'Pharmacies'),
|
||||||
|
(r'\b(cosmetic|beauty|makeup)\b.*(store|shop)', 'Retail_Shopping', 'Beauty & Cosmetics', 'Stores'),
|
||||||
|
(r'\b(grocery|supermarket|food|convenience)\b.*(store|market|shop)', 'Retail_Shopping', 'Food & Grocery', 'Stores'),
|
||||||
|
(r'\b(liquor|wine|beer|alcohol)\b.*(store|shop)', 'Retail_Shopping', 'Food & Grocery', 'Liquor'),
|
||||||
|
(r'\b(tobacco|cigar|vape|smoke)\b.*(shop|store)', 'Retail_Shopping', 'Specialty Retail', 'Tobacco'),
|
||||||
|
(r'\b(mobile phone|cell phone)\b.*(store|shop|dealer)', 'Retail_Shopping', 'Electronics', 'Phones'),
|
||||||
|
(r'\b(optical|eyewear|glasses|sunglass)\b.*(store|shop)', 'Retail_Shopping', 'Health & Pharmacy', 'Optical'),
|
||||||
|
(r'\b(florist|flower)\b.*(shop|store)', 'Events_Weddings', 'Florists', 'Shops'),
|
||||||
|
(r'\b(bridal|wedding)\b.*(shop|store|boutique)', 'Events_Weddings', 'Attire', 'Bridal'),
|
||||||
|
(r'\b(uniform|workwear)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Specialty'),
|
||||||
|
|
||||||
|
# ==================== PROFESSIONAL SERVICES ====================
|
||||||
|
(r'\b(lawyer|attorney|law firm|legal)\b.*(office|firm|service)', 'Professional_Services', 'Legal', 'Firms'),
|
||||||
|
(r'\b(accountant|accounting|bookkeep|tax)\b.*(firm|service|office)(?!.*government)', 'Professional_Services', 'Financial Services', 'Accounting'),
|
||||||
|
(r'\b(architect|architecture)\b.*(firm|office|studio)', 'Professional_Services', 'Engineering', 'Architecture'),
|
||||||
|
(r'\b(engineer|engineering)\b.*(firm|office|company)', 'Professional_Services', 'Engineering', 'Firms'),
|
||||||
|
(r'\b(surveyor|surveying|land survey)\b', 'Professional_Services', 'Engineering', 'Surveying'),
|
||||||
|
(r'\b(consultant|consulting)\b.*(firm|company|service)', 'Professional_Services', 'Consulting', 'General'),
|
||||||
|
(r'\b(marketing|advertising|pr|public relations)\b.*(agency|firm|company)', 'Professional_Services', 'Marketing & Advertising', 'Agencies'),
|
||||||
|
(r'\b(graphic|web|design)\b.*(studio|agency|firm)', 'Professional_Services', 'Creative Services', 'Design'),
|
||||||
|
(r'\b(photography|photographer|video|videograph)\b.*(studio|service)', 'Professional_Services', 'Creative Services', 'Photography'),
|
||||||
|
(r'\b(translation|interpreter|language)\b.*service', 'Professional_Services', 'Language Services', 'Translation'),
|
||||||
|
(r'\b(staffing|recruiting|employment|hr)\b.*(agency|service|firm)', 'Professional_Services', 'HR Services', 'Agencies'),
|
||||||
|
(r'\b(notary|notarial)\b', 'Professional_Services', 'Legal', 'Notary'),
|
||||||
|
(r'\b(private investigator|detective)\b', 'Professional_Services', 'Agencies', 'Investigation'),
|
||||||
|
(r'\b(appraiser|appraisal|valuation)\b', 'Professional_Services', 'Financial Services', 'Appraisal'),
|
||||||
|
(r'\b(auditor|audit)\b.*(firm|service)', 'Professional_Services', 'Financial Services', 'Audit'),
|
||||||
|
(r'\b(courier|messenger|delivery)\b.*service', 'Transportation', 'Delivery', 'Courier'),
|
||||||
|
|
||||||
|
# ==================== ARTS & CULTURE ====================
|
||||||
|
(r'\b(art|gallery|exhibition)\b(?!.*supply|.*store|.*school)', 'Entertainment', 'Arts', 'Galleries'),
|
||||||
|
(r'\b(museum)\b', 'Entertainment', 'Museums', 'General'),
|
||||||
|
(r'\b(theater|theatre|playhouse|opera house)\b', 'Entertainment', 'Performing Arts', 'Venues'),
|
||||||
|
(r'\b(cinema|movie theater|multiplex)\b', 'Entertainment', 'Movies', 'Theaters'),
|
||||||
|
(r'\b(concert|music)\b.*(hall|venue)', 'Entertainment', 'Music Venues', 'Concert_Halls'),
|
||||||
|
(r'\b(band|orchestra|choir|ensemble)\b', 'Entertainment', 'Performing Arts', 'Groups'),
|
||||||
|
(r'\b(comedian|comedy club)\b', 'Entertainment', 'Performing Arts', 'Comedy'),
|
||||||
|
(r'\b(artist|sculptor|painter)\b(?!.*makeup)', 'Entertainment', 'Arts', 'Artists'),
|
||||||
|
(r'\b(animation|animator)\b.*(studio|company)', 'Professional_Services', 'Creative Services', 'Animation'),
|
||||||
|
(r'\b(recording|music)\b.*studio', 'Professional_Services', 'Creative Services', 'Recording'),
|
||||||
|
(r'\b(art restoration|restoration service)\b', 'Professional_Services', 'Creative Services', 'Restoration'),
|
||||||
|
|
||||||
|
# ==================== ENTERTAINMENT & RECREATION ====================
|
||||||
|
(r'\b(amusement|theme)\b.*park', 'Entertainment', 'Amusement', 'Parks'),
|
||||||
|
(r'\b(arcade|game center|gaming)\b', 'Entertainment', 'Games & Recreation', 'Arcades'),
|
||||||
|
(r'\b(escape room|puzzle room)\b', 'Entertainment', 'Games & Recreation', 'Escape_Rooms'),
|
||||||
|
(r'\b(bowling)\b.*(alley|center)', 'Entertainment', 'Games & Recreation', 'Bowling'),
|
||||||
|
(r'\b(billiard|pool hall|snooker)\b', 'Entertainment', 'Games & Recreation', 'Billiards'),
|
||||||
|
(r'\b(karaoke)\b', 'Entertainment', 'Music Venues', 'Karaoke'),
|
||||||
|
(r'\b(casino|gambling|betting)\b', 'Entertainment', 'Gambling', 'Casinos'),
|
||||||
|
(r'\b(nightclub|disco|club)\b(?!.*golf|.*country|.*tennis)', 'Food_Dining', 'Bars & Nightlife', 'Nightclubs'),
|
||||||
|
(r'\b(country club|private club|social club)\b', 'Entertainment', 'Social', 'Clubs'),
|
||||||
|
(r'\b(botanical garden|arboretum)\b', 'Entertainment', 'Parks', 'Gardens'),
|
||||||
|
(r'\b(park|playground|recreation area)\b(?!.*theme|.*water|.*trailer|.*mobile)', 'Entertainment', 'Parks', 'Public'),
|
||||||
|
(r'\b(beach|waterfront|marina)\b(?!.*hotel)', 'Entertainment', 'Parks', 'Beaches'),
|
||||||
|
(r'\b(campground|camping|rv park|caravan)\b', 'Hospitality_Travel', 'Lodging', 'Camping'),
|
||||||
|
(r'\b(go.?kart|kart|karting)\b', 'Entertainment', 'Games & Recreation', 'Karting'),
|
||||||
|
(r'\b(laser tag|paintball)\b', 'Entertainment', 'Games & Recreation', 'Adventure'),
|
||||||
|
(r'\b(trampoline|bounce|jump)\b.*(park|center)', 'Entertainment', 'Games & Recreation', 'Trampoline'),
|
||||||
|
(r'\b(mini golf|miniature golf|putt.?putt)\b', 'Entertainment', 'Games & Recreation', 'Mini_Golf'),
|
||||||
|
(r'\b(water park|aqua park)\b', 'Entertainment', 'Amusement', 'Water_Parks'),
|
||||||
|
(r'\b(haunted|horror)\b.*(house|attraction)', 'Entertainment', 'Amusement', 'Attractions'),
|
||||||
|
(r'\b(circus|carnival|fair)\b', 'Entertainment', 'Amusement', 'Shows'),
|
||||||
|
(r'\b(planetarium|observatory)\b', 'Entertainment', 'Museums', 'Science'),
|
||||||
|
|
||||||
|
# ==================== FOOD & DINING ====================
|
||||||
|
(r'\b(bar|pub|tavern|lounge|brewery|taproom|brewpub)\b(?!.*brow|.*eyebrow)', 'Food_Dining', 'Bars & Nightlife', 'Bars'),
|
||||||
|
(r'\b(cafe|coffee|espresso)\b.*(shop|house|bar)', 'Food_Dining', 'Cafes & Coffee', 'Cafes'),
|
||||||
|
(r'\b(restaurant|eatery|diner|bistro|brasserie|grill)\b', 'Food_Dining', 'Restaurants', 'General'),
|
||||||
|
(r'\b(bakery|patisserie|pastry)\b', 'Food_Dining', 'Bakeries & Desserts', 'Bakeries'),
|
||||||
|
(r'\b(ice cream|gelato|frozen yogurt|dessert)\b.*(shop|parlor|store)', 'Food_Dining', 'Bakeries & Desserts', 'Desserts'),
|
||||||
|
(r'\b(caterer|catering)\b', 'Food_Dining', 'Food Services', 'Catering'),
|
||||||
|
(r'\b(food truck|food cart)\b', 'Food_Dining', 'Quick Service', 'Mobile'),
|
||||||
|
(r'\b(juice|smoothie)\b.*(bar|shop)', 'Food_Dining', 'Cafes & Coffee', 'Juice'),
|
||||||
|
(r'\b(tea|bubble tea|boba)\b.*(shop|house|room)', 'Food_Dining', 'Cafes & Coffee', 'Tea'),
|
||||||
|
(r'\b(winery|vineyard|wine)\b.*(tasting|cellar)', 'Food_Dining', 'Beverage Production', 'Wineries'),
|
||||||
|
(r'\b(distillery|spirit)\b', 'Food_Dining', 'Beverage Production', 'Distilleries'),
|
||||||
|
(r'\b(butcher|meat)\b.*shop', 'Retail_Shopping', 'Food & Grocery', 'Butchers'),
|
||||||
|
(r'\b(fish|seafood)\b.*market', 'Retail_Shopping', 'Food & Grocery', 'Seafood'),
|
||||||
|
(r'\b(deli|delicatessen)\b', 'Retail_Shopping', 'Food & Grocery', 'Delis'),
|
||||||
|
(r'\b(candy|chocolate|sweet|confection)\b.*(shop|store)', 'Retail_Shopping', 'Food & Grocery', 'Confectionery'),
|
||||||
|
|
||||||
|
# ==================== PERSONAL SERVICES ====================
|
||||||
|
(r'\b(barber|hair)\b.*(shop|salon|stylist)', 'Personal_Services', 'Hair Care', 'Salons'),
|
||||||
|
(r'\b(beauty|nail|manicure|pedicure)\b.*(salon|spa|studio)', 'Personal_Services', 'Spa & Wellness', 'Beauty'),
|
||||||
|
(r'\b(tattoo|piercing|body art)\b.*(shop|studio|parlor)', 'Personal_Services', 'Body Art', 'Studios'),
|
||||||
|
(r'\b(massage)\b.*(therapist|spa|parlor|studio)', 'Personal_Services', 'Massage', 'Studios'),
|
||||||
|
(r'\b(spa|wellness|day spa)\b', 'Personal_Services', 'Spa & Wellness', 'Spas'),
|
||||||
|
(r'\b(tanning|sunbed)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Tanning'),
|
||||||
|
(r'\b(laundry|laundromat|dry clean|tailor|alteration|seamstress)\b', 'Personal_Services', 'Laundry', 'Services'),
|
||||||
|
(r'\b(shoe repair|cobbler)\b', 'Personal_Services', 'Clothing Care', 'Shoe_Repair'),
|
||||||
|
(r'\b(brow|eyebrow|lash|eyelash)\b.*(bar|salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Brows_Lashes'),
|
||||||
|
(r'\b(makeup artist|stylist)\b', 'Personal_Services', 'Spa & Wellness', 'Makeup'),
|
||||||
|
(r'\b(sauna|steam room|bathhouse|hammam)\b', 'Personal_Services', 'Spa & Wellness', 'Baths'),
|
||||||
|
(r'\b(waxing)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Waxing'),
|
||||||
|
|
||||||
|
# ==================== HOSPITALITY & TRAVEL ====================
|
||||||
|
(r'\b(hotel|motel|inn|resort|hostel|lodge|bed and breakfast|b&b|guesthouse)\b', 'Hospitality_Travel', 'Lodging', 'Hotels'),
|
||||||
|
(r'\b(travel|tour)\b.*(agency|operator|company)', 'Hospitality_Travel', 'Travel Services', 'Agencies'),
|
||||||
|
(r'\b(airline|airport|aviation)\b(?!.*school)', 'Transportation', 'Passenger', 'Air'),
|
||||||
|
(r'\b(cruise|ferry)\b.*(line|terminal|port)', 'Transportation', 'Passenger', 'Water'),
|
||||||
|
(r'\b(train|rail)\b.*(station|service)', 'Transportation', 'Passenger', 'Rail'),
|
||||||
|
(r'\b(bus|coach)\b.*(station|terminal|service|company)', 'Transportation', 'Passenger', 'Bus'),
|
||||||
|
(r'\b(taxi|cab|ride|uber|lyft|limo|limousine|chauffeur)\b.*(service|company|stand)', 'Transportation', 'Passenger', 'Taxi'),
|
||||||
|
(r'\b(tourist|visitor)\b.*(information|center|bureau)', 'Hospitality_Travel', 'Travel Services', 'Information'),
|
||||||
|
(r'\b(rental)\b.*\b(cabin|cottage|vacation|holiday)\b', 'Hospitality_Travel', 'Lodging', 'Rentals'),
|
||||||
|
|
||||||
|
# ==================== INDUSTRIAL & MANUFACTURING ====================
|
||||||
|
(r'\b(factory|plant|mill|manufacturing)\b', 'Industrial', 'Manufacturing', 'General'),
|
||||||
|
(r'\b(warehouse|distribution|logistics)\b.*(center|facility)', 'Transportation', 'Logistics', 'Warehouses'),
|
||||||
|
(r'\b(machine|machinist|metalwork|welding|welder)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Metal'),
|
||||||
|
(r'\b(print|printing|press)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Printing'),
|
||||||
|
(r'\b(textile|fabric|garment)\b.*(factory|mill|manufacturer)', 'Industrial', 'Manufacturing', 'Textile'),
|
||||||
|
(r'\b(chemical|pharmaceutical)\b.*(company|manufacturer|plant)', 'Industrial', 'Manufacturing', 'Chemical'),
|
||||||
|
(r'\b(construction|building)\b.*(company|contractor|firm)', 'Industrial', 'Construction', 'General'),
|
||||||
|
(r'\b(quarry|gravel|sand|aggregate)\b', 'Industrial', 'Mining', 'Quarries'),
|
||||||
|
(r'\b(sawmill|lumber)\b.*(mill|yard)', 'Industrial', 'Manufacturing', 'Wood'),
|
||||||
|
(r'\b(steel|iron|aluminum)\b.*(plant|manufacturer|company)', 'Industrial', 'Manufacturing', 'Metal'),
|
||||||
|
(r'\b(packaging|container)\b.*(company|manufacturer)', 'Industrial', 'Manufacturing', 'Packaging'),
|
||||||
|
(r'\b(recycling|waste)\b.*(center|facility|company)', 'Industrial', 'Manufacturing', 'Recycling'),
|
||||||
|
|
||||||
|
# ==================== REAL ESTATE ====================
|
||||||
|
(r'\b(real estate|realtor|property)\b.*(agent|agency|company)', 'Real_Estate', 'Agencies', 'Agents'),
|
||||||
|
(r'\b(property management|apartment|rental)\b.*(company|agency)', 'Real_Estate', 'Management', 'Residential'),
|
||||||
|
(r'\b(storage|self storage|mini storage)\b.*(facility|unit)', 'Real_Estate', 'Storage', 'Self_Storage'),
|
||||||
|
(r'\b(office|commercial)\b.*(space|building|complex)', 'Real_Estate', 'Commercial', 'Office'),
|
||||||
|
(r'\b(apartment|condo|housing)\b.*(complex|building|community)', 'Real_Estate', 'Residential', 'Apartments'),
|
||||||
|
(r'\b(home builder|housing development)\b', 'Real_Estate', 'Development', 'Residential'),
|
||||||
|
|
||||||
|
# ==================== NON-PROFIT & COMMUNITY ====================
|
||||||
|
(r'\b(charity|charitable|foundation|fund)\b(?!.*investment)', 'Non_Profit', 'Charities', 'General'),
|
||||||
|
(r'\b(non.?profit|ngo|association)\b', 'Non_Profit', 'General', 'Organizations'),
|
||||||
|
(r'\b(community|civic|neighborhood)\b.*(center|organization|association)', 'Non_Profit', 'Community', 'Centers'),
|
||||||
|
(r'\b(youth|boys|girls|scout)\b.*(club|organization|center)', 'Non_Profit', 'Community', 'Youth'),
|
||||||
|
(r'\b(senior|elder)\b.*(center|club)(?!.*care)', 'Non_Profit', 'Community', 'Seniors'),
|
||||||
|
(r'\b(veterans|vfw|american legion)\b', 'Non_Profit', 'Community', 'Veterans'),
|
||||||
|
(r'\b(rotary|lions|kiwanis|elks|freemason|lodge)\b', 'Non_Profit', 'Community', 'Fraternal'),
|
||||||
|
(r'\b(union|labor)\b.*(hall|organization)', 'Non_Profit', 'Professional', 'Unions'),
|
||||||
|
(r'\b(chamber of commerce|business association)\b', 'Non_Profit', 'Professional', 'Business'),
|
||||||
|
(r'\b(aboriginal|indigenous|tribal)\b.*(organization|center)', 'Non_Profit', 'Community', 'Indigenous'),
|
||||||
|
|
||||||
|
# ==================== TECHNOLOGY ====================
|
||||||
|
(r'\b(software|app|web)\b.*(developer|development|company)', 'Technology', 'Software', 'Development'),
|
||||||
|
(r'\b(it|computer|tech)\b.*(service|support|repair)', 'Technology', 'IT Services', 'Support'),
|
||||||
|
(r'\b(data center|server|cloud)\b', 'Technology', 'Infrastructure', 'Data_Centers'),
|
||||||
|
(r'\b(internet|isp|broadband|telecom)\b.*(provider|service|company)', 'Technology', 'Telecommunications', 'Providers'),
|
||||||
|
(r'\b(bpo|call center|outsourc)\b', 'Technology', 'IT Services', 'BPO'),
|
||||||
|
(r'\b(automation|robot)\b.*(company|service)', 'Technology', 'Software', 'Automation'),
|
||||||
|
|
||||||
|
# ==================== FINANCE & INSURANCE ====================
|
||||||
|
(r'\b(bank|credit union|savings)\b(?!.*blood|.*food)', 'Finance_Insurance', 'Banking', 'Banks'),
|
||||||
|
(r'\b(atm|cash machine)\b', 'Finance_Insurance', 'Banking', 'ATMs'),
|
||||||
|
(r'\b(insurance)\b.*(agent|agency|company|broker)', 'Finance_Insurance', 'Insurance', 'Agents'),
|
||||||
|
(r'\b(mortgage|loan|lending|finance)\b.*(company|broker|service)', 'Finance_Insurance', 'Lending', 'Lenders'),
|
||||||
|
(r'\b(investment|wealth|portfolio|financial advisor)\b', 'Finance_Insurance', 'Investment', 'Advisors'),
|
||||||
|
(r'\b(money transfer|remittance|western union|moneygram)\b', 'Finance_Insurance', 'Money Services', 'Transfer'),
|
||||||
|
(r'\b(currency exchange|forex)\b', 'Finance_Insurance', 'Money Services', 'Exchange'),
|
||||||
|
(r'\b(bail bond)\b', 'Professional_Services', 'Legal', 'Bail'),
|
||||||
|
|
||||||
|
# ==================== EVENTS & WEDDINGS ====================
|
||||||
|
(r'\b(funeral|mortuary|cremation|cemetery|memorial)\b', 'Events_Weddings', 'Memorial', 'Funeral'),
|
||||||
|
(r'\b(event|party|wedding)\b.*(planner|planning|coordinator)', 'Events_Weddings', 'Planning', 'Planners'),
|
||||||
|
(r'\b(banquet|event|reception|wedding)\b.*(hall|venue|center)', 'Events_Weddings', 'Venues', 'Halls'),
|
||||||
|
(r'\b(dj|disc jockey|entertainment)\b.*service', 'Events_Weddings', 'Services', 'Entertainment'),
|
||||||
|
(r'\b(balloon|party supply|decoration)\b', 'Events_Weddings', 'Services', 'Decorations'),
|
||||||
|
(r'\b(tent|equipment)\b.*rental(?!.*car|.*truck)', 'Events_Weddings', 'Rentals', 'Equipment'),
|
||||||
|
(r'\b(photo booth|photobooth)\b', 'Events_Weddings', 'Services', 'Photography'),
|
||||||
|
|
||||||
|
# ==================== AGRICULTURE ====================
|
||||||
|
(r'\b(farm|ranch|orchard|vineyard)\b(?!.*winery)', 'Agriculture', 'Farming', 'Farms'),
|
||||||
|
(r'\b(agriculture|farming|crop)\b.*(service|supply|equipment)', 'Agriculture', 'Services', 'Supplies'),
|
||||||
|
(r'\b(livestock|cattle|poultry|dairy)\b', 'Agriculture', 'Farming', 'Livestock'),
|
||||||
|
(r'\b(nursery|greenhouse|horticulture)\b.*(wholesale|grower)', 'Agriculture', 'Farming', 'Horticulture'),
|
||||||
|
(r'\b(agistment|horse boarding|stable)\b', 'Agriculture', 'Services', 'Equine'),
|
||||||
|
(r'\b(veterinarian|vet)\b.*(livestock|farm|large animal)', 'Agriculture', 'Services', 'Veterinary'),
|
||||||
|
|
||||||
|
# ==================== TRANSPORTATION ====================
|
||||||
|
(r'\b(shipping|freight|cargo|trucking)\b.*(company|service)', 'Transportation', 'Logistics', 'Shipping'),
|
||||||
|
(r'\b(courier|messenger|express)\b.*(service|delivery)', 'Transportation', 'Delivery', 'Courier'),
|
||||||
|
(r'\b(airport|airfield|airstrip|heliport)\b', 'Transportation', 'Passenger', 'Airports'),
|
||||||
|
(r'\b(port|harbor|dock|pier|marina)\b(?!.*wine)', 'Transportation', 'Logistics', 'Ports'),
|
||||||
|
(r'\b(parking)\b.*(lot|garage|structure)', 'Automotive', 'Parking', 'Lots'),
|
||||||
|
(r'\b(towing|tow truck)\b', 'Transportation', 'Vehicle Services', 'Towing'),
|
||||||
|
]
|
||||||
|
|
||||||
|
def categorize_item(name):
    """
    Categorize a single item based on CATEGORIZATION_RULES.

    Rules are tried in list order and the first pattern that matches wins,
    so earlier (more specific) rules take precedence over later ones.

    Args:
        name: Display name of the category item. May be None (e.g. a NULL
            value read from the database), in which case nothing matches.

    Returns:
        (sector, business_type, sub_category) for the first matching rule,
        or None if no rule matches.
    """
    # A missing name cannot match any rule; previously this raised
    # AttributeError on name.lower().
    if name is None:
        return None

    name_lower = name.lower()

    for pattern, sector, btype, subcat in CATEGORIZATION_RULES:
        # The name is already lowercased; IGNORECASE is kept so patterns
        # containing uppercase characters keep matching as before.
        if re.search(pattern, name_lower, re.IGNORECASE):
            return (sector, btype, subcat)

    return None
|
||||||
|
|
||||||
|
def get_existing_paths(cursor):
    """Return the set of every category path currently stored in gbp_categories.

    The path column is cast to text in SQL, so the set contains plain strings.
    """
    cursor.execute("SELECT path::text FROM gbp_categories")
    rows = cursor.fetchall()
    return set(record[0] for record in rows)
|
||||||
|
|
||||||
|
def _ensure_category(cursor, name, slug, path, parent_path, level, label, existing_paths):
    """Make sure the category row at `path` exists, creating it under `parent_path` if needed.

    Returns True when the row is known to exist after the call, False when it
    could not be created and is not present in the table.
    """
    if path in existing_paths:
        return True

    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))

    if cursor.fetchone():
        existing_paths.add(path)
        print(f" [NEW] Created {label}: {path}")
        return True

    # No row came back: either the path already exists (ON CONFLICT DO NOTHING)
    # or the parent row was not found, so nothing was selected for insertion.
    # Tell the two apart so callers never get a path that has no row behind it.
    cursor.execute("SELECT 1 FROM gbp_categories WHERE path = %s::ltree", (path,))
    if cursor.fetchone():
        existing_paths.add(path)
        return True

    print(f" [SKIP] Could not create {label}: {path}")
    return False


def get_or_create_path(cursor, sector, btype, subcat, existing_paths):
    """
    Get or create the full path for a category.

    Sectors (level 1) are never created here; business types (level 2) and
    sub-categories (level 3) are inserted on demand.

    Args:
        cursor: Database cursor used for the INSERT/SELECT statements.
        sector: Sector name (level 1).
        btype: Business type name (level 2).
        subcat: Sub-category name (level 3).
        existing_paths: Set of known paths; updated in place as rows are
            created or discovered.

    Returns:
        The parent path (level 3) for the item, or None when the sector does
        not exist or a required level could not be created.
    """
    sector_slug = slugify(sector)
    btype_slug = slugify(btype)
    subcat_slug = slugify(subcat)

    # Level 1: Sector
    sector_path = sector_slug
    if sector_path not in existing_paths:
        # Don't create new sectors - return None
        print(f" [SKIP] Would need new sector: {sector_path}")
        return None

    # Level 2: Business Type
    btype_path = f"{sector_path}.{btype_slug}"
    if not _ensure_category(cursor, btype, btype_slug, btype_path, sector_path,
                            2, "business type", existing_paths):
        return None

    # Level 3: Sub-category
    subcat_path = f"{btype_path}.{subcat_slug}"
    if not _ensure_category(cursor, subcat, subcat_slug, subcat_path, btype_path,
                            3, "sub-category", existing_paths):
        return None

    return subcat_path
|
||||||
|
|
||||||
|
def main():
    """Interactively move items out of Other.Uncategorized into rule-based categories.

    Flow: load the level-4 items under Other.Uncategorized, classify each name
    with categorize_item(), print a summary, ask for confirmation, then
    re-parent the matched items and refresh category_count values in a single
    transaction.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cursor = conn.cursor()

        # Get all items in Other.Uncategorized
        cursor.execute("""
            SELECT id, name, slug
            FROM gbp_categories
            WHERE path ~ 'Other.Uncategorized.*' AND level = 4
            ORDER BY name
        """)
        other_items = cursor.fetchall()
        print(f"Found {len(other_items)} items in Other.Uncategorized\n")

        # Get existing paths
        existing_paths = get_existing_paths(cursor)

        # Categorize items
        categorized = []
        uncategorized = []
        category_counts = defaultdict(int)

        for item_id, name, slug in other_items:
            result = categorize_item(name)
            if result:
                sector, btype, subcat = result
                categorized.append((item_id, name, slug, sector, btype, subcat))
                category_counts[(sector, btype, subcat)] += 1
            else:
                uncategorized.append((item_id, name))

        print(f"Categorized: {len(categorized)}")
        print(f"Still uncategorized: {len(uncategorized)}")
        print()

        # Show category distribution (30 most common targets)
        print("Category distribution:")
        for (sector, btype, subcat), count in sorted(category_counts.items(), key=lambda x: -x[1])[:30]:
            print(f" {sector}.{btype}.{subcat}: {count}")
        print()

        # Show some uncategorized items
        print("Sample uncategorized items (first 50):")
        for item_id, name in uncategorized[:50]:
            print(f" - {name}")
        print()

        # Ask for confirmation; surrounding whitespace in the answer is ignored
        response = input("Proceed with database updates? (yes/no): ")
        if response.strip().lower() != 'yes':
            print("Aborted.")
            return

        # Update database
        updated = 0
        for item_id, name, slug, sector, btype, subcat in categorized:
            parent_path = get_or_create_path(cursor, sector, btype, subcat, existing_paths)
            if parent_path:
                new_path = f"{parent_path}.{slug}"
                # Joining on the parent row means the item is only moved when
                # the parent actually exists; a scalar subquery would silently
                # set parent_id to NULL for a missing parent.
                cursor.execute("""
                    UPDATE gbp_categories AS g
                    SET path = %s::ltree,
                        parent_id = p.id
                    FROM gbp_categories AS p
                    WHERE p.path = %s::ltree
                      AND g.id = %s
                """, (new_path, parent_path, item_id))
                # Count rows really changed, not statements issued.
                updated += cursor.rowcount

        # Update category counts
        # NOTE(review): only parents that currently have children are touched,
        # so a parent that just lost all of its children keeps its old
        # category_count (the COALESCE never sees a NULL with this join).
        # Confirm the intended meaning of category_count before changing this.
        cursor.execute("""
            WITH counts AS (
                SELECT
                    parent_id,
                    COUNT(*) as cnt
                FROM gbp_categories
                WHERE parent_id IS NOT NULL
                GROUP BY parent_id
            )
            UPDATE gbp_categories g
            SET category_count = COALESCE(c.cnt, 0)
            FROM counts c
            WHERE g.id = c.parent_id
        """)

        conn.commit()
        print(f"\nUpdated {updated} items")

        # Show final stats
        cursor.execute("""
            SELECT path, name, category_count
            FROM gbp_categories
            WHERE level = 1
            ORDER BY category_count DESC
        """)
        print("\nFinal sector counts:")
        for path, name, count in cursor.fetchall():
            print(f" {name}: {count}")
    finally:
        # Always release the connection. Closing with an uncommitted
        # transaction discards the pending changes, so a failure part-way
        # through leaves the table untouched.
        conn.close()


if __name__ == '__main__':
    main()
|
||||||
25344
db_backup_20260201_1712.sql
Normal file
25344
db_backup_20260201_1712.sql
Normal file
File diff suppressed because one or more lines are too long
25
docker-compose.nuc.yml
Normal file
25
docker-compose.nuc.yml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# docker-compose.nuc.yml - override for running against the NUC database.
#
# Layer this file on top of docker-compose.production.yml so the API connects
# to the PostgreSQL instance hosted on the NUC instead of a local container:
#
#   cp .env.nuc .env
#   docker compose -f docker-compose.production.yml -f docker-compose.nuc.yml up -d

version: '3.8'

services:
  # The local database is assigned to a profile that the usage above never
  # activates, so it is not started.
  db:
    profiles:
      - disabled

  # API server pointed at the NUC-hosted database.
  # NOTE(review): the credentials below duplicate DB_PASSWORD from .env.nuc;
  # consider interpolating the variable instead of repeating the secret here.
  api:
    environment:
      - DATABASE_URL=postgresql://scraper:scraper_nuc_2026@192.168.1.3:5437/scraper
      - REVIEWIQ_DATABASE_URL=postgresql://scraper:scraper_nuc_2026@192.168.1.3:5437/scraper
    # Meant to drop the dependency on the local db service.
    # NOTE(review): confirm an empty list really clears depends_on when
    # Compose merges override files.
    depends_on: []
    extra_hosts:
      - "host.docker.internal:host-gateway"
|
||||||
@@ -10,9 +10,10 @@ services:
|
|||||||
POSTGRES_USER: scraper
|
POSTGRES_USER: scraper
|
||||||
POSTGRES_PASSWORD: ${DB_PASSWORD:-scraper123}
|
POSTGRES_PASSWORD: ${DB_PASSWORD:-scraper123}
|
||||||
ports:
|
ports:
|
||||||
- "5435:5432"
|
- "5437:5432"
|
||||||
volumes:
|
volumes:
|
||||||
- postgres_data:/var/lib/postgresql/data
|
- postgres_data:/var/lib/postgresql/data
|
||||||
|
- ./db/init:/docker-entrypoint-initdb.d:ro
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "pg_isready -U scraper"]
|
test: ["CMD-SHELL", "pg_isready -U scraper"]
|
||||||
interval: 10s
|
interval: 10s
|
||||||
@@ -29,8 +30,8 @@ services:
|
|||||||
container_name: scraper-api
|
container_name: scraper-api
|
||||||
environment:
|
environment:
|
||||||
- DATABASE_URL=postgresql://scraper:${DB_PASSWORD:-scraper123}@db:5432/scraper
|
- DATABASE_URL=postgresql://scraper:${DB_PASSWORD:-scraper123}@db:5432/scraper
|
||||||
- API_BASE_URL=${API_BASE_URL:-http://localhost:8000}
|
- API_BASE_URL=${API_BASE_URL:-http://localhost:8001}
|
||||||
- PORT=8000
|
- PORT=8001
|
||||||
- MAX_CONCURRENT_JOBS=${MAX_CONCURRENT_JOBS:-5}
|
- MAX_CONCURRENT_JOBS=${MAX_CONCURRENT_JOBS:-5}
|
||||||
- CANARY_TEST_URL=${CANARY_TEST_URL:-https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/}
|
- CANARY_TEST_URL=${CANARY_TEST_URL:-https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/}
|
||||||
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
|
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
|
||||||
@@ -48,7 +49,7 @@ services:
|
|||||||
- ./packages:/app/packages:ro
|
- ./packages:/app/packages:ro
|
||||||
- ./api:/app/api:ro
|
- ./api:/app/api:ro
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- "8001:8001"
|
||||||
- "5900:5900" # VNC port (for VNC client)
|
- "5900:5900" # VNC port (for VNC client)
|
||||||
- "6080:6080" # noVNC web interface (browser access)
|
- "6080:6080" # noVNC web interface (browser access)
|
||||||
depends_on:
|
depends_on:
|
||||||
|
|||||||
35
migrations/versions/008_add_job_id_to_pipeline_tables.sql
Normal file
35
migrations/versions/008_add_job_id_to_pipeline_tables.sql
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
-- =============================================================================
-- Migration: 008_add_job_id_to_pipeline_tables.sql
-- Purpose:   Add a nullable job_id column (plus a partial index) to each
--            pipeline table so rows can be filtered by execution.
-- Columns and indexes are created with IF NOT EXISTS.
-- =============================================================================

-- ---------------------------------------------------------------------------
-- pipeline.reviews_enriched
-- ---------------------------------------------------------------------------
ALTER TABLE pipeline.reviews_enriched
    ADD COLUMN IF NOT EXISTS job_id UUID;

-- Partial index: rows without a job_id are left out of the index.
CREATE INDEX IF NOT EXISTS idx_reviews_enriched_job_id
    ON pipeline.reviews_enriched (job_id)
    WHERE job_id IS NOT NULL;

-- ---------------------------------------------------------------------------
-- pipeline.review_spans
-- ---------------------------------------------------------------------------
ALTER TABLE pipeline.review_spans
    ADD COLUMN IF NOT EXISTS job_id UUID;

CREATE INDEX IF NOT EXISTS idx_review_spans_job_id
    ON pipeline.review_spans (job_id)
    WHERE job_id IS NOT NULL;

-- ---------------------------------------------------------------------------
-- pipeline.issues
-- ---------------------------------------------------------------------------
ALTER TABLE pipeline.issues
    ADD COLUMN IF NOT EXISTS job_id UUID;

CREATE INDEX IF NOT EXISTS idx_issues_job_id
    ON pipeline.issues (job_id)
    WHERE job_id IS NOT NULL;

-- ---------------------------------------------------------------------------
-- Column documentation
-- ---------------------------------------------------------------------------
COMMENT ON COLUMN pipeline.reviews_enriched.job_id IS 'Scraper job ID for filtering by execution';
COMMENT ON COLUMN pipeline.review_spans.job_id IS 'Scraper job ID for filtering by execution';
COMMENT ON COLUMN pipeline.issues.job_id IS 'Scraper job ID for filtering by execution';
|
||||||
174
migrations/versions/009_add_urt_subcodes_table.sql
Normal file
174
migrations/versions/009_add_urt_subcodes_table.sql
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
-- =============================================================================
-- Migration: 009_add_urt_subcodes_table.sql
-- Purpose:   Create the urt_subcodes lookup table holding human-readable
--            names and definitions for each subcode.
-- =============================================================================

-- Lookup table for URT Tier-3 subcodes. Each subcode references both its
-- category and its domain.
CREATE TABLE IF NOT EXISTS pipeline.urt_subcodes (
    code             VARCHAR(6)   PRIMARY KEY,
    category_code    VARCHAR(2)   NOT NULL REFERENCES pipeline.urt_categories(code),
    domain_code      CHAR(1)      NOT NULL REFERENCES pipeline.urt_domains(code),
    name             VARCHAR(100) NOT NULL,
    definition       TEXT,
    positive_example TEXT,
    negative_example TEXT
);

-- Indexes on the two foreign-key columns for lookups by category or domain.
CREATE INDEX IF NOT EXISTS idx_urt_subcodes_category
    ON pipeline.urt_subcodes (category_code);
CREATE INDEX IF NOT EXISTS idx_urt_subcodes_domain
    ON pipeline.urt_subcodes (domain_code);

COMMENT ON TABLE pipeline.urt_subcodes IS 'URT v5.1 Tier-3 diagnostic subcodes with definitions';
|
||||||
|
|
||||||
|
-- Insert subcode data
|
||||||
|
INSERT INTO pipeline.urt_subcodes (code, category_code, domain_code, name, definition, positive_example, negative_example) VALUES
|
||||||
|
-- O1: Core Product/Service (Function)
|
||||||
|
('O1.01', 'O1', 'O', 'Works/Doesn''t Work', 'Basic functionality success or failure', 'Software runs perfectly', 'Car won''t start'),
|
||||||
|
('O1.02', 'O1', 'O', 'Performance Level', 'How well it operates', 'Incredibly fast processor', 'Sluggish and laggy'),
|
||||||
|
('O1.03', 'O1', 'O', 'Durability', 'Longevity and resistance to wear', 'Still perfect after 5 years', 'Fell apart in a month'),
|
||||||
|
('O1.04', 'O1', 'O', 'Reliability', 'Consistency of function over time', 'Never fails me', 'Works sometimes, not others'),
|
||||||
|
('O1.05', 'O1', 'O', 'Outcome Achievement', 'Did customer accomplish their goal?', 'Passed my exam!', 'Treatment didn''t work'),
|
||||||
|
|
||||||
|
-- O2: Product Features (Quality)
|
||||||
|
('O2.01', 'O2', 'O', 'Materials/Inputs', 'Quality of components or ingredients', 'Real leather, premium feel', 'Cheap plastic parts'),
|
||||||
|
('O2.02', 'O2', 'O', 'Craftsmanship', 'Skill of construction or execution', 'Beautifully sewn seams', 'Sloppy assembly'),
|
||||||
|
('O2.03', 'O2', 'O', 'Presentation', 'Visual and aesthetic quality', 'Gorgeous plating', 'Looked thrown together'),
|
||||||
|
('O2.04', 'O2', 'O', 'Attention to Detail', 'Finishing touches and refinement', 'Every corner perfect', 'Full of typos'),
|
||||||
|
('O2.05', 'O2', 'O', 'Condition at Delivery', 'State when received', 'Still warm from oven', 'Arrived damaged'),
|
||||||
|
|
||||||
|
-- O3: Variety & Selection (Completeness)
|
||||||
|
('O3.01', 'O3', 'O', 'All Components Present', 'Nothing missing from what was promised', 'Everything in the box', 'Missing the charger'),
|
||||||
|
('O3.02', 'O3', 'O', 'Feature Availability', 'Promised features actually work', 'All menu items available', 'Half the features disabled'),
|
||||||
|
('O3.03', 'O3', 'O', 'Scope Delivery', 'Full scope of work completed', 'Cleaned entire house', 'Left the bathrooms'),
|
||||||
|
('O3.04', 'O3', 'O', 'Documentation', 'Supporting materials provided', 'Great user manual', 'No instructions at all'),
|
||||||
|
|
||||||
|
-- O4: Customization (Fit)
|
||||||
|
('O4.01', 'O4', 'O', 'Specification Match', 'Matches what was ordered', 'Exactly what I ordered', 'Wrong size delivered'),
|
||||||
|
('O4.02', 'O4', 'O', 'Personalization', 'Adapted to individual preferences', 'Remembered my usual', 'No way to save prefs'),
|
||||||
|
('O4.03', 'O4', 'O', 'Flexibility', 'Can be modified or adjusted', 'Happy to substitute', 'No modifications allowed'),
|
||||||
|
('O4.04', 'O4', 'O', 'Appropriateness', 'Right solution for the need', 'Perfect recommendation', 'Sold me wrong thing'),
|
||||||
|
|
||||||
|
-- P1: Friendliness (Attitude)
|
||||||
|
('P1.01', 'P1', 'P', 'Warmth', 'Friendly and welcoming manner', 'Made me feel welcome', 'Cold and unfriendly'),
|
||||||
|
('P1.02', 'P1', 'P', 'Respect', 'Treated with dignity', 'Very respectful service', 'Rude and dismissive'),
|
||||||
|
('P1.03', 'P1', 'P', 'Patience', 'Calm and tolerant approach', 'Patient with my questions', 'Rushed and impatient'),
|
||||||
|
('P1.04', 'P1', 'P', 'Enthusiasm', 'Energy and engagement', 'Really passionate about helping', 'Seemed bored and disinterested'),
|
||||||
|
|
||||||
|
-- P2: Helpfulness (Competence)
|
||||||
|
('P2.01', 'P2', 'P', 'Knowledge', 'Expertise and understanding', 'Knew everything about the product', 'Had no idea what they were doing'),
|
||||||
|
('P2.02', 'P2', 'P', 'Skill', 'Technical ability', 'Expertly handled the issue', 'Completely incompetent'),
|
||||||
|
('P2.03', 'P2', 'P', 'Problem Solving', 'Ability to find solutions', 'Found a creative solution', 'Couldn''t figure it out'),
|
||||||
|
|
||||||
|
-- P3: Professionalism (Responsiveness)
|
||||||
|
('P3.01', 'P3', 'P', 'Attentiveness', 'Being present and engaged', 'Always attentive to needs', 'Ignored me completely'),
|
||||||
|
('P3.02', 'P3', 'P', 'Initiative', 'Proactive help', 'Anticipated my needs', 'Had to ask for everything'),
|
||||||
|
('P3.03', 'P3', 'P', 'Follow-through', 'Completing promised actions', 'Did exactly what they promised', 'Never followed up'),
|
||||||
|
|
||||||
|
-- P4: Knowledge & Expertise (Communication)
|
||||||
|
('P4.01', 'P4', 'P', 'Clarity', 'Clear communication', 'Explained everything clearly', 'Confusing and unclear'),
|
||||||
|
('P4.02', 'P4', 'P', 'Listening', 'Understanding customer needs', 'Really listened to me', 'Didn''t listen at all'),
|
||||||
|
('P4.03', 'P4', 'P', 'Transparency', 'Honest and open', 'Upfront about everything', 'Hid information from me'),
|
||||||
|
|
||||||
|
-- J1: Wait Times
|
||||||
|
('J1.01', 'J1', 'J', 'Speed', 'How fast things happen', 'Super fast service', 'Took forever'),
|
||||||
|
('J1.02', 'J1', 'J', 'Punctuality', 'On-time delivery', 'Arrived exactly when promised', 'Two hours late'),
|
||||||
|
('J1.03', 'J1', 'J', 'Queue Management', 'Handling of waiting customers', 'Well-organized queue', 'Chaotic and disorganized'),
|
||||||
|
|
||||||
|
-- J2: Booking & Reservations (Ease)
|
||||||
|
('J2.01', 'J2', 'J', 'Simplicity', 'Easy process', 'Super easy to book', 'Complicated process'),
|
||||||
|
('J2.02', 'J2', 'J', 'Friction', 'Obstacles encountered', 'Seamless experience', 'So many hoops to jump through'),
|
||||||
|
('J2.03', 'J2', 'J', 'Navigation', 'Finding what you need', 'Easy to navigate', 'Got lost multiple times'),
|
||||||
|
|
||||||
|
-- J3: Navigation & Convenience (Reliability)
|
||||||
|
('J3.01', 'J3', 'J', 'Consistency', 'Same experience every time', 'Always consistent', 'Different every visit'),
|
||||||
|
('J3.02', 'J3', 'J', 'Accuracy', 'Getting it right', 'Perfect every time', 'Full of errors'),
|
||||||
|
('J3.03', 'J3', 'J', 'Uptime', 'System availability', 'Never down', 'Constantly having issues'),
|
||||||
|
|
||||||
|
-- J4: Accessibility (Resolution)
|
||||||
|
('J4.01', 'J4', 'J', 'Problem Recognition', 'Acknowledging issues', 'Immediately acknowledged the issue', 'Denied there was a problem'),
|
||||||
|
('J4.02', 'J4', 'J', 'Resolution Speed', 'How fast problems get fixed', 'Fixed immediately', 'Took weeks to resolve'),
|
||||||
|
('J4.03', 'J4', 'J', 'Resolution Fairness', 'Fair handling of issues', 'Very fair resolution', 'Unfair treatment'),
|
||||||
|
('J4.04', 'J4', 'J', 'Resolution Quality', 'How well problems are fixed', 'Completely resolved', 'Problem still exists'),
|
||||||
|
|
||||||
|
-- E1: Physical Environment
|
||||||
|
('E1.01', 'E1', 'E', 'Cleanliness', 'How clean the space is', 'Spotlessly clean', 'Dirty and gross'),
|
||||||
|
('E1.02', 'E1', 'E', 'Comfort', 'Physical comfort', 'Very comfortable seating', 'Uncomfortable chairs'),
|
||||||
|
('E1.03', 'E1', 'E', 'Space Design', 'Layout and organization', 'Well-designed layout', 'Cramped and cluttered'),
|
||||||
|
('E1.04', 'E1', 'E', 'Maintenance', 'State of repair', 'Everything well-maintained', 'Falling apart'),
|
||||||
|
|
||||||
|
-- E2: Ambiance & Atmosphere
|
||||||
|
('E2.01', 'E2', 'E', 'Lighting', 'Light quality and level', 'Perfect lighting', 'Too dark/bright'),
|
||||||
|
('E2.02', 'E2', 'E', 'Sound/Noise', 'Audio environment', 'Nice music', 'Too loud'),
|
||||||
|
('E2.03', 'E2', 'E', 'Temperature', 'Climate control', 'Perfect temperature', 'Freezing/boiling'),
|
||||||
|
('E2.04', 'E2', 'E', 'Smell', 'Odors and scents', 'Smelled wonderful', 'Bad odors'),
|
||||||
|
|
||||||
|
-- E3: Cleanliness
|
||||||
|
('E3.01', 'E3', 'E', 'Interface Design', 'Digital UX/UI', 'Beautiful interface', 'Ugly and confusing'),
|
||||||
|
('E3.02', 'E3', 'E', 'App/Website Speed', 'Digital performance', 'Fast and responsive', 'Slow and laggy'),
|
||||||
|
('E3.03', 'E3', 'E', 'Usability', 'Ease of digital use', 'Intuitive to use', 'Impossible to figure out'),
|
||||||
|
|
||||||
|
-- E4: Digital Experience
|
||||||
|
('E4.01', 'E4', 'E', 'Safety', 'Physical safety', 'Felt completely safe', 'Felt unsafe'),
|
||||||
|
('E4.02', 'E4', 'E', 'Security', 'Protection of belongings/data', 'Very secure', 'Security concerns'),
|
||||||
|
('E4.03', 'E4', 'E', 'Health/Hygiene', 'Health standards', 'Very hygienic', 'Health code violations'),
|
||||||
|
|
||||||
|
-- A1: Friendliness (Availability)
|
||||||
|
('A1.01', 'A1', 'A', 'Hours', 'Operating hours', 'Great hours', 'Never open when I need them'),
|
||||||
|
('A1.02', 'A1', 'A', 'Booking Availability', 'Appointment slots', 'Easy to get an appointment', 'Booked for months'),
|
||||||
|
('A1.03', 'A1', 'A', 'Inventory', 'Product availability', 'Always in stock', 'Always out of stock'),
|
||||||
|
|
||||||
|
-- A2: Helpfulness (Accessibility)
|
||||||
|
('A2.01', 'A2', 'A', 'Physical Access', 'Mobility accessibility', 'Wheelchair accessible', 'Not accessible'),
|
||||||
|
('A2.02', 'A2', 'A', 'Language Access', 'Language accommodation', 'Multiple languages available', 'English only'),
|
||||||
|
('A2.03', 'A2', 'A', 'Digital Accessibility', 'Screen reader/a11y', 'Accessible website', 'Can''t use with screen reader'),
|
||||||
|
|
||||||
|
-- A3: Professionalism (Inclusivity)
|
||||||
|
('A3.01', 'A3', 'A', 'Diversity Welcome', 'All backgrounds welcome', 'Very inclusive', 'Felt unwelcome'),
|
||||||
|
('A3.02', 'A3', 'A', 'Accommodation', 'Special needs accommodation', 'Very accommodating', 'No accommodations available'),
|
||||||
|
|
||||||
|
-- A4: Knowledge & Expertise (Convenience)
|
||||||
|
('A4.01', 'A4', 'A', 'Location', 'Physical location convenience', 'Great location', 'Hard to get to'),
|
||||||
|
('A4.02', 'A4', 'A', 'Parking', 'Parking availability', 'Easy parking', 'No parking'),
|
||||||
|
('A4.03', 'A4', 'A', 'Multiple Channels', 'Ways to engage', 'Many ways to reach them', 'Only one contact method'),
|
||||||
|
|
||||||
|
-- V1: Value Perception (Price)
|
||||||
|
('V1.01', 'V1', 'V', 'Price Level', 'Cost amount', 'Very affordable', 'Way too expensive'),
|
||||||
|
('V1.02', 'V1', 'V', 'Price Fairness', 'Fair for what you get', 'Fair price', 'Overpriced'),
|
||||||
|
('V1.03', 'V1', 'V', 'Hidden Costs', 'Unexpected charges', 'No hidden fees', 'Lots of hidden charges'),
|
||||||
|
|
||||||
|
-- V2: Pricing Structure (Transparency)
|
||||||
|
('V2.01', 'V2', 'V', 'Clear Pricing', 'Easy to understand costs', 'Clear pricing', 'Confusing pricing'),
|
||||||
|
('V2.02', 'V2', 'V', 'Honest Billing', 'Accurate charges', 'Bill was accurate', 'Charged more than quoted'),
|
||||||
|
('V2.03', 'V2', 'V', 'Policy Clarity', 'Clear terms and conditions', 'Clear policies', 'Hidden in fine print'),
|
||||||
|
('V2.04', 'V2', 'V', 'Policy Fairness', 'Fair rules and terms', 'Fair policies', 'Unfair terms'),
|
||||||
|
|
||||||
|
-- V3: Promotions & Deals (Effort)
|
||||||
|
('V3.01', 'V3', 'V', 'Time Investment', 'Time required', 'Quick and easy', 'Took way too long'),
|
||||||
|
('V3.02', 'V3', 'V', 'Hassle Factor', 'Difficulty and inconvenience', 'No hassle', 'Such a hassle'),
|
||||||
|
('V3.03', 'V3', 'V', 'Mental Load', 'Cognitive effort required', 'Easy to understand', 'Too complicated'),
|
||||||
|
|
||||||
|
-- V4: Payment Process (Worth)
|
||||||
|
('V4.01', 'V4', 'V', 'Value for Money', 'Worth what you paid', 'Great value', 'Not worth the money'),
|
||||||
|
('V4.02', 'V4', 'V', 'ROI', 'Return on investment', 'Excellent return', 'Waste of money'),
|
||||||
|
('V4.03', 'V4', 'V', 'Overall Satisfaction', 'Happy with the exchange', 'Very satisfied', 'Totally unsatisfied'),
|
||||||
|
|
||||||
|
-- R1: Loyalty (Integrity)
|
||||||
|
('R1.01', 'R1', 'R', 'Honesty', 'Truthfulness', 'Always honest', 'Lied to me'),
|
||||||
|
('R1.02', 'R1', 'R', 'Ethics', 'Ethical behavior', 'Ethical practices', 'Unethical behavior'),
|
||||||
|
('R1.03', 'R1', 'R', 'Promises Kept', 'Following through on promises', 'Kept all promises', 'Broke their promise'),
|
||||||
|
|
||||||
|
-- R2: Trust (Dependability)
|
||||||
|
('R2.01', 'R2', 'R', 'Consistency', 'Reliable over time', 'Always reliable', 'Inconsistent'),
|
||||||
|
('R2.02', 'R2', 'R', 'Trustworthiness', 'Can be trusted', 'Completely trustworthy', 'Can''t be trusted'),
|
||||||
|
('R2.03', 'R2', 'R', 'Accountability', 'Takes responsibility', 'Takes responsibility', 'Blames others'),
|
||||||
|
|
||||||
|
-- R3: Consistency (Recovery)
|
||||||
|
('R3.01', 'R3', 'R', 'Error Acknowledgment', 'Admits mistakes', 'Quickly admitted the mistake', 'Denied the mistake'),
|
||||||
|
('R3.02', 'R3', 'R', 'Apology Quality', 'Sincere apologies', 'Sincere apology', 'Insincere/no apology'),
|
||||||
|
('R3.03', 'R3', 'R', 'Making It Right', 'Correcting mistakes', 'Made it right', 'Didn''t fix anything'),
|
||||||
|
|
||||||
|
-- R4: Personalization (Loyalty)
|
||||||
|
('R4.01', 'R4', 'R', 'Customer Recognition', 'Remembers customers', 'Remembered me', 'Treated like a stranger'),
|
||||||
|
('R4.02', 'R4', 'R', 'Loyalty Rewards', 'Rewards for loyalty', 'Great loyalty program', 'No recognition for loyalty'),
|
||||||
|
('R4.03', 'R4', 'R', 'Long-term Relationship', 'Builds relationships', 'Values the relationship', 'Just another number')
|
||||||
|
ON CONFLICT (code) DO NOTHING;
|
||||||
31
migrations/versions/010_add_solution_to_urt_subcodes.sql
Normal file
31
migrations/versions/010_add_solution_to_urt_subcodes.sql
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
-- Migration: Add solution column to urt_subcodes
-- Version: 010
-- Date: 2026-01-25
-- Description: Add solution column to store actionable recommendations for each URT subcode
--
-- Also adds marketing_angle and solution_complexity. Every statement below is
-- written so that the migration can be re-run without failing.

-- Add solution column for actionable business recommendations
ALTER TABLE pipeline.urt_subcodes
    ADD COLUMN IF NOT EXISTS solution TEXT;

-- Add comment describing the column
COMMENT ON COLUMN pipeline.urt_subcodes.solution IS
    'Actionable business recommendation for addressing issues related to this subcode';

-- Also add marketing_angle column for strengths
ALTER TABLE pipeline.urt_subcodes
    ADD COLUMN IF NOT EXISTS marketing_angle TEXT;

COMMENT ON COLUMN pipeline.urt_subcodes.marketing_angle IS
    'Marketing suggestion when this subcode appears as a strength (high positive sentiment)';

-- Add complexity column to help with opportunity matrix.
-- The DEFAULT gives pre-existing rows a value that satisfies the CHECK below.
ALTER TABLE pipeline.urt_subcodes
    ADD COLUMN IF NOT EXISTS solution_complexity VARCHAR(10) DEFAULT 'medium';

COMMENT ON COLUMN pipeline.urt_subcodes.solution_complexity IS
    'Complexity of implementing the solution: simple, medium, complex';

-- Add constraint for valid complexity values.
-- ADD CONSTRAINT has no IF NOT EXISTS form, so drop any earlier copy first;
-- otherwise a second run of this migration aborts with "constraint already exists".
ALTER TABLE pipeline.urt_subcodes
    DROP CONSTRAINT IF EXISTS valid_solution_complexity;

ALTER TABLE pipeline.urt_subcodes
    ADD CONSTRAINT valid_solution_complexity
    CHECK (solution_complexity IN ('simple', 'medium', 'complex'));
|
||||||
210
migrations/versions/011_populate_urt_solutions.py
Normal file
210
migrations/versions/011_populate_urt_solutions.py
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Generate SQL to populate URT subcodes with solutions, marketing angles, and complexity.
The recommendations are maintained by hand in SOLUTION_TEMPLATES below; the
B1-urt-codes.yaml file referenced by URT_YAML is not read by this script.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 011_populate_urt_solutions.py > 011_populate_urt_solutions.sql
|
||||||
|
# Then run the SQL against the database
|
||||||
|
"""
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Load the URT taxonomy
|
||||||
|
URT_YAML = Path(__file__).parent.parent.parent / "urt-taxonomy" / "track-b-engineering" / "B1-urt-codes.yaml"
|
||||||
|
|
||||||
|
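# Hand-written recommendations keyed by URT subcode (e.g. "O1.01").
# Each value is a (solution, marketing_angle, solution_complexity) tuple;
# solution_complexity must be 'simple', 'medium' or 'complex' to satisfy the
# valid_solution_complexity CHECK constraint on pipeline.urt_subcodes.
# NOTE(review): some codes carry a different topic in the subcode seed data
# (there E2.01 is 'Lighting' and E3.01 is 'Interface Design', while here E2.01
# is a UX recommendation and E3.01 an ambiance one), and some keys (e.g. J1.04)
# do not appear among the seeded rows - confirm which taxonomy these codes
# follow before applying the generated SQL.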
|
||||||
|
SOLUTION_TEMPLATES = {
|
||||||
|
# Offering (O) - Product/Operations solutions
|
||||||
|
"O1.01": ("Implement quality testing before delivery. Create incident response process for functionality failures.", "Our products work reliably - backed by rigorous quality testing.", "medium"),
|
||||||
|
"O1.02": ("Optimize performance through benchmarking and monitoring. Set performance SLAs.", "Experience lightning-fast performance that exceeds expectations.", "complex"),
|
||||||
|
"O1.03": ("Use higher quality materials. Extend warranty coverage. Implement durability testing.", "Built to last - quality materials that stand the test of time.", "medium"),
|
||||||
|
"O1.04": ("Implement regular maintenance schedules. Add redundancy for critical systems.", "Dependable reliability you can count on, every time.", "medium"),
|
||||||
|
"O1.05": ("Track outcome metrics. Follow up on customer goals. Provide success coaching.", "We measure success by YOUR results, not just our delivery.", "medium"),
|
||||||
|
"O2.01": ("Upgrade to premium materials/ingredients. Source from quality suppliers.", "Premium materials and ingredients you can see and feel.", "medium"),
|
||||||
|
"O2.02": ("Invest in craftsman training. Implement quality checkpoints.", "Master craftsmanship in every detail.", "complex"),
|
||||||
|
"O2.03": ("Train on presentation standards. Create visual guidelines.", "Beautifully presented, every single time.", "simple"),
|
||||||
|
"O2.04": ("Implement finishing checklists. Add quality inspection step.", "Meticulous attention to every detail.", "simple"),
|
||||||
|
"O2.05": ("Improve packaging. Add delivery condition checks. Train delivery staff.", "Arrives in perfect condition, guaranteed.", "medium"),
|
||||||
|
"O3.01": ("Create comprehensive packing lists. Verify completeness before shipping.", "Everything you need, nothing missing.", "simple"),
|
||||||
|
"O3.02": ("Test all features before release. Maintain feature availability dashboard.", "All features available and working as promised.", "medium"),
|
||||||
|
"O3.03": ("Define clear scope of work. Use completion checklists.", "We deliver the full scope, every time.", "simple"),
|
||||||
|
"O3.04": ("Create comprehensive documentation. Include setup guides and FAQs.", "Clear instructions and helpful guides included.", "simple"),
|
||||||
|
"O4.01": ("Implement order verification. Add confirmation step before fulfillment.", "Exactly what you ordered, guaranteed.", "simple"),
|
||||||
|
"O4.02": ("Build preference tracking system. Remember customer choices.", "We remember your preferences for a personalized experience.", "medium"),
|
||||||
|
"O4.03": ("Train staff on customization options. Empower flexibility.", "Flexible options tailored to your needs.", "simple"),
|
||||||
|
"O4.04": ("Improve needs assessment. Train consultative selling.", "Expert recommendations matched to your specific needs.", "medium"),
|
||||||
|
|
||||||
|
# People (P) - HR/Training solutions
|
||||||
|
"P1.01": ("Train staff on warm greetings. Recognize friendly behavior.", "Friendly faces and warm welcomes await you.", "simple"),
|
||||||
|
"P1.02": ("Implement respect training. Address complaints immediately.", "You'll be treated with dignity and respect.", "simple"),
|
||||||
|
"P1.03": ("Train active listening and empathy. Role-play difficult scenarios.", "Staff who truly understand your situation.", "medium"),
|
||||||
|
"P1.04": ("Reduce time pressure on staff. Train patience techniques.", "Take your time - we're here to help, not rush.", "simple"),
|
||||||
|
"P1.05": ("Hire for passion. Recognize enthusiastic service.", "Passionate people who love helping customers.", "medium"),
|
||||||
|
"P2.01": ("Implement ongoing product training. Create knowledge base.", "Expert knowledge to answer any question.", "medium"),
|
||||||
|
"P2.02": ("Invest in skills training. Certify technical competency.", "Skilled professionals at the top of their craft.", "complex"),
|
||||||
|
"P2.03": ("Empower staff to solve problems. Create escalation paths.", "Creative problem-solvers who find solutions.", "medium"),
|
||||||
|
"P2.04": ("Define professional standards. Provide uniforms/dress code.", "Professional service you can trust.", "simple"),
|
||||||
|
"P2.05": ("Hire experienced staff. Pair juniors with mentors.", "Seasoned experts with years of experience.", "complex"),
|
||||||
|
"P3.01": ("Train proactive checking. Reduce multitasking.", "Attentive service that anticipates your needs.", "simple"),
|
||||||
|
"P3.02": ("Encourage proactive service. Reward initiative.", "Proactive help before you even ask.", "simple"),
|
||||||
|
"P3.03": ("Optimize staffing levels. Reduce wait for assistance.", "Help is always available when you need it.", "medium"),
|
||||||
|
"P3.04": ("Implement task tracking. Create follow-up reminders.", "We do what we say we'll do.", "simple"),
|
||||||
|
"P3.05": ("Train prioritization. Empower urgent action.", "Your needs are treated with appropriate urgency.", "simple"),
|
||||||
|
"P4.01": ("Train jargon-free communication. Use visual aids.", "Clear explanations without confusing jargon.", "simple"),
|
||||||
|
"P4.02": ("Train active listening. Implement feedback loops.", "We truly listen and understand your needs.", "simple"),
|
||||||
|
"P4.03": ("Implement status update systems. Set update expectations.", "Regular updates keep you informed every step.", "simple"),
|
||||||
|
"P4.04": ("Verify information before sharing. Create accuracy checks.", "Accurate information you can rely on.", "simple"),
|
||||||
|
"P4.05": ("Train professional communication. Provide tone guidelines.", "Professional yet personable communication.", "simple"),
|
||||||
|
|
||||||
|
# Journey (J) - Operations/Process solutions
|
||||||
|
"J1.01": ("Display estimated wait times. Implement queue management.", "Minimal wait times with clear expectations.", "medium"),
|
||||||
|
"J1.02": ("Optimize delivery processes. Set realistic timelines.", "Fast, reliable delivery every time.", "medium"),
|
||||||
|
"J1.03": ("Set response time SLAs. Implement ticketing system.", "Quick responses when you reach out.", "medium"),
|
||||||
|
"J1.04": ("Improve scheduling. Buffer time for delays.", "On-time, every time.", "simple"),
|
||||||
|
"J1.05": ("Train on pacing. Allow customer control of tempo.", "At your pace, never rushed.", "simple"),
|
||||||
|
"J2.01": ("Simplify processes. Remove unnecessary steps.", "Simple, straightforward processes.", "medium"),
|
||||||
|
"J2.02": ("Improve signage. Create intuitive layouts.", "Easy to find what you're looking for.", "simple"),
|
||||||
|
"J2.03": ("Digitize forms. Pre-fill known information.", "Minimal paperwork, maximum efficiency.", "medium"),
|
||||||
|
"J2.04": ("Improve handoff protocols. Share context between teams.", "Seamless transitions between team members.", "medium"),
|
||||||
|
"J2.05": ("Build self-service portal. Add online options.", "Self-service options for your convenience.", "complex"),
|
||||||
|
"J3.01": ("Standardize processes. Document procedures.", "Consistent quality every single time.", "medium"),
|
||||||
|
"J3.02": ("Implement order verification. Add accuracy checks.", "Accurate orders, no mistakes.", "simple"),
|
||||||
|
"J3.03": ("Improve system reliability. Add monitoring and alerts.", "Reliable systems that are always available.", "complex"),
|
||||||
|
"J3.04": ("Set clear expectations. Document what to expect.", "No surprises - exactly what you expect.", "simple"),
|
||||||
|
"J3.05": ("Implement quality checks. Track and reduce errors.", "Rare mistakes with quick corrections.", "medium"),
|
||||||
|
"J4.01": ("Train problem acknowledgment. Create issue intake process.", "We acknowledge issues immediately.", "simple"),
|
||||||
|
"J4.02": ("Create clear escalation paths. Empower frontline resolution.", "Efficient resolution process.", "medium"),
|
||||||
|
"J4.03": ("Set resolution time targets. Prioritize open issues.", "Fast resolution when things go wrong.", "medium"),
|
||||||
|
"J4.04": ("Verify fixes before closing. Follow up on resolutions.", "Complete solutions, not band-aids.", "medium"),
|
||||||
|
"J4.05": ("Conduct root cause analysis. Implement systemic fixes.", "We fix problems permanently.", "complex"),
|
||||||
|
|
||||||
|
# Environment (E) - Facilities/IT solutions
|
||||||
|
"E1.01": ("Increase cleaning frequency. Create cleaning checklists.", "Spotlessly clean facilities.", "simple"),
|
||||||
|
"E1.02": ("Implement preventive maintenance. Fix issues promptly.", "Well-maintained, everything works.", "medium"),
|
||||||
|
"E1.03": ("Redesign layout for flow. Add wayfinding.", "Intuitive layout, easy to navigate.", "complex"),
|
||||||
|
"E1.04": ("Upgrade equipment. Implement replacement schedule.", "Modern, state-of-the-art equipment.", "complex"),
|
||||||
|
"E1.05": ("Add clear signage. Use consistent design.", "Clear signs and easy navigation.", "simple"),
|
||||||
|
"E2.01": ("Invest in UX design. Conduct usability testing.", "Beautiful, intuitive digital experience.", "complex"),
|
||||||
|
"E2.02": ("Test all features. Fix bugs promptly.", "Everything works, no broken buttons.", "medium"),
|
||||||
|
"E2.03": ("Optimize page load. Improve server response.", "Lightning-fast digital experience.", "complex"),
|
||||||
|
"E2.04": ("Simplify navigation. Reduce menu depth.", "Find what you need in seconds.", "medium"),
|
||||||
|
"E2.05": ("Optimize for mobile. Test on all devices.", "Works beautifully on any device.", "medium"),
|
||||||
|
"E3.01": ("Design for desired mood. Control sensory elements.", "Perfect atmosphere and ambiance.", "medium"),
|
||||||
|
"E3.02": ("Add sound absorption. Create quiet zones.", "Pleasant sound levels.", "medium"),
|
||||||
|
"E3.03": ("Optimize HVAC. Add zone controls.", "Perfect temperature, always comfortable.", "medium"),
|
||||||
|
"E3.04": ("Manage capacity. Control entry rates.", "Comfortable, never overcrowded.", "medium"),
|
||||||
|
"E3.05": ("Invest in design. Update decor regularly.", "Beautiful, inviting space.", "complex"),
|
||||||
|
"E4.01": ("Conduct safety audits. Address hazards immediately.", "Safety is our top priority.", "medium"),
|
||||||
|
"E4.02": ("Implement hygiene protocols. Train staff on standards.", "Highest hygiene standards.", "medium"),
|
||||||
|
"E4.03": ("Add security measures. Protect customer property.", "Secure environment for you and your belongings.", "medium"),
|
||||||
|
"E4.04": ("Upgrade furniture. Add comfort amenities.", "Comfortable facilities for your visit.", "medium"),
|
||||||
|
"E4.05": ("Conduct emergency drills. Mark exits clearly.", "Prepared for any emergency.", "medium"),
|
||||||
|
|
||||||
|
# Access (A) - Compliance/Design solutions
|
||||||
|
"A1.01": ("Extend operating hours. Consider 24/7 options.", "Open when you need us.", "medium"),
|
||||||
|
"A1.02": ("Add online booking. Increase appointment slots.", "Easy scheduling, plenty of availability.", "medium"),
|
||||||
|
"A1.03": ("Improve inventory management. Add stock alerts.", "Always in stock when you need it.", "medium"),
|
||||||
|
"A1.04": ("Hire additional staff. Optimize scheduling.", "Plenty of staff to help you.", "complex"),
|
||||||
|
"A1.05": ("Expand service area. Add new locations.", "Convenient locations near you.", "complex"),
|
||||||
|
"A2.01": ("Add ramps and elevators. Ensure ADA compliance.", "Fully accessible for all abilities.", "complex"),
|
||||||
|
"A2.02": ("Add alt text. Ensure screen reader compatibility.", "Accessible for visually impaired users.", "medium"),
|
||||||
|
"A2.03": ("Add captions and transcripts. Support hearing devices.", "Accessible for hearing impaired users.", "medium"),
|
||||||
|
"A2.04": ("Use plain language. Simplify instructions.", "Easy to understand for everyone.", "simple"),
|
||||||
|
"A2.05": ("Test with assistive technologies. Follow WCAG guidelines.", "Works with all assistive technologies.", "complex"),
|
||||||
|
"A3.01": ("Hire multilingual staff. Add translation services.", "Service in your language.", "medium"),
|
||||||
|
"A3.02": ("Train cultural competency. Celebrate diversity.", "Welcoming to all backgrounds.", "medium"),
|
||||||
|
"A3.03": ("Offer dietary alternatives. Train allergy awareness.", "Options for all dietary needs.", "medium"),
|
||||||
|
"A3.04": ("Add family amenities. Create kid-friendly options.", "Great for the whole family.", "medium"),
|
||||||
|
"A3.05": ("Train bias awareness. Audit for fair treatment.", "Equal, respectful treatment for all.", "medium"),
|
||||||
|
"A4.01": ("Choose high-traffic location. Improve visibility.", "Convenient, easy-to-find location.", "complex"),
|
||||||
|
"A4.02": ("Add parking spaces. Offer validation.", "Easy, hassle-free parking.", "complex"),
|
||||||
|
"A4.03": ("Locate near transit. Add shuttle service.", "Easy access by public transit.", "complex"),
|
||||||
|
"A4.04": ("Accept all payment types. Add mobile pay.", "Pay however you prefer.", "simple"),
|
||||||
|
"A4.05": ("Add contact channels. Reduce hold times.", "Easy to reach through any channel.", "medium"),
|
||||||
|
|
||||||
|
# Value (V) - Finance/Pricing solutions
|
||||||
|
"V1.01": ("Review pricing strategy. Offer value tiers.", "Competitive, fair pricing.", "complex"),
|
||||||
|
"V1.02": ("Benchmark against expectations. Communicate value.", "Pricing that matches expectations.", "medium"),
|
||||||
|
"V1.03": ("Conduct competitor analysis. Justify premium or match.", "Competitive with the market.", "medium"),
|
||||||
|
"V1.04": ("Display ALL fees upfront. Eliminate surprise charges.", "Complete price transparency - no hidden fees.", "simple"),
|
||||||
|
"V1.05": ("Offer payment plans. Add financing options.", "Flexible payment options available.", "medium"),
|
||||||
|
"V2.01": ("Create clear price lists. Explain pricing structure.", "Clear, easy-to-understand pricing.", "simple"),
|
||||||
|
"V2.02": ("List all fees upfront. Include in quotes.", "Full disclosure of all charges.", "simple"),
|
||||||
|
"V2.03": ("Audit marketing claims. Ensure accuracy.", "Honest, accurate advertising.", "simple"),
|
||||||
|
"V2.04": ("Simplify contracts. Highlight key terms.", "Fair, straightforward terms.", "medium"),
|
||||||
|
"V2.05": ("Verify all claims. Provide evidence.", "Honest representation of our services.", "simple"),
|
||||||
|
"V3.01": ("Streamline processes. Reduce customer time.", "Quick and easy, respecting your time.", "medium"),
|
||||||
|
"V3.02": ("Simplify decisions. Provide guidance.", "Easy decisions, minimal stress.", "medium"),
|
||||||
|
"V3.03": ("Offer delivery/pickup. Reduce physical burden.", "Convenient, minimal effort required.", "medium"),
|
||||||
|
"V3.04": ("Reduce friction points. Improve processes.", "Smooth, hassle-free experience.", "medium"),
|
||||||
|
"V3.05": ("Demonstrate value clearly. Compare alternatives.", "Worth every moment of your time.", "simple"),
|
||||||
|
"V4.01": ("Communicate value proposition. Demonstrate ROI.", "Exceptional value for your investment.", "medium"),
|
||||||
|
"V4.02": ("Ensure quality matches price. Add value-adds.", "Quality that justifies the price.", "medium"),
|
||||||
|
"V4.03": ("Track satisfaction. Follow up post-purchase.", "Complete satisfaction guaranteed.", "medium"),
|
||||||
|
"V4.04": ("Encourage referrals. Make sharing easy.", "So good, you'll tell your friends.", "simple"),
|
||||||
|
"V4.05": ("Build loyalty program. Reward returns.", "Worth coming back for, again and again.", "medium"),
|
||||||
|
|
||||||
|
# Relationship (R) - Leadership/CX solutions
|
||||||
|
"R1.01": ("Train honest communication. Build trust culture.", "Complete honesty and transparency.", "medium"),
|
||||||
|
"R1.02": ("Document commitments. Track promises made.", "We always keep our promises.", "simple"),
|
||||||
|
"R1.03": ("Share policies openly. Communicate changes.", "Open and transparent in everything we do.", "simple"),
|
||||||
|
"R1.04": ("Define ethical standards. Train compliance.", "Ethical business practices you can trust.", "medium"),
|
||||||
|
"R1.05": ("Ensure consistent treatment. Audit fairness.", "Fair dealing with every customer.", "medium"),
|
||||||
|
"R2.01": ("Track customer history. Learn from patterns.", "Proven track record of excellence.", "medium"),
|
||||||
|
"R2.02": ("Standardize experience. Reduce variation.", "Consistent excellence, every visit.", "medium"),
|
||||||
|
"R2.03": ("Communicate changes. Maintain core values.", "Stable and reliable, year after year.", "medium"),
|
||||||
|
"R2.04": ("Build trust incrementally. Honor commitments.", "A business you can trust completely.", "medium"),
|
||||||
|
"R2.05": ("Honor warranties promptly. Exceed guarantees.", "We stand behind our guarantees.", "medium"),
|
||||||
|
"R3.01": ("Train admission of mistakes. Empower acknowledgment.", "We own our mistakes.", "simple"),
|
||||||
|
"R3.02": ("Develop sincere apology training. Show genuine regret.", "Genuine apologies when things go wrong.", "simple"),
|
||||||
|
"R3.03": ("Develop compensation policies. Empower service recovery.", "We make things right with meaningful gestures.", "medium"),
|
||||||
|
"R3.04": ("Conduct post-mortem reviews. Implement learnings.", "We continuously improve from feedback.", "medium"),
|
||||||
|
"R3.05": ("Train ownership mentality. Remove blame culture.", "Full accountability when issues arise.", "medium"),
|
||||||
|
"R4.01": ("Implement CRM. Train staff on customer history.", "We remember you and value your loyalty.", "medium"),
|
||||||
|
"R4.02": ("Create meaningful loyalty program. Offer real value.", "Rewarding loyalty with meaningful perks.", "medium"),
|
||||||
|
"R4.03": ("Train relationship building. Encourage personal connections.", "More than transactions - real relationships.", "medium"),
|
||||||
|
"R4.04": ("Personalize communications. Add value in outreach.", "Helpful updates, not just promotions.", "medium"),
|
||||||
|
"R4.05": ("Build community events. Create belonging.", "Part of our community.", "medium"),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def escape_sql(s: str) -> str:
    """Render ``s`` as a single-quoted SQL string literal.

    ``None`` is rendered as the bare keyword ``NULL``. For any other value,
    each embedded single quote is doubled and the result is wrapped in
    single quotes.
    """
    if s is not None:
        # Splitting on the quote and re-joining with two quotes doubles each one.
        doubled = "''".join(s.split("'"))
        return f"'{doubled}'"
    return "NULL"
|
||||||
|
|
||||||
|
|
||||||
|
def generate_sql():
    """Print the migration SQL that populates ``pipeline.urt_subcodes``.

    Writes to stdout, in order: a comment header, ``BEGIN;``, one ``UPDATE``
    per entry in ``SOLUTION_TEMPLATES`` (setting ``solution``,
    ``marketing_angle`` and ``solution_complexity`` for that ``code``),
    ``COMMIT;``, and a ``SELECT`` that previews up to ten populated rows.

    Raises:
        ValueError: if any entry's complexity is not ``simple``, ``medium``
            or ``complex``. This is checked before anything is printed, so a
            typo never yields a partially written script.
    """
    # Same value set as the valid_solution_complexity CHECK constraint on
    # pipeline.urt_subcodes; catching a bad value here is cheaper than a
    # rolled-back transaction when the SQL is applied.
    allowed_complexity = ("simple", "medium", "complex")
    invalid_codes = sorted(
        code
        for code, (_, _, complexity) in SOLUTION_TEMPLATES.items()
        if complexity not in allowed_complexity
    )
    if invalid_codes:
        raise ValueError(
            "invalid solution_complexity for subcode(s): " + ", ".join(invalid_codes)
        )

    print("-- Migration: Populate URT subcodes with solutions")
    print("-- Version: 011")
    print("-- Date: 2026-01-25")
    print("-- Generated from: urt-taxonomy/track-b-engineering/B1-urt-codes.yaml")
    print()
    print("BEGIN;")
    print()

    # NOTE(review): an UPDATE whose code is absent from pipeline.urt_subcodes
    # silently changes nothing - confirm every key has been seeded.
    for code, (solution, marketing_angle, complexity) in SOLUTION_TEMPLATES.items():
        # The literal ends with a newline and print() adds another, which
        # leaves one blank line between consecutive statements.
        print(f"""UPDATE pipeline.urt_subcodes
SET solution = {escape_sql(solution)},
    marketing_angle = {escape_sql(marketing_angle)},
    solution_complexity = {escape_sql(complexity)}
WHERE code = {escape_sql(code)};
""")

    print("COMMIT;")
    print()
    print("-- Verify updates")
    print("SELECT code, name, solution_complexity, LEFT(solution, 50) as solution_preview")
    print("FROM pipeline.urt_subcodes")
    print("WHERE solution IS NOT NULL")
    print("ORDER BY code")
    print("LIMIT 10;")
|
||||||
|
|
||||||
|
|
||||||
|
# When executed as a script, write the generated migration SQL to stdout.
if __name__ == "__main__":
    generate_sql()
|
||||||
843
migrations/versions/011_populate_urt_solutions.sql
Normal file
843
migrations/versions/011_populate_urt_solutions.sql
Normal file
@@ -0,0 +1,843 @@
|
|||||||
|
-- Migration: Populate URT subcodes with solutions
|
||||||
|
-- Version: 011
|
||||||
|
-- Date: 2026-01-25
|
||||||
|
-- Generated from: urt-taxonomy/track-b-engineering/B1-urt-codes.yaml
|
||||||
|
|
||||||
|
BEGIN;
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement quality testing before delivery. Create incident response process for functionality failures.',
|
||||||
|
marketing_angle = 'Our products work reliably - backed by rigorous quality testing.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O1.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Optimize performance through benchmarking and monitoring. Set performance SLAs.',
|
||||||
|
marketing_angle = 'Experience lightning-fast performance that exceeds expectations.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'O1.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Use higher quality materials. Extend warranty coverage. Implement durability testing.',
|
||||||
|
marketing_angle = 'Built to last - quality materials that stand the test of time.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O1.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement regular maintenance schedules. Add redundancy for critical systems.',
|
||||||
|
marketing_angle = 'Dependable reliability you can count on, every time.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O1.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Track outcome metrics. Follow up on customer goals. Provide success coaching.',
|
||||||
|
marketing_angle = 'We measure success by YOUR results, not just our delivery.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O1.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Upgrade to premium materials/ingredients. Source from quality suppliers.',
|
||||||
|
marketing_angle = 'Premium materials and ingredients you can see and feel.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O2.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Invest in craftsman training. Implement quality checkpoints.',
|
||||||
|
marketing_angle = 'Master craftsmanship in every detail.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'O2.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train on presentation standards. Create visual guidelines.',
|
||||||
|
marketing_angle = 'Beautifully presented, every single time.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'O2.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement finishing checklists. Add quality inspection step.',
|
||||||
|
marketing_angle = 'Meticulous attention to every detail.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'O2.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Improve packaging. Add delivery condition checks. Train delivery staff.',
|
||||||
|
marketing_angle = 'Arrives in perfect condition, guaranteed.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O2.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Create comprehensive packing lists. Verify completeness before shipping.',
|
||||||
|
marketing_angle = 'Everything you need, nothing missing.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'O3.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Test all features before release. Maintain feature availability dashboard.',
|
||||||
|
marketing_angle = 'All features available and working as promised.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O3.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Define clear scope of work. Use completion checklists.',
|
||||||
|
marketing_angle = 'We deliver the full scope, every time.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'O3.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Create comprehensive documentation. Include setup guides and FAQs.',
|
||||||
|
marketing_angle = 'Clear instructions and helpful guides included.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'O3.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement order verification. Add confirmation step before fulfillment.',
|
||||||
|
marketing_angle = 'Exactly what you ordered, guaranteed.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'O4.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Build preference tracking system. Remember customer choices.',
|
||||||
|
marketing_angle = 'We remember your preferences for a personalized experience.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O4.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train staff on customization options. Empower flexibility.',
|
||||||
|
marketing_angle = 'Flexible options tailored to your needs.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'O4.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Improve needs assessment. Train consultative selling.',
|
||||||
|
marketing_angle = 'Expert recommendations matched to your specific needs.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'O4.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train staff on warm greetings. Recognize friendly behavior.',
|
||||||
|
marketing_angle = 'Friendly faces and warm welcomes await you.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P1.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement respect training. Address complaints immediately.',
|
||||||
|
marketing_angle = 'You''ll be treated with dignity and respect.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P1.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train active listening and empathy. Role-play difficult scenarios.',
|
||||||
|
marketing_angle = 'Staff who truly understand your situation.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'P1.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Reduce time pressure on staff. Train patience techniques.',
|
||||||
|
marketing_angle = 'Take your time - we''re here to help, not rush.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P1.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Hire for passion. Recognize enthusiastic service.',
|
||||||
|
marketing_angle = 'Passionate people who love helping customers.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'P1.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement ongoing product training. Create knowledge base.',
|
||||||
|
marketing_angle = 'Expert knowledge to answer any question.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'P2.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Invest in skills training. Certify technical competency.',
|
||||||
|
marketing_angle = 'Skilled professionals at the top of their craft.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'P2.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Empower staff to solve problems. Create escalation paths.',
|
||||||
|
marketing_angle = 'Creative problem-solvers who find solutions.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'P2.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Define professional standards. Provide uniforms/dress code.',
|
||||||
|
marketing_angle = 'Professional service you can trust.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P2.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Hire experienced staff. Pair juniors with mentors.',
|
||||||
|
marketing_angle = 'Seasoned experts with years of experience.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'P2.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train proactive checking. Reduce multitasking.',
|
||||||
|
marketing_angle = 'Attentive service that anticipates your needs.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P3.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Encourage proactive service. Reward initiative.',
|
||||||
|
marketing_angle = 'Proactive help before you even ask.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P3.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Optimize staffing levels. Reduce wait for assistance.',
|
||||||
|
marketing_angle = 'Help is always available when you need it.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'P3.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement task tracking. Create follow-up reminders.',
|
||||||
|
marketing_angle = 'We do what we say we''ll do.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P3.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train prioritization. Empower urgent action.',
|
||||||
|
marketing_angle = 'Your needs are treated with appropriate urgency.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P3.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train jargon-free communication. Use visual aids.',
|
||||||
|
marketing_angle = 'Clear explanations without confusing jargon.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P4.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train active listening. Implement feedback loops.',
|
||||||
|
marketing_angle = 'We truly listen and understand your needs.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P4.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement status update systems. Set update expectations.',
|
||||||
|
marketing_angle = 'Regular updates keep you informed every step.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P4.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Verify information before sharing. Create accuracy checks.',
|
||||||
|
marketing_angle = 'Accurate information you can rely on.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P4.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train professional communication. Provide tone guidelines.',
|
||||||
|
marketing_angle = 'Professional yet personable communication.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'P4.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Display estimated wait times. Implement queue management.',
|
||||||
|
marketing_angle = 'Minimal wait times with clear expectations.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J1.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Optimize delivery processes. Set realistic timelines.',
|
||||||
|
marketing_angle = 'Fast, reliable delivery every time.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J1.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Set response time SLAs. Implement ticketing system.',
|
||||||
|
marketing_angle = 'Quick responses when you reach out.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J1.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Improve scheduling. Buffer time for delays.',
|
||||||
|
marketing_angle = 'On-time, every time.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'J1.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train on pacing. Allow customer control of tempo.',
|
||||||
|
marketing_angle = 'At your pace, never rushed.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'J1.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Simplify processes. Remove unnecessary steps.',
|
||||||
|
marketing_angle = 'Simple, straightforward processes.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J2.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Improve signage. Create intuitive layouts.',
|
||||||
|
marketing_angle = 'Easy to find what you''re looking for.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'J2.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Digitize forms. Pre-fill known information.',
|
||||||
|
marketing_angle = 'Minimal paperwork, maximum efficiency.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J2.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Improve handoff protocols. Share context between teams.',
|
||||||
|
marketing_angle = 'Seamless transitions between team members.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J2.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Build self-service portal. Add online options.',
|
||||||
|
marketing_angle = 'Self-service options for your convenience.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'J2.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Standardize processes. Document procedures.',
|
||||||
|
marketing_angle = 'Consistent quality every single time.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J3.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement order verification. Add accuracy checks.',
|
||||||
|
marketing_angle = 'Accurate orders, no mistakes.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'J3.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Improve system reliability. Add monitoring and alerts.',
|
||||||
|
marketing_angle = 'Reliable systems that are always available.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'J3.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Set clear expectations. Document what to expect.',
|
||||||
|
marketing_angle = 'No surprises - exactly what you expect.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'J3.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement quality checks. Track and reduce errors.',
|
||||||
|
marketing_angle = 'Rare mistakes with quick corrections.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J3.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train problem acknowledgment. Create issue intake process.',
|
||||||
|
marketing_angle = 'We acknowledge issues immediately.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'J4.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Create clear escalation paths. Empower frontline resolution.',
|
||||||
|
marketing_angle = 'Efficient resolution process.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J4.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Set resolution time targets. Prioritize open issues.',
|
||||||
|
marketing_angle = 'Fast resolution when things go wrong.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J4.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Verify fixes before closing. Follow up on resolutions.',
|
||||||
|
marketing_angle = 'Complete solutions, not band-aids.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'J4.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Conduct root cause analysis. Implement systemic fixes.',
|
||||||
|
marketing_angle = 'We fix problems permanently.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'J4.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Increase cleaning frequency. Create cleaning checklists.',
|
||||||
|
marketing_angle = 'Spotlessly clean facilities.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'E1.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement preventive maintenance. Fix issues promptly.',
|
||||||
|
marketing_angle = 'Well-maintained, everything works.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E1.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Redesign layout for flow. Add wayfinding.',
|
||||||
|
marketing_angle = 'Intuitive layout, easy to navigate.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'E1.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Upgrade equipment. Implement replacement schedule.',
|
||||||
|
marketing_angle = 'Modern, state-of-the-art equipment.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'E1.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add clear signage. Use consistent design.',
|
||||||
|
marketing_angle = 'Clear signs and easy navigation.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'E1.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Invest in UX design. Conduct usability testing.',
|
||||||
|
marketing_angle = 'Beautiful, intuitive digital experience.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'E2.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Test all features. Fix bugs promptly.',
|
||||||
|
marketing_angle = 'Everything works, no broken buttons.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E2.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Optimize page load. Improve server response.',
|
||||||
|
marketing_angle = 'Lightning-fast digital experience.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'E2.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Simplify navigation. Reduce menu depth.',
|
||||||
|
marketing_angle = 'Find what you need in seconds.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E2.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Optimize for mobile. Test on all devices.',
|
||||||
|
marketing_angle = 'Works beautifully on any device.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E2.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Design for desired mood. Control sensory elements.',
|
||||||
|
marketing_angle = 'Perfect atmosphere and ambiance.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E3.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add sound absorption. Create quiet zones.',
|
||||||
|
marketing_angle = 'Pleasant sound levels.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E3.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Optimize HVAC. Add zone controls.',
|
||||||
|
marketing_angle = 'Perfect temperature, always comfortable.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E3.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Manage capacity. Control entry rates.',
|
||||||
|
marketing_angle = 'Comfortable, never overcrowded.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E3.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Invest in design. Update decor regularly.',
|
||||||
|
marketing_angle = 'Beautiful, inviting space.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'E3.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Conduct safety audits. Address hazards immediately.',
|
||||||
|
marketing_angle = 'Safety is our top priority.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E4.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Implement hygiene protocols. Train staff on standards.',
|
||||||
|
marketing_angle = 'Highest hygiene standards.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E4.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add security measures. Protect customer property.',
|
||||||
|
marketing_angle = 'Secure environment for you and your belongings.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E4.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Upgrade furniture. Add comfort amenities.',
|
||||||
|
marketing_angle = 'Comfortable facilities for your visit.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E4.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Conduct emergency drills. Mark exits clearly.',
|
||||||
|
marketing_angle = 'Prepared for any emergency.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'E4.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Extend operating hours. Consider 24/7 options.',
|
||||||
|
marketing_angle = 'Open when you need us.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A1.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add online booking. Increase appointment slots.',
|
||||||
|
marketing_angle = 'Easy scheduling, plenty of availability.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A1.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Improve inventory management. Add stock alerts.',
|
||||||
|
marketing_angle = 'Always in stock when you need it.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A1.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Hire additional staff. Optimize scheduling.',
|
||||||
|
marketing_angle = 'Plenty of staff to help you.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'A1.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Expand service area. Add new locations.',
|
||||||
|
marketing_angle = 'Convenient locations near you.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'A1.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add ramps and elevators. Ensure ADA compliance.',
|
||||||
|
marketing_angle = 'Fully accessible for all abilities.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'A2.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add alt text. Ensure screen reader compatibility.',
|
||||||
|
marketing_angle = 'Accessible for visually impaired users.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A2.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add captions and transcripts. Support hearing devices.',
|
||||||
|
marketing_angle = 'Accessible for hearing impaired users.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A2.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Use plain language. Simplify instructions.',
|
||||||
|
marketing_angle = 'Easy to understand for everyone.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'A2.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Test with assistive technologies. Follow WCAG guidelines.',
|
||||||
|
marketing_angle = 'Works with all assistive technologies.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'A2.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Hire multilingual staff. Add translation services.',
|
||||||
|
marketing_angle = 'Service in your language.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A3.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train cultural competency. Celebrate diversity.',
|
||||||
|
marketing_angle = 'Welcoming to all backgrounds.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A3.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Offer dietary alternatives. Train allergy awareness.',
|
||||||
|
marketing_angle = 'Options for all dietary needs.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A3.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add family amenities. Create kid-friendly options.',
|
||||||
|
marketing_angle = 'Great for the whole family.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A3.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Train bias awareness. Audit for fair treatment.',
|
||||||
|
marketing_angle = 'Equal, respectful treatment for all.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A3.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Choose high-traffic location. Improve visibility.',
|
||||||
|
marketing_angle = 'Convenient, easy-to-find location.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'A4.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add parking spaces. Offer validation.',
|
||||||
|
marketing_angle = 'Easy, hassle-free parking.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'A4.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Locate near transit. Add shuttle service.',
|
||||||
|
marketing_angle = 'Easy access by public transit.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'A4.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Accept all payment types. Add mobile pay.',
|
||||||
|
marketing_angle = 'Pay however you prefer.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'A4.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Add contact channels. Reduce hold times.',
|
||||||
|
marketing_angle = 'Easy to reach through any channel.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'A4.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Review pricing strategy. Offer value tiers.',
|
||||||
|
marketing_angle = 'Competitive, fair pricing.',
|
||||||
|
solution_complexity = 'complex'
|
||||||
|
WHERE code = 'V1.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Benchmark against expectations. Communicate value.',
|
||||||
|
marketing_angle = 'Pricing that matches expectations.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V1.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Conduct competitor analysis. Justify premium or match.',
|
||||||
|
marketing_angle = 'Competitive with the market.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V1.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Display ALL fees upfront. Eliminate surprise charges.',
|
||||||
|
marketing_angle = 'Complete price transparency - no hidden fees.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'V1.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Offer payment plans. Add financing options.',
|
||||||
|
marketing_angle = 'Flexible payment options available.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V1.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Create clear price lists. Explain pricing structure.',
|
||||||
|
marketing_angle = 'Clear, easy-to-understand pricing.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'V2.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'List all fees upfront. Include in quotes.',
|
||||||
|
marketing_angle = 'Full disclosure of all charges.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'V2.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Audit marketing claims. Ensure accuracy.',
|
||||||
|
marketing_angle = 'Honest, accurate advertising.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'V2.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Simplify contracts. Highlight key terms.',
|
||||||
|
marketing_angle = 'Fair, straightforward terms.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V2.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Verify all claims. Provide evidence.',
|
||||||
|
marketing_angle = 'Honest representation of our services.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'V2.05';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Streamline processes. Reduce customer time.',
|
||||||
|
marketing_angle = 'Quick and easy, respecting your time.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V3.01';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Simplify decisions. Provide guidance.',
|
||||||
|
marketing_angle = 'Easy decisions, minimal stress.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V3.02';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Offer delivery/pickup. Reduce physical burden.',
|
||||||
|
marketing_angle = 'Convenient, minimal effort required.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V3.03';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Reduce friction points. Improve processes.',
|
||||||
|
marketing_angle = 'Smooth, hassle-free experience.',
|
||||||
|
solution_complexity = 'medium'
|
||||||
|
WHERE code = 'V3.04';
|
||||||
|
|
||||||
|
UPDATE pipeline.urt_subcodes
|
||||||
|
SET solution = 'Demonstrate value clearly. Compare alternatives.',
|
||||||
|
marketing_angle = 'Worth every moment of your time.',
|
||||||
|
solution_complexity = 'simple'
|
||||||
|
WHERE code = 'V3.05';
|
||||||
|
|
||||||
|
-- V4.01 - V4.05: attach remediation guidance, marketing copy and effort estimate.
-- Each statement targets exactly one subcode by its code.
-- NOTE(review): migration 012 in this repo lists V4.04/V4.05 as missing and upserts them
-- under billing-related names; an UPDATE on a non-existent code silently affects zero rows -- confirm ordering.
UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Communicate value proposition. Demonstrate ROI.',
        'Exceptional value for your investment.',
        'medium')
 WHERE code = 'V4.01';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Ensure quality matches price. Add value-adds.',
        'Quality that justifies the price.',
        'medium')
 WHERE code = 'V4.02';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Track satisfaction. Follow up post-purchase.',
        'Complete satisfaction guaranteed.',
        'medium')
 WHERE code = 'V4.03';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Encourage referrals. Make sharing easy.',
        'So good, you''ll tell your friends.',
        'simple')
 WHERE code = 'V4.04';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Build loyalty program. Reward returns.',
        'Worth coming back for, again and again.',
        'medium')
 WHERE code = 'V4.05';
|
||||||
|
|
||||||
|
-- R1.01 - R1.05: attach remediation guidance, marketing copy and effort estimate.
-- Each statement targets exactly one subcode by its code.
-- NOTE(review): migration 012 in this repo lists R1.04/R1.05 as missing and upserts them;
-- an UPDATE on a non-existent code silently affects zero rows -- confirm ordering.
UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Train honest communication. Build trust culture.',
        'Complete honesty and transparency.',
        'medium')
 WHERE code = 'R1.01';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Document commitments. Track promises made.',
        'We always keep our promises.',
        'simple')
 WHERE code = 'R1.02';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Share policies openly. Communicate changes.',
        'Open and transparent in everything we do.',
        'simple')
 WHERE code = 'R1.03';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Define ethical standards. Train compliance.',
        'Ethical business practices you can trust.',
        'medium')
 WHERE code = 'R1.04';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Ensure consistent treatment. Audit fairness.',
        'Fair dealing with every customer.',
        'medium')
 WHERE code = 'R1.05';
|
||||||
|
|
||||||
|
-- R2.01 - R2.05: attach remediation guidance, marketing copy and effort estimate.
-- Each statement targets exactly one subcode by its code.
-- NOTE(review): migration 012 in this repo lists R2.04/R2.05 as missing and upserts them;
-- an UPDATE on a non-existent code silently affects zero rows -- confirm ordering.
UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Track customer history. Learn from patterns.',
        'Proven track record of excellence.',
        'medium')
 WHERE code = 'R2.01';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Standardize experience. Reduce variation.',
        'Consistent excellence, every visit.',
        'medium')
 WHERE code = 'R2.02';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Communicate changes. Maintain core values.',
        'Stable and reliable, year after year.',
        'medium')
 WHERE code = 'R2.03';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Build trust incrementally. Honor commitments.',
        'A business you can trust completely.',
        'medium')
 WHERE code = 'R2.04';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Honor warranties promptly. Exceed guarantees.',
        'We stand behind our guarantees.',
        'medium')
 WHERE code = 'R2.05';
|
||||||
|
|
||||||
|
-- R3.01 - R3.05: attach remediation guidance, marketing copy and effort estimate.
-- Each statement targets exactly one subcode by its code.
-- NOTE(review): migration 012 in this repo lists R3.04/R3.05 as missing and upserts them
-- under different names; an UPDATE on a non-existent code silently affects zero rows -- confirm ordering.
UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Train admission of mistakes. Empower acknowledgment.',
        'We own our mistakes.',
        'simple')
 WHERE code = 'R3.01';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Develop sincere apology training. Show genuine regret.',
        'Genuine apologies when things go wrong.',
        'simple')
 WHERE code = 'R3.02';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Develop compensation policies. Empower service recovery.',
        'We make things right with meaningful gestures.',
        'medium')
 WHERE code = 'R3.03';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Conduct post-mortem reviews. Implement learnings.',
        'We continuously improve from feedback.',
        'medium')
 WHERE code = 'R3.04';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Train ownership mentality. Remove blame culture.',
        'Full accountability when issues arise.',
        'medium')
 WHERE code = 'R3.05';
|
||||||
|
|
||||||
|
-- R4.01 - R4.05: attach remediation guidance, marketing copy and effort estimate.
-- Each statement targets exactly one subcode by its code.
-- NOTE(review): migration 012 in this repo lists R4.04/R4.05 as missing and upserts them
-- under different names; an UPDATE on a non-existent code silently affects zero rows -- confirm ordering.
UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Implement CRM. Train staff on customer history.',
        'We remember you and value your loyalty.',
        'medium')
 WHERE code = 'R4.01';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Create meaningful loyalty program. Offer real value.',
        'Rewarding loyalty with meaningful perks.',
        'medium')
 WHERE code = 'R4.02';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Train relationship building. Encourage personal connections.',
        'More than transactions - real relationships.',
        'medium')
 WHERE code = 'R4.03';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Personalize communications. Add value in outreach.',
        'Helpful updates, not just promotions.',
        'medium')
 WHERE code = 'R4.04';

UPDATE pipeline.urt_subcodes
   SET (solution, marketing_angle, solution_complexity) =
       ('Build community events. Create belonging.',
        'Part of our community.',
        'medium')
 WHERE code = 'R4.05';
|
||||||
|
|
||||||
|
-- End the transaction (presumably opened by a BEGIN earlier in this migration -- not visible here).
COMMIT;

-- Verify updates: sample the first ten populated subcodes in code order,
-- showing only the leading 50 characters of each solution text.
SELECT
    code,
    name,
    solution_complexity,
    LEFT(solution, 50) AS solution_preview
FROM
    pipeline.urt_subcodes
WHERE
    solution IS NOT NULL
ORDER BY
    code
LIMIT 10;
|
||||||
102
migrations/versions/012_sync_urt_subcodes_from_taxonomy.sql
Normal file
102
migrations/versions/012_sync_urt_subcodes_from_taxonomy.sql
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
-- =============================================================================
|
||||||
|
-- Migration: 012_sync_urt_subcodes_from_taxonomy.sql
|
||||||
|
-- Purpose: Sync missing URT subcodes from taxonomy v5.1 to database
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Upsert the x.04 / x.05 subcodes for every category.
-- New codes are inserted; for codes that already exist only the descriptive columns
-- (name, definition, positive_example, negative_example) are overwritten --
-- category_code and domain_code of an existing row are left as they are.
-- NOTE(review): migration 011 in this repo assigns solutions to several of these codes
-- (e.g. V4.04, R4.04) whose wording suggests different subcode meanings than the names
-- written here -- confirm which taxonomy is authoritative.
INSERT INTO pipeline.urt_subcodes
    (code, category_code, domain_code, name, definition, positive_example, negative_example)
VALUES
    -- Domain J ----------------------------------------------------------------
    -- J1: Wait Times
    ('J1.04', 'J1', 'J', 'Punctuality',
        'Meeting scheduled times', 'Always on time', 'Two hours late'),
    ('J1.05', 'J1', 'J', 'Pacing',
        'Appropriate speed (not rushed/dragged)', 'Perfect pace throughout', 'Felt rushed through everything'),
    -- J2: Booking & Reservations
    ('J2.04', 'J2', 'J', 'Booking Availability',
        'Slots/capacity when needed', 'Always available slots', 'Fully booked for weeks'),
    ('J2.05', 'J2', 'J', 'Inventory',
        'Stock availability', 'Always in stock', 'Out of stock constantly'),
    -- J3: System Reliability
    ('J3.04', 'J3', 'J', 'Data Accuracy',
        'Correct info in systems', 'All details correct', 'Wrong info in my account'),
    ('J3.05', 'J3', 'J', 'Integration',
        'Systems work together', 'Seamless between channels', 'Info doesn''t sync'),
    -- J4: Problem Resolution
    ('J4.04', 'J4', 'J', 'Escalation',
        'Getting to right person', 'Quickly got to manager', 'Endless transfers'),
    ('J4.05', 'J4', 'J', 'Closure',
        'Issue fully resolved', 'Problem completely solved', 'Issue still not fixed'),

    -- Domain A ----------------------------------------------------------------
    -- A1: Physical Access
    ('A1.04', 'A1', 'A', 'Wayfinding',
        'Finding destination', 'Easy to find', 'Got lost trying to find it'),
    ('A1.05', 'A1', 'A', 'Physical Accessibility',
        'Disability accommodations', 'Wheelchair accessible', 'No ramps or elevators'),
    -- A2: Channel Access
    ('A2.04', 'A2', 'A', 'Language Accessibility',
        'Multilingual support', 'Available in my language', 'No translation available'),
    ('A2.05', 'A2', 'A', 'Hours of Operation',
        'Service availability times', 'Open when needed', 'Terrible hours'),
    -- A3: Information Access
    ('A3.04', 'A3', 'A', 'Documentation Clarity',
        'Clear instructions', 'Easy to understand docs', 'Confusing instructions'),
    ('A3.05', 'A3', 'A', 'Support Accessibility',
        'Getting help when needed', 'Easy to reach support', 'Impossible to get help'),
    -- A4: Financial Access
    ('A4.04', 'A4', 'A', 'Payment Flexibility',
        'Multiple payment options', 'Many payment options', 'Only accepts cash'),
    ('A4.05', 'A4', 'A', 'Refund Accessibility',
        'Getting money back', 'Easy refund process', 'Impossible to get refund'),

    -- Domain E ----------------------------------------------------------------
    -- E1: Physical Environment
    ('E1.04', 'E1', 'E', 'Ambiance',
        'Atmosphere and vibe', 'Great atmosphere', 'Depressing environment'),
    ('E1.05', 'E1', 'E', 'Comfort',
        'Physical comfort', 'Very comfortable', 'Uncomfortable seating'),
    -- E2: Digital Environment
    ('E2.04', 'E2', 'E', 'Visual Design',
        'Aesthetics of interface', 'Beautiful design', 'Ugly interface'),
    ('E2.05', 'E2', 'E', 'Mobile Experience',
        'Mobile usability', 'Great mobile app', 'Terrible mobile site'),
    -- E3: Safety & Security
    ('E3.04', 'E3', 'E', 'Health Safety',
        'Health precautions', 'Very clean and safe', 'Unsanitary conditions'),
    ('E3.05', 'E3', 'E', 'Cyber Security',
        'Digital security', 'Secure platform', 'Got hacked'),
    -- E4: Sustainability
    ('E4.04', 'E4', 'E', 'Social Responsibility',
        'Ethical practices', 'Ethical company', 'Exploitative practices'),
    ('E4.05', 'E4', 'E', 'Community Impact',
        'Local community effect', 'Supports local community', 'Hurts local businesses'),

    -- Domain V ----------------------------------------------------------------
    -- V1: Pricing
    ('V1.04', 'V1', 'V', 'Price Transparency',
        'Clear pricing', 'Clear pricing upfront', 'Hidden costs everywhere'),
    ('V1.05', 'V1', 'V', 'Price Stability',
        'Consistent pricing', 'Same price always', 'Prices keep changing'),
    -- V2: Value Perception
    ('V2.04', 'V2', 'V', 'Quality-Price Ratio',
        'Worth vs cost', 'Excellent quality for price', 'Overpriced for quality'),
    ('V2.05', 'V2', 'V', 'Competitive Value',
        'Compared to alternatives', 'Best value around', 'Better deals elsewhere'),
    -- V3: Promotions
    ('V3.04', 'V3', 'V', 'Promotion Clarity',
        'Clear offer terms', 'Clear promotion rules', 'Misleading promotions'),
    ('V3.05', 'V3', 'V', 'Reward Redemption',
        'Using points/rewards', 'Easy to redeem rewards', 'Hard to use points'),
    -- V4: Billing
    ('V4.04', 'V4', 'V', 'Billing Accuracy',
        'Correct charges', 'Always billed correctly', 'Overcharged constantly'),
    ('V4.05', 'V4', 'V', 'Billing Resolution',
        'Fixing billing issues', 'Quick billing fix', 'Billing disputes ignored'),

    -- Domain R ----------------------------------------------------------------
    -- R1: Trust
    ('R1.04', 'R1', 'R', 'Ethics',
        'Ethical behavior', 'Very ethical company', 'Unethical practices'),
    ('R1.05', 'R1', 'R', 'Accountability',
        'Taking responsibility', 'Owned their mistakes', 'Never takes blame'),
    -- R2: Reliability
    ('R2.04', 'R2', 'R', 'Predictability',
        'Consistent experience', 'Always know what to expect', 'Every visit is different'),
    ('R2.05', 'R2', 'R', 'Standards',
        'Meeting quality standards', 'High standards maintained', 'Standards have dropped'),
    -- R3: Care
    ('R3.04', 'R3', 'R', 'Personal Connection',
        'Human touch', 'Felt like family', 'Treated like a number'),
    ('R3.05', 'R3', 'R', 'Going Extra Mile',
        'Beyond expectations', 'Went above and beyond', 'Minimum effort only'),
    -- R4: Recovery
    ('R4.04', 'R4', 'R', 'Service Recovery',
        'Making things right', 'Fixed problem perfectly', 'Made it worse'),
    ('R4.05', 'R4', 'R', 'Feedback Response',
        'Acting on feedback', 'Implemented my suggestion', 'Feedback ignored')
ON CONFLICT (code) DO UPDATE
    SET (name, definition, positive_example, negative_example) =
        (EXCLUDED.name, EXCLUDED.definition, EXCLUDED.positive_example, EXCLUDED.negative_example);
|
||||||
|
|
||||||
|
-- Verify count: emit a NOTICE with the total number of rows in
-- pipeline.urt_subcodes once the upsert above has run. Informational only;
-- nothing is asserted and no data is changed.
DO $$
BEGIN
    RAISE NOTICE 'Total subcodes after sync: %',
        (SELECT COUNT(*) FROM pipeline.urt_subcodes);
END $$;
|
||||||
411
migrations/versions/013_insert_primitives.sql
Normal file
411
migrations/versions/013_insert_primitives.sql
Normal file
@@ -0,0 +1,411 @@
|
|||||||
|
-- Migration: Insert frozen primitive dictionary (36 primitives)
|
||||||
|
-- Description: Populates the primitives table with the complete URT taxonomy
|
||||||
|
-- Date: 2025-01-31
|
||||||
|
|
||||||
|
-- Quality dimension (8 primitives)
-- Seeded with one multi-row statement so the group is applied atomically, and with
-- ON CONFLICT DO NOTHING so re-running the migration skips rows that already exist
-- instead of aborting on a duplicate key.
-- NOTE(review): the signal lists are written as JSON-array text ('["a", "b"]').
-- Migration 014 in this repo declares base_positive_signals / base_negative_signals
-- as TEXT[], whose literal syntax is '{"a","b"}' -- confirm the real column type.
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('EFFECTIVENESS', 'QUALITY', 'Effectiveness', 'Did it achieve its intended purpose?', FALSE,
        '["worked perfectly", "exactly what I needed", "solved my problem"]',
        '["didn''t work", "useless", "waste of time"]'),
    ('TASTE', 'QUALITY', 'Taste', 'Sensory quality (flavor, texture, smell)', FALSE,
        '["delicious", "amazing taste", "flavorful"]',
        '["bland", "tasteless", "disgusting"]'),
    ('CRAFT', 'QUALITY', 'Craft', 'Skill of execution, workmanship', FALSE,
        '["well-made", "professional", "quality work"]',
        '["sloppy", "poorly made", "amateur"]'),
    ('ACCURACY', 'QUALITY', 'Accuracy', 'Correct as ordered/specified', FALSE,
        '["exactly what I ordered", "perfect", "got everything right"]',
        '["wrong order", "missing items", "not what I asked for"]'),
    ('FRESHNESS', 'QUALITY', 'Freshness', 'Fresh vs stale/expired', FALSE,
        '["fresh", "just made", "new"]',
        '["stale", "old", "expired"]'),
    ('TEMPERATURE', 'QUALITY', 'Temperature', 'Appropriate temperature for the item', FALSE,
        '["hot", "perfect temperature", "cold as it should be"]',
        '["cold", "lukewarm", "too hot", "room temperature"]'),
    ('CONDITION', 'QUALITY', 'Condition', 'Physical state, damage, defects', FALSE,
        '["perfect condition", "like new", "undamaged"]',
        '["damaged", "broken", "defective"]'),
    ('CONSISTENCY', 'QUALITY', 'Consistency', 'Same quality across visits/units', FALSE,
        '["always consistent", "reliable quality", "same every time"]',
        '["inconsistent", "hit or miss", "varies"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Service dimension (4 primitives)
-- One multi-row statement keeps the group atomic; ON CONFLICT DO NOTHING makes the
-- seed re-runnable (existing rows are left untouched rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('MANNER', 'SERVICE', 'Manner', 'Warmth, respect, patience in interactions', FALSE,
        '["friendly", "nice", "welcoming", "patient"]',
        '["rude", "dismissive", "impatient", "attitude"]'),
    ('COMPETENCE', 'SERVICE', 'Competence', 'Knowledge and skill of staff', FALSE,
        '["knowledgeable", "professional", "knew what they were doing"]',
        '["clueless", "incompetent", "didn''t know"]'),
    ('ATTENTIVENESS', 'SERVICE', 'Attentiveness', 'Present, notices needs, proactive', FALSE,
        '["attentive", "checked on us", "anticipated needs"]',
        '["ignored", "had to flag down", "neglected"]'),
    ('COMMUNICATION', 'SERVICE', 'Communication', 'Clear, listens, keeps informed', FALSE,
        '["clear", "good communication", "kept us updated"]',
        '["confusing", "didn''t listen", "no updates"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Process dimension (4 primitives)
-- One multi-row statement keeps the group atomic; ON CONFLICT DO NOTHING makes the
-- seed re-runnable (existing rows are left untouched rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('SPEED', 'PROCESS', 'Speed', 'How fast/slow things happen', FALSE,
        '["fast", "quick", "no wait"]',
        '["slow", "took forever", "long wait"]'),
    ('FRICTION', 'PROCESS', 'Friction', 'Ease vs obstacles in the process', FALSE,
        '["easy", "smooth", "hassle-free"]',
        '["complicated", "difficult", "hassle"]'),
    ('RELIABILITY', 'PROCESS', 'Reliability', 'Process works consistently, no errors', FALSE,
        '["reliable", "dependable", "always works"]',
        '["unreliable", "errors", "problems"]'),
    ('AVAILABILITY', 'PROCESS', 'Availability', 'Hours, capacity, stock availability', FALSE,
        '["always available", "open when needed", "in stock"]',
        '["closed", "sold out", "no appointments"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Environment dimension (6 primitives)
-- One multi-row statement keeps the group atomic; ON CONFLICT DO NOTHING makes the
-- seed re-runnable (existing rows are left untouched rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('CLEANLINESS', 'ENVIRONMENT', 'Cleanliness', 'Clean, sanitary conditions', FALSE,
        '["clean", "spotless", "hygienic"]',
        '["dirty", "filthy", "unsanitary"]'),
    ('COMFORT', 'ENVIRONMENT', 'Comfort', 'Physical comfort of the space', FALSE,
        '["comfortable", "cozy", "spacious"]',
        '["uncomfortable", "cramped", "hard seats"]'),
    ('AMBIANCE', 'ENVIRONMENT', 'Ambiance', 'Vibe, atmosphere, noise level', FALSE,
        '["nice atmosphere", "great vibe", "quiet"]',
        '["loud", "noisy", "bad atmosphere"]'),
    ('SAFETY', 'ENVIRONMENT', 'Safety', 'Physical and health safety', FALSE,
        '["safe", "secure", "clean protocols"]',
        '["unsafe", "dangerous", "health hazard"]'),
    ('ACCESSIBILITY', 'ENVIRONMENT', 'Accessibility', 'Disability access, location convenience', FALSE,
        '["accessible", "easy to get to", "good parking"]',
        '["hard to access", "no parking", "not wheelchair accessible"]'),
    ('DIGITAL_UX', 'ENVIRONMENT', 'Digital UX', 'App/website usability and performance', FALSE,
        '["easy to use", "great app", "fast website"]',
        '["app crashed", "hard to navigate", "slow website"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Value dimension (4 primitives)
-- One multi-row statement keeps the group atomic; ON CONFLICT DO NOTHING makes the
-- seed re-runnable (existing rows are left untouched rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('PRICE_LEVEL', 'VALUE', 'Price Level', 'Absolute cost perception', FALSE,
        '["affordable", "cheap", "good prices"]',
        '["expensive", "overpriced", "pricey"]'),
    ('PRICE_FAIRNESS', 'VALUE', 'Price Fairness', 'Fair value for what was received', FALSE,
        '["fair price", "worth it", "good value"]',
        '["rip off", "not worth it", "overcharged"]'),
    ('PRICE_TRANSPARENCY', 'VALUE', 'Price Transparency', 'Clear pricing, no surprises', FALSE,
        '["clear pricing", "no hidden fees", "upfront"]',
        '["hidden fees", "surprise charges", "bait and switch"]'),
    ('VALUE_FOR_MONEY', 'VALUE', 'Value for Money', 'Overall worth judgment', FALSE,
        '["great value", "worth every penny", "good deal"]',
        '["bad value", "waste of money", "not worth it"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Trust dimension (3 meta primitives)
-- All rows carry is_meta = TRUE.
-- One multi-row statement keeps the group atomic; ON CONFLICT DO NOTHING makes the
-- seed re-runnable (existing rows are left untouched rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('HONESTY', 'TRUST', 'Honesty', 'Truthful, no deception', TRUE,
        '["honest", "transparent", "truthful"]',
        '["lied", "deceived", "dishonest", "scam"]'),
    ('ETHICS', 'TRUST', 'Ethics', 'Ethical, fair dealing', TRUE,
        '["ethical", "fair", "integrity"]',
        '["unethical", "shady", "crooked"]'),
    ('PROMISES', 'TRUST', 'Promises', 'Kept or broken commitments', TRUE,
        '["kept their word", "delivered as promised", "reliable"]',
        '["broke promise", "didn''t deliver", "false advertising"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Resolution dimension (3 meta primitives)
-- All rows carry is_meta = TRUE.
-- One multi-row statement keeps the group atomic; ON CONFLICT DO NOTHING makes the
-- seed re-runnable (existing rows are left untouched rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('ACKNOWLEDGMENT', 'RESOLUTION', 'Acknowledgment', 'Recognized the problem', TRUE,
        '["acknowledged", "apologized", "admitted mistake"]',
        '["denied", "dismissed", "blamed me"]'),
    ('RESPONSE_QUALITY', 'RESOLUTION', 'Response Quality', 'How well they handled the issue', TRUE,
        '["handled well", "resolved quickly", "took care of it"]',
        '["ignored complaint", "unhelpful", "made it worse"]'),
    ('RECOVERY', 'RESOLUTION', 'Recovery', 'Made it right, compensation', TRUE,
        '["made it right", "refunded", "compensated"]',
        '["refused refund", "no compensation", "wouldn''t fix"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Loyalty dimension (3 meta primitives)
-- All rows carry is_meta = TRUE.
-- One multi-row statement keeps the group atomic; ON CONFLICT DO NOTHING makes the
-- seed re-runnable (existing rows are left untouched rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('RETURN_INTENT', 'LOYALTY', 'Return Intent', 'Will/won''t come back', TRUE,
        '["will be back", "returning customer", "coming again"]',
        '["never again", "won''t return", "last time"]'),
    ('RECOMMEND', 'LOYALTY', 'Recommend', 'Would/wouldn''t recommend', TRUE,
        '["highly recommend", "tell everyone", "must try"]',
        '["avoid", "don''t go", "stay away", "wouldn''t recommend"]'),
    ('RECOGNITION', 'LOYALTY', 'Recognition', 'Felt valued, remembered', TRUE,
        '["remembered me", "felt valued", "personal touch"]',
        '["treated like a number", "didn''t care", "no loyalty"]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Escape dimension (1 meta primitive)
-- Catch-all entry for content that does not fit the taxonomy; both signal lists are empty.
-- ON CONFLICT DO NOTHING makes the seed re-runnable (an existing row is left untouched
-- rather than raising a duplicate-key error).
INSERT INTO pipeline.primitives
    (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
    ('UNMAPPED', 'ESCAPE', 'Unmapped', 'Does not fit taxonomy; preserve evidence', TRUE,
        '[]',
        '[]')
ON CONFLICT DO NOTHING;
|
||||||
|
|
||||||
|
-- Verify count
|
||||||
|
-- SELECT COUNT(*) FROM pipeline.primitives; -- Should return 36
|
||||||
561
migrations/versions/014_primitive_classification_system.sql
Normal file
561
migrations/versions/014_primitive_classification_system.sql
Normal file
@@ -0,0 +1,561 @@
|
|||||||
|
-- =============================================================================
|
||||||
|
-- Migration: 014_primitive_classification_system.sql
|
||||||
|
-- Purpose: Create primitive classification system for context-aware review analysis
|
||||||
|
-- =============================================================================
|
||||||
|
--
|
||||||
|
-- This migration introduces a "primitive" classification system that allows
|
||||||
|
-- industry-specific and category-specific configuration of what aspects to
|
||||||
|
-- look for when classifying reviews.
|
||||||
|
--
|
||||||
|
-- Components:
|
||||||
|
-- 1. pipeline.primitives - Frozen dictionary of primitives (quality dimensions)
|
||||||
|
-- 2. ALTER public.gbp_categories - Add primitive_configs and business_context
|
||||||
|
-- 3. pipeline.jsonb_deep_merge() - Recursive JSONB merge function
|
||||||
|
-- 4. pipeline.resolve_primitive_config() - Resolve configs through category tree
|
||||||
|
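-- 5. pipeline.get_classification_context() - Get full classification context
-- 6. pipeline.resolve_business_context() - Resolve business context through category tree
-- 7. pipeline.get_active_primitives() - List active primitive codes for a category path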
|
||||||
|
--
|
||||||
|
-- Date: 2026-01-31
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 1: PRIMITIVES TABLE (Frozen Dictionary)
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Dictionary table of classification primitives, keyed by primitive code.
-- NOTE(review): another migration in this repository seeds pipeline.primitives with
-- JSON-array string literals (e.g. '["will be back", ...]') for the two signal columns
-- and with upper-case dimension values (e.g. 'LOYALTY'). Such literals are not valid
-- TEXT[] input, so that migration presumably defines these columns differently. If the
-- table already exists, IF NOT EXISTS turns this definition into a no-op and the
-- ARRAY[...] seed later in this file may not match the existing column types.
-- TODO confirm that the two definitions agree.
CREATE TABLE IF NOT EXISTS pipeline.primitives (
|
||||||
|
code VARCHAR(30) PRIMARY KEY,
|
||||||
|
dimension VARCHAR(20) NOT NULL, -- quality, service, process, environment, value, trust, resolution, loyalty, escape
|
||||||
|
name VARCHAR(100) NOT NULL,
|
||||||
|
definition TEXT NOT NULL,
|
||||||
|
is_meta BOOLEAN DEFAULT FALSE, -- true for always-active primitives (HONESTY, ETHICS, etc.)
|
||||||
|
base_positive_signals TEXT[],
|
||||||
|
base_negative_signals TEXT[],
|
||||||
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Index on dimension for filtering
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_primitives_dimension ON pipeline.primitives(dimension);
|
||||||
|
|
||||||
|
-- Index on is_meta for quick access to always-active primitives
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_primitives_is_meta ON pipeline.primitives(is_meta) WHERE is_meta = TRUE;
|
||||||
|
|
||||||
|
COMMENT ON TABLE pipeline.primitives IS 'Frozen dictionary of classification primitives (quality dimensions)';
|
||||||
|
COMMENT ON COLUMN pipeline.primitives.code IS 'Unique identifier for the primitive (e.g., FOOD_TASTE, SERVICE_SPEED)';
|
||||||
|
COMMENT ON COLUMN pipeline.primitives.dimension IS 'Category of the primitive (quality, service, process, etc.)';
|
||||||
|
COMMENT ON COLUMN pipeline.primitives.is_meta IS 'If true, this primitive is always active regardless of category config';
|
||||||
|
COMMENT ON COLUMN pipeline.primitives.base_positive_signals IS 'Default positive signal keywords for this primitive';
|
||||||
|
COMMENT ON COLUMN pipeline.primitives.base_negative_signals IS 'Default negative signal keywords for this primitive';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 2: ALTER gbp_categories TABLE
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Add primitive_configs column (JSONB for flexible config)
|
||||||
|
ALTER TABLE public.gbp_categories
|
||||||
|
ADD COLUMN IF NOT EXISTS primitive_configs JSONB DEFAULT '{}';
|
||||||
|
|
||||||
|
-- Add business_context column (JSONB for industry-specific context)
|
||||||
|
ALTER TABLE public.gbp_categories
|
||||||
|
ADD COLUMN IF NOT EXISTS business_context JSONB DEFAULT '{}';
|
||||||
|
|
||||||
|
-- Add config versioning columns
|
||||||
|
ALTER TABLE public.gbp_categories
|
||||||
|
ADD COLUMN IF NOT EXISTS config_version VARCHAR(20);
|
||||||
|
|
||||||
|
ALTER TABLE public.gbp_categories
|
||||||
|
ADD COLUMN IF NOT EXISTS config_generated_by VARCHAR(100);
|
||||||
|
|
||||||
|
ALTER TABLE public.gbp_categories
|
||||||
|
ADD COLUMN IF NOT EXISTS config_updated_at TIMESTAMP WITH TIME ZONE;
|
||||||
|
|
||||||
|
-- GIN indexes for JSONB containment queries
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_gbp_categories_primitive_configs
|
||||||
|
ON public.gbp_categories USING GIN (primitive_configs);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_gbp_categories_business_context
|
||||||
|
ON public.gbp_categories USING GIN (business_context);
|
||||||
|
|
||||||
|
-- Index for config version lookups
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_gbp_categories_config_version
|
||||||
|
ON public.gbp_categories(config_version) WHERE config_version IS NOT NULL;
|
||||||
|
|
||||||
|
COMMENT ON COLUMN public.gbp_categories.primitive_configs IS 'JSONB config for primitives at this category level (inherits from ancestors)';
|
||||||
|
COMMENT ON COLUMN public.gbp_categories.business_context IS 'JSONB business context for this category (industry-specific terminology, etc.)';
|
||||||
|
COMMENT ON COLUMN public.gbp_categories.config_version IS 'Version of the primitive config (for cache invalidation)';
|
||||||
|
COMMENT ON COLUMN public.gbp_categories.config_generated_by IS 'Tool/model that generated this config';
|
||||||
|
COMMENT ON COLUMN public.gbp_categories.config_updated_at IS 'When the config was last updated';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 3: JSONB DEEP MERGE FUNCTION
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Recursive JSONB merge function
|
||||||
|
-- - Objects: recursively merge (override wins on conflicts)
|
||||||
|
-- - Arrays stored under the same key in both objects: union with dedup (element order is not guaranteed)
|
||||||
|
-- - If object has "__replace__": true, replace entirely instead of merge
|
||||||
|
CREATE OR REPLACE FUNCTION pipeline.jsonb_deep_merge(
|
||||||
|
base JSONB,
|
||||||
|
override JSONB
|
||||||
|
) RETURNS JSONB AS $$
|
||||||
|
DECLARE
|
||||||
|
result JSONB;
|
||||||
|
key TEXT;
|
||||||
|
base_value JSONB;
|
||||||
|
override_value JSONB;
|
||||||
|
merged_array JSONB;
|
||||||
|
BEGIN
|
||||||
|
-- Handle NULL cases
|
||||||
|
IF base IS NULL THEN
|
||||||
|
RETURN override;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
IF override IS NULL THEN
|
||||||
|
RETURN base;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- If override has __replace__ flag, return override without the flag
|
||||||
|
IF jsonb_typeof(override) = 'object' AND override ? '__replace__' AND (override->>'__replace__')::boolean = true THEN
|
||||||
|
RETURN override - '__replace__';
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- If either side is not an object, override wins
|
||||||
|
IF jsonb_typeof(base) != 'object' OR jsonb_typeof(override) != 'object' THEN
|
||||||
|
RETURN override;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- Both are objects, merge recursively
|
||||||
|
result := base;
|
||||||
|
|
||||||
|
FOR key IN SELECT jsonb_object_keys(override)
|
||||||
|
LOOP
|
||||||
|
override_value := override->key;
|
||||||
|
|
||||||
|
IF NOT (base ? key) THEN
|
||||||
|
-- Key doesn't exist in base, just add it
|
||||||
|
result := result || jsonb_build_object(key, override_value);
|
||||||
|
ELSE
|
||||||
|
base_value := base->key;
|
||||||
|
|
||||||
|
-- Check for __replace__ flag in the override value
|
||||||
|
IF jsonb_typeof(override_value) = 'object'
|
||||||
|
AND override_value ? '__replace__'
|
||||||
|
AND (override_value->>'__replace__')::boolean = true THEN
|
||||||
|
-- Replace entirely (without the __replace__ flag)
|
||||||
|
result := result || jsonb_build_object(key, override_value - '__replace__');
|
||||||
|
|
||||||
|
-- If both are objects, recurse
|
||||||
|
ELSIF jsonb_typeof(base_value) = 'object' AND jsonb_typeof(override_value) = 'object' THEN
|
||||||
|
result := result || jsonb_build_object(
|
||||||
|
key,
|
||||||
|
pipeline.jsonb_deep_merge(base_value, override_value)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- If both are arrays, union with dedup
|
||||||
|
ELSIF jsonb_typeof(base_value) = 'array' AND jsonb_typeof(override_value) = 'array' THEN
|
||||||
|
-- Union arrays, remove duplicates
|
||||||
|
-- Using a subquery to deduplicate
|
||||||
|
SELECT jsonb_agg(DISTINCT elem)
|
||||||
|
INTO merged_array
|
||||||
|
FROM (
|
||||||
|
SELECT jsonb_array_elements(base_value) AS elem
|
||||||
|
UNION
|
||||||
|
SELECT jsonb_array_elements(override_value) AS elem
|
||||||
|
) AS combined;
|
||||||
|
|
||||||
|
result := result || jsonb_build_object(key, COALESCE(merged_array, '[]'::jsonb));
|
||||||
|
|
||||||
|
-- Otherwise, override wins
|
||||||
|
ELSE
|
||||||
|
result := result || jsonb_build_object(key, override_value);
|
||||||
|
END IF;
|
||||||
|
END IF;
|
||||||
|
END LOOP;
|
||||||
|
|
||||||
|
RETURN result;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql IMMUTABLE;
|
||||||
|
|
||||||
|
COMMENT ON FUNCTION pipeline.jsonb_deep_merge(JSONB, JSONB) IS
|
||||||
|
'Recursively merges two JSONB objects. Objects are merged recursively (override wins on conflicts).
|
||||||
|
Arrays are unioned with dedup. Use {"__replace__": true, ...} to replace instead of merge.';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 4: RESOLVE PRIMITIVE CONFIG FUNCTION
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Resolves primitive config by merging ancestor configs (general -> specific)
|
||||||
|
CREATE OR REPLACE FUNCTION pipeline.resolve_primitive_config(
|
||||||
|
p_path ltree
|
||||||
|
) RETURNS JSONB AS $$
|
||||||
|
DECLARE
|
||||||
|
result JSONB := '{}';
|
||||||
|
row_config JSONB;
|
||||||
|
BEGIN
|
||||||
|
-- Fetch every category row whose path is an ancestor of (or equal to) p_path, ordered by level ASC (general -> specific)
|
||||||
|
-- Uses the ltree <@ operator: p_path <@ path is true when p_path is a descendant of (or equal to) the row's path
|
||||||
|
FOR row_config IN
|
||||||
|
SELECT primitive_configs
|
||||||
|
FROM public.gbp_categories
|
||||||
|
WHERE p_path <@ path -- p_path is descendant of or equal to path
|
||||||
|
ORDER BY level ASC
|
||||||
|
LOOP
|
||||||
|
-- Skip NULL or empty configs; non-empty ones are merged on top of the accumulated result, so later (more specific) levels win
|
||||||
|
IF row_config IS NOT NULL AND row_config != '{}' THEN
|
||||||
|
result := pipeline.jsonb_deep_merge(result, row_config);
|
||||||
|
END IF;
|
||||||
|
END LOOP;
|
||||||
|
|
||||||
|
RETURN result;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
COMMENT ON FUNCTION pipeline.resolve_primitive_config(ltree) IS
|
||||||
|
'Resolves the full primitive config for a category path by merging all ancestor configs from general to specific.';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 5: RESOLVE BUSINESS CONTEXT FUNCTION
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Resolves business context by merging ancestor contexts (general -> specific)
|
||||||
|
CREATE OR REPLACE FUNCTION pipeline.resolve_business_context(
|
||||||
|
p_path ltree
|
||||||
|
) RETURNS JSONB AS $$
|
||||||
|
DECLARE
|
||||||
|
result JSONB := '{}';
|
||||||
|
row_context JSONB;
|
||||||
|
BEGIN
|
||||||
|
-- Fetch all ancestor nodes (including self), ordered by level ASC (general -> specific)
|
||||||
|
FOR row_context IN
|
||||||
|
SELECT business_context
|
||||||
|
FROM public.gbp_categories
|
||||||
|
WHERE p_path <@ path
|
||||||
|
ORDER BY level ASC
|
||||||
|
LOOP
|
||||||
|
-- Skip NULL or empty contexts
|
||||||
|
IF row_context IS NOT NULL AND row_context != '{}' THEN
|
||||||
|
result := pipeline.jsonb_deep_merge(result, row_context);
|
||||||
|
END IF;
|
||||||
|
END LOOP;
|
||||||
|
|
||||||
|
RETURN result;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
COMMENT ON FUNCTION pipeline.resolve_business_context(ltree) IS
|
||||||
|
'Resolves the full business context for a category path by merging all ancestor contexts from general to specific.';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 6: GET CLASSIFICATION CONTEXT FUNCTION
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Returns complete classification context for a category path
|
||||||
|
CREATE OR REPLACE FUNCTION pipeline.get_classification_context(
|
||||||
|
p_path ltree
|
||||||
|
) RETURNS JSONB AS $$
|
||||||
|
DECLARE
|
||||||
|
resolved_primitives JSONB;
|
||||||
|
resolved_context JSONB;
|
||||||
|
primitives_dict JSONB;
|
||||||
|
BEGIN
|
||||||
|
-- Resolve the primitive config for this path
|
||||||
|
resolved_primitives := pipeline.resolve_primitive_config(p_path);
|
||||||
|
|
||||||
|
-- Resolve the business context for this path
|
||||||
|
resolved_context := pipeline.resolve_business_context(p_path);
|
||||||
|
|
||||||
|
-- Build the primitives dictionary from the primitives table
|
||||||
|
SELECT jsonb_object_agg(
|
||||||
|
code,
|
||||||
|
jsonb_build_object(
|
||||||
|
'code', code,
|
||||||
|
'dimension', dimension,
|
||||||
|
'name', name,
|
||||||
|
'definition', definition,
|
||||||
|
'is_meta', is_meta,
|
||||||
|
'base_positive_signals', COALESCE(to_jsonb(base_positive_signals), '[]'::jsonb),
|
||||||
|
'base_negative_signals', COALESCE(to_jsonb(base_negative_signals), '[]'::jsonb)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
INTO primitives_dict
|
||||||
|
FROM pipeline.primitives;
|
||||||
|
|
||||||
|
-- Handle case where primitives table is empty
|
||||||
|
IF primitives_dict IS NULL THEN
|
||||||
|
primitives_dict := '{}'::jsonb;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- Return combined context object
|
||||||
|
RETURN jsonb_build_object(
|
||||||
|
'primitive_configs', resolved_primitives,
|
||||||
|
'business_context', resolved_context,
|
||||||
|
'primitives_dictionary', primitives_dict,
|
||||||
|
'category_path', p_path::text,
|
||||||
|
'resolved_at', NOW()
|
||||||
|
);
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
COMMENT ON FUNCTION pipeline.get_classification_context(ltree) IS
|
||||||
|
'Returns complete classification context for a category path, including resolved primitive configs,
|
||||||
|
business context, and the full primitives dictionary.';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 7: HELPER FUNCTION - GET ACTIVE PRIMITIVES
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Returns the list of active primitive codes for a category path
|
||||||
|
-- (includes meta primitives + enabled primitives from config)
|
||||||
|
CREATE OR REPLACE FUNCTION pipeline.get_active_primitives(
|
||||||
|
p_path ltree
|
||||||
|
) RETURNS TEXT[] AS $$
|
||||||
|
DECLARE
|
||||||
|
resolved_config JSONB;
|
||||||
|
active_codes TEXT[];
|
||||||
|
meta_codes TEXT[];
|
||||||
|
config_enabled TEXT[];
|
||||||
|
config_disabled TEXT[];
|
||||||
|
BEGIN
|
||||||
|
-- Get resolved config
|
||||||
|
resolved_config := pipeline.resolve_primitive_config(p_path);
|
||||||
|
|
||||||
|
-- Get all meta primitives (always active)
|
||||||
|
SELECT array_agg(code)
|
||||||
|
INTO meta_codes
|
||||||
|
FROM pipeline.primitives
|
||||||
|
WHERE is_meta = TRUE;
|
||||||
|
|
||||||
|
-- Get enabled primitives from config
|
||||||
|
IF resolved_config ? 'enabled' THEN
|
||||||
|
SELECT array_agg(elem::text)
|
||||||
|
INTO config_enabled
|
||||||
|
FROM jsonb_array_elements_text(resolved_config->'enabled') AS elem;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- Get disabled primitives from config
|
||||||
|
IF resolved_config ? 'disabled' THEN
|
||||||
|
SELECT array_agg(elem::text)
|
||||||
|
INTO config_disabled
|
||||||
|
FROM jsonb_array_elements_text(resolved_config->'disabled') AS elem;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- Combine: meta + enabled, minus disabled
|
||||||
|
active_codes := COALESCE(meta_codes, ARRAY[]::TEXT[]) || COALESCE(config_enabled, ARRAY[]::TEXT[]);
|
||||||
|
|
||||||
|
-- Remove disabled primitives
|
||||||
|
IF config_disabled IS NOT NULL THEN
|
||||||
|
active_codes := array(
|
||||||
|
SELECT unnest(active_codes)
|
||||||
|
EXCEPT
|
||||||
|
SELECT unnest(config_disabled)
|
||||||
|
);
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- Remove duplicates
|
||||||
|
active_codes := array(SELECT DISTINCT unnest(active_codes));
|
||||||
|
|
||||||
|
RETURN active_codes;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
COMMENT ON FUNCTION pipeline.get_active_primitives(ltree) IS
|
||||||
|
'Returns array of active primitive codes for a category path (meta primitives + enabled - disabled).';
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 8: SEED INITIAL PRIMITIVES (Examples)
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Insert some example primitives (can be extended later)
|
||||||
|
INSERT INTO pipeline.primitives (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
|
||||||
|
VALUES
|
||||||
|
-- Meta primitives (always active)
|
||||||
|
('HONESTY', 'trust', 'Honesty & Truthfulness', 'Whether the business is perceived as honest and truthful in their dealings', TRUE,
|
||||||
|
ARRAY['honest', 'truthful', 'transparent', 'upfront', 'no hidden'],
|
||||||
|
ARRAY['lied', 'dishonest', 'deceptive', 'misleading', 'hidden fees', 'bait and switch']),
|
||||||
|
|
||||||
|
('ETHICS', 'trust', 'Ethical Behavior', 'Whether the business behaves ethically and with integrity', TRUE,
|
||||||
|
ARRAY['ethical', 'integrity', 'fair', 'principled', 'moral'],
|
||||||
|
ARRAY['unethical', 'scam', 'fraud', 'cheat', 'ripoff', 'shady']),
|
||||||
|
|
||||||
|
('SAFETY', 'trust', 'Safety & Security', 'Whether customers feel safe and secure', TRUE,
|
||||||
|
ARRAY['safe', 'secure', 'protected', 'trust'],
|
||||||
|
ARRAY['unsafe', 'dangerous', 'security concern', 'risk', 'hazard']),
|
||||||
|
|
||||||
|
-- Quality dimension primitives
|
||||||
|
('FOOD_TASTE', 'quality', 'Food Taste & Flavor', 'Quality and taste of food items', FALSE,
|
||||||
|
ARRAY['delicious', 'tasty', 'flavorful', 'yummy', 'amazing taste', 'perfectly seasoned'],
|
||||||
|
ARRAY['bland', 'tasteless', 'bad taste', 'over-seasoned', 'under-seasoned', 'disgusting']),
|
||||||
|
|
||||||
|
('FOOD_FRESHNESS', 'quality', 'Food Freshness', 'Freshness of ingredients and food items', FALSE,
|
||||||
|
ARRAY['fresh', 'crisp', 'just made', 'homemade', 'organic'],
|
||||||
|
ARRAY['stale', 'old', 'not fresh', 'frozen', 'reheated', 'expired']),
|
||||||
|
|
||||||
|
('FOOD_PORTION', 'quality', 'Portion Size', 'Size and quantity of food servings', FALSE,
|
||||||
|
ARRAY['generous portions', 'large serving', 'filling', 'plenty of food'],
|
||||||
|
ARRAY['small portions', 'tiny', 'not enough', 'skimpy', 'overpriced for size']),
|
||||||
|
|
||||||
|
('PRODUCT_QUALITY', 'quality', 'Product Quality', 'Overall quality of products', FALSE,
|
||||||
|
ARRAY['high quality', 'well made', 'premium', 'durable', 'excellent quality'],
|
||||||
|
ARRAY['poor quality', 'cheap', 'flimsy', 'broke easily', 'defective']),
|
||||||
|
|
||||||
|
-- Service dimension primitives
|
||||||
|
('SERVICE_SPEED', 'service', 'Service Speed', 'Speed and timeliness of service', FALSE,
|
||||||
|
ARRAY['fast', 'quick', 'prompt', 'efficient', 'no wait'],
|
||||||
|
ARRAY['slow', 'long wait', 'took forever', 'delayed', 'waited too long']),
|
||||||
|
|
||||||
|
('SERVICE_FRIENDLINESS', 'service', 'Staff Friendliness', 'Friendliness and warmth of staff', FALSE,
|
||||||
|
ARRAY['friendly', 'welcoming', 'warm', 'nice', 'pleasant', 'smiled'],
|
||||||
|
ARRAY['rude', 'unfriendly', 'cold', 'dismissive', 'attitude', 'ignored']),
|
||||||
|
|
||||||
|
('SERVICE_KNOWLEDGE', 'service', 'Staff Knowledge', 'Knowledge and expertise of staff', FALSE,
|
||||||
|
ARRAY['knowledgeable', 'expert', 'helpful advice', 'knew their stuff', 'professional'],
|
||||||
|
ARRAY['clueless', 'didnt know', 'unhelpful', 'inexperienced', 'untrained']),
|
||||||
|
|
||||||
|
('SERVICE_ATTENTIVENESS', 'service', 'Staff Attentiveness', 'How attentive staff are to customer needs', FALSE,
|
||||||
|
ARRAY['attentive', 'checked on us', 'responsive', 'available', 'proactive'],
|
||||||
|
ARRAY['inattentive', 'ignored', 'couldnt find anyone', 'had to flag down', 'neglected']),
|
||||||
|
|
||||||
|
-- Environment dimension primitives
|
||||||
|
('ENV_CLEANLINESS', 'environment', 'Cleanliness', 'Cleanliness of the establishment', FALSE,
|
||||||
|
ARRAY['clean', 'spotless', 'tidy', 'well-maintained', 'hygienic'],
|
||||||
|
ARRAY['dirty', 'filthy', 'messy', 'gross', 'sticky', 'unhygienic']),
|
||||||
|
|
||||||
|
('ENV_AMBIANCE', 'environment', 'Ambiance & Atmosphere', 'Overall atmosphere and vibe', FALSE,
|
||||||
|
ARRAY['great atmosphere', 'nice ambiance', 'cozy', 'relaxing', 'beautiful decor'],
|
||||||
|
ARRAY['bad atmosphere', 'uncomfortable', 'loud', 'cramped', 'depressing']),
|
||||||
|
|
||||||
|
('ENV_PARKING', 'environment', 'Parking Availability', 'Availability and convenience of parking', FALSE,
|
||||||
|
ARRAY['easy parking', 'plenty of parking', 'free parking', 'valet available'],
|
||||||
|
ARRAY['no parking', 'hard to park', 'paid parking', 'had to park far']),
|
||||||
|
|
||||||
|
-- Value dimension primitives
|
||||||
|
('VALUE_PRICE', 'value', 'Price Level', 'Perception of price levels', FALSE,
|
||||||
|
ARRAY['affordable', 'reasonable prices', 'cheap', 'good deal', 'budget-friendly'],
|
||||||
|
ARRAY['expensive', 'overpriced', 'pricey', 'not worth the price', 'too costly']),
|
||||||
|
|
||||||
|
('VALUE_WORTH', 'value', 'Value for Money', 'Whether the experience is worth the cost', FALSE,
|
||||||
|
ARRAY['worth it', 'great value', 'bang for buck', 'money well spent'],
|
||||||
|
ARRAY['not worth it', 'waste of money', 'rip off', 'should be cheaper']),
|
||||||
|
|
||||||
|
-- Process dimension primitives
|
||||||
|
('PROCESS_BOOKING', 'process', 'Booking & Reservations', 'Ease of making reservations or appointments', FALSE,
|
||||||
|
ARRAY['easy to book', 'simple reservation', 'available appointments', 'online booking'],
|
||||||
|
ARRAY['hard to book', 'no availability', 'complicated booking', 'had to call multiple times']),
|
||||||
|
|
||||||
|
('PROCESS_WAIT', 'process', 'Wait Times', 'Time spent waiting for service', FALSE,
|
||||||
|
ARRAY['no wait', 'seated immediately', 'quick turnaround'],
|
||||||
|
ARRAY['long wait', 'waited forever', 'always busy', 'need to wait in line']),
|
||||||
|
|
||||||
|
-- Resolution dimension primitives
|
||||||
|
('RESOLUTION_RESPONSE', 'resolution', 'Problem Response', 'How problems and complaints are handled', FALSE,
|
||||||
|
ARRAY['fixed the issue', 'made it right', 'apologized', 'took responsibility'],
|
||||||
|
ARRAY['ignored complaint', 'didnt care', 'blamed me', 'no resolution', 'refused to help']),
|
||||||
|
|
||||||
|
-- Loyalty dimension primitives
|
||||||
|
('LOYALTY_RETURN', 'loyalty', 'Return Intent', 'Whether customers intend to return', FALSE,
|
||||||
|
ARRAY['will be back', 'coming back', 'regular customer', 'my go-to place'],
|
||||||
|
ARRAY['never again', 'wont return', 'last time', 'not coming back']),
|
||||||
|
|
||||||
|
('LOYALTY_RECOMMEND', 'loyalty', 'Recommendation Intent', 'Whether customers would recommend to others', FALSE,
|
||||||
|
ARRAY['highly recommend', 'tell everyone', 'bring friends', 'must try'],
|
||||||
|
ARRAY['dont recommend', 'avoid', 'stay away', 'warned friends']),
|
||||||
|
|
||||||
|
-- Escape dimension primitives (when customers leave early or avoid)
|
||||||
|
('ESCAPE_LEFT', 'escape', 'Early Departure', 'Whether customers left early or walked out', FALSE,
|
||||||
|
ARRAY[]::TEXT[], -- No positive signals for escape
|
||||||
|
ARRAY['walked out', 'left early', 'didnt finish', 'had to leave', 'couldnt stay'])
|
||||||
|
|
||||||
|
ON CONFLICT (code) DO UPDATE SET
|
||||||
|
dimension = EXCLUDED.dimension,
|
||||||
|
name = EXCLUDED.name,
|
||||||
|
definition = EXCLUDED.definition,
|
||||||
|
is_meta = EXCLUDED.is_meta,
|
||||||
|
base_positive_signals = EXCLUDED.base_positive_signals,
|
||||||
|
base_negative_signals = EXCLUDED.base_negative_signals;
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- SECTION 9: EXAMPLE CATEGORY CONFIGS
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Example: Set primitive config for Food & Dining sector (level 1)
|
||||||
|
-- This would enable food-related primitives for all food businesses
|
||||||
|
UPDATE public.gbp_categories
|
||||||
|
SET
|
||||||
|
primitive_configs = '{
|
||||||
|
"enabled": ["FOOD_TASTE", "FOOD_FRESHNESS", "FOOD_PORTION", "SERVICE_SPEED", "SERVICE_FRIENDLINESS", "ENV_CLEANLINESS", "ENV_AMBIANCE", "VALUE_PRICE", "VALUE_WORTH", "PROCESS_WAIT"],
|
||||||
|
"weights": {
|
||||||
|
"FOOD_TASTE": 1.5,
|
||||||
|
"FOOD_FRESHNESS": 1.3,
|
||||||
|
"SERVICE_SPEED": 1.2
|
||||||
|
}
|
||||||
|
}'::jsonb,
|
||||||
|
business_context = '{
|
||||||
|
"terminology": {
|
||||||
|
"staff": ["server", "waiter", "waitress", "host", "hostess", "bartender"],
|
||||||
|
"product": ["food", "dish", "meal", "appetizer", "entree", "dessert", "drink"]
|
||||||
|
},
|
||||||
|
"industry": "food_service"
|
||||||
|
}'::jsonb,
|
||||||
|
config_version = 'v1.0.0',
|
||||||
|
config_generated_by = 'migration_014',
|
||||||
|
config_updated_at = NOW()
|
||||||
|
WHERE slug = 'food_dining' AND level = 1;
|
||||||
|
|
||||||
|
-- Example: Override config for Restaurants (level 2) - adds more specific settings
|
||||||
|
UPDATE public.gbp_categories
|
||||||
|
SET
|
||||||
|
primitive_configs = '{
|
||||||
|
"enabled": ["PROCESS_BOOKING", "ENV_PARKING"],
|
||||||
|
"weights": {
|
||||||
|
"PROCESS_WAIT": 1.3
|
||||||
|
},
|
||||||
|
"signals": {
|
||||||
|
"FOOD_TASTE": {
|
||||||
|
"positive": ["perfectly cooked", "chef special", "signature dish"],
|
||||||
|
"negative": ["undercooked", "overcooked", "cold food"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'::jsonb,
|
||||||
|
business_context = '{
|
||||||
|
"terminology": {
|
||||||
|
"staff": ["chef", "cook", "sous chef", "kitchen staff"]
|
||||||
|
},
|
||||||
|
"typical_visit_duration": "1-2 hours",
|
||||||
|
"reservation_common": true
|
||||||
|
}'::jsonb,
|
||||||
|
config_version = 'v1.0.0',
|
||||||
|
config_generated_by = 'migration_014',
|
||||||
|
config_updated_at = NOW()
|
||||||
|
WHERE slug = 'restaurants' AND level = 2;
|
||||||
|
|
||||||
|
|
||||||
|
-- =============================================================================
|
||||||
|
-- VERIFICATION QUERIES (can be removed in production)
|
||||||
|
-- =============================================================================
|
||||||
|
|
||||||
|
-- Verify primitives table
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
RAISE NOTICE 'Primitives table created with % rows', (SELECT COUNT(*) FROM pipeline.primitives);
|
||||||
|
END $$;
|
||||||
|
|
||||||
|
-- Smoke-test jsonb_deep_merge (basic key merge and __replace__ handling)
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
-- Test jsonb_deep_merge
|
||||||
|
ASSERT pipeline.jsonb_deep_merge('{"a": 1}'::jsonb, '{"b": 2}'::jsonb) = '{"a": 1, "b": 2}'::jsonb,
|
||||||
|
'jsonb_deep_merge basic test failed';
|
||||||
|
|
||||||
|
-- Test __replace__ flag
|
||||||
|
ASSERT pipeline.jsonb_deep_merge('{"a": {"x": 1, "y": 2}}'::jsonb, '{"a": {"__replace__": true, "z": 3}}'::jsonb) = '{"a": {"z": 3}}'::jsonb,
|
||||||
|
'jsonb_deep_merge __replace__ test failed';
|
||||||
|
|
||||||
|
RAISE NOTICE 'All function tests passed';
|
||||||
|
END $$;
|
||||||
29
migrations/versions/015_add_business_info_columns.sql
Normal file
29
migrations/versions/015_add_business_info_columns.sql
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
-- Migration: Add dedicated columns for business info
|
||||||
|
-- Purpose: Move business data from metadata JSONB to queryable/indexable columns
|
||||||
|
-- Date: 2026-01-31
|
||||||
|
|
||||||
|
-- Add business info columns
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS business_name VARCHAR(500);
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS business_category VARCHAR(255);
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS business_address TEXT;
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS business_rating NUMERIC(3,2);
|
||||||
|
|
||||||
|
-- Add indexes for common queries
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_jobs_business_name ON jobs(business_name);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_jobs_business_category ON jobs(business_category);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_jobs_business_rating ON jobs(business_rating);
|
||||||
|
|
||||||
|
-- Migrate existing data from metadata JSONB to new columns
|
||||||
|
-- Backfill the new business_* columns from the metadata JSONB.
--
-- A row is selected when ANY of the three columns is NULL, so each assignment is
-- wrapped in COALESCE: a column that already holds a value (e.g. written by newer
-- code, or by a previous run of this migration) is kept instead of being
-- overwritten -- possibly with NULL -- from metadata. This keeps the statement
-- safe to re-run, in line with the IF NOT EXISTS guards above.
UPDATE jobs SET
    business_name    = COALESCE(business_name, metadata->>'business_name'),
    business_address = COALESCE(business_address, metadata->>'business_address'),
    -- Casting NULL (key missing) yields NULL, so no CASE guard is required.
    -- The cast still assumes rating_snapshot, when present, is numeric text that
    -- fits NUMERIC(3,2); a malformed value aborts the migration, as before.
    business_rating  = COALESCE(business_rating, (metadata->>'rating_snapshot')::NUMERIC(3,2))
WHERE metadata IS NOT NULL
  AND (business_name IS NULL OR business_address IS NULL OR business_rating IS NULL);
|
||||||
|
|
||||||
|
-- Metadata clean-up (removing the migrated fields from metadata) is optional and intentionally NOT performed by this migration.
|
||||||
|
-- Note: We keep the data in metadata for backward compatibility, but new code should use columns
|
||||||
22
migrations/versions/016_add_gbp_category_resolution.sql
Normal file
22
migrations/versions/016_add_gbp_category_resolution.sql
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
-- Migration: Add resolved GBP category columns to jobs table
|
||||||
|
-- Purpose: Store the matched taxonomy path for classification context
|
||||||
|
-- Date: 2026-01-31
|
||||||
|
|
||||||
|
-- Add ltree extension if not exists
|
||||||
|
CREATE EXTENSION IF NOT EXISTS ltree;
|
||||||
|
|
||||||
|
-- Add resolved category columns
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS gbp_category_id INTEGER REFERENCES gbp_categories(id);
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS gbp_category_path ltree;
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS category_resolution_method VARCHAR(20); -- 'exact', 'fuzzy', 'llm', 'hierarchical'
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS business_category_source VARCHAR(20); -- 'google' or 'inferred'
|
||||||
|
|
||||||
|
-- Index for fast lookups by category path
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_jobs_gbp_category_path ON jobs USING GIST (gbp_category_path);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_jobs_gbp_category_id ON jobs(gbp_category_id);
|
||||||
|
|
||||||
|
-- Comment on columns
|
||||||
|
COMMENT ON COLUMN jobs.gbp_category_id IS 'FK to gbp_categories - the resolved deepest taxonomy node';
|
||||||
|
COMMENT ON COLUMN jobs.gbp_category_path IS 'ltree path for the resolved category (e.g., Retail.Stores.Toy_store)';
|
||||||
|
COMMENT ON COLUMN jobs.category_resolution_method IS 'How category was resolved: exact (from Google), fuzzy (trigram match), llm (LLM matched), hierarchical (LLM walked tree)';
|
||||||
|
COMMENT ON COLUMN jobs.business_category_source IS 'Where business category originated: google (scraped from Maps) or inferred (LLM inferred from name)';
|
||||||
@@ -5,7 +5,7 @@ This package provides the base abstractions for building pipelines that can be
|
|||||||
discovered, registered, and rendered with dynamic dashboards.
|
discovered, registered, and rendered with dynamic dashboards.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pipeline_core.base import BasePipeline, PipelineMetadata, PipelineResult
|
from pipeline_core.base import BasePipeline, PipelineMetadata, PipelineResult, StageResult
|
||||||
from pipeline_core.contracts import (
|
from pipeline_core.contracts import (
|
||||||
DashboardConfig,
|
DashboardConfig,
|
||||||
DashboardSection,
|
DashboardSection,
|
||||||
@@ -22,6 +22,7 @@ __all__ = [
|
|||||||
"BasePipeline",
|
"BasePipeline",
|
||||||
"PipelineMetadata",
|
"PipelineMetadata",
|
||||||
"PipelineResult",
|
"PipelineResult",
|
||||||
|
"StageResult",
|
||||||
# Contracts
|
# Contracts
|
||||||
"DashboardConfig",
|
"DashboardConfig",
|
||||||
"DashboardSection",
|
"DashboardSection",
|
||||||
|
|||||||
311
packages/reviewiq-pipeline/IMPROVEMENTS.md
Normal file
311
packages/reviewiq-pipeline/IMPROVEMENTS.md
Normal file
@@ -0,0 +1,311 @@
|
|||||||
|
# ReviewIQ Pipeline Improvement Suggestions
|
||||||
|
|
||||||
|
Based on validation testing and analysis of the classification pipeline.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔴 High Priority (Quality & Cost Impact)
|
||||||
|
|
||||||
|
### 1. Multi-Aspect Detection Gap
|
||||||
|
**Problem**: LLM misses secondary codes in multi-aspect reviews.
|
||||||
|
- "not too expensive" → V4.01 missed
|
||||||
|
- "easy and fast" → J1.01 missed
|
||||||
|
|
||||||
|
**Solution**: Update classification prompt to:
|
||||||
|
```
|
||||||
|
For reviews with multiple distinct topics:
|
||||||
|
1. Extract ALL aspects, not just the dominant one
|
||||||
|
2. Assign urt_secondary codes for each additional aspect
|
||||||
|
3. Flag reviews with 3+ aspects as "complex"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**: ~15-20% of reviews have multiple aspects being partially captured.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Enable Smart Router (Cost Savings)
|
||||||
|
**Problem**: All reviews go through expensive Sonnet model.
|
||||||
|
|
||||||
|
**Solution**: Enable the implemented router:
|
||||||
|
```python
|
||||||
|
Config(
|
||||||
|
router_enabled=True,
|
||||||
|
router_conservative=True,
|
||||||
|
router_cheap_model="claude-3-5-haiku-20241022",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**:
|
||||||
|
- SKIP (1.6%): $0 cost (was ~$0.05)
|
||||||
|
- CHEAP (31.4%): ~10x cheaper with Haiku
|
||||||
|
- **Estimated 25-30% cost reduction**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. JSON Truncation Recovery
|
||||||
|
**Problem**: ~33% of batches hit JSON truncation, causing partial failures.
|
||||||
|
|
||||||
|
**Current State**: Partial recovery implemented but still loses some reviews.
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
1. Reduce batch size when reviews are long
|
||||||
|
2. Add `max_tokens` buffer based on expected output
|
||||||
|
3. Implement streaming JSON parser for real-time recovery
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Dynamic batch sizing based on review length
|
||||||
|
if avg_review_length > 200:
|
||||||
|
batch_size = min(batch_size, 15)
|
||||||
|
if avg_review_length > 500:
|
||||||
|
batch_size = min(batch_size, 8)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**: Reduce fallback processing by ~50%, saving time and cost.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🟡 Medium Priority (Reliability & Accuracy)
|
||||||
|
|
||||||
|
### 4. LLM Response Caching
|
||||||
|
**Problem**: Retries reprocess already-classified reviews.
|
||||||
|
|
||||||
|
**Solution**: Cache successful LLM responses by content hash:
|
||||||
|
```python
|
||||||
|
class ResponseCache:
|
||||||
|
async def get(self, text_hash: str) -> dict | None:
|
||||||
|
return await redis.get(f"llm:classify:{text_hash}")
|
||||||
|
|
||||||
|
async def set(self, text_hash: str, response: dict, ttl: int = 86400):
|
||||||
|
await redis.setex(f"llm:classify:{text_hash}", ttl, json.dumps(response))
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**:
|
||||||
|
- Zero cost for re-runs on same reviews
|
||||||
|
- Faster pipeline retries
|
||||||
|
- Useful for A/B testing prompts
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5. Confidence-Based Routing
|
||||||
|
**Problem**: LLM assigns codes even when uncertain.
|
||||||
|
|
||||||
|
**Solution**: Add confidence threshold in prompt:
|
||||||
|
```
|
||||||
|
If confidence < 70%:
|
||||||
|
- Set confidence: "low"
|
||||||
|
- Use generic code (V4.03) instead of guessing
|
||||||
|
- Flag for human review
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**: Reduces misclassifications, improves data quality.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 6. Post-Classification Validation
|
||||||
|
**Problem**: Some classifications don't match review content.
|
||||||
|
|
||||||
|
**Solution**: Add rule-based validation layer:
|
||||||
|
```python
|
||||||
|
def validate_classification(text: str, urt_code: str) -> tuple[bool, str | None]:
|
||||||
|
# Price mentioned but not V4.xx code?
|
||||||
|
if has_price_mention(text) and not urt_code.startswith("V4"):
|
||||||
|
return False, "V4.01" # Suggest correction
|
||||||
|
|
||||||
|
# Staff mentioned but not P1.xx code?
|
||||||
|
if has_staff_mention(text) and not urt_code.startswith("P1"):
|
||||||
|
return False, "P1.01"
|
||||||
|
|
||||||
|
return True, None
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**: Catch ~5-10% of obvious misclassifications.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 7. Span Coverage Validation
|
||||||
|
**Problem**: Some review text not covered by any span.
|
||||||
|
|
||||||
|
**Solution**: Track span coverage percentage:
|
||||||
|
```python
|
||||||
|
def calculate_coverage(text: str, spans: list) -> float:
|
||||||
|
covered_chars = set()
|
||||||
|
for span in spans:
|
||||||
|
covered_chars.update(range(span['start'], span['end']))
|
||||||
|
return len(covered_chars) / len(text)
|
||||||
|
|
||||||
|
# Flag if coverage < 60%
|
||||||
|
if coverage < 0.6:
|
||||||
|
log.warning(f"Low span coverage: {coverage:.0%}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**: Identify reviews where LLM skipped important content.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🟢 Lower Priority (Optimization & Monitoring)
|
||||||
|
|
||||||
|
### 8. Taxonomy Alignment Scoring
|
||||||
|
**Problem**: Hard to measure classification quality at scale.
|
||||||
|
|
||||||
|
**Solution**: Build automated taxonomy alignment checker:
|
||||||
|
```python
|
||||||
|
# Check if keywords in text match expected domain
|
||||||
|
DOMAIN_KEYWORDS = {
|
||||||
|
"V4": ["price", "money", "worth", "cost", "expensive", "cheap"],
|
||||||
|
"P1": ["staff", "employee", "service", "friendly", "rude"],
|
||||||
|
"J1": ["wait", "fast", "slow", "quick", "time", "minutes"],
|
||||||
|
"E1": ["clean", "dirty", "comfortable", "space", "room"],
|
||||||
|
}
|
||||||
|
|
||||||
|
def alignment_score(text: str, urt_code: str) -> float:
|
||||||
|
domain = urt_code[0:2]
|
||||||
|
keywords = DOMAIN_KEYWORDS.get(domain, [])
|
||||||
|
matches = sum(1 for kw in keywords if kw in text.lower())
|
||||||
|
return matches / len(keywords) if keywords else 0.5
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**: Quality dashboard, regression detection.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 9. Batch Size Auto-Tuning
|
||||||
|
**Problem**: Fixed batch size doesn't adapt to review complexity.
|
||||||
|
|
||||||
|
**Solution**: Implement adaptive batch sizing:
|
||||||
|
```python
|
||||||
|
class AdaptiveBatchSizer:
|
||||||
|
def __init__(self):
|
||||||
|
self.history = [] # list of dicts with keys 'size', 'success_rate', 'avg_len'
|
||||||
|
|
||||||
|
def recommend_size(self, reviews: list) -> int:
|
||||||
|
avg_length = sum(len(r['text']) for r in reviews) / len(reviews)
|
||||||
|
|
||||||
|
# Learn from history
|
||||||
|
if self.history:
|
||||||
|
# Find optimal size for similar review lengths
|
||||||
|
similar = [h for h in self.history if abs(h['avg_len'] - avg_length) < 50]
|
||||||
|
if similar:
|
||||||
|
return max(h['size'] for h in similar if h['success_rate'] > 0.95)
|
||||||
|
|
||||||
|
# Default heuristics
|
||||||
|
if avg_length > 300:
|
||||||
|
return 10
|
||||||
|
elif avg_length > 150:
|
||||||
|
return 20
|
||||||
|
else:
|
||||||
|
return 30
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 10. Cost Tracking Dashboard
|
||||||
|
**Problem**: No visibility into per-job, per-stage costs.
|
||||||
|
|
||||||
|
**Solution**: Add cost tracking to pipeline output:
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class CostBreakdown:
|
||||||
|
stage: str
|
||||||
|
model: str
|
||||||
|
input_tokens: int
|
||||||
|
output_tokens: int
|
||||||
|
cached_tokens: int
|
||||||
|
cost_usd: float
|
||||||
|
reviews_processed: int
|
||||||
|
cost_per_review: float
|
||||||
|
|
||||||
|
```

Store in database:

```sql
|
||||||
|
CREATE TABLE pipeline.cost_tracking (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
execution_id UUID,
|
||||||
|
job_id UUID,
|
||||||
|
stage VARCHAR(50),
|
||||||
|
model VARCHAR(100),
|
||||||
|
input_tokens INT,
|
||||||
|
output_tokens INT,
|
||||||
|
cached_tokens INT,
|
||||||
|
cost_usd DECIMAL(10, 6),
|
||||||
|
reviews_processed INT,
|
||||||
|
created_at TIMESTAMP DEFAULT NOW()
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 11. Streaming Classification
|
||||||
|
**Problem**: Large batches block until complete.
|
||||||
|
|
||||||
|
**Solution**: Implement streaming for real-time progress:
|
||||||
|
```python
|
||||||
|
async def classify_streaming(reviews: list):
|
||||||
|
async for partial_result in llm_client.stream_batch(reviews):
|
||||||
|
# Yield each review as it completes
|
||||||
|
yield partial_result
|
||||||
|
|
||||||
|
# Persist immediately
|
||||||
|
await persist_classification(partial_result)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact**: Better UX, faster partial results, resilience to failures.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 12. A/B Testing Framework
|
||||||
|
**Problem**: Hard to compare prompt/model changes.
|
||||||
|
|
||||||
|
**Solution**: Built-in A/B testing:
|
||||||
|
```python
|
||||||
|
class ABTestConfig:
|
||||||
|
test_name: str
|
||||||
|
variant_a: ClassificationConfig # Control
|
||||||
|
variant_b: ClassificationConfig # Treatment
|
||||||
|
split_ratio: float = 0.1 # 10% to treatment
|
||||||
|
metrics: list[str] = ["accuracy", "cost", "latency"]
|
||||||
|
|
||||||
|
# Run control on all reviews and treatment on the first 10% of the same reviews
|
||||||
|
results_a = await classify(reviews, config_a)
|
||||||
|
results_b = await classify(reviews[:int(len(reviews)*0.1)], config_b)
|
||||||
|
|
||||||
|
# Compare metrics
|
||||||
|
compare_results(results_a, results_b)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Priority Matrix
|
||||||
|
|
||||||
|
| Improvement | Effort | Impact | Priority |
|
||||||
|
|-------------|--------|--------|----------|
|
||||||
|
| 1. Multi-Aspect Detection | Medium | High | 🔴 P1 |
|
||||||
|
| 2. Enable Smart Router | Low | High | 🔴 P1 |
|
||||||
|
| 3. JSON Truncation Fix | Medium | High | 🔴 P1 |
|
||||||
|
| 4. Response Caching | Medium | Medium | 🟡 P2 |
|
||||||
|
| 5. Confidence Routing | Medium | Medium | 🟡 P2 |
|
||||||
|
| 6. Post-Classification Validation | Low | Medium | 🟡 P2 |
|
||||||
|
| 7. Span Coverage Validation | Low | Low | 🟢 P3 |
|
||||||
|
| 8. Taxonomy Alignment | Medium | Low | 🟢 P3 |
|
||||||
|
| 9. Adaptive Batch Sizing | High | Medium | 🟢 P3 |
|
||||||
|
| 10. Cost Dashboard | Medium | Low | 🟢 P3 |
|
||||||
|
| 11. Streaming Classification | High | Medium | 🟢 P3 |
|
||||||
|
| 12. A/B Testing | High | Low | 🟢 P3 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Wins (Can implement today)
|
||||||
|
|
||||||
|
1. **Enable router** - Already implemented, just needs config flag
|
||||||
|
2. **Reduce batch size** - Set `classification_batch_size=15` when reviews are long
|
||||||
|
3. **Add span coverage logging** - Simple metric to track quality
|
||||||
|
4. **Post-classification keyword check** - Basic validation rules
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Estimated Impact Summary
|
||||||
|
|
||||||
|
| Area | Current | After Improvements |
|
||||||
|
|------|---------|-------------------|
|
||||||
|
| Cost per 1000 reviews | ~$3.40 | ~$2.40 (-30%) |
|
||||||
|
| Classification accuracy | ~85% | ~92% |
|
||||||
|
| Multi-aspect capture | ~65% | ~90% |
|
||||||
|
| Batch failure rate | ~33% | ~10% |
|
||||||
|
| Pipeline retry cost | 100% | ~20% (with caching) |
|
||||||
466
packages/reviewiq-pipeline/INDUSTRY_TAXONOMY_GAPS.md
Normal file
466
packages/reviewiq-pipeline/INDUSTRY_TAXONOMY_GAPS.md
Normal file
@@ -0,0 +1,466 @@
|
|||||||
|
# Industry-Specific Taxonomy Gap Analysis
|
||||||
|
|
||||||
|
## Current URT Coverage
|
||||||
|
- **Spec**: 7 domains, 28 categories, 140 subcodes (universal)
|
||||||
|
- **Database**: 138 subcodes implemented
|
||||||
|
- **Claim**: "Works universally: Any industry, any size, any geography"
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Business Sector Analysis
|
||||||
|
|
||||||
|
### Tier 1: High-Volume Google Review Industries
|
||||||
|
|
||||||
|
These sectors have the most Google reviews and are most likely to be clients.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🍽️ 1. RESTAURANTS & FOOD SERVICE
|
||||||
|
**Expected Review Volume**: Very High
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Food quality/taste | Very High | ❌ No | **O2.06 Food Quality** |
|
||||||
|
| Portion size | High | ❌ No | **O2.09 Portion Size** |
|
||||||
|
| Drink quality | High | ❌ No | **O2.07 Drink Quality** |
|
||||||
|
| Menu variety | Medium | ❌ No | **O2.08 Menu Variety** |
|
||||||
|
| Freshness | High | ⚠️ O2.01 (Materials) | Needs specific code |
|
||||||
|
| Chef/Cook skill | Medium | ⚠️ P2.02 (Skill) | Generic |
|
||||||
|
| Wait time for food | High | ✅ J1.01 | Covered |
|
||||||
|
| Reservation system | Medium | ✅ J2.xx | Covered |
|
||||||
|
| Ambiance | High | ✅ E1.04 | Covered |
|
||||||
|
| Cleanliness | High | ✅ E1.01 | Covered |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
O2.06 - Food Quality (taste, preparation)
|
||||||
|
O2.07 - Drink Quality (beverages, cocktails, coffee)
|
||||||
|
O2.08 - Menu Variety (selection, options)
|
||||||
|
O2.09 - Portion Size (amount served)
|
||||||
|
O2.10 - Freshness (ingredient freshness)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🏨 2. HOTELS & ACCOMMODATION
|
||||||
|
**Expected Review Volume**: Very High
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Room cleanliness | Very High | ✅ E1.01 | Covered |
|
||||||
|
| Bed comfort | High | ⚠️ E1.02 (Comfort) | Needs specific |
|
||||||
|
| Bathroom quality | High | ❌ No | **E1.09 Bathroom Quality** |
|
||||||
|
| Noise level | High | ❌ No | **E1.10 Noise Level** |
|
||||||
|
| WiFi quality | High | ⚠️ E2.xx | Needs specific |
|
||||||
|
| Breakfast quality | High | ❌ No | Links to F&B gap |
|
||||||
|
| Check-in/out speed | High | ✅ J1.01 | Covered |
|
||||||
|
| Pool/Gym facilities | Medium | ❌ No | **E1.11 Amenity Quality** |
|
||||||
|
| View | Medium | ❌ No | **E1.12 Room View** |
|
||||||
|
| Location | High | ✅ A4.01 | Covered |
|
||||||
|
| Value for money | High | ✅ V4.01 | Covered |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
E1.09 - Bathroom Quality (fixtures, water pressure, toiletries)
|
||||||
|
E1.10 - Noise Level (soundproofing, street noise, neighbors)
|
||||||
|
E1.11 - Amenity Quality (pool, gym, spa facilities)
|
||||||
|
E1.12 - Room View (scenery, outlook)
|
||||||
|
E2.06 - WiFi/Internet Quality (speed, reliability)
|
||||||
|
O2.11 - Bed/Sleep Quality (mattress, pillows, linens)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🏥 3. HEALTHCARE (Clinics, Dentists, Doctors)
|
||||||
|
**Expected Review Volume**: High
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Treatment effectiveness | Very High | ✅ O1.05 | Covered |
|
||||||
|
| Doctor manner | High | ✅ P1.01-05 | Covered |
|
||||||
|
| Wait time | Very High | ✅ J1.01-03 | Covered |
|
||||||
|
| Pain management | High | ❌ No | **O1.12 Pain/Comfort Management** |
|
||||||
|
| Diagnosis accuracy | High | ⚠️ O1.02 | Needs specific |
|
||||||
|
| Explanation clarity | High | ❌ No | **P2.06 Medical Communication** |
|
||||||
|
| Insurance handling | High | ❌ No | **V3.06 Insurance Processing** |
|
||||||
|
| Appointment availability | High | ✅ A1.xx | Covered |
|
||||||
|
| Follow-up care | Medium | ❌ No | **R3.06 Follow-up Care** |
|
||||||
|
| Hygiene/Sterilization | High | ✅ E3.04 | Covered |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
O1.12 - Pain/Comfort Management (during procedures)
|
||||||
|
P2.06 - Medical Communication (explaining diagnosis, treatment)
|
||||||
|
V3.06 - Insurance Processing (claims, billing, coverage)
|
||||||
|
R3.06 - Follow-up Care (post-treatment support)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🚗 4. AUTOMOTIVE (Dealerships, Mechanics, Car Wash)
|
||||||
|
**Expected Review Volume**: High
|
||||||
|
**Current Coverage**: ✅ Good (based on ClickRent data)
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Vehicle condition | High | ✅ O1.01-02 | Covered |
|
||||||
|
| Hidden fees | Very High | ✅ V1.03 | Covered |
|
||||||
|
| Staff honesty | High | ✅ R1.01 | Covered |
|
||||||
|
| Repair quality | High | ✅ O2.02 | Covered |
|
||||||
|
| Price fairness | High | ✅ V1.02 | Covered |
|
||||||
|
| Wait time | High | ✅ J1.01 | Covered |
|
||||||
|
| Warranty honoring | Medium | ⚠️ V2.04 | Partial |
|
||||||
|
| Test drive experience | Medium | ❌ No | **O1.13 Demo/Trial Experience** |
|
||||||
|
| Trade-in fairness | Medium | ❌ No | **V1.06 Trade-in Value** |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
O1.13 - Demo/Trial Experience (test drives, product demos)
|
||||||
|
V1.06 - Trade-in Value (exchange/trade fairness)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 💇 5. BEAUTY & WELLNESS (Salons, Spas, Gyms)
|
||||||
|
**Expected Review Volume**: High
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Service result | Very High | ✅ O1.05 | Covered |
|
||||||
|
| Stylist skill | High | ✅ P2.02 | Covered |
|
||||||
|
| Hygiene | High | ✅ E3.04 | Covered |
|
||||||
|
| Relaxation | High | ❌ No | **O1.14 Relaxation/Wellness Outcome** |
|
||||||
|
| Equipment quality | Medium | ⚠️ E1.xx | Generic |
|
||||||
|
| Class quality (gym) | Medium | ❌ No | **O1.15 Class/Instruction Quality** |
|
||||||
|
| Membership value | Medium | ✅ V4.01 | Covered |
|
||||||
|
| Trainer expertise | Medium | ✅ P2.01 | Covered |
|
||||||
|
| Appointment booking | High | ✅ J2.xx | Covered |
|
||||||
|
| Atmosphere | High | ✅ E1.04 | Covered |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
O1.14 - Relaxation/Wellness Outcome (stress relief, rejuvenation)
|
||||||
|
O1.15 - Class/Instruction Quality (fitness classes, workshops)
|
||||||
|
E1.13 - Equipment Quality (gym machines, salon tools)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🎢 6. ENTERTAINMENT & RECREATION
|
||||||
|
**Expected Review Volume**: High
|
||||||
|
**Current Coverage**: ❌ Poor (confirmed by Go Karts data)
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Fun factor | Very High | ❌ No | **O1.08 Entertainment Value** |
|
||||||
|
| Excitement/Thrill | High | ❌ No | **O1.09 Excitement Level** |
|
||||||
|
| Family suitability | High | ❌ No | **O1.06 Family Suitability** |
|
||||||
|
| Group experience | High | ❌ No | **O1.11 Group Suitability** |
|
||||||
|
| Safety (rides) | High | ✅ E4.01 | Covered |
|
||||||
|
| Queue/Wait | High | ✅ J1.03 | Covered |
|
||||||
|
| Value for money | High | ✅ V4.01 | Covered |
|
||||||
|
| Staff friendliness | High | ✅ P1.01 | Covered |
|
||||||
|
| Would recommend | High | ❌ No | **R1.06 Would Recommend** |
|
||||||
|
| Would return | High | ❌ No | **R1.08 Will Return** |
|
||||||
|
|
||||||
|
**Missing Codes** (already documented in TAXONOMY_GAPS.md):
|
||||||
|
```sql
|
||||||
|
O1.06 - Family Suitability
|
||||||
|
O1.08 - Entertainment Value
|
||||||
|
O1.09 - Excitement Level
|
||||||
|
O1.11 - Group Suitability
|
||||||
|
R1.06 - Would Recommend
|
||||||
|
R1.08 - Will Return
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🏬 7. RETAIL (Shops, Stores, E-commerce)
|
||||||
|
**Expected Review Volume**: Very High
|
||||||
|
**Current Coverage**: ✅ Good
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Product quality | High | ✅ O2.01 | Covered |
|
||||||
|
| Stock availability | High | ✅ A1.03 | Covered |
|
||||||
|
| Price competitiveness | High | ✅ V2.05 | Covered |
|
||||||
|
| Return policy | High | ✅ V2.04 | Covered |
|
||||||
|
| Staff helpfulness | High | ✅ P2.xx | Covered |
|
||||||
|
| Store organization | High | ✅ E1.03 | Covered |
|
||||||
|
| Checkout speed | High | ✅ J1.01 | Covered |
|
||||||
|
| Delivery (e-comm) | High | ✅ J1.02 | Covered |
|
||||||
|
| Packaging | Medium | ⚠️ O2.05 | Partial |
|
||||||
|
|
||||||
|
**Minor Gaps**:
|
||||||
|
```sql
|
||||||
|
O2.12 - Packaging Quality (e-commerce specific)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🎓 8. EDUCATION & TRAINING
|
||||||
|
**Expected Review Volume**: Medium
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Learning outcome | Very High | ✅ O1.05 | Covered |
|
||||||
|
| Teacher quality | High | ✅ P2.xx | Covered |
|
||||||
|
| Course content | High | ❌ No | **O2.13 Course/Curriculum Quality** |
|
||||||
|
| Materials quality | Medium | ✅ O2.01 | Covered |
|
||||||
|
| Value for tuition | High | ✅ V4.01 | Covered |
|
||||||
|
| Schedule flexibility | Medium | ⚠️ O4.03 | Generic |
|
||||||
|
| Progress tracking | Medium | ❌ No | **J4.06 Progress Communication** |
|
||||||
|
| Certification value | Medium | ❌ No | **O1.16 Credential Value** |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
O2.13 - Course/Curriculum Quality (content, structure, relevance)
|
||||||
|
O1.16 - Credential/Certification Value
|
||||||
|
J4.06 - Progress Communication (tracking, feedback)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🏠 9. HOME SERVICES (Plumbers, Electricians, Cleaners)
|
||||||
|
**Expected Review Volume**: High
|
||||||
|
**Current Coverage**: ✅ Good
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Work quality | Very High | ✅ O2.02 | Covered |
|
||||||
|
| Punctuality | Very High | ✅ J1.02 | Covered |
|
||||||
|
| Price transparency | High | ✅ V1.03 | Covered |
|
||||||
|
| Cleanliness after work | High | ✅ E1.01 | Covered |
|
||||||
|
| Professionalism | High | ✅ P1.xx | Covered |
|
||||||
|
| Problem solved | High | ✅ O1.05 | Covered |
|
||||||
|
| Quote accuracy | High | ⚠️ V1.02 | Partial |
|
||||||
|
| Warranty/Guarantee | Medium | ⚠️ V2.04 | Partial |
|
||||||
|
|
||||||
|
**No major gaps** - well covered by existing codes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🌍 10. TRAVEL & TOURISM (Tours, Attractions, Museums)
|
||||||
|
**Expected Review Volume**: High
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Experience quality | High | ⚠️ V4.03 | Too generic |
|
||||||
|
| Guide quality | High | ✅ P2.xx | Covered |
|
||||||
|
| Value for money | High | ✅ V4.01 | Covered |
|
||||||
|
| Educational value | Medium | ❌ No | **O1.17 Educational/Informative Value** |
|
||||||
|
| Crowd management | Medium | ✅ J1.03 | Covered |
|
||||||
|
| Photo opportunities | Medium | ❌ No | **E1.14 Photo/Visual Appeal** |
|
||||||
|
| Accessibility | Medium | ✅ A3.xx | Covered |
|
||||||
|
| Authenticity | Medium | ❌ No | **O2.14 Authenticity/Genuineness** |
|
||||||
|
| Memorable experience | High | ❌ No | Links to Entertainment gap |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
O1.17 - Educational/Informative Value (learning experience)
|
||||||
|
O2.14 - Authenticity/Genuineness (cultural accuracy, real experience)
|
||||||
|
E1.14 - Photo/Visual Appeal (Instagram-worthy, scenic)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🐾 11. PET SERVICES (Vets, Groomers, Pet Stores)
|
||||||
|
**Expected Review Volume**: Medium
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Animal care quality | Very High | ⚠️ O1.05 | Needs specific |
|
||||||
|
| Handling gentleness | High | ❌ No | **P1.06 Animal Handling** |
|
||||||
|
| Treatment outcome | High | ✅ O1.05 | Covered |
|
||||||
|
| Pet comfort/stress | High | ❌ No | **O1.18 Pet Comfort/Stress** |
|
||||||
|
| Staff knowledge | High | ✅ P2.01 | Covered |
|
||||||
|
| Emergency availability | Medium | ✅ A1.01 | Covered |
|
||||||
|
| Price transparency | High | ✅ V1.xx | Covered |
|
||||||
|
| Facility cleanliness | High | ✅ E1.01 | Covered |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
P1.06 - Animal Handling (gentleness, care with pets)
|
||||||
|
O1.18 - Pet Comfort/Stress (during service)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 🎵 12. NIGHTLIFE (Bars, Clubs, Live Music)
|
||||||
|
**Expected Review Volume**: Medium
|
||||||
|
**Current Coverage**: ⚠️ Partial
|
||||||
|
|
||||||
|
| Topic | Frequency | Has Code? | Gap |
|
||||||
|
|-------|-----------|-----------|-----|
|
||||||
|
| Music/DJ quality | Very High | ❌ No | **E2.07 Music/Sound Quality** |
|
||||||
|
| Drink quality/variety | High | ❌ No | Links to F&B gap |
|
||||||
|
| Atmosphere/Vibe | High | ✅ E1.04 | Covered |
|
||||||
|
| Crowd quality | Medium | ❌ No | **E2.08 Crowd/Clientele Quality** |
|
||||||
|
| Door policy | Medium | ❌ No | **A1.06 Entry/Door Policy** |
|
||||||
|
| Dance floor | Medium | ❌ No | **E1.15 Dance Floor Quality** |
|
||||||
|
| Security/Safety | High | ✅ E4.01-02 | Covered |
|
||||||
|
| Drink prices | High | ✅ V1.01 | Covered |
|
||||||
|
| Staff attitude | High | ✅ P1.xx | Covered |
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
```sql
|
||||||
|
E2.07 - Music/Sound Quality (DJ, live music, sound system)
|
||||||
|
E2.08 - Crowd/Clientele Quality (type of people, vibe)
|
||||||
|
A1.06 - Entry/Door Policy (fairness, accessibility)
|
||||||
|
E1.15 - Dance Floor Quality (space, surface, lighting)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority Summary: Missing Codes by Urgency
|
||||||
|
|
||||||
|
### 🔴 Critical (Universal - All Industries)
|
||||||
|
```sql
|
||||||
|
R1.06 - Would Recommend (recommendation intent)
|
||||||
|
R1.07 - Would Not Recommend (anti-recommendation)
|
||||||
|
R1.08 - Will Return (return intent positive)
|
||||||
|
R1.09 - Won't Return (return intent negative)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 🟠 High (Multiple Industries)
|
||||||
|
```sql
|
||||||
|
-- Food & Beverage (Restaurants, Hotels, Nightlife, Cafes)
|
||||||
|
O2.06 - Food Quality
|
||||||
|
O2.07 - Drink Quality
|
||||||
|
O2.08 - Menu Variety
|
||||||
|
O2.09 - Portion Size
|
||||||
|
|
||||||
|
-- Entertainment & Tourism
|
||||||
|
O1.06 - Family Suitability
|
||||||
|
O1.08 - Entertainment Value
|
||||||
|
O1.09 - Excitement Level
|
||||||
|
O1.11 - Group Suitability
|
||||||
|
|
||||||
|
-- Hospitality
|
||||||
|
E1.09 - Bathroom Quality
|
||||||
|
E1.10 - Noise Level
|
||||||
|
E1.11 - Amenity Quality
|
||||||
|
E2.06 - WiFi Quality
|
||||||
|
```
|
||||||
|
|
||||||
|
### 🟡 Medium (Industry-Specific)
|
||||||
|
```sql
|
||||||
|
-- Healthcare
|
||||||
|
O1.12 - Pain/Comfort Management
|
||||||
|
P2.06 - Medical Communication
|
||||||
|
V3.06 - Insurance Processing
|
||||||
|
|
||||||
|
-- Nightlife
|
||||||
|
E2.07 - Music/Sound Quality
|
||||||
|
E2.08 - Crowd/Clientele Quality
|
||||||
|
|
||||||
|
-- Education
|
||||||
|
O2.13 - Course/Curriculum Quality
|
||||||
|
O1.16 - Credential Value
|
||||||
|
|
||||||
|
-- Hotels
|
||||||
|
O2.11 - Bed/Sleep Quality
|
||||||
|
E1.12 - Room View
|
||||||
|
```
|
||||||
|
|
||||||
|
### 🟢 Lower (Niche)
|
||||||
|
```sql
|
||||||
|
-- Automotive
|
||||||
|
O1.13 - Demo/Trial Experience
|
||||||
|
V1.06 - Trade-in Value
|
||||||
|
|
||||||
|
-- Pet Services
|
||||||
|
P1.06 - Animal Handling
|
||||||
|
O1.18 - Pet Comfort
|
||||||
|
|
||||||
|
-- Tourism
|
||||||
|
O1.17 - Educational Value
|
||||||
|
O2.14 - Authenticity
|
||||||
|
E1.14 - Photo Appeal
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommended Implementation Phases
|
||||||
|
|
||||||
|
### Phase 1: Universal Codes (Add Immediately)
|
||||||
|
4 codes - Covers ALL industries
|
||||||
|
```sql
|
||||||
|
INSERT INTO pipeline.urt_subcodes VALUES
|
||||||
|
('R1.06', 'R1', 'R', 'Would Recommend', 'Intent to recommend to others'),
|
||||||
|
('R1.07', 'R1', 'R', 'Would Not Recommend', 'Explicit anti-recommendation'),
|
||||||
|
('R1.08', 'R1', 'R', 'Will Return', 'Intent to visit again'),
|
||||||
|
('R1.09', 'R1', 'R', 'Won''t Return', 'Explicit no-return statement');
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 2: High-Frequency Gaps (Next Sprint)
|
||||||
|
12 codes - Covers Hospitality, F&B, Entertainment
|
||||||
|
```sql
|
||||||
|
INSERT INTO pipeline.urt_subcodes VALUES
-- Food & Beverage
|
||||||
|
('O2.06', 'O2', 'O', 'Food Quality', 'Taste, preparation, cooking quality'),
|
||||||
|
('O2.07', 'O2', 'O', 'Drink Quality', 'Beverage quality and preparation'),
|
||||||
|
('O2.08', 'O2', 'O', 'Menu Variety', 'Range of food/drink options'),
|
||||||
|
('O2.09', 'O2', 'O', 'Portion Size', 'Amount of food served'),
|
||||||
|
|
||||||
|
-- Entertainment
|
||||||
|
('O1.06', 'O1', 'O', 'Family Suitability', 'Appropriate for children and families'),
|
||||||
|
('O1.08', 'O1', 'O', 'Entertainment Value', 'How enjoyable/fun the experience was'),
|
||||||
|
('O1.09', 'O1', 'O', 'Excitement Level', 'Thrill and adrenaline factor'),
|
||||||
|
('O1.11', 'O1', 'O', 'Group Suitability', 'Good for groups/parties'),
|
||||||
|
|
||||||
|
-- Hospitality
|
||||||
|
('E1.09', 'E1', 'E', 'Bathroom Quality', 'Fixtures, water pressure, toiletries'),
|
||||||
|
('E1.10', 'E1', 'E', 'Noise Level', 'Soundproofing, ambient noise'),
|
||||||
|
('E1.11', 'E1', 'E', 'Amenity Quality', 'Pool, gym, spa facilities'),
|
||||||
|
('E2.06', 'E2', 'E', 'WiFi Quality', 'Internet speed and reliability');
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 3: Industry-Specific (As Clients Onboard)
|
||||||
|
Add codes when specific industries become clients.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Coverage Score by Industry
|
||||||
|
|
||||||
|
| Industry | Current Coverage | After Phase 1 | After Phase 2 |
|
||||||
|
|----------|-----------------|---------------|---------------|
|
||||||
|
| Restaurants | 60% | 65% | **90%** |
|
||||||
|
| Hotels | 65% | 70% | **90%** |
|
||||||
|
| Healthcare | 70% | 75% | 80% |
|
||||||
|
| Automotive | 85% | 90% | 90% |
|
||||||
|
| Beauty/Wellness | 75% | 80% | 85% |
|
||||||
|
| Entertainment | 50% | 60% | **90%** |
|
||||||
|
| Retail | 90% | 95% | 95% |
|
||||||
|
| Education | 70% | 75% | 80% |
|
||||||
|
| Home Services | 95% | 95% | 95% |
|
||||||
|
| Travel/Tourism | 60% | 70% | **85%** |
|
||||||
|
| Pet Services | 75% | 80% | 85% |
|
||||||
|
| Nightlife | 55% | 60% | **85%** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
**Is the taxonomy ready for production?**
|
||||||
|
|
||||||
|
| Scenario | Ready? |
|
||||||
|
|----------|--------|
|
||||||
|
| Service businesses (auto, home services) | ✅ Yes |
|
||||||
|
| Retail | ✅ Yes |
|
||||||
|
| Healthcare | ⚠️ Mostly (add Phase 1) |
|
||||||
|
| Restaurants/F&B | ❌ No (need Phase 1+2) |
|
||||||
|
| Hotels | ❌ No (need Phase 1+2) |
|
||||||
|
| Entertainment | ❌ No (need Phase 1+2) |
|
||||||
|
| Nightlife | ❌ No (need Phase 1+2) |
|
||||||
|
|
||||||
|
**Recommended Action**:
|
||||||
|
1. Add Phase 1 codes immediately (4 universal codes)
|
||||||
|
2. Add Phase 2 codes before onboarding hospitality/entertainment clients
|
||||||
|
3. Add Phase 3 codes as specific industries come online
|
||||||
238
packages/reviewiq-pipeline/TAXONOMY_GAPS.md
Normal file
238
packages/reviewiq-pipeline/TAXONOMY_GAPS.md
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
# URT Taxonomy Gap Analysis
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
The current taxonomy has **significant gaps** that cause ~30-40% of review content to be classified as generic codes (V4.03, O1.05) when more specific codes would be appropriate.
|
||||||
|
|
||||||
|
**Current State**: 7 domains, 28 categories, 552 subcodes
|
||||||
|
**Gap Impact**: ~653 reviews (58% of dataset) mention topics without specific codes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Critical Gaps (High Frequency, No Coverage)
|
||||||
|
|
||||||
|
### 🔴 Gap 1: Family/Kids Experience
|
||||||
|
**Mentions**: 205 reviews (18% of dataset)
|
||||||
|
**Current Mapping**: → V4.03 (Generic) or O1.05 (Outcome)
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
| Proposed Code | Name | Definition |
|
||||||
|
|---------------|------|------------|
|
||||||
|
| O1.06 | Family Suitability | Appropriate for children and families |
|
||||||
|
| O1.07 | Age Appropriateness | Suitable for specific age groups |
|
||||||
|
| E1.06 | Child-Friendly Facilities | Amenities for children |
|
||||||
|
|
||||||
|
**Example Reviews Being Misclassified**:
|
||||||
|
- "Brilliant day for adults and kids" → V4.03 (should be O1.06)
|
||||||
|
- "Great family fun" → O1.05 (should be O1.06)
|
||||||
|
- "Los niños disfrutaron mucho" → V4.03 (should be O1.06)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🔴 Gap 2: Fun/Entertainment Value
|
||||||
|
**Mentions**: 198 reviews (18% of dataset)
|
||||||
|
**Current Mapping**: → V4.03 (Generic) or O1.05 (Outcome)
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
| Proposed Code | Name | Definition |
|
||||||
|
|---------------|------|------------|
|
||||||
|
| O1.08 | Entertainment Value | How enjoyable/fun the experience was |
|
||||||
|
| O1.09 | Excitement Level | Thrill and adrenaline factor |
|
||||||
|
| O1.10 | Engagement | How captivating the experience was |
|
||||||
|
|
||||||
|
**Example Reviews Being Misclassified**:
|
||||||
|
- "Everyone had a blast" → V4.03 (should be O1.08)
|
||||||
|
- "Muy divertido" → V4.03 (should be O1.08)
|
||||||
|
- "Fantastische kartbaan" → V4.03 (should be O1.08)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🔴 Gap 3: Recommendation Intent
|
||||||
|
**Mentions**: 103 reviews (9% of dataset)
|
||||||
|
**Current Mapping**: → V4.03 (Generic)
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
| Proposed Code | Name | Definition |
|
||||||
|
|---------------|------|------------|
|
||||||
|
| R1.06 | Would Recommend | Intent to recommend to others |
|
||||||
|
| R1.07 | Would Not Recommend | Explicit anti-recommendation |
|
||||||
|
| V4.06 | Net Promoter Signal | Explicit NPS-style sentiment |
|
||||||
|
|
||||||
|
**Example Reviews Being Misclassified**:
|
||||||
|
- "100% recomendable" → V4.03 (should be R1.06)
|
||||||
|
- "Highly recommend" → V4.03 (should be R1.06)
|
||||||
|
- "Don't come here" → V4.03 V- (should be R1.07)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟡 Gap 4: Return Intent
|
||||||
|
**Mentions**: 65 reviews (6% of dataset)
|
||||||
|
**Current Mapping**: → V4.03 or R4.03
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
| Proposed Code | Name | Definition |
|
||||||
|
|---------------|------|------------|
|
||||||
|
| R1.08 | Will Return | Intent to visit again |
|
||||||
|
| R1.09 | Won't Return | Explicit no-return statement |
|
||||||
|
|
||||||
|
**Example Reviews**:
|
||||||
|
- "We'll definitely be back" → R4.03 (should be R1.08)
|
||||||
|
- "No volveré" → V4.03 (should be R1.09)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟡 Gap 5: Food & Beverage
|
||||||
|
**Mentions**: 59 reviews (5% of dataset)
|
||||||
|
**Current Mapping**: → O1.01 or V4.03
|
||||||
|
|
||||||
|
**Missing Codes**:
|
||||||
|
| Proposed Code | Name | Definition |
|
||||||
|
|---------------|------|------------|
|
||||||
|
| O2.06 | Food Quality | Taste, freshness, presentation |
|
||||||
|
| O2.07 | Drink Quality | Beverage quality |
|
||||||
|
| O2.08 | Menu Variety | Range of food/drink options |
|
||||||
|
| O2.09 | Portion Size | Amount of food served |
|
||||||
|
|
||||||
|
**Example Reviews**:
|
||||||
|
- "Great food at the cafe" → O1.01 (should be O2.06)
|
||||||
|
- "Drinks were overpriced" → V1.01 (should be O2.07 + V1.01)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟡 Gap 6: Excitement/Thrill
|
||||||
|
**Mentions**: 23 reviews (2% of dataset)
|
||||||
|
**Current Mapping**: → V4.03 or O1.05
|
||||||
|
|
||||||
|
**Missing Code** (the same O1.09 code already proposed under Gap 2):
|
||||||
|
| Proposed Code | Name | Definition |
|
||||||
|
|---------------|------|------------|
|
||||||
|
| O1.09 | Excitement Level | Thrill and adrenaline factor |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Medium Gaps (Moderate Frequency)
|
||||||
|
|
||||||
|
### Gap 7: Booking/Reservation Process
|
||||||
|
**Current**: J2.xx exists but limited
|
||||||
|
|
||||||
|
**Missing**:
|
||||||
|
| Code | Name | Definition |
|
||||||
|
|------|------|------------|
|
||||||
|
| J2.06 | Online Booking | Digital reservation experience |
|
||||||
|
| J2.07 | Booking Confirmation | Clear confirmation process |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Gap 8: Group Experience
|
||||||
|
**Missing**:
|
||||||
|
| Code | Name | Definition |
|
||||||
|
|------|------|------------|
|
||||||
|
| O1.11 | Group Suitability | Good for groups/parties |
|
||||||
|
| O1.12 | Team Building | Corporate/team activities |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Gap 9: Seasonal/Weather Factors
|
||||||
|
**Missing**:
|
||||||
|
| Code | Name | Definition |
|
||||||
|
|------|------|------------|
|
||||||
|
| E1.07 | Weather Protection | Shelter from elements |
|
||||||
|
| E1.08 | Seasonal Suitability | Appropriate for season |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Impact Analysis
|
||||||
|
|
||||||
|
### Current Classification Distribution (V4.03 Overuse)
|
||||||
|
|
||||||
|
```
|
||||||
|
Code | Count | % | Issue
|
||||||
|
--------|-------|------|-------
|
||||||
|
P1.01 | 477 | 14% | ✅ Correct usage
|
||||||
|
V4.03 | 319 | 10% | ⚠️ Likely 50%+ misclassified
|
||||||
|
O1.02 | 270 | 8% | ✅ Correct usage
|
||||||
|
V1.01 | 211 | 6% | ✅ Correct usage
|
||||||
|
O1.01 | 174 | 5% | ✅ Correct usage
|
||||||
|
```
|
||||||
|
|
||||||
|
### Estimated Misclassification Rate
|
||||||
|
|
||||||
|
| Gap Topic | Reviews | Est. Misclassified | % of Total |
|
||||||
|
|-----------|---------|-------------------|------------|
|
||||||
|
| Family/Kids | 205 | ~180 | 16% |
|
||||||
|
| Fun/Entertainment | 198 | ~170 | 15% |
|
||||||
|
| Recommendation | 103 | ~95 | 8% |
|
||||||
|
| Return Intent | 65 | ~50 | 4% |
|
||||||
|
| Food/Drinks | 59 | ~40 | 4% |
|
||||||
|
| Excitement | 23 | ~20 | 2% |
|
||||||
|
| **TOTAL** | **653** | **~555** | **~49%** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommended Taxonomy Additions
|
||||||
|
|
||||||
|
### Priority 1: Add to O1 (Core Product/Service)
|
||||||
|
```sql
|
||||||
|
INSERT INTO pipeline.urt_subcodes (code, category_code, domain_code, name, definition) VALUES
|
||||||
|
('O1.06', 'O1', 'O', 'Family Suitability', 'Appropriate for children and families'),
|
||||||
|
('O1.07', 'O1', 'O', 'Age Appropriateness', 'Suitable for specific age groups'),
|
||||||
|
('O1.08', 'O1', 'O', 'Entertainment Value', 'How enjoyable/fun the experience was'),
|
||||||
|
('O1.09', 'O1', 'O', 'Excitement Level', 'Thrill and adrenaline factor'),
|
||||||
|
('O1.10', 'O1', 'O', 'Engagement', 'How captivating the experience was'),
|
||||||
|
('O1.11', 'O1', 'O', 'Group Suitability', 'Good for groups/parties');
|
||||||
|
```
|
||||||
|
|
||||||
|
### Priority 2: Add to R1 (Relationship/Loyalty)
|
||||||
|
```sql
|
||||||
|
INSERT INTO pipeline.urt_subcodes (code, category_code, domain_code, name, definition) VALUES
|
||||||
|
('R1.06', 'R1', 'R', 'Would Recommend', 'Intent to recommend to others'),
|
||||||
|
('R1.07', 'R1', 'R', 'Would Not Recommend', 'Explicit anti-recommendation'),
|
||||||
|
('R1.08', 'R1', 'R', 'Will Return', 'Intent to visit again'),
|
||||||
|
('R1.09', 'R1', 'R', 'Won''t Return', 'Explicit no-return statement');
|
||||||
|
```
|
||||||
|
|
||||||
|
### Priority 3: Add to O2 (Product Features)
|
||||||
|
```sql
|
||||||
|
INSERT INTO pipeline.urt_subcodes (code, category_code, domain_code, name, definition) VALUES
|
||||||
|
('O2.06', 'O2', 'O', 'Food Quality', 'Taste, freshness, presentation of food'),
|
||||||
|
('O2.07', 'O2', 'O', 'Drink Quality', 'Quality of beverages'),
|
||||||
|
('O2.08', 'O2', 'O', 'Menu Variety', 'Range of food/drink options'),
|
||||||
|
('O2.09', 'O2', 'O', 'Portion Size', 'Amount of food served');
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Validation Query
|
||||||
|
|
||||||
|
After adding codes, verify reduction in V4.03 usage:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Before: V4.03 count
|
||||||
|
SELECT COUNT(*) FROM pipeline.review_spans WHERE urt_primary = 'V4.03';
|
||||||
|
-- Expected: ~319
|
||||||
|
|
||||||
|
-- After reclassification, target:
|
||||||
|
-- V4.03: ~100 (true generic)
|
||||||
|
-- O1.06-O1.11: ~200 (entertainment/family)
|
||||||
|
-- R1.06-R1.09: ~150 (recommendation/return)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
**Is the taxonomy ready for production?** ❌ **No**
|
||||||
|
|
||||||
|
**Critical Issues**:
|
||||||
|
1. ~58% of reviews mention topics without specific codes (~49% of all reviews are estimated to be misclassified as a result)
|
||||||
|
2. V4.03 is a catch-all masking actionable insights
|
||||||
|
3. Industry-specific codes (entertainment, F&B) are missing
|
||||||
|
|
||||||
|
**Recommendation**: Add 14 new subcodes before production to capture:
|
||||||
|
- Family/Kids experience (O1.06, O1.07)
|
||||||
|
- Entertainment value (O1.08, O1.09, O1.10) and group suitability (O1.11)
|
||||||
|
- Recommendation intent (R1.06, R1.07)
|
||||||
|
- Return intent (R1.08, R1.09)
|
||||||
|
- Food/Beverage (O2.06-O2.09)
|
||||||
|
|
||||||
|
**Estimated Improvement**: Classification accuracy from ~50% specific to ~85% specific.
|
||||||
@@ -308,11 +308,15 @@ You are a review classifier using primitive-based analysis.
|
|||||||
"spans": [
|
"spans": [
|
||||||
{
|
{
|
||||||
"text": "exact text from review",
|
"text": "exact text from review",
|
||||||
|
"start": 0,
|
||||||
|
"end": 25,
|
||||||
"primitive": "MANNER",
|
"primitive": "MANNER",
|
||||||
"valence": "+",
|
"valence": "+",
|
||||||
"intensity": 2,
|
"intensity": 2,
|
||||||
"detail": 2,
|
"detail": 2,
|
||||||
"confidence": 0.85
|
"confidence": 0.85,
|
||||||
|
"entity": null,
|
||||||
|
"entity_type": null
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -427,13 +431,16 @@ ORDER BY span_count DESC;
|
|||||||
python run_classification_v2.py [OPTIONS]
|
python run_classification_v2.py [OPTIONS]
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--business TEXT Business name or pattern (required for classify/evaluate)
|
--business TEXT Business name or pattern (required for classify/evaluate)
|
||||||
--limit INT Max reviews to process (default: 100)
|
--limit INT Max reviews to process (default: 100)
|
||||||
--dry-run Don't store results to database
|
--dry-run Don't store results to database
|
||||||
--evaluate BUSINESS Evaluate existing classification quality
|
--evaluate BUSINESS Evaluate existing classification quality
|
||||||
--language-analysis Analyze UNMAPPED rates by language across all data
|
--language-analysis Analyze UNMAPPED rates by language across all data
|
||||||
--use-llm Use real LLM classification (default: mock)
|
--ignore-legacy-language Exclude rows with language='auto'/'unknown'/NULL
|
||||||
--model TEXT Model for LLM (default: gpt-4o-mini)
|
--latest-hours INT Only include spans from last N hours
|
||||||
|
--use-existing Use existing spans instead of jobs
|
||||||
|
--use-llm Use real LLM classification (requires OPENAI_API_KEY)
|
||||||
|
--model TEXT Model for LLM (default: gpt-4o-mini)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Models
|
### Models
|
||||||
|
|||||||
401
packages/reviewiq-pipeline/prompts/wave0_sector_brief.md
Normal file
401
packages/reviewiq-pipeline/prompts/wave0_sector_brief.md
Normal file
@@ -0,0 +1,401 @@
|
|||||||
|
# Wave 0: Sector Brief Generation Prompt
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
Generate a **sector brief** that provides alignment context for classification agents. This brief describes what customers care about in this sector — NOT how to classify it, NOT what primitives to use, NOT what solutions exist.
|
||||||
|
|
||||||
|
## Critical Guardrails
|
||||||
|
|
||||||
|
**DO:**
|
||||||
|
- Describe customer concerns in plain language
|
||||||
|
- Use real review language patterns
|
||||||
|
- Focus on what customers judge, complain about, praise
|
||||||
|
- Include industry-specific terminology
|
||||||
|
- Identify mode-specific concerns (dine-in vs delivery, etc.)
|
||||||
|
|
||||||
|
**DO NOT:**
|
||||||
|
- Assign primitive codes
|
||||||
|
- Suggest priorities or weights
|
||||||
|
- Propose solutions or playbooks
|
||||||
|
- Define new categories or dimensions
|
||||||
|
- Include KPIs or metrics
|
||||||
|
- Make up statistics
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Input
|
||||||
|
|
||||||
|
You will receive:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"sector_code": "FOOD_DINING",
|
||||||
|
"sector_name": "Food & Dining",
|
||||||
|
"description": "Restaurants, cafes, bars, bakeries, food trucks, catering services",
|
||||||
|
"sample_business_types": [
|
||||||
|
"Restaurants",
|
||||||
|
"Cafes & Coffee",
|
||||||
|
"Bars & Nightlife",
|
||||||
|
"Bakeries & Desserts",
|
||||||
|
"Food Services",
|
||||||
|
"Quick Service"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Output Schema
|
||||||
|
|
||||||
|
Return ONLY valid JSON matching this schema:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"sector_code": "string",
|
||||||
|
"sector_name": "string",
|
||||||
|
"generated_at": "ISO timestamp",
|
||||||
|
"version": "1.0",
|
||||||
|
|
||||||
|
"what_customers_judge": {
|
||||||
|
"description": "The primary dimensions customers evaluate in this sector",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"aspect": "string (2-5 words)",
|
||||||
|
"importance": "critical | high | moderate",
|
||||||
|
"why_it_matters": "string (1 sentence)"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"critical_pain_points": {
|
||||||
|
"description": "What damages reputation most severely in this sector",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"pain_point": "string (2-5 words)",
|
||||||
|
"typical_language": ["array of phrases customers actually use"],
|
||||||
|
"reputation_impact": "severe | significant | moderate"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"common_praise": {
|
||||||
|
"description": "What earns customer loyalty and positive reviews",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"praise_area": "string (2-5 words)",
|
||||||
|
"typical_language": ["array of phrases customers actually use"],
|
||||||
|
"loyalty_impact": "high | moderate"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"industry_terminology": {
|
||||||
|
"description": "Domain-specific vocabulary used in this sector",
|
||||||
|
"staff_terms": ["waiter", "server", "bartender", "chef"],
|
||||||
|
"product_terms": ["dish", "meal", "appetizer", "entree"],
|
||||||
|
"process_terms": ["reservation", "seating", "check", "tab"],
|
||||||
|
"quality_terms": ["fresh", "authentic", "homemade"],
|
||||||
|
"problem_terms": ["cold", "undercooked", "wrong order"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"mode_specific_concerns": {
|
||||||
|
"description": "Different service modes have different customer priorities",
|
||||||
|
"modes": [
|
||||||
|
{
|
||||||
|
"mode": "string (e.g., 'Dine-in', 'Takeout', 'Delivery')",
|
||||||
|
"primary_concerns": ["array of top concerns for this mode"],
|
||||||
|
"unique_pain_points": ["pain points specific to this mode"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"what_is_actionable": {
|
||||||
|
"description": "Feedback that businesses can realistically act on",
|
||||||
|
"actionable_examples": [
|
||||||
|
{
|
||||||
|
"feedback_type": "string",
|
||||||
|
"example": "string",
|
||||||
|
"action_owner": "string (role/team that can fix it)"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"not_actionable_examples": [
|
||||||
|
{
|
||||||
|
"feedback_type": "string",
|
||||||
|
"example": "string",
|
||||||
|
"why_not_actionable": "string"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"sector_specific_signals": {
|
||||||
|
"description": "Signals that have different meaning in this sector vs others",
|
||||||
|
"examples": [
|
||||||
|
{
|
||||||
|
"signal": "string",
|
||||||
|
"meaning_in_this_sector": "string",
|
||||||
|
"contrast_with": "string (how it differs in other sectors)"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Example Output (Food & Dining)
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"sector_code": "FOOD_DINING",
|
||||||
|
"sector_name": "Food & Dining",
|
||||||
|
"generated_at": "2026-01-31T10:00:00Z",
|
||||||
|
"version": "1.0",
|
||||||
|
|
||||||
|
"what_customers_judge": {
|
||||||
|
"description": "The primary dimensions customers evaluate in this sector",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"aspect": "Food taste and quality",
|
||||||
|
"importance": "critical",
|
||||||
|
"why_it_matters": "The core product - customers primarily visit for the food experience"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aspect": "Service speed and attentiveness",
|
||||||
|
"importance": "critical",
|
||||||
|
"why_it_matters": "Direct impact on dining experience and whether they feel valued"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aspect": "Cleanliness and hygiene",
|
||||||
|
"importance": "critical",
|
||||||
|
"why_it_matters": "Health/safety concern that can override all other positives if failed"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aspect": "Value for money",
|
||||||
|
"importance": "high",
|
||||||
|
"why_it_matters": "Portion size and quality relative to price affects return intent"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aspect": "Ambiance and atmosphere",
|
||||||
|
"importance": "moderate",
|
||||||
|
"why_it_matters": "Sets expectations and affects overall enjoyment, especially for special occasions"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"critical_pain_points": {
|
||||||
|
"description": "What damages reputation most severely in this sector",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"pain_point": "Food safety incidents",
|
||||||
|
"typical_language": ["got sick", "food poisoning", "found hair", "bug in food", "raw chicken"],
|
||||||
|
"reputation_impact": "severe"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pain_point": "Cold or wrong food",
|
||||||
|
"typical_language": ["food was cold", "wrong order", "not what I ordered", "missing items"],
|
||||||
|
"reputation_impact": "significant"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pain_point": "Rude or dismissive staff",
|
||||||
|
"typical_language": ["rude waiter", "ignored us", "attitude", "condescending", "eye roll"],
|
||||||
|
"reputation_impact": "significant"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pain_point": "Excessive wait times",
|
||||||
|
"typical_language": ["waited forever", "40 minutes for food", "never came back", "forgotten"],
|
||||||
|
"reputation_impact": "significant"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pain_point": "Dirty facilities",
|
||||||
|
"typical_language": ["dirty bathroom", "sticky table", "flies everywhere", "unclean"],
|
||||||
|
"reputation_impact": "severe"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"common_praise": {
|
||||||
|
"description": "What earns customer loyalty and positive reviews",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"praise_area": "Exceptional food quality",
|
||||||
|
"typical_language": ["best I've ever had", "delicious", "perfectly cooked", "authentic", "fresh ingredients"],
|
||||||
|
"loyalty_impact": "high"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"praise_area": "Attentive friendly service",
|
||||||
|
"typical_language": ["amazing server", "made us feel welcome", "remembered us", "went above and beyond"],
|
||||||
|
"loyalty_impact": "high"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"praise_area": "Great value",
|
||||||
|
"typical_language": ["huge portions", "great price", "worth every penny", "can't beat it"],
|
||||||
|
"loyalty_impact": "high"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"praise_area": "Perfect ambiance",
|
||||||
|
"typical_language": ["beautiful setting", "romantic", "cozy atmosphere", "perfect for date night"],
|
||||||
|
"loyalty_impact": "moderate"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"industry_terminology": {
|
||||||
|
"description": "Domain-specific vocabulary used in this sector",
|
||||||
|
"staff_terms": ["server", "waiter", "waitress", "host", "hostess", "bartender", "chef", "cook", "manager", "busboy"],
|
||||||
|
"product_terms": ["dish", "meal", "appetizer", "entree", "main course", "dessert", "special", "daily special", "sides"],
|
||||||
|
"process_terms": ["reservation", "walk-in", "seated", "table", "check", "bill", "tab", "tip", "takeout", "to-go", "delivery"],
|
||||||
|
"quality_terms": ["fresh", "homemade", "authentic", "crispy", "tender", "juicy", "flavorful", "seasoned", "cooked to perfection"],
|
||||||
|
"problem_terms": ["cold", "lukewarm", "overcooked", "undercooked", "raw", "burnt", "soggy", "bland", "stale", "greasy"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"mode_specific_concerns": {
|
||||||
|
"description": "Different service modes have different customer priorities",
|
||||||
|
"modes": [
|
||||||
|
{
|
||||||
|
"mode": "Dine-in",
|
||||||
|
"primary_concerns": ["ambiance", "service attentiveness", "wait time to be seated", "table cleanliness"],
|
||||||
|
"unique_pain_points": ["loud neighbors", "rushed out", "ignored by server", "wrong seating"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mode": "Takeout",
|
||||||
|
"primary_concerns": ["order accuracy", "ready on time", "packaging quality", "ease of pickup"],
|
||||||
|
"unique_pain_points": ["order not ready", "missing items", "cold by pickup", "wrong order in bag"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mode": "Delivery",
|
||||||
|
"primary_concerns": ["delivery time", "food temperature", "order accuracy", "packaging integrity"],
|
||||||
|
"unique_pain_points": ["arrived cold", "leaked in bag", "missing sauces", "driver got lost", "late delivery"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mode": "Catering",
|
||||||
|
"primary_concerns": ["on-time setup", "quantity accuracy", "presentation", "dietary accommodation"],
|
||||||
|
"unique_pain_points": ["not enough food", "late arrival", "wrong items", "poor presentation"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"what_is_actionable": {
|
||||||
|
"description": "Feedback that businesses can realistically act on",
|
||||||
|
"actionable_examples": [
|
||||||
|
{
|
||||||
|
"feedback_type": "Specific staff behavior",
|
||||||
|
"example": "Our server John was rude and rolled his eyes when we asked for substitutions",
|
||||||
|
"action_owner": "Front of house manager"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"feedback_type": "Food quality issue",
|
||||||
|
"example": "The chicken was undercooked - pink in the middle",
|
||||||
|
"action_owner": "Kitchen manager / Chef"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"feedback_type": "Facility issue",
|
||||||
|
"example": "Men's bathroom was out of soap and paper towels",
|
||||||
|
"action_owner": "Facilities / Shift manager"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"feedback_type": "Process issue",
|
||||||
|
"example": "Waited 20 minutes to get our check after flagging the server twice",
|
||||||
|
"action_owner": "FOH manager / Training"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"not_actionable_examples": [
|
||||||
|
{
|
||||||
|
"feedback_type": "Subjective taste preference",
|
||||||
|
"example": "I just don't like spicy food",
|
||||||
|
"why_not_actionable": "Personal preference, not a quality issue"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"feedback_type": "Location/parking",
|
||||||
|
"example": "Hard to find parking in this area",
|
||||||
|
"why_not_actionable": "External factor beyond restaurant control"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"feedback_type": "Price objection without context",
|
||||||
|
"example": "Too expensive",
|
||||||
|
"why_not_actionable": "Vague, no specifics on what was mispriced"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"sector_specific_signals": {
|
||||||
|
"description": "Signals that have different meaning in this sector vs others",
|
||||||
|
"examples": [
|
||||||
|
{
|
||||||
|
"signal": "long wait",
|
||||||
|
"meaning_in_this_sector": "Usually negative - food taking too long, being ignored",
|
||||||
|
"contrast_with": "Healthcare: expected and sometimes indicates thoroughness"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"signal": "portion size",
|
||||||
|
"meaning_in_this_sector": "Critical value indicator - directly affects perceived value",
|
||||||
|
"contrast_with": "Healthcare: not applicable"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"signal": "noisy",
|
||||||
|
"meaning_in_this_sector": "Context-dependent - negative for fine dining, expected at sports bars",
|
||||||
|
"contrast_with": "Professional services: always negative"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Sectors to Generate
|
||||||
|
|
||||||
|
Generate one brief for each of these L1 sectors:
|
||||||
|
|
||||||
|
| Code | Sector Name | Sample Business Types |
|
||||||
|
|------|-------------|----------------------|
|
||||||
|
| `FOOD_DINING` | Food & Dining | Restaurants, Cafes, Bars, Bakeries, Catering |
|
||||||
|
| `RETAIL_SHOPPING` | Retail & Shopping | Clothing, Electronics, Grocery, Specialty stores |
|
||||||
|
| `AUTOMOTIVE` | Automotive | Dealers, Repair, Car Wash, Parts |
|
||||||
|
| `HEALTHCARE` | Healthcare | Hospitals, Clinics, Dental, Mental Health, Veterinary |
|
||||||
|
| `PROFESSIONAL_SERVICES` | Professional Services | Legal, Accounting, Consulting, Marketing |
|
||||||
|
| `HOME_SERVICES` | Home Services | Plumbing, Electrical, HVAC, Cleaning, Landscaping |
|
||||||
|
| `PERSONAL_SERVICES` | Personal Services | Salons, Spas, Fitness, Tattoo |
|
||||||
|
| `EDUCATION` | Education | Schools, Tutoring, Driving Schools, Language |
|
||||||
|
| `HOSPITALITY_TRAVEL` | Hospitality & Travel | Hotels, Tours, Travel Agencies |
|
||||||
|
| `ENTERTAINMENT` | Entertainment | Movies, Museums, Amusement Parks, Sports |
|
||||||
|
| `FINANCE_INSURANCE` | Finance & Insurance | Banks, Insurance, Investment, Loans |
|
||||||
|
| `REAL_ESTATE` | Real Estate | Agents, Property Management, Appraisers |
|
||||||
|
| `INDUSTRIAL` | Industrial | Manufacturing, Construction, Warehousing |
|
||||||
|
| `TRANSPORTATION` | Transportation | Taxis, Moving, Shipping, Courier |
|
||||||
|
| `GOVERNMENT` | Government | DMV, Courts, Public Services |
|
||||||
|
| `EVENTS_WEDDINGS` | Events & Weddings | Wedding Venues, Planners, DJ, Photography |
|
||||||
|
| `RELIGIOUS` | Religious | Churches, Temples, Mosques, Spiritual |
|
||||||
|
| `NONPROFIT` | Non-Profit | Charities, Community Organizations |
|
||||||
|
| `TECHNOLOGY` | Technology | IT Services, Software, Web Design |
|
||||||
|
| `PETS_ANIMALS` | Pets & Animals | Pet Stores, Grooming, Boarding, Training |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
This brief will be injected into Wave 1 and Wave 2 prompts as alignment context:
|
||||||
|
|
||||||
|
```
|
||||||
|
You are configuring primitives for: {sector_name}
|
||||||
|
|
||||||
|
## Sector Context (READ-ONLY, do not modify or extend)
|
||||||
|
|
||||||
|
{sector_brief_json}
|
||||||
|
|
||||||
|
## Your Task
|
||||||
|
|
||||||
|
Using the above context to understand what matters in this sector,
|
||||||
|
configure the following primitives...
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
Before returning, verify:
|
||||||
|
- [ ] All arrays have 3-10 items (not empty, not excessive)
|
||||||
|
- [ ] `typical_language` arrays contain realistic review phrases
|
||||||
|
- [ ] No primitive codes, priorities, or solutions appear anywhere
|
||||||
|
- [ ] Industry terminology is accurate for this sector
|
||||||
|
- [ ] Modes are appropriate for the sector (not all sectors have delivery)
|
||||||
|
- [ ] Actionable vs not-actionable distinction is clear
|
||||||
132
packages/reviewiq-pipeline/run_classification.py
Normal file
132
packages/reviewiq-pipeline/run_classification.py
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Run classification pipeline for a scraping job.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python run_classification.py 22c747a6-b913-4ae4-82bc-14b4195008b6
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Configure root logging: INFO level, each record carries timestamp, logger name and level
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger("run_classification")
|
||||||
|
|
||||||
|
|
||||||
|
async def run_pipeline(job_id: str):
    """Run the classification pipeline for a job.

    Builds a pipeline configuration from environment variables, runs the
    normalize, classify, route and aggregate stages for ``job_id`` and logs
    a per-stage summary.

    Args:
        job_id: Identifier of the scraping job whose reviews are processed.

    Returns:
        The result object returned by ``pipeline.process``.

    Raises:
        RuntimeError: If ``ANTHROPIC_API_KEY`` is not set in the environment.
        Exception: Any error raised by the pipeline is logged and re-raised.
    """
    from reviewiq_pipeline import Pipeline
    from reviewiq_pipeline.config import Config

    def _field(stage_result, name, default=None):
        """Read ``name`` from a stage result that is either a dict or an object.

        Falsy-but-present values (``False``, ``0``) are returned as-is; only a
        missing or ``None`` value falls back to ``default``.
        """
        if isinstance(stage_result, dict):
            value = stage_result.get(name)
        else:
            value = getattr(stage_result, name, None)
        return default if value is None else value

    # Get database URL from environment or use default
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql://scraper:scraper123@localhost:5437/scraper"
    )

    # Secrets must never live in source control: read the key from the
    # environment and fail fast with a clear message when it is missing.
    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not anthropic_api_key:
        raise RuntimeError(
            "ANTHROPIC_API_KEY environment variable is required "
            "(llm_provider is 'anthropic')"
        )

    logger.info(f"Processing job {job_id}")

    # Initialize pipeline
    config = Config(
        database_url=database_url,
        llm_provider="anthropic",
        llm_model="claude-sonnet-4-5-20250929",
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        anthropic_api_key=anthropic_api_key,
        classification_batch_size=25,
        classification_max_concurrent=5,
        classification_target_utilization=0.70,
    )

    pipeline = Pipeline(config)

    try:
        await pipeline.initialize()
        logger.info("Pipeline initialized")

        # Run all stages (normalize, classify, route, aggregate)
        # Just pass job_id - pipeline will fetch and transform reviews from database
        logger.info("Starting pipeline execution...")
        start_time = datetime.now()

        result = await pipeline.process(
            {"job_id": job_id},
            stages=["normalize", "classify", "route", "aggregate"],
        )

        elapsed = (datetime.now() - start_time).total_seconds()

        # Print results
        if result.success:
            logger.info(f"Pipeline completed successfully in {elapsed:.1f}s")
        else:
            logger.warning(f"Pipeline completed with errors in {elapsed:.1f}s")
            if result.error:
                logger.error(f"Error: {result.error}")

        # Stage summaries
        for stage_name, stage_result in result.stage_results.items():
            # Stage results may be objects or plain dicts; _field handles both
            # without calling .get() on an object that does not provide it.
            success = _field(stage_result, "success", False)
            data = _field(stage_result, "data", {})
            error = _field(stage_result, "error")
            duration_ms = _field(stage_result, "duration_ms", 0)

            if success:
                stats = data.get("stats", {}) if data else {}

                if stage_name == "normalize":
                    logger.info(f" Stage 1 (Normalize): {stats.get('output_count', '?')} reviews")
                elif stage_name == "classify":
                    logger.info(
                        f" Stage 2 (Classify): {stats.get('success_count', '?')} reviews, "
                        f"{stats.get('total_spans', '?')} spans, "
                        f"${stats.get('llm_cost_usd', 0):.4f} LLM cost"
                    )
                elif stage_name == "route":
                    logger.info(
                        f" Stage 3 (Route): {stats.get('spans_routed', '?')} spans, "
                        f"{stats.get('issues_created', '?')} issues"
                    )
                elif stage_name == "aggregate":
                    logger.info(f" Stage 4 (Aggregate): {stats.get('facts_upserted', '?')} facts")

                logger.info(f" Duration: {duration_ms}ms")
            else:
                logger.error(f" {stage_name}: FAILED - {error}")

        return result

    except Exception as e:
        logger.exception(f"Pipeline failed: {e}")
        raise
    finally:
        # Always release pipeline resources, even when a stage raised.
        await pipeline.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import uuid

    # Exactly one positional argument is expected: the job UUID.
    if len(sys.argv) <= 1:
        print("Usage: python run_classification.py <job_id>")
        sys.exit(1)

    job_id = sys.argv[1]

    # Reject anything that does not parse as a UUID before starting the pipeline.
    try:
        uuid.UUID(job_id)
    except ValueError:
        print(f"Invalid job ID format: {job_id}")
        sys.exit(1)

    result = asyncio.run(run_pipeline(job_id))

    # A result that reports failure becomes a non-zero exit status.
    if result:
        if not result.success:
            sys.exit(1)
|
||||||
409
packages/reviewiq-pipeline/scripts/backfill_review_facts.py
Normal file
409
packages/reviewiq-pipeline/scripts/backfill_review_facts.py
Normal file
@@ -0,0 +1,409 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Backfill review_facts_v1 from public.jobs.reviews_data.
|
||||||
|
|
||||||
|
Parses relative timestamps ("17 hours ago", "2 weeks ago") into absolute
|
||||||
|
timestamps anchored to job.created_at.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python backfill_review_facts.py
|
||||||
|
python backfill_review_facts.py --dry-run
|
||||||
|
python backfill_review_facts.py --job-id <uuid>
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
# Database URL
|
||||||
|
DB_URL = os.environ.get(
|
||||||
|
"DATABASE_URL",
|
||||||
|
"postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# RELATIVE TIMESTAMP PARSER
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Regex patterns for relative timestamps
|
||||||
|
RELATIVE_PATTERNS = [
|
||||||
|
# "17 hours ago", "2 weeks ago", "a month ago"
|
||||||
|
(r"(?:edited\s+)?(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", "standard"),
|
||||||
|
# "just now"
|
||||||
|
(r"just\s+now", "just_now"),
|
||||||
|
# "yesterday"
|
||||||
|
(r"yesterday", "yesterday"),
|
||||||
|
# "today"
|
||||||
|
(r"today", "today"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Time unit multipliers (in seconds)
|
||||||
|
TIME_UNITS = {
|
||||||
|
"second": 1,
|
||||||
|
"minute": 60,
|
||||||
|
"hour": 3600,
|
||||||
|
"day": 86400,
|
||||||
|
"week": 604800,
|
||||||
|
"month": 2592000, # 30 days
|
||||||
|
"year": 31536000, # 365 days
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_relative_timestamp(raw: str, reference_time: datetime) -> datetime | None:
|
||||||
|
"""
|
||||||
|
Parse a relative timestamp string into an absolute datetime.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw: Relative timestamp like "17 hours ago", "Edited 2 weeks ago"
|
||||||
|
reference_time: The reference point (usually job.created_at)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Absolute datetime or None if parsing failed
|
||||||
|
"""
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
|
||||||
|
text = raw.lower().strip()
|
||||||
|
|
||||||
|
# Handle "just now"
|
||||||
|
if "just now" in text:
|
||||||
|
return reference_time
|
||||||
|
|
||||||
|
# Handle "yesterday"
|
||||||
|
if text == "yesterday":
|
||||||
|
return reference_time - timedelta(days=1)
|
||||||
|
|
||||||
|
# Handle "today"
|
||||||
|
if text == "today":
|
||||||
|
return reference_time
|
||||||
|
|
||||||
|
# Handle standard relative format
|
||||||
|
# Remove "edited " prefix if present
|
||||||
|
text = re.sub(r"^edited\s+", "", text)
|
||||||
|
|
||||||
|
# Match "N unit(s) ago"
|
||||||
|
match = re.match(r"(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", text)
|
||||||
|
if match:
|
||||||
|
quantity_str = match.group(1)
|
||||||
|
unit = match.group(2)
|
||||||
|
|
||||||
|
# Convert "a"/"an" to 1
|
||||||
|
if quantity_str in ("a", "an"):
|
||||||
|
quantity = 1
|
||||||
|
else:
|
||||||
|
quantity = int(quantity_str)
|
||||||
|
|
||||||
|
seconds = quantity * TIME_UNITS.get(unit, 0)
|
||||||
|
return reference_time - timedelta(seconds=seconds)
|
||||||
|
|
||||||
|
# Unknown format
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_relative_timestamp_safe(raw: str, reference_time: datetime) -> tuple[datetime | None, bool]:
    """
    Never-raising wrapper around parse_relative_timestamp.

    Returns:
        (parsed_time, success) where success is True only when a datetime
        was produced; any exception from the parser yields (None, False).
    """
    try:
        parsed = parse_relative_timestamp(raw, reference_time)
    except Exception:
        # Treat every parser error as an unparseable timestamp so that one
        # malformed review cannot abort a backfill run.
        return None, False
    return parsed, parsed is not None
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# BACKFILL LOGIC
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
async def get_jobs_with_reviews(pool: asyncpg.Pool, job_id: str | None = None) -> list[dict]:
    """
    Fetch jobs whose reviews_data column holds a JSON array.

    Args:
        pool: Connection pool used to run the query.
        job_id: When truthy, restrict the result to this job; otherwise all
            qualifying jobs are returned, newest first.

    Returns:
        One dict per row with job_id, created_at, reviews_data and
        business_id (the metadata business_name, falling back to url).
    """
    if not job_id:
        all_rows = await pool.fetch("""
            SELECT job_id, created_at, reviews_data,
                   COALESCE(metadata->>'business_name', url) as business_id
            FROM public.jobs
            WHERE reviews_data IS NOT NULL
              AND jsonb_typeof(reviews_data) = 'array'
            ORDER BY created_at DESC
        """)
        return [dict(record) for record in all_rows]

    single_job_rows = await pool.fetch("""
        SELECT job_id, created_at, reviews_data,
               COALESCE(metadata->>'business_name', url) as business_id
        FROM public.jobs
        WHERE job_id = $1
          AND reviews_data IS NOT NULL
          AND jsonb_typeof(reviews_data) = 'array'
    """, job_id)
    return [dict(record) for record in single_job_rows]
|
||||||
|
|
||||||
|
|
||||||
|
async def get_run_id_for_job(pool: asyncpg.Pool, job_id: str) -> str | None:
    """
    Look up a run_id recorded for a job in pipeline.detected_spans_v2.

    Returns:
        The run_id as a string, or None when the job has no span row with a
        non-NULL run_id. The query uses LIMIT 1 without ORDER BY, so if a
        job has several runs, which one comes back is not specified.
    """
    span_row = await pool.fetchrow("""
        SELECT DISTINCT run_id FROM pipeline.detected_spans_v2
        WHERE job_id = $1 AND run_id IS NOT NULL
        LIMIT 1
    """, job_id)
    if not span_row or not span_row["run_id"]:
        return None
    return str(span_row["run_id"])
|
||||||
|
|
||||||
|
|
||||||
|
async def get_language_for_review(pool: asyncpg.Pool, review_id: str) -> str | None:
    """
    Look up the language stored on any span of the given review.

    Returns:
        The language value of one span row with a non-NULL language, or
        None when the review has no such row.
    """
    span_row = await pool.fetchrow("""
        SELECT language FROM pipeline.detected_spans_v2
        WHERE review_id = $1 AND language IS NOT NULL
        LIMIT 1
    """, review_id)
    if not span_row:
        return None
    return span_row["language"]
|
||||||
|
|
||||||
|
|
||||||
|
async def upsert_review_facts(
    pool: asyncpg.Pool,
    facts: list[dict],
    dry_run: bool = False,
) -> tuple[int, int]:
    """
    Insert or update rows in pipeline.review_facts_v1, keyed on review_id.

    On conflict every column is overwritten, except run_id and language,
    which keep their stored value when the incoming value is NULL.

    Args:
        pool: Connection pool used for the batch statement.
        facts: Dicts that must contain review_id, business_id and job_id;
            the remaining columns are optional and default to None.
        dry_run: When True nothing is written.

    Returns:
        (submitted_count, 0). Inserts and updates are not told apart, so the
        second element is always 0; (0, 0) on dry run or empty input.
    """
    if dry_run or not facts:
        return 0, 0

    upsert_sql = """
        INSERT INTO pipeline.review_facts_v1
        (review_id, business_id, job_id, run_id, rating, review_time_utc, raw_timestamp, author, language)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
        ON CONFLICT (review_id) DO UPDATE SET
            business_id = EXCLUDED.business_id,
            job_id = EXCLUDED.job_id,
            run_id = COALESCE(EXCLUDED.run_id, pipeline.review_facts_v1.run_id),
            rating = EXCLUDED.rating,
            review_time_utc = EXCLUDED.review_time_utc,
            raw_timestamp = EXCLUDED.raw_timestamp,
            author = EXCLUDED.author,
            language = COALESCE(EXCLUDED.language, pipeline.review_facts_v1.language)
    """

    # Column order must match the placeholder order in the statement above.
    required_keys = ("review_id", "business_id", "job_id")
    optional_keys = ("run_id", "rating", "review_time_utc", "raw_timestamp", "author", "language")

    rows = [
        tuple(fact[key] for key in required_keys) + tuple(fact.get(key) for key in optional_keys)
        for fact in facts
    ]

    await pool.executemany(upsert_sql, rows)
    return len(rows), 0
|
||||||
|
|
||||||
|
|
||||||
|
async def backfill_job(
    pool: asyncpg.Pool,
    job: dict,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Backfill review facts for a single job.

    Each review's relative timestamp is resolved against the job's
    created_at; reviews whose timestamp cannot be parsed are still written,
    with a NULL review_time_utc.

    Args:
        pool: Connection pool for lookups and the final upsert.
        job: Row dict with job_id, created_at, business_id and reviews_data.
        dry_run: When True no rows are written and per-review language
            lookups are skipped.
        verbose: When True a one-line summary is printed for the job.

    Returns:
        Stats dict with job_id, total_reviews, parsed_ok, parsed_failed,
        inserted and up to five sample_failures. total_reviews counts every
        element of reviews_data, including entries that were skipped.
    """
    job_id = job["job_id"]
    job_created = job["created_at"]
    business_id = job["business_id"]
    reviews_data = job["reviews_data"]

    # asyncpg may return JSONB as string
    if isinstance(reviews_data, str):
        reviews_data = json.loads(reviews_data)

    # Make job_created timezone-aware if it isn't
    if job_created.tzinfo is None:
        job_created = job_created.replace(tzinfo=timezone.utc)

    # Get run_id for this job
    run_id = await get_run_id_for_job(pool, str(job_id))

    stats = {
        "job_id": str(job_id),
        "total_reviews": 0,
        "parsed_ok": 0,
        "parsed_failed": 0,
        "inserted": 0,
        "sample_failures": [],
    }

    facts = []

    for review in reviews_data:
        stats["total_reviews"] += 1

        # Handle both dict and JSON string
        if isinstance(review, str):
            try:
                review = json.loads(review)
            except json.JSONDecodeError:
                continue

        # An array element (or a decoded string) can be any JSON value; only
        # object-shaped entries carry review fields. Skipping the rest keeps
        # one malformed element from aborting the whole backfill.
        if not isinstance(review, dict):
            continue

        review_id = review.get("review_id")
        if not review_id:
            continue

        raw_timestamp = review.get("timestamp", "")
        review_time, success = parse_relative_timestamp_safe(raw_timestamp, job_created)

        if success:
            stats["parsed_ok"] += 1
        else:
            stats["parsed_failed"] += 1
            if len(stats["sample_failures"]) < 5:
                stats["sample_failures"].append(raw_timestamp)

        # Get language from spans if available
        language = await get_language_for_review(pool, review_id) if not dry_run else None

        facts.append({
            "review_id": review_id,
            "business_id": business_id,
            "job_id": job_id,
            "run_id": run_id,
            "rating": review.get("rating"),
            "review_time_utc": review_time,
            "raw_timestamp": raw_timestamp,
            "author": review.get("author"),
            "language": language,
        })

    # Upsert
    inserted, _ = await upsert_review_facts(pool, facts, dry_run=dry_run)
    stats["inserted"] = inserted

    if verbose:
        print(f" Job {job_id}: {stats['total_reviews']} reviews, "
              f"{stats['parsed_ok']} parsed OK, {stats['parsed_failed']} failed")
        if stats["sample_failures"]:
            print(f" Sample failures: {stats['sample_failures'][:3]}")

    return stats
|
||||||
|
|
||||||
|
|
||||||
|
async def backfill_all(
    pool: asyncpg.Pool,
    job_id: str | None = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Run backfill_job over every qualifying job, or over one job when
    job_id is given, and total up the per-job stats.

    Returns:
        Dict with jobs_processed, total_reviews, parsed_ok, parsed_failed,
        inserted, and unique_failure_patterns (a list of at most 20 distinct
        unparsed timestamp strings).
    """
    jobs = await get_jobs_with_reviews(pool, job_id)

    print(f"\n{'[DRY RUN] ' if dry_run else ''}Backfilling review_facts_v1 from {len(jobs)} jobs...")

    aggregate = {
        "jobs_processed": 0,
        "total_reviews": 0,
        "parsed_ok": 0,
        "parsed_failed": 0,
        "inserted": 0,
    }
    summed_keys = ("total_reviews", "parsed_ok", "parsed_failed", "inserted")
    failure_patterns = set()

    for position, job in enumerate(jobs, start=1):
        if verbose:
            print(f"\n[{position}/{len(jobs)}] Processing job {job['job_id']}...")

        job_stats = await backfill_job(pool, job, dry_run=dry_run, verbose=verbose)

        aggregate["jobs_processed"] += 1
        for key in summed_keys:
            aggregate[key] += job_stats[key]
        failure_patterns.update(job_stats["sample_failures"])

    # Stored as a capped list rather than a set so the result can be
    # serialised to JSON.
    aggregate["unique_failure_patterns"] = list(failure_patterns)[:20]

    return aggregate
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# CLI
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
async def main_async(args):
    """
    Open a pool on DB_URL, run the backfill, and print a summary report.

    Args:
        args: Parsed CLI namespace providing job_id, dry_run and verbose.

    The pool is closed even when the backfill raises.
    """
    pool = await asyncpg.create_pool(DB_URL)

    try:
        stats = await backfill_all(
            pool,
            job_id=args.job_id,
            dry_run=args.dry_run,
            verbose=args.verbose,
        )

        # Guard the denominator so an empty run reports 0.0% instead of
        # dividing by zero.
        denominator = max(stats['total_reviews'], 1)
        parsed_pct = stats['parsed_ok'] / denominator * 100
        failed_pct = stats['parsed_failed'] / denominator * 100
        rule = "=" * 60

        print("\n" + rule)
        print("BACKFILL COMPLETE")
        print(rule)
        print(f"Jobs processed: {stats['jobs_processed']}")
        print(f"Total reviews: {stats['total_reviews']}")
        print(f"Timestamps parsed: {stats['parsed_ok']} ({parsed_pct:.1f}%)")
        print(f"Timestamps failed: {stats['parsed_failed']} ({failed_pct:.1f}%)")
        if not args.dry_run:
            print(f"Records upserted: {stats['inserted']}")

        unparsed = stats["unique_failure_patterns"]
        if unparsed:
            print(f"\nUnparsed timestamp patterns ({len(unparsed)}):")
            for pattern in unparsed[:10]:
                print(f" - \"{pattern}\"")

        # Coverage is the share of reviews whose timestamp was parsed.
        coverage = parsed_pct
        if coverage >= 90:
            print(f"\n✅ Timestamp coverage: {coverage:.1f}%")
        else:
            print(f"\n⚠️ WARNING: Timestamp coverage is {coverage:.1f}% (target: >90%)")

    finally:
        await pool.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command line and run the async backfill to completion."""
    parser = argparse.ArgumentParser(description="Backfill review_facts_v1")

    # (flags, keyword options) for each supported argument.
    argument_specs = (
        (("--job-id",), {"help": "Process a specific job only"}),
        (("--dry-run",), {"action": "store_true", "help": "Don't write to database"}),
        (("--verbose", "-v"), {"action": "store_true", "help": "Verbose output"}),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    asyncio.run(main_async(parser.parse_args()))


if __name__ == "__main__":
    main()
|
||||||
226
packages/reviewiq-pipeline/scripts/config_resolver_standalone.py
Normal file
226
packages/reviewiq-pipeline/scripts/config_resolver_standalone.py
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
"""
|
||||||
|
Config Resolver - Standalone version for scripts.
|
||||||
|
|
||||||
|
Resolves L1 config + sector brief for classification.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Paths
|
||||||
|
DATA_DIR = Path(__file__).parent.parent / "data"
|
||||||
|
CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
|
||||||
|
L2_CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l2"
|
||||||
|
BRIEFS_DIR = DATA_DIR / "sector_briefs"
|
||||||
|
|
||||||
|
# Meta primitives - always enabled
|
||||||
|
META_PRIMITIVES = frozenset([
|
||||||
|
"HONESTY", "ETHICS", "PROMISES",
|
||||||
|
"ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
|
||||||
|
"RETURN_INTENT", "RECOMMEND", "RECOGNITION",
|
||||||
|
"UNMAPPED",
|
||||||
|
])
|
||||||
|
|
||||||
|
# Core primitives dictionary
|
||||||
|
CORE_PRIMITIVES = {
|
||||||
|
"TASTE": {"domain": "O", "name": "Taste/Flavor", "def": "Sensory quality of food/beverage"},
|
||||||
|
"CRAFT": {"domain": "O", "name": "Craftsmanship", "def": "Skill of execution/preparation"},
|
||||||
|
"FRESHNESS": {"domain": "O", "name": "Freshness", "def": "Newness, not stale or old"},
|
||||||
|
"TEMPERATURE": {"domain": "O", "name": "Temperature", "def": "Hot/cold as expected"},
|
||||||
|
"EFFECTIVENESS": {"domain": "O", "name": "Effectiveness", "def": "Achieves intended purpose"},
|
||||||
|
"ACCURACY": {"domain": "O", "name": "Accuracy", "def": "Correct, as ordered/specified"},
|
||||||
|
"CONDITION": {"domain": "O", "name": "Condition", "def": "Physical state, wear, damage"},
|
||||||
|
"CONSISTENCY": {"domain": "O", "name": "Consistency", "def": "Same quality each time"},
|
||||||
|
"MANNER": {"domain": "P", "name": "Manner/Attitude", "def": "Friendliness, respect, warmth"},
|
||||||
|
"COMPETENCE": {"domain": "P", "name": "Competence", "def": "Knowledge and skill of staff"},
|
||||||
|
"ATTENTIVENESS": {"domain": "P", "name": "Attentiveness", "def": "Being present, responsive"},
|
||||||
|
"COMMUNICATION": {"domain": "P", "name": "Communication", "def": "Clarity, listening, updates"},
|
||||||
|
"SPEED": {"domain": "J", "name": "Speed/Wait", "def": "Time to service, waiting"},
|
||||||
|
"FRICTION": {"domain": "J", "name": "Friction", "def": "Obstacles, hassles, complexity"},
|
||||||
|
"RELIABILITY": {"domain": "J", "name": "Reliability", "def": "Dependable, keeps promises"},
|
||||||
|
"AVAILABILITY": {"domain": "J", "name": "Availability", "def": "Open when needed, bookable"},
|
||||||
|
"CLEANLINESS": {"domain": "E", "name": "Cleanliness", "def": "Hygiene, tidiness"},
|
||||||
|
"COMFORT": {"domain": "E", "name": "Comfort", "def": "Physical ease, seating"},
|
||||||
|
"SAFETY": {"domain": "E", "name": "Safety", "def": "Free from harm/danger"},
|
||||||
|
"AMBIANCE": {"domain": "E", "name": "Ambiance", "def": "Atmosphere, mood, vibe"},
|
||||||
|
"ACCESSIBILITY": {"domain": "E", "name": "Accessibility", "def": "Easy to reach, navigate"},
|
||||||
|
"DIGITAL_UX": {"domain": "E", "name": "Digital Experience", "def": "Website, app, online"},
|
||||||
|
"PRICE_LEVEL": {"domain": "V", "name": "Price Level", "def": "Absolute cost (cheap/expensive)"},
|
||||||
|
"PRICE_FAIRNESS": {"domain": "V", "name": "Price Fairness", "def": "Reasonable for what you get"},
|
||||||
|
"PRICE_TRANSPARENCY": {"domain": "V", "name": "Price Transparency", "def": "No hidden fees, clear pricing"},
|
||||||
|
"VALUE_FOR_MONEY": {"domain": "V", "name": "Value for Money", "def": "Worth what you paid"},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigResolver:
|
||||||
|
"""Resolves classification config for a business."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._l1_cache: dict[str, dict] = {}
|
||||||
|
self._l2_cache: dict[str, dict] = {}
|
||||||
|
self._brief_cache: dict[str, dict] = {}
|
||||||
|
|
||||||
|
def _load_l2_configs(self) -> list[dict[str, Any]]:
|
||||||
|
"""Load all L2 config files."""
|
||||||
|
if not L2_CONFIGS_DIR.exists():
|
||||||
|
return []
|
||||||
|
|
||||||
|
configs = []
|
||||||
|
for config_path in L2_CONFIGS_DIR.glob("*_config.json"):
|
||||||
|
try:
|
||||||
|
with open(config_path) as f:
|
||||||
|
config = json.load(f)
|
||||||
|
configs.append(config)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to load L2 config {config_path}: {e}")
|
||||||
|
return configs
|
||||||
|
|
||||||
|
def _find_matching_l2(self, gbp_path: str) -> dict[str, Any] | None:
|
||||||
|
"""Find L2 config that matches the GBP path (most specific wins)."""
|
||||||
|
l2_configs = self._load_l2_configs()
|
||||||
|
|
||||||
|
# Find all matching configs (path starts with L2 gbp_path)
|
||||||
|
matches = []
|
||||||
|
for config in l2_configs:
|
||||||
|
l2_path = config.get("gbp_path", "")
|
||||||
|
if gbp_path.startswith(l2_path) or gbp_path == l2_path:
|
||||||
|
matches.append((len(l2_path), config))
|
||||||
|
|
||||||
|
if not matches:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Return most specific match (longest path)
|
||||||
|
matches.sort(key=lambda x: x[0], reverse=True)
|
||||||
|
return matches[0][1]
|
||||||
|
|
||||||
|
def _apply_l2_delta(self, l1_config: dict, l2_config: dict) -> dict:
|
||||||
|
"""Apply L2 delta to L1 config."""
|
||||||
|
result = l1_config.copy()
|
||||||
|
delta = l2_config.get("delta", {})
|
||||||
|
|
||||||
|
# Enable additional primitives
|
||||||
|
if "enable" in delta:
|
||||||
|
enabled = set(result.get("enabled", []))
|
||||||
|
enabled.update(delta["enable"])
|
||||||
|
result["enabled"] = list(enabled)
|
||||||
|
|
||||||
|
# Merge weights
|
||||||
|
if "weights" in delta:
|
||||||
|
weights = dict(result.get("weights", {}))
|
||||||
|
weights.update(delta["weights"])
|
||||||
|
result["weights"] = weights
|
||||||
|
|
||||||
|
# Update config version to indicate L2
|
||||||
|
result["config_version"] = l2_config.get("config_version", result.get("config_version", "1.0"))
|
||||||
|
result["l2_applied"] = l2_config.get("gbp_path")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _load_l1_config(self, sector_code: str) -> dict[str, Any] | None:
|
||||||
|
if sector_code in self._l1_cache:
|
||||||
|
return self._l1_cache[sector_code]
|
||||||
|
|
||||||
|
config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
|
||||||
|
if not config_path.exists():
|
||||||
|
return None
|
||||||
|
|
||||||
|
with open(config_path) as f:
|
||||||
|
config = json.load(f)
|
||||||
|
|
||||||
|
self._l1_cache[sector_code] = config
|
||||||
|
return config
|
||||||
|
|
||||||
|
def _load_sector_brief(self, sector_code: str) -> dict[str, Any] | None:
|
||||||
|
if sector_code in self._brief_cache:
|
||||||
|
return self._brief_cache[sector_code]
|
||||||
|
|
||||||
|
brief_path = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
|
||||||
|
if not brief_path.exists():
|
||||||
|
return None
|
||||||
|
|
||||||
|
with open(brief_path) as f:
|
||||||
|
brief = json.load(f)
|
||||||
|
|
||||||
|
self._brief_cache[sector_code] = brief
|
||||||
|
return brief
|
||||||
|
|
||||||
|
async def get_business_mapping(self, pool, business_id: str) -> dict[str, Any] | None:
|
||||||
|
query = """
|
||||||
|
SELECT business_id, gbp_path::text, sector_code
|
||||||
|
FROM pipeline.business_taxonomy_map
|
||||||
|
WHERE business_id = $1
|
||||||
|
"""
|
||||||
|
row = await pool.fetchrow(query, business_id)
|
||||||
|
return dict(row) if row else None
|
||||||
|
|
||||||
|
def resolve_enabled_set(self, l1_config: dict) -> set[str]:
|
||||||
|
enabled = set(l1_config.get("enabled", []))
|
||||||
|
enabled.update(META_PRIMITIVES)
|
||||||
|
return enabled
|
||||||
|
|
||||||
|
def build_primitives_for_prompt(self, enabled: set[str], weights: dict[str, float]) -> dict[str, dict]:
|
||||||
|
result = {}
|
||||||
|
for prim in enabled:
|
||||||
|
if prim in CORE_PRIMITIVES:
|
||||||
|
entry = CORE_PRIMITIVES[prim].copy()
|
||||||
|
if prim in weights:
|
||||||
|
entry["weight"] = weights[prim]
|
||||||
|
result[prim] = entry
|
||||||
|
elif prim in META_PRIMITIVES:
|
||||||
|
result[prim] = {"domain": "M", "name": prim.replace("_", " ").title(), "meta": True}
|
||||||
|
return result
|
||||||
|
|
||||||
|
def extract_brief_signals(self, brief: dict) -> dict[str, Any]:
|
||||||
|
if not brief:
|
||||||
|
return {}
|
||||||
|
return {
|
||||||
|
"sector": brief.get("sector_code"),
|
||||||
|
"what_customers_judge": brief.get("what_customers_judge"),
|
||||||
|
"critical_pain_points": brief.get("critical_pain_points"),
|
||||||
|
"industry_terminology": brief.get("industry_terminology"),
|
||||||
|
}
|
||||||
|
|
||||||
|
async def resolve(self, business_id: str, pool, mode: str | None = None) -> dict[str, Any] | None:
|
||||||
|
mapping = await self.get_business_mapping(pool, business_id)
|
||||||
|
if not mapping:
|
||||||
|
return None
|
||||||
|
|
||||||
|
sector_code = mapping["sector_code"]
|
||||||
|
gbp_path = mapping["gbp_path"]
|
||||||
|
|
||||||
|
# Load L1 config (sector-level)
|
||||||
|
l1_config = self._load_l1_config(sector_code)
|
||||||
|
if not l1_config:
|
||||||
|
l1_config = {"enabled": list(CORE_PRIMITIVES.keys()), "weights": {}}
|
||||||
|
|
||||||
|
# Check for L2 config (category-level delta)
|
||||||
|
l2_config = self._find_matching_l2(gbp_path)
|
||||||
|
if l2_config:
|
||||||
|
logger.info(f"Applying L2 delta for {gbp_path}: {l2_config.get('gbp_path')}")
|
||||||
|
l1_config = self._apply_l2_delta(l1_config, l2_config)
|
||||||
|
|
||||||
|
brief = self._load_sector_brief(sector_code)
|
||||||
|
|
||||||
|
enabled = self.resolve_enabled_set(l1_config)
|
||||||
|
weights = dict(l1_config.get("weights", {}))
|
||||||
|
primitives = self.build_primitives_for_prompt(enabled, weights)
|
||||||
|
brief_signals = self.extract_brief_signals(brief)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"business_id": business_id,
|
||||||
|
"gbp_path": gbp_path,
|
||||||
|
"sector_code": sector_code,
|
||||||
|
"config_version": l1_config.get("config_version", "1.0"),
|
||||||
|
"l2_applied": l1_config.get("l2_applied"),
|
||||||
|
"modes": [mode] if mode else ["in_person"],
|
||||||
|
"default_mode": mode or "in_person",
|
||||||
|
"enabled_primitives": sorted(enabled),
|
||||||
|
"disabled_primitives": sorted(l1_config.get("disabled", [])),
|
||||||
|
"weights": weights,
|
||||||
|
"brief": brief_signals,
|
||||||
|
"primitives": primitives,
|
||||||
|
}
|
||||||
148
packages/reviewiq-pipeline/scripts/fix_l1_configs.py
Normal file
148
packages/reviewiq-pipeline/scripts/fix_l1_configs.py
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fix L1 configs based on validation results.
|
||||||
|
|
||||||
|
Applies fixes discovered during validation:
|
||||||
|
1. Enable primitives that were disabled but appearing frequently
|
||||||
|
2. Remove weights for primitives with zero appearances
|
||||||
|
3. Add weights for high-frequency unweighted primitives
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
CONFIGS_DIR = Path(__file__).parent.parent / "data" / "primitive_configs" / "l1"
|
||||||
|
|
||||||
|
# Fixes based on validation results
|
||||||
|
# Format: { sector: { "enable": [primitives], "disable": [primitives], "add_weight": {prim: weight}, "remove_weight": [prims] } }
|
||||||
|
FIXES = {
|
||||||
|
"ENTERTAINMENT": {
|
||||||
|
"enable": ["CRAFT", "CONSISTENCY", "COMMUNICATION", "FRICTION"],
|
||||||
|
"disable": [],
|
||||||
|
"add_weight": {},
|
||||||
|
"remove_weight": ["CONDITION"], # 0 appearances despite 1.4x weight
|
||||||
|
},
|
||||||
|
"FOOD_DINING": {
|
||||||
|
"enable": ["PRICE_LEVEL", "ACCESSIBILITY", "PRICE_TRANSPARENCY", "FRICTION", "EFFECTIVENESS"],
|
||||||
|
"disable": [],
|
||||||
|
"add_weight": {},
|
||||||
|
"remove_weight": [],
|
||||||
|
},
|
||||||
|
"AUTOMOTIVE": {
|
||||||
|
"enable": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL", "AMBIANCE"],
|
||||||
|
"disable": [],
|
||||||
|
"add_weight": {},
|
||||||
|
"remove_weight": [],
|
||||||
|
},
|
||||||
|
"HEALTHCARE": {
|
||||||
|
"enable": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
|
||||||
|
"disable": [],
|
||||||
|
"add_weight": {},
|
||||||
|
"remove_weight": [],
|
||||||
|
},
|
||||||
|
"RETAIL_SHOPPING": {
|
||||||
|
"enable": ["CRAFT", "PRICE_LEVEL", "AMBIANCE"],
|
||||||
|
"disable": [],
|
||||||
|
"add_weight": {},
|
||||||
|
"remove_weight": [],
|
||||||
|
},
|
||||||
|
"HOSPITALITY_TRAVEL": {
|
||||||
|
"enable": ["CRAFT", "CONSISTENCY", "PRICE_LEVEL"],
|
||||||
|
"disable": [],
|
||||||
|
"add_weight": {},
|
||||||
|
"remove_weight": [],
|
||||||
|
},
|
||||||
|
"PERSONAL_SERVICES": {
|
||||||
|
"enable": ["PRICE_LEVEL", "SPEED", "FRICTION"],
|
||||||
|
"disable": [],
|
||||||
|
"add_weight": {},
|
||||||
|
"remove_weight": [],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def fix_config(sector_code: str, fixes: dict) -> list[str] | None:
    """
    Apply fixes to a sector config and write it back in place.

    Args:
        sector_code: Sector whose "<sector_code lowercased>_config.json" file
            under CONFIGS_DIR is edited.
        fixes: Dict with optional keys "enable" and "disable" (lists of
            primitives), "add_weight" (primitive -> weight) and
            "remove_weight" (list of primitives).

    Returns:
        Human-readable descriptions of the changes that took effect, or None
        when the config file does not exist. Whenever the file exists it is
        rewritten with sorted lists and config_version "1.1", even if no
        individual change applied.
    """
    config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"

    if not config_path.exists():
        print(f" ⚠️ Config not found: {config_path}")
        return None

    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})

    changes = []

    # Apply enables (move from disabled to enabled)
    for prim in fixes.get("enable", []):
        if prim in disabled:
            disabled.remove(prim)
            enabled.add(prim)
            changes.append(f"✓ Enabled {prim}")
        elif prim not in enabled:
            enabled.add(prim)
            changes.append(f"✓ Added {prim} to enabled")

    # Apply disables (move from enabled to disabled)
    for prim in fixes.get("disable", []):
        if prim in enabled:
            enabled.remove(prim)
            disabled.add(prim)
            changes.append(f"✗ Disabled {prim}")

    # Add weights (an existing weight is never overwritten)
    for prim, weight in fixes.get("add_weight", {}).items():
        if prim not in weights:
            weights[prim] = weight
            changes.append(f"⚖️ Added weight {prim}: {weight}x")

    # Remove weights
    for prim in fixes.get("remove_weight", []):
        if prim in weights:
            del weights[prim]
            changes.append(f"⚖️ Removed weight for {prim}")

    # Update config
    config["enabled"] = sorted(enabled)
    config["disabled"] = sorted(disabled)
    config["weights"] = dict(sorted(weights.items()))
    config["config_version"] = "1.1"  # Bump version

    # Save
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
        f.write("\n")

    return changes
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Apply every entry of FIXES to its sector config and print a report."""
    rule = "=" * 60
    print(rule)
    print("L1 CONFIG FIXER - Applying validation-based fixes")
    print(rule)

    total_changes = 0

    for sector, fixes in FIXES.items():
        print(f"\n📁 {sector}")
        applied = fix_config(sector, fixes)
        # A missing config (None) and a config that needed nothing (empty
        # list) are reported the same way.
        if not applied:
            print(" No changes applied")
            continue
        for description in applied:
            print(f" {description}")
        total_changes += len(applied)

    print(f"\n{'=' * 60}")
    print(f"Total changes applied: {total_changes}")
    print("Config version bumped to 1.1")
    print(rule)


if __name__ == "__main__":
    main()
|
||||||
238
packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
Normal file
238
packages/reviewiq-pipeline/scripts/fix_l1_configs_v2.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Guarded L1 Config Fixer - V2 (Threshold-based, Sector-scoped)
|
||||||
|
|
||||||
|
Only applies fixes when:
|
||||||
|
1. Evidence is from sector-scoped validation
|
||||||
|
2. Frequency exceeds threshold (default 3%)
|
||||||
|
3. Changes are logged with version bump
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python fix_l1_configs_v2.py --apply # Apply fixes from validation
|
||||||
|
python fix_l1_configs_v2.py --dry-run # Show what would change
|
||||||
|
python fix_l1_configs_v2.py --revert SECTOR # Revert to previous version
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
CONFIGS_DIR = Path(__file__).parent.parent / "data" / "primitive_configs" / "l1"
|
||||||
|
CHANGELOG_FILE = CONFIGS_DIR / "CHANGELOG.json"
|
||||||
|
|
||||||
|
# Minimum threshold for auto-enabling (% of sector spans)
|
||||||
|
ENABLE_THRESHOLD_PCT = 3.0
|
||||||
|
|
||||||
|
# Fixes derived from sector-scoped validation (validate_l1_configs_v2.py output)
|
||||||
|
# These are the ONLY fixes that should be applied
|
||||||
|
SECTOR_SCOPED_FIXES = {
|
||||||
|
"ENTERTAINMENT": {
|
||||||
|
"evidence": "2,320 spans from Go Karts + Soho Club",
|
||||||
|
"enable": [
|
||||||
|
("TASTE", 4.3, "Entertainment venues have concessions/food service"),
|
||||||
|
],
|
||||||
|
"add_weight": [
|
||||||
|
("CRAFT", 1.3, "13.4% frequency but unweighted"),
|
||||||
|
],
|
||||||
|
"remove_weight": [],
|
||||||
|
},
|
||||||
|
"FOOD_DINING": {
|
||||||
|
"evidence": "61 spans from Fika cafe",
|
||||||
|
"enable": [
|
||||||
|
("COMFORT", 9.8, "Seating/atmosphere comfort matters for cafes"),
|
||||||
|
],
|
||||||
|
"add_weight": [
|
||||||
|
("AVAILABILITY", 1.2, "16.4% frequency but unweighted"),
|
||||||
|
],
|
||||||
|
"remove_weight": [
|
||||||
|
# Note: Small sample size (61 spans) - these may be false negatives
|
||||||
|
# Keep weights but flag for review with more data
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"AUTOMOTIVE": {
|
||||||
|
"evidence": "1,201 spans from ClickRent car rental",
|
||||||
|
"enable": [], # Nothing exceeds 3% threshold
|
||||||
|
"add_weight": [],
|
||||||
|
"remove_weight": [
|
||||||
|
# CONDITION, HONESTY, PROMISES, RECOVERY all have 0 appearances
|
||||||
|
# However, may be specific to rental vs repair - keep for now
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_changelog() -> list[dict]:
    """Return the parsed contents of CHANGELOG_FILE, or [] when it does not exist."""
    if not CHANGELOG_FILE.exists():
        return []
    with open(CHANGELOG_FILE) as handle:
        entries = json.load(handle)
    return entries
|
||||||
|
|
||||||
|
|
||||||
|
def save_changelog(entries: list[dict]) -> None:
    """Write entries to CHANGELOG_FILE as indented JSON followed by a newline.

    The parent directory is created first if it is missing.
    """
    target = CHANGELOG_FILE
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w") as handle:
        json.dump(entries, handle, indent=2)
        handle.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(sector_code: str) -> dict[str, Any] | None:
|
||||||
|
"""Load a sector config."""
|
||||||
|
config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
|
||||||
|
if not config_path.exists():
|
||||||
|
return None
|
||||||
|
with open(config_path) as f:
|
||||||
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def save_config(sector_code: str, config: dict[str, Any]) -> None:
    """Write config to `<sector_code lowercased>_config.json` in CONFIGS_DIR.

    Output is indented JSON terminated by a single newline.
    """
    location = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    with open(location, "w") as handle:
        json.dump(config, handle, indent=2)
        handle.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def _next_config_version(version: Any) -> str:
    """Return version with its last dot-separated component incremented by one.

    "1.0" -> "1.1", "1.9" -> "1.10". A version without a dot is treated as
    having minor 0 ("2" -> "2.1"). Raises ValueError when the last component
    is not a plain non-negative integer.
    """
    head, dot, tail = str(version).rpartition(".")
    if not dot:
        head, tail = tail, "0"
    if not head or not tail.isdigit():
        raise ValueError(
            f"Cannot bump config version {version!r}: expected MAJOR.MINOR"
        )
    return f"{head}.{int(tail) + 1}"


def apply_fixes(sector_code: str, fixes: dict, dry_run: bool = False) -> list[str]:
    """Apply one sector's fix set to its config and describe what happened.

    Args:
        sector_code: Sector whose config is loaded via load_config.
        fixes: Mapping with optional keys "evidence" (str),
            "enable" (list of (primitive, pct, reason)),
            "add_weight" (list of (primitive, weight, reason)) and
            "remove_weight" (list of (primitive, reason)).
        dry_run: When True, nothing is written; only the would-be changes
            are returned.

    Returns:
        Human-readable lines: one per applied change or skipped primitive,
        plus a version line when the config was actually written. A single
        "not found" or "no changes needed" line is returned in those cases.

    Raises:
        ValueError: if the stored config_version cannot be bumped.
    """
    config = load_config(sector_code)
    if not config:
        return [f"❌ Config not found for {sector_code}"]

    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = dict(config.get("weights", {}))

    changes = []
    # Skip notes are reported to the caller but must not count as a change:
    # only real modifications justify a version bump and a changelog entry.
    modified = False
    evidence = fixes.get("evidence", "unknown")

    # Enable primitives that clear the frequency threshold
    for prim, pct, reason in fixes.get("enable", []):
        if pct < ENABLE_THRESHOLD_PCT:
            changes.append(f"⚠️ SKIP {prim}: {pct:.1f}% below {ENABLE_THRESHOLD_PCT}% threshold")
            continue

        if prim in disabled:
            disabled.remove(prim)
            enabled.add(prim)
            changes.append(f"✓ ENABLE {prim}: {pct:.1f}% in sector data ({reason})")
            modified = True
        elif prim not in enabled:
            enabled.add(prim)
            changes.append(f"✓ ADD {prim}: {pct:.1f}% in sector data ({reason})")
            modified = True

    # Add weights (never overwrite an existing weight)
    for prim, weight, reason in fixes.get("add_weight", []):
        if prim not in weights:
            weights[prim] = weight
            changes.append(f"⚖️ WEIGHT {prim}: {weight}x ({reason})")
            modified = True

    # Remove weights
    for prim, reason in fixes.get("remove_weight", []):
        if prim in weights:
            del weights[prim]
            changes.append(f"⚖️ UNWEIGHT {prim}: ({reason})")
            modified = True

    if not changes:
        return ["✓ No changes needed"]

    if modified and not dry_run:
        old_version = config.get("config_version", "1.0")
        new_version = _next_config_version(old_version)
        # One timestamp so the config and its changelog entry agree exactly.
        stamp = datetime.now(timezone.utc).isoformat()

        config["enabled"] = sorted(enabled)
        config["disabled"] = sorted(disabled)
        config["weights"] = dict(sorted(weights.items()))
        config["config_version"] = new_version
        config["config_updated_at"] = stamp

        save_config(sector_code, config)

        # Log to changelog
        changelog = load_changelog()
        changelog.append({
            "sector": sector_code,
            "version": new_version,
            "previous_version": old_version,
            "timestamp": stamp,
            "evidence": evidence,
            "changes": changes,
        })
        save_changelog(changelog)

        changes.append(f"📝 Version: {old_version} → {new_version}")

    return changes
|
||||||
|
|
||||||
|
|
||||||
|
def revert_config(sector_code: str, to_version: str | None = None) -> list[str]:
|
||||||
|
"""Revert a config to a previous version."""
|
||||||
|
changelog = load_changelog()
|
||||||
|
|
||||||
|
# Find entries for this sector
|
||||||
|
sector_entries = [e for e in changelog if e["sector"] == sector_code]
|
||||||
|
if not sector_entries:
|
||||||
|
return [f"❌ No changelog entries for {sector_code}"]
|
||||||
|
|
||||||
|
# TODO: Implement actual revert by storing full config snapshots
|
||||||
|
return [f"⚠️ Revert not yet implemented - manual restore required"]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point for the guarded L1 config fixer.

    Modes are checked in this order: --show-changelog, --revert, then
    --apply / --dry-run (dry run wins when both are given). With no mode
    flag the help text is printed.
    """
    parser = argparse.ArgumentParser(description="Guarded L1 config fixer")
    parser.add_argument("--apply", action="store_true", help="Apply sector-scoped fixes")
    parser.add_argument("--dry-run", action="store_true", help="Show what would change")
    parser.add_argument("--revert", metavar="SECTOR", help="Revert sector to previous version")
    parser.add_argument("--sector", help="Apply to specific sector only")
    parser.add_argument("--show-changelog", action="store_true", help="Show changelog")

    options = parser.parse_args()

    if options.show_changelog:
        print(json.dumps(load_changelog(), indent=2))
        return

    if options.revert:
        for line in revert_config(options.revert.upper()):
            print(line)
        return

    if not (options.apply or options.dry_run):
        parser.print_help()
        return

    rule = "=" * 60
    mode_label = "DRY RUN" if options.dry_run else "APPLYING FIXES"
    print(rule)
    print(f"L1 CONFIG FIXER V2 - {mode_label}")
    print(f"Threshold: {ENABLE_THRESHOLD_PCT}%")
    print(rule)

    if options.sector:
        targets = [options.sector.upper()]
    else:
        targets = SECTOR_SCOPED_FIXES.keys()

    for target in targets:
        if target not in SECTOR_SCOPED_FIXES:
            print(f"\n⚠️ {target}: No sector-scoped fixes defined")
            continue

        sector_fixes = SECTOR_SCOPED_FIXES[target]
        print(f"\n📁 {target}")
        print(f" Evidence: {sector_fixes['evidence']}")

        for line in apply_fixes(target, sector_fixes, dry_run=options.dry_run):
            print(f" {line}")

    print("\n" + rule)
    if options.dry_run:
        print("DRY RUN - No changes applied")
    else:
        print("Fixes applied - see CHANGELOG.json for history")
    print(rule)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
372
packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
Normal file
372
packages/reviewiq-pipeline/scripts/generate_sector_briefs.py
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Wave 0: Sector Brief Generator
|
||||||
|
|
||||||
|
Generates alignment context briefs for each sector.
|
||||||
|
These briefs inform Wave 1 and Wave 2 primitive config generation.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python generate_sector_briefs.py # Generate all sectors
|
||||||
|
python generate_sector_briefs.py --sector FOOD_DINING # Generate one sector
|
||||||
|
python generate_sector_briefs.py --dry-run # Show what would be generated
|
||||||
|
python generate_sector_briefs.py --validate # Validate existing briefs
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
from openai import OpenAI
|
||||||
|
except ImportError:
|
||||||
|
print("ERROR: openai package required. Install with: pip install openai")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
PROMPT_TEMPLATE = '''You are an expert in customer experience analysis across industries.
|
||||||
|
|
||||||
|
Your task: Generate a **sector brief** for the "{sector_name}" sector.
|
||||||
|
|
||||||
|
This brief will be used to align classification agents with industry-specific context.
|
||||||
|
It describes what customers care about — NOT how to classify, NOT what primitives to use.
|
||||||
|
|
||||||
|
## Sector Information
|
||||||
|
|
||||||
|
- **Code**: {sector_code}
|
||||||
|
- **Name**: {sector_name}
|
||||||
|
- **Description**: {description}
|
||||||
|
- **Sample Business Types**: {business_types}
|
||||||
|
|
||||||
|
## Output Requirements
|
||||||
|
|
||||||
|
Generate a JSON object with this exact structure:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{{
|
||||||
|
"sector_code": "{sector_code}",
|
||||||
|
"sector_name": "{sector_name}",
|
||||||
|
"generated_at": "<ISO timestamp>",
|
||||||
|
"version": "1.0",
|
||||||
|
|
||||||
|
"what_customers_judge": {{
|
||||||
|
"description": "The primary dimensions customers evaluate in this sector",
|
||||||
|
"items": [
|
||||||
|
{{
|
||||||
|
"aspect": "string (2-5 words)",
|
||||||
|
"importance": "critical | high | moderate",
|
||||||
|
"why_it_matters": "string (1 sentence)"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}},
|
||||||
|
|
||||||
|
"critical_pain_points": {{
|
||||||
|
"description": "What damages reputation most severely",
|
||||||
|
"items": [
|
||||||
|
{{
|
||||||
|
"pain_point": "string (2-5 words)",
|
||||||
|
"typical_language": ["phrases customers actually use in reviews"],
|
||||||
|
"reputation_impact": "severe | significant | moderate"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}},
|
||||||
|
|
||||||
|
"common_praise": {{
|
||||||
|
"description": "What earns customer loyalty and positive reviews",
|
||||||
|
"items": [
|
||||||
|
{{
|
||||||
|
"praise_area": "string (2-5 words)",
|
||||||
|
"typical_language": ["phrases customers actually use in reviews"],
|
||||||
|
"loyalty_impact": "high | moderate"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}},
|
||||||
|
|
||||||
|
"industry_terminology": {{
|
||||||
|
"description": "Domain-specific vocabulary",
|
||||||
|
"staff_terms": ["terms for staff roles in this sector"],
|
||||||
|
"product_terms": ["terms for products/services"],
|
||||||
|
"process_terms": ["terms for processes/interactions"],
|
||||||
|
"quality_terms": ["positive quality descriptors"],
|
||||||
|
"problem_terms": ["negative quality descriptors"]
|
||||||
|
}},
|
||||||
|
|
||||||
|
"mode_specific_concerns": {{
|
||||||
|
"description": "Different service modes have different priorities",
|
||||||
|
"modes": [
|
||||||
|
{{
|
||||||
|
"mode": "string (e.g., 'In-person', 'Online', 'Phone')",
|
||||||
|
"primary_concerns": ["top concerns for this mode"],
|
||||||
|
"unique_pain_points": ["pain points specific to this mode"]
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}},
|
||||||
|
|
||||||
|
"what_is_actionable": {{
|
||||||
|
"description": "Feedback businesses can act on",
|
||||||
|
"actionable_examples": [
|
||||||
|
{{
|
||||||
|
"feedback_type": "string",
|
||||||
|
"example": "string (realistic review excerpt)",
|
||||||
|
"action_owner": "role/team that can fix it"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"not_actionable_examples": [
|
||||||
|
{{
|
||||||
|
"feedback_type": "string",
|
||||||
|
"example": "string (realistic review excerpt)",
|
||||||
|
"why_not_actionable": "string"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}},
|
||||||
|
|
||||||
|
"sector_specific_signals": {{
|
||||||
|
"description": "Signals with sector-specific meaning",
|
||||||
|
"examples": [
|
||||||
|
{{
|
||||||
|
"signal": "string (word or phrase)",
|
||||||
|
"meaning_in_this_sector": "string",
|
||||||
|
"contrast_with": "how it differs in other sectors"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Critical Rules
|
||||||
|
|
||||||
|
1. **Use realistic review language** in `typical_language` arrays - actual phrases customers write
|
||||||
|
2. **Include 4-8 items** per array (not too few, not excessive)
|
||||||
|
3. **Be sector-specific** - don't use generic phrases that apply to all businesses
|
||||||
|
4. **Include appropriate modes** - only modes that actually exist in this sector
|
||||||
|
5. **NO primitive codes, priorities, weights, or solutions**
|
||||||
|
6. **Focus on WHAT customers care about**, not HOW to classify it
|
||||||
|
|
||||||
|
Return ONLY the JSON object, no markdown formatting or explanation.'''
|
||||||
|
|
||||||
|
|
||||||
|
def load_sectors(data_path: Path) -> list[dict]:
    """Parse the JSON file at data_path and return its top-level "sectors" value."""
    with open(data_path) as source:
        return json.load(source)["sectors"]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_sector_brief(client: OpenAI, sector: dict, model: str) -> dict:
    """Ask the model for a sector brief and return it as a dict.

    Args:
        client: OpenAI client used for the chat completion call.
        sector: Sector definition; must provide "sector_code", "sector_name",
            "description" and "sample_business_types" (an iterable of str).
        model: Model name passed through to the API.

    Returns:
        The parsed JSON object from the model, with "sector_code",
        "sector_name", "generated_at" and "version" overwritten locally so
        they never depend on what the model produced.

    Raises:
        ValueError: if the model returns no text or JSON that is not an
            object (json.JSONDecodeError, a ValueError subclass, is raised
            for malformed JSON).
    """
    # Local import: the module only imports `datetime` from the datetime package.
    from datetime import timezone

    prompt = PROMPT_TEMPLATE.format(
        sector_code=sector["sector_code"],
        sector_name=sector["sector_name"],
        description=sector["description"],
        business_types=", ".join(sector["sample_business_types"])
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert customer experience analyst. Return only valid JSON, no markdown."
            },
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        max_tokens=4000,
        response_format={"type": "json_object"}
    )

    # The API may return no text content at all; fail with a clear message
    # instead of an AttributeError on None.
    content = response.choices[0].message.content
    text = content.strip() if content else ""
    if not text:
        raise ValueError(
            f"Model returned no content for sector {sector['sector_code']}"
        )

    # Parse JSON
    brief = json.loads(text)
    if not isinstance(brief, dict):
        raise ValueError(
            f"Model returned JSON of type {type(brief).__name__} for sector "
            f"{sector['sector_code']}; expected an object"
        )

    # Ensure required fields
    brief["sector_code"] = sector["sector_code"]
    brief["sector_name"] = sector["sector_name"]
    # Same "...Z" format as before, without the deprecated datetime.utcnow().
    brief["generated_at"] = (
        datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )
    brief["version"] = "1.0"

    return brief
|
||||||
|
|
||||||
|
|
||||||
|
def validate_brief(brief: dict) -> list[str]:
    """Validate a sector brief and return a list of human-readable issues.

    Checks, in order:
      1. every required top-level key is present;
      2. the item-bearing sections hold an acceptable number of items;
      3. the serialized brief does not mention config-level vocabulary.

    A section that is present but is not an object with an "items" list is
    reported as an issue rather than raising. An empty list means the brief
    passed every check.
    """
    issues = []

    required_keys = [
        "what_customers_judge",
        "critical_pain_points",
        "common_praise",
        "industry_terminology",
        "mode_specific_concerns",
        "what_is_actionable",
        "sector_specific_signals"
    ]

    for key in required_keys:
        if key not in brief:
            issues.append(f"Missing required key: {key}")

    # (section key, minimum item count, maximum item count or None for no cap)
    item_limits = [
        ("what_customers_judge", 3, 10),
        ("critical_pain_points", 3, None),
        ("common_praise", 3, None),
    ]

    for key, minimum, maximum in item_limits:
        if key not in brief:
            continue  # already reported as missing above
        section = brief[key]
        items = section.get("items", []) if isinstance(section, dict) else None
        if not isinstance(items, list):
            # Model output is untrusted: report the malformed shape instead of crashing.
            issues.append(f"{key} must be an object with an 'items' list")
            continue
        if len(items) < minimum:
            issues.append(f"{key} has only {len(items)} items (need {minimum}+)")
        if maximum is not None and len(items) > maximum:
            issues.append(f"{key} has {len(items)} items (max {maximum})")

    # Check for forbidden content (substring match on the lowercased JSON text)
    text = json.dumps(brief).lower()
    forbidden = ["priority", "weight", "primitive", "enabled", "disabled", "solution"]
    for word in forbidden:
        if word == "solution":
            continue  # solution can appear in context
        if word in text:
            issues.append(f"Contains potentially forbidden term: {word}")

    return issues
|
||||||
|
|
||||||
|
|
||||||
|
def save_brief(brief: dict, output_dir: Path) -> Path:
    """Write brief as indented JSON into output_dir and return the file path.

    The directory is created if needed. The file is named after the brief's
    lowercased "sector_code" with a `_brief.json` suffix.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = output_dir / f"{brief['sector_code'].lower()}_brief.json"

    with open(destination, "w") as handle:
        json.dump(brief, handle, indent=2)

    return destination
|
||||||
|
|
||||||
|
|
||||||
|
def validate_existing_briefs(output_dir: Path) -> None:
    """Run validate_brief over every `*_brief.json` in output_dir and print results.

    Files are processed in sorted order. Each gets a ✓ or ✗ line followed by
    its issues, and a one-line overall verdict is printed at the end. Prints
    a notice and returns early when the directory is missing or holds no
    brief files.
    """
    if not output_dir.exists():
        print(f"Output directory does not exist: {output_dir}")
        return

    brief_paths = sorted(output_dir.glob("*_brief.json"))
    if len(brief_paths) == 0:
        print("No brief files found")
        return

    print(f"Validating {len(brief_paths)} brief files...\n")

    failing = 0
    for path in brief_paths:
        with open(path) as handle:
            loaded = json.load(handle)

        problems = validate_brief(loaded)
        mark = "✗" if problems else "✓"
        print(f"{mark} {path.name}")

        if problems:
            failing += 1
        for problem in problems:
            print(f" - {problem}")

    print()
    verdict = "All briefs valid!" if failing == 0 else "Some briefs have issues."
    print(verdict)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point for sector brief generation.

    --validate checks existing briefs and exits. Otherwise sectors are
    loaded from `data/sectors.json` next to the scripts directory,
    optionally narrowed with --sector, and either listed (--dry-run) or
    generated, validated and saved one by one. Exits with status 1 when the
    requested sector is unknown, when OPENAI_API_KEY is unset, or when any
    sector fails.
    """
    parser = argparse.ArgumentParser(description="Generate sector briefs for Wave 0")
    parser.add_argument("--sector", help="Generate only this sector code")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--validate", action="store_true", help="Validate existing briefs")
    parser.add_argument("--output-dir", default="data/sector_briefs", help="Output directory")
    parser.add_argument("--model", default="gpt-4o", help="OpenAI model to use")
    options = parser.parse_args()

    # Paths are resolved relative to the package that contains this script
    package_root = Path(__file__).parent.parent
    sectors_file = package_root / "data" / "sectors.json"
    briefs_dir = package_root / options.output_dir

    if options.validate:
        validate_existing_briefs(briefs_dir)
        return

    selected = load_sectors(sectors_file)
    print(f"Loaded {len(selected)} sectors")

    if options.sector:
        selected = [entry for entry in selected if entry["sector_code"] == options.sector]
        if not selected:
            print(f"ERROR: Sector '{options.sector}' not found")
            sys.exit(1)

    if options.dry_run:
        print("\n[DRY RUN] Would generate briefs for:")
        for entry in selected:
            print(f" - {entry['sector_code']}: {entry['sector_name']}")
        print(f"\nOutput directory: {briefs_dir}")
        return

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: OPENAI_API_KEY environment variable required")
        sys.exit(1)

    client = OpenAI(api_key=api_key)
    print(f"Using model: {options.model}")

    succeeded = []
    failed = []
    total = len(selected)

    for position, entry in enumerate(selected, 1):
        print(f"\n[{position}/{total}] Generating brief for: {entry['sector_name']}")

        try:
            brief = generate_sector_brief(client, entry, options.model)

            # Validation problems are warnings only; the brief is still saved
            warnings = validate_brief(brief)
            if warnings:
                print(" Warnings:")
                for warning in warnings:
                    print(f" - {warning}")

            saved_to = save_brief(brief, briefs_dir)
            print(f" ✓ Saved to: {saved_to}")
            succeeded.append(entry["sector_code"])

        except Exception as error:
            print(f" ✗ FAILED: {error}")
            failed.append(entry["sector_code"])

    rule = "=" * 60
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f"Success: {len(succeeded)}")
    print(f"Failed: {len(failed)}")

    if failed:
        print(f"\nFailed sectors: {', '.join(failed)}")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
523
packages/reviewiq-pipeline/scripts/llm_classifier.py
Normal file
523
packages/reviewiq-pipeline/scripts/llm_classifier.py
Normal file
@@ -0,0 +1,523 @@
|
|||||||
|
"""
|
||||||
|
LLM Classifier - Real classification using OpenAI Responses API.
|
||||||
|
|
||||||
|
Uses JSON Schema to enforce strict output format.
|
||||||
|
Validates primitives against enabled set.
|
||||||
|
Stores raw response for audit.
|
||||||
|
Supports multilingual reviews with language detection.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# Language detection - try langdetect, fall back to heuristics
|
||||||
|
try:
|
||||||
|
from langdetect import detect as langdetect_detect, LangDetectException
|
||||||
|
LANGDETECT_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
LANGDETECT_AVAILABLE = False
|
||||||
|
LangDetectException = Exception # Placeholder
|
||||||
|
|
||||||
|
|
||||||
|
def _latin_language_scores(text: str) -> dict[str, int]:
    """Score Latin-script text against Spanish, English, German and French cues.

    Each language gets one point per listed word that occurs as a whole word,
    plus a bonus for characters typical of that language. Keys are inserted in
    the order es, en, de, fr so that ties resolve in that order.
    """
    text_lower = text.lower()

    def count_hits(words):
        # Whole-word matches only, so "es" does not fire inside "yes".
        return sum(1 for w in words if re.search(rf'\b{w}\b', text_lower))

    # Spanish indicators (expanded for better detection)
    es_score = count_hits([
        'el', 'la', 'los', 'las', 'de', 'que', 'es', 'en', 'un', 'una',
        'muy', 'pero', 'con', 'está', 'están', 'para', 'por', 'como',
        'excelente', 'recomendado', 'servicio', 'bueno', 'malo', 'bien',
        'todo', 'nada', 'más', 'sin', 'nunca', 'siempre', 'también',
    ])
    # Spanish-specific patterns (accents, ñ, inverted punctuation)
    if 'ñ' in text_lower or '¿' in text or '¡' in text:
        es_score += 3
    if any(c in text_lower for c in 'áéíóúü'):
        es_score += 1

    # English indicators
    en_score = count_hits([
        'the', 'and', 'is', 'are', 'was', 'were', 'this', 'that',
        'with', 'for', 'but', 'not', 'very', 'great', 'good',
        'service', 'place', 'food', 'staff', 'friendly', 'amazing',
        'would', 'recommend', 'will', 'definitely', 'really',
    ])

    # German indicators
    de_score = count_hits([
        'der', 'die', 'das', 'und', 'ist', 'sind', 'war', 'sehr',
        'mit', 'für', 'aber', 'nicht', 'ein', 'eine', 'wir', 'ich',
        'auch', 'gut', 'schlecht', 'toll', 'super',
    ])
    # German umlauts
    if any(c in text_lower for c in 'äöüß'):
        de_score += 2

    # French indicators
    fr_score = count_hits([
        'le', 'la', 'les', 'est', 'sont', 'très', 'mais', 'avec',
        'pour', 'pas', 'un', 'une', 'et', 'nous', 'vous', 'bien',
        'bon', 'mauvais', 'excellent', 'super', "c'est", "j'ai",
    ])
    # French accents and patterns
    if any(c in text_lower for c in 'àâçèêëîïôùûÿœæ'):
        fr_score += 2

    return {'es': es_score, 'en': en_score, 'de': de_score, 'fr': fr_score}


def detect_language(text: str) -> tuple[str, float]:
    """
    Detect the language of a text.

    Returns (language_code, confidence).

    When langdetect is available its result is returned with a confidence
    estimated from text length (langdetect gives none directly). If it is
    unavailable or raises, a script/keyword heuristic is used instead; that
    fallback can only return zh, ja, ko, ru, ar, es, en, de, fr or "unknown".

    Text that is empty or shorter than 3 characters after stripping yields
    ("unknown", 0.0).
    """
    if not text or len(text.strip()) < 3:
        return "unknown", 0.0

    text = text.strip()

    # Try langdetect first (most accurate)
    if LANGDETECT_AVAILABLE:
        try:
            lang = langdetect_detect(text)
            # langdetect doesn't provide confidence directly, estimate based on text length
            confidence = min(0.95, 0.5 + len(text) / 200)
            return lang, confidence
        except LangDetectException:
            pass

    # Fallback: simple heuristic detection based on character ranges.
    # Less accurate, but works without dependencies.

    # Count characters in different scripts
    latin = sum(1 for c in text if '\u0041' <= c <= '\u024F')  # Latin extended
    cyrillic = sum(1 for c in text if '\u0400' <= c <= '\u04FF')  # Cyrillic
    cjk = sum(1 for c in text if '\u4E00' <= c <= '\u9FFF')  # CJK Unified
    japanese = sum(1 for c in text if '\u3040' <= c <= '\u30FF')  # Hiragana + Katakana
    korean = sum(1 for c in text if '\uAC00' <= c <= '\uD7AF')  # Hangul
    arabic = sum(1 for c in text if '\u0600' <= c <= '\u06FF')  # Arabic

    total = len(text)
    if total == 0:
        return "unknown", 0.0

    # Determine primary script.
    # Kana must be checked BEFORE the CJK ideograph test: Japanese mixes kanji
    # (in the CJK Unified range) with kana, so testing ideographs first
    # classified ordinary Japanese sentences as Chinese. Kana never occur in
    # Chinese text, so any kana in a mostly-CJK text means Japanese.
    if japanese / total > 0.2 or (japanese > 0 and (cjk + japanese) / total > 0.3):
        return "ja", 0.6  # Japanese
    if cjk / total > 0.3:
        return "zh", 0.6  # Chinese
    if korean / total > 0.3:
        return "ko", 0.6  # Korean
    if cyrillic / total > 0.3:
        return "ru", 0.5  # Russian (could be other Cyrillic)
    if arabic / total > 0.3:
        return "ar", 0.5  # Arabic

    if latin / total > 0.5:
        # Latin script - try to distinguish languages by common words
        scores = _latin_language_scores(text)
        best_lang = max(scores, key=scores.get)
        best_score = scores[best_lang]

        if best_score >= 1:  # Lowered threshold
            confidence = min(0.75, 0.3 + best_score * 0.08)
            return best_lang, confidence

        # Default to English for Latin script
        return "en", 0.3

    return "unknown", 0.1
|
||||||
|
|
||||||
|
# Lazy client initialization
|
||||||
|
_client = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_client() -> OpenAI:
    """Return the shared OpenAI client, creating it on the first call.

    The client is built from the OPENAI_API_KEY environment variable and
    cached in the module-level `_client`. Raises RuntimeError when the
    variable is unset or empty.
    """
    global _client
    if _client is not None:
        return _client

    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable not set. "
            "Set it or use --dry-run / mock classifier."
        )

    _client = OpenAI(api_key=key)
    return _client
|
||||||
|
|
||||||
|
# Default model
|
||||||
|
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
|
||||||
|
|
||||||
|
# Meta primitives - always available
|
||||||
|
META_PRIMITIVES = frozenset([
|
||||||
|
"HONESTY", "ETHICS", "PROMISES",
|
||||||
|
"ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
|
||||||
|
"RETURN_INTENT", "RECOMMEND", "RECOGNITION",
|
||||||
|
"UNMAPPED",
|
||||||
|
])
|
||||||
|
|
||||||
|
# JSON Schema for structured output
|
||||||
|
SPAN_SCHEMA = {
|
||||||
|
"name": "review_classification",
|
||||||
|
"strict": True,
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": False,
|
||||||
|
"properties": {
|
||||||
|
"spans": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": False,
|
||||||
|
"properties": {
|
||||||
|
"primitive": {"type": "string"},
|
||||||
|
"valence": {"type": "string", "enum": ["positive", "negative", "mixed", "neutral"]},
|
||||||
|
"intensity": {"type": "integer", "minimum": 1, "maximum": 5},
|
||||||
|
"evidence": {"type": "string"},
|
||||||
|
"start_char": {"type": ["integer", "null"]},
|
||||||
|
"end_char": {"type": ["integer", "null"]},
|
||||||
|
"confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
|
||||||
|
"details": {"type": "null"}
|
||||||
|
},
|
||||||
|
"required": ["primitive", "valence", "intensity", "evidence", "confidence", "start_char", "end_char", "details"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"unmapped": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": False,
|
||||||
|
"properties": {
|
||||||
|
"label": {"type": "string"},
|
||||||
|
"evidence": {"type": "string"},
|
||||||
|
"confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0}
|
||||||
|
},
|
||||||
|
"required": ["label", "evidence", "confidence"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["spans", "unmapped"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# System prompt
|
||||||
|
SYSTEM_PROMPT = """You are a review classification system that extracts semantic spans and maps them to primitives.
|
||||||
|
|
||||||
|
## RULES (MUST FOLLOW)
|
||||||
|
|
||||||
|
1. Use ONLY primitives from the enabled_primitives list provided. Do NOT invent new primitives.
|
||||||
|
|
||||||
|
2. Meta primitives are always available: HONESTY, ETHICS, PROMISES, ACKNOWLEDGMENT, RESPONSE_QUALITY, RECOVERY, RETURN_INTENT, RECOMMEND, RECOGNITION, UNMAPPED
|
||||||
|
|
||||||
|
3. If content doesn't fit any enabled primitive, use UNMAPPED or put it in the unmapped array with a descriptive label.
|
||||||
|
|
||||||
|
4. Output MUST match the JSON schema exactly. No extra keys.
|
||||||
|
|
||||||
|
5. Evidence must be a SHORT EXACT QUOTE from the review text (in original language).
|
||||||
|
|
||||||
|
6. Extract 1-5 spans per review. Prefer fewer, larger spans over many small ones.
|
||||||
|
|
||||||
|
7. If unsure about classification, lower the confidence score.
|
||||||
|
|
||||||
|
## VALENCE
|
||||||
|
- positive: praise, satisfaction, recommendation
|
||||||
|
- negative: complaint, dissatisfaction, warning
|
||||||
|
- mixed: both positive and negative in same span
|
||||||
|
- neutral: factual observation, no sentiment
|
||||||
|
|
||||||
|
## INTENSITY (1-5)
|
||||||
|
- 1: mild ("okay", "fine")
|
||||||
|
- 2: moderate ("good", "bad")
|
||||||
|
- 3: strong ("great", "terrible")
|
||||||
|
- 4: very strong ("amazing", "awful")
|
||||||
|
- 5: extreme ("best ever", "worst nightmare")
|
||||||
|
|
||||||
|
## CONFIDENCE
|
||||||
|
- 0.9+: Very certain the primitive fits
|
||||||
|
- 0.7-0.9: Confident
|
||||||
|
- 0.5-0.7: Moderate confidence
|
||||||
|
- <0.5: Low confidence (consider UNMAPPED)
|
||||||
|
|
||||||
|
Output valid JSON only. No markdown, no explanations."""
|
||||||
|
|
||||||
|
|
||||||
|
def compute_review_hash(text: str, config_version: str) -> str:
    """Return a short, stable cache key for a review under a config version.

    The key is the first 16 hex characters of the SHA-256 digest of the
    string ``"<config_version>:<text>"`` (encoded with the default UTF-8).
    """
    digest = hashlib.sha256()
    digest.update(f"{config_version}:{text}".encode())
    return digest.hexdigest()[:16]
|
||||||
|
|
||||||
|
|
||||||
|
def build_user_payload(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
) -> dict[str, Any]:
    """Build the user message payload for the LLM.

    Args:
        review_text: Raw review text, forwarded unchanged.
        rating: Star rating if available, forwarded unchanged.
        config: Resolved config; the keys read here are ``enabled_primitives``,
            ``primitives``, ``brief``, ``weights``, ``business_id``,
            ``sector_code`` and ``config_version``.
        language: Language label placed in the ``review`` section.

    Returns:
        A dict with ``business``, ``enabled_primitives`` (sorted),
        ``primitive_definitions``, ``weights``, ``sector_brief`` and ``review``.
    """
    # Extract only what the model needs; meta primitives are always included.
    enabled = set(config.get("enabled_primitives") or [])
    enabled.update(META_PRIMITIVES)
    enabled_sorted = sorted(enabled)

    # Build primitive definitions (minimal). Iterate in sorted order so the
    # resulting dict - and therefore the serialized prompt - is identical from
    # run to run; iterating the set directly depends on per-process str hashing.
    primitives_dict = config.get("primitives") or {}
    primitive_defs = {}
    for prim in enabled_sorted:
        if prim in primitives_dict:
            info = primitives_dict[prim]
            primitive_defs[prim] = info.get("def", info.get("name", prim))
        elif prim in META_PRIMITIVES:
            primitive_defs[prim] = f"Meta primitive: {prim.replace('_', ' ').lower()}"

    # Extract brief signals (keep it short). Each section may be either a plain
    # list or a dict wrapping the list under "items".
    brief = config.get("brief") or {}
    brief_summary = {}

    judged = brief.get("what_customers_judge")
    if judged:
        if isinstance(judged, dict):
            judged = judged.get("items") or []
        brief_summary["key_judgment_areas"] = [
            item.get("aspect", item.get("area", str(item))) if isinstance(item, dict) else str(item)
            for item in judged[:5]
        ]

    pains = brief.get("critical_pain_points")
    if pains:
        if isinstance(pains, dict):
            pains = pains.get("items") or []
        brief_summary["critical_pains"] = [
            item.get("pain", str(item)) if isinstance(item, dict) else str(item)
            for item in pains[:3]
        ]

    return {
        "business": {
            "name": config.get("business_id"),
            "sector": config.get("sector_code"),
            "config_version": config.get("config_version"),
        },
        "enabled_primitives": enabled_sorted,
        "primitive_definitions": primitive_defs,
        "weights": config.get("weights", {}),
        "sector_brief": brief_summary,
        "review": {
            "text": review_text,
            "rating": rating,
            "language": language,
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def validate_response(
    response: dict[str, Any],
    enabled_primitives: set[str],
) -> tuple[dict[str, Any], list[str]]:
    """
    Validate LLM response and fix invalid primitives.

    Any span whose ``primitive`` is neither in ``enabled_primitives`` nor in
    META_PRIMITIVES is re-labelled ``UNMAPPED`` and a warning is recorded.
    The caller's ``response`` is left untouched: re-labelled spans are copies.

    Args:
        response: Parsed model output; ``spans`` and ``unmapped`` are read.
        enabled_primitives: Primitives enabled for the current config.

    Returns (validated_response, warnings).
    """
    warnings = []
    all_valid = enabled_primitives | META_PRIMITIVES

    validated_spans = []
    # "or []" also covers an explicit null for the array.
    for span in response.get("spans") or []:
        prim = span.get("primitive")
        if prim not in all_valid:
            warnings.append(f"Invalid primitive '{prim}' → UNMAPPED (original: {prim})")
            # Copy instead of assigning into the caller's dict.
            span = {**span, "primitive": "UNMAPPED"}
        validated_spans.append(span)

    return {
        "spans": validated_spans,
        "unmapped": response.get("unmapped") or [],
    }, warnings
|
||||||
|
|
||||||
|
|
||||||
|
def classify_review(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
    model: str | None = None,
    max_retries: int = 3,
) -> dict[str, Any]:
    """
    Classify a single review using OpenAI.

    Args:
        review_text: The review text to classify
        rating: Star rating (1-5) if available
        config: Resolved config from ConfigResolver
        language: Language hint; "auto" runs detect_language on the text
        model: Model to use (falls back to DEFAULT_MODEL when not given)
        max_retries: Maximum number of API attempts on transient errors

    Returns:
        {
            "spans": [...],
            "unmapped": [...],
            "model": str,
            "raw_response": str,
            "review_hash": str,
            "warnings": [...],
            "tokens": {"prompt": int, "completion": int},
            "detected_language": str,
            "language_confidence": float,
        }

        When classification fails, the result holds one low-confidence
        UNMAPPED span covering the first 100 characters of the review, with
        the error recorded in the span's ``details`` and in ``warnings``.
    """
    model = model or DEFAULT_MODEL

    # Detect language if auto; a caller-supplied language is trusted fully.
    if language == "auto":
        detected_lang, lang_confidence = detect_language(review_text)
    else:
        detected_lang, lang_confidence = language, 1.0  # User-specified

    # Build payload with detected language
    payload = build_user_payload(review_text, rating, config, detected_lang)
    user_content = json.dumps(payload, ensure_ascii=False, indent=None)

    # Compute hash for caching
    review_hash = compute_review_hash(review_text, config.get("config_version", "1.0"))

    # Loop-invariant: the set of primitives the response is validated against.
    enabled = set(config.get("enabled_primitives", []))

    # Call OpenAI with retries
    last_error = None
    client = get_client()
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_content},
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": SPAN_SCHEMA,
                },
                temperature=0.1,  # Low temperature for consistency
                max_tokens=2000,
            )

            # Parse response
            raw_text = response.choices[0].message.content
            if raw_text is None:
                # Nothing to parse; report it plainly instead of letting
                # json.loads raise an opaque TypeError.
                last_error = "Model returned no content"
                break
            parsed = json.loads(raw_text)

            # Validate primitives
            validated, warnings = validate_response(parsed, enabled)

            return {
                "spans": validated["spans"],
                "unmapped": validated["unmapped"],
                "model": model,
                "raw_response": raw_text,
                "review_hash": review_hash,
                "warnings": warnings,
                "tokens": {
                    "prompt": response.usage.prompt_tokens if response.usage else 0,
                    "completion": response.usage.completion_tokens if response.usage else 0,
                },
                "detected_language": detected_lang,
                "language_confidence": lang_confidence,
            }

        except json.JSONDecodeError as e:
            last_error = f"JSON parse error: {e}"
            # Don't retry parse errors - log and return fallback
            break

        except Exception as e:
            last_error = str(e)
            if "rate_limit" in last_error.lower() or "429" in last_error:
                # Exponential backoff for rate limits
                delay = 2 ** attempt
            elif "500" in last_error or "502" in last_error or "503" in last_error:
                # Retry on server errors
                delay = 1
            else:
                # Don't retry other errors
                break
            # Sleeping after the final attempt would only delay the fallback.
            if attempt + 1 < max_retries:
                time.sleep(delay)

    # Fallback response on error
    return {
        "spans": [{
            "primitive": "UNMAPPED",
            "valence": "neutral",
            "intensity": 1,
            "evidence": review_text[:100] if review_text else "",
            "start_char": 0,
            "end_char": min(100, len(review_text)) if review_text else 0,
            "confidence": 0.1,
            "details": {"error": last_error},
        }],
        "unmapped": [],
        "model": model,
        "raw_response": json.dumps({"error": last_error}),
        "review_hash": review_hash,
        "warnings": [f"Classification failed: {last_error}"],
        "tokens": {"prompt": 0, "completion": 0},
        "detected_language": detected_lang,
        "language_confidence": lang_confidence,
    }
|
||||||
|
|
||||||
|
|
||||||
|
async def classify_review_async(
    review_text: str,
    rating: int | None,
    config: dict[str, Any],
    language: str = "auto",
    model: str | None = None,
) -> dict[str, Any]:
    """Async wrapper for classify_review.

    Runs the blocking classify_review call in the event loop's default
    executor and returns its result dict. ``max_retries`` is not forwarded,
    so classify_review's own default applies.
    """
    import asyncio

    # Inside a coroutine a loop is always running; get_running_loop() is the
    # recommended accessor and never creates a loop implicitly.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        lambda: classify_review(review_text, rating, config, language, model),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Batch classification (for later optimization)
|
||||||
|
async def classify_batch(
|
||||||
|
reviews: list[dict[str, Any]],
|
||||||
|
config: dict[str, Any],
|
||||||
|
model: str | None = None,
|
||||||
|
max_concurrent: int = 5,
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Classify multiple reviews concurrently.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
reviews: List of {"text": str, "rating": int, "language": str}
|
||||||
|
config: Resolved config
|
||||||
|
model: Model to use
|
||||||
|
max_concurrent: Max concurrent requests
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of classification results
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
semaphore = asyncio.Semaphore(max_concurrent)
|
||||||
|
|
||||||
|
async def classify_one(review: dict) -> dict:
|
||||||
|
async with semaphore:
|
||||||
|
return await classify_review_async(
|
||||||
|
review.get("text", ""),
|
||||||
|
review.get("rating"),
|
||||||
|
config,
|
||||||
|
review.get("language", "auto"),
|
||||||
|
model,
|
||||||
|
)
|
||||||
|
|
||||||
|
tasks = [classify_one(r) for r in reviews]
|
||||||
|
return await asyncio.gather(*tasks)
|
||||||
1102
packages/reviewiq-pipeline/scripts/run_classification_v2.py
Normal file
1102
packages/reviewiq-pipeline/scripts/run_classification_v2.py
Normal file
File diff suppressed because it is too large
Load Diff
457
packages/reviewiq-pipeline/scripts/validate_l1_configs.py
Normal file
457
packages/reviewiq-pipeline/scripts/validate_l1_configs.py
Normal file
@@ -0,0 +1,457 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Wave 1 L1 Config Validation Script
|
||||||
|
|
||||||
|
Validates L1 primitive configs against real review data by analyzing:
|
||||||
|
1. Coverage: % of spans mapped to enabled primitives
|
||||||
|
2. Top primitives by frequency
|
||||||
|
3. Disabled primitives appearing (potential misconfig)
|
||||||
|
4. Weight effectiveness
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python validate_l1_configs.py --sector ENTERTAINMENT --job-url "gokarts"
|
||||||
|
python validate_l1_configs.py --sector AUTOMOTIVE --job-url "clickrent"
|
||||||
|
python validate_l1_configs.py --all
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
# Paths: the data directory sits two levels above this file.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
CONFIGS_DIR = DATA_DIR.joinpath("primitive_configs", "l1")
BRIEFS_DIR = DATA_DIR.joinpath("sector_briefs")

# Primitive to URT domain mapping
# URT domains: O=Offering, P=People, J=Journey, E=Environment, A=Access, V=Value, R=Relationship
# (no primitive in this table is assigned to A or R).
_DOMAIN_PRIMITIVES = (
    # Quality -> Offering (O)
    ("O", ("TASTE", "CRAFT", "FRESHNESS", "TEMPERATURE",
           "EFFECTIVENESS", "ACCURACY", "CONDITION", "CONSISTENCY")),
    # Service -> People (P)
    ("P", ("MANNER", "COMPETENCE", "ATTENTIVENESS", "COMMUNICATION")),
    # Process -> Journey (J)
    ("J", ("SPEED", "FRICTION", "RELIABILITY", "AVAILABILITY")),
    # Environment -> Environment (E)
    ("E", ("CLEANLINESS", "COMFORT", "SAFETY", "AMBIANCE",
           "ACCESSIBILITY", "DIGITAL_UX")),
    # Value -> Value (V)
    ("V", ("PRICE_LEVEL", "PRICE_FAIRNESS", "PRICE_TRANSPARENCY", "VALUE_FOR_MONEY")),
)
PRIMITIVE_TO_DOMAIN = {
    primitive: domain
    for domain, primitives in _DOMAIN_PRIMITIVES
    for primitive in primitives
}

# URT code to primitive mapping (simplified - maps URT codes to closest primitive)
# Each group lists the primitives for codes "<group>.01", "<group>.02", ... in order.
_URT_GROUPS = {
    # Offering codes
    "O1": ("CONSISTENCY", "CRAFT", "FRESHNESS", "EFFECTIVENESS", "TASTE", "CONDITION"),
    "O2": ("ACCURACY", "EFFECTIVENESS", "CRAFT"),
    "O3": ("ACCURACY", "CONSISTENCY", "EFFECTIVENESS"),
    # People codes
    "P1": ("MANNER", "MANNER", "ATTENTIVENESS", "COMMUNICATION", "ATTENTIVENESS"),
    "P2": ("COMPETENCE", "COMPETENCE", "COMPETENCE"),
    "P3": ("COMMUNICATION", "COMMUNICATION", "COMMUNICATION"),
    # Journey codes
    "J1": ("SPEED", "RELIABILITY", "FRICTION", "SPEED", "RELIABILITY"),
    "J2": ("RELIABILITY", "RELIABILITY", "FRICTION"),
    "J3": ("FRICTION", "FRICTION", "FRICTION"),
    # Environment codes
    "E1": ("CLEANLINESS", "COMFORT", "AMBIANCE", "AMBIANCE", "COMFORT"),
    "E2": ("AMBIANCE", "COMFORT", "COMFORT", "AMBIANCE", "DIGITAL_UX"),
    "E3": ("SAFETY", "SAFETY", "ACCESSIBILITY"),
    "E4": ("ACCESSIBILITY", "ACCESSIBILITY", "DIGITAL_UX"),
    # Access codes
    "A1": ("AVAILABILITY", "AVAILABILITY", "AVAILABILITY", "ACCESSIBILITY", "ACCESSIBILITY"),
    "A2": ("ACCESSIBILITY", "ACCESSIBILITY", "DIGITAL_UX"),
    "A3": ("ACCESSIBILITY", "ACCESSIBILITY", "SPEED"),
    "A4": ("ACCESSIBILITY", "ACCESSIBILITY", "AVAILABILITY"),
    # Value codes
    "V1": ("PRICE_LEVEL", "PRICE_FAIRNESS", "PRICE_TRANSPARENCY"),
    "V2": ("PRICE_FAIRNESS", "PRICE_TRANSPARENCY", "VALUE_FOR_MONEY"),
    "V3": ("VALUE_FOR_MONEY", "VALUE_FOR_MONEY", "PRICE_FAIRNESS"),
    "V4": ("VALUE_FOR_MONEY", "VALUE_FOR_MONEY", "VALUE_FOR_MONEY"),
    # Relationship codes
    "R1": ("RELIABILITY", "RELIABILITY", "RELIABILITY"),
    "R2": ("RELIABILITY", "CONSISTENCY", "RELIABILITY"),
    "R3": ("MANNER", "MANNER", "COMMUNICATION"),
    "R4": ("CONSISTENCY", "RELIABILITY", "CONSISTENCY"),
}
URT_TO_PRIMITIVE = {
    f"{group}.{index:02d}": primitive
    for group, primitives in _URT_GROUPS.items()
    for index, primitive in enumerate(primitives, start=1)
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ValidationResult:
    """Outcome of checking one sector's L1 config against a set of spans.

    Produced by analyze_spans and consumed by the report printers.
    """

    # --- Identity and volume ---
    sector_code: str
    job_count: int
    review_count: int
    span_count: int

    # --- Coverage metrics ---
    # Fraction of spans using enabled primitives.
    enabled_coverage: float
    # Disabled primitives that appeared, with their counts.
    disabled_hits: dict[str, int]
    # Spans that couldn't be mapped.
    unmapped_count: int

    # --- Distribution ---
    # All primitives by count.
    primitive_counts: dict[str, int]
    # Keyed by domain letter: O, P, J, E, A, V, R.
    domain_distribution: dict[str, int]
    # Keyed by valence label: V+, V-, V0, V±.
    valence_distribution: dict[str, int]

    # --- Top codes ---
    top_urt_codes: list[tuple[str, int]]

    # --- Recommendations ---
    recommendations: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
    """Load L1 config for a sector.

    Looks for ``<sector_code lowercased>_config.json`` under CONFIGS_DIR.

    Returns:
        The parsed JSON content, or None when the file does not exist.
    """
    config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
    if not config_file.exists():
        return None
    # JSON is UTF-8 by specification; don't depend on the platform's default encoding.
    with open(config_file, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def load_sector_brief(sector_code: str) -> dict[str, Any] | None:
    """Load sector brief for a sector.

    Looks for ``<sector_code lowercased>_brief.json`` under BRIEFS_DIR.

    Returns:
        The parsed JSON content, or None when the file does not exist.
    """
    brief_file = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
    if not brief_file.exists():
        return None
    # JSON is UTF-8 by specification; don't depend on the platform's default encoding.
    with open(brief_file, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def map_urt_to_primitive(urt_code: str) -> str | None:
    """Look up the primitive for a URT code; None when the code is not in URT_TO_PRIMITIVE."""
    try:
        return URT_TO_PRIMITIVE[urt_code]
    except KeyError:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_spans_for_jobs(pool: asyncpg.Pool, job_url_pattern: str) -> list[dict]:
    """Fetch spans for jobs matching URL pattern.

    The pattern is lower-cased and matched as a substring of the lower-cased
    job URL (SQL ``LIKE '%pattern%'``, passed as a bound parameter). Rows are
    returned newest first as plain dicts with the keys ``urt_primary``,
    ``valence``, ``intensity``, ``span_text`` and ``url``.
    """
    like_argument = "%" + job_url_pattern.lower() + "%"
    sql = """
        SELECT
            rs.urt_primary,
            rs.valence,
            rs.intensity,
            rs.span_text,
            j.url
        FROM pipeline.review_spans rs
        JOIN pipeline.reviews_raw rr ON rs.review_id = rr.review_id
        JOIN public.jobs j ON rr.job_id = j.job_id
        WHERE LOWER(j.url) LIKE $1
        ORDER BY rs.created_at DESC
    """
    records = await pool.fetch(sql, like_argument)
    return list(map(dict, records))
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_all_spans(pool: asyncpg.Pool) -> list[dict]:
    """Fetch all spans from database.

    Reads every row of ``pipeline.review_spans`` (no LIMIT), newest first, and
    returns plain dicts with the keys ``urt_primary``, ``valence``,
    ``intensity`` and ``span_text``.
    """
    sql = """
        SELECT
            urt_primary,
            valence,
            intensity,
            span_text
        FROM pipeline.review_spans
        ORDER BY created_at DESC
    """
    records = await pool.fetch(sql)
    return list(map(dict, records))
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_spans(
    spans: list[dict],
    config: dict[str, Any],
) -> ValidationResult:
    """Analyze spans against L1 config.

    Args:
        spans: Span rows; ``urt_primary`` and ``valence`` are read from each.
        config: L1 config; ``sector_code`` is required, while ``enabled``,
            ``disabled`` and ``weights`` are optional.

    Returns:
        A ValidationResult with coverage, distributions, the 15 most common
        URT codes and recommendations. ``job_count`` is set to 1 and
        ``review_count`` to 0 because neither is tracked at span level.

    Raises:
        KeyError: if ``config`` has no ``sector_code``.
    """
    sector_code = config["sector_code"]
    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        urt_code = span.get("urt_primary")
        # A present-but-null valence is treated like a missing one.
        valence = span.get("valence") or "V0"

        # Count valence
        valence_counts[valence] += 1

        if not urt_code:
            # No code at all: nothing to map and no domain letter to read.
            unmapped += 1
            continue

        # Count URT codes
        urt_counts[urt_code] += 1

        # Map to primitive
        primitive = map_urt_to_primitive(urt_code)
        if primitive:
            primitive_counts[primitive] += 1

            # Count domain. The primitive's domain wins over the code's own
            # leading letter, which is only used as a fallback.
            domain = PRIMITIVE_TO_DOMAIN.get(primitive, urt_code[0])
            domain_counts[domain] += 1

            # Check if enabled or disabled
            if primitive in enabled:
                enabled_hits += 1
            elif primitive in disabled:
                disabled_hits[primitive] += 1
        else:
            unmapped += 1
            # Still count domain from URT code
            domain_counts[urt_code[0]] += 1

    # Calculate coverage
    total = len(spans)
    enabled_coverage = enabled_hits / total if total > 0 else 0

    # Generate recommendations
    recommendations = []

    # Check disabled primitives that appeared frequently
    for prim, count in disabled_hits.most_common(5):
        if count >= 10:
            pct = count / total * 100
            recommendations.append(
                f"ENABLE {prim}: Disabled but appeared {count} times ({pct:.1f}%)"
            )

    # Check for missing high-weight primitives. Iterating the weights mapping
    # itself keeps the recommendation order stable between runs.
    for prim in weights:
        if primitive_counts[prim] == 0 and prim in enabled:
            recommendations.append(
                f"CHECK {prim}: Weighted ({weights[prim]}x) but no appearances"
            )

    # Check for frequently appearing unweighted primitives
    for prim, count in primitive_counts.most_common(10):
        if prim in enabled and prim not in weights and count >= total * 0.1:
            pct = count / total * 100
            recommendations.append(
                f"WEIGHT {prim}: High frequency ({count}, {pct:.1f}%) but not weighted"
            )

    return ValidationResult(
        sector_code=sector_code,
        job_count=1,  # Will be updated by caller
        review_count=0,  # Not tracked at span level
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommendations=recommendations,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def print_validation_report(result: ValidationResult, config: dict, brief: dict | None):
    """Print formatted validation report.

    Args:
        result: Analysis outcome to render.
        config: L1 config; ``enabled``, ``disabled`` and ``weights`` are read
            for the summary and for per-primitive annotations.
        brief: Optional sector brief; when given, up to five entries of
            ``what_customers_judge`` are listed.
    """
    total = result.span_count

    def pct_of(count):
        # Share of all analyzed spans as a percentage; 0 when there are none.
        return count / total * 100 if total > 0 else 0

    print("\n" + "=" * 70)
    print(f"VALIDATION REPORT: {result.sector_code}")
    print("=" * 70)

    # Overview
    print("\n📊 OVERVIEW")
    print(f" Spans analyzed: {result.span_count:,}")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    if total > 0:
        print(f" Unmapped spans: {result.unmapped_count} ({pct_of(result.unmapped_count):.1f}%)")
    else:
        print(" No spans")

    # Config summary
    print("\n⚙️ CONFIG SUMMARY")
    print(f" Enabled: {len(config.get('enabled', []))} primitives")
    print(f" Disabled: {len(config.get('disabled', []))} primitives")
    print(f" Weighted: {len(config.get('weights', {}))} primitives")

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = pct_of(count)
        bar = "█" * int(pct / 2)  # one block per two percentage points
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Valence distribution
    print("\n😊 VALENCE DISTRIBUTION")
    for val in ["V+", "V-", "V0", "V±"]:
        count = result.valence_distribution.get(val, 0)
        print(f" {val}: {count:4} ({pct_of(count):5.1f}%)")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES")
    enabled_set = set(config.get("enabled", []))
    weights = config.get("weights", {})
    for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]:
        status = "✓" if prim in enabled_set else "✗"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({pct_of(count):5.1f}%) {weight}")

    # Top URT codes
    print("\n📋 TOP URT CODES")
    for code, count in result.top_urt_codes[:10]:
        mapped = URT_TO_PRIMITIVE.get(code, "UNMAPPED")
        print(f" {code}: {count:4} ({pct_of(count):5.1f}%) → {mapped}")

    # Disabled but appearing
    if result.disabled_hits:
        print("\n⚠️ DISABLED BUT APPEARING")
        for prim, count in sorted(result.disabled_hits.items(), key=lambda x: -x[1]):
            print(f" {prim}: {count} ({pct_of(count):.1f}%)")

    # Recommendations
    if result.recommendations:
        print("\n💡 RECOMMENDATIONS")
        for rec in result.recommendations:
            print(f" • {rec}")

    # Brief signals check (if available)
    if brief:
        print("\n📝 BRIEF SIGNALS CHECK")
        what_customers_judge = brief.get("what_customers_judge", {})
        if isinstance(what_customers_judge, dict):
            items = what_customers_judge.get("items", [])
        else:
            items = what_customers_judge if isinstance(what_customers_judge, list) else []

        print(" Key judgment areas from brief:")
        for item in items[:5]:
            if isinstance(item, dict):
                # Try "aspect" first, then "area" - the same lookup order
                # build_user_payload applies to these brief entries.
                print(f" • {item.get('aspect', item.get('area', item))}")
            else:
                print(f" • {item}")

    print("\n" + "=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_validation(
    sector_code: str,
    job_url_pattern: str | None = None,
    db_url: str | None = None,
):
    """Run validation for a sector.

    Loads the sector's L1 config and optional brief, fetches spans (filtered
    by ``job_url_pattern`` when given, otherwise all spans), analyzes them and
    prints the report.

    Returns:
        The ValidationResult, or None when the sector has no config or the
        pattern matched no spans.
    """
    # Load config; without one there is nothing to validate against.
    config = load_l1_config(sector_code)
    if not config:
        print(f"❌ No L1 config found for {sector_code}")
        return None

    # Load brief
    brief = load_sector_brief(sector_code)

    # Connect to database: explicit argument, then DATABASE_URL, then a local default.
    # NOTE(review): the default below embeds a password in source - consider
    # requiring DATABASE_URL instead.
    if not db_url:
        db_url = os.environ.get(
            "DATABASE_URL",
            "postgresql://scraper:scraper123@localhost:5437/scraper",
        )

    pool = await asyncpg.create_pool(db_url)
    try:
        # Fetch spans
        if job_url_pattern:
            spans = await fetch_spans_for_jobs(pool, job_url_pattern)
        else:
            spans = await fetch_all_spans(pool)

        if job_url_pattern and not spans:
            print(f"⚠️ No spans found for jobs matching '{job_url_pattern}'")
            return None

        # Analyze, then print report
        outcome = analyze_spans(spans, config)
        print_validation_report(outcome, config, brief)
        return outcome
    finally:
        await pool.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def run_all_validations(db_url: str | None = None):
    """Run validation for all sectors with available data.

    Runs run_validation once per (sector, job URL pattern) pair listed below,
    then prints a one-block summary for every pair that produced a result.
    """
    # Known jobs and their sectors
    jobs_by_sector = {
        "ENTERTAINMENT": ["gokarts", "soho"],
        "AUTOMOTIVE": ["clickrent"],
        "PERSONAL_SERVICES": ["fleitas"],
        "FOOD_DINING": ["fika"],
    }

    # Keyed by (sector, pattern) rather than a "sector:pattern" string, so a
    # pattern containing ":" cannot break the summary loop.
    results = {}

    for sector, job_patterns in jobs_by_sector.items():
        print(f"\n{'='*70}")
        print(f"Validating {sector}...")
        print(f"{'='*70}")

        for pattern in job_patterns:
            result = await run_validation(sector, pattern, db_url)
            if result:
                results[(sector, pattern)] = result

    # Summary
    print("\n" + "=" * 70)
    print("VALIDATION SUMMARY")
    print("=" * 70)

    for (sector, pattern), result in results.items():
        print(f"\n{sector} ({pattern}):")
        print(f" Coverage: {result.enabled_coverage:.1%}")
        print(f" Spans: {result.span_count}")
        if result.disabled_hits:
            print(f" ⚠️ Disabled hits: {sum(result.disabled_hits.values())}")
        if result.recommendations:
            print(f" Recommendations: {len(result.recommendations)}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    ``--all`` runs every known validation; ``--sector`` runs a single sector
    (optionally narrowed with ``--job-url``). With neither, the help text is
    printed and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description="Validate L1 primitive configs")
    cli.add_argument("--sector", help="Sector code (e.g., ENTERTAINMENT)")
    cli.add_argument("--job-url", help="Job URL pattern to filter (e.g., 'gokarts')")
    cli.add_argument("--all", action="store_true", help="Run all validations")
    cli.add_argument("--db-url", help="Database URL")

    opts = cli.parse_args()

    # Nothing to do without a mode: show usage and signal failure.
    if not (opts.all or opts.sector):
        cli.print_help()
        sys.exit(1)

    # --all takes precedence over --sector when both are given.
    if opts.all:
        job = run_all_validations(opts.db_url)
    else:
        job = run_validation(opts.sector, opts.job_url, opts.db_url)
    asyncio.run(job)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
421
packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
Normal file
421
packages/reviewiq-pipeline/scripts/validate_l1_configs_v2.py
Normal file
@@ -0,0 +1,421 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Wave 1 L1 Config Validation Script - V2 (Sector-Scoped)
|
||||||
|
|
||||||
|
Validates L1 primitive configs against SECTOR-SPECIFIC review data.
|
||||||
|
Only validates sectors where we have real business data.
|
||||||
|
|
||||||
|
Key improvement over v1: spans are filtered by business → sector mapping,
|
||||||
|
ensuring "TASTE in HEALTHCARE" noise doesn't pollute results.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python validate_l1_configs_v2.py --sector ENTERTAINMENT
|
||||||
|
python validate_l1_configs_v2.py --sector AUTOMOTIVE
|
||||||
|
python validate_l1_configs_v2.py --all
|
||||||
|
python validate_l1_configs_v2.py --report # Summary only
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from collections import Counter
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
# Paths
# Everything is located relative to the directory one level above this script.
DATA_DIR = Path(__file__).parent.parent / "data"
# Holds one JSON file per sector, named "<sector_code lowercased>_config.json"
# (that is the name load_l1_config() builds).
CONFIGS_DIR = DATA_DIR / "primitive_configs" / "l1"
# NOTE(review): not referenced anywhere else in this script.
BRIEFS_DIR = DATA_DIR / "sector_briefs"
|
||||||
|
|
||||||
|
# Business → Sector mapping (ground truth)
# Keys are the business identifiers matched against
# pipeline.review_spans.business_id when spans are fetched.
BUSINESS_TO_SECTOR = {
    "Go Karts Mar Menor": "ENTERTAINMENT",
    "ClickRent Gran Canaria | Alquiler de Coches y Furgonetas": "AUTOMOTIVE",
    "Soho Club": "ENTERTAINMENT",
    "Fika": "FOOD_DINING",
}

# Sectors with real data
# Derived from the mapping above so the two can never drift apart when a
# business is added or removed.
SECTORS_WITH_DATA = set(BUSINESS_TO_SECTOR.values())
|
||||||
|
|
||||||
|
# URT code to primitive mapping
# One row per URT sub-domain. A value of None means the code is known but
# deliberately has no primitive.
URT_TO_PRIMITIVE = {
    # Offering codes
    "O1.01": "CONSISTENCY", "O1.02": "CRAFT", "O1.03": "FRESHNESS",
    "O1.04": "EFFECTIVENESS", "O1.05": "TASTE", "O1.06": "CONDITION",
    "O2.01": "ACCURACY", "O2.02": "EFFECTIVENESS", "O2.03": "CRAFT",
    "O3.01": "ACCURACY", "O3.02": "CONSISTENCY", "O3.03": "EFFECTIVENESS",
    # People codes
    "P1.01": "MANNER", "P1.02": "MANNER", "P1.03": "ATTENTIVENESS",
    "P1.04": "COMMUNICATION", "P1.05": "ATTENTIVENESS",
    "P2.01": "COMPETENCE", "P2.02": "COMPETENCE", "P2.03": "COMPETENCE",
    "P3.01": "COMMUNICATION", "P3.02": "COMMUNICATION", "P3.03": "COMMUNICATION",
    # Journey codes
    "J1.01": "SPEED", "J1.02": "RELIABILITY", "J1.03": "FRICTION",
    "J1.04": "SPEED", "J1.05": "RELIABILITY",
    "J2.01": "RELIABILITY", "J2.02": "RELIABILITY", "J2.03": "FRICTION",
    "J3.01": "FRICTION", "J3.02": "FRICTION", "J3.03": "FRICTION",
    # Environment codes
    "E1.01": "CLEANLINESS", "E1.02": "COMFORT", "E1.03": "AMBIANCE",
    "E1.04": "AMBIANCE", "E1.05": "COMFORT",
    "E2.01": "AMBIANCE", "E2.02": "COMFORT", "E2.03": "COMFORT",
    "E2.04": "AMBIANCE", "E2.05": "DIGITAL_UX",
    "E3.01": "SAFETY", "E3.02": "SAFETY", "E3.03": "ACCESSIBILITY",
    "E4.01": "ACCESSIBILITY", "E4.02": "ACCESSIBILITY", "E4.03": "DIGITAL_UX",
    # Access codes
    "A1.01": "AVAILABILITY", "A1.02": "AVAILABILITY", "A1.03": "AVAILABILITY",
    "A1.04": "ACCESSIBILITY", "A1.05": "ACCESSIBILITY",
    "A2.01": "ACCESSIBILITY", "A2.02": "ACCESSIBILITY", "A2.03": "DIGITAL_UX",
    "A3.01": "ACCESSIBILITY", "A3.02": "ACCESSIBILITY", "A3.03": "SPEED",
    "A4.01": "ACCESSIBILITY", "A4.02": "ACCESSIBILITY", "A4.03": "AVAILABILITY",
    # Value codes
    "V1.01": "PRICE_LEVEL", "V1.02": "PRICE_FAIRNESS", "V1.03": "PRICE_TRANSPARENCY",
    "V2.01": "PRICE_FAIRNESS", "V2.02": "PRICE_TRANSPARENCY", "V2.03": "VALUE_FOR_MONEY",
    "V3.01": "VALUE_FOR_MONEY", "V3.02": "VALUE_FOR_MONEY", "V3.03": "PRICE_FAIRNESS",
    "V4.01": "VALUE_FOR_MONEY", "V4.02": "VALUE_FOR_MONEY", "V4.03": "VALUE_FOR_MONEY",
    # Relationship codes (map to meta - these should stay unmapped):
    # R1.01 .. R4.03, three codes per sub-domain, all None.
    **dict.fromkeys(
        (f"R{group}.{item:02d}" for group in range(1, 5) for item in range(1, 4)),
        None,
    ),
}
|
||||||
|
|
||||||
|
# Minimum threshold for "enable" recommendations (% of sector spans)
# Compared against percentages on a 0-100 scale (count / total * 100), not 0-1 fractions.
ENABLE_THRESHOLD_PCT = 3.0  # Only recommend enable if >= 3% of sector spans
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SectorValidation:
    """Validation result for a single sector.

    Built by analyze_sector_spans() from the spans of the businesses mapped to
    one sector and that sector's L1 config.
    """
    sector_code: str  # taken from the config's "sector_code" entry
    businesses: list[str]  # businesses whose spans were analyzed
    span_count: int  # total number of spans analyzed

    # Coverage
    enabled_coverage: float  # fraction (0-1) of spans whose primitive is enabled in the config
    disabled_hits: dict[str, int] = field(default_factory=dict)  # disabled primitive -> span count
    unmapped_count: int = 0  # spans whose URT code maps to no primitive

    # Distribution
    primitive_counts: dict[str, int] = field(default_factory=dict)  # primitive -> span count
    domain_distribution: dict[str, int] = field(default_factory=dict)  # first character of URT code -> span count
    valence_distribution: dict[str, int] = field(default_factory=dict)  # valence label -> span count
    top_urt_codes: list[tuple[str, int]] = field(default_factory=list)  # (URT code, count), most frequent first

    # Recommendations (threshold-gated)
    recommended_enables: list[tuple[str, float]] = field(default_factory=list)  # (primitive, pct)
    # NOTE(review): analyze_sector_spans() never fills recommended_disables; it stays empty.
    recommended_disables: list[tuple[str, float]] = field(default_factory=list)
    weight_issues: list[str] = field(default_factory=list)  # human-readable findings about config weights

    # Metadata
    validated_at: str = ""  # ISO-8601 timestamp string set when the analysis ran
    config_version: str = ""  # "config_version" entry of the validated config
|
||||||
|
|
||||||
|
|
||||||
|
def load_l1_config(sector_code: str) -> dict[str, Any] | None:
|
||||||
|
"""Load L1 config for a sector."""
|
||||||
|
config_file = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
|
||||||
|
if not config_file.exists():
|
||||||
|
return None
|
||||||
|
with open(config_file) as f:
|
||||||
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def get_businesses_for_sector(sector_code: str) -> list[str]:
    """Return every business that BUSINESS_TO_SECTOR assigns to ``sector_code``.

    The comparison is an exact string match; an unknown sector yields an
    empty list. Businesses come back in the mapping's insertion order.
    """
    matching: list[str] = []
    for business_name, mapped_sector in BUSINESS_TO_SECTOR.items():
        if mapped_sector == sector_code:
            matching.append(business_name)
    return matching
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_spans_for_businesses(pool: asyncpg.Pool, businesses: list[str]) -> list[dict]:
    """Fetch the review spans belonging to the given businesses.

    Args:
        pool: asyncpg pool used to run the query.
        businesses: Values matched against review_spans.business_id.

    Returns:
        One plain dict per row with the keys business_id, urt_primary,
        valence, intensity and span_text, newest first. An empty
        ``businesses`` list short-circuits to [] without touching the database.
    """
    if not businesses:
        return []

    records = await pool.fetch(
        """
        SELECT
            business_id,
            urt_primary,
            valence,
            intensity,
            span_text
        FROM pipeline.review_spans
        WHERE business_id = ANY($1)
        ORDER BY created_at DESC
        """,
        businesses,
    )
    # Convert asyncpg records into ordinary dicts for the analysis code
    return list(map(dict, records))
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_sector_spans(
    spans: list[dict],
    config: dict[str, Any],
    businesses: list[str],
) -> SectorValidation:
    """Analyze spans for a specific sector against its L1 config.

    Args:
        spans: Span rows as dicts; only "urt_primary" and "valence" are read.
        config: L1 config. "sector_code" is required; "enabled", "disabled",
            "weights" and "config_version" are optional.
        businesses: Businesses the spans came from; stored on the result as-is.

    Returns:
        A SectorValidation holding coverage figures, count distributions,
        threshold-gated enable recommendations and weight findings.

    Raises:
        KeyError: If ``config`` has no "sector_code" entry.
    """
    sector_code = config["sector_code"]
    enabled = set(config.get("enabled", []))
    disabled = set(config.get("disabled", []))
    weights = config.get("weights", {})
    config_version = config.get("config_version", "1.0")

    # Counters
    primitive_counts: Counter = Counter()
    domain_counts: Counter = Counter()
    valence_counts: Counter = Counter()
    urt_counts: Counter = Counter()
    disabled_hits: Counter = Counter()
    unmapped = 0
    enabled_hits = 0

    for span in spans:
        urt_code = span.get("urt_primary")
        # A NULL valence column arrives as None; count it under the default
        # bucket instead of creating a None key in the distribution.
        valence = span.get("valence") or "V0"
        valence_counts[valence] += 1

        if not urt_code:
            # Without a URT code the span has no domain and no primitive.
            # Treat it as unmapped rather than crashing on urt_code[0].
            unmapped += 1
            continue

        urt_counts[urt_code] += 1
        domain_counts[urt_code[0]] += 1

        primitive = URT_TO_PRIMITIVE.get(urt_code)
        if not primitive:
            # Unknown code, or a code deliberately mapped to None
            unmapped += 1
            continue

        primitive_counts[primitive] += 1
        if primitive in enabled:
            enabled_hits += 1
        elif primitive in disabled:
            disabled_hits[primitive] += 1

    total = len(spans)
    enabled_coverage = enabled_hits / total if total > 0 else 0

    def pct_of(count: int) -> float:
        """Share of all spans as a percentage; 0 when there are no spans."""
        return count / total * 100 if total > 0 else 0

    # Threshold-gated recommendations: a disabled primitive is only worth
    # enabling when it accounts for a meaningful share of the sector's spans.
    recommended_enables = []
    for prim, count in disabled_hits.most_common():
        pct = pct_of(count)
        if pct >= ENABLE_THRESHOLD_PCT:
            recommended_enables.append((prim, pct))

    # Weight issues: enabled primitives that carry a weight but never appear
    weight_issues = []
    for prim in weights:
        if primitive_counts[prim] == 0 and prim in enabled:
            weight_issues.append(f"{prim} weighted ({weights[prim]}x) but 0 appearances")

    # High-frequency unweighted: among the five most common primitives, flag
    # enabled ones with >= 10% of spans that have no weight configured
    for prim, count in primitive_counts.most_common(5):
        pct = pct_of(count)
        if prim in enabled and prim not in weights and pct >= 10:
            weight_issues.append(f"{prim} high freq ({pct:.1f}%) but unweighted")

    return SectorValidation(
        sector_code=sector_code,
        businesses=businesses,
        span_count=total,
        enabled_coverage=enabled_coverage,
        disabled_hits=dict(disabled_hits),
        unmapped_count=unmapped,
        primitive_counts=dict(primitive_counts),
        domain_distribution=dict(domain_counts),
        valence_distribution=dict(valence_counts),
        top_urt_codes=urt_counts.most_common(15),
        recommended_enables=recommended_enables,
        weight_issues=weight_issues,
        validated_at=datetime.utcnow().isoformat(),
        config_version=config_version,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def print_sector_report(result: SectorValidation, config: dict):
    """Print detailed validation report for a sector to stdout.

    Args:
        result: Validation result to render.
        config: The L1 config that was validated; its "enabled", "disabled"
            and "weights" entries annotate the primitive table.
    """
    total = result.span_count

    def pct_of(count: int) -> float:
        """Share of all sector spans as a percentage; 0 when there are no spans."""
        return count / total * 100 if total > 0 else 0

    print("\n" + "=" * 70)
    print(f"SECTOR-SCOPED VALIDATION: {result.sector_code}")
    print("=" * 70)

    print("\n📊 DATA SOURCE")
    print(f" Businesses: {', '.join(result.businesses)}")
    print(f" Total spans: {result.span_count:,}")
    print(f" Config version: {result.config_version}")

    print("\n📈 COVERAGE")
    print(f" Enabled coverage: {result.enabled_coverage:.1%}")
    print(f" Unmapped (R-domain): {result.unmapped_count} ({pct_of(result.unmapped_count):.1f}%)" if total > 0 else "")

    # Domain distribution
    print("\n📁 DOMAIN DISTRIBUTION")
    domain_names = {"O": "Offering", "P": "People", "J": "Journey",
                    "E": "Environment", "A": "Access", "V": "Value", "R": "Relationship"}
    for domain in "OPJEVRA":
        count = result.domain_distribution.get(domain, 0)
        pct = pct_of(count)
        bar = "█" * int(pct / 2)  # one block per two percentage points
        print(f" {domain} {domain_names.get(domain, '?'):12} {count:4} ({pct:5.1f}%) {bar}")

    # Top primitives
    print("\n🔝 TOP PRIMITIVES (sector-scoped)")
    enabled_set = set(config.get("enabled", []))
    disabled_set = set(config.get("disabled", []))
    weights = config.get("weights", {})

    for prim, count in sorted(result.primitive_counts.items(), key=lambda x: -x[1])[:12]:
        pct = pct_of(count)
        if prim in enabled_set:
            status = "✓"
        elif prim in disabled_set:
            status = "✗"
        else:
            # Primitive is listed neither as enabled nor as disabled
            status = "?"
        weight = f"({weights[prim]}x)" if prim in weights else ""
        print(f" {status} {prim:20} {count:4} ({pct:5.1f}%) {weight}")

    # Threshold-gated recommendations
    if result.recommended_enables:
        print(f"\n⚠️ RECOMMENDED ENABLES (≥{ENABLE_THRESHOLD_PCT}% threshold)")
        for prim, pct in result.recommended_enables:
            count = result.disabled_hits.get(prim, 0)
            print(f" → ENABLE {prim}: {count} spans ({pct:.1f}%)")
    else:
        print(f"\n✅ No primitives exceed {ENABLE_THRESHOLD_PCT}% threshold for enabling")

    # Low-frequency disabled (info only). pct_of() keeps this safe when
    # span_count is 0, matching the zero guards used everywhere else here.
    low_freq_disabled = [(p, c) for p, c in result.disabled_hits.items()
                         if pct_of(c) < ENABLE_THRESHOLD_PCT]
    if low_freq_disabled:
        print("\n📋 DISABLED BUT APPEARING (below threshold - no action)")
        for prim, count in sorted(low_freq_disabled, key=lambda x: -x[1])[:5]:
            print(f" {prim}: {count} ({pct_of(count):.1f}%)")

    # Weight issues
    if result.weight_issues:
        print("\n⚖️ WEIGHT ISSUES")
        for issue in result.weight_issues:
            print(f" • {issue}")

    print(f"\n⏱️ Validated at: {result.validated_at}")
    print("=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
async def validate_sector(
    sector_code: str,
    db_url: str | None = None,
    verbose: bool = True,
) -> SectorValidation | None:
    """Run the sector-scoped validation for one sector.

    Args:
        sector_code: Sector to validate; must be in SECTORS_WITH_DATA.
        db_url: PostgreSQL URL. Falls back to the DATABASE_URL environment
            variable, then to a built-in local default.
        verbose: When true, print skip reasons and the full report.

    Returns:
        The validation result, or None when the sector has no data, no
        config, no mapped businesses or no spans.
    """

    def notify(message: str) -> None:
        # Skip reasons are only shown in verbose mode
        if verbose:
            print(message)

    if sector_code not in SECTORS_WITH_DATA:
        notify(f"⚠️ {sector_code}: No real business data available for validation")
        return None

    config = load_l1_config(sector_code)
    if not config:
        notify(f"❌ No L1 config found for {sector_code}")
        return None

    businesses = get_businesses_for_sector(sector_code)
    if not businesses:
        notify(f"⚠️ {sector_code}: No businesses mapped")
        return None

    if not db_url:
        db_url = os.environ.get(
            "DATABASE_URL",
            "postgresql://scraper:scraper123@localhost:5437/scraper",
        )

    pool = await asyncpg.create_pool(db_url)
    try:
        spans = await fetch_spans_for_businesses(pool, businesses)
        if not spans:
            notify(f"⚠️ {sector_code}: No spans found for businesses")
            return None

        result = analyze_sector_spans(spans, config, businesses)
        if verbose:
            print_sector_report(result, config)
        return result
    finally:
        # Always release the pool, including on the early return above
        await pool.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def validate_all_sectors(db_url: str | None = None) -> dict[str, SectorValidation]:
    """Validate all sectors with available data and print a summary table.

    Args:
        db_url: Database URL passed through to validate_sector().

    Returns:
        Mapping of sector code to its validation result; sectors that could
        not be validated are left out.
    """
    results = {}

    # SECTORS_WITH_DATA is a set, whose iteration order can differ between
    # runs; sort it so reports and the summary table are reproducible.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=True)
        if result:
            results[sector] = result

    # Print summary
    print("\n" + "=" * 70)
    print("VALIDATION SUMMARY")
    print("=" * 70)
    print(f"\n{'Sector':<20} {'Spans':>8} {'Coverage':>10} {'Enables':>10}")
    print("-" * 50)

    for sector, result in results.items():
        enables = len(result.recommended_enables)
        enables_str = f"{enables} recs" if enables > 0 else "✓ OK"
        print(f"{sector:<20} {result.span_count:>8,} {result.enabled_coverage:>9.1%} {enables_str:>10}")

    print("-" * 50)
    print(f"Sectors validated: {len(results)}/{len(SECTORS_WITH_DATA)}")
    # NOTE(review): 20 is presumably the total number of sectors in the
    # taxonomy - confirm and keep in sync if sectors are added.
    print(f"Sectors without data: {20 - len(SECTORS_WITH_DATA)}")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_summary_report(db_url: str | None = None) -> dict:
    """Generate a JSON-serializable summary report for all sectors.

    Args:
        db_url: Database URL passed through to validate_sector().

    Returns:
        Mapping of sector code to a dict with span_count, enabled_coverage
        (rounded to 3 decimals), recommended_enables, weight_issues,
        config_version and validated_at. Sectors that could not be validated
        are left out.
    """
    results = {}

    # Sort the set so the keys of the emitted report come out in the same
    # order on every run.
    for sector in sorted(SECTORS_WITH_DATA):
        result = await validate_sector(sector, db_url, verbose=False)
        if result:
            results[sector] = {
                "span_count": result.span_count,
                "enabled_coverage": round(result.enabled_coverage, 3),
                "recommended_enables": result.recommended_enables,
                "weight_issues": result.weight_issues,
                "config_version": result.config_version,
                "validated_at": result.validated_at,
            }

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Flag precedence is --report, then --all, then --sector. With none of
    them the help text and the list of sectors with data are printed.
    """
    cli = argparse.ArgumentParser(description="Sector-scoped L1 config validation")
    cli.add_argument("--sector", help="Validate specific sector")
    cli.add_argument("--all", action="store_true", help="Validate all sectors with data")
    cli.add_argument("--report", action="store_true", help="Generate JSON summary report")
    cli.add_argument("--db-url", help="Database URL")
    options = cli.parse_args()

    if options.report:
        summary = asyncio.run(generate_summary_report(options.db_url))
        print(json.dumps(summary, indent=2))
        return

    if options.all:
        asyncio.run(validate_all_sectors(options.db_url))
        return

    if options.sector:
        # Sector codes are upper-case; accept any casing on the command line
        asyncio.run(validate_sector(options.sector.upper(), options.db_url))
        return

    cli.print_help()
    print("\n\nSectors with real data:", ", ".join(sorted(SECTORS_WITH_DATA)))
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,733 @@
|
|||||||
|
"""
|
||||||
|
Classification Pipeline - LLM-powered primitives classification.
|
||||||
|
|
||||||
|
Classifies reviews using the primitives taxonomy (MANNER, SPEED, VALUE_FOR_MONEY, etc.)
|
||||||
|
and stores results in detected_spans_v2.
|
||||||
|
|
||||||
|
Stages:
|
||||||
|
- fetch: Find reviews without classification
|
||||||
|
- classify: LLM-powered span extraction with primitives
|
||||||
|
- save: Store results to detected_spans_v2
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
pipeline = ClassificationPipeline()
|
||||||
|
await pipeline.initialize()
|
||||||
|
result = await pipeline.process({"business_id": "Go Karts Mar Menor", "limit": 100})
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import unicodedata
|
||||||
|
import uuid
|
||||||
|
from collections import Counter
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
from pipeline_core import (
|
||||||
|
BasePipeline,
|
||||||
|
DashboardConfig,
|
||||||
|
DashboardSection,
|
||||||
|
PipelineMetadata,
|
||||||
|
PipelineResult as BasePipelineResult,
|
||||||
|
StageResult,
|
||||||
|
WidgetConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
from reviewiq_pipeline.config import Config
|
||||||
|
from reviewiq_pipeline.db.connection import DatabasePool
|
||||||
|
from reviewiq_pipeline.services.llm_client import LLMClient, LLMClientBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Stage names
# ClassificationPipeline.process() runs all of these when the caller does not
# request a subset; they are also reported as the pipeline's stages in its metadata.
STAGE_NAMES = ["fetch", "classify", "save"]
|
||||||
|
|
||||||
|
# Primitives taxonomy - maps each domain key to the primitives it contains
PRIMITIVES_BY_DOMAIN = {
    "O": ["TASTE", "CRAFT", "FRESHNESS", "TEMPERATURE", "EFFECTIVENESS", "ACCURACY", "CONDITION", "CONSISTENCY"],
    "P": ["MANNER", "COMPETENCE", "ATTENTIVENESS", "COMMUNICATION"],
    "J": ["SPEED", "FRICTION", "RELIABILITY", "AVAILABILITY"],
    "E": ["CLEANLINESS", "COMFORT", "SAFETY", "AMBIANCE", "ACCESSIBILITY", "DIGITAL_UX"],
    "V": ["PRICE_LEVEL", "PRICE_FAIRNESS", "PRICE_TRANSPARENCY", "VALUE_FOR_MONEY"],
    "meta": ["HONESTY", "ETHICS", "PROMISES", "ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
             "RETURN_INTENT", "RECOMMEND", "RECOGNITION", "UNMAPPED", "NON_INFORMATIVE"],
}

# Flatten for lookup
# Built with comprehensions so no loop variables are left behind in the
# module namespace.
# Every primitive name, in taxonomy order.
ALL_PRIMITIVES = [
    primitive
    for domain_primitives in PRIMITIVES_BY_DOMAIN.values()
    for primitive in domain_primitives
]
# Reverse lookup: primitive name -> domain key.
PRIMITIVE_TO_DOMAIN = {
    primitive: domain_key
    for domain_key, domain_primitives in PRIMITIVES_BY_DOMAIN.items()
    for primitive in domain_primitives
}
|
||||||
|
|
||||||
|
# Classification prompt
# Handed to the LLM client via set_prompt() in ClassificationPipeline.initialize().
# The primitives listed below mirror PRIMITIVES_BY_DOMAIN; keep the two in
# sync when editing either one.
CLASSIFICATION_PROMPT = """You are a review classifier using primitive-based analysis.

## TASK
Extract semantic spans from customer reviews and classify each span to exactly ONE primitive.

## PRIMITIVES (use ONLY these)
### OUTPUT (O) - Product/Service Quality
- TASTE: Flavor quality (food/beverage)
- CRAFT: Skill of execution, craftsmanship
- FRESHNESS: How fresh/new the product is
- TEMPERATURE: Serving temperature
- EFFECTIVENESS: Does it work/achieve purpose
- ACCURACY: Correct execution of order
- CONDITION: State at delivery
- CONSISTENCY: Same quality each time

### PEOPLE (P) - Staff Interactions
- MANNER: Friendliness and warmth
- COMPETENCE: Knowledge and skill
- ATTENTIVENESS: Being present and responsive
- COMMUNICATION: Clarity and updates

### JOURNEY (J) - Process and Timing
- SPEED: How fast things happen
- FRICTION: Ease of process
- RELIABILITY: Dependable service
- AVAILABILITY: Access to service/staff

### ENVIRONMENT (E) - Physical/Digital Space
- CLEANLINESS: Hygiene and tidiness
- COMFORT: Physical ease
- SAFETY: Physical safety
- AMBIANCE: Overall mood/atmosphere
- ACCESSIBILITY: Ease of access
- DIGITAL_UX: Digital experience

### VALUE (V) - Cost and Worth
- PRICE_LEVEL: Absolute cost
- PRICE_FAIRNESS: Fair for what you get
- PRICE_TRANSPARENCY: Clear about costs
- VALUE_FOR_MONEY: Overall value assessment

### META - Trust and Sentiment
- HONESTY: Truthfulness
- ETHICS: Moral conduct
- PROMISES: Keeping commitments
- ACKNOWLEDGMENT: Recognizing issues
- RESPONSE_QUALITY: How business responds
- RECOVERY: Making amends
- RETURN_INTENT: Would come back
- RECOMMEND: Would suggest to others
- RECOGNITION: Customer acknowledgment
- UNMAPPED: Cannot classify (use sparingly)
- NON_INFORMATIVE: No actionable content

## RULES
1. Extract 1-5 spans per review (prefer fewer, larger spans about same topic)
2. Each span gets exactly ONE primitive (most specific match)
3. Valence: + (positive), - (negative), 0 (neutral), ± (mixed)
4. Intensity: 1 (low), 2 (moderate), 3 (high/extreme)
5. Detail: 1 (vague), 2 (some detail), 3 (specific/actionable)
6. Confidence: 0.0 to 1.0

## OUTPUT FORMAT (JSON only)
{
"spans": [
{
"text": "exact text from review",
"start": 0,
"end": 25,
"primitive": "MANNER",
"valence": "+",
"intensity": 2,
"detail": 2,
"confidence": 0.85,
"entity": null,
"entity_type": null
}
]
}

Return valid JSON only, no markdown."""
|
||||||
|
|
||||||
|
# Non-informative detection
# Matches a text that consists ENTIRELY of one of the following:
#   1. whitespace and the punctuation characters . ! ? , - _ ~ * # @
#   2. characters from the listed emoji/symbol code-point ranges, plus
#      whitespace and . ! ?
#   3. the phrase "translated by google" or "traducido por google",
#      optionally followed by dots/whitespace
# IGNORECASE makes alternative 3 match any capitalization.
PURE_JUNK_RE = re.compile(
    r'^[\s\.\!\?\,\-\_\~\*\#\@]+$'
    r'|^[\U0001F300-\U0001F9FF\U0001FA00-\U0001FAFF\U00002600-\U000027BF\s\.\!\?]+$'
    r'|^(translated by google|traducido por google)[\.\s]*$',
    re.IGNORECASE
)
|
||||||
|
|
||||||
|
|
||||||
|
def is_non_informative(text: str) -> tuple[bool, str]:
    """Decide whether a review carries no usable content (so the LLM can be skipped).

    Returns:
        ``(True, reason)`` with reason one of "empty", "junk_pattern",
        "no_content" or "pure_repetition"; otherwise ``(False, "")``.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return True, "empty"

    if PURE_JUNK_RE.match(stripped):
        return True, "junk_pattern"

    # No alphanumeric content: count Unicode letters (L*) and numbers (N*)
    categories = [unicodedata.category(ch) for ch in stripped]
    letter_count = sum(1 for cat in categories if cat.startswith('L'))
    number_count = sum(1 for cat in categories if cat.startswith('N'))
    if not (letter_count or number_count):
        return True, "no_content"

    # Pure repetition: three or more tokens that are all the same word
    # (case-insensitive) in a text with fewer than 20 letters
    words = stripped.split()
    if len(words) >= 3 and len({word.lower() for word in words}) == 1 and letter_count < 20:
        return True, "pure_repetition"

    return False, ""
|
||||||
|
|
||||||
|
|
||||||
|
def compute_review_hash(text: str) -> str:
    """Return a short fingerprint of a review text for deduplication.

    The text is stripped and lowercased before hashing, so surrounding
    whitespace and letter case do not affect the result. The fingerprint is
    the first 16 hex characters of the SHA-256 digest of the UTF-8 bytes.
    """
    digest = hashlib.sha256()
    digest.update(text.strip().lower().encode())
    return digest.hexdigest()[:16]
|
||||||
|
|
||||||
|
|
||||||
|
class ClassificationPipeline(BasePipeline):
|
||||||
|
"""
|
||||||
|
Classification Pipeline - LLM-powered primitives classification.
|
||||||
|
|
||||||
|
Processes reviews through LLM to extract semantic spans and classify
|
||||||
|
them using the primitives taxonomy.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: Config | None = None):
    """Create the pipeline without opening any connection.

    Args:
        config: Pipeline configuration; a default Config() is built when
            none (or a falsy value) is passed.
    """
    self._config = config or Config()
    # The database pool and LLM client are created later by initialize()
    self._initialized = False
    self._llm: LLMClientBase | None = None
    self._db: DatabasePool | None = None
|
||||||
|
|
||||||
|
@property
def config(self) -> Config:
    """Get pipeline configuration.

    Returns the Config object stored by __init__ (the one passed in, or the
    default that was created there).
    """
    return self._config
|
||||||
|
|
||||||
|
@property
def metadata(self) -> PipelineMetadata:
    """Describe this pipeline: id, display name, version, stages and input type."""
    details = {
        "id": "classification",
        "name": "Primitives Classification Pipeline",
        "description": "LLM-powered span extraction and primitives classification. Processes reviews and stores results in detected_spans_v2.",
        "version": "1.0.0",
        "stages": STAGE_NAMES,
        "input_type": "BusinessInput",
    }
    return PipelineMetadata(**details)
|
||||||
|
|
||||||
|
async def initialize(self) -> None:
    """Open the database pool and create the LLM client.

    Safe to call repeatedly: once initialization has completed, further
    calls return immediately.
    """
    if not self._initialized:
        logger.info("Initializing Classification pipeline...")

        database = DatabasePool(self._config)
        self._db = database
        await database.initialize()

        llm_client = LLMClient.create(self._config)
        self._llm = llm_client
        llm_client.set_prompt(CLASSIFICATION_PROMPT)

        # Only flag success after both resources are ready
        self._initialized = True
        logger.info("Classification pipeline initialized")
|
||||||
|
|
||||||
|
async def close(self) -> None:
    """Close the LLM client and the database pool, then mark the pipeline uninitialized."""
    llm_client = self._llm
    if llm_client:
        await llm_client.close()
        self._llm = None

    database = self._db
    if database:
        await database.close()
        self._db = None

    self._initialized = False
    logger.info("Classification pipeline closed")
|
||||||
|
|
||||||
|
async def process(
    self,
    input_data: dict[str, Any],
    stages: list[str] | None = None,
) -> BasePipelineResult:
    """
    Process reviews through classification.

    Args:
        input_data: Must contain business_id OR job_id. Optional: limit
            (default 100), batch_size (default 10)
        stages: List of stage names to run (default: all)

    Returns:
        BasePipelineResult with classification stats. ``success`` is True
        only when every stage that produced a result succeeded; a failed
        classify or save stage makes the whole run unsuccessful.
    """
    await self.initialize()

    stages = stages or STAGE_NAMES
    stages_run: list[str] = []
    stage_results: dict[str, Any] = {}

    def stage_succeeded(result: Any) -> bool:
        # StageResult comes from pipeline_core; accept both a mapping-style
        # and an attribute-style result when reading its success flag.
        if isinstance(result, dict):
            return bool(result.get("success", False))
        return bool(getattr(result, "success", False))

    business_id = input_data.get("business_id")
    job_id = input_data.get("job_id")
    limit = input_data.get("limit", 100)
    batch_size = input_data.get("batch_size", 10)

    # Resolve business_id from job_id
    if not business_id and job_id:
        try:
            async with self._db.pool.acquire() as conn:
                row = await conn.fetchrow(
                    "SELECT business_name FROM jobs WHERE job_id = $1",
                    uuid.UUID(job_id) if isinstance(job_id, str) else job_id,
                )
                if row and row["business_name"]:
                    business_id = row["business_name"]
                    logger.info(f"Resolved business_id '{business_id}' from job_id")
        except Exception as e:
            # Best effort: a failed lookup falls through to the
            # "business_id is required" error below.
            logger.warning(f"Failed to resolve business_id: {e}")

    if not business_id:
        return BasePipelineResult(
            pipeline_id="classification",
            stages_run=[],
            stage_results={},
            success=False,
            error="business_id is required (provide business_id or job_id)",
        )

    # Generate run_id for this execution
    run_id = uuid.uuid4()
    context = {
        "business_id": business_id,
        "job_id": job_id,
        "limit": limit,
        "batch_size": batch_size,
        "run_id": run_id,
        "reviews": [],
        "classified": [],
    }

    try:
        # Stage: Fetch unclassified reviews
        if "fetch" in stages:
            start = time.time()
            logger.info(f"Fetching unclassified reviews for {business_id}")

            try:
                reviews = await self._fetch_unclassified(business_id, limit)
                context["reviews"] = reviews
                duration_ms = int((time.time() - start) * 1000)
                stages_run.append("fetch")
                stage_results["fetch"] = StageResult(
                    stage="fetch",
                    success=True,
                    data={"reviews_found": len(reviews)},
                    error=None,
                    duration_ms=duration_ms,
                )
                logger.info(f"Found {len(reviews)} unclassified reviews")
            except Exception as e:
                # Nothing can run without reviews, so a fetch failure ends the run
                logger.exception("Fetch failed")
                return BasePipelineResult(
                    pipeline_id="classification",
                    stages_run=stages_run,
                    stage_results=stage_results,
                    success=False,
                    error=f"Fetch failed: {e}",
                )

        # Stage: Classify reviews
        if "classify" in stages and context["reviews"]:
            start = time.time()
            logger.info(f"Classifying {len(context['reviews'])} reviews")

            try:
                classified = await self._classify_reviews(
                    context["reviews"],
                    business_id,
                    batch_size,
                )
                context["classified"] = classified
                duration_ms = int((time.time() - start) * 1000)
                stages_run.append("classify")

                total_spans = sum(len(c.get("spans", [])) for c in classified)
                stage_results["classify"] = StageResult(
                    stage="classify",
                    success=True,
                    data={
                        "reviews_classified": len(classified),
                        "total_spans": total_spans,
                        "llm_cost_usd": self._llm.total_cost_usd if self._llm else 0,
                    },
                    error=None,
                    duration_ms=duration_ms,
                )
                logger.info(f"Classified {len(classified)} reviews, {total_spans} spans")
            except Exception as e:
                logger.exception("Classification failed")
                stage_results["classify"] = StageResult(
                    stage="classify",
                    success=False,
                    data={},
                    error=str(e),
                    duration_ms=int((time.time() - start) * 1000),
                )

        # Stage: Save results
        if "save" in stages and context["classified"]:
            start = time.time()
            logger.info(f"Saving {len(context['classified'])} classifications")

            try:
                saved_count = await self._save_classifications(
                    context["classified"],
                    business_id,
                    job_id,
                    run_id,
                )
                duration_ms = int((time.time() - start) * 1000)
                stages_run.append("save")
                stage_results["save"] = StageResult(
                    stage="save",
                    success=True,
                    data={"spans_saved": saved_count},
                    error=None,
                    duration_ms=duration_ms,
                )
                logger.info(f"Saved {saved_count} spans to detected_spans_v2")
            except Exception as e:
                logger.exception("Save failed")
                stage_results["save"] = StageResult(
                    stage="save",
                    success=False,
                    data={},
                    error=str(e),
                    duration_ms=int((time.time() - start) * 1000),
                )

        return BasePipelineResult(
            pipeline_id="classification",
            stages_run=stages_run,
            stage_results=stage_results,
            # Judge success over every recorded stage result. stages_run only
            # lists stages that completed, so checking it alone would report
            # success even when classify or save failed.
            success=all(stage_succeeded(result) for result in stage_results.values()),
        )

    except Exception as e:
        logger.exception("Pipeline failed")
        return BasePipelineResult(
            pipeline_id="classification",
            stages_run=stages_run,
            stage_results=stage_results,
            success=False,
            error=str(e),
        )
|
||||||
|
|
||||||
|
async def _fetch_unclassified(
    self,
    business_id: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Return up to ``limit`` reviews of a business that have no spans yet.

    A review is treated as unclassified when ``pipeline.detected_spans_v2``
    contains no row for its ``(review_id, business_id)`` pair. Reviews whose
    text is NULL or empty are excluded, and results are ordered newest first
    by ``review_time``.

    Args:
        business_id: Value matched against ``reviews_latest.business_id``.
        limit: Maximum number of reviews to return.

    Returns:
        One dict per review with the keys ``review_id``, ``business_id``,
        ``text``, ``rating`` and ``review_time``.
    """
    # Anti-join: keep only reviews_latest rows with no matching span row.
    query = """
        SELECT
            r.review_id,
            r.business_id,
            r.text AS review_text,
            r.rating,
            r.review_time
        FROM pipeline.reviews_latest r
        LEFT JOIN (
            SELECT DISTINCT review_id, business_id
            FROM pipeline.detected_spans_v2
        ) s ON s.review_id = r.review_id AND s.business_id = r.business_id
        WHERE r.business_id = $1
        AND s.review_id IS NULL
        AND r.text IS NOT NULL
        AND LENGTH(r.text) > 0
        ORDER BY r.review_time DESC
        LIMIT $2
    """
    async with self._db.pool.acquire() as conn:
        records = await conn.fetch(query, business_id, limit)

    pending: list[dict[str, Any]] = []
    for record in records:
        pending.append(
            {
                "review_id": record["review_id"],
                "business_id": record["business_id"],
                "text": record["review_text"],
                # A falsy rating (NULL or 0) falls back to the neutral value 3.
                "rating": record["rating"] or 3,
                "review_time": record["review_time"],
            }
        )
    return pending
|
||||||
|
|
||||||
|
async def _classify_reviews(
    self,
    reviews: list[dict[str, Any]],
    business_id: str,
    batch_size: int,
) -> list[dict[str, Any]]:
    """Classify each review into spans, one LLM call per informative review.

    For every review, in order:

    * If ``is_non_informative(text)`` flags it, no LLM call is made and a
      single whole-text ``NON_INFORMATIVE`` span is emitted, with the
      returned reason stored in ``mode``.
    * Otherwise ``self._llm.classify(text)`` is awaited. Any span whose
      ``primitive`` is not in ``ALL_PRIMITIVES`` is relabelled ``UNMAPPED``
      and the rejected label is kept in ``unmapped_keywords``.
    * If the LLM call (or handling of its response) raises, the error is
      logged and a single whole-text ``UNMAPPED`` span with confidence 0.0
      and mode ``llm_error`` is emitted instead, so one failing review
      never aborts the run.

    Args:
        reviews: Dicts with at least ``review_id``; ``text`` and ``rating``
            are optional and default to ``""`` and ``3``.
        business_id: Stamped onto every result.
        batch_size: Accepted for interface compatibility; not used here
            because reviews are classified one at a time.

    Returns:
        One result dict per input review with ``review_id``, ``business_id``,
        ``text``, ``rating``, ``spans`` and ``review_hash``; successful LLM
        results additionally carry ``model``.
    """

    def whole_text_span(text: str, primitive: str, confidence: float, mode: Any) -> dict[str, Any]:
        # Synthetic span covering the entire review text.
        return {
            "text": text,
            "start": 0,
            "end": len(text),
            "primitive": primitive,
            "valence": "0",
            "intensity": 1,
            "detail": 1,
            "confidence": confidence,
            "entity": None,
            "entity_type": None,
            "mode": mode,
        }

    results = []

    for review in reviews:
        text = review.get("text", "")
        rating = review.get("rating", 3)

        # Check for non-informative reviews: they skip the LLM entirely.
        is_junk, reason = is_non_informative(text)
        if is_junk:
            results.append({
                "review_id": review["review_id"],
                "business_id": business_id,
                "text": text,
                "rating": rating,
                "spans": [whole_text_span(text, "NON_INFORMATIVE", 1.0, reason)],
                "review_hash": compute_review_hash(text),
            })
            continue

        # Classify with LLM.
        # NOTE(review): only the review text is passed to classify(); the
        # rating is not sent. Confirm against the LLM client whether that
        # is intended.
        try:
            response, metadata = await self._llm.classify(text)

            spans = response.get("spans", [])

            # Validate primitives. Capture the rejected label BEFORE
            # overwriting it; reading it back afterwards would always
            # yield "UNMAPPED" and lose the label.
            for span in spans:
                original_primitive = span.get("primitive")
                if original_primitive not in ALL_PRIMITIVES:
                    span["primitive"] = "UNMAPPED"
                    span["unmapped_keywords"] = [
                        "unknown" if original_primitive is None else str(original_primitive)
                    ]

            results.append({
                "review_id": review["review_id"],
                "business_id": business_id,
                "text": text,
                "rating": rating,
                "spans": spans,
                "review_hash": compute_review_hash(text),
                "model": metadata.get("model"),
            })

        except Exception as e:
            logger.warning(f"LLM classification failed for review {review['review_id']}: {e}")
            # Fallback to UNMAPPED so the review is still recorded.
            results.append({
                "review_id": review["review_id"],
                "business_id": business_id,
                "text": text,
                "rating": rating,
                "spans": [whole_text_span(text, "UNMAPPED", 0.0, "llm_error")],
                "review_hash": compute_review_hash(text),
            })

    return results
|
||||||
|
|
||||||
|
async def _save_classifications(
    self,
    classifications: list[dict[str, Any]],
    business_id: str,
    job_id: str | None,
    run_id: uuid.UUID,
) -> int:
    """Save classification results to ``pipeline.detected_spans_v2``.

    Each span is inserted with its own statement. A failing insert is
    logged as a warning and skipped, so one bad span does not prevent the
    remaining spans from being stored.

    Args:
        classifications: Result dicts carrying ``review_id``, ``spans`` and
            optionally ``review_hash`` and ``model``.
        business_id: Stored on every row and used to look up the GBP path.
        job_id: Optional job identifier; parsed with ``uuid.UUID`` when set.
        run_id: Identifier stored on every inserted row.

    Returns:
        Number of spans successfully inserted.
    """
    if not classifications:
        # Nothing to write: avoid taking a pooled connection.
        return 0

    # Function-scope import keeps this block self-contained.
    from datetime import timezone

    # Same UTC calendar date as the deprecated datetime.utcnow() produced.
    config_version = f"primitives_v1_{datetime.now(timezone.utc).strftime('%Y%m%d')}"

    insert_sql = """
        INSERT INTO pipeline.detected_spans_v2 (
            job_id, business_id, review_id, gbp_path, sector_code,
            config_version, primitive, valence, intensity, detail, mode,
            confidence, span_text, span_start, span_end,
            unmapped_keywords, entity, entity_type,
            model, review_hash, run_id, created_at
        ) VALUES (
            $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
            $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, NOW()
        )
    """

    saved_count = 0

    async with self._db.pool.acquire() as conn:
        # Get GBP path for business (most recent job row that has one).
        # NOTE(review): the lookup filters on jobs.business_name but is given
        # business_id - confirm the two really hold the same value.
        gbp_row = await conn.fetchrow(
            """
            SELECT gbp_category_path
            FROM jobs
            WHERE business_name = $1
            AND gbp_category_path IS NOT NULL
            ORDER BY created_at DESC
            LIMIT 1
            """,
            business_id,
        )
        gbp_path = str(gbp_row["gbp_category_path"]) if gbp_row and gbp_row["gbp_category_path"] else "unknown"
        # First dot-separated segment; the whole path when it has no dot.
        sector_code = gbp_path.split(".", 1)[0]

        for classification in classifications:
            review_id = classification["review_id"]
            review_hash = classification.get("review_hash")
            model = classification.get("model")

            for span in classification.get("spans", []):
                try:
                    await conn.execute(
                        insert_sql,
                        # Parsed inside the try so a malformed job_id is
                        # handled like any other per-span failure.
                        uuid.UUID(job_id) if job_id else None,
                        business_id,
                        review_id,
                        gbp_path,
                        sector_code,
                        config_version,
                        span.get("primitive", "UNMAPPED"),
                        span.get("valence", "0"),
                        span.get("intensity", 1),
                        span.get("detail", 1),
                        span.get("mode"),
                        span.get("confidence", 0.5),
                        span.get("text", ""),
                        span.get("start", 0),
                        span.get("end", 0),
                        span.get("unmapped_keywords"),
                        span.get("entity"),
                        span.get("entity_type"),
                        model,
                        review_hash,
                        run_id,
                    )
                    saved_count += 1
                except Exception as e:
                    # Best effort: skip this span and keep saving the rest.
                    logger.warning(f"Failed to save span: {e}")

    return saved_count
|
||||||
|
|
||||||
|
def get_dashboard_config(self) -> DashboardConfig:
    """Build the dashboard layout for the classification pipeline.

    The dashboard has one "stats" section holding three ``stat_card``
    widgets laid out left to right on the first grid row, each 3 columns
    wide and 1 row high.
    """
    # (widget id, title, grid x offset, widget config)
    stat_cards = [
        ("reviews_classified", "Reviews Classified", 0, {"value_key": "reviews_classified"}),
        ("total_spans", "Total Spans", 3, {"value_key": "total_spans"}),
        ("llm_cost", "LLM Cost", 6, {"value_key": "llm_cost_usd", "format": "${value:.4f}"}),
    ]
    widgets = [
        WidgetConfig(
            id=widget_id,
            type="stat_card",
            title=title,
            grid={"x": x_offset, "y": 0, "w": 3, "h": 1},
            config=widget_config,
        )
        for widget_id, title, x_offset, widget_config in stat_cards
    ]
    stats_section = DashboardSection(
        id="stats",
        title="Classification Stats",
        widgets=widgets,
    )
    return DashboardConfig(
        pipeline_id="classification",
        title="Classification Pipeline",
        description="Monitor classification progress and quality",
        sections=[stats_section],
        default_time_range="7d",
        refresh_interval=60,
    )
|
||||||
|
|
||||||
|
async def get_widget_data(
    self,
    widget_id: str,
    params: dict[str, Any],
) -> dict[str, Any]:
    """Return the values backing the dashboard stat cards for one business.

    The same payload is returned regardless of ``widget_id``; that argument
    is not inspected.

    Args:
        widget_id: Identifier of the requesting widget (unused).
        params: Must contain ``business_id``.

    Returns:
        ``{"error": "business_id required"}`` when no business id is given,
        otherwise the counts of distinct classified reviews and of spans in
        ``pipeline.detected_spans_v2`` plus a placeholder LLM cost of 0.
    """
    await self.initialize()

    business_id = params.get("business_id")
    if not business_id:
        return {"error": "business_id required"}

    stats_query = """
        SELECT
            COUNT(DISTINCT review_id) as reviews_classified,
            COUNT(*) as total_spans
        FROM pipeline.detected_spans_v2
        WHERE business_id = $1
    """
    async with self._db.pool.acquire() as conn:
        stats = await conn.fetchrow(stats_query, business_id)

    payload: dict[str, Any] = {}
    payload["reviews_classified"] = stats["reviews_classified"] or 0
    payload["total_spans"] = stats["total_spans"] or 0
    payload["llm_cost_usd"] = 0  # Would need to track this
    return payload
|
||||||
|
|
||||||
|
async def health_check(self) -> dict[str, Any]:
    """Report pipeline health.

    Runs ``SELECT 1`` on a pooled connection to probe the database and
    reports which LLM client is configured. Only a database failure marks
    the pipeline as unhealthy; the LLM entry is informational.

    Returns:
        ``{"healthy": bool, "checks": {"database": ..., "llm": ...}}`` where
        a failed check carries the stringified exception.
    """
    await self.initialize()

    checks = {}
    healthy = True

    # Database probe.
    try:
        async with self._db.pool.acquire() as conn:
            await conn.fetchval("SELECT 1")
    except Exception as exc:
        checks["database"] = str(exc)
        healthy = False
    else:
        checks["database"] = "ok"

    # LLM status: never flips `healthy`.
    try:
        checks["llm"] = (
            f"{self._config.llm_provider}/{self._config.llm_model}"
            if self._llm
            else "not_initialized"
        )
    except Exception as exc:
        checks["llm"] = str(exc)

    return {"healthy": healthy, "checks": checks}
|
||||||
@@ -76,6 +76,51 @@ class Config(BaseSettings):
|
|||||||
batch_size: int = Field(default=50, ge=1, le=500)
|
batch_size: int = Field(default=50, ge=1, le=500)
|
||||||
trust_score_floor: float = Field(default=0.2, ge=0.0, le=1.0)
|
trust_score_floor: float = Field(default=0.2, ge=0.0, le=1.0)
|
||||||
|
|
||||||
|
# Batched Classification
|
||||||
|
classification_batch_size: int = Field(
|
||||||
|
default=0,
|
||||||
|
ge=0,
|
||||||
|
le=200,
|
||||||
|
description="Number of reviews per LLM call. 0 = auto-calculate based on context window",
|
||||||
|
)
|
||||||
|
classification_max_concurrent: int = Field(
|
||||||
|
default=0,
|
||||||
|
ge=0,
|
||||||
|
description="Maximum concurrent batch requests. 0 = unlimited (run all batches in parallel)",
|
||||||
|
)
|
||||||
|
classification_target_utilization: float = Field(
|
||||||
|
default=0.70,
|
||||||
|
ge=0.3,
|
||||||
|
le=0.85,
|
||||||
|
description="Target context window utilization. Optimal: 0.60-0.75. Above 0.85 causes ~23% quality degradation.",
|
||||||
|
)
|
||||||
|
use_prompt_caching: bool = Field(
|
||||||
|
default=True,
|
||||||
|
description="Enable prompt caching for cost reduction (OpenAI/Anthropic)",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Smart Review Router (cost optimization)
|
||||||
|
router_enabled: bool = Field(
|
||||||
|
default=False,
|
||||||
|
description="Enable smart review routing to skip/route trivial reviews",
|
||||||
|
)
|
||||||
|
router_skip_enabled: bool = Field(
|
||||||
|
default=True,
|
||||||
|
description="Allow SKIP tier (no LLM, assign generic code)",
|
||||||
|
)
|
||||||
|
router_cheap_model_enabled: bool = Field(
|
||||||
|
default=True,
|
||||||
|
description="Allow CHEAP tier (use Haiku instead of Sonnet)",
|
||||||
|
)
|
||||||
|
router_cheap_model: str = Field(
|
||||||
|
default="claude-3-5-haiku-20241022",
|
||||||
|
description="Model to use for CHEAP tier routing",
|
||||||
|
)
|
||||||
|
router_conservative: bool = Field(
|
||||||
|
default=True,
|
||||||
|
description="Use conservative routing (fewer false negatives)",
|
||||||
|
)
|
||||||
|
|
||||||
# Migrations
|
# Migrations
|
||||||
migrations_path: str = Field(
|
migrations_path: str = Field(
|
||||||
default="",
|
default="",
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ enabling independent development and validation of each stage.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
from typing import Any, Literal, TypedDict
|
from typing import Any, Literal, TypedDict
|
||||||
|
|
||||||
|
|
||||||
@@ -181,13 +182,14 @@ class ReviewToClassify(TypedDict):
|
|||||||
review_time: str
|
review_time: str
|
||||||
|
|
||||||
|
|
||||||
class ClassificationConfig(TypedDict):
|
class ClassificationConfig(TypedDict, total=False):
|
||||||
"""Configuration for LLM classification."""
|
"""Configuration for LLM classification."""
|
||||||
|
|
||||||
model: str
|
model: str
|
||||||
taxonomy_version: str
|
taxonomy_version: str
|
||||||
profile: ProfileType
|
profile: ProfileType
|
||||||
max_spans_per_review: int
|
max_spans_per_review: int
|
||||||
|
job_id: str | None # Optional job_id for tracking
|
||||||
|
|
||||||
|
|
||||||
class Stage2Input(TypedDict):
|
class Stage2Input(TypedDict):
|
||||||
@@ -329,6 +331,7 @@ class Stage3Input(TypedDict):
|
|||||||
"""Input to Stage 3 issue routing."""
|
"""Input to Stage 3 issue routing."""
|
||||||
|
|
||||||
spans: list[SpanToRoute]
|
spans: list[SpanToRoute]
|
||||||
|
job_id: str | None # Optional job_id for linking issues to pipeline executions
|
||||||
|
|
||||||
|
|
||||||
class RoutedSpan(TypedDict):
|
class RoutedSpan(TypedDict):
|
||||||
@@ -379,7 +382,7 @@ class FactRecord(TypedDict, total=False):
|
|||||||
# Keys
|
# Keys
|
||||||
business_id: str
|
business_id: str
|
||||||
place_id: str
|
place_id: str
|
||||||
period_date: str
|
period_date: date
|
||||||
bucket_type: str
|
bucket_type: str
|
||||||
subject_type: SubjectType
|
subject_type: SubjectType
|
||||||
subject_id: str
|
subject_id: str
|
||||||
@@ -574,7 +577,7 @@ class FactTimeseries(TypedDict, total=False):
|
|||||||
id: int
|
id: int
|
||||||
business_id: str
|
business_id: str
|
||||||
place_id: str
|
place_id: str
|
||||||
period_date: str
|
period_date: date
|
||||||
bucket_type: BucketType
|
bucket_type: BucketType
|
||||||
subject_type: SubjectType
|
subject_type: SubjectType
|
||||||
subject_id: str
|
subject_id: str
|
||||||
|
|||||||
@@ -0,0 +1,10 @@
|
|||||||
|
-- Migration: 006_add_job_id_to_issues.sql
-- Purpose: record on each issue which pipeline execution (job) produced it.
-- Every statement uses IF NOT EXISTS, so the script can be re-run safely.

-- Nullable UUID column; no foreign key constraint is declared here.
ALTER TABLE pipeline.issues
    ADD COLUMN IF NOT EXISTS job_id UUID;

-- Supports filtering issues by job_id.
CREATE INDEX IF NOT EXISTS idx_issues_job_id
    ON pipeline.issues (job_id);

COMMENT ON COLUMN pipeline.issues.job_id
    IS 'References the scraper job that triggered the pipeline execution';
|
||||||
@@ -0,0 +1,352 @@
|
|||||||
|
-- Migration: Implement URT taxonomy with PostgreSQL ltree
|
||||||
|
-- Benefits:
|
||||||
|
-- 1. Hierarchical queries (find all codes under a domain/category)
|
||||||
|
-- 2. Index-assisted ancestor/descendant lookups (no recursive CTEs needed)
|
||||||
|
-- 3. Pattern matching on paths (e.g., 'O.*' for all Offering codes)
|
||||||
|
-- 4. Efficient GiST indexing for tree operations
|
||||||
|
-- 5. Aggregations at any level of hierarchy
|
||||||
|
|
||||||
|
-- Enable ltree extension
|
||||||
|
CREATE EXTENSION IF NOT EXISTS ltree;
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- NEW UNIFIED TAXONOMY TABLE
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- One row per taxonomy node (domain, category or subcode).
CREATE TABLE IF NOT EXISTS pipeline.urt_taxonomy (
    id                  SERIAL PRIMARY KEY,

    -- Position in the hierarchy as Domain.Category.Subcode, e.g. 'O.O1.O1_01'.
    -- Subcode labels use '_' in place of the '.' found in the human-readable code.
    path                ltree NOT NULL UNIQUE,

    -- Human-readable code, e.g. 'O1.01'.
    code                VARCHAR(10) NOT NULL UNIQUE,

    -- Kind of node, for filtering.
    node_type           VARCHAR(20) NOT NULL CHECK (node_type IN ('domain', 'category', 'subcode')),

    -- Depth derived from the path: 1 = domain, 2 = category, 3 = subcode.
    level               INT GENERATED ALWAYS AS (nlevel(path)) STORED,

    -- Display name and definition.
    name                VARCHAR(100) NOT NULL,
    definition          TEXT,

    -- Example phrasings (populated for subcodes).
    positive_example    TEXT,
    negative_example    TEXT,

    -- Actionability (populated for subcodes).
    solution            TEXT,
    solution_complexity VARCHAR(10) DEFAULT 'medium',
    marketing_angle     TEXT,

    -- Owner routing.
    default_owner       VARCHAR(50),

    -- Bookkeeping.
    is_active           BOOLEAN DEFAULT TRUE,
    created_at          TIMESTAMP DEFAULT NOW(),
    updated_at          TIMESTAMP DEFAULT NOW()
);
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- INDEXES FOR LTREE OPERATIONS
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- All indexes use IF NOT EXISTS so the migration can be re-run, matching the
-- CREATE TABLE IF NOT EXISTS above them.

-- GiST index for ltree operations (ancestor, descendant, pattern matching)
CREATE INDEX IF NOT EXISTS idx_urt_taxonomy_path_gist
    ON pipeline.urt_taxonomy USING GIST (path);

-- B-tree index for exact path lookups and sorting
-- NOTE(review): the UNIQUE constraint on path already creates a B-tree index,
-- so this one looks redundant; kept so existing index names stay valid.
CREATE INDEX IF NOT EXISTS idx_urt_taxonomy_path_btree
    ON pipeline.urt_taxonomy USING BTREE (path);

-- Index for code lookups (most common operation)
-- NOTE(review): likewise duplicated by the UNIQUE constraint on code.
CREATE INDEX IF NOT EXISTS idx_urt_taxonomy_code
    ON pipeline.urt_taxonomy (code);

-- Index for node type filtering
CREATE INDEX IF NOT EXISTS idx_urt_taxonomy_node_type
    ON pipeline.urt_taxonomy (node_type);
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- MIGRATE EXISTING DATA
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Insert domains (level 1).
-- Definition and default owner come from an inline lookup keyed on the domain
-- code; a domain code missing from the lookup gets NULL for both columns.
INSERT INTO pipeline.urt_taxonomy (path, code, node_type, name, definition, default_owner)
SELECT
    d.code::ltree AS path,
    d.code,
    'domain',
    d.name,
    meta.definition,
    meta.default_owner
FROM pipeline.urt_domains d
LEFT JOIN (
    VALUES
        ('O', 'Does the core product/service deliver?', 'Product/Operations'),
        ('P', 'How do personnel behave and perform?',   'HR/Training'),
        ('J', 'Is the process smooth and timely?',      'Operations/Process'),
        ('E', 'Is the space functional and pleasant?',  'Facilities/IT'),
        ('A', 'Can everyone participate fully?',        'Compliance/Design'),
        ('V', 'Is the exchange fair and transparent?',  'Finance/Pricing'),
        ('R', 'Is trust built and maintained?',         'Leadership/CX')
) AS meta (code, definition, default_owner)
    ON meta.code = d.code
ON CONFLICT (code) DO NOTHING;

-- Insert categories (level 2): path is '<domain>.<category>'.
INSERT INTO pipeline.urt_taxonomy (path, code, node_type, name, definition)
SELECT
    (c.domain_code || '.' || c.code)::ltree AS path,
    c.code,
    'category',
    c.name,
    NULL  -- Categories don't have definitions in current schema
FROM pipeline.urt_categories c
ON CONFLICT (code) DO NOTHING;

-- Insert subcodes (level 3): path is '<domain>.<category>.<subcode>', with the
-- '.' inside the subcode replaced by '_' before it is used as a path label.
INSERT INTO pipeline.urt_taxonomy (
    path, code, node_type, name, definition,
    positive_example, negative_example,
    solution, solution_complexity, marketing_angle
)
SELECT
    (s.domain_code || '.' || s.category_code || '.' || replace(s.code, '.', '_'))::ltree AS path,
    s.code,
    'subcode',
    s.name,
    s.definition,
    s.positive_example,
    s.negative_example,
    s.solution,
    s.solution_complexity,
    s.marketing_angle
FROM pipeline.urt_subcodes s
ON CONFLICT (code) DO NOTHING;
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- HELPER FUNCTIONS
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Get all ancestors of a code (e.g., O1.01 -> [O, O1]), ordered from the
-- top of the hierarchy downwards. Returns no rows for an unknown code.
--
-- Every column reference in the body is table-qualified. The RETURNS TABLE
-- columns (code, name, node_type, level) are PL/pgSQL variables inside the
-- function, so a bare "code" in the subquery would fail at call time with
-- 'column reference "code" is ambiguous'.
-- Declared STABLE because the function only reads from the table.
CREATE OR REPLACE FUNCTION pipeline.urt_ancestors(p_code VARCHAR)
RETURNS TABLE(code VARCHAR, name VARCHAR, node_type VARCHAR, level INT) AS $$
BEGIN
    RETURN QUERY
    SELECT t.code, t.name, t.node_type, t.level
    FROM pipeline.urt_taxonomy t
    WHERE t.path @> (SELECT s.path FROM pipeline.urt_taxonomy s WHERE s.code = p_code)
      AND t.code != p_code
    ORDER BY t.level;
END;
$$ LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
-- Get all descendants of a code (e.g., O -> all O* codes), ordered by path.
-- Returns no rows for an unknown code.
--
-- Every column reference in the body is table-qualified. The RETURNS TABLE
-- columns (code, name, node_type, level) are PL/pgSQL variables inside the
-- function, so a bare "code" in the subquery would fail at call time with
-- 'column reference "code" is ambiguous'.
-- Declared STABLE because the function only reads from the table.
CREATE OR REPLACE FUNCTION pipeline.urt_descendants(p_code VARCHAR)
RETURNS TABLE(code VARCHAR, name VARCHAR, node_type VARCHAR, level INT) AS $$
BEGIN
    RETURN QUERY
    SELECT t.code, t.name, t.node_type, t.level
    FROM pipeline.urt_taxonomy t
    WHERE t.path <@ (SELECT s.path FROM pipeline.urt_taxonomy s WHERE s.code = p_code)
      AND t.code != p_code
    ORDER BY t.path;
END;
$$ LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
-- Get siblings (nodes sharing the same parent path), excluding the code
-- itself, ordered by path. Returns no rows for an unknown code, because
-- v_parent stays NULL and the equality below never matches.
--
-- Every column reference in the body is table-qualified. The RETURNS TABLE
-- columns (code, name, level) are PL/pgSQL variables inside the function, so
-- a bare "code" in the lookup would fail at call time with
-- 'column reference "code" is ambiguous'.
-- Declared STABLE because the function only reads from the table.
CREATE OR REPLACE FUNCTION pipeline.urt_siblings(p_code VARCHAR)
RETURNS TABLE(code VARCHAR, name VARCHAR, level INT) AS $$
DECLARE
    v_parent ltree;
BEGIN
    -- Parent path = own path without its last label.
    SELECT subpath(s.path, 0, nlevel(s.path) - 1) INTO v_parent
    FROM pipeline.urt_taxonomy s
    WHERE s.code = p_code;

    RETURN QUERY
    SELECT t.code, t.name, t.level
    FROM pipeline.urt_taxonomy t
    WHERE subpath(t.path, 0, nlevel(t.path) - 1) = v_parent
      AND t.code != p_code
    ORDER BY t.path;
END;
$$ LANGUAGE plpgsql STABLE;
|
||||||
|
|
||||||
|
-- Get the domain code for any code: the 'domain' node whose path is an
-- ancestor of (or equal to) the given code's path. NULL for an unknown code.
CREATE OR REPLACE FUNCTION pipeline.urt_domain(p_code VARCHAR)
RETURNS VARCHAR AS $$
    SELECT anc.code
    FROM pipeline.urt_taxonomy anc
    WHERE anc.node_type = 'domain'
      AND anc.path @> (
          SELECT self.path
          FROM pipeline.urt_taxonomy self
          WHERE self.code = p_code
      );
$$ LANGUAGE SQL;
|
||||||
|
|
||||||
|
-- Get the category code for a subcode: the 'category' node whose path is an
-- ancestor of (or equal to) the given code's path. A category code therefore
-- returns itself; a domain code or an unknown code returns NULL.
CREATE OR REPLACE FUNCTION pipeline.urt_category(p_code VARCHAR)
RETURNS VARCHAR AS $$
    SELECT anc.code
    FROM pipeline.urt_taxonomy anc
    WHERE anc.node_type = 'category'
      AND anc.path @> (
          SELECT self.path
          FROM pipeline.urt_taxonomy self
          WHERE self.code = p_code
      );
$$ LANGUAGE SQL;
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- VIEW: FLATTENED TAXONOMY WITH HIERARCHY INFO
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Flattened taxonomy: every node plus its parent, domain and category paths.
-- Column names and order are unchanged, as CREATE OR REPLACE VIEW requires.
CREATE OR REPLACE VIEW pipeline.v_urt_taxonomy AS
SELECT
    t.id,
    t.path,
    t.code,
    t.node_type,
    t.level,
    t.name,
    t.definition,
    -- Parent info: NULL for domains (level 1).
    CASE WHEN t.level > 1 THEN subpath(t.path, 0, t.level - 1)::text END AS parent_path,
    -- Domain info (for rollups): first path label and that node's name.
    subpath(t.path, 0, 1)::text AS domain_code,
    dom.name AS domain_name,
    -- Category info: first two labels; NULL for domains.
    CASE WHEN t.level >= 2 THEN subpath(t.path, 0, 2)::text END AS category_path,
    -- Full path as breadcrumb
    t.path::text AS full_path,
    -- Actionability
    t.solution,
    t.default_owner,
    t.is_active
FROM pipeline.urt_taxonomy t
-- path is UNIQUE, so this join matches at most one domain row per node.
LEFT JOIN pipeline.urt_taxonomy dom
    ON dom.path = subpath(t.path, 0, 1)
ORDER BY t.path;
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- UPDATE REVIEW_SPANS TO USE LTREE
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Give review_spans an ltree column so spans can be queried by hierarchy.
ALTER TABLE pipeline.review_spans
    ADD COLUMN IF NOT EXISTS urt_path ltree;

-- Backfill: copy the taxonomy path of each span's primary code. Only rows
-- without a path are touched, so re-running does not rewrite filled rows.
UPDATE pipeline.review_spans AS rs
SET urt_path = t.path
FROM pipeline.urt_taxonomy AS t
WHERE rs.urt_path IS NULL
  AND rs.urt_primary = t.code;

-- GiST index for hierarchy queries on spans.
CREATE INDEX IF NOT EXISTS idx_review_spans_urt_path_gist
    ON pipeline.review_spans USING GIST (urt_path);
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- EXAMPLE QUERIES (for reference)
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- These are example queries, not executed:
|
||||||
|
/*
|
||||||
|
|
||||||
|
-- 1. Find all subcodes under "People" domain
|
||||||
|
SELECT code, name FROM pipeline.urt_taxonomy
|
||||||
|
WHERE path <@ 'P' AND node_type = 'subcode';
|
||||||
|
|
||||||
|
-- 2. Find all codes matching pattern (e.g., all Value subcodes)
|
||||||
|
SELECT code, name FROM pipeline.urt_taxonomy
|
||||||
|
WHERE path ~ 'V.*' AND node_type = 'subcode';
|
||||||
|
|
||||||
|
-- 3. Aggregate span counts by domain
|
||||||
|
SELECT
|
||||||
|
subpath(urt_path, 0, 1)::text as domain,
|
||||||
|
COUNT(*) as span_count
|
||||||
|
FROM pipeline.review_spans
|
||||||
|
WHERE urt_path IS NOT NULL
|
||||||
|
GROUP BY subpath(urt_path, 0, 1)
|
||||||
|
ORDER BY span_count DESC;
|
||||||
|
|
||||||
|
-- 4. Aggregate by category within a domain
|
||||||
|
SELECT
|
||||||
|
subpath(urt_path, 0, 2)::text as category,
|
||||||
|
COUNT(*) as span_count
|
||||||
|
FROM pipeline.review_spans
|
||||||
|
WHERE urt_path <@ 'O' -- All Offering codes
|
||||||
|
GROUP BY subpath(urt_path, 0, 2)
|
||||||
|
ORDER BY span_count DESC;
|
||||||
|
|
||||||
|
-- 5. Get ancestors of a specific code
|
||||||
|
SELECT * FROM pipeline.urt_ancestors('O1.01');
|
||||||
|
-- Returns: O (Offering), O1 (Function)
|
||||||
|
|
||||||
|
-- 6. Get all descendants of a category
|
||||||
|
SELECT * FROM pipeline.urt_descendants('O1');
|
||||||
|
-- Returns: O1.01, O1.02, O1.03, O1.04, O1.05
|
||||||
|
|
||||||
|
-- 7. Find the domain owner for a code
|
||||||
|
SELECT pipeline.urt_domain('P1.01');
|
||||||
|
-- Returns: P (People)
|
||||||
|
|
||||||
|
-- 8. Drill-down query: Domain -> Category -> Subcode
|
||||||
|
WITH RECURSIVE tree AS (
|
||||||
|
SELECT path, code, name, level
|
||||||
|
FROM pipeline.urt_taxonomy
|
||||||
|
WHERE node_type = 'domain' AND code = 'O'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT t.path, t.code, t.name, t.level
|
||||||
|
FROM pipeline.urt_taxonomy t
|
||||||
|
JOIN tree ON t.path <@ tree.path AND nlevel(t.path) = nlevel(tree.path) + 1
|
||||||
|
)
|
||||||
|
SELECT * FROM tree ORDER BY path;
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- TRIGGER: Auto-update urt_path on review_spans
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Trigger function: derive urt_path from urt_primary via the taxonomy table.
-- The path becomes NULL when urt_primary matches no taxonomy code.
CREATE OR REPLACE FUNCTION pipeline.set_urt_path()
RETURNS TRIGGER AS $$
BEGIN
    NEW.urt_path := (
        SELECT t.path
        FROM pipeline.urt_taxonomy t
        WHERE t.code = NEW.urt_primary
    );
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Recreate the trigger so the script can be re-run. It fires on every insert
-- and on updates that list urt_primary.
DROP TRIGGER IF EXISTS trg_set_urt_path ON pipeline.review_spans;
CREATE TRIGGER trg_set_urt_path
    BEFORE INSERT OR UPDATE OF urt_primary
    ON pipeline.review_spans
    FOR EACH ROW
    EXECUTE FUNCTION pipeline.set_urt_path();
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- MATERIALIZED VIEW: Pre-computed hierarchy rollups
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Per-domain, per-valence rollup of spans: span count, distinct review count
-- and mean intensity. Intensity labels 'I1'/'I2'/'I3' map to 1/2/3; any
-- other value is ignored by AVG because the CASE yields NULL for it.
CREATE MATERIALIZED VIEW IF NOT EXISTS pipeline.mv_urt_domain_stats AS
SELECT
    subpath(rs.urt_path, 0, 1)::text as domain_code,
    t.name as domain_name,
    rs.valence,
    COUNT(*) as span_count,
    COUNT(DISTINCT rs.review_id) as review_count,
    AVG(CASE rs.intensity
        WHEN 'I1' THEN 1
        WHEN 'I2' THEN 2
        WHEN 'I3' THEN 3
    END) as avg_intensity
FROM pipeline.review_spans rs
JOIN pipeline.urt_taxonomy t ON subpath(rs.urt_path, 0, 1) = t.path
WHERE rs.urt_path IS NOT NULL
GROUP BY subpath(rs.urt_path, 0, 1), t.name, rs.valence;

-- Unique index, required by REFRESH MATERIALIZED VIEW CONCURRENTLY.
-- It is named explicitly and created with IF NOT EXISTS: an anonymous
-- CREATE UNIQUE INDEX would add another duplicate index on every re-run.
-- The name is the one PostgreSQL is expected to have auto-generated for the
-- previously anonymous index, so already-migrated databases should not gain
-- a duplicate - confirm against an existing database.
CREATE UNIQUE INDEX IF NOT EXISTS mv_urt_domain_stats_domain_code_valence_idx
    ON pipeline.mv_urt_domain_stats (domain_code, valence);
|
||||||
|
|
||||||
|
-- Refresh command (run periodically):
|
||||||
|
-- REFRESH MATERIALIZED VIEW CONCURRENTLY pipeline.mv_urt_domain_stats;
|
||||||
|
|
||||||
|
COMMENT ON TABLE pipeline.urt_taxonomy IS 'Unified URT taxonomy using ltree for hierarchical queries. Replaces urt_domains, urt_categories, urt_subcodes.';
|
||||||
@@ -70,16 +70,18 @@ class ReviewRepository:
|
|||||||
self,
|
self,
|
||||||
review: NormalizedReview,
|
review: NormalizedReview,
|
||||||
raw_id: int,
|
raw_id: int,
|
||||||
|
job_id: str | None = None,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Insert an enriched review stub (pre-classification)."""
|
"""Insert an enriched review stub (pre-classification)."""
|
||||||
query = """
|
query = """
|
||||||
INSERT INTO pipeline.reviews_enriched (
|
INSERT INTO pipeline.reviews_enriched (
|
||||||
source, review_id, review_version, is_latest, raw_id,
|
source, review_id, review_version, is_latest, raw_id,
|
||||||
business_id, place_id, text, text_normalized, rating, review_time,
|
business_id, place_id, text, text_normalized, rating, review_time,
|
||||||
language, taxonomy_version
|
language, taxonomy_version, job_id
|
||||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14::uuid)
|
||||||
ON CONFLICT (source, review_id, review_version) DO UPDATE SET
|
ON CONFLICT (source, review_id, review_version) DO UPDATE SET
|
||||||
is_latest = EXCLUDED.is_latest
|
is_latest = EXCLUDED.is_latest,
|
||||||
|
job_id = COALESCE(EXCLUDED.job_id, pipeline.reviews_enriched.job_id)
|
||||||
RETURNING id
|
RETURNING id
|
||||||
"""
|
"""
|
||||||
enriched_id = await self.db.fetchval(
|
enriched_id = await self.db.fetchval(
|
||||||
@@ -97,6 +99,7 @@ class ReviewRepository:
|
|||||||
review["review_time"],
|
review["review_time"],
|
||||||
review["text_language"],
|
review["text_language"],
|
||||||
"v5.1", # taxonomy_version - will be updated by Stage 2
|
"v5.1", # taxonomy_version - will be updated by Stage 2
|
||||||
|
job_id,
|
||||||
)
|
)
|
||||||
return enriched_id
|
return enriched_id
|
||||||
|
|
||||||
@@ -213,6 +216,7 @@ class SpanRepository:
|
|||||||
batch_id: str,
|
batch_id: str,
|
||||||
model_version: str,
|
model_version: str,
|
||||||
taxonomy_version: str,
|
taxonomy_version: str,
|
||||||
|
job_id: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Insert a span into the database."""
|
"""Insert a span into the database."""
|
||||||
query = """
|
query = """
|
||||||
@@ -224,15 +228,17 @@ class SpanRepository:
|
|||||||
entity, entity_type, entity_normalized,
|
entity, entity_type, entity_normalized,
|
||||||
relation_type, related_span_id, causal_chain,
|
relation_type, related_span_id, causal_chain,
|
||||||
is_primary, is_active, review_time,
|
is_primary, is_active, review_time,
|
||||||
confidence, usn, taxonomy_version, model_version, ingest_batch_id
|
confidence, usn, taxonomy_version, model_version, ingest_batch_id,
|
||||||
|
job_id
|
||||||
) VALUES (
|
) VALUES (
|
||||||
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
|
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
|
||||||
$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
|
$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
|
||||||
$21, $22, $23, $24, $25, $26, $27, $28, $29, $30,
|
$21, $22, $23, $24, $25, $26, $27, $28, $29, $30,
|
||||||
$31, $32, $33, $34
|
$31, $32, $33, $34, $35::uuid
|
||||||
)
|
)
|
||||||
ON CONFLICT (span_id) DO UPDATE SET
|
ON CONFLICT (span_id) DO UPDATE SET
|
||||||
is_active = EXCLUDED.is_active
|
is_active = EXCLUDED.is_active,
|
||||||
|
job_id = COALESCE(EXCLUDED.job_id, pipeline.review_spans.job_id)
|
||||||
"""
|
"""
|
||||||
# Build related_span_id from index if needed
|
# Build related_span_id from index if needed
|
||||||
related_span_id = None
|
related_span_id = None
|
||||||
@@ -276,6 +282,7 @@ class SpanRepository:
|
|||||||
taxonomy_version,
|
taxonomy_version,
|
||||||
model_version,
|
model_version,
|
||||||
batch_id,
|
batch_id,
|
||||||
|
job_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def get_unrouted_negative_spans(
|
async def get_unrouted_negative_spans(
|
||||||
@@ -312,6 +319,24 @@ class SpanRepository:
|
|||||||
row = await self.db.fetchrow(query, span_id)
|
row = await self.db.fetchrow(query, span_id)
|
||||||
return dict(row) if row else None
|
return dict(row) if row else None
|
||||||
|
|
||||||
|
async def deactivate_spans_for_job(self, job_id: str) -> int:
    """Flag every currently active span of a job as inactive.

    Used before a job is reclassified.

    Args:
        job_id: Job identifier; it is cast to ``uuid`` inside the SQL.

    Returns:
        The number of spans deactivated, parsed from the command status.
        0 when the status is empty or does not look like "UPDATE <n>".
    """
    deactivate_sql = """
        UPDATE pipeline.review_spans
        SET is_active = FALSE
        WHERE job_id = $1::uuid AND is_active = TRUE
    """
    status = await self.db.execute(deactivate_sql, job_id)

    # The status is a command tag such as "UPDATE 42"; anything else counts as 0.
    if not status or not status.startswith("UPDATE "):
        return 0
    return int(status.split()[1])
|
||||||
|
|
||||||
|
|
||||||
class IssueRepository:
|
class IssueRepository:
|
||||||
"""Repository for issue data operations."""
|
"""Repository for issue data operations."""
|
||||||
@@ -329,6 +354,7 @@ class IssueRepository:
|
|||||||
entity: str | None,
|
entity: str | None,
|
||||||
entity_normalized: str | None,
|
entity_normalized: str | None,
|
||||||
taxonomy_version: str,
|
taxonomy_version: str,
|
||||||
|
job_id: str | None = None,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Create or update an issue. Returns True if newly created."""
|
"""Create or update an issue. Returns True if newly created."""
|
||||||
# First check if exists
|
# First check if exists
|
||||||
@@ -363,8 +389,8 @@ class IssueRepository:
|
|||||||
INSERT INTO pipeline.issues (
|
INSERT INTO pipeline.issues (
|
||||||
issue_id, business_id, place_id, primary_subcode, domain,
|
issue_id, business_id, place_id, primary_subcode, domain,
|
||||||
state, priority_score, confidence_score, span_count, max_intensity,
|
state, priority_score, confidence_score, span_count, max_intensity,
|
||||||
entity, entity_normalized, taxonomy_version
|
entity, entity_normalized, taxonomy_version, job_id
|
||||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14::uuid)
|
||||||
""",
|
""",
|
||||||
issue_id,
|
issue_id,
|
||||||
business_id,
|
business_id,
|
||||||
@@ -379,6 +405,7 @@ class IssueRepository:
|
|||||||
entity,
|
entity,
|
||||||
entity_normalized,
|
entity_normalized,
|
||||||
taxonomy_version,
|
taxonomy_version,
|
||||||
|
job_id,
|
||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -448,6 +475,41 @@ class IssueRepository:
|
|||||||
span_id,
|
span_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def delete_issues_for_job(self, job_id: str) -> int:
    """Remove every issue recorded for a job, plus its dependent rows.

    Used before a job is reclassified. Rows in ``pipeline.issue_spans`` and
    ``pipeline.issue_events`` that point at the job's issues are deleted
    before the issues themselves.

    Args:
        job_id: Job identifier; it is cast to ``uuid`` inside the SQL.

    Returns:
        The number of issues deleted, parsed from the command status of the
        final DELETE. 0 when that status is empty or not "DELETE <n>".
    """
    purge_issue_spans = """
        DELETE FROM pipeline.issue_spans
        WHERE issue_id IN (
            SELECT issue_id FROM pipeline.issues WHERE job_id = $1::uuid
        )
    """
    purge_issue_events = """
        DELETE FROM pipeline.issue_events
        WHERE issue_id IN (
            SELECT issue_id FROM pipeline.issues WHERE job_id = $1::uuid
        )
    """

    # Dependent rows go first: both subqueries look the job's issues up in
    # pipeline.issues, so those rows must still exist at this point.
    # NOTE(review): the three statements are issued separately; whether they
    # share a transaction depends on self.db - confirm.
    for child_sql in (purge_issue_spans, purge_issue_events):
        await self.db.execute(child_sql, job_id)

    status = await self.db.execute(
        "DELETE FROM pipeline.issues WHERE job_id = $1::uuid",
        job_id,
    )

    # The status is a command tag such as "DELETE 42".
    if not status or not status.startswith("DELETE "):
        return 0
    return int(status.split()[1])
|
||||||
|
|
||||||
|
|
||||||
class FactRepository:
|
class FactRepository:
|
||||||
"""Repository for fact time series operations."""
|
"""Repository for fact time series operations."""
|
||||||
|
|||||||
@@ -0,0 +1,764 @@
|
|||||||
|
"""
|
||||||
|
Reputation Pipeline - Primitives-based classification and reputation analytics.
|
||||||
|
|
||||||
|
This pipeline uses the new primitives taxonomy (MANNER, SPEED, VALUE_FOR_MONEY, etc.)
|
||||||
|
instead of the legacy URT codes. It powers the Reputation Report product.
|
||||||
|
|
||||||
|
Stages:
|
||||||
|
- classify: Coverage check over the primitive-classified spans already stored in detected_spans_v2
|
||||||
|
- report: Generate reputation report JSON
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
pipeline = ReputationPipeline()
|
||||||
|
await pipeline.initialize()
|
||||||
|
result = await pipeline.process({"business_id": "Go Karts Mar Menor", "days": 365})
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
from pipeline_core import (
|
||||||
|
BasePipeline,
|
||||||
|
DashboardConfig,
|
||||||
|
DashboardSection,
|
||||||
|
PipelineMetadata,
|
||||||
|
PipelineResult as BasePipelineResult,
|
||||||
|
StageResult,
|
||||||
|
WidgetConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
from reviewiq_pipeline.config import Config
|
||||||
|
from reviewiq_pipeline.db.connection import DatabasePool
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Stages this pipeline knows about, in the order process() runs them
STAGE_NAMES = ["classify", "report"]

# Primitives grouped under the domain code they belong to
_PRIMITIVES_BY_DOMAIN = (
    # Output/Product
    ("O", (
        "TASTE", "CRAFT", "FRESHNESS", "TEMPERATURE",
        "EFFECTIVENESS", "ACCURACY", "CONDITION", "CONSISTENCY",
    )),
    # People/Service
    ("P", ("MANNER", "COMPETENCE", "ATTENTIVENESS", "COMMUNICATION")),
    # Journey/Process
    ("J", ("SPEED", "FRICTION", "RELIABILITY", "AVAILABILITY")),
    # Environment
    ("E", (
        "CLEANLINESS", "COMFORT", "SAFETY", "AMBIANCE",
        "ACCESSIBILITY", "DIGITAL_UX",
    )),
    # Value
    ("V", (
        "PRICE_LEVEL", "PRICE_FAIRNESS", "PRICE_TRANSPARENCY",
        "VALUE_FOR_MONEY",
    )),
    # Meta
    ("meta", (
        "HONESTY", "ETHICS", "PROMISES",
        "ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
        "RETURN_INTENT", "RECOMMEND", "RECOGNITION",
        "UNMAPPED", "NON_INFORMATIVE",
    )),
)

# Flat lookup: primitive name -> domain code
DOMAIN_MAP = {
    primitive: domain_code
    for domain_code, primitives in _PRIMITIVES_BY_DOMAIN
    for primitive in primitives
}

# Display name for each domain code
DOMAIN_NAMES = dict(
    [
        ("O", "Output/Product"),
        ("P", "People/Service"),
        ("J", "Journey/Process"),
        ("E", "Environment"),
        ("V", "Value"),
        ("meta", "Meta"),
    ]
)
|
||||||
|
|
||||||
|
|
||||||
|
class ReputationPipeline(BasePipeline):
|
||||||
|
"""
|
||||||
|
Reputation Pipeline - Primitives-based classification and analytics.
|
||||||
|
|
||||||
|
Uses the new primitives taxonomy (37 primitives across 5 domains + meta)
|
||||||
|
for more actionable, business-friendly insights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: Config | None = None):
|
||||||
|
"""Initialize the pipeline."""
|
||||||
|
self._config = config or Config()
|
||||||
|
self._db: DatabasePool | None = None
|
||||||
|
self._initialized = False
|
||||||
|
|
||||||
|
@property
def config(self) -> Config:
    """Configuration object held by this pipeline (read-only accessor)."""
    return self._config
|
||||||
|
|
||||||
|
@property
def metadata(self) -> PipelineMetadata:
    """Static descriptor of this pipeline.

    Returns:
        A ``PipelineMetadata`` carrying the pipeline id, display name,
        description, version, stage list (``STAGE_NAMES``) and input type.
    """
    descriptor = {
        "id": "reputation",
        "name": "Reputation Analytics Pipeline",
        "description": "Primitives-based classification and reputation scoring. Generates business-facing analytics reports.",
        "version": "2.0.0",
        "stages": STAGE_NAMES,
        "input_type": "BusinessInput",
    }
    return PipelineMetadata(**descriptor)
|
||||||
|
|
||||||
|
async def initialize(self) -> None:
    """Create and initialize the database pool on first use.

    Calling this again while the pipeline is already initialized does
    nothing.
    """
    if not self._initialized:
        logger.info("Initializing Reputation pipeline...")

        # The pool is stored on the instance before it is initialized.
        pool = DatabasePool(self._config)
        self._db = pool
        await pool.initialize()

        self._initialized = True
        logger.info("Reputation pipeline initialized")
|
||||||
|
|
||||||
|
async def close(self) -> None:
    """Close the database pool, if one exists, and mark the pipeline uninitialized."""
    pool = self._db
    if pool:
        await pool.close()
        self._db = None

    # Reset the flag even when there was no pool to close.
    self._initialized = False
    logger.info("Reputation pipeline closed")
|
||||||
|
|
||||||
|
async def process(
    self,
    input_data: dict[str, Any],
    stages: list[str] | None = None,
) -> BasePipelineResult:
    """
    Run the requested stages for one business and collect their results.

    Args:
        input_data: Must contain business_id OR job_id. Optional: days
            (default 365), start, end. start/end are ISO-8601 strings and
            replace the bounds derived from days.
        stages: List of stage names to run (default: all)

    Returns:
        BasePipelineResult with stage outputs. A missing business or an
        unparseable time window produces a failed result with no stages
        run. A stage that raises is recorded as failed (and left out of
        stages_run) while the remaining stages still execute.
    """
    await self.initialize()

    stages = stages or STAGE_NAMES
    stages_run: list[str] = []
    stage_results: dict[str, StageResult] = {}

    def rejected(message: str) -> BasePipelineResult:
        # Result used when the input is unusable and nothing was run.
        return BasePipelineResult(
            pipeline_id="reputation",
            stages_run=[],
            stage_results={},
            success=False,
            error=message,
        )

    business_id = input_data.get("business_id")
    job_id = input_data.get("job_id")

    # Resolve business_id from job_id if not provided directly
    if not business_id and job_id:
        try:
            async with self._db.pool.acquire() as conn:
                row = await conn.fetchrow(
                    "SELECT business_name FROM jobs WHERE job_id = $1",
                    uuid.UUID(job_id) if isinstance(job_id, str) else job_id,
                )
                if row and row["business_name"]:
                    business_id = row["business_name"]
                    logger.info(f"Resolved business_id '{business_id}' from job_id '{job_id}'")
        except Exception as e:
            # Best effort: a failed lookup falls through to the check below.
            logger.warning(f"Failed to resolve business_id from job_id: {e}")

    if not business_id:
        return rejected("business_id is required (provide business_id or job_id)")

    # Parse time window. Bad caller input (non-numeric days, malformed ISO
    # strings) is reported as a failed result instead of escaping as an
    # exception.
    try:
        end_date = datetime.utcnow()
        start_date = end_date - timedelta(days=input_data.get("days", 365))

        if input_data.get("start"):
            start_date = datetime.fromisoformat(input_data["start"])
        if input_data.get("end"):
            end_date = datetime.fromisoformat(input_data["end"])
    except (TypeError, ValueError, OverflowError) as e:
        logger.warning(f"Invalid time window in input: {e}")
        return rejected(f"Invalid time window: {e}")

    async def run_stage(name: str, runner, failure_message: str) -> None:
        # Execute one stage, time it, and record the outcome in stage_results.
        started = time.time()
        try:
            data = await runner(business_id, start_date, end_date)
            duration_ms = int((time.time() - started) * 1000)
            stages_run.append(name)
            stage_results[name] = StageResult(
                stage=name,
                success=True,
                data=data,
                error=None,
                duration_ms=duration_ms,
            )
        except Exception as e:
            logger.exception(failure_message)
            stage_results[name] = StageResult(
                stage=name,
                success=False,
                data={},
                error=str(e),
                duration_ms=int((time.time() - started) * 1000),
            )

    try:
        # Stage: Classify (uses existing spans from detected_spans_v2)
        if "classify" in stages:
            logger.info(f"Running Classification check for {business_id}")
            await run_stage(
                "classify", self._check_classification, "Classification check failed"
            )

        # Stage: Report (generate reputation report)
        if "report" in stages:
            logger.info(f"Generating Reputation Report for {business_id}")
            await run_stage("report", self._generate_report, "Report generation failed")

        return BasePipelineResult(
            pipeline_id="reputation",
            stages_run=stages_run,
            stage_results=stage_results,
            success=all(r["success"] for r in stage_results.values()),
        )

    except Exception as e:
        logger.exception("Pipeline failed with unexpected error")
        return BasePipelineResult(
            pipeline_id="reputation",
            stages_run=stages_run,
            stage_results=stage_results,
            success=False,
            error=str(e),
        )
|
||||||
|
|
||||||
|
async def _check_classification(
|
||||||
|
self,
|
||||||
|
business_id: str,
|
||||||
|
start_date: datetime,
|
||||||
|
end_date: datetime,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Check classification coverage for the business."""
|
||||||
|
if not self._db:
|
||||||
|
return {"error": "Database not initialized"}
|
||||||
|
|
||||||
|
async with self._db.pool.acquire() as conn:
|
||||||
|
# Get span counts
|
||||||
|
row = await conn.fetchrow(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
COUNT(*) as total_spans,
|
||||||
|
COUNT(*) FILTER (WHERE valence = '+') as positive,
|
||||||
|
COUNT(*) FILTER (WHERE valence = '-') as negative,
|
||||||
|
COUNT(*) FILTER (WHERE valence = '0') as neutral,
|
||||||
|
COUNT(*) FILTER (WHERE valence = '±') as mixed,
|
||||||
|
COUNT(*) FILTER (WHERE primitive = 'UNMAPPED') as unmapped,
|
||||||
|
COUNT(*) FILTER (WHERE primitive = 'NON_INFORMATIVE') as non_informative,
|
||||||
|
COUNT(DISTINCT s.review_id) as reviews_with_spans
|
||||||
|
FROM pipeline.detected_spans_v2 s
|
||||||
|
JOIN pipeline.review_facts_v1 f
|
||||||
|
ON f.review_id = s.review_id AND f.business_id = s.business_id
|
||||||
|
WHERE s.business_id = $1
|
||||||
|
AND f.review_time_utc >= $2
|
||||||
|
AND f.review_time_utc < $3
|
||||||
|
""",
|
||||||
|
business_id,
|
||||||
|
start_date,
|
||||||
|
end_date,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not row or row["total_spans"] == 0:
|
||||||
|
return {
|
||||||
|
"status": "no_data",
|
||||||
|
"message": "No classified spans found for this business/period",
|
||||||
|
"total_spans": 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
total = row["total_spans"]
|
||||||
|
unmapped_rate = row["unmapped"] / total if total > 0 else 0
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "ok" if unmapped_rate < 0.10 else "needs_attention",
|
||||||
|
"total_spans": total,
|
||||||
|
"reviews_with_spans": row["reviews_with_spans"],
|
||||||
|
"positive_count": row["positive"],
|
||||||
|
"negative_count": row["negative"],
|
||||||
|
"neutral_count": row["neutral"],
|
||||||
|
"mixed_count": row["mixed"],
|
||||||
|
"unmapped_count": row["unmapped"],
|
||||||
|
"non_informative_count": row["non_informative"],
|
||||||
|
"unmapped_rate": round(unmapped_rate * 100, 1),
|
||||||
|
}
|
||||||
|
|
||||||
|
async def _generate_report(
|
||||||
|
self,
|
||||||
|
business_id: str,
|
||||||
|
start_date: datetime,
|
||||||
|
end_date: datetime,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Generate a reputation report summary."""
|
||||||
|
if not self._db:
|
||||||
|
return {"error": "Database not initialized"}
|
||||||
|
|
||||||
|
async with self._db.pool.acquire() as conn:
|
||||||
|
# Get overall scores
|
||||||
|
row = await conn.fetchrow(
|
||||||
|
"""
|
||||||
|
WITH span_data AS (
|
||||||
|
SELECT
|
||||||
|
s.primitive,
|
||||||
|
s.valence,
|
||||||
|
s.confidence,
|
||||||
|
s.intensity,
|
||||||
|
CASE s.valence
|
||||||
|
WHEN '+' THEN 1
|
||||||
|
WHEN '-' THEN -1
|
||||||
|
ELSE 0
|
||||||
|
END as valence_num
|
||||||
|
FROM pipeline.detected_spans_v2 s
|
||||||
|
JOIN pipeline.review_facts_v1 f
|
||||||
|
ON f.review_id = s.review_id AND f.business_id = s.business_id
|
||||||
|
WHERE s.business_id = $1
|
||||||
|
AND f.review_time_utc >= $2
|
||||||
|
AND f.review_time_utc < $3
|
||||||
|
AND s.primitive NOT IN ('UNMAPPED', 'NON_INFORMATIVE')
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
COUNT(*) as content_spans,
|
||||||
|
ROUND(
|
||||||
|
100.0 * SUM(valence_num * confidence * intensity) /
|
||||||
|
NULLIF(SUM(confidence * intensity), 0),
|
||||||
|
1
|
||||||
|
) as overall_score,
|
||||||
|
ROUND(100.0 * COUNT(*) FILTER (WHERE valence = '+') / NULLIF(COUNT(*), 0), 1) as positive_share
|
||||||
|
FROM span_data
|
||||||
|
""",
|
||||||
|
business_id,
|
||||||
|
start_date,
|
||||||
|
end_date,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not row or row["content_spans"] == 0:
|
||||||
|
return {
|
||||||
|
"status": "no_data",
|
||||||
|
"message": "No content spans found",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Get domain breakdown
|
||||||
|
domain_rows = await conn.fetch(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
s.primitive,
|
||||||
|
COUNT(*) as count,
|
||||||
|
ROUND(
|
||||||
|
100.0 * SUM(
|
||||||
|
CASE s.valence WHEN '+' THEN 1 WHEN '-' THEN -1 ELSE 0 END
|
||||||
|
* s.confidence * s.intensity
|
||||||
|
) / NULLIF(SUM(s.confidence * s.intensity), 0),
|
||||||
|
1
|
||||||
|
) as score
|
||||||
|
FROM pipeline.detected_spans_v2 s
|
||||||
|
JOIN pipeline.review_facts_v1 f
|
||||||
|
ON f.review_id = s.review_id AND f.business_id = s.business_id
|
||||||
|
WHERE s.business_id = $1
|
||||||
|
AND f.review_time_utc >= $2
|
||||||
|
AND f.review_time_utc < $3
|
||||||
|
AND s.primitive NOT IN ('UNMAPPED', 'NON_INFORMATIVE')
|
||||||
|
GROUP BY s.primitive
|
||||||
|
ORDER BY count DESC
|
||||||
|
""",
|
||||||
|
business_id,
|
||||||
|
start_date,
|
||||||
|
end_date,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Aggregate by domain
|
||||||
|
domain_scores = {}
|
||||||
|
primitive_scores = {}
|
||||||
|
for r in domain_rows:
|
||||||
|
prim = r["primitive"]
|
||||||
|
domain = DOMAIN_MAP.get(prim, "meta")
|
||||||
|
|
||||||
|
primitive_scores[prim] = {
|
||||||
|
"domain": domain,
|
||||||
|
"score": float(r["score"]) if r["score"] else 0,
|
||||||
|
"volume": r["count"],
|
||||||
|
}
|
||||||
|
|
||||||
|
if domain not in domain_scores:
|
||||||
|
domain_scores[domain] = {"total_score": 0, "total_volume": 0}
|
||||||
|
domain_scores[domain]["total_score"] += (r["score"] or 0) * r["count"]
|
||||||
|
domain_scores[domain]["total_volume"] += r["count"]
|
||||||
|
|
||||||
|
# Calculate domain averages
|
||||||
|
domains = {}
|
||||||
|
for domain, data in domain_scores.items():
|
||||||
|
if data["total_volume"] > 0:
|
||||||
|
domains[domain] = {
|
||||||
|
"name": DOMAIN_NAMES.get(domain, domain),
|
||||||
|
"score": round(data["total_score"] / data["total_volume"], 1),
|
||||||
|
"volume": data["total_volume"],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Get top drivers
|
||||||
|
top_positive = await conn.fetch(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
s.primitive,
|
||||||
|
COUNT(*) as count,
|
||||||
|
ROUND(100.0 * COUNT(*) / (
|
||||||
|
SELECT COUNT(*) FROM pipeline.detected_spans_v2 s2
|
||||||
|
JOIN pipeline.review_facts_v1 f2 ON f2.review_id = s2.review_id AND f2.business_id = s2.business_id
|
||||||
|
WHERE s2.business_id = $1 AND s2.valence = '+'
|
||||||
|
AND f2.review_time_utc >= $2 AND f2.review_time_utc < $3
|
||||||
|
), 1) as impact
|
||||||
|
FROM pipeline.detected_spans_v2 s
|
||||||
|
JOIN pipeline.review_facts_v1 f
|
||||||
|
ON f.review_id = s.review_id AND f.business_id = s.business_id
|
||||||
|
WHERE s.business_id = $1 AND s.valence = '+'
|
||||||
|
AND f.review_time_utc >= $2 AND f.review_time_utc < $3
|
||||||
|
AND s.primitive NOT IN ('UNMAPPED', 'NON_INFORMATIVE')
|
||||||
|
GROUP BY s.primitive
|
||||||
|
ORDER BY count DESC
|
||||||
|
LIMIT 5
|
||||||
|
""",
|
||||||
|
business_id,
|
||||||
|
start_date,
|
||||||
|
end_date,
|
||||||
|
)
|
||||||
|
|
||||||
|
top_negative = await conn.fetch(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
s.primitive,
|
||||||
|
COUNT(*) as count,
|
||||||
|
ROUND(100.0 * COUNT(*) / NULLIF((
|
||||||
|
SELECT COUNT(*) FROM pipeline.detected_spans_v2 s2
|
||||||
|
JOIN pipeline.review_facts_v1 f2 ON f2.review_id = s2.review_id AND f2.business_id = s2.business_id
|
||||||
|
WHERE s2.business_id = $1 AND s2.valence = '-'
|
||||||
|
AND f2.review_time_utc >= $2 AND f2.review_time_utc < $3
|
||||||
|
), 0), 1) as impact
|
||||||
|
FROM pipeline.detected_spans_v2 s
|
||||||
|
JOIN pipeline.review_facts_v1 f
|
||||||
|
ON f.review_id = s.review_id AND f.business_id = s.business_id
|
||||||
|
WHERE s.business_id = $1 AND s.valence = '-'
|
||||||
|
AND f.review_time_utc >= $2 AND f.review_time_utc < $3
|
||||||
|
AND s.primitive NOT IN ('UNMAPPED', 'NON_INFORMATIVE')
|
||||||
|
GROUP BY s.primitive
|
||||||
|
ORDER BY count DESC
|
||||||
|
LIMIT 5
|
||||||
|
""",
|
||||||
|
business_id,
|
||||||
|
start_date,
|
||||||
|
end_date,
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "ok",
|
||||||
|
"business_id": business_id,
|
||||||
|
"window": {
|
||||||
|
"start": start_date.isoformat(),
|
||||||
|
"end": end_date.isoformat(),
|
||||||
|
},
|
||||||
|
"scores": {
|
||||||
|
"overall": float(row["overall_score"]) if row["overall_score"] else 0,
|
||||||
|
"positive_share": float(row["positive_share"]) if row["positive_share"] else 0,
|
||||||
|
"content_spans": row["content_spans"],
|
||||||
|
},
|
||||||
|
"domains": domains,
|
||||||
|
"primitives": primitive_scores,
|
||||||
|
"drivers": {
|
||||||
|
"positives": [
|
||||||
|
{"primitive": r["primitive"], "count": r["count"], "impact": float(r["impact"]) if r["impact"] else 0}
|
||||||
|
for r in top_positive
|
||||||
|
],
|
||||||
|
"negatives": [
|
||||||
|
{"primitive": r["primitive"], "count": r["count"], "impact": float(r["impact"]) if r["impact"] else 0}
|
||||||
|
for r in top_negative
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_dashboard_config(self) -> DashboardConfig:
    """Describe the Reputation dashboard layout.

    Returns:
        A ``DashboardConfig`` with four sections - overview stat cards,
        domain charts, driver charts and a (collapsed) primitives table -
        a default time range of "365d" and a refresh interval of 600.
    """

    def cell(x, w, h):
        # Every widget sits on row 0 of its own section.
        return {"x": x, "y": 0, "w": w, "h": h}

    def driver_chart(widget_id, title, x, color):
        return WidgetConfig(
            id=widget_id,
            type="bar_chart",
            title=title,
            grid=cell(x, 6, 2),
            config={
                "x_axis": {"key": "primitive", "type": "category"},
                "y_axis": {"key": "impact", "label": "Impact %"},
                "series": [{"key": "impact", "name": "Impact", "color": color}],
            },
        )

    # (widget id, title, value key, format, icon, color) - laid out left to right
    stat_specs = (
        ("reputation_score", "Reputation Score", "overall_score", "{value:.0f}", "trending-up", "blue"),
        ("positive_share", "Positive Share", "positive_share", "{value:.1f}%", "thumbs-up", "green"),
        ("content_spans", "Content Spans", "content_spans", "{value:,}", "message-square", "purple"),
        ("unmapped_rate", "Unmapped Rate", "unmapped_rate", "{value:.1f}%", "alert-circle", "orange"),
    )
    overview_widgets = [
        WidgetConfig(
            id=widget_id,
            type="stat_card",
            title=title,
            grid=cell(3 * slot, 3, 1),
            config={
                "value_key": value_key,
                "format": fmt,
                "icon": icon,
                "color": color,
            },
        )
        for slot, (widget_id, title, value_key, fmt, icon, color) in enumerate(stat_specs)
    ]

    domain_widgets = [
        WidgetConfig(
            id="domain_scores",
            type="bar_chart",
            title="Domain Scores",
            grid=cell(0, 6, 2),
            config={
                "x_axis": {"key": "domain", "type": "category"},
                "y_axis": {"key": "score", "label": "Score"},
                "series": [{"key": "score", "name": "Score"}],
            },
        ),
        WidgetConfig(
            id="domain_volume",
            type="pie_chart",
            title="Mentions by Domain",
            grid=cell(6, 6, 2),
            config={
                "value_key": "volume",
                "label_key": "name",
                "show_legend": True,
            },
        ),
    ]

    driver_widgets = [
        driver_chart("positive_drivers", "Top Strengths", 0, "#22c55e"),
        driver_chart("negative_drivers", "Top Weaknesses", 6, "#ef4444"),
    ]

    table_columns = [
        {"key": "primitive", "header": "Primitive", "width": 150},
        {"key": "domain", "header": "Domain", "width": 100},
        {"key": "score", "header": "Score", "width": 80, "align": "right"},
        {"key": "volume", "header": "Mentions", "width": 80, "align": "right"},
    ]
    primitive_widgets = [
        WidgetConfig(
            id="primitives_table",
            type="table",
            title="All Primitives",
            grid=cell(0, 12, 3),
            config={
                "columns": table_columns,
                "row_key": "primitive",
                "page_size": 15,
                "sortable": True,
            },
        ),
    ]

    sections = [
        DashboardSection(
            id="overview",
            title="Reputation Overview",
            description="Overall reputation score and key metrics",
            widgets=overview_widgets,
            collapsed=False,
        ),
        DashboardSection(
            id="domains",
            title="Domain Breakdown",
            description="Performance across experience domains",
            widgets=domain_widgets,
            collapsed=False,
        ),
        DashboardSection(
            id="drivers",
            title="Key Drivers",
            description="Top positive and negative drivers",
            widgets=driver_widgets,
            collapsed=False,
        ),
        DashboardSection(
            id="primitives",
            title="Primitive Analysis",
            description="Detailed breakdown by primitive",
            widgets=primitive_widgets,
            collapsed=True,
        ),
    ]

    return DashboardConfig(
        pipeline_id="reputation",
        title="Reputation Analytics",
        description="Primitives-based reputation scoring and business insights",
        sections=sections,
        default_time_range="365d",
        refresh_interval=600,
    )
|
||||||
|
|
||||||
|
async def get_widget_data(
|
||||||
|
self,
|
||||||
|
widget_id: str,
|
||||||
|
params: dict[str, Any],
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Get data for a specific dashboard widget."""
|
||||||
|
await self.initialize()
|
||||||
|
|
||||||
|
business_id = params.get("business_id")
|
||||||
|
if not business_id:
|
||||||
|
return {"error": "business_id required"}
|
||||||
|
|
||||||
|
days = 365
|
||||||
|
time_range = params.get("time_range", "365d")
|
||||||
|
if time_range.endswith("d"):
|
||||||
|
days = int(time_range[:-1])
|
||||||
|
|
||||||
|
end_date = datetime.utcnow()
|
||||||
|
start_date = end_date - timedelta(days=days)
|
||||||
|
|
||||||
|
# Get classification check data
|
||||||
|
classify_data = await self._check_classification(business_id, start_date, end_date)
|
||||||
|
|
||||||
|
# Get report data
|
||||||
|
report_data = await self._generate_report(business_id, start_date, end_date)
|
||||||
|
|
||||||
|
match widget_id:
|
||||||
|
# Overview stats
|
||||||
|
case "reputation_score":
|
||||||
|
return {"overall_score": report_data.get("scores", {}).get("overall", 0)}
|
||||||
|
case "positive_share":
|
||||||
|
return {"positive_share": report_data.get("scores", {}).get("positive_share", 0)}
|
||||||
|
case "content_spans":
|
||||||
|
return {"content_spans": report_data.get("scores", {}).get("content_spans", 0)}
|
||||||
|
case "unmapped_rate":
|
||||||
|
return {"unmapped_rate": classify_data.get("unmapped_rate", 0)}
|
||||||
|
|
||||||
|
# Domain charts
|
||||||
|
case "domain_scores":
|
||||||
|
domains = report_data.get("domains", {})
|
||||||
|
return {"data": [{"domain": k, **v} for k, v in domains.items()]}
|
||||||
|
case "domain_volume":
|
||||||
|
domains = report_data.get("domains", {})
|
||||||
|
return {"data": [{"name": v["name"], "volume": v["volume"]} for v in domains.values()]}
|
||||||
|
|
||||||
|
# Driver charts
|
||||||
|
case "positive_drivers":
|
||||||
|
return {"data": report_data.get("drivers", {}).get("positives", [])}
|
||||||
|
case "negative_drivers":
|
||||||
|
return {"data": report_data.get("drivers", {}).get("negatives", [])}
|
||||||
|
|
||||||
|
# Primitives table
|
||||||
|
case "primitives_table":
|
||||||
|
primitives = report_data.get("primitives", {})
|
||||||
|
return {
|
||||||
|
"data": [
|
||||||
|
{"primitive": k, **v}
|
||||||
|
for k, v in primitives.items()
|
||||||
|
],
|
||||||
|
"total": len(primitives),
|
||||||
|
}
|
||||||
|
|
||||||
|
case _:
|
||||||
|
logger.warning(f"Unknown widget: {widget_id}")
|
||||||
|
return {"error": f"Unknown widget: {widget_id}"}
|
||||||
|
|
||||||
|
async def health_check(self) -> dict[str, Any]:
    """
    Check pipeline health.

    Calls ``self.initialize()`` and then runs two probes through
    ``self._db.pool``: a ``SELECT 1`` round-trip, and a one-row read from
    ``pipeline.detected_spans_v2`` to confirm the table can be queried.

    Returns:
        Dict with two keys:
          - ``healthy``: False if any probe raised or ``self._db`` is not set.
          - ``checks``: probe name -> "ok", "not_initialized", or the text of
            the exception that the probe raised. The ``spans_table`` entry is
            only present when ``self._db`` is set.
    """
    await self.initialize()

    checks: dict[str, str] = {}
    healthy = True

    # Probe 1: database connection
    try:
        if self._db:
            async with self._db.pool.acquire() as conn:
                await conn.fetchval("SELECT 1")
            checks["database"] = "ok"
        else:
            checks["database"] = "not_initialized"
            healthy = False
    except Exception as e:
        checks["database"] = str(e)
        healthy = False

    # Probe 2: spans table exists and is readable
    try:
        if self._db:
            async with self._db.pool.acquire() as conn:
                # Existence probe only. COUNT(*) would aggregate the whole
                # table before LIMIT applies; reading at most one row fails
                # the same way when the table is missing but stays cheap.
                await conn.fetchval(
                    "SELECT 1 FROM pipeline.detected_spans_v2 LIMIT 1"
                )
            checks["spans_table"] = "ok"
    except Exception as e:
        checks["spans_table"] = str(e)
        healthy = False

    return {
        "healthy": healthy,
        "checks": checks,
    }
|
||||||
@@ -2,10 +2,22 @@
|
|||||||
|
|
||||||
from reviewiq_pipeline.services.embeddings import EmbeddingService
|
from reviewiq_pipeline.services.embeddings import EmbeddingService
|
||||||
from reviewiq_pipeline.services.llm_client import LLMClient
|
from reviewiq_pipeline.services.llm_client import LLMClient
|
||||||
|
from reviewiq_pipeline.services.review_router import (
|
||||||
|
ReviewRouter,
|
||||||
|
RouterConfig,
|
||||||
|
RoutingDecision,
|
||||||
|
RoutingTier,
|
||||||
|
create_router,
|
||||||
|
)
|
||||||
from reviewiq_pipeline.services.text_processor import TextProcessor
|
from reviewiq_pipeline.services.text_processor import TextProcessor
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"LLMClient",
|
"LLMClient",
|
||||||
"EmbeddingService",
|
"EmbeddingService",
|
||||||
"TextProcessor",
|
"TextProcessor",
|
||||||
|
"ReviewRouter",
|
||||||
|
"RouterConfig",
|
||||||
|
"RoutingDecision",
|
||||||
|
"RoutingTier",
|
||||||
|
"create_router",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,392 @@
|
|||||||
|
"""
|
||||||
|
Category Resolver Service
|
||||||
|
|
||||||
|
Resolves business categories to the deepest node in the GBP taxonomy.
|
||||||
|
Uses a multi-phase approach:
|
||||||
|
1. Exact match from Google's category
|
||||||
|
2. LLM matching when no exact match
|
||||||
|
3. Hierarchical LLM classification when no Google category
|
||||||
|
|
||||||
|
This is critical for the classification pipeline as it provides context
|
||||||
|
for understanding and categorizing reviews.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from .llm_client import LLMClient
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ResolvedCategory:
    """
    Result of category resolution.

    Attributes:
        category_id: ``id`` of the matched taxonomy row.
        path: ltree path of the node, carried as a plain string.
        name: Name of the matched taxonomy node.
        level: Taxonomy level of the matched node.
        method: How the node was found: 'exact', 'llm' or 'hierarchical'.
        confidence: Score between 0.0 and 1.0 assigned by the resolver.
    """

    category_id: int
    path: str
    name: str
    level: int
    method: str
    confidence: float
|
||||||
|
|
||||||
|
|
||||||
|
class CategoryResolver:
    """
    Resolves business categories to GBP taxonomy nodes.

    Resolution runs in up to three phases and stops at the first hit:
      1. name match of the Google category against level-3 rows,
      2. LLM pick among level-3 candidates sharing a word with the category,
      3. LLM walk down the tree (level 1 -> 2 -> 3) from the business name.

    Phases 2 and 3 are skipped when no LLM client was supplied.

    Usage:
        resolver = CategoryResolver(db_pool, llm_client)

        # With Google category
        result = await resolver.resolve("Toy store")
        # -> ResolvedCategory(path="Retail.Stores.Toy_store", method="exact")

        # Without Google category (infer from name)
        result = await resolver.resolve(None, business_name="Pura Vida Hostel")
        # -> ResolvedCategory(path="Travel_Hospitality.Hotels.Hostel", method="hierarchical")
    """

    def __init__(self, pool: asyncpg.Pool, llm_client: Optional[LLMClient] = None):
        self.pool = pool
        self.llm = llm_client
        # Per-instance caches of taxonomy rows. Level 1 is a single list;
        # levels 2 and 3 are keyed by the parent path they were fetched for.
        self._level1_cache: list[dict] = []
        self._level2_cache: dict[str, list[dict]] = {}
        self._level3_cache: dict[str, list[dict]] = {}

    @staticmethod
    def _escape_like(term: str) -> str:
        """Escape LIKE metacharacters so ``term`` is matched literally."""
        # Backslash is PostgreSQL's default LIKE escape character, so it must
        # be doubled first, before it is used to escape % and _.
        return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    async def resolve(
        self,
        google_category: Optional[str] = None,
        business_name: Optional[str] = None,
        business_address: Optional[str] = None
    ) -> Optional[ResolvedCategory]:
        """
        Resolve to the deepest taxonomy node.

        Args:
            google_category: Category from Google Maps (e.g., "Toy store")
            business_name: Business name for inference if no Google category
            business_address: Address for additional context

        Returns:
            ResolvedCategory or None if resolution failed
        """
        if google_category:
            # Phase 1: name match against the taxonomy
            result = await self._exact_match(google_category)
            if result:
                log.info(f"Exact match: '{google_category}' -> {result.path}")
                return result

            # Phase 2: let the LLM choose among similar-looking categories
            if self.llm:
                result = await self._llm_match(google_category)
                if result:
                    log.info(f"LLM match: '{google_category}' -> {result.path}")
                    return result

        # Phase 3: hierarchical classification from the business name
        if business_name and self.llm:
            result = await self._hierarchical_classify(
                business_name=business_name,
                business_address=business_address,
                google_category=google_category  # May be None or unmatched
            )
            if result:
                log.info(f"Hierarchical: '{business_name}' -> {result.path}")
                return result

        log.warning(f"Could not resolve category for: {google_category or business_name}")
        return None

    async def _exact_match(self, google_category: str) -> Optional[ResolvedCategory]:
        """
        Match the Google category against level-3 taxonomy names.

        Tries a case-insensitive equality match first (confidence 1.0), then
        a case-insensitive substring match preferring the shortest name
        (confidence 0.9). Both are reported with method 'exact'.
        """
        async with self.pool.acquire() as conn:
            confidence = 1.0
            row = await conn.fetchrow("""
                SELECT id, name, path::text as path, level
                FROM gbp_categories
                WHERE LOWER(name) = LOWER($1) AND level = 3
            """, google_category)

            if not row:
                # Substring fallback; the category text is escaped so that a
                # literal % or _ in it is not treated as a wildcard.
                confidence = 0.9
                row = await conn.fetchrow("""
                    SELECT id, name, path::text as path, level
                    FROM gbp_categories
                    WHERE LOWER(name) LIKE LOWER($1) AND level = 3
                    ORDER BY length(name) ASC
                    LIMIT 1
                """, f"%{self._escape_like(google_category)}%")

        if not row:
            return None

        return ResolvedCategory(
            category_id=row['id'],
            path=row['path'],
            name=row['name'],
            level=row['level'],
            method='exact',
            confidence=confidence
        )

    async def _llm_match(self, google_category: str) -> Optional[ResolvedCategory]:
        """
        Use LLM to match Google category to taxonomy.

        Candidates are up to 20 level-3 rows whose name contains any word
        (longer than two characters) of the Google category; if none share a
        word, a random sample of 50 level-3 rows is offered instead. The LLM
        must answer with one candidate name, or "NONE".

        Returns:
            ResolvedCategory with method 'llm' and confidence 0.8, or None
            when there are no usable words, no candidates, the LLM answers
            "NONE", or its answer is not one of the candidate names.
        """
        words = [w for w in google_category.lower().split() if len(w) > 2]
        if not words:
            return None

        # The words come from scraped text, so they are passed as a bound
        # array parameter rather than spliced into the SQL. Interpolating
        # them broke on any apostrophe ("children's") and allowed injection.
        patterns = [f"%{self._escape_like(w)}%" for w in words]

        async with self.pool.acquire() as conn:
            candidates = await conn.fetch("""
                SELECT id, name, path::text as path, level
                FROM gbp_categories
                WHERE LOWER(name) LIKE ANY($1::text[]) AND level = 3
                ORDER BY name
                LIMIT 20
            """, patterns)

            if not candidates:
                # Get random sample for LLM to choose from
                candidates = await conn.fetch("""
                    SELECT id, name, path::text as path, level
                    FROM gbp_categories
                    WHERE level = 3
                    ORDER BY RANDOM()
                    LIMIT 50
                """)

        if not candidates:
            return None

        # Ask LLM to pick best match (the connection is already released)
        candidate_list = "\n".join([f"- {c['name']} ({c['path']})" for c in candidates])

        prompt = f"""Given the Google Maps business category "{google_category}", select the BEST matching category from this taxonomy list.

Candidates:
{candidate_list}

Respond with ONLY the exact category name from the list, nothing else.
If none match well, respond with "NONE"."""

        response = await self.llm.complete(prompt, max_tokens=50)
        selected_name = response.strip().strip('"').strip("'")

        if selected_name == "NONE":
            return None

        # Find the selected category
        wanted = selected_name.lower()
        for c in candidates:
            if c['name'].lower() == wanted:
                return ResolvedCategory(
                    category_id=c['id'],
                    path=c['path'],
                    name=c['name'],
                    level=c['level'],
                    method='llm',
                    confidence=0.8
                )

        return None

    async def _hierarchical_classify(
        self,
        business_name: str,
        business_address: Optional[str] = None,
        google_category: Optional[str] = None
    ) -> Optional[ResolvedCategory]:
        """
        Walk down the taxonomy tree using LLM at each level.

        Level 1 (16 sectors) -> Level 2 (91 types) -> Level 3 (4034 categories)

        Returns:
            ResolvedCategory with method 'hierarchical' and confidence 0.7,
            or None as soon as any level yields no selection.
        """
        context = f"Business: {business_name}"
        if business_address:
            context += f"\nAddress: {business_address}"
        if google_category:
            context += f"\nGoogle category hint: {google_category}"

        # Level 1: Select sector
        level1_categories = await self._get_level_categories(1)
        sector = await self._llm_select_category(
            context=context,
            categories=level1_categories,
            level_name="sector"
        )
        if not sector:
            return None

        # Level 2: Select business type within sector
        level2_categories = await self._get_level_categories(2, parent_path=sector['path'])
        business_type = await self._llm_select_category(
            context=context,
            categories=level2_categories,
            level_name="business type",
            parent=sector['name']
        )
        if not business_type:
            return None

        # Level 3: Select specific category
        level3_categories = await self._get_level_categories(3, parent_path=business_type['path'])
        specific = await self._llm_select_category(
            context=context,
            categories=level3_categories,
            level_name="specific category",
            parent=business_type['name']
        )
        if not specific:
            return None

        return ResolvedCategory(
            category_id=specific['id'],
            path=specific['path'],
            name=specific['name'],
            level=specific['level'],
            method='hierarchical',
            confidence=0.7
        )

    async def _get_level_categories(
        self,
        level: int,
        parent_path: Optional[str] = None
    ) -> list[dict]:
        """
        Get categories at a specific level, optionally filtered by parent.

        Results are cached on the instance: level 1 as a single list, levels
        2 and 3 per ``parent_path``. The cached list object itself is
        returned, so callers should not mutate it.
        """
        # Check cache
        if level == 1 and self._level1_cache:
            return self._level1_cache
        if level == 2 and parent_path in self._level2_cache:
            return self._level2_cache[parent_path]
        if level == 3 and parent_path in self._level3_cache:
            return self._level3_cache[parent_path]

        async with self.pool.acquire() as conn:
            if parent_path:
                # `<@` keeps only rows whose path is a descendant of parent_path
                rows = await conn.fetch("""
                    SELECT id, name, path::text as path, level
                    FROM gbp_categories
                    WHERE level = $1 AND path <@ $2::ltree
                    ORDER BY name
                """, level, parent_path)
            else:
                rows = await conn.fetch("""
                    SELECT id, name, path::text as path, level
                    FROM gbp_categories
                    WHERE level = $1
                    ORDER BY name
                """, level)

        result = [dict(r) for r in rows]

        # Cache results
        if level == 1:
            self._level1_cache = result
        elif level == 2 and parent_path:
            self._level2_cache[parent_path] = result
        elif level == 3 and parent_path:
            self._level3_cache[parent_path] = result

        return result

    async def _llm_select_category(
        self,
        context: str,
        categories: list[dict],
        level_name: str,
        parent: Optional[str] = None
    ) -> Optional[dict]:
        """
        Ask LLM to select best category from list.

        Returns None for an empty list and the sole entry for a one-element
        list without calling the LLM. Otherwise the LLM answer is matched
        case-insensitively by equality, then by substring in either
        direction; if nothing matches, the first option is returned and a
        warning is logged.
        """
        if not categories:
            return None

        # If only one option, return it
        if len(categories) == 1:
            return categories[0]

        category_list = "\n".join([f"- {c['name']}" for c in categories])
        parent_context = f" within {parent}" if parent else ""

        prompt = f"""{context}

Select the most appropriate {level_name}{parent_context} for this business.

Options:
{category_list}

Respond with ONLY the exact category name from the list, nothing else."""

        response = await self.llm.complete(prompt, max_tokens=50)
        selected_name = response.strip().strip('"').strip("'")
        wanted = selected_name.lower()

        # Find the selected category
        for c in categories:
            if c['name'].lower() == wanted:
                return c

        # Fuzzy match if exact not found. An empty answer is a substring of
        # every name, so it is excluded here and falls through to the logged
        # fallback instead of silently "matching" the first option.
        if wanted:
            for c in categories:
                name = c['name'].lower()
                if wanted in name or name in wanted:
                    return c

        # Return first as fallback
        log.warning(f"LLM selected '{selected_name}' not in list, using first option")
        return categories[0]
|
||||||
|
|
||||||
|
|
||||||
|
async def resolve_job_category(
    pool: asyncpg.Pool,
    llm_client: LLMClient,
    job_id: str,
    google_category: Optional[str],
    business_name: Optional[str],
    business_address: Optional[str] = None
) -> Optional[ResolvedCategory]:
    """
    Resolve the taxonomy category for a job and store it on the job row.

    Main entry point for pre-flight category resolution. A fresh
    CategoryResolver is built for each call; when it finds a category, the
    ``jobs`` row identified by ``job_id`` is updated with the category id,
    path and resolution method. Nothing is written when resolution fails.

    Returns:
        The ResolvedCategory that was saved, or None if nothing resolved.
    """
    resolved = await CategoryResolver(pool, llm_client).resolve(
        google_category=google_category,
        business_name=business_name,
        business_address=business_address
    )

    if not resolved:
        return None

    # Persist the outcome on the job
    async with pool.acquire() as conn:
        await conn.execute("""
            UPDATE jobs
            SET gbp_category_id = $2,
                gbp_category_path = $3::ltree,
                category_resolution_method = $4,
                updated_at = NOW()
            WHERE job_id = $1::uuid
        """, job_id, resolved.category_id, resolved.path, resolved.method)

    log.info(f"Job {job_id}: resolved category to {resolved.path} ({resolved.method})")

    return resolved
|
||||||
@@ -0,0 +1,210 @@
|
|||||||
|
"""
|
||||||
|
Classification validator for post-LLM validation.
|
||||||
|
|
||||||
|
Catches common misclassification patterns based on keyword detection
|
||||||
|
and suggests corrections before persisting to database.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Keyword patterns used by the post-LLM validator. Each list holds regex
# source strings that are applied with re.IGNORECASE to the span text.

# Price/money indicators → should be V codes
PRICE_PATTERNS = [
    # Currency symbols are non-word characters, so a \b on the symbol side
    # only matches when a letter or digit touches the symbol. That made bare
    # amounts such as "50€" or "€50" undetectable; the boundary is kept on
    # the digit side only.
    r'\b\d+\s*[€$£]',     # "50€", "100$"
    r'[€$£]\s*\d+\b',     # "€50", "$100"
    r'\beur(o|os)?\b',
    r'\bprice[sd]?\b',
    r'\bcost[s]?\b',
    r'\bfee[s]?\b',
    r'\bcharge[sd]?\b',
    r'\bdeposit[s]?\b',
    r'\brefund[s]?\b',
    r'\bcheap\b',
    r'\bexpensive\b',
    r'\baffordable\b',
    r'\bpreis\b',         # German
    r'\bprecio[s]?\b',    # Spanish
    r'\bgünstig\b',       # German "cheap"
    r'\bteuer\b',         # German "expensive"
    r'\bbarato\b',        # Spanish "cheap"
    r'\bcaro\b',          # Spanish "expensive"
]

# Staff behavior indicators → should be P codes
STAFF_PATTERNS = [
    r'\bfriendly\b',
    r'\brude\b',
    r'\bhelpful\b',
    r'\bpatient\b',
    r'\bimpatient\b',
    r'\bwelcoming\b',
    r'\battentive\b',
    r'\bprofessional\b',
    r'\bunprofessional\b',
    r'\bamable\b',        # Spanish "friendly"
    r'\bsimpático\b',     # Spanish
    r'\bmuy amable\b',
    r'\bnett\b',          # German "nice"
    r'\bfreundlich\b',    # German "friendly"
    r'\bunfreundlich\b',  # German "unfriendly"
    r'\bgentil\b',        # French/Spanish
]

# Scam/ethics indicators → should be R codes
ETHICS_PATTERNS = [
    r'\bscam\b',
    r'\bfraud\b',
    r'\bcheat\b',
    r'\bdishonest\b',
    r'\blied\b',
    r'\blie[s]?\b',
    r'\bscammer[s]?\b',
    r'\bsteal\b',
    r'\bstole\b',
    r'\brobber[y]?\b',
    r'\bestafa\b',        # Spanish "scam"
    r'\btramp[a]?\b',     # Spanish "trap/trick"
    r'\bengaño\b',        # Spanish "deception"
    r'\bAbzocker\b',      # German "rip-off"
    r'\bBetrug\b',        # German "fraud"
    r'\barnaque\b',       # French "scam"
    r'\bvoleur[s]?\b',    # French "thief"
]

# Wayfinding indicators → should be A1.04
WAYFINDING_PATTERNS = [
    r"\bcouldn'?t find\b",
    r'\bhard to find\b',
    r'\bdifficult to find\b',
    r'\bconfusing\b.*\b(direction|location|shuttle)\b',
    r'\blost\b',
    r'\bno signs?\b',
    r'\bno señal\b',        # Spanish
    r'\bkeine Schilder\b',  # German
]
|
||||||
|
|
||||||
|
|
||||||
|
def _first_matching_pattern(patterns: list[str], text: str) -> str | None:
    """Return the first regex in ``patterns`` that matches ``text``, else None."""
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return pattern
    return None


def _mentions_any(keywords: list[str], text: str) -> bool:
    """Return True if any regex in ``keywords`` matches ``text``."""
    return any(re.search(k, text, re.I) for k in keywords)


def _correction(suggested: str, reason: str, pattern: str) -> dict[str, Any]:
    """Build the correction dict returned to callers."""
    return {
        'suggested_urt': suggested,
        'reason': reason,
        'pattern': pattern,
    }


def validate_classification(
    span_text: str,
    urt_code: str,
    valence: str,
) -> dict[str, Any] | None:
    """
    Validate a classification and suggest correction if needed.

    Four keyword rules are checked in order (price, staff behavior, ethics,
    wayfinding); the first one that fires decides the result. A rule is
    skipped when the span already carries a code from the domain the rule
    would suggest.

    Args:
        span_text: The span text
        urt_code: The assigned URT code; its first character is the domain.
            An empty or missing code is treated as "no domain", so every
            rule is eligible.
        valence: The assigned valence (accepted for interface compatibility;
            not consulted by any rule)

    Returns:
        Correction dict with 'suggested_urt', 'reason' and 'pattern' (the
        regex that triggered the rule) if misclassified, None if OK
    """
    text_lower = (span_text or "").lower()
    # Slice instead of indexing so an empty code does not raise IndexError.
    domain = (urt_code or "")[:1]

    # Rule 1: Price mentions should be V codes
    if domain != 'V':
        pattern = _first_matching_pattern(PRICE_PATTERNS, text_lower)
        if pattern is not None:
            # Determine which V code
            if _mentions_any([r'hidden', r'extra', r'surprise', r'unexpected', r'trampa'], text_lower):
                suggested = 'V1.03'
            elif _mentions_any([r'overcharge', r'wrong.*charge', r'billing'], text_lower):
                suggested = 'V4.04'
            else:
                suggested = 'V1.01'

            logger.debug(f"Validation: {urt_code} → {suggested} (price mention)")
            return _correction(suggested, 'price_mention', pattern)

    # Rule 2: Staff behavior should be P codes
    if domain != 'P':
        pattern = _first_matching_pattern(STAFF_PATTERNS, text_lower)
        if pattern is not None:
            # Determine which P code
            if _mentions_any([r'rude', r'unfriendly', r'disrespect', r'unfreundlich'], text_lower):
                suggested = 'P1.02'
            elif _mentions_any([r'impatient', r'rushed'], text_lower):
                suggested = 'P1.03'
            else:
                suggested = 'P1.01'

            logger.debug(f"Validation: {urt_code} → {suggested} (staff behavior)")
            return _correction(suggested, 'staff_behavior', pattern)

    # Rule 3: Scam/ethics should be R codes
    if domain != 'R':
        pattern = _first_matching_pattern(ETHICS_PATTERNS, text_lower)
        if pattern is not None:
            if _mentions_any([r'scam', r'fraud', r'cheat', r'estafa', r'Betrug', r'arnaque'], text_lower):
                suggested = 'R1.02'
            else:
                suggested = 'R1.01'

            logger.debug(f"Validation: {urt_code} → {suggested} (ethics issue)")
            return _correction(suggested, 'ethics_issue', pattern)

    # Rule 4: Wayfinding should be A1.04
    if urt_code not in ('A1.04', 'A4.01'):
        pattern = _first_matching_pattern(WAYFINDING_PATTERNS, text_lower)
        if pattern is not None:
            logger.debug(f"Validation: {urt_code} → A1.04 (wayfinding)")
            return _correction('A1.04', 'wayfinding', pattern)

    return None  # Classification looks OK
|
||||||
|
|
||||||
|
|
||||||
|
def validate_and_fix_spans(spans: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """
    Validate and fix a list of spans.

    Each span is passed through validate_classification. When a correction
    is suggested, the span dict is modified in place: 'urt_primary' is
    replaced by the suggested code and a '_validation_correction' entry
    records the previous code and the reason.

    Args:
        spans: List of span dicts with 'span_text', 'urt_primary', 'valence'.
            Missing or None values fall back to '', 'O1.01' and 'V0' for the
            purpose of validation.

    Returns:
        The same list object, with corrections applied to its dicts
    """
    corrections_made = 0

    for span in spans:
        # `or` also covers keys that are present but None, which would
        # otherwise reach .lower() / indexing inside the validator.
        correction = validate_classification(
            span.get('span_text') or '',
            span.get('urt_primary') or 'O1.01',
            span.get('valence') or 'V0',
        )

        if correction:
            # Read with .get(): the span may have had no 'urt_primary' at all
            # (the validator ran on the default), in which case the recorded
            # original is None rather than a KeyError.
            original = span.get('urt_primary')
            span['urt_primary'] = correction['suggested_urt']
            span['_validation_correction'] = {
                'original': original,
                'reason': correction['reason'],
            }
            corrections_made += 1

    if corrections_made:
        logger.info(f"Validation corrected {corrections_made} spans")

    return spans
|
||||||
@@ -0,0 +1,262 @@
|
|||||||
|
"""
|
||||||
|
Config Resolver - Resolves L1 config + sector brief for classification.
|
||||||
|
|
||||||
|
Builds a single JSON payload per business containing:
|
||||||
|
- Enabled primitives (L1 + always-on meta)
|
||||||
|
- Weights
|
||||||
|
- Sector brief (language/signals)
|
||||||
|
- Minimal primitive dictionary
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Data locations, all relative to a "data" directory four levels above this file
DATA_DIR = Path(__file__).parent.parent.parent.parent / "data"
CONFIGS_DIR = DATA_DIR.joinpath("primitive_configs", "l1")
BRIEFS_DIR = DATA_DIR.joinpath("sector_briefs")
PRIMITIVES_FILE = DATA_DIR.joinpath("primitives.json")

# Meta primitives - always enabled, never weighted
META_PRIMITIVES = frozenset({
    # Trust
    "HONESTY", "ETHICS", "PROMISES",
    # Resolution
    "ACKNOWLEDGMENT", "RESPONSE_QUALITY", "RECOVERY",
    # Loyalty
    "RETURN_INTENT", "RECOMMEND", "RECOGNITION",
    # Escape
    "UNMAPPED",
})

# Core primitives, written as (key, display name, definition) rows grouped by
# domain letter. There are 26 rows here; presumably the "frozen 36" this
# dictionary was originally labelled with also counts the 10 meta primitives
# above - TODO confirm.
_CORE_PRIMITIVE_ROWS = {
    # Quality (8)
    "O": [
        ("TASTE", "Taste/Flavor", "Sensory quality of food/beverage"),
        ("CRAFT", "Craftsmanship", "Skill of execution/preparation"),
        ("FRESHNESS", "Freshness", "Newness, not stale or old"),
        ("TEMPERATURE", "Temperature", "Hot/cold as expected"),
        ("EFFECTIVENESS", "Effectiveness", "Achieves intended purpose"),
        ("ACCURACY", "Accuracy", "Correct, as ordered/specified"),
        ("CONDITION", "Condition", "Physical state, wear, damage"),
        ("CONSISTENCY", "Consistency", "Same quality each time"),
    ],
    # Service (4)
    "P": [
        ("MANNER", "Manner/Attitude", "Friendliness, respect, warmth"),
        ("COMPETENCE", "Competence", "Knowledge and skill of staff"),
        ("ATTENTIVENESS", "Attentiveness", "Being present, responsive"),
        ("COMMUNICATION", "Communication", "Clarity, listening, updates"),
    ],
    # Process (4)
    "J": [
        ("SPEED", "Speed/Wait", "Time to service, waiting"),
        ("FRICTION", "Friction", "Obstacles, hassles, complexity"),
        ("RELIABILITY", "Reliability", "Dependable, keeps promises"),
        ("AVAILABILITY", "Availability", "Open when needed, bookable"),
    ],
    # Environment (6)
    "E": [
        ("CLEANLINESS", "Cleanliness", "Hygiene, tidiness"),
        ("COMFORT", "Comfort", "Physical ease, seating"),
        ("SAFETY", "Safety", "Free from harm/danger"),
        ("AMBIANCE", "Ambiance", "Atmosphere, mood, vibe"),
        ("ACCESSIBILITY", "Accessibility", "Easy to reach, navigate"),
        ("DIGITAL_UX", "Digital Experience", "Website, app, online"),
    ],
    # Value (4)
    "V": [
        ("PRICE_LEVEL", "Price Level", "Absolute cost (cheap/expensive)"),
        ("PRICE_FAIRNESS", "Price Fairness", "Reasonable for what you get"),
        ("PRICE_TRANSPARENCY", "Price Transparency", "No hidden fees, clear pricing"),
        ("VALUE_FOR_MONEY", "Value for Money", "Worth what you paid"),
    ],
}

# Flattened lookup: primitive key -> {"domain", "name", "def"}
CORE_PRIMITIVES = {
    key: {"domain": domain, "name": label, "def": definition}
    for domain, rows in _CORE_PRIMITIVE_ROWS.items()
    for key, label, definition in rows
}
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigResolver:
|
||||||
|
"""
|
||||||
|
Resolves classification config for a business.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
resolver = ConfigResolver()
|
||||||
|
payload = await resolver.resolve("Go Karts Mar Menor", pool)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._l1_cache: dict[str, dict] = {}
|
||||||
|
self._brief_cache: dict[str, dict] = {}
|
||||||
|
|
||||||
|
def _load_l1_config(self, sector_code: str) -> dict[str, Any] | None:
|
||||||
|
"""Load L1 config from file."""
|
||||||
|
if sector_code in self._l1_cache:
|
||||||
|
return self._l1_cache[sector_code]
|
||||||
|
|
||||||
|
config_path = CONFIGS_DIR / f"{sector_code.lower()}_config.json"
|
||||||
|
if not config_path.exists():
|
||||||
|
logger.warning(f"No L1 config for sector {sector_code}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
with open(config_path) as f:
|
||||||
|
config = json.load(f)
|
||||||
|
|
||||||
|
self._l1_cache[sector_code] = config
|
||||||
|
return config
|
||||||
|
|
||||||
|
def _load_sector_brief(self, sector_code: str) -> dict[str, Any] | None:
|
||||||
|
"""Load sector brief from file."""
|
||||||
|
if sector_code in self._brief_cache:
|
||||||
|
return self._brief_cache[sector_code]
|
||||||
|
|
||||||
|
brief_path = BRIEFS_DIR / f"{sector_code.lower()}_brief.json"
|
||||||
|
if not brief_path.exists():
|
||||||
|
logger.warning(f"No sector brief for {sector_code}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
with open(brief_path) as f:
|
||||||
|
brief = json.load(f)
|
||||||
|
|
||||||
|
self._brief_cache[sector_code] = brief
|
||||||
|
return brief
|
||||||
|
|
||||||
|
async def get_business_mapping(
    self,
    pool: asyncpg.Pool,
    business_id: str,
) -> dict[str, Any] | None:
    """
    Look up the business → taxonomy mapping row in the database.

    Args:
        pool: Connection pool the query is issued on.
        business_id: Business identifier to look up.

    Returns:
        Dict with 'business_id', 'gbp_path' and 'sector_code', or None when
        no row exists for the business.
    """
    record = await pool.fetchrow(
        """
        SELECT business_id, gbp_path::text, sector_code
        FROM pipeline.business_taxonomy_map
        WHERE business_id = $1
        """,
        business_id,
    )
    return dict(record) if record else None
|
||||||
|
|
||||||
|
def resolve_enabled_set(self, l1_config: dict) -> set[str]:
    """
    Compute the final set of enabled primitives.

    The result is the union of the config's "enabled" list (empty when the
    key is absent) and META_PRIMITIVES, which are always on.
    """
    configured = l1_config.get("enabled", [])
    return {*configured, *META_PRIMITIVES}
|
||||||
|
|
||||||
|
def resolve_weights(self, l1_config: dict) -> dict[str, float]:
    """Return a new dict holding the config's "weights" (empty if absent)."""
    configured = l1_config.get("weights", {})
    # Copy so that callers cannot modify the (possibly cached) config.
    return dict(configured)
|
||||||
|
|
||||||
|
def build_primitives_for_prompt(
    self,
    enabled: set[str],
    weights: dict[str, float],
) -> dict[str, dict]:
    """
    Build minimal primitives dict for prompt.

    Only includes enabled primitives with their definitions:
      - a primitive found in CORE_PRIMITIVES gets a copy of its entry, plus
        a "weight" key when ``weights`` has one for it;
      - a primitive found only in META_PRIMITIVES gets a minimal entry with
        domain "M", a title-cased name and ``meta: True``;
      - anything else is left out.

    Args:
        enabled: Primitive keys to include.
        weights: Primitive key -> weight.

    Returns:
        Primitive key -> entry, with keys in sorted order.
    """
    result = {}
    # Sets iterate in an order that changes between interpreter runs (string
    # hashing is randomized), so iterate in sorted order to make the payload
    # reproducible for identical input.
    for prim in sorted(enabled):
        if prim in CORE_PRIMITIVES:
            # Copy so the weight is not written into the shared definition
            entry = CORE_PRIMITIVES[prim].copy()
            if prim in weights:
                entry["weight"] = weights[prim]
            result[prim] = entry
        elif prim in META_PRIMITIVES:
            # Meta primitives - minimal entry
            result[prim] = {"domain": "M", "name": prim.replace("_", " ").title(), "meta": True}
    return result
|
||||||
|
|
||||||
|
def extract_brief_signals(self, brief: dict) -> dict[str, Any]:
    """
    Pick the prompt-relevant fields out of a sector brief.

    Only four fields are carried over, to keep the prompt context small.
    A field missing from the brief comes back as None; a falsy brief (None
    or empty) yields an empty dict.
    """
    if not brief:
        return {}

    # (output key, key in the brief)
    wanted = (
        ("sector", "sector_code"),
        ("what_customers_judge", "what_customers_judge"),
        ("critical_pain_points", "critical_pain_points"),
        ("industry_terminology", "industry_terminology"),
    )
    return {target: brief.get(source) for target, source in wanted}
|
||||||
|
|
||||||
|
async def resolve(
    self,
    business_id: str,
    pool: asyncpg.Pool,
    mode: str | None = None,
) -> dict[str, Any] | None:
    """
    Assemble the classification payload for one business.

    Args:
        business_id: Business identifier
        pool: Database connection pool, passed to get_business_mapping
        mode: Optional service mode (e.g., "dine_in", "delivery"); the
            payload falls back to "in_person" when it is not given

    Returns:
        The payload dict, or None when the business has no mapping
    """
    # Without a mapping there is no sector to resolve against
    mapping = await self.get_business_mapping(pool, business_id)
    if not mapping:
        logger.warning(f"Business not mapped: {business_id}")
        return None

    sector_code = mapping["sector_code"]
    gbp_path = mapping["gbp_path"]

    # Sector (L1) config; when none is found every core primitive is enabled
    # and no weights are applied
    l1_config = self._load_l1_config(sector_code)
    if not l1_config:
        logger.warning(f"No L1 config for {sector_code}, using defaults")
        l1_config = {"enabled": list(CORE_PRIMITIVES.keys()), "weights": {}}

    sector_brief = self._load_sector_brief(sector_code)

    active_codes = self.resolve_enabled_set(l1_config)
    code_weights = self.resolve_weights(l1_config)
    prompt_primitives = self.build_primitives_for_prompt(active_codes, code_weights)
    brief_signals = self.extract_brief_signals(sector_brief)

    # A single mode is reported both as the mode list and as the default
    effective_mode = mode or "in_person"

    payload = {
        "business_id": business_id,
        "gbp_path": gbp_path,
        "sector_code": sector_code,
        "config_version": l1_config.get("config_version", "1.0"),
        "modes": [effective_mode],
        "default_mode": effective_mode,
        "enabled_primitives": sorted(active_codes),
        "disabled_primitives": sorted(l1_config.get("disabled", [])),
        "weights": code_weights,
        "brief": brief_signals,
        "primitives": prompt_primitives,
    }

    logger.info(
        f"Resolved config for {business_id}: "
        f"sector={sector_code}, enabled={len(active_codes)}, weights={len(code_weights)}"
    )

    return payload
|
||||||
|
|
||||||
|
|
||||||
|
# Convenience function
|
||||||
|
async def resolve_business_config(
    business_id: str,
    pool: asyncpg.Pool,
    mode: str | None = None,
) -> dict[str, Any] | None:
    """One-shot helper: create a ConfigResolver and return its resolve() result for the business."""
    return await ConfigResolver().resolve(business_id, pool, mode)
|
||||||
@@ -0,0 +1,571 @@
|
|||||||
|
"""
|
||||||
|
LLM prompts for generating sparse primitive config deltas for GBP hierarchy nodes.
|
||||||
|
|
||||||
|
These prompts are used to populate L1 (sector) and L2 (category) nodes in the
|
||||||
|
GBP category tree with business-specific primitive configurations.
|
||||||
|
|
||||||
|
The output is a sparse delta that only includes primitives that need overrides
|
||||||
|
for that specific business type. Configuration inheritance handles the rest.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# SYSTEM PROMPT
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
SYSTEM_PROMPT_GBP_PRIMITIVE_CONFIG = """You are a customer experience taxonomy configuration specialist. Your task is to generate sparse primitive configuration deltas for Google Business Profile (GBP) category nodes.
|
||||||
|
|
||||||
|
## YOUR ROLE
|
||||||
|
|
||||||
|
You configure how the Universal Review Taxonomy (URT) primitives should be weighted, labeled, and detected for specific business types. Each primitive represents a distinct dimension of customer experience that appears in reviews.
|
||||||
|
|
||||||
|
## THE 36 PRIMITIVES (Grouped by Domain)
|
||||||
|
|
||||||
|
### OFFERING (O) - What the business provides
|
||||||
|
- WORKS: Does the product/service function correctly?
|
||||||
|
- PERFORMANCE: How well does it perform?
|
||||||
|
- DURABILITY: How long does it last?
|
||||||
|
- RELIABILITY: Is it consistent over time?
|
||||||
|
- OUTCOME: Did the customer achieve their goal?
|
||||||
|
- MATERIALS: Quality of ingredients/components
|
||||||
|
- CRAFTSMANSHIP: Skill of construction/execution
|
||||||
|
- PRESENTATION: Visual/aesthetic quality
|
||||||
|
- ATTENTION_TO_DETAIL: Finishing touches
|
||||||
|
- CONDITION: State at delivery
|
||||||
|
- COMPLETENESS: All components present?
|
||||||
|
- FEATURES: Promised features available?
|
||||||
|
- SCOPE: Full scope delivered?
|
||||||
|
- DOCUMENTATION: Supporting materials
|
||||||
|
- SPEC_MATCH: Matches what was ordered?
|
||||||
|
- PERSONALIZATION: Adapted to individual
|
||||||
|
- FLEXIBILITY: Can be modified?
|
||||||
|
- APPROPRIATENESS: Right solution for need?
|
||||||
|
|
||||||
|
### PEOPLE (P) - Staff interactions
|
||||||
|
- WARMTH: Friendly manner
|
||||||
|
- RESPECT: Dignity and courtesy
|
||||||
|
- EMPATHY: Understanding feelings
|
||||||
|
- PATIENCE: Calm and tolerant
|
||||||
|
- ENTHUSIASM: Energy and engagement
|
||||||
|
- KNOWLEDGE: Expertise level
|
||||||
|
- SKILL: Technical ability
|
||||||
|
- PROBLEM_SOLVING: Finding solutions
|
||||||
|
- PROFESSIONALISM: Conduct standards
|
||||||
|
- EXPERIENCE: Depth of expertise
|
||||||
|
- ATTENTIVENESS: Being present
|
||||||
|
- INITIATIVE: Proactive help
|
||||||
|
- AVAILABILITY: Present when needed
|
||||||
|
- FOLLOW_THROUGH: Completing promises
|
||||||
|
- URGENCY: Appropriate prioritization
|
||||||
|
- CLARITY: Clear communication
|
||||||
|
- LISTENING: Understanding needs
|
||||||
|
- PROACTIVE_UPDATES: Keeping informed
|
||||||
|
- ACCURACY: Correct information
|
||||||
|
- TONE: Communication style
|
||||||
|
|
||||||
|
### JOURNEY (J) - Process and timing
|
||||||
|
- WAIT_TIME: Time spent waiting
|
||||||
|
- SPEED: How fast things happen
|
||||||
|
- RESPONSE_TIME: Time to respond
|
||||||
|
- PUNCTUALITY: On-time delivery
|
||||||
|
- PACING: Appropriate speed
|
||||||
|
- SIMPLICITY: Easy process
|
||||||
|
- NAVIGATION: Finding things
|
||||||
|
- PAPERWORK: Documentation burden
|
||||||
|
- HANDOFFS: Transitions
|
||||||
|
- SELF_SERVICE: Autonomy options
|
||||||
|
- CONSISTENCY: Same each time
|
||||||
|
- PROCESS_ACCURACY: Correct execution
|
||||||
|
- UPTIME: System availability
|
||||||
|
- PREDICTABILITY: Expectations met
|
||||||
|
- ERROR_RATE: Frequency of mistakes
|
||||||
|
- ACKNOWLEDGMENT: Recognizing issues
|
||||||
|
- RESOLUTION_PROCESS: How problems handled
|
||||||
|
- RESOLUTION_SPEED: Time to fix
|
||||||
|
- RESOLUTION_QUALITY: Adequacy of fix
|
||||||
|
- PREVENTION: Avoiding recurrence
|
||||||
|
|
||||||
|
### ENVIRONMENT (E) - Physical and digital space
|
||||||
|
- CLEANLINESS: Hygiene and tidiness
|
||||||
|
- MAINTENANCE: Condition and upkeep
|
||||||
|
- LAYOUT: Functional arrangement
|
||||||
|
- EQUIPMENT: Tools and amenities
|
||||||
|
- SIGNAGE: Navigation aids
|
||||||
|
- INTERFACE_DESIGN: Digital UX
|
||||||
|
- DIGITAL_FUNCTIONALITY: Features working
|
||||||
|
- DIGITAL_PERFORMANCE: Speed/responsiveness
|
||||||
|
- DIGITAL_NAVIGATION: Finding things online
|
||||||
|
- MOBILE_EXPERIENCE: Smartphone optimization
|
||||||
|
- ATMOSPHERE: Overall mood
|
||||||
|
- NOISE: Sound environment
|
||||||
|
- TEMPERATURE: Climate comfort
|
||||||
|
- CROWDING: Density/space
|
||||||
|
- AESTHETICS: Visual appeal
|
||||||
|
- PHYSICAL_SAFETY: Protection from harm
|
||||||
|
- HEALTH_HYGIENE: Sanitation standards
|
||||||
|
- SECURITY: Protection of person/property
|
||||||
|
- COMFORT: Physical ease
|
||||||
|
- EMERGENCY_READINESS: Preparedness
|
||||||
|
|
||||||
|
### ACCESS (A) - Availability and accessibility
|
||||||
|
- HOURS: Operating hours
|
||||||
|
- BOOKING: Appointment access
|
||||||
|
- INVENTORY: Product availability
|
||||||
|
- STAFFING: Personnel available
|
||||||
|
- GEOGRAPHIC_REACH: Service area
|
||||||
|
- PHYSICAL_ACCESSIBILITY: Mobility access
|
||||||
|
- VISUAL_ACCESSIBILITY: Sight accommodations
|
||||||
|
- HEARING_ACCESSIBILITY: Audio accommodations
|
||||||
|
- COGNITIVE_ACCESSIBILITY: Mental accommodations
|
||||||
|
- DIGITAL_ACCESSIBILITY: Assistive tech support
|
||||||
|
- LANGUAGE_SUPPORT: Multiple languages
|
||||||
|
- CULTURAL_SENSITIVITY: Background respect
|
||||||
|
- DIETARY_MEDICAL: Restriction accommodations
|
||||||
|
- FAMILY_FRIENDLY: Children accommodation
|
||||||
|
- EQUAL_TREATMENT: Non-discrimination
|
||||||
|
- LOCATION: Convenience
|
||||||
|
- PARKING: Vehicle accommodation
|
||||||
|
- TRANSIT: Public transport
|
||||||
|
- PAYMENT_OPTIONS: How you can pay
|
||||||
|
- CONTACT_OPTIONS: Ways to reach
|
||||||
|
|
||||||
|
### VALUE (V) - Cost and worth
|
||||||
|
- ABSOLUTE_PRICE: The actual cost
|
||||||
|
- PRICE_VS_EXPECTATION: Compared to anticipated
|
||||||
|
- PRICE_VS_MARKET: Compared to competitors
|
||||||
|
- HIDDEN_COSTS: Unexpected charges
|
||||||
|
- PAYMENT_FLEXIBILITY: Terms and options
|
||||||
|
- PRICING_CLARITY: Understanding costs
|
||||||
|
- FEE_DISCLOSURE: Upfront about charges
|
||||||
|
- ADVERTISING_ACCURACY: Marketing matches reality
|
||||||
|
- TERMS_FAIRNESS: Policy reasonableness
|
||||||
|
- HONEST_REPRESENTATION: Truthful claims
|
||||||
|
- TIME_INVESTMENT: Hours required
|
||||||
|
- MENTAL_EFFORT: Cognitive load
|
||||||
|
- PHYSICAL_EFFORT: Bodily exertion
|
||||||
|
- HASSLE_FACTOR: Cumulative frustration
|
||||||
|
- OPPORTUNITY_COST: What else could be done
|
||||||
|
- OVERALL_VALUE: Total assessment
|
||||||
|
- QUALITY_PRICE_RATIO: What you get for what you pay
|
||||||
|
- SATISFACTION: Contentment with exchange
|
||||||
|
- RECOMMENDATION: Would suggest to others
|
||||||
|
- RETURN_INTENT: Would come back
|
||||||
|
|
||||||
|
### RELATIONSHIP (R) - Trust and loyalty
|
||||||
|
- TRUTHFULNESS: Accurate representations
|
||||||
|
- PROMISE_KEEPING: Honoring commitments
|
||||||
|
- TRANSPARENCY: Openness about practices
|
||||||
|
- ETHICS: Moral business conduct
|
||||||
|
- FAIR_DEALING: Equitable treatment
|
||||||
|
- TRACK_RECORD: Historical performance
|
||||||
|
- DEPENDABILITY: Same over time
|
||||||
|
- STABILITY: Organizational continuity
|
||||||
|
- TRUSTWORTHINESS: Warranting confidence
|
||||||
|
- GUARANTEE_HONOR: Standing behind product
|
||||||
|
- ERROR_ACKNOWLEDGMENT: Admitting failures
|
||||||
|
- APOLOGY: Expression of regret
|
||||||
|
- COMPENSATION: Making amends
|
||||||
|
- IMPROVEMENT: Actions to prevent recurrence
|
||||||
|
- OWNERSHIP: Taking responsibility
|
||||||
|
- RECOGNITION: Acknowledging customers
|
||||||
|
- REWARDS: Loyalty benefits
|
||||||
|
- RELATIONSHIP_BUILDING: Investment in connection
|
||||||
|
- ONGOING_COMMUNICATION: Contact quality
|
||||||
|
- COMMUNITY: Belonging and connection
|
||||||
|
|
||||||
|
## META PRIMITIVES (DO NOT INCLUDE IN OUTPUT)
|
||||||
|
|
||||||
|
These are always globally active and should NEVER appear in your output:
|
||||||
|
- HONESTY, ETHICS, PROMISES, ACKNOWLEDGMENT, RESPONSE_QUALITY
|
||||||
|
- RECOVERY, RETURN_INTENT, RECOMMEND, RECOGNITION, UNMAPPED
|
||||||
|
|
||||||
|
## OUTPUT RULES
|
||||||
|
|
||||||
|
1. **SPARSE OUTPUT ONLY**: Only include primitives that DIFFER from parent configuration
|
||||||
|
- If parent has WAIT_TIME at "normal" priority and this business needs "critical", include it
|
||||||
|
- If parent already has the right configuration, do NOT include it
|
||||||
|
|
||||||
|
2. **PRIORITY LEVELS** (use exact strings):
|
||||||
|
- "critical": Essential for this business (top 3-5 per business)
|
||||||
|
- "high": Very important (next 5-8)
|
||||||
|
- "normal": Standard relevance (default)
|
||||||
|
- "low": Less common for this business
|
||||||
|
- "very_low": Rarely relevant (prefer over active: false)
|
||||||
|
|
||||||
|
3. **WHEN TO SET active: false**:
|
||||||
|
- Only when a primitive is truly IRRELEVANT (not just uncommon)
|
||||||
|
- Example: PARKING for an online-only business
|
||||||
|
- Prefer priority: "very_low" unless truly N/A
|
||||||
|
|
||||||
|
4. **SIGNALS**: 5-15 realistic customer phrases per side
|
||||||
|
- Use actual language customers use in reviews
|
||||||
|
- Include colloquial expressions, not formal descriptions
|
||||||
|
- Positive and negative should be opposites of the same dimension
|
||||||
|
- Use __replace__: true ONLY if parent signals are wrong (rare)
|
||||||
|
|
||||||
|
5. **MODES**: Only include if this business has distinct service modes
|
||||||
|
- Examples: "dine_in" vs "delivery" for restaurants
|
||||||
|
- "in_store" vs "online" for retailers
|
||||||
|
- Most businesses: omit modes entirely
|
||||||
|
|
||||||
|
6. **business_context**: Include for L1 sectors and leaf categories
|
||||||
|
- name: Human-friendly display name
|
||||||
|
- description: 1-2 sentence description
|
||||||
|
- modes: Array of applicable modes (if any)
|
||||||
|
- default_mode: Primary mode (if modes exist)
|
||||||
|
|
||||||
|
## VALIDATION RULES
|
||||||
|
|
||||||
|
Your output MUST:
|
||||||
|
- Be valid JSON only (no markdown, no explanations)
|
||||||
|
- Use ONLY primitive codes from the dictionary provided
|
||||||
|
- NOT create new primitive codes
|
||||||
|
- NOT include meta primitives
|
||||||
|
- NOT include playbooks, solutions, or action recommendations
|
||||||
|
- Have at least one primitive_config entry
|
||||||
|
- Use exact priority strings: "critical", "high", "normal", "low", "very_low"
|
||||||
|
|
||||||
|
## OUTPUT SCHEMA
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"business_context": {
|
||||||
|
"name": "Human-Friendly Name",
|
||||||
|
"description": "What this business type does and what matters to customers",
|
||||||
|
"modes": ["mode1", "mode2"],
|
||||||
|
"default_mode": "mode1"
|
||||||
|
},
|
||||||
|
"primitive_configs": {
|
||||||
|
"PRIMITIVE_CODE": {
|
||||||
|
"active": true,
|
||||||
|
"priority": "critical|high|normal|low|very_low",
|
||||||
|
"label": "Business-specific label for this primitive",
|
||||||
|
"description": "What this primitive means for this specific business",
|
||||||
|
"signals": {
|
||||||
|
"positive": ["signal 1", "signal 2", "..."],
|
||||||
|
"negative": ["signal 1", "signal 2", "..."],
|
||||||
|
"__replace__": false
|
||||||
|
},
|
||||||
|
"modes": {
|
||||||
|
"mode_name": {
|
||||||
|
"applicable": true,
|
||||||
|
"label": "Mode-specific label"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Return ONLY the JSON object. No preamble, no explanation, no markdown."""
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# USER PROMPT TEMPLATE
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
USER_PROMPT_TEMPLATE = """Generate a sparse primitive configuration delta for this GBP node.
|
||||||
|
|
||||||
|
## NODE INFORMATION
|
||||||
|
|
||||||
|
**GBP Path**: {gbp_path}
|
||||||
|
**Node Name**: {node_name}
|
||||||
|
**Node Description**: {node_description}
|
||||||
|
**Node Level**: {node_level} (L1=Sector, L2=Category, L3=Subcategory, L4=Leaf)
|
||||||
|
|
||||||
|
## PARENT RESOLVED CONFIGURATION
|
||||||
|
|
||||||
|
This is the already-resolved configuration from all ancestors. Only include primitives that need to CHANGE from this:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{parent_resolved_config}
|
||||||
|
```
|
||||||
|
|
||||||
|
## PRIMITIVE DICTIONARY
|
||||||
|
|
||||||
|
Reference for all available primitives with their base definitions:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{primitive_dictionary}
|
||||||
|
```
|
||||||
|
|
||||||
|
## YOUR TASK
|
||||||
|
|
||||||
|
Generate a sparse delta configuration for "{node_name}" that:
|
||||||
|
|
||||||
|
1. Identifies the 5-10 MOST CRITICAL primitives for this business type
|
||||||
|
2. Adjusts priority levels to reflect what customers actually care about
|
||||||
|
3. Provides business-specific labels and signals where helpful
|
||||||
|
4. Only includes primitives that DIFFER from parent_resolved_config
|
||||||
|
5. Uses realistic customer language for signals
|
||||||
|
|
||||||
|
Think about:
|
||||||
|
- What do customers of {node_name} businesses typically praise or complain about?
|
||||||
|
- Which URT primitives are most actionable for this business type?
|
||||||
|
- What unique aspects distinguish this business type from others?
|
||||||
|
|
||||||
|
Return ONLY valid JSON matching the output schema."""
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def build_user_prompt(
    gbp_path: str,
    node_name: str,
    node_description: str,
    node_level: int,
    parent_resolved_config: dict,
    primitive_dictionary: dict,
) -> str:
    """
    Fill USER_PROMPT_TEMPLATE with the details of one GBP node.

    Args:
        gbp_path: The ltree path (e.g., "Food_Beverage" or "Food_Beverage.Restaurants")
        node_name: Human-readable name (e.g., "Food & Beverage" or "Restaurants")
        node_description: Brief description of this business category
        node_level: Hierarchy depth; 1-4 are rendered with their names
            (e.g. "L2=Category"), any other value as "L<value>"
        parent_resolved_config: Already-resolved config from ancestors (or {} for L1)
        primitive_dictionary: All primitives with definitions and base signals

    Returns:
        The formatted prompt; both dict arguments are embedded as
        2-space-indented JSON
    """
    import json

    known_levels = dict(
        enumerate(("L1=Sector", "L2=Category", "L3=Subcategory", "L4=Leaf"), start=1)
    )
    level_text = known_levels.get(node_level, f"L{node_level}")

    parent_json = json.dumps(parent_resolved_config, indent=2)
    dictionary_json = json.dumps(primitive_dictionary, indent=2)

    substitutions = {
        "gbp_path": gbp_path,
        "node_name": node_name,
        "node_description": node_description,
        "node_level": level_text,
        "parent_resolved_config": parent_json,
        "primitive_dictionary": dictionary_json,
    }
    return USER_PROMPT_TEMPLATE.format(**substitutions)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_primitive_config_output(output: dict, primitive_codes: set[str]) -> list[str]:
    """
    Validate the LLM output against schema and rules.

    The function never raises on malformed model output; every problem is
    reported as a message in the returned list.

    Args:
        output: Parsed JSON output from LLM
        primitive_codes: Set of valid primitive codes

    Returns:
        List of validation errors (empty if valid)
    """
    errors: list[str] = []

    # Meta primitives that should never appear
    # NOTE(review): ETHICS, ACKNOWLEDGMENT, RETURN_INTENT and RECOGNITION are
    # also listed as regular primitives in SYSTEM_PROMPT_GBP_PRIMITIVE_CONFIG
    # - confirm which list is authoritative.
    META_PRIMITIVES = {
        "HONESTY", "ETHICS", "PROMISES", "ACKNOWLEDGMENT", "RESPONSE_QUALITY",
        "RECOVERY", "RETURN_INTENT", "RECOMMEND", "RECOGNITION", "UNMAPPED"
    }

    # Tuple (not set) so the "must be one of" message is rendered in a
    # stable, documented order.
    VALID_PRIORITIES = ("critical", "high", "normal", "low", "very_low")
    allowed_priorities = ", ".join(VALID_PRIORITIES)

    # Check required structure
    if not isinstance(output, dict):
        errors.append("Output must be a JSON object")
        return errors

    primitive_configs = output.get("primitive_configs", {})
    if not primitive_configs:
        errors.append("primitive_configs is required and must not be empty")

    if not isinstance(primitive_configs, dict):
        errors.append("primitive_configs must be an object")
        return errors

    for code, config in primitive_configs.items():
        # Check code is valid
        if code not in primitive_codes:
            errors.append(f"Unknown primitive code: {code}")
            continue

        # Check for meta primitives
        if code in META_PRIMITIVES:
            errors.append(f"Meta primitive should not appear: {code}")

        if not isinstance(config, dict):
            errors.append(f"{code}: config must be an object")
            continue

        # Check priority if present. The isinstance guard matters: the model
        # may emit a list/dict here, and any present non-string value
        # (including "" or 0) is an invalid priority rather than "absent".
        priority = config.get("priority")
        if priority is not None and (
            not isinstance(priority, str) or priority not in VALID_PRIORITIES
        ):
            errors.append(
                f"{code}: invalid priority '{priority}', must be one of {allowed_priorities}"
            )

        # Check signals structure if present
        signals = config.get("signals")
        if signals:
            if not isinstance(signals, dict):
                errors.append(f"{code}: signals must be an object")
            else:
                pos = signals.get("positive", [])
                neg = signals.get("negative", [])
                if pos and not isinstance(pos, list):
                    errors.append(f"{code}: signals.positive must be an array")
                if neg and not isinstance(neg, list):
                    errors.append(f"{code}: signals.negative must be an array")

    # Check business_context if present
    business_context = output.get("business_context")
    if business_context:
        if not isinstance(business_context, dict):
            errors.append("business_context must be an object")
        else:
            modes = business_context.get("modes")
            if modes and not isinstance(modes, list):
                errors.append("business_context.modes must be an array")

    return errors
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# EXAMPLE PRIMITIVE DICTIONARY (subset for reference)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
EXAMPLE_PRIMITIVE_DICTIONARY = {
|
||||||
|
"WAIT_TIME": {
|
||||||
|
"code": "WAIT_TIME",
|
||||||
|
"domain": "J",
|
||||||
|
"category": "Timing",
|
||||||
|
"name": "Wait Time",
|
||||||
|
"definition": "Time spent waiting for service",
|
||||||
|
"base_signals": {
|
||||||
|
"positive": ["no wait", "seated immediately", "right away", "quick turnaround"],
|
||||||
|
"negative": ["long wait", "waited forever", "45 minutes", "hours to be seen"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"WARMTH": {
|
||||||
|
"code": "WARMTH",
|
||||||
|
"domain": "P",
|
||||||
|
"category": "Attitude",
|
||||||
|
"name": "Warmth/Friendliness",
|
||||||
|
"definition": "Approachability and pleasantness of staff",
|
||||||
|
"base_signals": {
|
||||||
|
"positive": ["so friendly", "welcoming", "made us feel at home", "warm greeting"],
|
||||||
|
"negative": ["cold", "unfriendly", "rude", "didn't acknowledge us"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"CRAFTSMANSHIP": {
|
||||||
|
"code": "CRAFTSMANSHIP",
|
||||||
|
"domain": "O",
|
||||||
|
"category": "Quality",
|
||||||
|
"name": "Craftsmanship",
|
||||||
|
"definition": "Skill of construction or execution",
|
||||||
|
"base_signals": {
|
||||||
|
"positive": ["beautifully made", "expert work", "attention to detail", "quality craftsmanship"],
|
||||||
|
"negative": ["sloppy work", "poorly made", "amateur job", "uneven"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
# ... more primitives would be included in full dictionary
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# EXAMPLE OUTPUT (for reference and testing)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
EXAMPLE_OUTPUT_RESTAURANT = {
|
||||||
|
"business_context": {
|
||||||
|
"name": "Restaurants",
|
||||||
|
"description": "Food service establishments where customers dine on-premises or order for delivery/takeout",
|
||||||
|
"modes": ["dine_in", "takeout", "delivery"],
|
||||||
|
"default_mode": "dine_in"
|
||||||
|
},
|
||||||
|
"primitive_configs": {
|
||||||
|
"WAIT_TIME": {
|
||||||
|
"priority": "critical",
|
||||||
|
"label": "Wait for Table/Food",
|
||||||
|
"description": "Time waiting to be seated and for food to arrive",
|
||||||
|
"signals": {
|
||||||
|
"positive": [
|
||||||
|
"seated immediately",
|
||||||
|
"food came out fast",
|
||||||
|
"no wait for a table",
|
||||||
|
"quick service",
|
||||||
|
"didn't have to wait long"
|
||||||
|
],
|
||||||
|
"negative": [
|
||||||
|
"waited 45 minutes for a table",
|
||||||
|
"food took forever",
|
||||||
|
"an hour for appetizers",
|
||||||
|
"still waiting for our entrees",
|
||||||
|
"had to flag down the waiter"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"modes": {
|
||||||
|
"dine_in": {
|
||||||
|
"applicable": True,
|
||||||
|
"label": "Wait for Table & Food"
|
||||||
|
},
|
||||||
|
"takeout": {
|
||||||
|
"applicable": True,
|
||||||
|
"label": "Order Ready Time"
|
||||||
|
},
|
||||||
|
"delivery": {
|
||||||
|
"applicable": True,
|
||||||
|
"label": "Delivery Time"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"CRAFTSMANSHIP": {
|
||||||
|
"priority": "critical",
|
||||||
|
"label": "Food Preparation Quality",
|
||||||
|
"description": "Skill and care in cooking and food preparation",
|
||||||
|
"signals": {
|
||||||
|
"positive": [
|
||||||
|
"cooked to perfection",
|
||||||
|
"beautifully plated",
|
||||||
|
"chef knows what they're doing",
|
||||||
|
"perfectly seasoned",
|
||||||
|
"amazing flavor"
|
||||||
|
],
|
||||||
|
"negative": [
|
||||||
|
"overcooked",
|
||||||
|
"bland and tasteless",
|
||||||
|
"clearly microwaved",
|
||||||
|
"burnt edges",
|
||||||
|
"undercooked chicken"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"WARMTH": {
|
||||||
|
"priority": "high",
|
||||||
|
"label": "Server Friendliness",
|
||||||
|
"description": "Warmth and hospitality from hosts, servers, and staff"
|
||||||
|
},
|
||||||
|
"CLEANLINESS": {
|
||||||
|
"priority": "high",
|
||||||
|
"label": "Restaurant Cleanliness",
|
||||||
|
"description": "Hygiene of dining area, bathrooms, and visible kitchen areas"
|
||||||
|
},
|
||||||
|
"ATMOSPHERE": {
|
||||||
|
"priority": "high",
|
||||||
|
"label": "Dining Ambiance",
|
||||||
|
"description": "Overall mood, decor, lighting, and vibe of the restaurant"
|
||||||
|
},
|
||||||
|
"PARKING": {
|
||||||
|
"priority": "normal",
|
||||||
|
"modes": {
|
||||||
|
"dine_in": {"applicable": True},
|
||||||
|
"takeout": {"applicable": True},
|
||||||
|
"delivery": {"applicable": False}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"DIGITAL_ACCESSIBILITY": {
|
||||||
|
"priority": "very_low",
|
||||||
|
"description": "Screen reader support and digital accessibility - rarely mentioned in restaurant reviews"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -6,6 +6,7 @@ Provides a unified interface for classification requests with:
|
|||||||
- Structured output (JSON mode)
|
- Structured output (JSON mode)
|
||||||
- Retry handling
|
- Retry handling
|
||||||
- Cost tracking
|
- Cost tracking
|
||||||
|
- Adaptive batch sizing based on context window
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -14,7 +15,8 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import TYPE_CHECKING, Any
|
from dataclasses import dataclass, field
|
||||||
|
from typing import TYPE_CHECKING, Any, TypedDict
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from reviewiq_pipeline.config import Config
|
from reviewiq_pipeline.config import Config
|
||||||
@@ -22,6 +24,240 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Exceptions
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class PartialBatchResult(Exception):
    """
    Raised when parsing a batch's JSON response only partly succeeds.

    The exception carries whatever results were recovered together with the
    indices of the reviews that are still missing, so the caller can retry
    just those instead of the entire batch.
    """

    def __init__(
        self,
        message: str,
        partial_results: list[dict[str, Any]],
        missing_indices: list[int],
        metadata: dict[str, Any] | None = None,
    ):
        super().__init__(message)
        # Recovered results and the positions that were not recovered
        self.partial_results, self.missing_indices = partial_results, missing_indices
        # Absent (or empty) metadata becomes a fresh empty dict
        self.metadata = metadata if metadata else {}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Model Context Windows and Token Estimation
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
MODEL_CONTEXT_WINDOWS = {
|
||||||
|
# OpenAI models
|
||||||
|
"gpt-4o": 128_000,
|
||||||
|
"gpt-4o-mini": 128_000,
|
||||||
|
"gpt-4-turbo": 128_000,
|
||||||
|
"gpt-4": 8_192,
|
||||||
|
"gpt-3.5-turbo": 16_385,
|
||||||
|
# Anthropic models
|
||||||
|
"claude-3-opus-20240229": 200_000,
|
||||||
|
"claude-3-sonnet-20240229": 200_000,
|
||||||
|
"claude-3-haiku-20240307": 200_000,
|
||||||
|
"claude-3-5-sonnet-20241022": 200_000,
|
||||||
|
"claude-sonnet-4-20250514": 200_000,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Average characters per token (rough estimate, varies by language)
|
||||||
|
CHARS_PER_TOKEN = 4
|
||||||
|
|
||||||
|
# Output tokens per review (classification response)
|
||||||
|
OUTPUT_TOKENS_PER_REVIEW = 450 # Conservative estimate
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BatchSizeCalculation:
    """Result of batch size calculation.

    Returned by BatchSizer.calculate_batch_size; bundles the chosen batch
    size with the figures that were used to derive it.
    """
    # Number of reviews to put in one batch
    batch_size: int
    # Token count of the system prompt, as given to the sizer
    system_prompt_tokens: int
    # Average input tokens per review at the time of the calculation
    avg_tokens_per_review: int
    # Tokens set aside for the model's response
    output_tokens_reserved: int
    # Context window size used for the model
    context_window: int
    # Target utilization the sizer was configured with
    # NOTE(review): presumably a 0-1 fraction of the context window - confirm
    utilization_target: float
    # Human-readable explanation of how batch_size was chosen
    reasoning: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TokenStats:
    """Accumulated token counts, used to estimate per-review token usage."""
    total_reviews: int = 0
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    min_review_tokens: int = 999999
    max_review_tokens: int = 0

    def update(self, review_tokens: int, output_tokens: int):
        """Record one more observation and refresh the totals and extremes."""
        self.total_reviews += 1
        self.total_input_tokens += review_tokens
        self.total_output_tokens += output_tokens
        if review_tokens < self.min_review_tokens:
            self.min_review_tokens = review_tokens
        if review_tokens > self.max_review_tokens:
            self.max_review_tokens = review_tokens

    @property
    def avg_review_tokens(self) -> int:
        """Mean input tokens per review (floor division); 150 until something is recorded."""
        seen = self.total_reviews
        return self.total_input_tokens // seen if seen != 0 else 150  # Default estimate

    @property
    def avg_output_tokens(self) -> int:
        """Mean output tokens per review (floor division); OUTPUT_TOKENS_PER_REVIEW until something is recorded."""
        seen = self.total_reviews
        return self.total_output_tokens // seen if seen != 0 else OUTPUT_TOKENS_PER_REVIEW
|
||||||
|
|
||||||
|
|
||||||
|
class BatchSizer:
    """
    Calculates optimal batch size based on context window and actual token usage.

    Adapts in real-time based on observed token counts from previous batches,
    which callers report through update_from_response().
    """

    # Hard upper bound on reviews per batch, regardless of available context.
    MAX_BATCH_SIZE = 100
    # Tokens held back for JSON/formatting overhead around the reviews.
    SAFETY_BUFFER_TOKENS = 1000
    # Share of the (max - avg) gap added to the average as headroom.
    VARIANCE_WEIGHT = 0.3

    def __init__(
        self,
        model: str,
        system_prompt_tokens: int,
        target_utilization: float = 0.6,
    ):
        """
        Args:
            model: Model name used to look up the context window
            system_prompt_tokens: Token count of the system prompt
            target_utilization: Fraction of the context window to aim for
        """
        self.model = model
        self.system_prompt_tokens = system_prompt_tokens
        self.target_utilization = target_utilization
        # Unknown models fall back to a 128k-token window.
        self.context_window = MODEL_CONTEXT_WINDOWS.get(model, 128_000)
        self.stats = TokenStats()

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count for text (fast approximation, never below 1)."""
        # Simple heuristic: ~CHARS_PER_TOKEN chars per token for English.
        # More accurate would be to use tiktoken, but this is faster.
        # `text or ""` tolerates a missing/None text instead of crashing in len().
        return max(1, len(text or "") // CHARS_PER_TOKEN)

    def calculate_batch_size(
        self,
        reviews: list[dict],
        fixed_size: int | None = None,
    ) -> BatchSizeCalculation:
        """
        Calculate optimal batch size for a set of reviews.

        Args:
            reviews: List of reviews with 'text' field
            fixed_size: If set (> 0), use this size (skip calculation);
                it is still clamped to the number of reviews

        Returns:
            BatchSizeCalculation with recommended size and reasoning
        """
        if fixed_size and fixed_size > 0:
            batch_size = min(fixed_size, len(reviews))
            return BatchSizeCalculation(
                batch_size=batch_size,
                system_prompt_tokens=self.system_prompt_tokens,
                avg_tokens_per_review=self.stats.avg_review_tokens,
                # Reserve for the batch actually returned, not the requested size.
                output_tokens_reserved=batch_size * self.stats.avg_output_tokens,
                context_window=self.context_window,
                utilization_target=self.target_utilization,
                reasoning=f"Fixed batch size: {fixed_size}",
            )

        # Estimated token counts for these reviews (None/missing text counts as empty).
        review_tokens = [self.estimate_tokens(r.get("text") or "") for r in reviews]
        if review_tokens:
            avg_review_tokens = sum(review_tokens) // len(review_tokens)
            max_review_tokens = max(review_tokens)
        else:
            avg_review_tokens = 150
            max_review_tokens = 300

        # Blend the learned average with the current sample when history exists.
        if self.stats.total_reviews > 0:
            effective_avg = (self.stats.avg_review_tokens + avg_review_tokens) // 2
        else:
            effective_avg = avg_review_tokens

        # Learned output average (module default until history exists).
        output_per_review = self.stats.avg_output_tokens

        # Token budget left for reviews after the system prompt and safety buffer.
        available = int(self.context_window * self.target_utilization)
        available -= self.system_prompt_tokens
        available -= self.SAFETY_BUFFER_TOKENS

        # Headroom for longer-than-average reviews: avg + 30% of the gap to the
        # longest review. The gap is clamped at 0 so a high learned average can
        # never turn the "safety margin" into a discount.
        headroom = int(max(0, max_review_tokens - effective_avg) * self.VARIANCE_WEIGHT)
        variance_adjusted = effective_avg + headroom

        # Each review needs input + output tokens; keep the divisor positive.
        tokens_per_review_safe = max(1, variance_adjusted + output_per_review)

        batch_size = max(1, available // tokens_per_review_safe)

        # Cap at reasonable limits (0 when there are no reviews).
        batch_size = min(batch_size, self.MAX_BATCH_SIZE, len(reviews))

        reasoning = (
            f"Context: {self.context_window:,} | "
            f"System: {self.system_prompt_tokens:,} | "
            f"Avg review: {effective_avg} (variance-adjusted: {variance_adjusted}) | "
            f"Output/review: {output_per_review} | "
            f"Target utilization: {self.target_utilization:.0%} | "
            f"→ Batch size: {batch_size}"
        )

        return BatchSizeCalculation(
            batch_size=batch_size,
            system_prompt_tokens=self.system_prompt_tokens,
            avg_tokens_per_review=effective_avg,
            output_tokens_reserved=batch_size * output_per_review,
            context_window=self.context_window,
            utilization_target=self.target_utilization,
            reasoning=reasoning,
        )

    def update_from_response(self, batch_size: int, input_tokens: int, output_tokens: int):
        """
        Update statistics from actual LLM response.

        Call this after each batch to improve future estimates. Does nothing
        when batch_size is not positive.

        Args:
            batch_size: Number of reviews in the batch
            input_tokens: Input tokens reported for the batch
            output_tokens: Output tokens reported for the batch
        """
        if batch_size <= 0:
            return

        # NOTE(review): if input_tokens includes the system prompt, the learned
        # per-review average is inflated by it — confirm what callers pass.
        avg_input = input_tokens // batch_size
        avg_output = output_tokens // batch_size

        # Per-review counts are unknown, so record the batch mean once per review.
        for _ in range(batch_size):
            self.stats.update(avg_input, avg_output)

        logger.debug(
            "BatchSizer updated: %s reviews, avg input=%s, avg output=%s, "
            "running avg input=%s, running avg output=%s",
            batch_size,
            avg_input,
            avg_output,
            self.stats.avg_review_tokens,
            self.stats.avg_output_tokens,
        )

    def get_stats_summary(self) -> dict:
        """Get current statistics summary (min/max report 0 before any update)."""
        has_history = self.stats.total_reviews > 0
        return {
            "total_reviews_processed": self.stats.total_reviews,
            "avg_input_tokens": self.stats.avg_review_tokens,
            "avg_output_tokens": self.stats.avg_output_tokens,
            "min_review_tokens": self.stats.min_review_tokens if has_history else 0,
            "max_review_tokens": self.stats.max_review_tokens if has_history else 0,
            "model": self.model,
            "context_window": self.context_window,
        }
|
||||||
|
|
||||||
# System prompt for URT classification
|
# System prompt for URT classification
|
||||||
SYSTEM_PROMPT = """You are a review classification system using URT (Universal Review Taxonomy) v5.1.
|
SYSTEM_PROMPT = """You are a review classification system using URT (Universal Review Taxonomy) v5.1.
|
||||||
|
|
||||||
@@ -329,6 +565,18 @@ Return valid JSON matching this schema. No markdown, no explanations.
|
|||||||
}"""
|
}"""
|
||||||
|
|
||||||
|
|
||||||
|
class BatchReviewInput(TypedDict):
    """Input format for batch classification."""

    # Identifier of the review
    review_id: str
    # Review text to classify
    text: str
    # Rating of the review
    rating: int
|
||||||
|
|
||||||
|
|
||||||
|
class BatchClassificationResponse(TypedDict):
    """Response format for batch classification."""

    # One entry per classified review; each contains review_index, spans, review_summary
    reviews: list[dict[str, Any]]
|
||||||
|
|
||||||
|
|
||||||
class LLMClientBase(ABC):
|
class LLMClientBase(ABC):
|
||||||
"""Abstract base class for LLM clients."""
|
"""Abstract base class for LLM clients."""
|
||||||
|
|
||||||
@@ -337,18 +585,24 @@ class LLMClientBase(ABC):
|
|||||||
self.total_tokens_used = 0
|
self.total_tokens_used = 0
|
||||||
self.total_cost_usd = 0.0
|
self.total_cost_usd = 0.0
|
||||||
self._custom_prompt: str | None = None
|
self._custom_prompt: str | None = None
|
||||||
|
self._custom_prompt_batch: str | None = None
|
||||||
|
self._cached_tokens: int = 0 # Track cached token usage
|
||||||
|
|
||||||
def set_prompt(self, prompt: str) -> None:
|
def set_prompt(self, prompt: str, batch_prompt: str | None = None) -> None:
|
||||||
"""
|
"""
|
||||||
Set a custom system prompt (e.g., built dynamically from database).
|
Set custom system prompts (e.g., built dynamically from database).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt: The system prompt to use for classification
|
prompt: The system prompt for single review classification
|
||||||
|
batch_prompt: The system prompt for batch classification (if different)
|
||||||
"""
|
"""
|
||||||
self._custom_prompt = prompt
|
self._custom_prompt = prompt
|
||||||
|
self._custom_prompt_batch = batch_prompt or prompt
|
||||||
|
|
||||||
def get_prompt(self, batch_mode: bool = False) -> str:
    """
    Return the active system prompt.

    In batch mode the batch prompt is preferred, then the single-review
    prompt; otherwise only the single-review prompt is considered. The
    module default is used when no custom prompt is set.
    """
    if batch_mode:
        selected = self._custom_prompt_batch or self._custom_prompt
    else:
        selected = self._custom_prompt
    return selected if selected else SYSTEM_PROMPT
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@@ -358,7 +612,7 @@ class LLMClientBase(ABC):
|
|||||||
profile: str = "standard",
|
profile: str = "standard",
|
||||||
) -> tuple[LLMClassificationResponse, dict[str, Any]]:
|
) -> tuple[LLMClassificationResponse, dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Classify a review and extract spans.
|
Classify a single review and extract spans.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
review_text: The review text to classify
|
review_text: The review text to classify
|
||||||
@@ -369,6 +623,24 @@ class LLMClientBase(ABC):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
async def classify_batch(
    self,
    reviews: list[BatchReviewInput],
    profile: str = "standard",
) -> tuple[list[LLMClassificationResponse], dict[str, Any]]:
    """
    Classify multiple reviews in a single LLM call.

    Abstract: each concrete client provides its own implementation.

    Args:
        reviews: List of reviews with review_id, text, and rating
        profile: Classification profile (lite/core/standard/full)

    Returns:
        Tuple of (list of classification responses, aggregated metadata)
    """
    pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def generate(
|
async def generate(
|
||||||
self,
|
self,
|
||||||
@@ -396,16 +668,31 @@ class LLMClientBase(ABC):
|
|||||||
"""Close the client and cleanup resources."""
|
"""Close the client and cleanup resources."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def _build_batch_user_prompt(self, reviews: list[BatchReviewInput]) -> str:
    """
    Render the user message for a batch request.

    The message is an instruction line followed by one delimited section per
    review; sections are numbered from 0 in input order.
    """
    header = f"Classify these {len(reviews)} reviews. Return JSON with 'reviews' array."
    sections = [header, ""]

    for position, item in enumerate(reviews):
        sections.extend(
            (
                f"---REVIEW {position} (rating={item['rating']}★)---",
                item["text"],
                "",
            )
        )

    return "\n".join(sections)
|
||||||
|
|
||||||
|
|
||||||
class OpenAIClient(LLMClientBase):
|
class OpenAIClient(LLMClientBase):
|
||||||
"""OpenAI LLM client implementation."""
|
"""OpenAI LLM client implementation with batch support and prompt caching."""
|
||||||
|
|
||||||
# Pricing per 1M tokens (as of 2024)
|
# Pricing per 1M tokens (as of 2024)
|
||||||
|
# Cached input tokens are 50% cheaper
|
||||||
PRICING = {
|
PRICING = {
|
||||||
"gpt-4o": {"input": 5.0, "output": 15.0},
|
"gpt-4o": {"input": 2.50, "cached_input": 1.25, "output": 10.0},
|
||||||
"gpt-4o-mini": {"input": 0.15, "output": 0.60},
|
"gpt-4o-mini": {"input": 0.15, "cached_input": 0.075, "output": 0.60},
|
||||||
"gpt-4-turbo": {"input": 10.0, "output": 30.0},
|
"gpt-4-turbo": {"input": 10.0, "cached_input": 5.0, "output": 30.0},
|
||||||
"gpt-3.5-turbo": {"input": 0.50, "output": 1.50},
|
"gpt-3.5-turbo": {"input": 0.50, "cached_input": 0.25, "output": 1.50},
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, config: Config):
|
def __init__(self, config: Config):
|
||||||
@@ -420,7 +707,7 @@ class OpenAIClient(LLMClientBase):
|
|||||||
review_text: str,
|
review_text: str,
|
||||||
profile: str = "standard",
|
profile: str = "standard",
|
||||||
) -> tuple[LLMClassificationResponse, dict[str, Any]]:
|
) -> tuple[LLMClassificationResponse, dict[str, Any]]:
|
||||||
"""Classify using OpenAI."""
|
"""Classify a single review using OpenAI."""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
@@ -446,27 +733,154 @@ class OpenAIClient(LLMClientBase):
|
|||||||
|
|
||||||
result = json.loads(content)
|
result = json.loads(content)
|
||||||
|
|
||||||
# Calculate costs
|
# Calculate costs (with caching support)
|
||||||
|
metadata = self._calculate_openai_costs(response, start_time)
|
||||||
|
|
||||||
|
return result, metadata
|
||||||
|
|
||||||
|
async def classify_batch(
    self,
    reviews: list[BatchReviewInput],
    profile: str = "standard",
) -> tuple[list[LLMClassificationResponse], dict[str, Any]]:
    """
    Classify multiple reviews in a single OpenAI call.

    Uses prompt caching - the system prompt is cached after first call,
    reducing input token costs by ~50% on subsequent calls.

    Returns:
        Tuple of (one classification per input review, call metadata);
        an empty list with an error entry when no reviews are given.

    Raises:
        ValueError: The model returned no content.
        PartialBatchResult: The JSON was malformed but some reviews were recovered.
        json.JSONDecodeError: The JSON was malformed and nothing was recovered.
    """
    if not reviews:
        return [], {"error": "No reviews provided"}

    start_time = time.time()
    review_count = len(reviews)

    user_prompt = self._build_batch_user_prompt(reviews)
    system_prompt = self.get_prompt(batch_mode=True)

    # Output budget: 500 tokens per review, capped at 16000.
    output_budget = min(16000, review_count * 500)

    response = await self.client.chat.completions.create(
        model=self.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=self.config.llm_temperature,
        response_format={"type": "json_object"},
        max_tokens=output_budget,
        # Batches get twice the single-review timeout.
        timeout=self.config.llm_timeout_seconds * 2,
    )

    content = response.choices[0].message.content
    if not content:
        raise ValueError("Empty response from OpenAI")

    # Costs are recorded before parsing so failed parses are still accounted for.
    metadata = self._calculate_openai_costs(response, start_time, batch_size=review_count)

    try:
        parsed = json.loads(content)
        return self._parse_batch_response(parsed, reviews), metadata
    except json.JSONDecodeError as e:
        # Whole-document parse failed; salvage whatever complete reviews exist.
        logger.warning(f"Full JSON parse failed: {e}, attempting partial recovery...")

        recovered, missing = self._extract_partial_batch_json(content, review_count)
        if not recovered:
            raise

        raise PartialBatchResult(
            message=f"Recovered {len(recovered)}/{len(reviews)} reviews from malformed JSON",
            partial_results=recovered,
            missing_indices=missing,
            metadata=metadata,
        )
|
||||||
|
|
||||||
|
def _calculate_openai_costs(
|
||||||
|
self,
|
||||||
|
response: Any,
|
||||||
|
start_time: float,
|
||||||
|
batch_size: int = 1,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Calculate costs from OpenAI response, accounting for cached tokens."""
|
||||||
input_tokens = response.usage.prompt_tokens if response.usage else 0
|
input_tokens = response.usage.prompt_tokens if response.usage else 0
|
||||||
output_tokens = response.usage.completion_tokens if response.usage else 0
|
output_tokens = response.usage.completion_tokens if response.usage else 0
|
||||||
total_tokens = input_tokens + output_tokens
|
total_tokens = input_tokens + output_tokens
|
||||||
|
|
||||||
pricing = self.PRICING.get(self.model, {"input": 0.15, "output": 0.60})
|
# Check for cached tokens (OpenAI returns this in newer API versions)
|
||||||
cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
|
cached_tokens = 0
|
||||||
|
if hasattr(response.usage, "prompt_tokens_details") and response.usage.prompt_tokens_details:
|
||||||
|
cached_tokens = getattr(response.usage.prompt_tokens_details, "cached_tokens", 0)
|
||||||
|
|
||||||
|
uncached_input = input_tokens - cached_tokens
|
||||||
|
|
||||||
|
pricing = self.PRICING.get(self.model, {"input": 0.15, "cached_input": 0.075, "output": 0.60})
|
||||||
|
cost = (
|
||||||
|
uncached_input * pricing["input"]
|
||||||
|
+ cached_tokens * pricing.get("cached_input", pricing["input"] * 0.5)
|
||||||
|
+ output_tokens * pricing["output"]
|
||||||
|
) / 1_000_000
|
||||||
|
|
||||||
self.total_tokens_used += total_tokens
|
self.total_tokens_used += total_tokens
|
||||||
self.total_cost_usd += cost
|
self.total_cost_usd += cost
|
||||||
|
self._cached_tokens += cached_tokens
|
||||||
|
|
||||||
metadata = {
|
return {
|
||||||
"model": self.model,
|
"model": self.model,
|
||||||
"input_tokens": input_tokens,
|
"input_tokens": input_tokens,
|
||||||
|
"cached_tokens": cached_tokens,
|
||||||
"output_tokens": output_tokens,
|
"output_tokens": output_tokens,
|
||||||
"total_tokens": total_tokens,
|
"total_tokens": total_tokens,
|
||||||
"cost_usd": cost,
|
"cost_usd": cost,
|
||||||
"latency_ms": int((time.time() - start_time) * 1000),
|
"latency_ms": int((time.time() - start_time) * 1000),
|
||||||
|
"batch_size": batch_size,
|
||||||
|
"tokens_per_review": total_tokens / batch_size if batch_size > 0 else 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
return result, metadata
|
def _parse_batch_response(
    self,
    batch_result: dict[str, Any] | list[dict[str, Any]],
    original_reviews: list[BatchReviewInput],
) -> list[LLMClassificationResponse]:
    """
    Parse batch response into individual review results.

    Args:
        batch_result: Decoded model output, either {"reviews": [...]} or a
            bare list of review objects
        original_reviews: The reviews that were sent, in request order

    Returns:
        One result per original review, in request order. Reviews the model
        did not return get a fallback response.
    """
    # Handle both formats: {"reviews": [...]} and direct list. The list check
    # must come first: calling .get() on a list raises AttributeError.
    if isinstance(batch_result, list):
        review_data = batch_result
    elif isinstance(batch_result, dict):
        review_data = batch_result.get("reviews") or []
    else:
        review_data = []

    # Create a lookup by review_index (position is used when the index is
    # missing or unusable; numeric strings such as "3" are accepted).
    results_by_index: dict[int, dict[str, Any]] = {}
    for position, entry in enumerate(review_data):
        if not isinstance(entry, dict):
            logger.warning(f"Ignoring non-object entry at position {position} in batch response")
            continue
        try:
            index = int(entry.get("review_index", position))
        except (TypeError, ValueError):
            index = position
        results_by_index[index] = entry

    results: list[LLMClassificationResponse] = []
    for i, original in enumerate(original_reviews):
        review_result = results_by_index.get(i)
        if review_result is None:
            # Missing review - create fallback
            logger.warning(f"Review index {i} missing from batch response, using fallback")
            results.append(create_fallback_response(original["text"]))
            continue

        spans = review_result.get("spans")
        if not isinstance(spans, list):
            spans = []

        review_summary = review_result.get("review_summary")
        if review_summary is None:
            # Neutral summary when the model omitted one.
            review_summary = {
                "dominant_valence": "V0",
                "dominant_domain": "O",
                "span_count": len(spans),
                "has_comparative": False,
                "has_entity": False,
            }

        # Convert to standard format
        results.append({"spans": spans, "review_summary": review_summary})

    return results
|
||||||
|
|
||||||
async def generate(
|
async def generate(
|
||||||
self,
|
self,
|
||||||
@@ -511,14 +925,16 @@ class OpenAIClient(LLMClientBase):
|
|||||||
|
|
||||||
|
|
||||||
class AnthropicClient(LLMClientBase):
|
class AnthropicClient(LLMClientBase):
|
||||||
"""Anthropic LLM client implementation."""
|
"""Anthropic LLM client implementation with batch support and prompt caching."""
|
||||||
|
|
||||||
# Pricing per 1M tokens (as of 2024)
|
# Pricing per 1M tokens (as of 2024)
|
||||||
|
# Cached input tokens are 90% cheaper with Anthropic
|
||||||
PRICING = {
|
PRICING = {
|
||||||
"claude-3-opus-20240229": {"input": 15.0, "output": 75.0},
|
"claude-3-opus-20240229": {"input": 15.0, "cached_input": 1.50, "output": 75.0},
|
||||||
"claude-3-sonnet-20240229": {"input": 3.0, "output": 15.0},
|
"claude-3-sonnet-20240229": {"input": 3.0, "cached_input": 0.30, "output": 15.0},
|
||||||
"claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
|
"claude-3-haiku-20240307": {"input": 0.25, "cached_input": 0.03, "output": 1.25},
|
||||||
"claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0},
|
"claude-3-5-sonnet-20241022": {"input": 3.0, "cached_input": 0.30, "output": 15.0},
|
||||||
|
"claude-sonnet-4-20250514": {"input": 3.0, "cached_input": 0.30, "output": 15.0},
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, config: Config):
|
def __init__(self, config: Config):
|
||||||
@@ -533,13 +949,16 @@ class AnthropicClient(LLMClientBase):
|
|||||||
review_text: str,
|
review_text: str,
|
||||||
profile: str = "standard",
|
profile: str = "standard",
|
||||||
) -> tuple[LLMClassificationResponse, dict[str, Any]]:
|
) -> tuple[LLMClassificationResponse, dict[str, Any]]:
|
||||||
"""Classify using Anthropic."""
|
"""Classify a single review using Anthropic."""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
# Use cache_control for prompt caching
|
||||||
|
system_content = self._build_cached_system(self.get_prompt())
|
||||||
|
|
||||||
response = await self.client.messages.create(
|
response = await self.client.messages.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
max_tokens=4096,
|
max_tokens=4096,
|
||||||
system=self.get_prompt(),
|
system=system_content,
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -554,30 +973,161 @@ class AnthropicClient(LLMClientBase):
|
|||||||
if not content:
|
if not content:
|
||||||
raise ValueError("Empty response from Anthropic")
|
raise ValueError("Empty response from Anthropic")
|
||||||
|
|
||||||
# Try to extract JSON from response
|
|
||||||
result = self._extract_json(content)
|
result = self._extract_json(content)
|
||||||
|
metadata = self._calculate_anthropic_costs(response, start_time)
|
||||||
|
|
||||||
# Calculate costs
|
return result, metadata
|
||||||
|
|
||||||
|
async def classify_batch(
    self,
    reviews: list[BatchReviewInput],
    profile: str = "standard",
) -> tuple[list[LLMClassificationResponse], dict[str, Any]]:
    """
    Classify multiple reviews in a single Anthropic call.

    Uses Anthropic's prompt caching with cache_control - the system prompt
    is cached after first call, reducing input costs by ~90%.

    Returns:
        Tuple of (one classification per input review, call metadata);
        an empty list with an error entry when no reviews are given.

    Raises:
        ValueError: The model returned no content, or no JSON could be
            extracted and nothing was recovered.
        PartialBatchResult: The JSON was malformed but some reviews were recovered.
    """
    if not reviews:
        return [], {"error": "No reviews provided"}

    start_time = time.time()
    review_count = len(reviews)

    user_prompt = self._build_batch_user_prompt(reviews)

    # The system prompt is the cacheable part of the request.
    system_content = self._build_cached_system(self.get_prompt(batch_mode=True))

    # Output budget: 500 tokens per review, capped at 16000.
    output_budget = min(16000, review_count * 500)

    response = await self.client.messages.create(
        model=self.model,
        max_tokens=output_budget,
        system=system_content,
        messages=[{"role": "user", "content": user_prompt}],
        temperature=self.config.llm_temperature,
    )

    content = response.content[0].text if response.content else ""
    if not content:
        raise ValueError("Empty response from Anthropic")

    # Costs are recorded before parsing so failed parses are still accounted for.
    metadata = self._calculate_anthropic_costs(response, start_time, batch_size=review_count)

    try:
        parsed = self._extract_json(content)
        return self._parse_batch_response(parsed, reviews), metadata
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass, so this covers both
        # decode failures and "no JSON found" errors.
        logger.warning(f"Full JSON parse failed: {e}, attempting partial recovery...")

        recovered, missing = self._extract_partial_batch_json(content, review_count)
        if not recovered:
            # Nothing salvageable: surface the original error.
            raise

        raise PartialBatchResult(
            message=f"Recovered {len(recovered)}/{len(reviews)} reviews from malformed JSON",
            partial_results=recovered,
            missing_indices=missing,
            metadata=metadata,
        )
|
||||||
|
|
||||||
|
def _build_cached_system(self, prompt: str) -> list[dict[str, Any]]:
|
||||||
|
"""Build system content with cache_control for prompt caching."""
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": prompt,
|
||||||
|
"cache_control": {"type": "ephemeral"},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
def _calculate_anthropic_costs(
|
||||||
|
self,
|
||||||
|
response: Any,
|
||||||
|
start_time: float,
|
||||||
|
batch_size: int = 1,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Calculate costs from Anthropic response, accounting for cached tokens."""
|
||||||
input_tokens = response.usage.input_tokens
|
input_tokens = response.usage.input_tokens
|
||||||
output_tokens = response.usage.output_tokens
|
output_tokens = response.usage.output_tokens
|
||||||
total_tokens = input_tokens + output_tokens
|
total_tokens = input_tokens + output_tokens
|
||||||
|
|
||||||
pricing = self.PRICING.get(self.model, {"input": 3.0, "output": 15.0})
|
# Anthropic returns cache info in usage
|
||||||
cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1_000_000
|
cached_tokens = getattr(response.usage, "cache_read_input_tokens", 0) or 0
|
||||||
|
cache_creation_tokens = getattr(response.usage, "cache_creation_input_tokens", 0) or 0
|
||||||
|
|
||||||
|
uncached_input = input_tokens - cached_tokens
|
||||||
|
|
||||||
|
pricing = self.PRICING.get(self.model, {"input": 3.0, "cached_input": 0.30, "output": 15.0})
|
||||||
|
cost = (
|
||||||
|
uncached_input * pricing["input"]
|
||||||
|
+ cached_tokens * pricing.get("cached_input", pricing["input"] * 0.1)
|
||||||
|
+ output_tokens * pricing["output"]
|
||||||
|
) / 1_000_000
|
||||||
|
|
||||||
self.total_tokens_used += total_tokens
|
self.total_tokens_used += total_tokens
|
||||||
self.total_cost_usd += cost
|
self.total_cost_usd += cost
|
||||||
|
self._cached_tokens += cached_tokens
|
||||||
|
|
||||||
metadata = {
|
return {
|
||||||
"model": self.model,
|
"model": self.model,
|
||||||
"input_tokens": input_tokens,
|
"input_tokens": input_tokens,
|
||||||
|
"cached_tokens": cached_tokens,
|
||||||
|
"cache_creation_tokens": cache_creation_tokens,
|
||||||
"output_tokens": output_tokens,
|
"output_tokens": output_tokens,
|
||||||
"total_tokens": total_tokens,
|
"total_tokens": total_tokens,
|
||||||
"cost_usd": cost,
|
"cost_usd": cost,
|
||||||
"latency_ms": int((time.time() - start_time) * 1000),
|
"latency_ms": int((time.time() - start_time) * 1000),
|
||||||
|
"batch_size": batch_size,
|
||||||
|
"tokens_per_review": total_tokens / batch_size if batch_size > 0 else 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
return result, metadata
|
def _parse_batch_response(
    self,
    batch_result: dict[str, Any] | list[dict[str, Any]],
    original_reviews: list[BatchReviewInput],
) -> list[LLMClassificationResponse]:
    """
    Parse batch response into individual review results.

    Args:
        batch_result: Decoded model output, either {"reviews": [...]} or a
            bare list of review objects
        original_reviews: The reviews that were sent, in request order

    Returns:
        One result per original review, in request order. Reviews the model
        did not return get a fallback response.
    """
    # Handle both formats: {"reviews": [...]} and direct list. The list check
    # must come first: calling .get() on a list raises AttributeError.
    if isinstance(batch_result, list):
        review_data = batch_result
    elif isinstance(batch_result, dict):
        review_data = batch_result.get("reviews") or []
    else:
        review_data = []

    # Create a lookup by review_index (position is used when the index is
    # missing or unusable; numeric strings such as "3" are accepted).
    results_by_index: dict[int, dict[str, Any]] = {}
    for position, entry in enumerate(review_data):
        if not isinstance(entry, dict):
            logger.warning(f"Ignoring non-object entry at position {position} in batch response")
            continue
        try:
            index = int(entry.get("review_index", position))
        except (TypeError, ValueError):
            index = position
        results_by_index[index] = entry

    results: list[LLMClassificationResponse] = []
    for i, original in enumerate(original_reviews):
        review_result = results_by_index.get(i)
        if review_result is None:
            # Missing review - create fallback
            logger.warning(f"Review index {i} missing from batch response, using fallback")
            results.append(create_fallback_response(original["text"]))
            continue

        spans = review_result.get("spans")
        if not isinstance(spans, list):
            spans = []

        review_summary = review_result.get("review_summary")
        if review_summary is None:
            # Neutral summary when the model omitted one.
            review_summary = {
                "dominant_valence": "V0",
                "dominant_domain": "O",
                "span_count": len(spans),
                "has_comparative": False,
                "has_entity": False,
            }

        results.append({"spans": spans, "review_summary": review_summary})

    return results
|
||||||
|
|
||||||
async def generate(
|
async def generate(
|
||||||
self,
|
self,
|
||||||
@@ -607,7 +1157,6 @@ class AnthropicClient(LLMClientBase):
|
|||||||
self.total_tokens_used += input_tokens + output_tokens
|
self.total_tokens_used += input_tokens + output_tokens
|
||||||
self.total_cost_usd += cost
|
self.total_cost_usd += cost
|
||||||
|
|
||||||
# Extract JSON from response (handles code blocks)
|
|
||||||
return self._extract_json_string(content)
|
return self._extract_json_string(content)
|
||||||
|
|
||||||
def _extract_json_string(self, content: str) -> str:
|
def _extract_json_string(self, content: str) -> str:
|
||||||
@@ -615,16 +1164,13 @@ class AnthropicClient(LLMClientBase):
|
|||||||
import re
|
import re
|
||||||
content = content.strip()
|
content = content.strip()
|
||||||
|
|
||||||
# If it starts with {, return as-is
|
|
||||||
if content.startswith("{"):
|
if content.startswith("{"):
|
||||||
return content
|
return content
|
||||||
|
|
||||||
# Try to find JSON in code blocks
|
|
||||||
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
|
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
|
||||||
if json_match:
|
if json_match:
|
||||||
return json_match.group(1)
|
return json_match.group(1)
|
||||||
|
|
||||||
# Try to find JSON object
|
|
||||||
json_match = re.search(r"\{[\s\S]*\}", content)
|
json_match = re.search(r"\{[\s\S]*\}", content)
|
||||||
if json_match:
|
if json_match:
|
||||||
return json_match.group(0)
|
return json_match.group(0)
|
||||||
@@ -635,26 +1181,162 @@ class AnthropicClient(LLMClientBase):
|
|||||||
"""Extract JSON from response, handling markdown code blocks."""
|
"""Extract JSON from response, handling markdown code blocks."""
|
||||||
content = content.strip()
|
content = content.strip()
|
||||||
|
|
||||||
# Try direct parse first
|
|
||||||
try:
|
try:
|
||||||
return json.loads(content)
|
return json.loads(content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Try to find JSON in code blocks
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
|
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
|
||||||
if json_match:
|
if json_match:
|
||||||
return json.loads(json_match.group(1))
|
return json.loads(json_match.group(1))
|
||||||
|
|
||||||
# Try to find JSON object
|
|
||||||
json_match = re.search(r"\{[\s\S]*\}", content)
|
json_match = re.search(r"\{[\s\S]*\}", content)
|
||||||
if json_match:
|
if json_match:
|
||||||
return json.loads(json_match.group(0))
|
return json.loads(json_match.group(0))
|
||||||
|
|
||||||
raise ValueError(f"Could not extract JSON from response: {content[:200]}")
|
raise ValueError(f"Could not extract JSON from response: {content[:200]}")
|
||||||
|
|
||||||
|
def _extract_partial_batch_json(
|
||||||
|
self, content: str, expected_count: int
|
||||||
|
) -> tuple[list[dict[str, Any]], list[int]]:
|
||||||
|
"""
|
||||||
|
Extract partial results from truncated/malformed batch JSON.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (successfully_parsed_reviews, missing_indices)
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
parsed_reviews: list[dict[str, Any]] = []
|
||||||
|
found_indices: set[int] = set()
|
||||||
|
|
||||||
|
# Pattern to match complete review objects with review_index
|
||||||
|
# Matches: {"review_index": N, ... } with balanced braces
|
||||||
|
review_pattern = r'\{\s*"review_index"\s*:\s*(\d+)[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
|
||||||
|
|
||||||
|
# Try to find all complete review objects
|
||||||
|
for match in re.finditer(review_pattern, content):
|
||||||
|
try:
|
||||||
|
# Extract the matched text and try to parse
|
||||||
|
obj_text = match.group(0)
|
||||||
|
|
||||||
|
# Try to parse as JSON - may need to fix trailing issues
|
||||||
|
try:
|
||||||
|
obj = json.loads(obj_text)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
# Try adding closing brace if truncated
|
||||||
|
continue
|
||||||
|
|
||||||
|
if "review_index" in obj and "spans" in obj:
|
||||||
|
idx = obj["review_index"]
|
||||||
|
if idx not in found_indices:
|
||||||
|
parsed_reviews.append(obj)
|
||||||
|
found_indices.add(idx)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Alternative: try parsing incrementally by finding review_index markers
|
||||||
|
if len(parsed_reviews) < expected_count // 2:
|
||||||
|
# Find all review_index positions
|
||||||
|
index_matches = list(re.finditer(r'"review_index"\s*:\s*(\d+)', content))
|
||||||
|
|
||||||
|
for i, match in enumerate(index_matches):
|
||||||
|
idx = int(match.group(1))
|
||||||
|
if idx in found_indices:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find the start of this review object
|
||||||
|
start = content.rfind('{', 0, match.start())
|
||||||
|
if start == -1:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find the end - either next review_index or end of content
|
||||||
|
if i + 1 < len(index_matches):
|
||||||
|
end_search = index_matches[i + 1].start()
|
||||||
|
else:
|
||||||
|
end_search = len(content)
|
||||||
|
|
||||||
|
# Find the closing brace
|
||||||
|
obj_text = content[start:end_search]
|
||||||
|
|
||||||
|
# Count braces to find proper end
|
||||||
|
brace_count = 0
|
||||||
|
end_pos = 0
|
||||||
|
for j, char in enumerate(obj_text):
|
||||||
|
if char == '{':
|
||||||
|
brace_count += 1
|
||||||
|
elif char == '}':
|
||||||
|
brace_count -= 1
|
||||||
|
if brace_count == 0:
|
||||||
|
end_pos = j + 1
|
||||||
|
break
|
||||||
|
|
||||||
|
if end_pos > 0:
|
||||||
|
try:
|
||||||
|
obj = json.loads(obj_text[:end_pos])
|
||||||
|
# Validate required fields and data integrity
|
||||||
|
if self._validate_recovered_review(obj):
|
||||||
|
review_idx = obj["review_index"]
|
||||||
|
if review_idx not in found_indices:
|
||||||
|
parsed_reviews.append(obj)
|
||||||
|
found_indices.add(review_idx)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Determine missing indices
|
||||||
|
missing_indices = [i for i in range(expected_count) if i not in found_indices]
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Partial JSON recovery: {len(parsed_reviews)}/{expected_count} reviews recovered, "
|
||||||
|
f"{len(missing_indices)} missing"
|
||||||
|
)
|
||||||
|
|
||||||
|
return parsed_reviews, missing_indices
|
||||||
|
|
||||||
|
def _validate_recovered_review(self, obj: dict[str, Any]) -> bool:
|
||||||
|
"""
|
||||||
|
Validate a recovered review has all required fields with valid data.
|
||||||
|
|
||||||
|
Returns True only if the review is complete and usable.
|
||||||
|
Rejects:
|
||||||
|
- Missing review_index or spans
|
||||||
|
- Empty spans array
|
||||||
|
- Spans missing required fields (text, urt_primary, valence, intensity)
|
||||||
|
- Empty field values
|
||||||
|
"""
|
||||||
|
# Check required top-level fields
|
||||||
|
if "review_index" not in obj:
|
||||||
|
return False
|
||||||
|
if not isinstance(obj.get("review_index"), int):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if "spans" not in obj:
|
||||||
|
return False
|
||||||
|
if not isinstance(obj["spans"], list):
|
||||||
|
return False
|
||||||
|
if len(obj["spans"]) == 0:
|
||||||
|
# Empty spans = no useful classification data
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Validate each span has required fields with non-empty values
|
||||||
|
required_span_fields = ["text", "urt_primary", "valence", "intensity"]
|
||||||
|
for span in obj["spans"]:
|
||||||
|
if not isinstance(span, dict):
|
||||||
|
return False
|
||||||
|
for field in required_span_fields:
|
||||||
|
if field not in span:
|
||||||
|
return False
|
||||||
|
if not span[field]: # Empty string or None
|
||||||
|
return False
|
||||||
|
|
||||||
|
# review_summary is optional but if present should be a dict
|
||||||
|
if "review_summary" in obj and not isinstance(obj["review_summary"], dict):
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
async def close(self) -> None:
|
async def close(self) -> None:
|
||||||
"""Close the Anthropic client."""
|
"""Close the Anthropic client."""
|
||||||
await self.client.close()
|
await self.client.close()
|
||||||
|
|||||||
@@ -0,0 +1,480 @@
|
|||||||
|
"""
|
||||||
|
Dynamic prompt builder for URT classification.
|
||||||
|
|
||||||
|
Fetches taxonomy from database to build the system prompt,
|
||||||
|
ensuring single source of truth and including examples.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Entity extraction rules for staff recognition
|
||||||
|
ENTITY_EXTRACTION_RULES = """
|
||||||
|
## ENTITY EXTRACTION (Staff Recognition)
|
||||||
|
|
||||||
|
When a span mentions a SPECIFIC PERSON by name, extract:
|
||||||
|
- entity: The person's name exactly as written
|
||||||
|
- entity_type: "staff" for employees, "customer" for other people mentioned
|
||||||
|
|
||||||
|
### EXTRACT (set entity + entity_type):
|
||||||
|
- "Miglė was amazing" → entity: "Miglė", entity_type: "staff"
|
||||||
|
- "Thank you Carlos!" → entity: "Carlos", entity_type: "staff"
|
||||||
|
- "Adrian helped us" → entity: "Adrian", entity_type: "staff"
|
||||||
|
- "Ačiū Artūrui" → entity: "Artūrui", entity_type: "staff"
|
||||||
|
- "bartender Eivydas" → entity: "Eivydas", entity_type: "staff"
|
||||||
|
- "our server Maria" → entity: "Maria", entity_type: "staff"
|
||||||
|
|
||||||
|
### DO NOT EXTRACT (keep entity: null):
|
||||||
|
- "The bartender was rude" → no specific name, keep null
|
||||||
|
- "Staff was friendly" → generic reference, keep null
|
||||||
|
- "Service was great" → no person mentioned, keep null
|
||||||
|
- "The manager helped" → role only, no name, keep null
|
||||||
|
|
||||||
|
### Name Recognition Tips:
|
||||||
|
- Look for CAPITALIZED words that are NOT at sentence start
|
||||||
|
- Common patterns: "[Name] was/is [adjective]", "thank [Name]", "[role] [Name]"
|
||||||
|
- International names: Miglė, Eivydas, Žydrė, Artūras (Lithuanian), Carlos, María (Spanish), etc.
|
||||||
|
- When a name appears near: bartender, waiter, server, staff, manager, helped, thank, amazing, great, rude
|
||||||
|
|
||||||
|
IMPORTANT: When in doubt, extract the name. Staff recognition is valuable - false positives are acceptable.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Static parts of the prompt that don't change
|
||||||
|
PROMPT_HEADER = """You are a review classification system using URT (Universal Review Taxonomy) v5.1.
|
||||||
|
|
||||||
|
Your task is to extract semantic spans from customer reviews and classify each span independently.
|
||||||
|
|
||||||
|
## SPAN EXTRACTION RULES
|
||||||
|
|
||||||
|
**CRITICAL: Use TOPIC-BASED splitting, NOT sentence-based splitting.**
|
||||||
|
|
||||||
|
A span = all consecutive text about the SAME topic/domain, regardless of sentence count.
|
||||||
|
|
||||||
|
### When to KEEP TOGETHER (same span):
|
||||||
|
- Multiple sentences about the same topic: "The food was great. I loved the pasta. The sauce was perfect." → ONE span (all about Offering)
|
||||||
|
- Cause and effect: "The wait was long because they were understaffed" → ONE span
|
||||||
|
- Elaboration: "Staff was rude. They ignored us for 20 minutes." → ONE span (both about People)
|
||||||
|
- Single-topic reviews: Even if 5 sentences, if all about food → ONE span
|
||||||
|
|
||||||
|
### When to SPLIT (separate spans):
|
||||||
|
- Contrasting conjunctions that change topic: "Food was great BUT service was slow" → TWO spans
|
||||||
|
- Domain change: food (O) → staff (P) → ambiance (E) = split at each change
|
||||||
|
- Target change: "The waiter was nice but the manager was rude" → TWO spans (different people)
|
||||||
|
|
||||||
|
### Examples:
|
||||||
|
- "Amazing food. Best burger ever. Fries were crispy too." → 1 span (all Offering, V+)
|
||||||
|
- "Food was great but we waited an hour." → 2 spans (Offering V+, Journey V-)
|
||||||
|
- "I've been coming here for years. Always consistent quality." → 1 span (Relationship)
|
||||||
|
- "The staff are lovely and amazing with kids. More highchairs are definitely needed though." → 2 spans (People V+, Access V-)
|
||||||
|
|
||||||
|
**Guardrails**:
|
||||||
|
- Prefer FEWER, LARGER spans over many small ones
|
||||||
|
- Most reviews should have 1-3 spans, rarely more
|
||||||
|
- Min 1 span per review
|
||||||
|
- Spans must be non-overlapping
|
||||||
|
|
||||||
|
## CRITICAL CLASSIFICATION RULES (Common Mistakes to Avoid)
|
||||||
|
|
||||||
|
### RULE 1: Money/Price → ALWAYS use V codes (Value)
|
||||||
|
Any mention of: price, cost, fee, charge, €, $, deposit, refund, expensive, cheap, affordable
|
||||||
|
- ✅ "50€ extra" → V1.03 Hidden Costs
|
||||||
|
- ✅ "good price" → V1.01 Price Level
|
||||||
|
- ❌ NEVER use P codes for pricing (P is for People/staff behavior)
|
||||||
|
|
||||||
|
### RULE 2: Staff Behavior → ALWAYS use P codes (People)
|
||||||
|
Any mention of: friendly, rude, helpful, patient, amable, nett, simpático, attentive
|
||||||
|
- ✅ "staff was friendly" → P1.01 Warmth
|
||||||
|
- ✅ "rude employee" → P1.02 Respect
|
||||||
|
- ❌ NEVER use A codes for staff behavior (A is for Access/availability)
|
||||||
|
|
||||||
|
### RULE 3: Scam/Fraud/Deception → ALWAYS use R codes (Relationship)
|
||||||
|
Any mention of: scam, estafa, fraud, lied, cheat, dishonest, robbery, Abzocker
|
||||||
|
- ✅ "felt scammed" → R1.02 Ethics
|
||||||
|
- ✅ "they lied" → R1.01 Honesty
|
||||||
|
- ❌ NEVER use P or V codes for ethical issues
|
||||||
|
|
||||||
|
### RULE 4: Location/Finding → Use A codes (Access)
|
||||||
|
Difficulty finding a place, shuttle, meeting point, confusing directions
|
||||||
|
- ✅ "couldn't find shuttle" → A1.04 Wayfinding
|
||||||
|
- ✅ "far from airport" → A4.01 Location
|
||||||
|
- ❌ Don't confuse with J1.02 Punctuality (which is about being on time)
|
||||||
|
|
||||||
|
### RULE 5: Wait Time vs Punctuality
|
||||||
|
- J1.01 Speed = how FAST service is ("waited 2 hours", "slow service")
|
||||||
|
- J1.02 Punctuality = being ON TIME vs scheduled ("arrived late", "delayed")
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
PROMPT_BATCH_OUTPUT_FORMAT = """
|
||||||
|
## BATCH OUTPUT FORMAT
|
||||||
|
|
||||||
|
When given multiple reviews, return a JSON object with a "reviews" array.
|
||||||
|
Each review in the array contains its own spans and summary.
|
||||||
|
|
||||||
|
{
|
||||||
|
"reviews": [
|
||||||
|
{
|
||||||
|
"review_index": 0,
|
||||||
|
"spans": [
|
||||||
|
{
|
||||||
|
"span_index": 0,
|
||||||
|
"span_text": "exact text from this review",
|
||||||
|
"span_start": 0,
|
||||||
|
"span_end": 25,
|
||||||
|
"urt_primary": "P1.01",
|
||||||
|
"urt_secondary": [],
|
||||||
|
"valence": "V+",
|
||||||
|
"intensity": "I2",
|
||||||
|
"specificity": "S2",
|
||||||
|
"actionability": "A1",
|
||||||
|
"temporal": "TC",
|
||||||
|
"evidence": "ES",
|
||||||
|
"comparative": "CR-N",
|
||||||
|
"is_primary": true,
|
||||||
|
"confidence": "high",
|
||||||
|
"entity": "Maria",
|
||||||
|
"entity_type": "staff",
|
||||||
|
"usn": "URT:S:P1.01:+2:21TC.ES.N"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"review_summary": {
|
||||||
|
"dominant_valence": "V+",
|
||||||
|
"dominant_domain": "P",
|
||||||
|
"span_count": 1,
|
||||||
|
"has_comparative": false,
|
||||||
|
"has_entity": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"review_index": 1,
|
||||||
|
"spans": [ ... ],
|
||||||
|
"review_summary": { ... }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
CRITICAL RULES FOR BATCH PROCESSING:
|
||||||
|
1. Process each review INDEPENDENTLY - do not mix content between reviews
|
||||||
|
2. review_index MUST match the input order (0, 1, 2, ...)
|
||||||
|
3. span_start and span_end are relative to THAT review's text only
|
||||||
|
4. If you see the same entity (e.g., staff name "Maria") in multiple reviews, use consistent spelling
|
||||||
|
5. Output ALL reviews in the batch - never skip any
|
||||||
|
6. Each review must have at least 1 span
|
||||||
|
"""
|
||||||
|
|
||||||
|
PROMPT_SINGLE_OUTPUT_FORMAT = """
|
||||||
|
## SINGLE REVIEW OUTPUT FORMAT
|
||||||
|
|
||||||
|
Return valid JSON matching this schema. No markdown, no explanations.
|
||||||
|
|
||||||
|
{
|
||||||
|
"spans": [
|
||||||
|
{
|
||||||
|
"span_index": 0,
|
||||||
|
"span_text": "exact text from review",
|
||||||
|
"span_start": 0,
|
||||||
|
"span_end": 25,
|
||||||
|
"urt_primary": "O1.01",
|
||||||
|
"urt_secondary": [],
|
||||||
|
"valence": "V+",
|
||||||
|
"intensity": "I2",
|
||||||
|
"specificity": "S2",
|
||||||
|
"actionability": "A1",
|
||||||
|
"temporal": "TC",
|
||||||
|
"evidence": "ES",
|
||||||
|
"comparative": "CR-N",
|
||||||
|
"is_primary": true,
|
||||||
|
"confidence": "high",
|
||||||
|
"entity": null,
|
||||||
|
"entity_type": null,
|
||||||
|
"relation_type": null,
|
||||||
|
"related_span_index": null,
|
||||||
|
"usn": "URT:S:O1.01:+2:21TC.ES.N"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"review_summary": {
|
||||||
|
"dominant_valence": "V+",
|
||||||
|
"dominant_domain": "O",
|
||||||
|
"span_count": 1,
|
||||||
|
"has_comparative": false,
|
||||||
|
"has_entity": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
PROMPT_DIMENSIONS = """
|
||||||
|
## DIMENSION CODES
|
||||||
|
|
||||||
|
### Valence
|
||||||
|
- V+ : Positive sentiment
|
||||||
|
- V- : Negative sentiment
|
||||||
|
- V0 : Neutral/factual
|
||||||
|
- V± : Mixed within the span
|
||||||
|
|
||||||
|
### Intensity
|
||||||
|
- I1 : Low ("okay", "fine", "decent")
|
||||||
|
- I2 : Moderate ("good", "bad", "slow")
|
||||||
|
- I3 : High ("amazing", "terrible", "unacceptable")
|
||||||
|
|
||||||
|
### Specificity
|
||||||
|
- S1 : Vague ("it was bad")
|
||||||
|
- S2 : Some detail ("the food was cold")
|
||||||
|
- S3 : Precise ("waited 45 minutes for appetizers")
|
||||||
|
|
||||||
|
### Actionability
|
||||||
|
- A1 : No clear action possible
|
||||||
|
- A2 : Possible actions, unclear which
|
||||||
|
- A3 : Clear, specific action ("train staff on X", "fix Y")
|
||||||
|
|
||||||
|
### Temporal
|
||||||
|
- TC : Current visit (default when no markers)
|
||||||
|
- TR : Recent pattern ("lately", "recently", "again")
|
||||||
|
- TH : Historical ("for years", "always", "used to")
|
||||||
|
- TF : Future ("won't return", "next time", "I expect")
|
||||||
|
|
||||||
|
### Evidence
|
||||||
|
- ES : Stated explicitly in text (default)
|
||||||
|
- EI : Inferred logically (not stated, but entailed)
|
||||||
|
- EC : Contextual (depends on surrounding text)
|
||||||
|
|
||||||
|
### Comparative
|
||||||
|
- CR-N : No comparison (default)
|
||||||
|
- CR-B : Better than alternatives
|
||||||
|
- CR-W : Worse than alternatives
|
||||||
|
- CR-S : Same as alternatives
|
||||||
|
|
||||||
|
## PRIMARY SPAN SELECTION
|
||||||
|
|
||||||
|
Mark exactly ONE span as is_primary=true using this order:
|
||||||
|
1. Highest intensity (I3 > I2 > I1)
|
||||||
|
2. Tie-break: negative over positive (V- > V± > V0 > V+)
|
||||||
|
3. Tie-break: earliest span_index
|
||||||
|
|
||||||
|
## USN (URT String Notation)
|
||||||
|
|
||||||
|
Generate a USN string for each span:
|
||||||
|
```
|
||||||
|
URT:S:{primary}[+{sec1}][+{sec2}]:{valence_sign}{intensity_num}:{S#}{A#}{temporal}.{evidence}.{CR_suffix}
|
||||||
|
```
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- `URT:S:J1.03:-2:22TC.ES.N` (J1.03, V-, I2, S2, A2, TC, ES, CR-N)
|
||||||
|
- `URT:S:P1.01+O2.03:+3:33TR.ES.B` (P1.01 primary, O2.03 secondary, V+, I3, S3, A3, TR, ES, CR-B)
|
||||||
|
|
||||||
|
Valence encoding: + for V+, - for V-, 0 for V0, ± for V±
|
||||||
|
CR suffix: N=CR-N, B=CR-B, W=CR-W, S=CR-S"""
|
||||||
|
|
||||||
|
# Domain-specific warnings to include
|
||||||
|
DOMAIN_WARNINGS = {
|
||||||
|
"V": "USE FOR ALL PRICE/COST/FEE/MONEY MENTIONS",
|
||||||
|
"P": "USE FOR STAFF BEHAVIOR ONLY, NOT PRICING",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class PromptBuilder:
    """
    Builds the classification prompt dynamically from database taxonomy.

    The taxonomy section is fetched once and shared by the single-review and
    batch prompts. Whenever the taxonomy is rebuilt, both assembled prompts
    are dropped so neither mode can keep serving text built from the old
    taxonomy.

    Usage:
        builder = PromptBuilder(db_pool)
        prompt = await builder.build()  # For single review
        prompt = await builder.build(batch_mode=True)  # For batch processing
    """

    def __init__(self, pool: asyncpg.Pool):
        self.pool = pool
        self._cached_prompt_single: str | None = None
        self._cached_prompt_batch: str | None = None
        self._cached_taxonomy: str | None = None

    async def build(self, force_refresh: bool = False, batch_mode: bool = False) -> str:
        """
        Build the complete system prompt from database taxonomy.

        Args:
            force_refresh: If True, rebuild even if cached
            batch_mode: If True, include batch output format

        Returns:
            Complete system prompt string
        """
        # Check if we can use cached version
        if not force_refresh:
            cached = self._cached_prompt_batch if batch_mode else self._cached_prompt_single
            if cached:
                return cached

        # Taxonomy section is shared between single and batch prompts
        taxonomy = await self._ensure_taxonomy(force_refresh)

        # Combine all parts with appropriate output format
        output_format = PROMPT_BATCH_OUTPUT_FORMAT if batch_mode else PROMPT_SINGLE_OUTPUT_FORMAT
        prompt = (
            PROMPT_HEADER
            + taxonomy
            + ENTITY_EXTRACTION_RULES
            + PROMPT_DIMENSIONS
            + output_format
        )

        # Cache it
        if batch_mode:
            self._cached_prompt_batch = prompt
        else:
            self._cached_prompt_single = prompt

        logger.info(f"Built {'batch' if batch_mode else 'single'} classification prompt")
        return prompt

    async def build_cacheable_parts(self) -> tuple[str, str]:
        """
        Build the prompt split into cacheable (static) and dynamic parts.

        For prompt caching, we want to separate:
        - Static part (taxonomy, rules) - can be cached
        - Dynamic part (output format) - varies by mode

        Returns:
            Tuple of (cacheable_prefix, suffix_for_batch)
        """
        taxonomy = await self._ensure_taxonomy()

        # Static cacheable prefix (same for all calls)
        cacheable_prefix = (
            PROMPT_HEADER
            + taxonomy
            + ENTITY_EXTRACTION_RULES
            + PROMPT_DIMENSIONS
        )

        return cacheable_prefix, PROMPT_BATCH_OUTPUT_FORMAT

    async def _ensure_taxonomy(self, force_refresh: bool = False) -> str:
        """Return the taxonomy section, fetching and rebuilding it when needed."""
        if self._cached_taxonomy and not force_refresh:
            return self._cached_taxonomy

        domains = await self._fetch_domains()
        subcodes = await self._fetch_subcodes()
        self._cached_taxonomy = self._build_taxonomy_section(domains, subcodes)

        # Prompts assembled from the previous taxonomy are stale in BOTH
        # modes; without this, a forced refresh in one mode would leave the
        # other mode serving the old taxonomy indefinitely.
        self._cached_prompt_single = None
        self._cached_prompt_batch = None

        logger.info(f"Built taxonomy section with {len(subcodes)} subcodes")
        return self._cached_taxonomy

    async def _fetch_domains(self) -> list[dict[str, Any]]:
        """Fetch domain definitions from database."""
        query = """
            SELECT code, name, description
            FROM pipeline.urt_domains
            ORDER BY code
        """
        rows = await self.pool.fetch(query)
        return [dict(row) for row in rows]

    async def _fetch_subcodes(self) -> list[dict[str, Any]]:
        """Fetch subcode definitions with examples from database."""
        query = """
            SELECT
                code,
                name,
                definition,
                positive_example,
                negative_example
            FROM pipeline.urt_subcodes
            ORDER BY code
        """
        rows = await self.pool.fetch(query)
        return [dict(row) for row in rows]

    def _build_taxonomy_section(
        self,
        domains: list[dict[str, Any]],
        subcodes: list[dict[str, Any]]
    ) -> str:
        """
        Build the taxonomy section of the prompt.

        Args:
            domains: Rows with "code" and "name" keys, one per domain.
            subcodes: Rows with "code", "name", "definition" and optional
                "positive_example" / "negative_example" keys.

        Returns:
            Markdown-style text: one "###" heading per domain followed by
            one line per subcode and a blank separator line.
        """
        # Group subcodes by domain; the first character of a subcode's code
        # is used as its domain code.
        subcodes_by_domain: dict[str, list[dict]] = {}
        for subcode in subcodes:
            subcodes_by_domain.setdefault(subcode["code"][0], []).append(subcode)

        # Build the section
        lines = ["## URT TAXONOMY (Use EXACT codes from database)", ""]

        for domain in domains:
            code = domain["code"]
            name = domain["name"]
            domain_subcodes = subcodes_by_domain.get(code, [])

            # Domain header with warning if applicable
            warning = DOMAIN_WARNINGS.get(code, "")
            if warning:
                lines.append(f"### {code} - {name.upper()} ({len(domain_subcodes)} codes) ⚠️ {warning}")
            else:
                lines.append(f"### {code} - {name.upper()} ({len(domain_subcodes)} codes)")

            # Add each subcode with definition and examples
            for sc in domain_subcodes:
                sc_code = sc["code"]
                sc_name = sc["name"]
                sc_def = sc["definition"] or sc_name  # fall back to the name
                pos_ex = sc.get("positive_example")
                neg_ex = sc.get("negative_example")

                # Main line: code, name, definition
                line = f"{sc_code} {sc_name}: {sc_def}"

                # Add examples if available (helps LLM distinguish)
                if pos_ex and neg_ex:
                    line += f' [+"{pos_ex}" / -"{neg_ex}"]'
                elif pos_ex:
                    line += f' [+"{pos_ex}"]'
                elif neg_ex:
                    line += f' [-"{neg_ex}"]'

                lines.append(line)

            lines.append("")  # Blank line between domains

        return "\n".join(lines)

    def invalidate_cache(self) -> None:
        """Invalidate the cached prompt, forcing rebuild on next call."""
        self._cached_prompt_single = None
        self._cached_prompt_batch = None
        self._cached_taxonomy = None
|
||||||
|
|
||||||
|
|
||||||
|
# Global prompt cache for when DB is not available
|
||||||
|
_static_prompt_cache: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
async def build_prompt_from_db(pool: asyncpg.Pool) -> str:
    """
    One-shot helper: create a PromptBuilder for ``pool`` and build a prompt.

    Args:
        pool: Database connection pool

    Returns:
        The prompt produced by ``PromptBuilder.build()`` with its default
        arguments.
    """
    return await PromptBuilder(pool).build()
|
||||||
|
|
||||||
|
|
||||||
|
def get_static_fallback_prompt() -> str:
    """
    Return the hardcoded prompt for use when the database is unavailable.

    The prompt is imported lazily on first use and memoized in the
    module-level ``_static_prompt_cache``.
    This should only be used in testing or when DB connection fails.
    """
    global _static_prompt_cache

    if _static_prompt_cache is not None:
        return _static_prompt_cache

    # Deferred import: only needed on the first call.
    from reviewiq_pipeline.services.llm_client import SYSTEM_PROMPT

    _static_prompt_cache = SYSTEM_PROMPT
    return _static_prompt_cache
|
||||||
@@ -0,0 +1,375 @@
|
|||||||
|
"""
|
||||||
|
Language-agnostic review router for cost-optimized LLM classification.
|
||||||
|
|
||||||
|
Routes reviews to different processing paths based on structural signals only:
|
||||||
|
- SKIP: Extremely low-value reviews (skip LLM entirely, assign generic code)
|
||||||
|
- CHEAP_MODEL: Short, simple reviews (use Haiku for classification)
|
||||||
|
- FULL_MODEL: Complex reviews (use Sonnet for full classification)
|
||||||
|
|
||||||
|
IMPORTANT: All routing decisions use ONLY language-agnostic signals:
|
||||||
|
- Word count / character count (numeric)
|
||||||
|
- Presence of numbers in text (pattern-based)
|
||||||
|
- Sentence count (punctuation-based)
|
||||||
|
- Emoji-only detection (pattern-based)
|
||||||
|
- Star rating (numeric)
|
||||||
|
|
||||||
|
NO hardcoded word lists (like "great", "bueno", "gut") are used because:
|
||||||
|
- Reviews span 7+ languages (Spanish, English, Dutch, German, Polish, Finnish, Danish, etc.)
|
||||||
|
- Typography errors are common
|
||||||
|
- False negatives (skipping valuable reviews) are worse than false positives
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from reviewiq_pipeline.contracts import ReviewToClassify
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class RoutingTier(Enum):
    """Processing path a review is routed to."""

    # Skip LLM, assign generic URT code
    SKIP = "skip"
    # Use fast/cheap model (Haiku)
    CHEAP_MODEL = "cheap"
    # Use full model (Sonnet)
    FULL_MODEL = "full"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RoutingDecision:
    """Result of routing decision for a review."""

    # Tier the review was assigned to
    tier: RoutingTier
    # Explanation of why that tier was chosen
    reason: str
    # Signals the decision was based on. Values are heterogeneous, hence
    # `object` (the builtin function `any` is not a type).
    signals: dict[str, object]
    # For SKIP tier, pre-assign the generic classification
    skip_classification: dict | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RouterConfig:
    """Tunable thresholds for the review router."""

    # --- SKIP tier (very conservative - prefer false positives) ---
    skip_max_words: int = 1
    skip_max_chars: int = 15
    # Only skip if rating is 1 or 5
    skip_require_extreme_rating: bool = True

    # --- CHEAP_MODEL tier ---
    cheap_max_words: int = 10
    cheap_max_chars: int = 100

    # --- Signals that force FULL_MODEL regardless of length ---
    full_model_if_has_numbers: bool = True
    full_model_if_multiple_sentences: bool = True
    full_model_min_sentences: int = 2
|
||||||
|
|
||||||
|
|
||||||
|
class ReviewRouter:
|
||||||
|
"""
|
||||||
|
Routes reviews to appropriate processing tier using language-agnostic signals.
|
||||||
|
|
||||||
|
Design principles:
|
||||||
|
- Conservative: Prefer false positives (processing simple reviews fully)
|
||||||
|
over false negatives (skipping valuable reviews)
|
||||||
|
- Language-agnostic: No word lists, only structural/numeric signals
|
||||||
|
- Transparent: Every decision includes the signals used
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Pattern to detect numbers (dates, amounts, room numbers, etc.)
|
||||||
|
NUMBER_PATTERN = re.compile(r'\d+')
|
||||||
|
|
||||||
|
# Pattern for sentence-ending punctuation (language-agnostic)
|
||||||
|
SENTENCE_END_PATTERN = re.compile(r'[.!?。!?]+')
|
||||||
|
|
||||||
|
# Emoji pattern. The commonly copied range U+24C2-U+1F251 is deliberately
# NOT used: it spans CJK ideographs, kana, Hangul and many other scripts,
# which made reviews written in those languages look "emoji-only".
# NOTE(review): the original comment says TextProcessor uses the same
# pattern; it is not visible here - check whether it needs the same fix.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001FAFF"  # supplemental symbols & extended pictographs
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (e.g. star)
    "\U000024C2"  # circled M
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)
|
||||||
|
|
||||||
|
# Generic classification for skipped reviews
|
||||||
|
GENERIC_POSITIVE = {
|
||||||
|
"urt_primary": "V4.03", # Overall Satisfaction - General
|
||||||
|
"valence": "V+",
|
||||||
|
"intensity": "I1",
|
||||||
|
"confidence": "low",
|
||||||
|
"skip_reason": "auto_routed_positive",
|
||||||
|
}
|
||||||
|
|
||||||
|
GENERIC_NEGATIVE = {
|
||||||
|
"urt_primary": "V4.03", # Overall Satisfaction - General
|
||||||
|
"valence": "V-",
|
||||||
|
"intensity": "I1",
|
||||||
|
"confidence": "low",
|
||||||
|
"skip_reason": "auto_routed_negative",
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, config: RouterConfig | None = None):
    """
    Create a router.

    Args:
        config: Thresholds to use; a default RouterConfig is created when
            none (or a falsy value) is supplied.
    """
    self.config = config if config else RouterConfig()
    # Number of reviews routed to each tier, keyed by RoutingTier value.
    self._stats = dict.fromkeys(("skip", "cheap", "full"), 0)
|
||||||
|
|
||||||
|
def route(self, review: ReviewToClassify) -> RoutingDecision:
    """
    Pick the processing tier for one review and record it in the stats.

    Args:
        review: Review to route. The first non-empty value of
            "text_normalized" / "text" is used as the text; "rating"
            defaults to 3 when the key is absent.

    Returns:
        RoutingDecision with tier, reason, and signals
    """
    # Prefer normalized text, fall back to raw text, then to empty string.
    text = ""
    for key in ("text_normalized", "text"):
        candidate = review.get(key)
        if candidate:
            text = candidate
            break
    rating = review.get("rating", 3)

    # Language-agnostic signals drive the (conservative) decision logic.
    decision = self._make_decision(self._extract_signals(text, rating), rating)

    # Per-tier counter, keyed by the tier's string value.
    self._stats[decision.tier.value] += 1
    return decision
|
||||||
|
|
||||||
|
def route_batch(
    self,
    reviews: list[ReviewToClassify]
) -> dict[RoutingTier, list[ReviewToClassify]]:
    """
    Route every review and bucket the reviews by the tier they were given.

    Each review is mutated in place: its routing decision is stored under
    the "_routing" key for downstream use.

    Args:
        reviews: List of reviews to route

    Returns:
        Dictionary mapping tiers to lists of reviews
    """
    buckets = {
        tier: []
        for tier in (RoutingTier.SKIP, RoutingTier.CHEAP_MODEL, RoutingTier.FULL_MODEL)
    }

    for item in reviews:
        outcome = self.route(item)
        item["_routing"] = outcome
        buckets[outcome.tier].append(item)

    skipped = len(buckets[RoutingTier.SKIP])
    cheap = len(buckets[RoutingTier.CHEAP_MODEL])
    full = len(buckets[RoutingTier.FULL_MODEL])
    logger.info(
        f"Routed {len(reviews)} reviews: "
        f"SKIP={skipped}, "
        f"CHEAP={cheap}, "
        f"FULL={full}"
    )

    return buckets
|
||||||
|
|
||||||
|
def _extract_signals(self, text: str, rating: int) -> dict[str, any]:
|
||||||
|
"""
|
||||||
|
Extract language-agnostic signals from review text.
|
||||||
|
|
||||||
|
All signals are structural/numeric, never word-based.
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return {
|
||||||
|
"word_count": 0,
|
||||||
|
"char_count": 0,
|
||||||
|
"has_numbers": False,
|
||||||
|
"sentence_count": 0,
|
||||||
|
"emoji_count": 0,
|
||||||
|
"is_emoji_only": False,
|
||||||
|
"rating": rating,
|
||||||
|
"is_extreme_rating": rating in (1, 5),
|
||||||
|
}
|
||||||
|
|
||||||
|
words = text.split()
|
||||||
|
word_count = len(words)
|
||||||
|
char_count = len(text)
|
||||||
|
|
||||||
|
# Check for numbers (dates, amounts, room numbers - often signal specific details)
|
||||||
|
has_numbers = bool(self.NUMBER_PATTERN.search(text))
|
||||||
|
|
||||||
|
# Count sentences by punctuation
|
||||||
|
sentences = self.SENTENCE_END_PATTERN.split(text)
|
||||||
|
sentence_count = len([s for s in sentences if s.strip()])
|
||||||
|
|
||||||
|
# Count emoji
|
||||||
|
emoji_matches = self.EMOJI_PATTERN.findall(text)
|
||||||
|
emoji_count = len(emoji_matches)
|
||||||
|
|
||||||
|
# Check if text is emoji-only (after stripping whitespace)
|
||||||
|
text_without_emoji = self.EMOJI_PATTERN.sub("", text).strip()
|
||||||
|
is_emoji_only = emoji_count > 0 and len(text_without_emoji) == 0
|
||||||
|
|
||||||
|
return {
|
||||||
|
"word_count": word_count,
|
||||||
|
"char_count": char_count,
|
||||||
|
"has_numbers": has_numbers,
|
||||||
|
"sentence_count": sentence_count,
|
||||||
|
"emoji_count": emoji_count,
|
||||||
|
"is_emoji_only": is_emoji_only,
|
||||||
|
"rating": rating,
|
||||||
|
"is_extreme_rating": rating in (1, 5),
|
||||||
|
}
|
||||||
|
|
||||||
|
def _make_decision(
|
||||||
|
self,
|
||||||
|
signals: dict[str, any],
|
||||||
|
rating: int
|
||||||
|
) -> RoutingDecision:
|
||||||
|
"""
|
||||||
|
Make routing decision based on signals.
|
||||||
|
|
||||||
|
Decision order (conservative):
|
||||||
|
1. Check for FULL_MODEL forcing signals first
|
||||||
|
2. Check for SKIP eligibility (very strict)
|
||||||
|
3. Check for CHEAP_MODEL eligibility
|
||||||
|
4. Default to FULL_MODEL
|
||||||
|
"""
|
||||||
|
cfg = self.config
|
||||||
|
|
||||||
|
# FULL_MODEL forcing conditions
|
||||||
|
if cfg.full_model_if_has_numbers and signals["has_numbers"]:
|
||||||
|
return RoutingDecision(
|
||||||
|
tier=RoutingTier.FULL_MODEL,
|
||||||
|
reason="contains_numbers",
|
||||||
|
signals=signals,
|
||||||
|
)
|
||||||
|
|
||||||
|
if (cfg.full_model_if_multiple_sentences and
|
||||||
|
signals["sentence_count"] >= cfg.full_model_min_sentences):
|
||||||
|
return RoutingDecision(
|
||||||
|
tier=RoutingTier.FULL_MODEL,
|
||||||
|
reason="multiple_sentences",
|
||||||
|
signals=signals,
|
||||||
|
)
|
||||||
|
|
||||||
|
if signals["word_count"] > cfg.cheap_max_words:
|
||||||
|
return RoutingDecision(
|
||||||
|
tier=RoutingTier.FULL_MODEL,
|
||||||
|
reason="long_text",
|
||||||
|
signals=signals,
|
||||||
|
)
|
||||||
|
|
||||||
|
# SKIP eligibility (very strict)
|
||||||
|
skip_eligible = (
|
||||||
|
signals["word_count"] <= cfg.skip_max_words and
|
||||||
|
signals["char_count"] <= cfg.skip_max_chars and
|
||||||
|
not signals["has_numbers"] and
|
||||||
|
signals["sentence_count"] <= 1
|
||||||
|
)
|
||||||
|
|
||||||
|
if cfg.skip_require_extreme_rating:
|
||||||
|
skip_eligible = skip_eligible and signals["is_extreme_rating"]
|
||||||
|
|
||||||
|
if skip_eligible:
|
||||||
|
# Determine generic classification based on rating
|
||||||
|
if rating >= 4:
|
||||||
|
skip_class = self.GENERIC_POSITIVE.copy()
|
||||||
|
else:
|
||||||
|
skip_class = self.GENERIC_NEGATIVE.copy()
|
||||||
|
|
||||||
|
return RoutingDecision(
|
||||||
|
tier=RoutingTier.SKIP,
|
||||||
|
reason="trivial_review",
|
||||||
|
signals=signals,
|
||||||
|
skip_classification=skip_class,
|
||||||
|
)
|
||||||
|
|
||||||
|
# CHEAP_MODEL eligibility
|
||||||
|
if (signals["word_count"] <= cfg.cheap_max_words and
|
||||||
|
signals["char_count"] <= cfg.cheap_max_chars and
|
||||||
|
signals["sentence_count"] <= 1):
|
||||||
|
return RoutingDecision(
|
||||||
|
tier=RoutingTier.CHEAP_MODEL,
|
||||||
|
reason="short_simple_review",
|
||||||
|
signals=signals,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Default to FULL_MODEL
|
||||||
|
return RoutingDecision(
|
||||||
|
tier=RoutingTier.FULL_MODEL,
|
||||||
|
reason="default",
|
||||||
|
signals=signals,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_stats(self) -> dict[str, int]:
    """Return a snapshot of the per-tier routing counters (a new dict, so callers cannot alter the live counters)."""
    snapshot = dict(self._stats)
    return snapshot
|
||||||
|
|
||||||
|
def reset_stats(self):
    """Replace the routing counters with a fresh dict in which every tier count is zero."""
    self._stats = dict.fromkeys(("skip", "cheap", "full"), 0)
|
||||||
|
|
||||||
|
|
||||||
|
def create_router(
    conservative: bool = True,
    skip_enabled: bool = True,
    cheap_model_enabled: bool = True,
) -> ReviewRouter:
    """
    Build a ReviewRouter from one of two threshold presets.

    Args:
        conservative: If True, use the strict preset (recommended): only
            one-word reviews with an extreme rating can be skipped.
            If False, use the looser preset that skips and cheap-routes more.
        skip_enabled: If False, the SKIP word/char limits are set to 0.
        cheap_model_enabled: If False, the CHEAP_MODEL word/char limits are
            set to 0.

    Returns:
        Configured ReviewRouter instance

    NOTE(review): the eligibility checks in ``_make_decision`` compare with
    ``<=``, so a review with zero words and zero characters still satisfies
    a limit of 0 - confirm that this is the intended meaning of "disabled".
    """
    strict_preset = dict(
        skip_max_words=1,
        skip_max_chars=15,
        skip_require_extreme_rating=True,
        cheap_max_words=10,
        cheap_max_chars=100,
        full_model_if_has_numbers=True,
        full_model_if_multiple_sentences=True,
        full_model_min_sentences=2,
    )
    relaxed_preset = dict(
        skip_max_words=3,
        skip_max_chars=30,
        skip_require_extreme_rating=False,
        cheap_max_words=15,
        cheap_max_chars=150,
        full_model_if_has_numbers=True,
        full_model_if_multiple_sentences=True,
        full_model_min_sentences=3,
    )
    config = RouterConfig(**(strict_preset if conservative else relaxed_preset))

    # A disabled tier gets both of its limits zeroed.
    if not skip_enabled:
        config.skip_max_words = config.skip_max_chars = 0
    if not cheap_model_enabled:
        config.cheap_max_words = config.cheap_max_chars = 0

    return ReviewRouter(config)
|
||||||
@@ -205,10 +205,11 @@ class Stage1Normalizer:
|
|||||||
source="google",
|
source="google",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Insert enriched review stub
|
# Insert enriched review stub with job_id
|
||||||
await self.review_repo.insert_enriched_review(
|
await self.review_repo.insert_enriched_review(
|
||||||
normalized,
|
normalized,
|
||||||
raw_id,
|
raw_id,
|
||||||
|
job_id=input_data.get("job_id"),
|
||||||
)
|
)
|
||||||
|
|
||||||
return raw_id
|
return raw_id
|
||||||
|
|||||||
@@ -4,15 +4,21 @@ Stage 2: LLM Classification
|
|||||||
Classify normalized reviews into URT codes with span-level extraction.
|
Classify normalized reviews into URT codes with span-level extraction.
|
||||||
|
|
||||||
Responsibilities:
|
Responsibilities:
|
||||||
- Call LLM for span extraction and classification
|
- Call LLM for span extraction and classification (batched for efficiency)
|
||||||
- Generate embeddings
|
- Generate embeddings
|
||||||
- Calculate trust scores
|
- Calculate trust scores
|
||||||
- Select primary span
|
- Select primary span
|
||||||
- Write to reviews_enriched and review_spans tables
|
- Write to reviews_enriched and review_spans tables
|
||||||
|
|
||||||
|
Efficiency Features:
|
||||||
|
- Batch processing: Multiple reviews per LLM call (configurable batch_size)
|
||||||
|
- Prompt caching: System prompt cached to reduce input token costs
|
||||||
|
- Parallel execution: Multiple batches processed concurrently
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
@@ -27,7 +33,20 @@ from reviewiq_pipeline.contracts import (
|
|||||||
Stage2Output,
|
Stage2Output,
|
||||||
Stage2Stats,
|
Stage2Stats,
|
||||||
)
|
)
|
||||||
from reviewiq_pipeline.services.llm_client import LLMClient, create_fallback_response
|
from reviewiq_pipeline.services.llm_client import (
|
||||||
|
LLMClient,
|
||||||
|
create_fallback_response,
|
||||||
|
BatchReviewInput,
|
||||||
|
BatchSizer,
|
||||||
|
PartialBatchResult,
|
||||||
|
)
|
||||||
|
from reviewiq_pipeline.services.prompt_builder import PromptBuilder
|
||||||
|
from reviewiq_pipeline.services.classification_validator import validate_classification
|
||||||
|
from reviewiq_pipeline.services.review_router import (
|
||||||
|
ReviewRouter,
|
||||||
|
RoutingTier,
|
||||||
|
create_router,
|
||||||
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from reviewiq_pipeline.config import Config
|
from reviewiq_pipeline.config import Config
|
||||||
@@ -75,22 +94,100 @@ class Stage2Classifier:
|
|||||||
self.span_repo = span_repo
|
self.span_repo = span_repo
|
||||||
self.embedding_service = embedding_service
|
self.embedding_service = embedding_service
|
||||||
self._llm_client: LLMClientBase | None = None
|
self._llm_client: LLMClientBase | None = None
|
||||||
|
self._cheap_llm_client: LLMClientBase | None = None # For CHEAP tier
|
||||||
|
self._prompt_builder: PromptBuilder | None = None
|
||||||
|
self._batch_sizer: BatchSizer | None = None
|
||||||
|
self._system_prompt_tokens: int = 0
|
||||||
|
|
||||||
|
# Initialize router if enabled
|
||||||
|
self._router: ReviewRouter | None = None
|
||||||
|
if config.router_enabled:
|
||||||
|
self._router = create_router(
|
||||||
|
conservative=config.router_conservative,
|
||||||
|
skip_enabled=config.router_skip_enabled,
|
||||||
|
cheap_model_enabled=config.router_cheap_model_enabled,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"Review router enabled: conservative={config.router_conservative}, "
|
||||||
|
f"skip={config.router_skip_enabled}, cheap={config.router_cheap_model_enabled}"
|
||||||
|
)
|
||||||
|
|
||||||
async def _get_llm_client(self) -> LLMClientBase:
    """
    Return the main LLM client, creating and configuring it on first use.

    First use also:
    - tries to build single and batch prompts through ``PromptBuilder`` when
      ``self.db`` and ``self.db.pool`` are truthy; any failure is logged and
      the client keeps whatever prompt it already has,
    - estimates the batch system-prompt size at 4 characters per token,
    - creates the ``BatchSizer`` used for batch sizing.
    """
    if self._llm_client is not None:
        return self._llm_client

    client = LLMClient.create(self.config)
    self._llm_client = client

    # Build prompt dynamically from database if available
    batch_prompt = None
    if self.db and self.db.pool:
        try:
            builder = PromptBuilder(self.db.pool)
            self._prompt_builder = builder
            # Build both single and batch prompts
            single_prompt = await builder.build(batch_mode=False)
            batch_prompt = await builder.build(batch_mode=True)
            client.set_prompt(single_prompt, batch_prompt)
            logger.info("Using dynamic prompts from database taxonomy (single + batch)")
        except Exception as e:
            logger.warning(f"Failed to build dynamic prompt, using static: {e}")

    # Estimate system prompt tokens for batch sizing (~4 chars per token)
    sizing_prompt = batch_prompt or client.get_prompt(batch_mode=True)
    self._system_prompt_tokens = len(sizing_prompt) // 4

    # Initialize batch sizer
    self._batch_sizer = BatchSizer(
        model=self.config.llm_model,
        system_prompt_tokens=self._system_prompt_tokens,
        target_utilization=self.config.classification_target_utilization,
    )
    logger.info(
        f"BatchSizer initialized: model={self.config.llm_model}, "
        f"system_prompt_tokens≈{self._system_prompt_tokens}, "
        f"target_utilization={self.config.classification_target_utilization:.0%}"
    )

    return client
|
||||||
|
|
||||||
|
async def _get_cheap_llm_client(self) -> LLMClientBase:
|
||||||
|
"""Get or create cheap LLM client for CHEAP tier routing."""
|
||||||
|
if self._cheap_llm_client is None:
|
||||||
|
# Create a copy of config with cheap model
|
||||||
|
from copy import copy
|
||||||
|
cheap_config = copy(self.config)
|
||||||
|
cheap_config.llm_model = self.config.router_cheap_model
|
||||||
|
|
||||||
|
self._cheap_llm_client = LLMClient.create(cheap_config)
|
||||||
|
|
||||||
|
# Use same prompts as main client
|
||||||
|
if self._llm_client:
|
||||||
|
single_prompt = self._llm_client.get_prompt(batch_mode=False)
|
||||||
|
batch_prompt = self._llm_client.get_prompt(batch_mode=True)
|
||||||
|
self._cheap_llm_client.set_prompt(single_prompt, batch_prompt)
|
||||||
|
|
||||||
|
logger.info(f"Cheap LLM client initialized with model: {self.config.router_cheap_model}")
|
||||||
|
|
||||||
|
return self._cheap_llm_client
|
||||||
|
|
||||||
async def close(self) -> None:
    """
    Close both LLM clients (main and cheap-tier) if they were created.

    The references are cleared before closing and the cheap client is
    closed in a ``finally`` block, so an exception from the main client's
    ``close()`` can no longer leave the cheap client open or leave a
    half-closed client cached on the instance.

    Raises:
        Whatever the underlying ``close()`` calls raise.
    """
    main_client, self._llm_client = self._llm_client, None
    cheap_client, self._cheap_llm_client = self._cheap_llm_client, None

    try:
        if main_client:
            await main_client.close()
    finally:
        if cheap_client:
            await cheap_client.close()
|
||||||
|
|
||||||
async def process(self, input_data: Stage2Input) -> Stage2Output:
|
async def process(self, input_data: Stage2Input) -> Stage2Output:
|
||||||
"""
|
"""
|
||||||
Process reviews through classification stage.
|
Process reviews through classification stage using batched LLM calls.
|
||||||
|
|
||||||
|
This method:
|
||||||
|
1. Routes reviews to appropriate tier (if router enabled)
|
||||||
|
2. Calculates optimal batch size based on context window and review lengths
|
||||||
|
3. Splits reviews into batches dynamically
|
||||||
|
4. Processes batches in parallel (with concurrency limit)
|
||||||
|
5. Uses prompt caching to reduce costs on subsequent batches
|
||||||
|
6. Adapts batch size based on actual token usage
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
input_data: Stage 2 input with reviews and config
|
input_data: Stage 2 input with reviews and config
|
||||||
@@ -99,65 +196,284 @@ class Stage2Classifier:
|
|||||||
Stage2Output with classified reviews and stats
|
Stage2Output with classified reviews and stats
|
||||||
"""
|
"""
|
||||||
batch_id = str(uuid.uuid4())[:8]
|
batch_id = str(uuid.uuid4())[:8]
|
||||||
logger.info(
|
reviews = input_data["reviews"]
|
||||||
f"Stage 2: Classifying {len(input_data['reviews'])} reviews "
|
max_concurrent = self.config.classification_max_concurrent
|
||||||
f"(batch {batch_id})"
|
fixed_batch_size = self.config.classification_batch_size # 0 = auto
|
||||||
)
|
|
||||||
|
|
||||||
classified_reviews: list[ClassifiedReview] = []
|
|
||||||
total_tokens = 0
|
|
||||||
total_cost = 0.0
|
|
||||||
total_spans = 0
|
|
||||||
error_count = 0
|
|
||||||
|
|
||||||
llm_client = await self._get_llm_client()
|
llm_client = await self._get_llm_client()
|
||||||
|
|
||||||
for review in input_data["reviews"]:
|
# Smart routing (if enabled)
|
||||||
try:
|
skip_classified: list[ClassifiedReview] = []
|
||||||
classified, metadata = await self._classify_review(
|
reviews_to_process = reviews
|
||||||
review,
|
cheap_reviews: list[ReviewToClassify] = []
|
||||||
input_data["config"]["profile"],
|
full_reviews: list[ReviewToClassify] = []
|
||||||
llm_client,
|
|
||||||
batch_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
if classified:
|
if self._router:
|
||||||
classified_reviews.append(classified)
|
routed = self._router.route_batch(reviews)
|
||||||
total_spans += len(classified.get("spans", []))
|
|
||||||
total_tokens += metadata.get("total_tokens", 0)
|
|
||||||
total_cost += metadata.get("cost_usd", 0.0)
|
|
||||||
|
|
||||||
# Persist to database if configured
|
# Process SKIP tier immediately (no LLM)
|
||||||
|
for review in routed[RoutingTier.SKIP]:
|
||||||
|
routing = review.get("_routing")
|
||||||
|
if routing and routing.skip_classification:
|
||||||
|
classified = self._create_skip_classification(
|
||||||
|
review,
|
||||||
|
routing.skip_classification,
|
||||||
|
batch_id,
|
||||||
|
)
|
||||||
|
skip_classified.append(classified)
|
||||||
|
|
||||||
|
# Persist if configured
|
||||||
if self.review_repo and self.span_repo:
|
if self.review_repo and self.span_repo:
|
||||||
await self._persist_classification(
|
await self._persist_classification(
|
||||||
classified,
|
classified, review, batch_id, input_data["config"]
|
||||||
review,
|
)
|
||||||
|
|
||||||
|
cheap_reviews = routed[RoutingTier.CHEAP_MODEL]
|
||||||
|
full_reviews = routed[RoutingTier.FULL_MODEL]
|
||||||
|
|
||||||
|
router_stats = self._router.get_stats()
|
||||||
|
logger.info(
|
||||||
|
f"Router results: SKIP={len(routed[RoutingTier.SKIP])}, "
|
||||||
|
f"CHEAP={len(cheap_reviews)}, FULL={len(full_reviews)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# If no cheap model enabled, merge into full
|
||||||
|
if not self.config.router_cheap_model_enabled:
|
||||||
|
full_reviews = cheap_reviews + full_reviews
|
||||||
|
cheap_reviews = []
|
||||||
|
else:
|
||||||
|
# No router - all reviews go to full model
|
||||||
|
full_reviews = reviews
|
||||||
|
|
||||||
|
# Calculate optimal batch size dynamically (based on all LLM-bound reviews: FULL + CHEAP tiers)
|
||||||
|
all_llm_reviews = full_reviews + cheap_reviews # Combined for batch sizing
|
||||||
|
if all_llm_reviews:
|
||||||
|
review_dicts = [{"text": r["text"]} for r in all_llm_reviews]
|
||||||
|
batch_calc = self._batch_sizer.calculate_batch_size(
|
||||||
|
reviews=review_dicts,
|
||||||
|
fixed_size=fixed_batch_size if fixed_batch_size > 0 else None,
|
||||||
|
)
|
||||||
|
batch_size = batch_calc.batch_size
|
||||||
|
logger.info(f"Batch sizing: {batch_calc.reasoning}")
|
||||||
|
else:
|
||||||
|
batch_size = fixed_batch_size or 25
|
||||||
|
|
||||||
|
llm_review_count = len(full_reviews) + len(cheap_reviews)
|
||||||
|
logger.info(
|
||||||
|
f"Stage 2: Classifying {len(reviews)} reviews "
|
||||||
|
f"(batch_id={batch_id}, batch_size={batch_size}, max_concurrent={max_concurrent}, "
|
||||||
|
f"skip={len(skip_classified)}, llm={llm_review_count})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Split FULL tier reviews into batches
|
||||||
|
full_batches = [
|
||||||
|
full_reviews[i:i + batch_size]
|
||||||
|
for i in range(0, len(full_reviews), batch_size)
|
||||||
|
] if full_reviews else []
|
||||||
|
|
||||||
|
# Split CHEAP tier reviews into batches
|
||||||
|
cheap_batches = [
|
||||||
|
cheap_reviews[i:i + batch_size]
|
||||||
|
for i in range(0, len(cheap_reviews), batch_size)
|
||||||
|
] if cheap_reviews else []
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Split into {len(full_batches)} FULL batches + {len(cheap_batches)} CHEAP batches "
|
||||||
|
f"({'unlimited' if max_concurrent == 0 else max_concurrent} concurrent)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process batches - unlimited concurrency by default (0 = no limit)
|
||||||
|
semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
|
||||||
|
total_tokens = 0
|
||||||
|
total_cost = 0.0
|
||||||
|
total_cached_tokens = 0
|
||||||
|
classified_reviews: list[ClassifiedReview] = []
|
||||||
|
error_count = 0
|
||||||
|
|
||||||
|
# Get cheap client if needed
|
||||||
|
cheap_client = None
|
||||||
|
if cheap_batches:
|
||||||
|
cheap_client = await self._get_cheap_llm_client()
|
||||||
|
|
||||||
|
async def process_batch(
|
||||||
|
batch_reviews: list[ReviewToClassify],
|
||||||
|
batch_num: int,
|
||||||
|
client: LLMClientBase,
|
||||||
|
tier_label: str = "FULL",
|
||||||
|
):
|
||||||
|
"""Process a single batch of reviews."""
|
||||||
|
|
||||||
|
async def do_batch():
|
||||||
|
nonlocal total_tokens, total_cost, total_cached_tokens, error_count
|
||||||
|
try:
|
||||||
|
batch_classified, batch_metadata = await self._classify_batch(
|
||||||
|
batch_reviews,
|
||||||
|
input_data["config"]["profile"],
|
||||||
|
client,
|
||||||
|
batch_id,
|
||||||
|
input_data["config"],
|
||||||
|
)
|
||||||
|
|
||||||
|
batch_tokens = batch_metadata.get("total_tokens", 0)
|
||||||
|
batch_cost = batch_metadata.get("cost_usd", 0.0)
|
||||||
|
batch_cached = batch_metadata.get("cached_tokens", 0)
|
||||||
|
|
||||||
|
total_tokens += batch_tokens
|
||||||
|
total_cost += batch_cost
|
||||||
|
total_cached_tokens += batch_cached
|
||||||
|
|
||||||
|
# Update batch sizer with actual token usage for adaptive sizing
|
||||||
|
if self._batch_sizer:
|
||||||
|
input_tokens = batch_metadata.get("input_tokens", 0)
|
||||||
|
output_tokens = batch_metadata.get("output_tokens", 0)
|
||||||
|
self._batch_sizer.update_from_response(
|
||||||
|
batch_size=len(batch_reviews),
|
||||||
|
input_tokens=input_tokens - self._system_prompt_tokens, # Exclude system prompt
|
||||||
|
output_tokens=output_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
total_batches = len(full_batches) + len(cheap_batches)
|
||||||
|
logger.info(
|
||||||
|
f"[{tier_label}] Batch {batch_num}/{total_batches}: "
|
||||||
|
f"{len(batch_classified)} reviews, "
|
||||||
|
f"{batch_tokens:,} tokens ({batch_cached:,} cached), "
|
||||||
|
f"${batch_cost:.4f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return batch_classified
|
||||||
|
|
||||||
|
except PartialBatchResult as e:
|
||||||
|
# Partial success - we recovered some reviews
|
||||||
|
logger.info(
|
||||||
|
f"Batch {batch_num} partial success: {len(e.partial_results)} recovered, "
|
||||||
|
f"{len(e.missing_indices)} need reprocessing"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process the recovered results
|
||||||
|
partial_classified: list[ClassifiedReview] = []
|
||||||
|
profile = input_data["config"]["profile"]
|
||||||
|
|
||||||
|
for partial_review in e.partial_results:
|
||||||
|
idx = partial_review.get("review_index", -1)
|
||||||
|
if 0 <= idx < len(batch_reviews):
|
||||||
|
review = batch_reviews[idx]
|
||||||
|
try:
|
||||||
|
classified = self._process_llm_response(
|
||||||
|
review,
|
||||||
|
{
|
||||||
|
"spans": partial_review.get("spans", []),
|
||||||
|
"review_summary": partial_review.get("review_summary", {}),
|
||||||
|
},
|
||||||
|
profile,
|
||||||
|
batch_id,
|
||||||
|
is_fallback=False,
|
||||||
|
)
|
||||||
|
partial_classified.append(classified)
|
||||||
|
|
||||||
|
if self.review_repo and self.span_repo:
|
||||||
|
await self._persist_classification(
|
||||||
|
classified, review, batch_id, input_data["config"]
|
||||||
|
)
|
||||||
|
except Exception as pe:
|
||||||
|
logger.warning(f"Error processing recovered review {idx}: {pe}")
|
||||||
|
e.missing_indices.append(idx)
|
||||||
|
|
||||||
|
# Update cost tracking from partial metadata
|
||||||
|
if e.metadata:
|
||||||
|
total_tokens += e.metadata.get("total_tokens", 0)
|
||||||
|
total_cost += e.metadata.get("cost_usd", 0.0)
|
||||||
|
total_cached_tokens += e.metadata.get("cached_tokens", 0)
|
||||||
|
|
||||||
|
# Only fallback process the missing reviews
|
||||||
|
if e.missing_indices:
|
||||||
|
missing_reviews = [batch_reviews[i] for i in e.missing_indices if 0 <= i < len(batch_reviews)]
|
||||||
|
error_count += len(missing_reviews)
|
||||||
|
logger.info(f"Reprocessing {len(missing_reviews)} missing reviews individually")
|
||||||
|
fallback_results = await self._fallback_individual_processing(
|
||||||
|
missing_reviews,
|
||||||
|
input_data["config"]["profile"],
|
||||||
|
client, # Use same client as batch
|
||||||
batch_id,
|
batch_id,
|
||||||
input_data["config"],
|
input_data["config"],
|
||||||
)
|
)
|
||||||
|
partial_classified.extend(fallback_results)
|
||||||
|
|
||||||
except Exception as e:
|
return partial_classified
|
||||||
logger.error(
|
|
||||||
f"Error classifying review {review['review_id']}: {e}",
|
|
||||||
exc_info=True,
|
|
||||||
)
|
|
||||||
error_count += 1
|
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[{tier_label}] Batch {batch_num} failed: {e}", exc_info=True)
|
||||||
|
error_count += len(batch_reviews)
|
||||||
|
# Fallback: process individually
|
||||||
|
return await self._fallback_individual_processing(
|
||||||
|
batch_reviews,
|
||||||
|
input_data["config"]["profile"],
|
||||||
|
client, # Use same client as batch
|
||||||
|
batch_id,
|
||||||
|
input_data["config"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run with or without semaphore
|
||||||
|
if semaphore:
|
||||||
|
async with semaphore:
|
||||||
|
return await do_batch()
|
||||||
|
else:
|
||||||
|
return await do_batch()
|
||||||
|
|
||||||
|
# Process all batches concurrently (both FULL and CHEAP tiers)
|
||||||
|
all_batch_tasks = []
|
||||||
|
|
||||||
|
# FULL tier batches
|
||||||
|
for i, batch in enumerate(full_batches):
|
||||||
|
all_batch_tasks.append(
|
||||||
|
process_batch(batch, i + 1, llm_client, "FULL")
|
||||||
|
)
|
||||||
|
|
||||||
|
# CHEAP tier batches
|
||||||
|
for i, batch in enumerate(cheap_batches):
|
||||||
|
all_batch_tasks.append(
|
||||||
|
process_batch(batch, len(full_batches) + i + 1, cheap_client, "CHEAP")
|
||||||
|
)
|
||||||
|
|
||||||
|
batch_results = await asyncio.gather(*all_batch_tasks) if all_batch_tasks else []
|
||||||
|
|
||||||
|
# Flatten results from LLM processing
|
||||||
|
for batch_result in batch_results:
|
||||||
|
classified_reviews.extend(batch_result)
|
||||||
|
|
||||||
|
# Add skip-classified reviews (no LLM)
|
||||||
|
classified_reviews.extend(skip_classified)
|
||||||
|
|
||||||
|
# Calculate stats
|
||||||
|
total_spans = sum(len(r.get("spans", [])) for r in classified_reviews)
|
||||||
avg_spans = total_spans / len(classified_reviews) if classified_reviews else 0
|
avg_spans = total_spans / len(classified_reviews) if classified_reviews else 0
|
||||||
|
|
||||||
|
# Log final statistics
|
||||||
|
skip_count = len(skip_classified)
|
||||||
|
llm_count = len(classified_reviews) - skip_count
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Stage 2 complete: {len(classified_reviews)} classified, "
|
f"Stage 2 complete: {len(classified_reviews)} classified "
|
||||||
f"{error_count} errors, {total_spans} spans total"
|
f"(LLM={llm_count}, skipped={skip_count}), "
|
||||||
|
f"{error_count} errors, {total_spans} spans total, "
|
||||||
|
f"${total_cost:.4f} cost, {total_cached_tokens:,} cached tokens"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if self._batch_sizer:
|
||||||
|
stats = self._batch_sizer.get_stats_summary()
|
||||||
|
logger.info(
|
||||||
|
f"Batch sizing stats: "
|
||||||
|
f"avg_input={stats['avg_input_tokens']} tokens/review, "
|
||||||
|
f"avg_output={stats['avg_output_tokens']} tokens/review, "
|
||||||
|
f"range=[{stats['min_review_tokens']}-{stats['max_review_tokens']}]"
|
||||||
|
)
|
||||||
|
|
||||||
return Stage2Output(
|
return Stage2Output(
|
||||||
batch_id=batch_id,
|
batch_id=batch_id,
|
||||||
taxonomy_version=input_data["config"]["taxonomy_version"],
|
taxonomy_version=input_data["config"]["taxonomy_version"],
|
||||||
model_version=self.config.llm_model,
|
model_version=self.config.llm_model,
|
||||||
prompt_version="v1.0",
|
prompt_version="v2.0-batched",
|
||||||
reviews_classified=classified_reviews,
|
reviews_classified=classified_reviews,
|
||||||
stats=Stage2Stats(
|
stats=Stage2Stats(
|
||||||
input_count=len(input_data["reviews"]),
|
input_count=len(reviews),
|
||||||
success_count=len(classified_reviews),
|
success_count=len(classified_reviews),
|
||||||
error_count=error_count,
|
error_count=error_count,
|
||||||
total_spans=total_spans,
|
total_spans=total_spans,
|
||||||
@@ -167,42 +483,127 @@ class Stage2Classifier:
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def _classify_review(
|
async def _classify_batch(
|
||||||
self,
|
self,
|
||||||
review: ReviewToClassify,
|
reviews: list[ReviewToClassify],
|
||||||
profile: str,
|
profile: str,
|
||||||
llm_client: LLMClientBase,
|
llm_client: LLMClientBase,
|
||||||
batch_id: str,
|
batch_id: str,
|
||||||
) -> tuple[ClassifiedReview | None, dict[str, Any]]:
|
config: dict[str, Any],
|
||||||
|
) -> tuple[list[ClassifiedReview], dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Classify a single review.
|
Classify a batch of reviews in a single LLM call.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
review: Review to classify
|
reviews: List of reviews to classify
|
||||||
profile: Classification profile
|
profile: Classification profile
|
||||||
llm_client: LLM client instance
|
llm_client: LLM client instance
|
||||||
batch_id: Batch identifier
|
batch_id: Batch identifier
|
||||||
|
config: Classification config
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (classified review, metadata)
|
Tuple of (list of classified reviews, aggregated metadata)
|
||||||
"""
|
"""
|
||||||
metadata: dict[str, Any] = {}
|
# Prepare batch input
|
||||||
|
batch_input: list[BatchReviewInput] = [
|
||||||
# Call LLM for classification
|
BatchReviewInput(
|
||||||
try:
|
review_id=r["review_id"],
|
||||||
llm_response, llm_metadata = await llm_client.classify(
|
text=r["text"],
|
||||||
review["text"],
|
rating=r["rating"],
|
||||||
profile,
|
|
||||||
)
|
)
|
||||||
metadata.update(llm_metadata)
|
for r in reviews
|
||||||
except Exception as e:
|
]
|
||||||
logger.warning(
|
|
||||||
f"LLM classification failed for {review['review_id']}, "
|
|
||||||
f"using fallback: {e}"
|
|
||||||
)
|
|
||||||
llm_response = create_fallback_response(review["text"])
|
|
||||||
metadata["fallback"] = True
|
|
||||||
|
|
||||||
|
# Call LLM for batch classification
|
||||||
|
llm_responses, metadata = await llm_client.classify_batch(batch_input, profile)
|
||||||
|
|
||||||
|
# Process each response
|
||||||
|
classified_reviews: list[ClassifiedReview] = []
|
||||||
|
|
||||||
|
for i, (review, llm_response) in enumerate(zip(reviews, llm_responses)):
|
||||||
|
try:
|
||||||
|
classified = self._process_llm_response(
|
||||||
|
review,
|
||||||
|
llm_response,
|
||||||
|
profile,
|
||||||
|
batch_id,
|
||||||
|
is_fallback=False,
|
||||||
|
)
|
||||||
|
classified_reviews.append(classified)
|
||||||
|
|
||||||
|
# Persist to database if configured
|
||||||
|
if self.review_repo and self.span_repo:
|
||||||
|
await self._persist_classification(
|
||||||
|
classified,
|
||||||
|
review,
|
||||||
|
batch_id,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error processing review {review['review_id']}: {e}")
|
||||||
|
# Use fallback for this review
|
||||||
|
fallback = create_fallback_response(review["text"])
|
||||||
|
classified = self._process_llm_response(
|
||||||
|
review, fallback, profile, batch_id, is_fallback=True
|
||||||
|
)
|
||||||
|
classified_reviews.append(classified)
|
||||||
|
|
||||||
|
return classified_reviews, metadata
|
||||||
|
|
||||||
|
async def _fallback_individual_processing(
|
||||||
|
self,
|
||||||
|
reviews: list[ReviewToClassify],
|
||||||
|
profile: str,
|
||||||
|
llm_client: LLMClientBase,
|
||||||
|
batch_id: str,
|
||||||
|
config: dict[str, Any],
|
||||||
|
) -> list[ClassifiedReview]:
|
||||||
|
"""
|
||||||
|
Fallback to individual processing when batch fails.
|
||||||
|
|
||||||
|
This ensures we can still classify reviews even if batching fails.
|
||||||
|
"""
|
||||||
|
logger.warning(f"Falling back to individual processing for {len(reviews)} reviews")
|
||||||
|
classified_reviews: list[ClassifiedReview] = []
|
||||||
|
|
||||||
|
for review in reviews:
|
||||||
|
try:
|
||||||
|
classified, _ = await self._classify_review(
|
||||||
|
review, profile, llm_client, batch_id
|
||||||
|
)
|
||||||
|
if classified:
|
||||||
|
classified_reviews.append(classified)
|
||||||
|
|
||||||
|
if self.review_repo and self.span_repo:
|
||||||
|
await self._persist_classification(
|
||||||
|
classified, review, batch_id, config
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Individual classification failed for {review['review_id']}: {e}")
|
||||||
|
# Use fallback
|
||||||
|
fallback = create_fallback_response(review["text"])
|
||||||
|
classified = self._process_llm_response(
|
||||||
|
review, fallback, profile, batch_id, is_fallback=True
|
||||||
|
)
|
||||||
|
classified_reviews.append(classified)
|
||||||
|
|
||||||
|
return classified_reviews
|
||||||
|
|
||||||
|
def _process_llm_response(
|
||||||
|
self,
|
||||||
|
review: ReviewToClassify,
|
||||||
|
llm_response: LLMClassificationResponse,
|
||||||
|
profile: str,
|
||||||
|
batch_id: str,
|
||||||
|
is_fallback: bool = False,
|
||||||
|
) -> ClassifiedReview:
|
||||||
|
"""
|
||||||
|
Process an LLM response into a ClassifiedReview.
|
||||||
|
|
||||||
|
This is shared logic for both batch and individual processing.
|
||||||
|
"""
|
||||||
# Validate and fix response
|
# Validate and fix response
|
||||||
llm_response = self._validate_and_fix_response(llm_response, review["text"])
|
llm_response = self._validate_and_fix_response(llm_response, review["text"])
|
||||||
|
|
||||||
@@ -217,7 +618,10 @@ class Stage2Classifier:
|
|||||||
# Ensure exactly one primary span
|
# Ensure exactly one primary span
|
||||||
spans = self._ensure_primary_span(spans)
|
spans = self._ensure_primary_span(spans)
|
||||||
|
|
||||||
# Find the primary span for review-level classification
|
# Post-LLM validation
|
||||||
|
spans = self._validate_span_classifications(spans)
|
||||||
|
|
||||||
|
# Find primary span
|
||||||
primary_span = next((s for s in spans if s.get("is_primary")), spans[0] if spans else None)
|
primary_span = next((s for s in spans if s.get("is_primary")), spans[0] if spans else None)
|
||||||
|
|
||||||
# Generate embedding
|
# Generate embedding
|
||||||
@@ -247,10 +651,59 @@ class Stage2Classifier:
|
|||||||
embedding=embedding,
|
embedding=embedding,
|
||||||
spans=spans,
|
spans=spans,
|
||||||
classification_confidence={
|
classification_confidence={
|
||||||
"overall": 0.8 if not metadata.get("fallback") else 0.3
|
"overall": 0.8 if not is_fallback else 0.3
|
||||||
},
|
},
|
||||||
processing_time_ms=metadata.get("latency_ms", 0),
|
processing_time_ms=0, # Set at batch level
|
||||||
), metadata
|
)
|
||||||
|
|
||||||
|
async def _classify_review(
|
||||||
|
self,
|
||||||
|
review: ReviewToClassify,
|
||||||
|
profile: str,
|
||||||
|
llm_client: LLMClientBase,
|
||||||
|
batch_id: str,
|
||||||
|
) -> tuple[ClassifiedReview | None, dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Classify a single review (used for fallback when batching fails).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
review: Review to classify
|
||||||
|
profile: Classification profile
|
||||||
|
llm_client: LLM client instance
|
||||||
|
batch_id: Batch identifier
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (classified review, metadata)
|
||||||
|
"""
|
||||||
|
metadata: dict[str, Any] = {}
|
||||||
|
is_fallback = False
|
||||||
|
|
||||||
|
# Call LLM for classification
|
||||||
|
try:
|
||||||
|
llm_response, llm_metadata = await llm_client.classify(
|
||||||
|
review["text"],
|
||||||
|
profile,
|
||||||
|
)
|
||||||
|
metadata.update(llm_metadata)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"LLM classification failed for {review['review_id']}, "
|
||||||
|
f"using fallback: {e}"
|
||||||
|
)
|
||||||
|
llm_response = create_fallback_response(review["text"])
|
||||||
|
metadata["fallback"] = True
|
||||||
|
is_fallback = True
|
||||||
|
|
||||||
|
# Use shared processing logic
|
||||||
|
classified = self._process_llm_response(
|
||||||
|
review,
|
||||||
|
llm_response,
|
||||||
|
profile,
|
||||||
|
batch_id,
|
||||||
|
is_fallback=is_fallback,
|
||||||
|
)
|
||||||
|
|
||||||
|
return classified, metadata
|
||||||
|
|
||||||
def _validate_and_fix_response(
|
def _validate_and_fix_response(
|
||||||
self,
|
self,
|
||||||
@@ -405,6 +858,45 @@ class Stage2Classifier:
|
|||||||
|
|
||||||
return spans
|
return spans
|
||||||
|
|
||||||
|
def _validate_span_classifications(
|
||||||
|
self,
|
||||||
|
spans: list[ExtractedSpan],
|
||||||
|
) -> list[ExtractedSpan]:
|
||||||
|
"""
|
||||||
|
Post-LLM validation to catch common misclassifications.
|
||||||
|
|
||||||
|
Uses keyword detection to identify obvious errors like:
|
||||||
|
- Price mentions classified as P codes (should be V)
|
||||||
|
- Staff behavior classified as A codes (should be P)
|
||||||
|
- Scam mentions classified as P/V codes (should be R)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
spans: List of classified spans
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of spans with corrections applied
|
||||||
|
"""
|
||||||
|
corrections = 0
|
||||||
|
for span in spans:
|
||||||
|
correction = validate_classification(
|
||||||
|
span.get("span_text", ""),
|
||||||
|
span.get("urt_primary", "O1.01"),
|
||||||
|
span.get("valence", "V0"),
|
||||||
|
)
|
||||||
|
if correction:
|
||||||
|
original = span["urt_primary"]
|
||||||
|
span["urt_primary"] = correction["suggested_urt"]
|
||||||
|
corrections += 1
|
||||||
|
logger.debug(
|
||||||
|
f"Validation corrected {original} → {correction['suggested_urt']} "
|
||||||
|
f"({correction['reason']})"
|
||||||
|
)
|
||||||
|
|
||||||
|
if corrections:
|
||||||
|
logger.info(f"Post-LLM validation corrected {corrections} spans")
|
||||||
|
|
||||||
|
return spans
|
||||||
|
|
||||||
def _calculate_trust_score(
|
def _calculate_trust_score(
|
||||||
self,
|
self,
|
||||||
review: ReviewToClassify,
|
review: ReviewToClassify,
|
||||||
@@ -467,6 +959,72 @@ class Stage2Classifier:
|
|||||||
quotes[code] = span["span_text"][:100]
|
quotes[code] = span["span_text"][:100]
|
||||||
return quotes
|
return quotes
|
||||||
|
|
||||||
|
def _create_skip_classification(
|
||||||
|
self,
|
||||||
|
review: ReviewToClassify,
|
||||||
|
skip_classification: dict,
|
||||||
|
batch_id: str,
|
||||||
|
) -> ClassifiedReview:
|
||||||
|
"""
|
||||||
|
Create a ClassifiedReview for a SKIP tier review (no LLM).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
review: Source review
|
||||||
|
skip_classification: Pre-assigned classification from router
|
||||||
|
batch_id: Batch identifier
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ClassifiedReview with generic classification
|
||||||
|
"""
|
||||||
|
urt_primary = skip_classification.get("urt_primary", "V4.03")
|
||||||
|
valence = skip_classification.get("valence", "V0")
|
||||||
|
intensity = skip_classification.get("intensity", "I1")
|
||||||
|
|
||||||
|
# Create a single span for the entire review
|
||||||
|
span_key = f"{review['review_id']}:0:{review['text'][:50]}"
|
||||||
|
span_hash = hashlib.sha256(span_key.encode()).hexdigest()[:16]
|
||||||
|
span_id = f"SPN-{span_hash}"
|
||||||
|
|
||||||
|
span = ExtractedSpan(
|
||||||
|
span_id=span_id,
|
||||||
|
span_index=0,
|
||||||
|
span_text=review["text"],
|
||||||
|
span_start=0,
|
||||||
|
span_end=len(review["text"]),
|
||||||
|
profile="lite", # type: ignore
|
||||||
|
urt_primary=urt_primary,
|
||||||
|
urt_secondary=[],
|
||||||
|
valence=valence,
|
||||||
|
intensity=intensity,
|
||||||
|
comparative="CR-N",
|
||||||
|
confidence="low",
|
||||||
|
usn=f"URT:S:{urt_primary}:{valence[1]}{intensity[1]}:11TC.ES.N",
|
||||||
|
is_primary=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate embedding if available
|
||||||
|
embedding: list[float] = []
|
||||||
|
if self.embedding_service:
|
||||||
|
embedding = self.embedding_service.embed(review.get("text_normalized", review["text"]))
|
||||||
|
|
||||||
|
return ClassifiedReview(
|
||||||
|
source=review["source"],
|
||||||
|
review_id=review["review_id"],
|
||||||
|
review_version=review["review_version"],
|
||||||
|
urt_primary=urt_primary,
|
||||||
|
urt_secondary=[],
|
||||||
|
valence=valence,
|
||||||
|
intensity=intensity,
|
||||||
|
comparative="CR-N",
|
||||||
|
staff_mentions=[],
|
||||||
|
quotes={},
|
||||||
|
trust_score=self.config.trust_score_floor, # Minimum trust for skipped reviews
|
||||||
|
embedding=embedding,
|
||||||
|
spans=[span],
|
||||||
|
classification_confidence={"overall": 0.2, "skip_reason": skip_classification.get("skip_reason", "auto_routed")},
|
||||||
|
processing_time_ms=0,
|
||||||
|
)
|
||||||
|
|
||||||
def _generate_usn(self, span: LLMSpanResponse) -> str:
|
def _generate_usn(self, span: LLMSpanResponse) -> str:
|
||||||
"""
|
"""
|
||||||
Generate USN (URT String Notation) for a span.
|
Generate USN (URT String Notation) for a span.
|
||||||
@@ -536,4 +1094,5 @@ class Stage2Classifier:
|
|||||||
batch_id,
|
batch_id,
|
||||||
self.config.llm_model,
|
self.config.llm_model,
|
||||||
config["taxonomy_version"],
|
config["taxonomy_version"],
|
||||||
|
job_id=config.get("job_id"),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -69,6 +69,9 @@ class Stage3Router:
|
|||||||
"""
|
"""
|
||||||
logger.info(f"Stage 3: Routing {len(input_data['spans'])} spans")
|
logger.info(f"Stage 3: Routing {len(input_data['spans'])} spans")
|
||||||
|
|
||||||
|
# Get job_id from input (may be None)
|
||||||
|
job_id = input_data.get("job_id")
|
||||||
|
|
||||||
routed_spans: list[RoutedSpan] = []
|
routed_spans: list[RoutedSpan] = []
|
||||||
issues_created: list[str] = []
|
issues_created: list[str] = []
|
||||||
issues_updated: list[str] = []
|
issues_updated: list[str] = []
|
||||||
@@ -81,7 +84,7 @@ class Stage3Router:
|
|||||||
spans_skipped += 1
|
spans_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
routed = await self._route_span(span)
|
routed = await self._route_span(span, job_id=job_id)
|
||||||
if routed:
|
if routed:
|
||||||
routed_spans.append(routed)
|
routed_spans.append(routed)
|
||||||
|
|
||||||
@@ -114,12 +117,13 @@ class Stage3Router:
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def _route_span(self, span: SpanToRoute) -> RoutedSpan | None:
|
async def _route_span(self, span: SpanToRoute, job_id: str | None = None) -> RoutedSpan | None:
|
||||||
"""
|
"""
|
||||||
Route a single span to an issue.
|
Route a single span to an issue.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
span: Span to route
|
span: Span to route
|
||||||
|
job_id: Optional job ID to link issues to pipeline executions
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
RoutedSpan with routing info, or None if skipped
|
RoutedSpan with routing info, or None if skipped
|
||||||
@@ -149,6 +153,7 @@ class Stage3Router:
|
|||||||
entity=span.get("entity_normalized"),
|
entity=span.get("entity_normalized"),
|
||||||
entity_normalized=span.get("entity_normalized"),
|
entity_normalized=span.get("entity_normalized"),
|
||||||
taxonomy_version=self.config.taxonomy_version,
|
taxonomy_version=self.config.taxonomy_version,
|
||||||
|
job_id=job_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
routed = RoutedSpan(
|
routed = RoutedSpan(
|
||||||
|
|||||||
@@ -194,25 +194,24 @@ class Stage4Aggregator:
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown bucket type: {bucket_type}")
|
raise ValueError(f"Unknown bucket type: {bucket_type}")
|
||||||
|
|
||||||
def _get_period_date(self, target_date: date, bucket_type: str) -> str:
|
def _get_period_date(self, target_date: date, bucket_type: str) -> date:
|
||||||
"""Get the period date string for a bucket."""
|
"""Get the period date for a bucket."""
|
||||||
if bucket_type == "day":
|
if bucket_type == "day":
|
||||||
return target_date.isoformat()
|
return target_date
|
||||||
elif bucket_type == "week":
|
elif bucket_type == "week":
|
||||||
# Week starts on Monday
|
# Week starts on Monday
|
||||||
start = target_date - timedelta(days=target_date.weekday())
|
return target_date - timedelta(days=target_date.weekday())
|
||||||
return start.isoformat()
|
|
||||||
elif bucket_type == "month":
|
elif bucket_type == "month":
|
||||||
return target_date.replace(day=1).isoformat()
|
return target_date.replace(day=1)
|
||||||
else:
|
else:
|
||||||
return target_date.isoformat()
|
return target_date
|
||||||
|
|
||||||
def _aggregate_by_code(
|
def _aggregate_by_code(
|
||||||
self,
|
self,
|
||||||
span_data: list[dict[str, Any]],
|
span_data: list[dict[str, Any]],
|
||||||
business_id: str,
|
business_id: str,
|
||||||
place_id: str,
|
place_id: str,
|
||||||
period_date: str,
|
period_date: date,
|
||||||
bucket_type: str,
|
bucket_type: str,
|
||||||
taxonomy_version: str,
|
taxonomy_version: str,
|
||||||
) -> list[FactRecord]:
|
) -> list[FactRecord]:
|
||||||
@@ -243,7 +242,7 @@ class Stage4Aggregator:
|
|||||||
span_data: list[dict[str, Any]],
|
span_data: list[dict[str, Any]],
|
||||||
business_id: str,
|
business_id: str,
|
||||||
place_id: str,
|
place_id: str,
|
||||||
period_date: str,
|
period_date: date,
|
||||||
bucket_type: str,
|
bucket_type: str,
|
||||||
taxonomy_version: str,
|
taxonomy_version: str,
|
||||||
) -> list[FactRecord]:
|
) -> list[FactRecord]:
|
||||||
@@ -275,7 +274,7 @@ class Stage4Aggregator:
|
|||||||
span_data: list[dict[str, Any]],
|
span_data: list[dict[str, Any]],
|
||||||
business_id: str,
|
business_id: str,
|
||||||
place_id: str,
|
place_id: str,
|
||||||
period_date: str,
|
period_date: date,
|
||||||
bucket_type: str,
|
bucket_type: str,
|
||||||
taxonomy_version: str,
|
taxonomy_version: str,
|
||||||
) -> FactRecord:
|
) -> FactRecord:
|
||||||
@@ -296,7 +295,7 @@ class Stage4Aggregator:
|
|||||||
spans: list[dict[str, Any]],
|
spans: list[dict[str, Any]],
|
||||||
business_id: str,
|
business_id: str,
|
||||||
place_id: str,
|
place_id: str,
|
||||||
period_date: str,
|
period_date: date,
|
||||||
bucket_type: str,
|
bucket_type: str,
|
||||||
subject_type: str,
|
subject_type: str,
|
||||||
subject_id: str,
|
subject_id: str,
|
||||||
@@ -449,7 +448,7 @@ class Stage4Aggregator:
|
|||||||
self,
|
self,
|
||||||
business_id: str,
|
business_id: str,
|
||||||
place_id: str,
|
place_id: str,
|
||||||
period_date: str,
|
period_date: date,
|
||||||
bucket_type: str,
|
bucket_type: str,
|
||||||
subject_type: str,
|
subject_type: str,
|
||||||
subject_id: str,
|
subject_id: str,
|
||||||
|
|||||||
@@ -1,477 +0,0 @@
|
|||||||
"""
|
|
||||||
Stage 4: Synthesize - Generate AI narratives and action plans.
|
|
||||||
|
|
||||||
This stage runs after classification and routing to produce:
|
|
||||||
- Executive narrative (business-specific story)
|
|
||||||
- Section insights (sentiment, category, timeline)
|
|
||||||
- Action plan with prioritized recommendations
|
|
||||||
- Timeline annotations for key events
|
|
||||||
- Marketing angles from strengths
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import TYPE_CHECKING, Any
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
from reviewiq_pipeline.services.llm_client import LLMClientBase
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class ActionItem:
    """One recommended action in the synthesis action plan."""

    # Identifier such as "action_1".
    id: str
    # Short headline for the action.
    title: str
    # Root cause taken from the reviews.
    why: str
    # Specific steps to take.
    what: str
    # Department or role responsible.
    who: str
    # Expected outcome.
    impact: str
    # Supporting review quotes.
    evidence: list[str]
    # Estimated rating improvement, or None when not provided.
    estimated_rating_lift: float | None
    complexity: str  # 'quick' | 'medium' | 'complex'
    priority: str  # 'critical' | 'high' | 'medium' | 'low'
    # Time frame, e.g. "This week", "This month", "This quarter".
    timeline: str
    # Subcode the action relates to, e.g. "V1.03".
    related_subcode: str
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class TimelineAnnotation:
    """A labelled marker for a notable event on the review timeline."""

    # Date string, e.g. "2024-01-15".
    date: str
    # Short label.
    label: str
    # What happened.
    description: str
    type: str  # 'positive' | 'negative' | 'neutral' | 'event'
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Synthesis:
    """Everything Stage 4 produces for one pipeline execution."""

    # Multi-paragraph narrative of the business situation.
    executive_narrative: str
    # Short explanations attached to the sentiment, category and timeline sections.
    sentiment_insight: str
    category_insight: str
    timeline_insight: str
    # Single-letter domain code needing most attention, or None.
    priority_domain: str | None
    # Subcode to fix first (e.g. "V1.03"), or None.
    priority_issue: str | None
    # Recommended actions.
    action_plan: list[ActionItem]
    # String-to-string mapping of per-issue actions.
    issue_actions: dict[str, str]
    # Key events to mark on the timeline.
    timeline_annotations: list[TimelineAnnotation]
    # Ways to promote the business's strengths.
    marketing_angles: list[str]
    # Comparison to industry/competitors, or None if unknown.
    competitor_context: str | None
    # ISO-8601 timestamp of generation.
    generated_at: str
|
|
||||||
|
|
||||||
|
|
||||||
SYNTHESIS_SYSTEM_PROMPT = """You are an expert business analyst specializing in customer experience and review analysis.
|
|
||||||
|
|
||||||
Your task is to analyze classified review data and generate actionable business insights.
|
|
||||||
|
|
||||||
You will receive:
|
|
||||||
1. Summary statistics (total reviews, rating, sentiment distribution)
|
|
||||||
2. Top issues by category with example quotes
|
|
||||||
3. Top strengths with example quotes
|
|
||||||
4. Domain breakdown (what customers talk about most)
|
|
||||||
|
|
||||||
Generate a JSON response with these fields:
|
|
||||||
|
|
||||||
{
|
|
||||||
"executive_narrative": "2-3 paragraph story explaining the business situation, key problems, and path forward. Be specific with numbers and examples.",
|
|
||||||
|
|
||||||
"sentiment_insight": "1-2 sentences explaining WHY sentiment is distributed this way. Connect to specific issues.",
|
|
||||||
|
|
||||||
"category_insight": "1-2 sentences about the pattern in categories. Which domain needs most attention and why?",
|
|
||||||
|
|
||||||
"timeline_insight": "1-2 sentences about trends if data shows changes over time.",
|
|
||||||
|
|
||||||
"priority_domain": "Single letter code (P/V/J/O/A/E/R) for the domain needing most attention, or null",
|
|
||||||
|
|
||||||
"priority_issue": "The subcode (e.g., 'V1.03') that should be fixed first, or null",
|
|
||||||
|
|
||||||
"action_plan": [
|
|
||||||
{
|
|
||||||
"id": "action_1",
|
|
||||||
"title": "Clear action title",
|
|
||||||
"why": "Root cause from the reviews",
|
|
||||||
"what": "Specific steps to take",
|
|
||||||
"who": "Department or role responsible",
|
|
||||||
"impact": "Expected outcome",
|
|
||||||
"evidence": ["Quote 1", "Quote 2"],
|
|
||||||
"estimated_rating_lift": 0.3,
|
|
||||||
"complexity": "quick|medium|complex",
|
|
||||||
"priority": "critical|high|medium|low",
|
|
||||||
"timeline": "This week|This month|This quarter",
|
|
||||||
"related_subcode": "V1.03"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
|
|
||||||
"timeline_annotations": [
|
|
||||||
{
|
|
||||||
"date": "2024-01-15",
|
|
||||||
"label": "Short label",
|
|
||||||
"description": "What happened",
|
|
||||||
"type": "positive|negative|neutral|event"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
|
|
||||||
"marketing_angles": [
|
|
||||||
"Way to promote strength 1",
|
|
||||||
"Way to promote strength 2"
|
|
||||||
],
|
|
||||||
|
|
||||||
"competitor_context": "How this compares to industry/competitors, or null if unknown"
|
|
||||||
}
|
|
||||||
|
|
||||||
Be specific, actionable, and business-focused. Use actual numbers and quotes from the data.
|
|
||||||
Prioritize actions by impact and feasibility.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class SynthesisStage:
|
|
||||||
"""
|
|
||||||
Stage 4: Generate AI synthesis from classified review data.
|
|
||||||
|
|
||||||
This stage:
|
|
||||||
1. Aggregates classification results
|
|
||||||
2. Identifies patterns and priorities
|
|
||||||
3. Generates narrative insights via LLM
|
|
||||||
4. Produces actionable recommendations
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, pool: asyncpg.Pool, llm_client: LLMClientBase):
|
|
||||||
self.pool = pool
|
|
||||||
self.llm_client = llm_client
|
|
||||||
|
|
||||||
async def run(self, job_id: str, execution_id: str) -> Synthesis:
    """
    Produce and store the synthesis for a finished pipeline execution.

    Args:
        job_id: The scraping job ID
        execution_id: The pipeline execution ID

    Returns:
        Synthesis object with all generated insights
    """
    logger.info(f"Stage 4: Generating synthesis for job {job_id}")

    # Collect context, generate from it, then persist - strictly in that order.
    result = await self._generate_synthesis(await self._gather_context(job_id))
    await self._store_synthesis(execution_id, result)

    logger.info(f"Stage 4: Synthesis complete - {len(result.action_plan)} actions generated")
    return result
|
|
||||||
|
|
||||||
async def _gather_context(self, job_id: str) -> dict[str, Any]:
|
|
||||||
"""Gather all context needed for synthesis."""
|
|
||||||
|
|
||||||
# Get overview stats
|
|
||||||
overview = await self.pool.fetchrow("""
|
|
||||||
SELECT
|
|
||||||
COUNT(DISTINCT r.review_id) as total_reviews,
|
|
||||||
AVG(r.rating) as avg_rating,
|
|
||||||
COUNT(s.span_id) as total_spans
|
|
||||||
FROM reviews r
|
|
||||||
LEFT JOIN pipeline.spans s ON s.source_review_id = r.review_id
|
|
||||||
WHERE r.job_id = $1
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
# Get sentiment distribution
|
|
||||||
sentiment = await self.pool.fetch("""
|
|
||||||
SELECT
|
|
||||||
valence,
|
|
||||||
COUNT(*) as count,
|
|
||||||
COUNT(DISTINCT source_review_id) as review_count
|
|
||||||
FROM pipeline.spans
|
|
||||||
WHERE job_id = $1 AND valence IS NOT NULL
|
|
||||||
GROUP BY valence
|
|
||||||
ORDER BY count DESC
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
# Get top issues (weaknesses)
|
|
||||||
top_issues = await self.pool.fetch("""
|
|
||||||
SELECT
|
|
||||||
s.urt_primary as subcode,
|
|
||||||
sc.name as subcode_name,
|
|
||||||
sc.definition,
|
|
||||||
d.code as domain,
|
|
||||||
d.name as domain_name,
|
|
||||||
COUNT(*) as span_count,
|
|
||||||
COUNT(*) FILTER (WHERE s.valence = 'V-') as negative_count,
|
|
||||||
ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V-') as example_quotes
|
|
||||||
FROM pipeline.spans s
|
|
||||||
JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
|
|
||||||
JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
|
|
||||||
WHERE s.job_id = $1 AND s.valence = 'V-'
|
|
||||||
GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
|
|
||||||
ORDER BY negative_count DESC
|
|
||||||
LIMIT 10
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
# Get top strengths
|
|
||||||
top_strengths = await self.pool.fetch("""
|
|
||||||
SELECT
|
|
||||||
s.urt_primary as subcode,
|
|
||||||
sc.name as subcode_name,
|
|
||||||
sc.definition,
|
|
||||||
d.code as domain,
|
|
||||||
d.name as domain_name,
|
|
||||||
COUNT(*) as span_count,
|
|
||||||
COUNT(*) FILTER (WHERE s.valence = 'V+') as positive_count,
|
|
||||||
ARRAY_AGG(s.span_text ORDER BY s.intensity DESC) FILTER (WHERE s.valence = 'V+') as example_quotes
|
|
||||||
FROM pipeline.spans s
|
|
||||||
JOIN pipeline.urt_subcodes sc ON sc.code = s.urt_primary
|
|
||||||
JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
|
|
||||||
WHERE s.job_id = $1 AND s.valence = 'V+'
|
|
||||||
GROUP BY s.urt_primary, sc.name, sc.definition, d.code, d.name
|
|
||||||
ORDER BY positive_count DESC
|
|
||||||
LIMIT 5
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
# Get domain distribution
|
|
||||||
domains = await self.pool.fetch("""
|
|
||||||
SELECT
|
|
||||||
SUBSTRING(urt_primary, 1, 1) as domain,
|
|
||||||
d.name as domain_name,
|
|
||||||
COUNT(*) as total_count,
|
|
||||||
COUNT(*) FILTER (WHERE valence = 'V+') as positive_count,
|
|
||||||
COUNT(*) FILTER (WHERE valence = 'V-') as negative_count
|
|
||||||
FROM pipeline.spans s
|
|
||||||
JOIN pipeline.urt_domains d ON d.code = SUBSTRING(s.urt_primary, 1, 1)
|
|
||||||
WHERE s.job_id = $1
|
|
||||||
GROUP BY SUBSTRING(urt_primary, 1, 1), d.name
|
|
||||||
ORDER BY total_count DESC
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
# Get business name if available
|
|
||||||
business = await self.pool.fetchrow("""
|
|
||||||
SELECT DISTINCT business_name
|
|
||||||
FROM reviews
|
|
||||||
WHERE job_id = $1 AND business_name IS NOT NULL
|
|
||||||
LIMIT 1
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"business_name": business["business_name"] if business else "This business",
|
|
||||||
"overview": dict(overview) if overview else {},
|
|
||||||
"sentiment": [dict(r) for r in sentiment],
|
|
||||||
"top_issues": [dict(r) for r in top_issues],
|
|
||||||
"top_strengths": [dict(r) for r in top_strengths],
|
|
||||||
"domains": [dict(r) for r in domains],
|
|
||||||
}
|
|
||||||
|
|
||||||
async def _generate_synthesis(self, context: dict[str, Any]) -> Synthesis:
|
|
||||||
"""Generate synthesis using LLM."""
|
|
||||||
|
|
||||||
# Build the user prompt with context
|
|
||||||
user_prompt = f"""Analyze this review data for {context['business_name']}:
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
- Total Reviews: {context['overview'].get('total_reviews', 0)}
|
|
||||||
- Average Rating: {context['overview'].get('avg_rating', 'N/A')}
|
|
||||||
- Total Insights Extracted: {context['overview'].get('total_spans', 0)}
|
|
||||||
|
|
||||||
## Sentiment Distribution
|
|
||||||
{self._format_sentiment(context['sentiment'])}
|
|
||||||
|
|
||||||
## Top Issues (Problems)
|
|
||||||
{self._format_issues(context['top_issues'])}
|
|
||||||
|
|
||||||
## Top Strengths
|
|
||||||
{self._format_strengths(context['top_strengths'])}
|
|
||||||
|
|
||||||
## Domain Breakdown
|
|
||||||
{self._format_domains(context['domains'])}
|
|
||||||
|
|
||||||
Generate a complete synthesis with actionable insights.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Call LLM
|
|
||||||
try:
|
|
||||||
response = await self.llm_client.generate(
|
|
||||||
system_prompt=SYNTHESIS_SYSTEM_PROMPT,
|
|
||||||
user_prompt=user_prompt,
|
|
||||||
temperature=0.7, # Allow some creativity
|
|
||||||
max_tokens=4000,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Parse JSON response
|
|
||||||
result = json.loads(response)
|
|
||||||
|
|
||||||
# Convert to Synthesis object
|
|
||||||
return Synthesis(
|
|
||||||
executive_narrative=result.get("executive_narrative", ""),
|
|
||||||
sentiment_insight=result.get("sentiment_insight", ""),
|
|
||||||
category_insight=result.get("category_insight", ""),
|
|
||||||
timeline_insight=result.get("timeline_insight", ""),
|
|
||||||
priority_domain=result.get("priority_domain"),
|
|
||||||
priority_issue=result.get("priority_issue"),
|
|
||||||
action_plan=[
|
|
||||||
ActionItem(
|
|
||||||
id=a.get("id", f"action_{i}"),
|
|
||||||
title=a.get("title", ""),
|
|
||||||
why=a.get("why", ""),
|
|
||||||
what=a.get("what", ""),
|
|
||||||
who=a.get("who", ""),
|
|
||||||
impact=a.get("impact", ""),
|
|
||||||
evidence=a.get("evidence", []),
|
|
||||||
estimated_rating_lift=a.get("estimated_rating_lift"),
|
|
||||||
complexity=a.get("complexity", "medium"),
|
|
||||||
priority=a.get("priority", "medium"),
|
|
||||||
timeline=a.get("timeline", "This month"),
|
|
||||||
related_subcode=a.get("related_subcode", ""),
|
|
||||||
)
|
|
||||||
for i, a in enumerate(result.get("action_plan", []))
|
|
||||||
],
|
|
||||||
issue_actions={}, # Can be populated from action_plan
|
|
||||||
timeline_annotations=[
|
|
||||||
TimelineAnnotation(
|
|
||||||
date=t.get("date", ""),
|
|
||||||
label=t.get("label", ""),
|
|
||||||
description=t.get("description", ""),
|
|
||||||
type=t.get("type", "neutral"),
|
|
||||||
)
|
|
||||||
for t in result.get("timeline_annotations", [])
|
|
||||||
],
|
|
||||||
marketing_angles=result.get("marketing_angles", []),
|
|
||||||
competitor_context=result.get("competitor_context"),
|
|
||||||
generated_at=datetime.utcnow().isoformat(),
|
|
||||||
)
|
|
||||||
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.error(f"Failed to parse LLM response: {e}")
|
|
||||||
return self._create_fallback_synthesis()
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Synthesis generation failed: {e}")
|
|
||||||
return self._create_fallback_synthesis()
|
|
||||||
|
|
||||||
def _format_sentiment(self, sentiment: list[dict]) -> str:
|
|
||||||
"""Format sentiment data for prompt."""
|
|
||||||
lines = []
|
|
||||||
for s in sentiment:
|
|
||||||
valence = s.get("valence", "Unknown")
|
|
||||||
count = s.get("count", 0)
|
|
||||||
reviews = s.get("review_count", 0)
|
|
||||||
label = {"V+": "Positive", "V-": "Negative", "V0": "Neutral", "V±": "Mixed"}.get(valence, valence)
|
|
||||||
lines.append(f"- {label}: {count} mentions ({reviews} reviews)")
|
|
||||||
return "\n".join(lines) or "No sentiment data"
|
|
||||||
|
|
||||||
def _format_issues(self, issues: list[dict]) -> str:
|
|
||||||
"""Format issues for prompt."""
|
|
||||||
lines = []
|
|
||||||
for i, issue in enumerate(issues[:5], 1):
|
|
||||||
subcode = issue.get("subcode", "")
|
|
||||||
name = issue.get("subcode_name", "")
|
|
||||||
domain = issue.get("domain_name", "")
|
|
||||||
count = issue.get("negative_count", 0)
|
|
||||||
quotes = issue.get("example_quotes", [])[:2]
|
|
||||||
|
|
||||||
lines.append(f"{i}. [{subcode}] {name} ({domain})")
|
|
||||||
lines.append(f" - {count} negative mentions")
|
|
||||||
for q in quotes:
|
|
||||||
if q:
|
|
||||||
lines.append(f' - Example: "{q[:100]}..."' if len(q) > 100 else f' - Example: "{q}"')
|
|
||||||
return "\n".join(lines) or "No issues found"
|
|
||||||
|
|
||||||
def _format_strengths(self, strengths: list[dict]) -> str:
|
|
||||||
"""Format strengths for prompt."""
|
|
||||||
lines = []
|
|
||||||
for i, strength in enumerate(strengths[:3], 1):
|
|
||||||
subcode = strength.get("subcode", "")
|
|
||||||
name = strength.get("subcode_name", "")
|
|
||||||
domain = strength.get("domain_name", "")
|
|
||||||
count = strength.get("positive_count", 0)
|
|
||||||
quotes = strength.get("example_quotes", [])[:2]
|
|
||||||
|
|
||||||
lines.append(f"{i}. [{subcode}] {name} ({domain})")
|
|
||||||
lines.append(f" - {count} positive mentions")
|
|
||||||
for q in quotes:
|
|
||||||
if q:
|
|
||||||
lines.append(f' - Example: "{q[:100]}..."' if len(q) > 100 else f' - Example: "{q}"')
|
|
||||||
return "\n".join(lines) or "No strengths found"
|
|
||||||
|
|
||||||
def _format_domains(self, domains: list[dict]) -> str:
|
|
||||||
"""Format domain distribution for prompt."""
|
|
||||||
lines = []
|
|
||||||
for d in domains:
|
|
||||||
domain = d.get("domain", "")
|
|
||||||
name = d.get("domain_name", "")
|
|
||||||
total = d.get("total_count", 0)
|
|
||||||
positive = d.get("positive_count", 0)
|
|
||||||
negative = d.get("negative_count", 0)
|
|
||||||
lines.append(f"- {domain} ({name}): {total} total ({positive} positive, {negative} negative)")
|
|
||||||
return "\n".join(lines) or "No domain data"
|
|
||||||
|
|
||||||
def _create_fallback_synthesis(self) -> Synthesis:
    """Build a placeholder Synthesis for use when the LLM call fails.

    Every insight field is empty, every collection is empty, and the
    narrative tells the reader to inspect the data manually.

    Returns:
        A Synthesis whose ``generated_at`` is the current UTC time as a
        naive ISO-8601 string (no offset suffix).
    """
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop tzinfo so the emitted string format is unchanged.
    from datetime import timezone

    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return Synthesis(
        executive_narrative="Unable to generate detailed analysis. Please review the data manually.",
        sentiment_insight="",
        category_insight="",
        timeline_insight="",
        priority_domain=None,
        priority_issue=None,
        action_plan=[],
        issue_actions={},
        timeline_annotations=[],
        marketing_angles=[],
        competitor_context=None,
        generated_at=generated_at,
    )
|
|
||||||
|
|
||||||
async def _store_synthesis(self, execution_id: str, synthesis: Synthesis) -> None:
    """Persist a synthesis as JSON on its pipeline execution row.

    The synthesis is flattened into a plain dict (action-plan items and
    timeline annotations are expanded field by field), serialised with
    ``json.dumps`` and written to ``pipeline.executions.synthesis`` for the
    row matching ``execution_id``; ``updated_at`` is set to NOW().

    Args:
        execution_id: Value matched against the ``execution_id`` column.
        synthesis: Object whose attributes are copied into the document.
    """
    action_fields = (
        "id", "title", "why", "what", "who", "impact", "evidence",
        "estimated_rating_lift", "complexity", "priority", "timeline",
        "related_subcode",
    )
    annotation_fields = ("date", "label", "description", "type")

    # Key order is kept stable because it determines the serialised JSON.
    document = {
        "executive_narrative": synthesis.executive_narrative,
        "sentiment_insight": synthesis.sentiment_insight,
        "category_insight": synthesis.category_insight,
        "timeline_insight": synthesis.timeline_insight,
        "priority_domain": synthesis.priority_domain,
        "priority_issue": synthesis.priority_issue,
        "action_plan": [
            {field: getattr(step, field) for field in action_fields}
            for step in synthesis.action_plan
        ],
        "issue_actions": synthesis.issue_actions,
        "timeline_annotations": [
            {field: getattr(note, field) for field in annotation_fields}
            for note in synthesis.timeline_annotations
        ],
        "marketing_angles": synthesis.marketing_angles,
        "competitor_context": synthesis.competitor_context,
        "generated_at": synthesis.generated_at,
    }

    update_sql = """
        UPDATE pipeline.executions
        SET
            synthesis = $2,
            updated_at = NOW()
        WHERE execution_id = $1
    """
    await self.pool.execute(update_sql, execution_id, json.dumps(document))
|
|
||||||
486
packages/reviewiq-pipeline/validate_router.py
Normal file
486
packages/reviewiq-pipeline/validate_router.py
Normal file
@@ -0,0 +1,486 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Validate router decisions against real reviews with minimal LLM cost.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Loads real reviews from database
|
||||||
|
2. Routes them through the router
|
||||||
|
3. Cherry-picks samples from each tier for validation
|
||||||
|
4. Optionally runs LLM on small samples to validate decisions
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Dry run - just show routing decisions, no LLM calls
|
||||||
|
python validate_router.py <job_id> --dry-run
|
||||||
|
|
||||||
|
# Validate with LLM (costs ~$0.05-0.10)
|
||||||
|
python validate_router.py <job_id> --validate
|
||||||
|
|
||||||
|
# Custom sample sizes
|
||||||
|
python validate_router.py <job_id> --validate --skip-samples=3 --cheap-samples=5 --full-samples=3
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger("validate_router")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ValidationResult:
    """Result of validating a single review.

    Holds the review itself, the router's decision for it, and (when LLM
    validation was run) the LLM's classification plus a verdict on whether
    the routing decision looks right.
    """
    # Identity and content of the review being validated.
    review_id: str
    text: str
    rating: int
    # Router decision: tier label, the router's stated reason, and the
    # signal dict attached to the routing decision.
    routed_tier: str
    routing_reason: str
    routing_signals: dict
    # LLM results (if validated)
    llm_urt: str | None = None  # primary span's "urt_primary" value
    llm_valence: str | None = None  # primary span's "valence" value
    llm_span_count: int | None = None  # number of spans the LLM returned
    llm_cost: float | None = None  # this review's share of the batch cost
    # Validation verdict
    routing_correct: bool | None = None  # None means no verdict was made
    notes: str = ""  # human-readable explanation of the verdict
|
||||||
|
|
||||||
|
|
||||||
|
async def load_reviews_from_db(job_id: str, database_url: str) -> list[dict]:
    """Fetch a job's non-empty reviews and shape them for the router.

    Opens a single asyncpg connection, reads rows from
    ``pipeline.reviews_enriched`` for ``job_id`` (text must be non-null
    and non-empty, ordered by ``id``) and always closes the connection.

    Args:
        job_id: Job identifier, cast to ``uuid`` inside the query.
        database_url: PostgreSQL connection string.

    Returns:
        One dict per review. ``text_normalized`` is the lower-cased,
        stripped text; ``source``, ``review_version`` and ``review_time``
        are fixed placeholder values.
    """
    import asyncpg

    query = """
        SELECT
            re.review_id,
            re.text,
            re.rating,
            re.business_id,
            re.place_id
        FROM pipeline.reviews_enriched re
        WHERE re.job_id = $1::uuid
            AND re.text IS NOT NULL
            AND re.text != ''
        ORDER BY re.id
    """

    def to_review(record) -> dict:
        body = record["text"] or ""
        return {
            "review_id": record["review_id"],
            "text": body,
            "text_normalized": body.lower().strip(),
            "rating": record["rating"],
            "business_id": record["business_id"],
            "place_id": record["place_id"],
            "source": "google",
            "review_version": 1,
            "review_time": "2024-01-01T00:00:00Z",
        }

    connection = await asyncpg.connect(database_url)
    try:
        # Get reviews with text from pipeline schema
        records = await connection.fetch(query, job_id)
        loaded = [to_review(record) for record in records]

        logger.info(f"Loaded {len(loaded)} reviews from job {job_id}")
        return loaded
    finally:
        await connection.close()
|
||||||
|
|
||||||
|
|
||||||
|
def route_reviews(reviews: list[dict]) -> dict[str, list[dict]]:
    """Run the reviews through a conservative router and group them by tier.

    Args:
        reviews: Review dicts passed unchanged to ``router.route_batch``.

    Returns:
        A dict with the keys "skip", "cheap" and "full", each holding the
        reviews the router placed in the corresponding ``RoutingTier``.
    """
    from reviewiq_pipeline.services.review_router import (
        ReviewRouter,
        RoutingTier,
        create_router,
    )

    by_tier = create_router(conservative=True).route_batch(reviews)

    grouped = {}
    grouped["skip"] = by_tier[RoutingTier.SKIP]
    grouped["cheap"] = by_tier[RoutingTier.CHEAP_MODEL]
    grouped["full"] = by_tier[RoutingTier.FULL_MODEL]
    return grouped
|
||||||
|
|
||||||
|
|
||||||
|
def select_diverse_samples(
    reviews: list[dict],
    tier: str,
    n_samples: int,
) -> list[dict]:
    """
    Select diverse samples from a tier for validation.

    Strategy:
    1. Walk the reviews in order and take at most one review per distinct
       (routing reason, rating) pair, so the sample covers as many
       different routing situations as possible.
    2. If slots remain, fill them with the not-yet-chosen reviews in
       their original order (this includes reviews without ``_routing``).

    Args:
        reviews: Review dicts; routed ones carry a ``_routing`` object
            with a ``reason`` attribute, and a ``rating`` key.
        tier: Tier label. Currently unused; kept for interface stability.
        n_samples: Maximum number of reviews to return.

    Returns:
        Up to ``n_samples`` reviews; empty when ``reviews`` is empty or
        ``n_samples`` is not positive.
    """
    if not reviews or n_samples <= 0:
        return []

    samples = []
    chosen_ids = set()  # identity-based membership, O(1) per lookup
    seen_keys = set()

    # First pass: one representative per unseen (reason, rating) pair.
    for review in reviews:
        if len(samples) >= n_samples:
            break
        routing = review.get("_routing")
        if not routing:
            continue

        key = (routing.reason, review["rating"])
        if key in seen_keys:
            continue

        seen_keys.add(key)
        samples.append(review)
        chosen_ids.add(id(review))

    # Second pass: fill remaining slots with whatever was not chosen yet.
    for review in reviews:
        if len(samples) >= n_samples:
            break
        if id(review) not in chosen_ids:
            samples.append(review)
            chosen_ids.add(id(review))

    return samples
|
||||||
|
|
||||||
|
|
||||||
|
def print_routing_summary(routed: dict[str, list[dict]]):
    """Print per-tier review counts, percentages and routing-reason tallies.

    Args:
        routed: Mapping of tier label to the reviews routed there. Reviews
            with a ``_routing`` object contribute its ``reason`` to the
            tally; reviews without one are counted only in the tier total.
    """
    banner = "=" * 70
    grand_total = sum(len(group) for group in routed.values())

    print("\n" + banner)
    print("ROUTING SUMMARY")
    print(banner)

    for tier_name, group in routed.items():
        share = 0
        if grand_total > 0:
            share = len(group) / grand_total * 100
        print(f"\n{tier_name.upper()} TIER: {len(group)} reviews ({share:.1f}%)")

        # Tally reviews per routing reason.
        tally = {}
        for item in group:
            decision = item.get("_routing")
            if not decision:
                continue
            tally.setdefault(decision.reason, 0)
            tally[decision.reason] += 1

        # Most frequent reason first; ties keep first-seen order.
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        for why, how_many in ranked:
            print(f" - {why}: {how_many}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_samples(samples: list[dict], tier: str):
    """Print the sampled reviews of one tier for manual inspection.

    For every sample this shows its id, a star rating, the first 100
    characters of the text (with "..." when truncated), the routing reason
    and a few routing signals. Missing signals print as "?" and a missing
    routing decision prints as "N/A".

    Args:
        samples: Review dicts with ``review_id``, ``rating`` and ``text``
            keys and an optional ``_routing`` object.
        tier: Tier label shown upper-cased in the heading.
    """
    banner = "=" * 70
    print("\n" + banner)
    print(f"{tier.upper()} TIER SAMPLES ({len(samples)} reviews)")
    print(banner)

    for position, sample in enumerate(samples, start=1):
        decision = sample.get("_routing")
        signal_map = decision.signals if decision else {}

        print(f"\n[{position}] Review ID: {sample['review_id']}")
        print(f" Rating: {'⭐' * sample['rating']}")

        body = sample["text"]
        suffix = "..." if len(body) > 100 else ""
        print(f' Text: "{body[:100]}{suffix}"')

        reason_label = decision.reason if decision else "N/A"
        print(f" Routing: {reason_label}")

        signal_parts = [
            f"words={signal_map.get('word_count', '?')}",
            f"chars={signal_map.get('char_count', '?')}",
            f"numbers={signal_map.get('has_numbers', '?')}",
            f"sentences={signal_map.get('sentence_count', '?')}",
        ]
        print(" Signals: " + ", ".join(signal_parts))
|
||||||
|
|
||||||
|
|
||||||
|
async def validate_with_llm(
    samples: list[dict],
    tier: str,
    config: Any,
) -> list[ValidationResult]:
    """
    Classify the samples with the LLM and judge each routing decision.

    The whole sample list is sent as one "standard" batch. If the client
    raises ``PartialBatchResult``, the recovered entries are kept, every
    missing index is padded with an empty response flagged
    ``partial_recovery_failed``, and the list is re-ordered by index.

    Verdict rules per tier:
    - "skip": correct when the primary code is one of V4.03 / V4.01 /
      V4.02 / O1.01, or there is exactly one span with intensity "I1".
    - "cheap": correct when at most two spans were found.
    - "full": correct when at least one span exists and the primary code
      is neither V4.03 nor O1.01.
    - any other tier: no verdict (``routing_correct`` stays None).

    Args:
        samples: Review dicts with ``review_id``, ``text``, ``rating`` and
            an optional ``_routing`` object.
        tier: Tier label the samples were routed to.
        config: Passed to ``LLMClient.create``.

    Returns:
        One ValidationResult per (sample, response) pair; the batch cost
        is split evenly across ``len(samples)``. Empty when ``samples``
        is empty. The client is always closed.
    """
    from reviewiq_pipeline.services.llm_client import LLMClient, BatchReviewInput, PartialBatchResult

    verdicts = []
    if not samples:
        return verdicts

    def judge(span_list, primary, urt_code):
        """Return the (routing_correct, notes) pair for one review."""
        if tier == "skip":
            generic = urt_code in ("V4.03", "V4.01", "V4.02", "O1.01")
            trivial = len(span_list) == 1 and primary.get("intensity") == "I1"
            if generic or trivial:
                return True, "Correctly skipped (generic/simple)"
            return False, f"LLM found specific content: {urt_code}"
        if tier == "cheap":
            if len(span_list) <= 2:
                return True, "Simple enough for cheap model"
            return False, f"Complex: {len(span_list)} spans found"
        if tier == "full":
            if len(span_list) >= 1 and urt_code not in ("V4.03", "O1.01"):
                return True, f"Correctly sent to full: {len(span_list)} spans, {urt_code}"
            return False, "Could have been cheaper"
        return None, ""

    # Create LLM client
    client = LLMClient.create(config)

    try:
        # Prepare batch input
        batch_input = []
        for sample in samples:
            batch_input.append(
                BatchReviewInput(
                    review_id=sample["review_id"],
                    text=sample["text"],
                    rating=sample["rating"],
                )
            )

        # Run classification
        logger.info(f"Running LLM on {len(samples)} {tier} tier samples...")

        llm_responses = []
        metadata = {}

        try:
            llm_responses, metadata = await client.classify_batch(batch_input, "standard")
        except PartialBatchResult as partial_err:
            # Keep whatever the client managed to recover.
            logger.warning(f"Partial result for {tier} tier: {len(partial_err.partial_results)} recovered")
            metadata = partial_err.metadata or {}

            for recovered in partial_err.partial_results:
                position = recovered.get("review_index", -1)
                if not (0 <= position < len(samples)):
                    continue
                llm_responses.append({
                    "spans": recovered.get("spans", []),
                    "review_summary": recovered.get("review_summary", {}),
                    "_index": position,
                })

            # Pad the indices that were not recovered with empty responses.
            covered = {resp.get("_index", -1) for resp in llm_responses}
            llm_responses.extend(
                {
                    "spans": [],
                    "review_summary": {},
                    "_index": missing,
                    "_error": "partial_recovery_failed",
                }
                for missing in range(len(samples))
                if missing not in covered
            )

            # Restore the original sample order.
            llm_responses.sort(key=lambda resp: resp.get("_index", 999))

        cost = metadata.get("cost_usd", 0)
        logger.info(f"LLM cost for {tier} tier: ${cost:.4f}")

        # Process results
        for review, llm_response in zip(samples, llm_responses):
            routing = review.get("_routing")
            signals = routing.signals if routing else {}

            spans = llm_response.get("spans", [])
            # Prefer the span flagged primary, else the first span, else {}.
            fallback_span = spans[0] if spans else {}
            primary_span = next((s for s in spans if s.get("is_primary")), fallback_span)

            urt = primary_span.get("urt_primary", "N/A")
            valence = primary_span.get("valence", "N/A")

            routing_correct, notes = judge(spans, primary_span, urt)

            verdicts.append(
                ValidationResult(
                    review_id=review["review_id"],
                    text=review["text"],
                    rating=review["rating"],
                    routed_tier=tier,
                    routing_reason=routing.reason if routing else "N/A",
                    routing_signals=signals,
                    llm_urt=urt,
                    llm_valence=valence,
                    llm_span_count=len(spans),
                    llm_cost=cost / len(samples),
                    routing_correct=routing_correct,
                    notes=notes,
                )
            )

    finally:
        await client.close()

    return verdicts
|
||||||
|
|
||||||
|
|
||||||
|
def print_validation_results(results: list[ValidationResult], tier: str):
    """Print the accuracy and a per-review breakdown for one tier.

    Nothing is printed when ``results`` is empty. A review counts as
    correct only when its ``routing_correct`` is truthy.

    Args:
        results: Validation results to report.
        tier: Tier label shown upper-cased in the heading.
    """
    if not results:
        return

    banner = "=" * 70
    print("\n" + banner)
    print(f"{tier.upper()} TIER VALIDATION RESULTS")
    print(banner)

    total = len(results)
    correct = len([item for item in results if item.routing_correct])
    # results is non-empty here, so total is always positive.
    accuracy = correct / total * 100

    print(f"\nAccuracy: {correct}/{total} ({accuracy:.1f}%)")

    for item in results:
        if item.routing_correct:
            status = "✅"
        else:
            status = "❌"
        print(f'\n{status} [{item.review_id}] "{item.text[:60]}..."')
        print(f" Rating: {item.rating}, Routed: {item.routed_tier} ({item.routing_reason})")
        print(f" LLM: URT={item.llm_urt}, Valence={item.llm_valence}, Spans={item.llm_span_count}")
        print(f" Notes: {item.notes}")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Command-line entry point: route a job's reviews and optionally validate.

    Steps:
    1. Parse arguments (job id, --dry-run / --validate, per-tier sample sizes).
    2. Load the job's reviews from the database at ``DATABASE_URL``.
    3. Route them, print the routing summary and the selected samples.
    4. With ``--validate`` (and without ``--dry-run``), run the LLM on the
       samples of each tier and print accuracy, cost and recommendations.

    The Anthropic API key is read only from the ``ANTHROPIC_API_KEY``
    environment variable; credentials must never be embedded in source.
    When validation is requested without the key, the program exits with
    an argparse error before touching the database.
    """
    parser = argparse.ArgumentParser(description="Validate router decisions")
    parser.add_argument("job_id", help="Job ID to analyze")
    parser.add_argument("--dry-run", action="store_true", help="Show routing only, no LLM")
    parser.add_argument("--validate", action="store_true", help="Run LLM validation")
    parser.add_argument("--skip-samples", type=int, default=3, help="SKIP tier samples")
    parser.add_argument("--cheap-samples", type=int, default=5, help="CHEAP tier samples")
    parser.add_argument("--full-samples", type=int, default=3, help="FULL tier samples")

    args = parser.parse_args()

    # Fail fast: LLM validation is impossible without a key, and no secret
    # fallback may live in the repository.
    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
    if args.validate and not args.dry_run and not anthropic_api_key:
        parser.error("ANTHROPIC_API_KEY must be set in the environment to use --validate")

    # Database URL
    database_url = os.environ.get(
        "DATABASE_URL",
        "postgresql://scraper:scraper123@localhost:5437/scraper"
    )

    # Load reviews
    reviews = await load_reviews_from_db(args.job_id, database_url)
    if not reviews:
        print("No reviews found for job")
        return

    # Route reviews
    routed = route_reviews(reviews)

    # Print summary
    print_routing_summary(routed)

    # Select samples
    skip_samples = select_diverse_samples(routed["skip"], "skip", args.skip_samples)
    cheap_samples = select_diverse_samples(routed["cheap"], "cheap", args.cheap_samples)
    full_samples = select_diverse_samples(routed["full"], "full", args.full_samples)

    # Print samples
    print_samples(skip_samples, "skip")
    print_samples(cheap_samples, "cheap")
    print_samples(full_samples, "full")

    # Estimate cost
    total_samples = len(skip_samples) + len(cheap_samples) + len(full_samples)
    estimated_cost = total_samples * 0.003  # ~$0.003 per review with Sonnet
    print(f"\n{'=' * 70}")
    print(f"VALIDATION COST ESTIMATE: ~${estimated_cost:.3f} for {total_samples} samples")
    print("=" * 70)

    if args.dry_run:
        print("\n[DRY RUN] No LLM calls made. Use --validate to run validation.")
        return

    if not args.validate:
        print("\nUse --validate to run LLM validation on these samples.")
        return

    # Run validation
    from reviewiq_pipeline.config import Config

    config = Config(
        database_url=database_url,
        llm_provider="anthropic",
        llm_model="claude-sonnet-4-5-20250929",
        anthropic_api_key=anthropic_api_key,
    )

    all_results = []
    total_cost = 0

    # Validate each tier
    for tier, samples in [("skip", skip_samples), ("cheap", cheap_samples), ("full", full_samples)]:
        if samples:
            results = await validate_with_llm(samples, tier, config)
            all_results.extend(results)
            total_cost += sum(r.llm_cost or 0 for r in results)
            print_validation_results(results, tier)

    # Print summary
    print(f"\n{'=' * 70}")
    print("VALIDATION SUMMARY")
    print("=" * 70)

    for tier in ["skip", "cheap", "full"]:
        tier_results = [r for r in all_results if r.routed_tier == tier]
        if tier_results:
            correct = sum(1 for r in tier_results if r.routing_correct)
            total = len(tier_results)
            print(f"{tier.upper()}: {correct}/{total} correct ({correct/total*100:.0f}%)")

    overall_correct = sum(1 for r in all_results if r.routing_correct)
    overall_total = len(all_results)
    if overall_total:
        print(f"\nOVERALL: {overall_correct}/{overall_total} correct ({overall_correct/overall_total*100:.0f}%)")
    else:
        # All sample lists were empty (e.g. every sample size set to 0);
        # avoid dividing by zero.
        print("\nOVERALL: no samples were validated")
    print(f"TOTAL COST: ${total_cost:.4f}")

    # Recommendations
    print(f"\n{'=' * 70}")
    print("RECOMMENDATIONS")
    print("=" * 70)

    skip_errors = [r for r in all_results if r.routed_tier == "skip" and not r.routing_correct]
    if skip_errors:
        print("\n⚠️ SKIP tier false negatives found:")
        for r in skip_errors:
            print(f" - \"{r.text[:50]}...\" → {r.llm_urt}")
        print(" Consider tightening SKIP criteria")
    else:
        print("\n✅ SKIP tier looks safe")

    cheap_errors = [r for r in all_results if r.routed_tier == "cheap" and not r.routing_correct]
    if cheap_errors:
        print("\n⚠️ CHEAP tier may miss complexity:")
        for r in cheap_errors:
            print(f" - \"{r.text[:50]}...\" → {r.llm_span_count} spans")
    else:
        print("\n✅ CHEAP tier thresholds look good")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
83
run_synthesis.py
Normal file
83
run_synthesis.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Regenerate synthesis with new report format."""
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
sys.path.insert(0, '/app/packages/reviewiq-pipeline/src')
|
||||||
|
sys.path.insert(0, '/app/packages/pipeline-core/src')
|
||||||
|
|
||||||
|
async def main():
    """Regenerate the stage-5 synthesis for one job and print a summary.

    The job id defaults to the built-in value and may be overridden by
    passing a job UUID as the first command-line argument. The newest
    execution of the job is reused; if none exists, a new 'running'
    execution is inserted. After the synthesizer finishes, the execution
    is marked 'completed' and the report is printed.

    The connection pool and the LLM client are released on every exit
    path, including failures while looking up or creating the execution
    or while building the client.
    """
    import asyncpg
    from reviewiq_pipeline.config import Config
    from reviewiq_pipeline.services.llm_client import LLMClient
    from reviewiq_pipeline.stages.stage5_synthesize import Stage5Synthesizer

    default_job_id = "a3813665-ea23-4fb0-aab7-b282ef9443e4"
    job_id = sys.argv[1] if len(sys.argv) > 1 else default_job_id

    database_url = os.getenv(
        'DATABASE_URL',
        'postgresql://scraper:scraper123@scraper-db:5432/scraper'
    )

    print("Connecting to database...")
    pool = await asyncpg.create_pool(database_url)
    llm_client = None

    try:
        # Check if execution exists for this job, create one if not
        print("Checking for existing execution...")
        row = await pool.fetchrow(
            "SELECT id FROM pipeline.executions WHERE job_id = $1::uuid ORDER BY created_at DESC LIMIT 1",
            job_id
        )

        if row:
            execution_id = str(row['id'])
            print(f"Found existing execution: {execution_id}")
        else:
            execution_id = str(uuid.uuid4())
            print(f"Creating new execution: {execution_id}")
            await pool.execute("""
                INSERT INTO pipeline.executions (id, pipeline_id, job_id, status, stages_requested, created_at)
                VALUES ($1::uuid, 'reviewiq', $2::uuid, 'running', ARRAY['synthesize'], NOW())
            """, execution_id, job_id)

        print("Creating LLM client...")
        config = Config()
        llm_client = LLMClient.create(config)

        print(f"Generating analyst report for job {job_id}...")
        stage5 = Stage5Synthesizer(pool=pool, llm_client=llm_client)
        synthesis = await stage5.run(job_id, execution_id)

        # Mark execution as completed
        await pool.execute(
            "UPDATE pipeline.executions SET status = 'completed', completed_at = NOW() WHERE id = $1::uuid",
            execution_id
        )

        print(f"\n{'='*70}")
        print("ANALYST REPORT GENERATED")
        print(f"{'='*70}")
        print(f"\nHEADLINE: {synthesis.headline}")
        print(f"\nVERDICT: {synthesis.verdict}")
        print(f"\nRATING: {synthesis.current_rating:.1f} → {synthesis.potential_rating:.1f} (gap: +{synthesis.rating_gap:.1f})")
        print(f"\nNARRATIVE:\n{synthesis.narrative[:500]}...")
        print(f"\nPRIMARY PROBLEM: {synthesis.primary_problem}")
        print(f"ROOT CAUSE: {synthesis.root_cause}")
        print(f"\nACTIONS ({len(synthesis.actions)}):")
        for a in synthesis.actions:
            print(f" [{a.priority}] {a.action}")
            print(f" Owner: {a.owner} | Impact: {a.impact}")
        print(f"\nEVIDENCE ({len(synthesis.evidence)}):")
        for e in synthesis.evidence[:3]:
            print(f" [{e.sentiment}] \"{e.quote[:60]}...\"")
            print(f" Context: {e.context}")

    finally:
        # Close the client first, but make sure the pool is closed even
        # if closing the client raises.
        try:
            if llm_client is not None:
                await llm_client.close()
        finally:
            await pool.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
266
scrapers/google_reviews/session_manager.py
Normal file
266
scrapers/google_reviews/session_manager.py
Normal file
@@ -0,0 +1,266 @@
|
|||||||
|
"""
|
||||||
|
Session Manager for Google Reviews Scraper
|
||||||
|
|
||||||
|
Manages browser sessions between validation and scraping phases.
|
||||||
|
Allows reusing the same browser instance to avoid duplicate navigation.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# During validation
|
||||||
|
session_id = session_manager.create_session(driver, business_info, total_reviews)
|
||||||
|
return {"session_id": session_id, "business_info": business_info}
|
||||||
|
|
||||||
|
# During scraping (with session_id from validation)
|
||||||
|
session = session_manager.get_session(session_id)
|
||||||
|
if session:
|
||||||
|
driver = session['driver']
|
||||||
|
# Continue from where validation left off
|
||||||
|
"""
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
import time
|
||||||
|
import threading
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BrowserSession:
    """Represents a validated browser session ready for scraping.

    Bundles the live WebDriver with what was learned during validation,
    plus the timestamps and state the session manager uses to expire,
    claim and release it.
    """
    session_id: str  # key under which the manager stores this session
    driver: Any  # WebDriver instance
    url: str  # the validated Google Maps URL
    business_info: Dict[str, Any]  # extracted business details (a "name" key is read for logging)
    total_reviews: int  # total review count taken from the page
    created_at: float  # time.time() value at creation
    expires_at: float  # time.time() value after which the session counts as expired
    browser_fingerprint: Optional[Dict[str, Any]] = None  # fingerprint settings used, if any
    log_capture: Any = None  # LogCapture instance
    # Track session state
    state: str = "validated"  # validated -> scraping -> completed/expired
|
||||||
|
|
||||||
|
|
||||||
|
class SessionManager:
|
||||||
|
"""
|
||||||
|
Manages browser sessions between validation and scraping.
|
||||||
|
|
||||||
|
Sessions have a TTL (default 5 minutes) after which they're automatically
|
||||||
|
cleaned up and the browser is closed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
DEFAULT_TTL_SECONDS = 300 # 5 minutes
|
||||||
|
CLEANUP_INTERVAL_SECONDS = 30 # Check for expired sessions every 30s
|
||||||
|
|
||||||
|
def __init__(self, ttl_seconds: int = None):
|
||||||
|
self.ttl_seconds = ttl_seconds or self.DEFAULT_TTL_SECONDS
|
||||||
|
self._sessions: Dict[str, BrowserSession] = {}
|
||||||
|
self._lock = threading.RLock()
|
||||||
|
self._cleanup_thread: Optional[threading.Thread] = None
|
||||||
|
self._running = False
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
"""Start the background cleanup thread."""
|
||||||
|
if self._running:
|
||||||
|
return
|
||||||
|
self._running = True
|
||||||
|
self._cleanup_thread = threading.Thread(target=self._cleanup_loop, daemon=True)
|
||||||
|
self._cleanup_thread.start()
|
||||||
|
|
||||||
|
def stop(self):
|
||||||
|
"""Stop the background cleanup thread."""
|
||||||
|
self._running = False
|
||||||
|
if self._cleanup_thread:
|
||||||
|
self._cleanup_thread.join(timeout=5)
|
||||||
|
|
||||||
|
def _cleanup_loop(self):
|
||||||
|
"""Background loop to clean up expired sessions."""
|
||||||
|
while self._running:
|
||||||
|
try:
|
||||||
|
self._cleanup_expired()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[SessionManager] Cleanup error: {e}")
|
||||||
|
time.sleep(self.CLEANUP_INTERVAL_SECONDS)
|
||||||
|
|
||||||
|
def _cleanup_expired(self):
|
||||||
|
"""Remove expired sessions and close their browsers."""
|
||||||
|
now = time.time()
|
||||||
|
expired_ids = []
|
||||||
|
|
||||||
|
with self._lock:
|
||||||
|
for session_id, session in self._sessions.items():
|
||||||
|
if now > session.expires_at:
|
||||||
|
expired_ids.append(session_id)
|
||||||
|
|
||||||
|
for session_id in expired_ids:
|
||||||
|
self.release_session(session_id, reason="expired")
|
||||||
|
|
||||||
|
def create_session(
    self,
    driver: Any,
    url: str,
    business_info: Dict[str, Any],
    total_reviews: int,
    browser_fingerprint: Optional[Dict[str, Any]] = None,
    log_capture: Any = None,
    ttl_seconds: Optional[int] = None
) -> str:
    """
    Register a freshly validated browser as a reusable session.

    Args:
        driver: WebDriver instance (positioned on Google Maps page)
        url: The validated Google Maps URL
        business_info: Extracted business information
        total_reviews: Total review count from page
        browser_fingerprint: Browser fingerprint settings used
        log_capture: LogCapture instance for logging
        ttl_seconds: Lifetime for this session; a falsy value falls back
            to the manager's default TTL

    Returns:
        session_id: The first 8 characters of a random UUID4 string,
        under which the session is stored in state "validated"
    """
    lifetime = ttl_seconds or self.ttl_seconds
    created = time.time()
    short_id = str(uuid.uuid4())[:8]  # Short ID for easier use

    record = BrowserSession(
        session_id=short_id,
        driver=driver,
        url=url,
        business_info=business_info,
        total_reviews=total_reviews,
        created_at=created,
        expires_at=created + lifetime,
        browser_fingerprint=browser_fingerprint,
        log_capture=log_capture,
        state="validated"
    )

    with self._lock:
        self._sessions[short_id] = record

    business_name = business_info.get('name', 'unknown')
    print(f"[SessionManager] Created session {short_id} for {business_name} (TTL: {lifetime}s)")
    return short_id
|
||||||
|
|
||||||
|
def get_session(self, session_id: str) -> Optional[BrowserSession]:
    """
    Look up a live session.

    An expired session is released (its browser is closed) as a side
    effect of the lookup.

    Args:
        session_id: The session identifier

    Returns:
        BrowserSession if found and not expired, None otherwise
    """
    with self._lock:
        found = self._sessions.get(session_id)

        if found and not (time.time() > found.expires_at):
            return found

        if not found:
            print(f"[SessionManager] Session {session_id} not found")
            return None

        # Known but past its expiry time: drop it and report a miss.
        print(f"[SessionManager] Session {session_id} expired")
        self.release_session(session_id, reason="expired")
        return None
|
||||||
|
|
||||||
|
def claim_session(self, session_id: str) -> Optional[BrowserSession]:
    """
    Take ownership of a validated session for scraping.

    On success the session's state becomes "scraping" and its expiry is
    reset to one hour from now. A session in any state other than
    "validated" cannot be claimed.

    Args:
        session_id: The session identifier

    Returns:
        BrowserSession if successfully claimed, None otherwise
    """
    with self._lock:
        candidate = self.get_session(session_id)
        if not candidate:
            return None

        if candidate.state == "validated":
            candidate.state = "scraping"
            # Extend TTL during scraping (1 hour max)
            candidate.expires_at = time.time() + 3600

            print(f"[SessionManager] Claimed session {session_id} for scraping")
            return candidate

        print(f"[SessionManager] Session {session_id} already in state: {candidate.state}")
        return None
|
||||||
|
|
||||||
|
def release_session(self, session_id: str, reason: str = "completed"):
|
||||||
|
"""
|
||||||
|
Release a session and close the browser.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session_id: The session identifier
|
||||||
|
reason: Why the session is being released
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
session = self._sessions.pop(session_id, None)
|
||||||
|
|
||||||
|
if session:
|
||||||
|
print(f"[SessionManager] Releasing session {session_id} ({reason})")
|
||||||
|
try:
|
||||||
|
if session.driver:
|
||||||
|
session.driver.quit()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[SessionManager] Error closing driver for {session_id}: {e}")
|
||||||
|
|
||||||
|
def extend_session(self, session_id: str, additional_seconds: int = 300) -> bool:
|
||||||
|
"""
|
||||||
|
Extend a session's TTL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session_id: The session identifier
|
||||||
|
additional_seconds: Seconds to add to TTL
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if extended, False if session not found
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
session = self._sessions.get(session_id)
|
||||||
|
if not session:
|
||||||
|
return False
|
||||||
|
session.expires_at = time.time() + additional_seconds
|
||||||
|
return True
|
||||||
|
|
||||||
|
def get_stats(self) -> Dict[str, Any]:
|
||||||
|
"""Get session manager statistics."""
|
||||||
|
with self._lock:
|
||||||
|
now = time.time()
|
||||||
|
sessions = []
|
||||||
|
for sid, s in self._sessions.items():
|
||||||
|
sessions.append({
|
||||||
|
"session_id": sid,
|
||||||
|
"business": s.business_info.get("name", "unknown"),
|
||||||
|
"state": s.state,
|
||||||
|
"age_seconds": int(now - s.created_at),
|
||||||
|
"ttl_remaining": int(s.expires_at - now)
|
||||||
|
})
|
||||||
|
return {
|
||||||
|
"total_sessions": len(self._sessions),
|
||||||
|
"sessions": sessions
|
||||||
|
}
|
||||||
|
|
||||||
|
def list_sessions(self) -> list:
|
||||||
|
"""List all active sessions."""
|
||||||
|
with self._lock:
|
||||||
|
return list(self._sessions.keys())
|
||||||
|
|
||||||
|
|
||||||
|
# Global singleton instance
|
||||||
|
_session_manager: Optional[SessionManager] = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_session_manager() -> SessionManager:
    """
    Return the process-wide SessionManager, building it on first use.

    The first call constructs the manager, stores it in the module-level
    ``_session_manager`` and calls its ``start()`` method; later calls
    return the stored instance.
    """
    global _session_manager

    if _session_manager is not None:
        return _session_manager

    # Publish the instance before starting it, matching the original order.
    _session_manager = SessionManager()
    _session_manager.start()
    return _session_manager
|
||||||
@@ -732,7 +732,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
progress_callback=None, validation_only: bool = False,
|
progress_callback=None, validation_only: bool = False,
|
||||||
sort_strategy: str = SORT_AUTO, sort_order: List[str] = None,
|
sort_strategy: str = SORT_AUTO, sort_order: List[str] = None,
|
||||||
multi_sort_threshold: int = MULTI_SORT_THRESHOLD,
|
multi_sort_threshold: int = MULTI_SORT_THRESHOLD,
|
||||||
close_enough_pct: float = 95.0) -> dict:
|
close_enough_pct: float = 95.0, initial_sort: str = None) -> dict:
|
||||||
"""
|
"""
|
||||||
Scrape Google Maps reviews with optional multi-sort strategy.
|
Scrape Google Maps reviews with optional multi-sort strategy.
|
||||||
|
|
||||||
@@ -754,6 +754,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant)
|
sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant)
|
||||||
multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000)
|
multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000)
|
||||||
close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0)
|
close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0)
|
||||||
|
initial_sort: Initial sort order to use (default: newest). Used for retry with different sort
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict with reviews list and metadata
|
dict with reviews list and metadata
|
||||||
@@ -1381,8 +1382,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
log.info('browser', "Sort button found")
|
log.info('browser', "Sort button found")
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Track bot detection - if sort button hidden, Google likely detected bot
|
||||||
|
bot_detected = not sort_found
|
||||||
if not sort_found:
|
if not sort_found:
|
||||||
log.warn('browser', "Sort button not found after waiting, continuing without sorting")
|
log.warn('browser', "Sort button not found after waiting, continuing without sorting (bot detection likely)")
|
||||||
|
|
||||||
# Sort by specified order (default: newest)
|
# Sort by specified order (default: newest)
|
||||||
target_sort = initial_sort or SORT_NEWEST
|
target_sort = initial_sort or SORT_NEWEST
|
||||||
@@ -1815,6 +1818,71 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
}
|
}
|
||||||
text = longestText;
|
text = longestText;
|
||||||
|
|
||||||
|
// OWNER RESPONSE: Find by "Response from the owner" text anchor
|
||||||
|
var ownerResponse = null;
|
||||||
|
var ownerSpan = null;
|
||||||
|
var cardSpans = card.querySelectorAll('span');
|
||||||
|
for (var k = 0; k < cardSpans.length; k++) {
|
||||||
|
if (cardSpans[k].textContent.trim() === 'Response from the owner') {
|
||||||
|
ownerSpan = cardSpans[k];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ownerSpan) {
|
||||||
|
// Navigate: span -> header div -> container div
|
||||||
|
var headerDiv = ownerSpan.closest('div');
|
||||||
|
var respContainer = headerDiv ? headerDiv.parentElement : null;
|
||||||
|
|
||||||
|
if (respContainer) {
|
||||||
|
// Click expand button if exists and not expanded
|
||||||
|
var expandBtn = respContainer.querySelector('button[aria-label="See more"]');
|
||||||
|
if (expandBtn && expandBtn.getAttribute('aria-expanded') !== 'true') {
|
||||||
|
expandBtn.click();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get timestamp from header spans
|
||||||
|
var respTimestamp = '';
|
||||||
|
var headerSpans = headerDiv.querySelectorAll('span');
|
||||||
|
for (var m = 0; m < headerSpans.length; m++) {
|
||||||
|
var spanTxt = headerSpans[m].textContent.trim();
|
||||||
|
if (spanTxt.match(/ago$/i)) {
|
||||||
|
respTimestamp = spanTxt;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get response text from direct child div[lang]
|
||||||
|
var respText = '';
|
||||||
|
var langDivs = respContainer.children;
|
||||||
|
for (var m = 0; m < langDivs.length; m++) {
|
||||||
|
if (langDivs[m].tagName === 'DIV' && langDivs[m].hasAttribute('lang')) {
|
||||||
|
respText = langDivs[m].textContent.trim();
|
||||||
|
respText = respText.replace(/(More|Less)$/, '').trim();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: find longest text div that's not the header
|
||||||
|
if (!respText) {
|
||||||
|
for (var m = 0; m < langDivs.length; m++) {
|
||||||
|
if (langDivs[m].tagName === 'DIV') {
|
||||||
|
var divTxt = langDivs[m].textContent.trim();
|
||||||
|
if (divTxt.includes('Response from the owner')) continue;
|
||||||
|
divTxt = divTxt.replace(/(More|Less)$/, '').trim();
|
||||||
|
if (divTxt.length > respText.length) {
|
||||||
|
respText = divTxt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (respText) {
|
||||||
|
ownerResponse = {text: respText, timestamp: respTimestamp};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (author && rating >= 1 && rating <= 5) {
|
if (author && rating >= 1 && rating <= 5) {
|
||||||
results.push({
|
results.push({
|
||||||
id: rid,
|
id: rid,
|
||||||
@@ -1823,6 +1891,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
text: text,
|
text: text,
|
||||||
rating: rating,
|
rating: rating,
|
||||||
timestamp: timestamp,
|
timestamp: timestamp,
|
||||||
|
owner_response: ownerResponse,
|
||||||
source: 'dom'
|
source: 'dom'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -2198,6 +2267,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
topics_inferred_count += 1
|
topics_inferred_count += 1
|
||||||
log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
|
log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
|
||||||
|
|
||||||
|
# Include business info captured from Overview page
|
||||||
|
business_info = business_info_cache[0] or {}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
||||||
"total": grand_total,
|
"total": grand_total,
|
||||||
@@ -2209,10 +2281,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
"metrics_history": metrics_history, # For crash detection
|
"metrics_history": metrics_history, # For crash detection
|
||||||
"start_time": start_time, # For crash report elapsed time
|
"start_time": start_time, # For crash report elapsed time
|
||||||
"session_fingerprint": session_fingerprint, # Browser fingerprint for bot detection analysis
|
"session_fingerprint": session_fingerprint, # Browser fingerprint for bot detection analysis
|
||||||
|
"bot_detected": bot_detected if 'bot_detected' in dir() else False, # True if sort button was hidden
|
||||||
|
"initial_sort_used": target_sort if 'target_sort' in dir() else SORT_NEWEST, # Sort order used for first pass
|
||||||
"multi_sort": {
|
"multi_sort": {
|
||||||
"enabled": should_multi_sort if 'should_multi_sort' in dir() else False,
|
"enabled": should_multi_sort if 'should_multi_sort' in dir() else False,
|
||||||
"completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST],
|
"completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST],
|
||||||
"first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total
|
"first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total
|
||||||
|
},
|
||||||
|
# Business info captured from Google Maps page
|
||||||
|
"business_info": {
|
||||||
|
"name": business_info.get("name"),
|
||||||
|
"category": business_info.get("category"),
|
||||||
|
"address": business_info.get("address"),
|
||||||
|
"rating": business_info.get("rating")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2220,7 +2301,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||||
progress_callback=None, driver=None, return_driver: bool = False,
|
progress_callback=None, driver=None, return_driver: bool = False,
|
||||||
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
|
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
|
||||||
browser_fingerprint: dict = None):
|
browser_fingerprint: dict = None, initial_sort: str = None,
|
||||||
|
sort_strategy: str = SORT_AUTO, max_reviews: int = None):
|
||||||
"""
|
"""
|
||||||
Production-compatible wrapper for scrape_reviews.
|
Production-compatible wrapper for scrape_reviews.
|
||||||
Matches the API expected by job_manager.py.
|
Matches the API expected by job_manager.py.
|
||||||
@@ -2240,6 +2322,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
- timezone: string (e.g., "Europe/Madrid")
|
- timezone: string (e.g., "Europe/Madrid")
|
||||||
- language: string (e.g., "en-US")
|
- language: string (e.g., "en-US")
|
||||||
- platform: string (e.g., "MacIntel")
|
- platform: string (e.g., "MacIntel")
|
||||||
|
initial_sort: Initial sort order to use ("newest", "lowest", "highest", "relevant")
|
||||||
|
Used for retry with different sort strategy
|
||||||
|
sort_strategy: Sort strategy ("auto", "multi", "single", or specific sort)
|
||||||
|
max_reviews: Maximum reviews to collect (for testing). None = unlimited (default: 5000)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
||||||
@@ -2329,13 +2415,15 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
result = scrape_reviews(
|
result = scrape_reviews(
|
||||||
driver=driver,
|
driver=driver,
|
||||||
url=url,
|
url=url,
|
||||||
max_reviews=999999, # Effectively unlimited
|
max_reviews=max_reviews if max_reviews else 999999, # Unlimited by default, or custom limit for testing
|
||||||
timeout_no_new=15,
|
timeout_no_new=15,
|
||||||
flush_callback=internal_flush,
|
flush_callback=internal_flush,
|
||||||
flush_batch_size=100, # Smaller batches for more frequent progress
|
flush_batch_size=100, # Smaller batches for more frequent progress
|
||||||
log_capture=log_capture,
|
log_capture=log_capture,
|
||||||
progress_callback=progress_callback, # Pass through for real-time log updates
|
progress_callback=progress_callback, # Pass through for real-time log updates
|
||||||
validation_only=validation_only # Return early if just validating
|
validation_only=validation_only, # Return early if just validating
|
||||||
|
sort_strategy=sort_strategy, # Sort strategy (auto, multi, single)
|
||||||
|
initial_sort=initial_sort # Initial sort order for retry with different sort
|
||||||
)
|
)
|
||||||
|
|
||||||
elapsed = time.time() - start_time
|
elapsed = time.time() - start_time
|
||||||
@@ -2350,7 +2438,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
"error": None,
|
"error": None,
|
||||||
"logs": result.get("logs", []),
|
"logs": result.get("logs", []),
|
||||||
"review_topics": result.get("review_topics", []), # Topic filters with mention counts
|
"review_topics": result.get("review_topics", []), # Topic filters with mention counts
|
||||||
"session_fingerprint": result.get("session_fingerprint") # Browser fingerprint for bot detection
|
"session_fingerprint": result.get("session_fingerprint"), # Browser fingerprint for bot detection
|
||||||
|
# Tracking info for retry strategy
|
||||||
|
"bot_detected": result.get("bot_detected", False), # True if sort button was hidden by Google
|
||||||
|
"initial_sort_used": result.get("initial_sort_used", "newest"), # Sort order used
|
||||||
|
"multi_sort": result.get("multi_sort", {}), # Multi-sort completion info
|
||||||
|
# Business info captured from Google Maps page
|
||||||
|
"business_info": result.get("business_info", {})
|
||||||
}
|
}
|
||||||
|
|
||||||
# Include validation_info if in validation_only mode
|
# Include validation_info if in validation_only mode
|
||||||
|
|||||||
3264
scrapers/google_reviews/v1_2_0.py
Normal file
3264
scrapers/google_reviews/v1_2_0.py
Normal file
File diff suppressed because it is too large
Load Diff
87
scripts/backfill_business_category.py
Normal file
87
scripts/backfill_business_category.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Backfill missing business_category for existing jobs.
|
||||||
|
Uses validation_only mode to quickly capture business info without re-scraping reviews.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import asyncpg
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add project root to path
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews
|
||||||
|
|
||||||
|
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
|
||||||
|
|
||||||
|
|
||||||
|
async def backfill_categories():
    """
    Fill in business_category for completed jobs that lack one.

    For every such job a validation-only scrape is run in a worker thread;
    when the scrape's validation_info carries a category, the job row is
    updated. Per-job failures are printed and counted, and do not stop the
    run. The database connection is always closed at the end.
    """
    db = await asyncpg.connect(DATABASE_URL)

    try:
        # Completed jobs that still have no category, newest first
        pending = await db.fetch("""
            SELECT job_id, url, business_name
            FROM jobs
            WHERE business_category IS NULL
            AND status = 'completed'
            ORDER BY created_at DESC
        """)

        print(f"Found {len(pending)} jobs missing business_category\n")

        ok_count = 0
        fail_count = 0

        for job in pending:
            job_id = job['job_id']
            url = job['url']
            label = job['business_name'] or 'Unknown'

            print(f"Processing: {label[:50]}...")

            try:
                # The scraper is blocking, so it runs off the event loop.
                scrape = await asyncio.to_thread(
                    fast_scrape_reviews,
                    url=url,
                    headless=True,
                    validation_only=True
                )

                found = scrape.get('validation_info', {}).get('category')

                if not found:
                    print(f" ✗ No category found")
                    fail_count += 1
                    continue

                await db.execute("""
                    UPDATE jobs
                    SET business_category = $2,
                    updated_at = NOW()
                    WHERE job_id = $1
                """, job_id, found)

                print(f" ✓ Category: {found}")
                ok_count += 1

            except Exception as e:
                # Best effort: report the job as failed and move on.
                print(f" ✗ Error: {e}")
                fail_count += 1

        print(f"\n{'='*50}")
        print(f"Done! Updated: {ok_count}, Failed: {fail_count}")

    finally:
        await db.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
asyncio.run(backfill_categories())
|
||||||
142
scripts/register_reputation_pipeline.py
Normal file
142
scripts/register_reputation_pipeline.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Register the Reputation Pipeline in the pipeline registry.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/register_reputation_pipeline.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
# Database URL
|
||||||
|
DB_URL = os.environ.get(
|
||||||
|
"DATABASE_URL",
|
||||||
|
"postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def register_pipeline():
    """
    Register the Reputation and ReviewIQ pipelines in pipeline.registry.

    Creates the registry table when it is missing, upserts one row per
    pipeline (existing rows are overwritten and their updated_at refreshed),
    then prints every registered pipeline. The connection is always closed.
    """
    # One parameterized upsert shared by every pipeline, so the column list
    # and the ON CONFLICT clause cannot drift apart between registrations.
    upsert_sql = """
        INSERT INTO pipeline.registry (
            pipeline_id,
            name,
            description,
            version,
            module_path,
            stages,
            input_type,
            is_enabled,
            updated_at
        )
        VALUES ($1, $2, $3, $4, $5, $6::text[], $7, TRUE, NOW())
        ON CONFLICT (pipeline_id) DO UPDATE SET
            name = EXCLUDED.name,
            description = EXCLUDED.description,
            version = EXCLUDED.version,
            module_path = EXCLUDED.module_path,
            stages = EXCLUDED.stages,
            input_type = EXCLUDED.input_type,
            is_enabled = EXCLUDED.is_enabled,
            updated_at = NOW()
    """

    # (pipeline_id, name, description, version, module_path, stages, input_type)
    pipelines = [
        (
            'reputation',
            'Reputation Analytics Pipeline',
            'Primitives-based classification and reputation scoring. Generates business-facing analytics reports with domain breakdown, key drivers, and actionable insights.',
            '2.0.0',
            'reviewiq_pipeline.reputation_pipeline:ReputationPipeline',
            ['classify', 'report'],
            'BusinessInput',
        ),
        (
            'reviewiq',
            'ReviewIQ Classification Pipeline',
            'Classifies reviews using URT taxonomy, detects issues, and aggregates metrics for dashboards.',
            '1.0.0',
            'reviewiq_pipeline.pipeline:ReviewIQPipeline',
            ['normalize', 'classify', 'route', 'aggregate', 'synthesize'],
            'ScraperV1Output',
        ),
    ]

    print("Connecting to database...")

    conn = await asyncpg.connect(DB_URL)

    try:
        # Ensure the registry table exists.
        # NOTE(review): this assumes the `pipeline` schema already exists;
        # CREATE TABLE fails otherwise - confirm it is created by a migration.
        await conn.execute("""
            CREATE TABLE IF NOT EXISTS pipeline.registry (
                pipeline_id VARCHAR(50) PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                description TEXT,
                version VARCHAR(50) NOT NULL,
                module_path VARCHAR(500) NOT NULL,
                stages TEXT[] NOT NULL DEFAULT '{}',
                input_type VARCHAR(100),
                config JSONB,
                is_enabled BOOLEAN NOT NULL DEFAULT TRUE,
                created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
                updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
            )
        """)

        # Register each pipeline in turn (reputation first, then reviewiq)
        for spec in pipelines:
            await conn.execute(upsert_sql, *spec)
            print(f"✓ Registered '{spec[0]}' pipeline")

        # List all registered pipelines
        rows = await conn.fetch("""
            SELECT pipeline_id, name, version, is_enabled, stages
            FROM pipeline.registry
            ORDER BY name
        """)

        print("\n📋 Registered Pipelines:")
        print("-" * 80)
        for row in rows:
            status = "✓ enabled" if row["is_enabled"] else "✗ disabled"
            stages = ", ".join(row["stages"]) if row["stages"] else "none"
            print(f" {row['pipeline_id']:20} v{row['version']:10} {status}")
            print(f" → {row['name']}")
            print(f" → Stages: {stages}")
            print()

    finally:
        await conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(register_pipeline())
|
||||||
414
scripts/resolve_job_categories.py
Normal file
414
scripts/resolve_job_categories.py
Normal file
@@ -0,0 +1,414 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Resolve GBP taxonomy categories for all jobs.
|
||||||
|
Uses exact match, LLM match, or hierarchical classification.
|
||||||
|
|
||||||
|
Usage: source .env && python scripts/resolve_job_categories.py
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ResolvedCategory:
    """Result of category resolution."""
    category_id: int  # `id` of the matched gbp_categories row
    path: str  # taxonomy path of the row, as text
    name: str  # category name as stored in the taxonomy
    level: int  # taxonomy level of the matched row
    method: str  # 'exact', 'fuzzy', 'llm', 'hierarchical'
    confidence: float  # 1.0 for exact matches; lower for fuzzy/LLM/hierarchical results
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleLLM:
    """Thin OpenAI chat-completions wrapper used for category resolution."""

    def __init__(self):
        # Built with no arguments, so configuration is left to the client's defaults.
        self.client = OpenAI()

    async def complete(self, prompt: str, max_tokens: int = 50, temperature: float = 0) -> str:
        """
        Send ``prompt`` as a single user message and return the reply text.

        The blocking client call runs in a worker thread. The text of the
        first choice is returned with surrounding whitespace removed.
        """
        def _request():
            return self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
            )

        reply = await asyncio.to_thread(_request)
        first_choice = reply.choices[0]
        return first_choice.message.content.strip()
|
||||||
|
|
||||||
|
|
||||||
|
class CategoryResolver:
|
||||||
|
"""Resolves business categories to GBP taxonomy nodes."""
|
||||||
|
|
||||||
|
def __init__(self, pool: asyncpg.Pool, llm: SimpleLLM):
|
||||||
|
self.pool = pool
|
||||||
|
self.llm = llm
|
||||||
|
self._level1_cache: list[dict] = []
|
||||||
|
self._level2_cache: dict[str, list[dict]] = {}
|
||||||
|
self._level3_cache: dict[str, list[dict]] = {}
|
||||||
|
|
||||||
|
async def resolve(
|
||||||
|
self,
|
||||||
|
google_category: Optional[str] = None,
|
||||||
|
business_name: Optional[str] = None,
|
||||||
|
business_address: Optional[str] = None
|
||||||
|
) -> Optional[ResolvedCategory]:
|
||||||
|
"""Resolve to the deepest taxonomy node."""
|
||||||
|
|
||||||
|
# Phase 1: Exact match
|
||||||
|
if google_category:
|
||||||
|
result = await self._exact_match(google_category)
|
||||||
|
if result:
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Phase 2: LLM match
|
||||||
|
result = await self._llm_match(google_category)
|
||||||
|
if result:
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Phase 3: Hierarchical classification
|
||||||
|
if business_name:
|
||||||
|
result = await self._hierarchical_classify(
|
||||||
|
business_name=business_name,
|
||||||
|
business_address=business_address,
|
||||||
|
google_category=google_category
|
||||||
|
)
|
||||||
|
if result:
|
||||||
|
return result
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _exact_match(self, google_category: str) -> Optional[ResolvedCategory]:
|
||||||
|
"""Try exact match against taxonomy."""
|
||||||
|
async with self.pool.acquire() as conn:
|
||||||
|
# Exact match (case-insensitive)
|
||||||
|
row = await conn.fetchrow("""
|
||||||
|
SELECT id, name, path::text as path, level
|
||||||
|
FROM gbp_categories
|
||||||
|
WHERE LOWER(name) = LOWER($1) AND level = 3
|
||||||
|
""", google_category)
|
||||||
|
|
||||||
|
if row:
|
||||||
|
return ResolvedCategory(
|
||||||
|
category_id=row['id'],
|
||||||
|
path=row['path'],
|
||||||
|
name=row['name'],
|
||||||
|
level=row['level'],
|
||||||
|
method='exact',
|
||||||
|
confidence=1.0
|
||||||
|
)
|
||||||
|
|
||||||
|
# Trigram similarity match (handles typos, slight variations)
|
||||||
|
# Threshold 0.7 = high confidence only, else fall through to LLM
|
||||||
|
row = await conn.fetchrow("""
|
||||||
|
SELECT id, name, path::text as path, level,
|
||||||
|
similarity(LOWER(name), LOWER($1)) as sim
|
||||||
|
FROM gbp_categories
|
||||||
|
WHERE level = 3 AND similarity(LOWER(name), LOWER($1)) > 0.7
|
||||||
|
ORDER BY sim DESC
|
||||||
|
LIMIT 1
|
||||||
|
""", google_category)
|
||||||
|
|
||||||
|
if row:
|
||||||
|
return ResolvedCategory(
|
||||||
|
category_id=row['id'],
|
||||||
|
path=row['path'],
|
||||||
|
name=row['name'],
|
||||||
|
level=row['level'],
|
||||||
|
method='fuzzy',
|
||||||
|
confidence=float(row['sim'])
|
||||||
|
)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _llm_match(self, google_category: str) -> Optional[ResolvedCategory]:
|
||||||
|
"""Use LLM to match Google category to taxonomy."""
|
||||||
|
# Synonym expansion for common variations
|
||||||
|
SYNONYMS = {
|
||||||
|
'shop': ['store', 'shop', 'outlet'],
|
||||||
|
'store': ['store', 'shop', 'outlet'],
|
||||||
|
'house': ['house', 'home'],
|
||||||
|
'home': ['house', 'home'],
|
||||||
|
'office': ['office', 'clinic', 'center'],
|
||||||
|
'clinic': ['clinic', 'office', 'center'],
|
||||||
|
'center': ['center', 'centre'],
|
||||||
|
'centre': ['center', 'centre'],
|
||||||
|
'repair': ['repair', 'service', 'maintenance'],
|
||||||
|
}
|
||||||
|
|
||||||
|
async with self.pool.acquire() as conn:
|
||||||
|
# Get candidates using multiple strategies:
|
||||||
|
# 1. Word matches with synonym expansion
|
||||||
|
# 2. Trigram similarity
|
||||||
|
words = google_category.lower().split()
|
||||||
|
expanded_words = set()
|
||||||
|
for w in words:
|
||||||
|
if len(w) > 2:
|
||||||
|
expanded_words.add(w)
|
||||||
|
if w in SYNONYMS:
|
||||||
|
expanded_words.update(SYNONYMS[w])
|
||||||
|
|
||||||
|
word_conditions = " OR ".join([f"LOWER(name) LIKE '%{w}%'" for w in expanded_words])
|
||||||
|
primary_word = google_category.lower().split()[0] # First word is usually most important
|
||||||
|
|
||||||
|
# Order by: starts with primary word, then by similarity
|
||||||
|
candidates = await conn.fetch(f"""
|
||||||
|
SELECT DISTINCT id, name, path::text as path, level,
|
||||||
|
CASE WHEN LOWER(name) LIKE $2 THEN 1 ELSE 0 END as starts_with,
|
||||||
|
similarity(LOWER(name), LOWER($1)) as sim
|
||||||
|
FROM gbp_categories
|
||||||
|
WHERE level = 3 AND (
|
||||||
|
({word_conditions if word_conditions else 'FALSE'})
|
||||||
|
OR similarity(LOWER(name), LOWER($1)) > 0.3
|
||||||
|
)
|
||||||
|
ORDER BY starts_with DESC, sim DESC
|
||||||
|
LIMIT 20
|
||||||
|
""", google_category, f"{primary_word}%")
|
||||||
|
|
||||||
|
if not candidates:
|
||||||
|
return None
|
||||||
|
|
||||||
|
candidate_list = "\n".join([f"- {c['name']}" for c in candidates])
|
||||||
|
|
||||||
|
prompt = f"""Match business category "{google_category}" to the closest option.
|
||||||
|
Synonyms: shop=store, house=cafe/home, office=clinic/center
|
||||||
|
|
||||||
|
Options:
|
||||||
|
{candidate_list}
|
||||||
|
|
||||||
|
Reply with ONLY the exact category name from the list."""
|
||||||
|
|
||||||
|
response = await self.llm.complete(prompt, max_tokens=30)
|
||||||
|
selected = response.strip().strip('"').strip("'")
|
||||||
|
|
||||||
|
if selected.upper() == "NONE":
|
||||||
|
return None
|
||||||
|
|
||||||
|
for c in candidates:
|
||||||
|
if c['name'].lower() == selected.lower():
|
||||||
|
return ResolvedCategory(
|
||||||
|
category_id=c['id'],
|
||||||
|
path=c['path'],
|
||||||
|
name=c['name'],
|
||||||
|
level=c['level'],
|
||||||
|
method='llm',
|
||||||
|
confidence=0.85
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fuzzy match selected name to candidates
|
||||||
|
for c in candidates:
|
||||||
|
if selected.lower() in c['name'].lower() or c['name'].lower() in selected.lower():
|
||||||
|
return ResolvedCategory(
|
||||||
|
category_id=c['id'],
|
||||||
|
path=c['path'],
|
||||||
|
name=c['name'],
|
||||||
|
level=c['level'],
|
||||||
|
method='llm',
|
||||||
|
confidence=0.75
|
||||||
|
)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _hierarchical_classify(
    self,
    business_name: str,
    business_address: Optional[str] = None,
    google_category: Optional[str] = None
) -> Optional[ResolvedCategory]:
    """Classify a business by descending the taxonomy one level at a time.

    At each of the three levels the candidate categories are loaded with
    ``_get_categories`` (restricted to the previous pick's path below level 1)
    and ``_llm_select`` chooses one of them.

    Args:
        business_name: Always included in the LLM context.
        business_address: Added to the context when truthy.
        google_category: Added to the context as a hint when truthy.

    Returns:
        A ``ResolvedCategory`` built from the level-3 pick with
        ``method='hierarchical'`` and ``confidence=0.7``, or ``None`` as soon
        as any level yields no selection.
    """
    context_lines = [f"Business: {business_name}"]
    if business_address:
        context_lines.append(f"Address: {business_address}")
    if google_category:
        context_lines.append(f"Hint: {google_category}")
    context = "\n".join(context_lines)

    # (taxonomy level, wording used for that level in the prompt)
    steps = ((1, "sector"), (2, "business type"), (3, "specific category"))

    chosen = None
    for depth, label in steps:
        if chosen is None:
            # Top level: no parent to restrict by.
            options = await self._get_categories(depth)
            chosen = await self._llm_select(context, options, label)
        else:
            # Deeper levels: only look underneath the previous selection.
            options = await self._get_categories(depth, chosen['path'])
            chosen = await self._llm_select(context, options, label, chosen['name'])
        if not chosen:
            return None

    return ResolvedCategory(
        category_id=chosen['id'],
        path=chosen['path'],
        name=chosen['name'],
        level=chosen['level'],
        method='hierarchical',
        confidence=0.7
    )
|
||||||
|
|
||||||
|
async def _get_categories(self, level: int, parent_path: str = None) -> list[dict]:
|
||||||
|
"""Get categories at level, optionally under parent."""
|
||||||
|
async with self.pool.acquire() as conn:
|
||||||
|
if parent_path:
|
||||||
|
rows = await conn.fetch("""
|
||||||
|
SELECT id, name, path::text as path, level
|
||||||
|
FROM gbp_categories
|
||||||
|
WHERE level = $1 AND path <@ $2::ltree
|
||||||
|
ORDER BY name
|
||||||
|
""", level, parent_path)
|
||||||
|
else:
|
||||||
|
rows = await conn.fetch("""
|
||||||
|
SELECT id, name, path::text as path, level
|
||||||
|
FROM gbp_categories
|
||||||
|
WHERE level = $1
|
||||||
|
ORDER BY name
|
||||||
|
""", level)
|
||||||
|
return [dict(r) for r in rows]
|
||||||
|
|
||||||
|
async def _llm_select(
|
||||||
|
self,
|
||||||
|
context: str,
|
||||||
|
categories: list[dict],
|
||||||
|
level_name: str,
|
||||||
|
parent: str = None
|
||||||
|
) -> Optional[dict]:
|
||||||
|
"""Ask LLM to select best category."""
|
||||||
|
if not categories:
|
||||||
|
return None
|
||||||
|
if len(categories) == 1:
|
||||||
|
return categories[0]
|
||||||
|
|
||||||
|
cat_list = "\n".join([f"- {c['name']}" for c in categories])
|
||||||
|
parent_ctx = f" within {parent}" if parent else ""
|
||||||
|
|
||||||
|
prompt = f"""{context}
|
||||||
|
|
||||||
|
Select the most appropriate {level_name}{parent_ctx}.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
{cat_list}
|
||||||
|
|
||||||
|
Respond with ONLY the exact name from the list."""
|
||||||
|
|
||||||
|
response = await self.llm.complete(prompt)
|
||||||
|
selected = response.strip().strip('"').strip("'")
|
||||||
|
|
||||||
|
for c in categories:
|
||||||
|
if c['name'].lower() == selected.lower():
|
||||||
|
return c
|
||||||
|
|
||||||
|
# Fuzzy fallback
|
||||||
|
for c in categories:
|
||||||
|
if selected.lower() in c['name'].lower():
|
||||||
|
return c
|
||||||
|
|
||||||
|
return categories[0] if categories else None
|
||||||
|
|
||||||
|
async def main():
    """Back-fill taxonomy categories for completed jobs that have none yet.

    Loads every ``jobs`` row with ``status = 'completed'`` and a NULL
    ``gbp_category_path``, runs ``CategoryResolver.resolve`` on each, writes a
    successful result back to the row, and finally prints the ten most recent
    resolved jobs. A failure on one job is reported and counted; it does not
    stop the run. The connection pool is always closed on exit.
    """
    # Connect to database
    pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=5)

    # Initialize LLM client
    llm = SimpleLLM()

    try:
        # Get jobs needing category resolution
        async with pool.acquire() as conn:
            jobs = await conn.fetch("""
                SELECT job_id, business_name, business_category, business_address
                FROM jobs
                WHERE status = 'completed'
                AND gbp_category_path IS NULL
                ORDER BY created_at DESC
            """)

        print(f"Found {len(jobs)} jobs needing category resolution\n")

        resolver = CategoryResolver(pool, llm)

        resolved = 0
        failed = 0

        for job in jobs:
            job_id = str(job['job_id'])
            name = job['business_name'] or 'Unknown'
            google_cat = job['business_category']
            address = job['business_address']

            print(f"Processing: {name[:50]}...")
            if google_cat:
                print(f" Google category: {google_cat}")

            try:
                result = await resolver.resolve(
                    google_category=google_cat,
                    business_name=name,
                    business_address=address
                )

                if result:
                    # Determine source: google if they had a category, inferred if we used business name
                    category_source = 'google' if google_cat else 'inferred'

                    # Save to database
                    async with pool.acquire() as conn:
                        await conn.execute("""
                            UPDATE jobs
                            SET gbp_category_id = $2,
                                gbp_category_path = $3::ltree,
                                category_resolution_method = $4,
                                business_category_source = $5,
                                updated_at = NOW()
                            WHERE job_id = $1::uuid
                        """, job_id, result.category_id, result.path, result.method, category_source)

                    print(f" ✓ Resolved: {result.path} ({result.method}, source={category_source})")
                    resolved += 1
                else:
                    print(" ✗ Could not resolve")
                    failed += 1

            except Exception as e:
                # Deliberate best-effort: one bad job must not abort the batch.
                print(f" ✗ Error: {e}")
                failed += 1

        print(f"\n{'='*50}")
        print(f"Done! Resolved: {resolved}, Failed: {failed}")

        # Show results
        async with pool.acquire() as conn:
            results = await conn.fetch("""
                SELECT business_name, business_category,
                       gbp_category_path::text as resolved_path,
                       category_resolution_method,
                       business_category_source
                FROM jobs
                WHERE status = 'completed' AND gbp_category_path IS NOT NULL
                ORDER BY created_at DESC
                LIMIT 10
            """)

        print(f"\n{'='*50}")
        print("Recent resolved categories:")
        for r in results:
            # business_name may be NULL (the processing loop above already
            # treats it that way); slicing None would raise TypeError.
            shown_name = (r['business_name'] or 'Unknown')[:30]
            shown_category = r['business_category'] or '-'
            source = r['business_category_source'] or '-'
            print(f" {shown_name:30} | {shown_category:20} | {source:8} -> {r['resolved_path']} ({r['category_resolution_method']})")

    finally:
        await pool.close()


if __name__ == '__main__':
    asyncio.run(main())
|
||||||
@@ -68,8 +68,11 @@ export default function AnalyticsDetailPage() {
|
|||||||
: `/api/jobs/${jobId}/reviews?limit=10000`;
|
: `/api/jobs/${jobId}/reviews?limit=10000`;
|
||||||
|
|
||||||
fetch(url)
|
fetch(url)
|
||||||
.then(res => {
|
.then(async res => {
|
||||||
if (!res.ok) throw new Error('Failed to fetch reviews');
|
if (!res.ok) {
|
||||||
|
const errorData = await res.json().catch(() => ({}));
|
||||||
|
throw new Error(errorData.error || `Failed to fetch reviews (${res.status})`);
|
||||||
|
}
|
||||||
return res.json();
|
return res.json();
|
||||||
})
|
})
|
||||||
.then(data => {
|
.then(data => {
|
||||||
|
|||||||
63
web/app/api/categories/route.ts
Normal file
63
web/app/api/categories/route.ts
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
// Base URL of the scraper API that this route proxies to.
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';

// Database connection string, taken from the environment only. No fallback is
// provided so that credentials are never embedded in source control.
// NOTE(review): nothing in this file reads DB_URL - confirm whether it can be removed.
const DB_URL = process.env.DATABASE_URL;
|
||||||
|
|
||||||
|
/**
 * Loads the category tree from the API server's `/categories/tree` endpoint.
 *
 * Despite the name, no database connection is opened here: the request goes
 * to the API server, with response caching disabled.
 *
 * @returns the parsed JSON body of the response
 * @throws Error when the API answers with a non-2xx status
 */
async function fetchCategoriesFromDB() {
  const treeResponse = await fetch(`${API_BASE_URL}/categories/tree`, {
    cache: 'no-store',
  });

  if (treeResponse.ok) {
    return treeResponse.json();
  }

  throw new Error('Failed to fetch categories from API');
}
|
||||||
|
|
||||||
|
/**
 * GET /api/categories
 *
 * Forwards the `search`, `parent` and `level` query parameters (only those
 * with a non-empty value) to the backend `/categories` endpoint and relays
 * its JSON body.
 *
 * When the backend answers with a non-2xx status, an empty category list with
 * the message "API not available" is returned instead of an error status.
 * Any thrown error yields a 500 with an empty category list.
 */
export async function GET(request: NextRequest) {
  try {
    const incoming = request.nextUrl.searchParams;

    // Copy across only the parameters the backend understands.
    const forwarded = new URLSearchParams();
    for (const key of ['search', 'parent', 'level']) {
      const value = incoming.get(key);
      if (value) {
        forwarded.set(key, value);
      }
    }

    const backendResponse = await fetch(
      `${API_BASE_URL}/categories?${forwarded.toString()}`,
      {
        cache: 'no-store',
        headers: {
          'Content-Type': 'application/json',
        },
      }
    );

    if (backendResponse.ok) {
      return NextResponse.json(await backendResponse.json());
    }

    // Fallback: return an empty result rather than an error status
    console.error('API not available, returning mock data');
    return NextResponse.json({
      categories: [],
      total: 0,
      message: 'API not available'
    });
  } catch (error) {
    console.error('Error fetching categories:', error);
    return NextResponse.json(
      { error: 'Failed to fetch categories', categories: [], total: 0 },
      { status: 500 }
    );
  }
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function POST(request: NextRequest) {
|
export async function POST(request: NextRequest) {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
// GET /api/jobs/[jobId]/compare?previous=<previousJobId>
|
// GET /api/jobs/[jobId]/compare?previous=<previousJobId>
|
||||||
// Returns reviews from current job with a flag indicating if they're new
|
// Returns reviews from current job with a flag indicating if they're new
|
||||||
@@ -16,8 +16,10 @@ export async function GET(
|
|||||||
// Fetch current job reviews
|
// Fetch current job reviews
|
||||||
const currentResponse = await fetch(`${API_BASE_URL}/jobs/${jobId}/reviews?limit=10000`);
|
const currentResponse = await fetch(`${API_BASE_URL}/jobs/${jobId}/reviews?limit=10000`);
|
||||||
if (!currentResponse.ok) {
|
if (!currentResponse.ok) {
|
||||||
|
const errorText = await currentResponse.text().catch(() => '');
|
||||||
|
console.error(`Failed to get current job reviews: ${currentResponse.status} - ${errorText}`);
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
{ error: 'Failed to get current job reviews' },
|
{ error: `Failed to get reviews for job ${jobId} (${currentResponse.status})` },
|
||||||
{ status: currentResponse.status }
|
{ status: currentResponse.status }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* GET /api/jobs/[jobId]/crash-report
|
* GET /api/jobs/[jobId]/crash-report
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* POST /api/jobs/[jobId]/retry
|
* POST /api/jobs/[jobId]/retry
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest } from 'next/server';
|
import { NextRequest } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export const dynamic = 'force-dynamic';
|
export const dynamic = 'force-dynamic';
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,19 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(request: NextRequest) {
|
export async function GET(request: NextRequest) {
|
||||||
try {
|
try {
|
||||||
const { searchParams } = new URL(request.url);
|
const { searchParams } = new URL(request.url);
|
||||||
const limit = searchParams.get('limit') || '100';
|
const limit = searchParams.get('limit') || '100';
|
||||||
|
const status = searchParams.get('status');
|
||||||
|
|
||||||
const response = await fetch(`${API_BASE_URL}/jobs?limit=${limit}`);
|
let url = `${API_BASE_URL}/jobs?limit=${limit}`;
|
||||||
|
if (status) {
|
||||||
|
url += `&status=${status}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const response = await fetch(url);
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest } from 'next/server';
|
import { NextRequest } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export const dynamic = 'force-dynamic';
|
export const dynamic = 'force-dynamic';
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function POST(
|
export async function POST(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(
|
export async function GET(
|
||||||
request: NextRequest,
|
request: NextRequest,
|
||||||
|
|||||||
42
web/app/api/pipelines/reviewiq/analytics/route.ts
Normal file
42
web/app/api/pipelines/reviewiq/analytics/route.ts
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
|
/**
 * GET /api/pipelines/reviewiq/analytics
 *
 * Proxies the request to the backend ReviewIQ analytics endpoint, passing the
 * incoming query string through unchanged. A backend error is relayed with
 * its status code and its `detail` field (or a generic message when the error
 * body has none or is not JSON). Any thrown error yields a 500.
 */
export async function GET(request: NextRequest) {
  try {
    // Forward query parameters
    const query = request.nextUrl.searchParams.toString();
    const suffix = query ? `?${query}` : '';

    const backendResponse = await fetch(
      `${API_BASE_URL}/api/pipelines/reviewiq/analytics${suffix}`,
      {
        method: 'GET',
        headers: {
          'Content-Type': 'application/json',
        },
        cache: 'no-store',
      }
    );

    if (backendResponse.ok) {
      return NextResponse.json(await backendResponse.json());
    }

    // The error body may not be JSON; fall back to an empty object.
    const failure = await backendResponse.json().catch(() => ({}));
    return NextResponse.json(
      { detail: failure.detail || `Backend error: ${backendResponse.status}` },
      { status: backendResponse.status }
    );
  } catch (error) {
    console.error('ReviewIQ analytics proxy error:', error);
    return NextResponse.json(
      { detail: 'Failed to fetch analytics data' },
      { status: 500 }
    );
  }
}
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
|
/**
 * Proxy route for fetching spans related to an issue.
 * GET /api/pipelines/reviewiq/issues/[issueId]/spans
 *
 * A backend error is relayed with its status code and its `detail` field (or
 * a generic message when the error body has none or is not JSON). Any thrown
 * error yields a 500.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ issueId: string }> }
) {
  try {
    const { issueId } = await params;
    // Encode the id so that characters such as "/", "?" or "#" cannot change
    // which backend path is requested (same treatment as the reviews route).
    const url = `${API_BASE_URL}/api/pipelines/reviewiq/issues/${encodeURIComponent(issueId)}/spans`;

    const response = await fetch(url, {
      method: 'GET',
      headers: {
        'Content-Type': 'application/json',
      },
      cache: 'no-store',
    });

    if (!response.ok) {
      // The error body may not be JSON; fall back to an empty object.
      const errorData = await response.json().catch(() => ({}));
      return NextResponse.json(
        { detail: errorData.detail || `Backend error: ${response.status}` },
        { status: response.status }
      );
    }

    const data = await response.json();
    return NextResponse.json(data);
  } catch (error) {
    console.error('Issue spans proxy error:', error);
    return NextResponse.json(
      { detail: 'Failed to fetch issue spans' },
      { status: 500 }
    );
  }
}
|
||||||
43
web/app/api/pipelines/reviewiq/reviews/[reviewId]/route.ts
Normal file
43
web/app/api/pipelines/reviewiq/reviews/[reviewId]/route.ts
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE = process.env.API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
|
/**
 * GET /api/pipelines/reviewiq/reviews/[reviewId]
 *
 * Proxies to the backend to fetch a single review. The optional `source`
 * query parameter defaults to "google". A backend error is relayed with its
 * status code and its raw text body; any thrown error yields a 500.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ reviewId: string }> }
) {
  const { reviewId } = await params;
  const source = new URL(request.url).searchParams.get('source') || 'google';

  try {
    const encodedId = encodeURIComponent(reviewId);
    const encodedSource = encodeURIComponent(source);

    const backendResponse = await fetch(
      `${API_BASE}/api/pipelines/reviewiq/reviews/${encodedId}?source=${encodedSource}`,
      {
        headers: {
          'Content-Type': 'application/json',
        },
      }
    );

    if (backendResponse.ok) {
      return NextResponse.json(await backendResponse.json());
    }

    const backendMessage = await backendResponse.text();
    return NextResponse.json(
      { error: `Backend error: ${backendMessage}` },
      { status: backendResponse.status }
    );
  } catch (error) {
    console.error('Error fetching review:', error);
    return NextResponse.json(
      { error: 'Failed to fetch review' },
      { status: 500 }
    );
  }
}
|
||||||
42
web/app/api/pipelines/reviewiq/trends/route.ts
Normal file
42
web/app/api/pipelines/reviewiq/trends/route.ts
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
|
/**
 * GET /api/pipelines/reviewiq/trends
 *
 * Proxies the request to the backend ReviewIQ trends endpoint, passing the
 * incoming query string through unchanged. A backend error is relayed with
 * its status code and its `detail` field (or a generic message when the error
 * body has none or is not JSON). Any thrown error yields a 500.
 */
export async function GET(request: NextRequest) {
  try {
    // Forward query parameters
    const query = request.nextUrl.searchParams.toString();
    const suffix = query ? `?${query}` : '';

    const backendResponse = await fetch(
      `${API_BASE_URL}/api/pipelines/reviewiq/trends${suffix}`,
      {
        method: 'GET',
        headers: {
          'Content-Type': 'application/json',
        },
        cache: 'no-store',
      }
    );

    if (backendResponse.ok) {
      return NextResponse.json(await backendResponse.json());
    }

    // The error body may not be JSON; fall back to an empty object.
    const failure = await backendResponse.json().catch(() => ({}));
    return NextResponse.json(
      { detail: failure.detail || `Backend error: ${backendResponse.status}` },
      { status: backendResponse.status }
    );
  } catch (error) {
    console.error('ReviewIQ trends proxy error:', error);
    return NextResponse.json(
      { detail: 'Failed to fetch trends data' },
      { status: 500 }
    );
  }
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function GET(request: NextRequest) {
|
export async function GET(request: NextRequest) {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -1,16 +1,38 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
export async function POST(request: NextRequest) {
|
export async function POST(request: NextRequest) {
|
||||||
try {
|
try {
|
||||||
const body = await request.json();
|
const body = await request.json();
|
||||||
const { url, business_name, business_address, rating_snapshot, total_reviews_snapshot, scraper_version } = body;
|
const { url, business_name, business_address, rating_snapshot, total_reviews_snapshot, scraper_version, session_id, browser_fingerprint, geolocation } = body;
|
||||||
|
|
||||||
if (!url) {
|
if (!url) {
|
||||||
return NextResponse.json({ error: 'URL is required' }, { status: 400 });
|
return NextResponse.json({ error: 'URL is required' }, { status: 400 });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build metadata object
|
||||||
|
const metadata: Record<string, unknown> = {
|
||||||
|
business_name,
|
||||||
|
business_address,
|
||||||
|
rating_snapshot,
|
||||||
|
total_reviews_snapshot,
|
||||||
|
scraper_version, // Store in metadata for job tracking
|
||||||
|
};
|
||||||
|
|
||||||
|
// Include session_id for browser reuse (session handoff from validation)
|
||||||
|
if (session_id) {
|
||||||
|
metadata.session_id = session_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Include browser fingerprint if provided
|
||||||
|
if (browser_fingerprint) {
|
||||||
|
metadata.browser_fingerprint = browser_fingerprint;
|
||||||
|
}
|
||||||
|
if (geolocation) {
|
||||||
|
metadata.geolocation = geolocation;
|
||||||
|
}
|
||||||
|
|
||||||
// Call the containerized scraper API with business metadata and version
|
// Call the containerized scraper API with business metadata and version
|
||||||
const response = await fetch(`${API_BASE_URL}/scrape`, {
|
const response = await fetch(`${API_BASE_URL}/scrape`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
@@ -18,13 +40,8 @@ export async function POST(request: NextRequest) {
|
|||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
url,
|
url,
|
||||||
scraper_version, // Pass version to backend for routing
|
scraper_version, // Pass version to backend for routing
|
||||||
metadata: {
|
session_id, // Pass session_id for browser reuse
|
||||||
business_name,
|
metadata,
|
||||||
business_address,
|
|
||||||
rating_snapshot,
|
|
||||||
total_reviews_snapshot,
|
|
||||||
scraper_version, // Also store in metadata for job tracking
|
|
||||||
},
|
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
37
web/app/api/sessions/validate/route.ts
Normal file
37
web/app/api/sessions/validate/route.ts
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
|
/**
 * POST /api/sessions/validate
 *
 * Forwards the JSON request body to the backend `/sessions/validate` endpoint
 * and relays its JSON response.
 *
 * - 400 when the body is missing, is not valid JSON, or has no `url`.
 * - A backend error is relayed with its status code and its `detail` field
 *   (or a generic message when the error body has none or is not JSON).
 * - 500 when the backend cannot be reached or its success body cannot be parsed.
 */
export async function POST(request: NextRequest) {
  try {
    // A malformed or empty body is a client error, not a backend failure,
    // so treat it the same way as a body without `url`.
    const body = await request.json().catch(() => null);

    if (!body?.url) {
      return NextResponse.json({ error: 'URL is required' }, { status: 400 });
    }

    // Call the backend session validation endpoint
    const response = await fetch(`${API_BASE_URL}/sessions/validate`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(body),
    });

    if (!response.ok) {
      // Error bodies are not guaranteed to be JSON (for example an HTML 502
      // page); parse defensively so the upstream status is still relayed.
      const errorData = await response.json().catch(() => ({}));
      return NextResponse.json(
        { error: errorData?.detail || 'Failed to validate session' },
        { status: response.status }
      );
    }

    const data = await response.json();
    return NextResponse.json(data);
  } catch (error) {
    console.error('Session validation API error:', error);
    return NextResponse.json(
      { error: 'Failed to connect to scraper API' },
      { status: 500 }
    );
  }
}
|
||||||
435
web/app/categories/page.tsx
Normal file
435
web/app/categories/page.tsx
Normal file
@@ -0,0 +1,435 @@
|
|||||||
|
'use client';
|
||||||
|
|
||||||
|
import { useState, useEffect, useMemo, useCallback } from 'react';
|
||||||
|
import { Search, TreePine, Network, ChevronRight, ChevronDown, Folder, FolderOpen, Tag, Loader2 } from 'lucide-react';
|
||||||
|
import dynamic from 'next/dynamic';
|
||||||
|
import {
|
||||||
|
Category,
|
||||||
|
CategoryTreeNode,
|
||||||
|
buildCategoryTree,
|
||||||
|
toD3Tree,
|
||||||
|
getLevelName,
|
||||||
|
getLevelColor,
|
||||||
|
searchCategories,
|
||||||
|
getCategoryBreadcrumb,
|
||||||
|
} from '@/lib/categories';
|
||||||
|
|
||||||
|
// react-d3-tree is loaded on the client only (server-side rendering is
// disabled for it); a centred spinner is shown while the chunk loads.
const Tree = dynamic(
  async () => {
    const treeModule = await import('react-d3-tree');
    return treeModule.default;
  },
  {
    ssr: false,
    loading: () => (
      <div className="flex items-center justify-center h-full"><Loader2 className="animate-spin" /></div>
    ),
  }
);
|
||||||
|
|
||||||
|
// API base URL
|
||||||
|
const API_BASE = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8001';
|
||||||
|
|
||||||
|
/**
 * GBP category explorer page.
 *
 * Loads the category list from `${API_BASE}/categories` once on mount and shows it
 * either as a collapsible explorer tree or as a react-d3-tree diagram. A search box
 * filters the list (via `searchCategories`) and auto-expands the ancestors of every match.
 * In diagram mode a side panel shows details of the currently selected category and
 * lets the user copy its path to the clipboard.
 */
export default function CategoriesPage() {
  const [categories, setCategories] = useState<Category[]>([]);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);
  const [searchQuery, setSearchQuery] = useState('');
  const [viewMode, setViewMode] = useState<'explorer' | 'diagram'>('explorer');
  const [expandedPaths, setExpandedPaths] = useState<Set<string>>(new Set());
  const [selectedCategory, setSelectedCategory] = useState<Category | null>(null);
  const [stats, setStats] = useState({ total: 0, sectors: 0, types: 0, subs: 0, leaves: 0 });

  // Fetch categories from API (once, on mount)
  useEffect(() => {
    // Lets the cleanup cancel the request so an unmounted page never receives state updates.
    const controller = new AbortController();

    async function fetchCategories() {
      try {
        setLoading(true);
        const response = await fetch(`${API_BASE}/categories`, { signal: controller.signal });

        if (!response.ok) {
          throw new Error('Failed to fetch categories');
        }

        const data = await response.json();
        if (controller.signal.aborted) return;

        // Tolerate a missing or malformed `categories` field instead of crashing on `.filter`.
        const list: Category[] = Array.isArray(data.categories) ? data.categories : [];

        // Single pass: count levels 1-4 and collect level-1 paths (expanded by default).
        const levelCounts = [0, 0, 0, 0, 0];
        const level1Paths = new Set<string>();
        for (const cat of list) {
          if (cat.level >= 1 && cat.level <= 4) {
            levelCounts[cat.level] += 1;
          }
          if (cat.level === 1) {
            level1Paths.add(cat.path);
          }
        }

        setCategories(list);
        setStats({
          // Fall back to the list length when the API does not report a total.
          total: typeof data.total === 'number' ? data.total : list.length,
          sectors: levelCounts[1],
          types: levelCounts[2],
          subs: levelCounts[3],
          leaves: levelCounts[4],
        });
        setExpandedPaths(level1Paths);
      } catch (err) {
        // An aborted request is an expected outcome of unmounting, not an error to show.
        if (controller.signal.aborted) return;
        console.error('Error fetching categories:', err);
        setError(err instanceof Error ? err.message : 'Failed to load categories');
      } finally {
        if (!controller.signal.aborted) {
          setLoading(false);
        }
      }
    }

    fetchCategories();

    return () => controller.abort();
  }, []);

  // Filter categories based on search
  const filteredCategories = useMemo(() => {
    if (!searchQuery.trim()) return categories;
    return searchCategories(categories, searchQuery);
  }, [categories, searchQuery]);

  // Build tree structure
  const tree = useMemo(() => buildCategoryTree(filteredCategories), [filteredCategories]);

  // D3 tree data: a synthetic root wrapping the converted tree, or null when there is nothing to draw
  const d3TreeData = useMemo(() => {
    if (tree.length === 0) return null;
    return {
      name: 'GBP Categories',
      children: toD3Tree(tree),
    };
  }, [tree]);

  // Toggle expand/collapse of a single path (copies the Set so React sees a new value)
  const toggleExpand = useCallback((path: string) => {
    setExpandedPaths((prev) => {
      const next = new Set(prev);
      if (next.has(path)) {
        next.delete(path);
      } else {
        next.add(path);
      }
      return next;
    });
  }, []);

  // Expand all ancestors when searching, so every match is visible in the explorer
  useEffect(() => {
    if (searchQuery.trim()) {
      const pathsToExpand = new Set<string>();
      for (const cat of filteredCategories) {
        const parts = cat.path.split('.');
        // Every proper prefix of the dotted path is an ancestor of the match.
        for (let i = 1; i < parts.length; i++) {
          pathsToExpand.add(parts.slice(0, i).join('.'));
        }
      }
      setExpandedPaths(pathsToExpand);
    }
  }, [searchQuery, filteredCategories]);

  // Get breadcrumb for selected category
  const breadcrumb = useMemo(() => {
    if (!selectedCategory) return [];
    return getCategoryBreadcrumb(selectedCategory.path, categories);
  }, [selectedCategory, categories]);

  // Copy the selected path and report the real outcome. The Clipboard API is
  // asynchronous and unavailable on insecure origins, so both failure modes are caught.
  const copyPathToClipboard = async (path: string) => {
    try {
      await navigator.clipboard.writeText(path);
      alert('Path copied to clipboard!');
    } catch (err) {
      console.error('Failed to copy path to clipboard:', err);
      alert('Could not copy path to clipboard');
    }
  };

  // Render tree node (recursive)
  const renderTreeNode = (node: CategoryTreeNode, depth: number = 0) => {
    const isExpanded = expandedPaths.has(node.id);
    const hasChildren = node.children && node.children.length > 0;
    const isSelected = selectedCategory?.path === node.id;
    const level = node.data?.level || 1;

    return (
      <div key={node.id} className="select-none">
        <div
          className={`flex items-center gap-2 py-1.5 px-2 rounded cursor-pointer hover:bg-gray-100 dark:hover:bg-gray-800 ${
            isSelected ? 'bg-blue-50 dark:bg-blue-900/30 border-l-2 border-blue-500' : ''
          }`}
          style={{ paddingLeft: `${depth * 20 + 8}px` }}
          onClick={() => {
            setSelectedCategory(node.data || null);
            if (hasChildren) {
              toggleExpand(node.id);
            }
          }}
        >
          {/* Expand/Collapse Icon */}
          <span className="w-4 h-4 flex items-center justify-center">
            {hasChildren ? (
              isExpanded ? (
                <ChevronDown className="w-4 h-4 text-gray-500" />
              ) : (
                <ChevronRight className="w-4 h-4 text-gray-500" />
              )
            ) : (
              <span className="w-4" />
            )}
          </span>

          {/* Folder/Tag Icon */}
          {hasChildren ? (
            isExpanded ? (
              <FolderOpen className="w-4 h-4 text-yellow-500" />
            ) : (
              <Folder className="w-4 h-4 text-yellow-600" />
            )
          ) : (
            <Tag className="w-4 h-4 text-purple-500" />
          )}

          {/* Name */}
          <span className="flex-1 truncate text-sm">{node.name}</span>

          {/* Level Badge */}
          <span
            className={`text-xs px-1.5 py-0.5 rounded ${getLevelColor(level)} text-white`}
          >
            L{level}
          </span>

          {/* Count */}
          {node.data && node.data.category_count > 0 && (
            <span className="text-xs text-gray-400">
              ({node.data.category_count})
            </span>
          )}
        </div>

        {/* Children */}
        {hasChildren && isExpanded && (
          <div>
            {node.children!.map((child) => renderTreeNode(child, depth + 1))}
          </div>
        )}
      </div>
    );
  };

  if (loading) {
    return (
      <div className="flex items-center justify-center min-h-screen">
        <Loader2 className="w-8 h-8 animate-spin text-blue-500" />
        <span className="ml-2">Loading categories...</span>
      </div>
    );
  }

  if (error) {
    return (
      <div className="flex flex-col items-center justify-center min-h-screen">
        <p className="text-red-500 mb-4">{error}</p>
        <button
          onClick={() => window.location.reload()}
          className="px-4 py-2 bg-blue-500 text-white rounded hover:bg-blue-600"
        >
          Retry
        </button>
      </div>
    );
  }

  return (
    <div className="min-h-screen bg-gray-50 dark:bg-gray-900">
      {/* Header */}
      <header className="bg-white dark:bg-gray-800 shadow-sm border-b">
        <div className="max-w-7xl mx-auto px-4 py-4">
          <div className="flex items-center justify-between">
            <div>
              <h1 className="text-2xl font-bold text-gray-900 dark:text-white">
                GBP Category Explorer
              </h1>
              <p className="text-sm text-gray-500 mt-1">
                Browse {stats.total.toLocaleString()} Google Business Profile categories
              </p>
            </div>

            {/* Stats */}
            <div className="flex gap-4 text-sm">
              <div className="text-center">
                <div className="font-bold text-blue-600">{stats.sectors}</div>
                <div className="text-gray-500">Sectors</div>
              </div>
              <div className="text-center">
                <div className="font-bold text-green-600">{stats.types}</div>
                <div className="text-gray-500">Types</div>
              </div>
              <div className="text-center">
                <div className="font-bold text-yellow-600">{stats.subs}</div>
                <div className="text-gray-500">Sub-cats</div>
              </div>
              <div className="text-center">
                <div className="font-bold text-purple-600">{stats.leaves}</div>
                <div className="text-gray-500">Categories</div>
              </div>
            </div>
          </div>
        </div>
      </header>

      {/* Toolbar */}
      <div className="bg-white dark:bg-gray-800 border-b px-4 py-3">
        <div className="max-w-7xl mx-auto flex items-center gap-4">
          {/* Search */}
          <div className="relative flex-1 max-w-md">
            <Search className="absolute left-3 top-1/2 -translate-y-1/2 w-4 h-4 text-gray-400" />
            <input
              type="text"
              placeholder="Search categories..."
              value={searchQuery}
              onChange={(e) => setSearchQuery(e.target.value)}
              className="w-full pl-10 pr-4 py-2 border rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 dark:bg-gray-700 dark:border-gray-600"
            />
          </div>

          {/* View Toggle */}
          <div className="flex border rounded-lg overflow-hidden">
            <button
              onClick={() => setViewMode('explorer')}
              className={`px-4 py-2 flex items-center gap-2 ${
                viewMode === 'explorer'
                  ? 'bg-blue-500 text-white'
                  : 'bg-white dark:bg-gray-700 hover:bg-gray-50'
              }`}
            >
              <TreePine className="w-4 h-4" />
              Explorer
            </button>
            <button
              onClick={() => setViewMode('diagram')}
              className={`px-4 py-2 flex items-center gap-2 ${
                viewMode === 'diagram'
                  ? 'bg-blue-500 text-white'
                  : 'bg-white dark:bg-gray-700 hover:bg-gray-50'
              }`}
            >
              <Network className="w-4 h-4" />
              Diagram
            </button>
          </div>

          {/* Results count */}
          {searchQuery && (
            <span className="text-sm text-gray-500">
              {filteredCategories.length} results
            </span>
          )}
        </div>
      </div>

      {/* Main Content */}
      <div className="px-4 py-6">
        {viewMode === 'explorer' ? (
          /* Explorer View - Full Width */
          <div className="bg-white dark:bg-gray-800 rounded-lg shadow p-4">
            {tree.length > 0 ? (
              tree.map((node) => renderTreeNode(node))
            ) : (
              <div className="text-center text-gray-500 py-8">
                No categories found
              </div>
            )}
          </div>
        ) : (
          /* Diagram View with Detail Panel */
          <div className="flex gap-6 h-[calc(100vh-180px)]">
            <div className="flex-1 bg-white dark:bg-gray-800 rounded-lg shadow overflow-hidden">
              <div className="h-full w-full">
                {d3TreeData ? (
                  <Tree
                    data={d3TreeData}
                    orientation="vertical"
                    pathFunc="step"
                    translate={{ x: 400, y: 50 }}
                    separation={{ siblings: 1, nonSiblings: 2 }}
                    nodeSize={{ x: 200, y: 80 }}
                    renderCustomNodeElement={({ nodeDatum, toggleNode }) => (
                      <g onClick={toggleNode}>
                        <circle r={15} fill="#3b82f6" />
                        <text
                          fill="#1f2937"
                          strokeWidth="0"
                          x={20}
                          dy=".35em"
                          fontSize={12}
                          fontFamily="sans-serif"
                        >
                          {nodeDatum.name.length > 25
                            ? nodeDatum.name.slice(0, 25) + '...'
                            : nodeDatum.name}
                        </text>
                      </g>
                    )}
                  />
                ) : (
                  <div className="flex items-center justify-center h-full text-gray-500">
                    No data to display
                  </div>
                )}
              </div>
            </div>

            {/* Detail Panel - Only in Diagram Mode */}
            {selectedCategory && (
              <div className="w-80 bg-white dark:bg-gray-800 rounded-lg shadow p-4">
                <h3 className="font-bold text-lg mb-4">{selectedCategory.name}</h3>

                {/* Breadcrumb */}
                <div className="mb-4">
                  <span className="text-xs text-gray-500 uppercase">Path</span>
                  <div className="flex flex-wrap gap-1 mt-1">
                    {breadcrumb.map((cat, i) => (
                      <span key={cat.path} className="flex items-center">
                        <span
                          className={`text-xs px-2 py-1 rounded ${getLevelColor(cat.level)} text-white cursor-pointer hover:opacity-80`}
                          onClick={() => setSelectedCategory(cat)}
                        >
                          {cat.name}
                        </span>
                        {i < breadcrumb.length - 1 && (
                          <ChevronRight className="w-3 h-3 text-gray-400 mx-1" />
                        )}
                      </span>
                    ))}
                  </div>
                </div>

                {/* Details */}
                <div className="space-y-3 text-sm">
                  <div>
                    <span className="text-gray-500">Level:</span>
                    <span className="ml-2 font-medium">
                      {getLevelName(selectedCategory.level)} (L{selectedCategory.level})
                    </span>
                  </div>
                  <div>
                    <span className="text-gray-500">Path:</span>
                    <code className="ml-2 text-xs bg-gray-100 dark:bg-gray-700 px-2 py-1 rounded">
                      {selectedCategory.path}
                    </code>
                  </div>
                  <div>
                    <span className="text-gray-500">Children:</span>
                    <span className="ml-2 font-medium">
                      {selectedCategory.category_count}
                    </span>
                  </div>
                  <div>
                    <span className="text-gray-500">Slug:</span>
                    <code className="ml-2 text-xs bg-gray-100 dark:bg-gray-700 px-2 py-1 rounded">
                      {selectedCategory.slug}
                    </code>
                  </div>
                </div>

                {/* Use in search */}
                <div className="mt-6 pt-4 border-t">
                  <button
                    className="w-full py-2 px-4 bg-blue-500 text-white rounded hover:bg-blue-600 text-sm"
                    onClick={() => {
                      // Copy ltree path for use in queries
                      void copyPathToClipboard(selectedCategory.path);
                    }}
                  >
                    Copy Path for Query
                  </button>
                </div>
              </div>
            )}
          </div>
        )}
      </div>
    </div>
  );
}
|
||||||
@@ -29,11 +29,11 @@ body {
|
|||||||
@keyframes fade-in {
|
@keyframes fade-in {
|
||||||
from {
|
from {
|
||||||
opacity: 0;
|
opacity: 0;
|
||||||
transform: translateX(-50%) translateY(4px);
|
transform: translateY(4px);
|
||||||
}
|
}
|
||||||
to {
|
to {
|
||||||
opacity: 1;
|
opacity: 1;
|
||||||
transform: translateX(-50%) translateY(0);
|
transform: translateY(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,10 +32,12 @@ export default function RootLayout({
|
|||||||
<JobsProvider>
|
<JobsProvider>
|
||||||
<div className="h-screen w-screen overflow-hidden flex">
|
<div className="h-screen w-screen overflow-hidden flex">
|
||||||
<Sidebar />
|
<Sidebar />
|
||||||
<div className="flex-1 bg-gray-50 overflow-hidden">
|
<div className="flex-1 bg-gray-50 overflow-auto">
|
||||||
{children}
|
{children}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{/* Portal target for modals - outside overflow-hidden container */}
|
||||||
|
<div id="modal-root" />
|
||||||
</JobsProvider>
|
</JobsProvider>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
190
web/app/pipelines/[pipelineId]/analytics/page.tsx
Normal file
190
web/app/pipelines/[pipelineId]/analytics/page.tsx
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
'use client';
|
||||||
|
|
||||||
|
import { useEffect, useState } from 'react';
|
||||||
|
import { useParams, useSearchParams, useRouter } from 'next/navigation';
|
||||||
|
import Link from 'next/link';
|
||||||
|
import { ArrowLeft, Loader2, FileText, BarChart3 } from 'lucide-react';
|
||||||
|
import { DynamicDashboard } from '@/components/dashboard/DynamicDashboard';
|
||||||
|
import { ReviewIQDashboard } from '@/components/reviewiq';
|
||||||
|
import { getDashboardConfig } from '@/lib/pipeline-api';
|
||||||
|
import type { DashboardConfig } from '@/lib/pipeline-types';
|
||||||
|
|
||||||
|
// Lazy load Report tab
|
||||||
|
import dynamic from 'next/dynamic';
|
||||||
|
// The Report tab is code-split: its module is fetched on first render and a
// spinner placeholder is shown until the named `ReportTab` export is available.
const ReportTab = dynamic(
  async () => {
    const reportModule = await import('@/components/reviewiq/ReportTab');
    return reportModule.ReportTab;
  },
  {
    loading: () => (
      <div className="flex items-center justify-center min-h-[400px]">
        <Loader2 className="w-8 h-8 animate-spin text-blue-600" />
      </div>
    ),
  }
);
|
||||||
|
|
||||||
|
// Tabs of the ReviewIQ analytics view. The page reads the initial tab from the
// `view` query parameter and writes it back for every tab except 'report'.
type ReviewIQTab = 'report' | 'dashboard';
|
||||||
|
|
||||||
|
/**
 * Analytics page for a single pipeline (`/pipelines/[pipelineId]/analytics`).
 *
 * - For the `reviewiq` pipeline it renders the handcrafted Report / Dashboard tabs;
 *   the active tab is mirrored in the `view` query parameter.
 * - For every other pipeline it fetches a dashboard config via `getDashboardConfig`
 *   and renders `DynamicDashboard` with it.
 *
 * Optional `job_id` and `business_id` query parameters are forwarded to the dashboards.
 */
export default function PipelineAnalyticsPage() {
  const params = useParams();
  const searchParams = useSearchParams();
  const router = useRouter();

  const pipelineId = params.pipelineId as string;
  const jobId = searchParams.get('job_id') || undefined;
  const businessId = searchParams.get('business_id') || undefined;

  const [config, setConfig] = useState<DashboardConfig | null>(null);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  // Use the handcrafted ReviewIQ dashboard for the reviewiq pipeline
  const useReviewIQDashboard = pipelineId === 'reviewiq';

  // Tab state for ReviewIQ. The query parameter is validated rather than cast:
  // anything other than 'dashboard' (including unknown values) means the default 'report' tab.
  const tabFromUrl: ReviewIQTab = searchParams.get('view') === 'dashboard' ? 'dashboard' : 'report';
  const [activeTab, setActiveTab] = useState<ReviewIQTab>(tabFromUrl);

  // Keep the tab in sync when the URL changes without a click (browser back/forward).
  useEffect(() => {
    setActiveTab(tabFromUrl);
  }, [tabFromUrl]);

  // Update URL when tab changes
  const handleTabChange = (tab: ReviewIQTab) => {
    // Switch immediately; the URL update below follows asynchronously.
    setActiveTab(tab);

    const nextQuery = new URLSearchParams(searchParams.toString());
    if (tab === 'report') {
      // 'report' is the default tab, so it is represented by the absence of `view`.
      nextQuery.delete('view');
    } else {
      nextQuery.set('view', tab);
    }

    const basePath = `/pipelines/${pipelineId}/analytics`;
    const queryString = nextQuery.toString();
    // Avoid a dangling "?" when no query parameters remain.
    router.push(queryString ? `${basePath}?${queryString}` : basePath, { scroll: false });
  };

  useEffect(() => {
    // Skip config fetch for ReviewIQ - it uses its own optimized endpoint
    if (useReviewIQDashboard) {
      setLoading(false);
      return;
    }

    // Set by the cleanup so a response for a previous pipelineId (or an unmounted
    // page) cannot overwrite newer state.
    let cancelled = false;

    async function fetchConfig() {
      try {
        setLoading(true);
        // Clear any error left over from a previous pipeline's failed fetch.
        setError(null);
        const dashboardConfig = await getDashboardConfig(pipelineId);
        if (!cancelled) {
          setConfig(dashboardConfig);
        }
      } catch (err) {
        if (!cancelled) {
          setError(err instanceof Error ? err.message : 'Failed to load dashboard config');
        }
      } finally {
        if (!cancelled) {
          setLoading(false);
        }
      }
    }

    fetchConfig();

    return () => {
      cancelled = true;
    };
  }, [pipelineId, useReviewIQDashboard]);

  if (loading) {
    return (
      <div className="flex items-center justify-center min-h-[400px]">
        <Loader2 className="w-8 h-8 animate-spin text-blue-600" />
      </div>
    );
  }

  // Use handcrafted ReviewIQ Dashboard with tabs
  if (useReviewIQDashboard) {
    const tabs = [
      { id: 'report' as const, label: 'Report', icon: FileText },
      { id: 'dashboard' as const, label: 'Dashboard', icon: BarChart3 },
    ];

    return (
      <div className="h-full overflow-y-auto p-6">
        {/* Navigation breadcrumb */}
        <div className="mb-4">
          <Link
            href={`/pipelines/${pipelineId}`}
            className="inline-flex items-center text-sm text-gray-600 hover:text-gray-900"
          >
            <ArrowLeft className="w-4 h-4 mr-1" />
            Back to ReviewIQ Pipeline
          </Link>
        </div>

        {/* Job context indicator */}
        {jobId && (
          <div className="mb-4 bg-blue-50 border border-blue-200 rounded-lg p-3 text-sm text-blue-700">
            Showing results for job: <code className="bg-blue-100 px-1 rounded">{jobId}</code>
          </div>
        )}

        {/* Tab Navigation */}
        <div className="mb-6 border-b border-gray-200">
          <nav className="flex gap-2" aria-label="Tabs">
            {tabs.map((tab) => {
              const Icon = tab.icon;
              const isActive = activeTab === tab.id;
              return (
                <button
                  key={tab.id}
                  onClick={() => handleTabChange(tab.id)}
                  className={`
                    relative px-4 py-2.5 flex items-center gap-2 text-sm font-medium transition-colors
                    ${isActive
                      ? 'text-blue-600'
                      : 'text-gray-500 hover:text-gray-700'
                    }
                  `}
                >
                  <Icon className={`w-4 h-4 ${isActive ? 'text-blue-600' : 'text-gray-400'}`} />
                  <span>{tab.label}</span>
                  {/* Active indicator bar */}
                  <span
                    className={`absolute bottom-0 left-0 right-0 h-0.5 bg-blue-600 transition-opacity ${isActive ? 'opacity-100' : 'opacity-0'}`}
                  />
                </button>
              );
            })}
          </nav>
        </div>

        {/* Tab Content */}
        {activeTab === 'report' && (
          <ReportTab jobId={jobId} businessId={businessId} />
        )}
        {activeTab === 'dashboard' && (
          <ReviewIQDashboard jobId={jobId} businessId={businessId} />
        )}
      </div>
    );
  }

  // Fallback for other pipelines using dynamic dashboard
  if (error || !config) {
    return (
      <div className="p-6">
        <div className="bg-red-50 border border-red-200 rounded-lg p-4 text-red-700">
          {error || 'Failed to load dashboard configuration'}
        </div>
      </div>
    );
  }

  return (
    <div className="h-full overflow-y-auto p-6">
      {/* Navigation breadcrumb */}
      <div className="mb-4">
        <Link
          href={`/pipelines/${pipelineId}`}
          className="inline-flex items-center text-sm text-gray-600 hover:text-gray-900"
        >
          <ArrowLeft className="w-4 h-4 mr-1" />
          Back to {pipelineId} Pipeline
        </Link>
      </div>

      {/* Job context indicator */}
      {jobId && (
        <div className="mb-4 bg-blue-50 border border-blue-200 rounded-lg p-3 text-sm text-blue-700">
          Showing results for job: <code className="bg-blue-100 px-1 rounded">{jobId}</code>
        </div>
      )}

      {/* Dynamic Dashboard for other pipelines */}
      <DynamicDashboard
        pipelineId={pipelineId}
        config={config}
        businessId={businessId}
        jobId={jobId}
      />
    </div>
  );
}
|
||||||
@@ -16,6 +16,7 @@ import {
|
|||||||
ExternalLink,
|
ExternalLink,
|
||||||
Timer,
|
Timer,
|
||||||
ArrowRightLeft,
|
ArrowRightLeft,
|
||||||
|
BarChart3,
|
||||||
} from 'lucide-react';
|
} from 'lucide-react';
|
||||||
import type { ExecutionStatus, StageMetrics } from '@/lib/pipeline-types';
|
import type { ExecutionStatus, StageMetrics } from '@/lib/pipeline-types';
|
||||||
import { getExecution } from '@/lib/pipeline-api';
|
import { getExecution } from '@/lib/pipeline-api';
|
||||||
@@ -432,6 +433,22 @@ export default function ExecutionDetailPage() {
|
|||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{/* View Results Dashboard Button */}
|
||||||
|
{execution?.status === 'completed' && execution?.job_id && (
|
||||||
|
<div className="mt-6 pt-4 border-t border-gray-200">
|
||||||
|
<Link
|
||||||
|
href={`/pipelines/${pipelineId}/analytics?job_id=${execution.job_id}`}
|
||||||
|
className="inline-flex items-center px-4 py-2.5 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors font-medium"
|
||||||
|
>
|
||||||
|
<BarChart3 className="w-5 h-5 mr-2" />
|
||||||
|
View Results Dashboard
|
||||||
|
</Link>
|
||||||
|
<p className="mt-2 text-sm text-gray-500">
|
||||||
|
See classification results, sentiment analysis, and identified issues
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Error Message */}
|
{/* Error Message */}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user