Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -0,0 +1,87 @@
#!/usr/bin/env python3
"""
Backfill missing business_category for existing jobs.
Uses validation_only mode to quickly capture business info without re-scraping reviews.
"""
import asyncio
import asyncpg
import os
import sys
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
async def backfill_categories():
"""Fetch and update missing business categories."""
# Connect to database
conn = await asyncpg.connect(DATABASE_URL)
try:
# Get jobs missing business_category
rows = await conn.fetch("""
SELECT job_id, url, business_name
FROM jobs
WHERE business_category IS NULL
AND status = 'completed'
ORDER BY created_at DESC
""")
print(f"Found {len(rows)} jobs missing business_category\n")
updated = 0
failed = 0
for row in rows:
job_id = row['job_id']
url = row['url']
name = row['business_name'] or 'Unknown'
print(f"Processing: {name[:50]}...")
try:
# Run validation-only scrape (fast - just captures business info)
result = await asyncio.to_thread(
fast_scrape_reviews,
url=url,
headless=True,
validation_only=True
)
# Extract category from validation_info
validation_info = result.get('validation_info', {})
category = validation_info.get('category')
if category:
# Update the database
await conn.execute("""
UPDATE jobs
SET business_category = $2,
updated_at = NOW()
WHERE job_id = $1
""", job_id, category)
print(f" ✓ Category: {category}")
updated += 1
else:
print(f" ✗ No category found")
failed += 1
except Exception as e:
print(f" ✗ Error: {e}")
failed += 1
print(f"\n{'='*50}")
print(f"Done! Updated: {updated}, Failed: {failed}")
finally:
await conn.close()
if __name__ == '__main__':
asyncio.run(backfill_categories())

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Register the Reputation Pipeline in the pipeline registry.
Usage:
python scripts/register_reputation_pipeline.py
"""
import asyncio
import os
import sys
import asyncpg
# Database URL
DB_URL = os.environ.get(
"DATABASE_URL",
"postgresql://scraper:scraper123@localhost:5437/scraper"
)
async def register_pipeline():
"""Register the Reputation Pipeline in the database."""
print(f"Connecting to database...")
conn = await asyncpg.connect(DB_URL)
try:
# Ensure the registry table exists
await conn.execute("""
CREATE TABLE IF NOT EXISTS pipeline.registry (
pipeline_id VARCHAR(50) PRIMARY KEY,
name VARCHAR(255) NOT NULL,
description TEXT,
version VARCHAR(50) NOT NULL,
module_path VARCHAR(500) NOT NULL,
stages TEXT[] NOT NULL DEFAULT '{}',
input_type VARCHAR(100),
config JSONB,
is_enabled BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
)
""")
# Register the Reputation Pipeline
result = await conn.execute("""
INSERT INTO pipeline.registry (
pipeline_id,
name,
description,
version,
module_path,
stages,
input_type,
is_enabled,
updated_at
)
VALUES (
'reputation',
'Reputation Analytics Pipeline',
'Primitives-based classification and reputation scoring. Generates business-facing analytics reports with domain breakdown, key drivers, and actionable insights.',
'2.0.0',
'reviewiq_pipeline.reputation_pipeline:ReputationPipeline',
ARRAY['classify', 'report'],
'BusinessInput',
TRUE,
NOW()
)
ON CONFLICT (pipeline_id) DO UPDATE SET
name = EXCLUDED.name,
description = EXCLUDED.description,
version = EXCLUDED.version,
module_path = EXCLUDED.module_path,
stages = EXCLUDED.stages,
input_type = EXCLUDED.input_type,
is_enabled = EXCLUDED.is_enabled,
updated_at = NOW()
""")
print(f"✓ Registered 'reputation' pipeline")
# Also ensure the ReviewIQ pipeline is registered
result = await conn.execute("""
INSERT INTO pipeline.registry (
pipeline_id,
name,
description,
version,
module_path,
stages,
input_type,
is_enabled,
updated_at
)
VALUES (
'reviewiq',
'ReviewIQ Classification Pipeline',
'Classifies reviews using URT taxonomy, detects issues, and aggregates metrics for dashboards.',
'1.0.0',
'reviewiq_pipeline.pipeline:ReviewIQPipeline',
ARRAY['normalize', 'classify', 'route', 'aggregate', 'synthesize'],
'ScraperV1Output',
TRUE,
NOW()
)
ON CONFLICT (pipeline_id) DO UPDATE SET
name = EXCLUDED.name,
description = EXCLUDED.description,
version = EXCLUDED.version,
module_path = EXCLUDED.module_path,
stages = EXCLUDED.stages,
input_type = EXCLUDED.input_type,
is_enabled = EXCLUDED.is_enabled,
updated_at = NOW()
""")
print(f"✓ Registered 'reviewiq' pipeline")
# List all registered pipelines
rows = await conn.fetch("""
SELECT pipeline_id, name, version, is_enabled, stages
FROM pipeline.registry
ORDER BY name
""")
print(f"\n📋 Registered Pipelines:")
print("-" * 80)
for row in rows:
status = "✓ enabled" if row["is_enabled"] else "✗ disabled"
stages = ", ".join(row["stages"]) if row["stages"] else "none"
print(f" {row['pipeline_id']:20} v{row['version']:10} {status}")
print(f"{row['name']}")
print(f" → Stages: {stages}")
print()
finally:
await conn.close()
if __name__ == "__main__":
asyncio.run(register_pipeline())

View File

@@ -0,0 +1,414 @@
#!/usr/bin/env python3
"""
Resolve GBP taxonomy categories for all jobs.
Uses exact match, LLM match, or hierarchical classification.
Usage: source .env && python scripts/resolve_job_categories.py
"""
import asyncio
import os
import sys
from dataclasses import dataclass
from typing import Optional
import asyncpg
from openai import OpenAI
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
@dataclass
class ResolvedCategory:
"""Result of category resolution."""
category_id: int
path: str
name: str
level: int
method: str # 'exact', 'llm', 'hierarchical'
confidence: float
class SimpleLLM:
"""Simple OpenAI wrapper for category resolution."""
def __init__(self):
self.client = OpenAI()
async def complete(self, prompt: str, max_tokens: int = 50, temperature: float = 0) -> str:
"""Get completion from OpenAI."""
response = await asyncio.to_thread(
self.client.chat.completions.create,
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
max_tokens=max_tokens,
temperature=temperature
)
return response.choices[0].message.content.strip()
class CategoryResolver:
"""Resolves business categories to GBP taxonomy nodes."""
def __init__(self, pool: asyncpg.Pool, llm: SimpleLLM):
self.pool = pool
self.llm = llm
self._level1_cache: list[dict] = []
self._level2_cache: dict[str, list[dict]] = {}
self._level3_cache: dict[str, list[dict]] = {}
async def resolve(
self,
google_category: Optional[str] = None,
business_name: Optional[str] = None,
business_address: Optional[str] = None
) -> Optional[ResolvedCategory]:
"""Resolve to the deepest taxonomy node."""
# Phase 1: Exact match
if google_category:
result = await self._exact_match(google_category)
if result:
return result
# Phase 2: LLM match
result = await self._llm_match(google_category)
if result:
return result
# Phase 3: Hierarchical classification
if business_name:
result = await self._hierarchical_classify(
business_name=business_name,
business_address=business_address,
google_category=google_category
)
if result:
return result
return None
async def _exact_match(self, google_category: str) -> Optional[ResolvedCategory]:
"""Try exact match against taxonomy."""
async with self.pool.acquire() as conn:
# Exact match (case-insensitive)
row = await conn.fetchrow("""
SELECT id, name, path::text as path, level
FROM gbp_categories
WHERE LOWER(name) = LOWER($1) AND level = 3
""", google_category)
if row:
return ResolvedCategory(
category_id=row['id'],
path=row['path'],
name=row['name'],
level=row['level'],
method='exact',
confidence=1.0
)
# Trigram similarity match (handles typos, slight variations)
# Threshold 0.7 = high confidence only, else fall through to LLM
row = await conn.fetchrow("""
SELECT id, name, path::text as path, level,
similarity(LOWER(name), LOWER($1)) as sim
FROM gbp_categories
WHERE level = 3 AND similarity(LOWER(name), LOWER($1)) > 0.7
ORDER BY sim DESC
LIMIT 1
""", google_category)
if row:
return ResolvedCategory(
category_id=row['id'],
path=row['path'],
name=row['name'],
level=row['level'],
method='fuzzy',
confidence=float(row['sim'])
)
return None
async def _llm_match(self, google_category: str) -> Optional[ResolvedCategory]:
"""Use LLM to match Google category to taxonomy."""
# Synonym expansion for common variations
SYNONYMS = {
'shop': ['store', 'shop', 'outlet'],
'store': ['store', 'shop', 'outlet'],
'house': ['house', 'home'],
'home': ['house', 'home'],
'office': ['office', 'clinic', 'center'],
'clinic': ['clinic', 'office', 'center'],
'center': ['center', 'centre'],
'centre': ['center', 'centre'],
'repair': ['repair', 'service', 'maintenance'],
}
async with self.pool.acquire() as conn:
# Get candidates using multiple strategies:
# 1. Word matches with synonym expansion
# 2. Trigram similarity
words = google_category.lower().split()
expanded_words = set()
for w in words:
if len(w) > 2:
expanded_words.add(w)
if w in SYNONYMS:
expanded_words.update(SYNONYMS[w])
word_conditions = " OR ".join([f"LOWER(name) LIKE '%{w}%'" for w in expanded_words])
primary_word = google_category.lower().split()[0] # First word is usually most important
# Order by: starts with primary word, then by similarity
candidates = await conn.fetch(f"""
SELECT DISTINCT id, name, path::text as path, level,
CASE WHEN LOWER(name) LIKE $2 THEN 1 ELSE 0 END as starts_with,
similarity(LOWER(name), LOWER($1)) as sim
FROM gbp_categories
WHERE level = 3 AND (
({word_conditions if word_conditions else 'FALSE'})
OR similarity(LOWER(name), LOWER($1)) > 0.3
)
ORDER BY starts_with DESC, sim DESC
LIMIT 20
""", google_category, f"{primary_word}%")
if not candidates:
return None
candidate_list = "\n".join([f"- {c['name']}" for c in candidates])
prompt = f"""Match business category "{google_category}" to the closest option.
Synonyms: shop=store, house=cafe/home, office=clinic/center
Options:
{candidate_list}
Reply with ONLY the exact category name from the list."""
response = await self.llm.complete(prompt, max_tokens=30)
selected = response.strip().strip('"').strip("'")
if selected.upper() == "NONE":
return None
for c in candidates:
if c['name'].lower() == selected.lower():
return ResolvedCategory(
category_id=c['id'],
path=c['path'],
name=c['name'],
level=c['level'],
method='llm',
confidence=0.85
)
# Fuzzy match selected name to candidates
for c in candidates:
if selected.lower() in c['name'].lower() or c['name'].lower() in selected.lower():
return ResolvedCategory(
category_id=c['id'],
path=c['path'],
name=c['name'],
level=c['level'],
method='llm',
confidence=0.75
)
return None
async def _hierarchical_classify(
self,
business_name: str,
business_address: Optional[str] = None,
google_category: Optional[str] = None
) -> Optional[ResolvedCategory]:
"""Walk down taxonomy tree using LLM."""
context = f"Business: {business_name}"
if business_address:
context += f"\nAddress: {business_address}"
if google_category:
context += f"\nHint: {google_category}"
# Level 1
level1 = await self._get_categories(1)
sector = await self._llm_select(context, level1, "sector")
if not sector:
return None
# Level 2
level2 = await self._get_categories(2, sector['path'])
biz_type = await self._llm_select(context, level2, "business type", sector['name'])
if not biz_type:
return None
# Level 3
level3 = await self._get_categories(3, biz_type['path'])
specific = await self._llm_select(context, level3, "specific category", biz_type['name'])
if not specific:
return None
return ResolvedCategory(
category_id=specific['id'],
path=specific['path'],
name=specific['name'],
level=specific['level'],
method='hierarchical',
confidence=0.7
)
async def _get_categories(self, level: int, parent_path: str = None) -> list[dict]:
"""Get categories at level, optionally under parent."""
async with self.pool.acquire() as conn:
if parent_path:
rows = await conn.fetch("""
SELECT id, name, path::text as path, level
FROM gbp_categories
WHERE level = $1 AND path <@ $2::ltree
ORDER BY name
""", level, parent_path)
else:
rows = await conn.fetch("""
SELECT id, name, path::text as path, level
FROM gbp_categories
WHERE level = $1
ORDER BY name
""", level)
return [dict(r) for r in rows]
async def _llm_select(
self,
context: str,
categories: list[dict],
level_name: str,
parent: str = None
) -> Optional[dict]:
"""Ask LLM to select best category."""
if not categories:
return None
if len(categories) == 1:
return categories[0]
cat_list = "\n".join([f"- {c['name']}" for c in categories])
parent_ctx = f" within {parent}" if parent else ""
prompt = f"""{context}
Select the most appropriate {level_name}{parent_ctx}.
Options:
{cat_list}
Respond with ONLY the exact name from the list."""
response = await self.llm.complete(prompt)
selected = response.strip().strip('"').strip("'")
for c in categories:
if c['name'].lower() == selected.lower():
return c
# Fuzzy fallback
for c in categories:
if selected.lower() in c['name'].lower():
return c
return categories[0] if categories else None
async def main():
# Connect to database
pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=5)
# Initialize LLM client
llm = SimpleLLM()
try:
# Get jobs needing category resolution
async with pool.acquire() as conn:
jobs = await conn.fetch("""
SELECT job_id, business_name, business_category, business_address
FROM jobs
WHERE status = 'completed'
AND gbp_category_path IS NULL
ORDER BY created_at DESC
""")
print(f"Found {len(jobs)} jobs needing category resolution\n")
resolver = CategoryResolver(pool, llm)
resolved = 0
failed = 0
for job in jobs:
job_id = str(job['job_id'])
name = job['business_name'] or 'Unknown'
google_cat = job['business_category']
address = job['business_address']
print(f"Processing: {name[:50]}...")
if google_cat:
print(f" Google category: {google_cat}")
try:
result = await resolver.resolve(
google_category=google_cat,
business_name=name,
business_address=address
)
if result:
# Determine source: google if they had a category, inferred if we used business name
category_source = 'google' if google_cat else 'inferred'
# Save to database
async with pool.acquire() as conn:
await conn.execute("""
UPDATE jobs
SET gbp_category_id = $2,
gbp_category_path = $3::ltree,
category_resolution_method = $4,
business_category_source = $5,
updated_at = NOW()
WHERE job_id = $1::uuid
""", job_id, result.category_id, result.path, result.method, category_source)
print(f" ✓ Resolved: {result.path} ({result.method}, source={category_source})")
resolved += 1
else:
print(f" ✗ Could not resolve")
failed += 1
except Exception as e:
print(f" ✗ Error: {e}")
failed += 1
print(f"\n{'='*50}")
print(f"Done! Resolved: {resolved}, Failed: {failed}")
# Show results
async with pool.acquire() as conn:
results = await conn.fetch("""
SELECT business_name, business_category,
gbp_category_path::text as resolved_path,
category_resolution_method,
business_category_source
FROM jobs
WHERE status = 'completed' AND gbp_category_path IS NOT NULL
ORDER BY created_at DESC
LIMIT 10
""")
print(f"\n{'='*50}")
print("Recent resolved categories:")
for r in results:
source = r['business_category_source'] or '-'
print(f" {r['business_name'][:30]:30} | {r['business_category'] or '-':20} | {source:8} -> {r['resolved_path']} ({r['category_resolution_method']})")
finally:
await pool.close()
if __name__ == '__main__':
asyncio.run(main())