Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/scripts/backfill_business_category.py
+++ b/scripts/backfill_business_category.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""
+Backfill missing business_category for existing jobs.
+Uses validation_only mode to quickly capture business info without re-scraping reviews.
+"""
+import asyncio
+import asyncpg
+import os
+import sys
+
+# Add project root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews
+
+DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
+
+
+async def backfill_categories():
+    """Fetch and update missing business categories."""
+
+    # Connect to database
+    conn = await asyncpg.connect(DATABASE_URL)
+
+    try:
+        # Get jobs missing business_category
+        rows = await conn.fetch("""
+            SELECT job_id, url, business_name
+            FROM jobs
+            WHERE business_category IS NULL
+              AND status = 'completed'
+            ORDER BY created_at DESC
+        """)
+
+        print(f"Found {len(rows)} jobs missing business_category\n")
+
+        updated = 0
+        failed = 0
+
+        for row in rows:
+            job_id = row['job_id']
+            url = row['url']
+            name = row['business_name'] or 'Unknown'
+
+            print(f"Processing: {name[:50]}...")
+
+            try:
+                # Run validation-only scrape (fast - just captures business info)
+                result = await asyncio.to_thread(
+                    fast_scrape_reviews,
+                    url=url,
+                    headless=True,
+                    validation_only=True
+                )
+
+                # Extract category from validation_info
+                validation_info = result.get('validation_info', {})
+                category = validation_info.get('category')
+
+                if category:
+                    # Update the database
+                    await conn.execute("""
+                        UPDATE jobs
+                        SET business_category = $2,
+                            updated_at = NOW()
+                        WHERE job_id = $1
+                    """, job_id, category)
+
+                    print(f"  ✓ Category: {category}")
+                    updated += 1
+                else:
+                    print(f"  ✗ No category found")
+                    failed += 1
+
+            except Exception as e:
+                print(f"  ✗ Error: {e}")
+                failed += 1
+
+        print(f"\n{'='*50}")
+        print(f"Done! Updated: {updated}, Failed: {failed}")
+
+    finally:
+        await conn.close()
+
+
+if __name__ == '__main__':
+    asyncio.run(backfill_categories())
--- a/scripts/register_reputation_pipeline.py
+++ b/scripts/register_reputation_pipeline.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+Register the Reputation Pipeline in the pipeline registry.
+
+Usage:
+    python scripts/register_reputation_pipeline.py
+"""
+
+import asyncio
+import os
+import sys
+
+import asyncpg
+
+# Database URL
+DB_URL = os.environ.get(
+    "DATABASE_URL",
+    "postgresql://scraper:scraper123@localhost:5437/scraper"
+)
+
+
+async def register_pipeline():
+    """Register the Reputation Pipeline in the database."""
+    print(f"Connecting to database...")
+
+    conn = await asyncpg.connect(DB_URL)
+
+    try:
+        # Ensure the registry table exists
+        await conn.execute("""
+            CREATE TABLE IF NOT EXISTS pipeline.registry (
+                pipeline_id VARCHAR(50) PRIMARY KEY,
+                name VARCHAR(255) NOT NULL,
+                description TEXT,
+                version VARCHAR(50) NOT NULL,
+                module_path VARCHAR(500) NOT NULL,
+                stages TEXT[] NOT NULL DEFAULT '{}',
+                input_type VARCHAR(100),
+                config JSONB,
+                is_enabled BOOLEAN NOT NULL DEFAULT TRUE,
+                created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+                updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+            )
+        """)
+
+        # Register the Reputation Pipeline
+        result = await conn.execute("""
+            INSERT INTO pipeline.registry (
+                pipeline_id,
+                name,
+                description,
+                version,
+                module_path,
+                stages,
+                input_type,
+                is_enabled,
+                updated_at
+            )
+            VALUES (
+                'reputation',
+                'Reputation Analytics Pipeline',
+                'Primitives-based classification and reputation scoring. Generates business-facing analytics reports with domain breakdown, key drivers, and actionable insights.',
+                '2.0.0',
+                'reviewiq_pipeline.reputation_pipeline:ReputationPipeline',
+                ARRAY['classify', 'report'],
+                'BusinessInput',
+                TRUE,
+                NOW()
+            )
+            ON CONFLICT (pipeline_id) DO UPDATE SET
+                name = EXCLUDED.name,
+                description = EXCLUDED.description,
+                version = EXCLUDED.version,
+                module_path = EXCLUDED.module_path,
+                stages = EXCLUDED.stages,
+                input_type = EXCLUDED.input_type,
+                is_enabled = EXCLUDED.is_enabled,
+                updated_at = NOW()
+        """)
+
+        print(f"✓ Registered 'reputation' pipeline")
+
+        # Also ensure the ReviewIQ pipeline is registered
+        result = await conn.execute("""
+            INSERT INTO pipeline.registry (
+                pipeline_id,
+                name,
+                description,
+                version,
+                module_path,
+                stages,
+                input_type,
+                is_enabled,
+                updated_at
+            )
+            VALUES (
+                'reviewiq',
+                'ReviewIQ Classification Pipeline',
+                'Classifies reviews using URT taxonomy, detects issues, and aggregates metrics for dashboards.',
+                '1.0.0',
+                'reviewiq_pipeline.pipeline:ReviewIQPipeline',
+                ARRAY['normalize', 'classify', 'route', 'aggregate', 'synthesize'],
+                'ScraperV1Output',
+                TRUE,
+                NOW()
+            )
+            ON CONFLICT (pipeline_id) DO UPDATE SET
+                name = EXCLUDED.name,
+                description = EXCLUDED.description,
+                version = EXCLUDED.version,
+                module_path = EXCLUDED.module_path,
+                stages = EXCLUDED.stages,
+                input_type = EXCLUDED.input_type,
+                is_enabled = EXCLUDED.is_enabled,
+                updated_at = NOW()
+        """)
+
+        print(f"✓ Registered 'reviewiq' pipeline")
+
+        # List all registered pipelines
+        rows = await conn.fetch("""
+            SELECT pipeline_id, name, version, is_enabled, stages
+            FROM pipeline.registry
+            ORDER BY name
+        """)
+
+        print(f"\n📋 Registered Pipelines:")
+        print("-" * 80)
+        for row in rows:
+            status = "✓ enabled" if row["is_enabled"] else "✗ disabled"
+            stages = ", ".join(row["stages"]) if row["stages"] else "none"
+            print(f"  {row['pipeline_id']:20} v{row['version']:10} {status}")
+            print(f"    → {row['name']}")
+            print(f"    → Stages: {stages}")
+            print()
+
+    finally:
+        await conn.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(register_pipeline())
--- a/scripts/resolve_job_categories.py
+++ b/scripts/resolve_job_categories.py
@@ -0,0 +1,414 @@
+#!/usr/bin/env python3
+"""
+Resolve GBP taxonomy categories for all jobs.
+Uses exact match, LLM match, or hierarchical classification.
+
+Usage: source .env && python scripts/resolve_job_categories.py
+"""
+import asyncio
+import os
+import sys
+from dataclasses import dataclass
+from typing import Optional
+
+import asyncpg
+from openai import OpenAI
+
+DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
+
+
+@dataclass
+class ResolvedCategory:
+    """Result of category resolution.
+
+    Identifies the ``gbp_categories`` row a business was matched to and
+    records how the match was obtained.
+    """
+    category_id: int  # ``id`` of the matched gbp_categories row
+    path: str  # row's ``path`` column rendered as text
+    name: str  # row's ``name`` column
+    level: int  # row's ``level`` column
+    method: str  # 'exact', 'fuzzy', 'llm', 'hierarchical'
+    confidence: float  # 1.0 exact; trigram similarity for fuzzy; 0.85/0.75 llm; 0.7 hierarchical
+
+
+class SimpleLLM:
+    """Simple OpenAI wrapper for category resolution."""
+
+    def __init__(self):
+        self.client = OpenAI()
+
+    async def complete(self, prompt: str, max_tokens: int = 50, temperature: float = 0) -> str:
+        """Get completion from OpenAI."""
+        response = await asyncio.to_thread(
+            self.client.chat.completions.create,
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        return response.choices[0].message.content.strip()
+
+
+class CategoryResolver:
+    """Resolves business categories to GBP taxonomy nodes."""
+
+    def __init__(self, pool: asyncpg.Pool, llm: SimpleLLM):
+        self.pool = pool
+        self.llm = llm
+        self._level1_cache: list[dict] = []
+        self._level2_cache: dict[str, list[dict]] = {}
+        self._level3_cache: dict[str, list[dict]] = {}
+
+    async def resolve(
+        self,
+        google_category: Optional[str] = None,
+        business_name: Optional[str] = None,
+        business_address: Optional[str] = None
+    ) -> Optional[ResolvedCategory]:
+        """Resolve to the deepest taxonomy node."""
+
+        # Phase 1: Exact match
+        if google_category:
+            result = await self._exact_match(google_category)
+            if result:
+                return result
+
+            # Phase 2: LLM match
+            result = await self._llm_match(google_category)
+            if result:
+                return result
+
+        # Phase 3: Hierarchical classification
+        if business_name:
+            result = await self._hierarchical_classify(
+                business_name=business_name,
+                business_address=business_address,
+                google_category=google_category
+            )
+            if result:
+                return result
+
+        return None
+
+    async def _exact_match(self, google_category: str) -> Optional[ResolvedCategory]:
+        """Try exact match against taxonomy."""
+        async with self.pool.acquire() as conn:
+            # Exact match (case-insensitive)
+            row = await conn.fetchrow("""
+                SELECT id, name, path::text as path, level
+                FROM gbp_categories
+                WHERE LOWER(name) = LOWER($1) AND level = 3
+            """, google_category)
+
+            if row:
+                return ResolvedCategory(
+                    category_id=row['id'],
+                    path=row['path'],
+                    name=row['name'],
+                    level=row['level'],
+                    method='exact',
+                    confidence=1.0
+                )
+
+            # Trigram similarity match (handles typos, slight variations)
+            # Threshold 0.7 = high confidence only, else fall through to LLM
+            row = await conn.fetchrow("""
+                SELECT id, name, path::text as path, level,
+                       similarity(LOWER(name), LOWER($1)) as sim
+                FROM gbp_categories
+                WHERE level = 3 AND similarity(LOWER(name), LOWER($1)) > 0.7
+                ORDER BY sim DESC
+                LIMIT 1
+            """, google_category)
+
+            if row:
+                return ResolvedCategory(
+                    category_id=row['id'],
+                    path=row['path'],
+                    name=row['name'],
+                    level=row['level'],
+                    method='fuzzy',
+                    confidence=float(row['sim'])
+                )
+
+        return None
+
+    async def _llm_match(self, google_category: str) -> Optional[ResolvedCategory]:
+        """Use LLM to match Google category to taxonomy."""
+        # Synonym expansion for common variations
+        SYNONYMS = {
+            'shop': ['store', 'shop', 'outlet'],
+            'store': ['store', 'shop', 'outlet'],
+            'house': ['house', 'home'],
+            'home': ['house', 'home'],
+            'office': ['office', 'clinic', 'center'],
+            'clinic': ['clinic', 'office', 'center'],
+            'center': ['center', 'centre'],
+            'centre': ['center', 'centre'],
+            'repair': ['repair', 'service', 'maintenance'],
+        }
+
+        async with self.pool.acquire() as conn:
+            # Get candidates using multiple strategies:
+            # 1. Word matches with synonym expansion
+            # 2. Trigram similarity
+            words = google_category.lower().split()
+            expanded_words = set()
+            for w in words:
+                if len(w) > 2:
+                    expanded_words.add(w)
+                    if w in SYNONYMS:
+                        expanded_words.update(SYNONYMS[w])
+
+            word_conditions = " OR ".join([f"LOWER(name) LIKE '%{w}%'" for w in expanded_words])
+            primary_word = google_category.lower().split()[0]  # First word is usually most important
+
+            # Order by: starts with primary word, then by similarity
+            candidates = await conn.fetch(f"""
+                SELECT DISTINCT id, name, path::text as path, level,
+                       CASE WHEN LOWER(name) LIKE $2 THEN 1 ELSE 0 END as starts_with,
+                       similarity(LOWER(name), LOWER($1)) as sim
+                FROM gbp_categories
+                WHERE level = 3 AND (
+                    ({word_conditions if word_conditions else 'FALSE'})
+                    OR similarity(LOWER(name), LOWER($1)) > 0.3
+                )
+                ORDER BY starts_with DESC, sim DESC
+                LIMIT 20
+            """, google_category, f"{primary_word}%")
+
+            if not candidates:
+                return None
+
+        candidate_list = "\n".join([f"- {c['name']}" for c in candidates])
+
+        prompt = f"""Match business category "{google_category}" to the closest option.
+Synonyms: shop=store, house=cafe/home, office=clinic/center
+
+Options:
+{candidate_list}
+
+Reply with ONLY the exact category name from the list."""
+
+        response = await self.llm.complete(prompt, max_tokens=30)
+        selected = response.strip().strip('"').strip("'")
+
+        if selected.upper() == "NONE":
+            return None
+
+        for c in candidates:
+            if c['name'].lower() == selected.lower():
+                return ResolvedCategory(
+                    category_id=c['id'],
+                    path=c['path'],
+                    name=c['name'],
+                    level=c['level'],
+                    method='llm',
+                    confidence=0.85
+                )
+
+        # Fuzzy match selected name to candidates
+        for c in candidates:
+            if selected.lower() in c['name'].lower() or c['name'].lower() in selected.lower():
+                return ResolvedCategory(
+                    category_id=c['id'],
+                    path=c['path'],
+                    name=c['name'],
+                    level=c['level'],
+                    method='llm',
+                    confidence=0.75
+                )
+
+        return None
+
+    async def _hierarchical_classify(
+        self,
+        business_name: str,
+        business_address: Optional[str] = None,
+        google_category: Optional[str] = None
+    ) -> Optional[ResolvedCategory]:
+        """Walk down taxonomy tree using LLM."""
+        context = f"Business: {business_name}"
+        if business_address:
+            context += f"\nAddress: {business_address}"
+        if google_category:
+            context += f"\nHint: {google_category}"
+
+        # Level 1
+        level1 = await self._get_categories(1)
+        sector = await self._llm_select(context, level1, "sector")
+        if not sector:
+            return None
+
+        # Level 2
+        level2 = await self._get_categories(2, sector['path'])
+        biz_type = await self._llm_select(context, level2, "business type", sector['name'])
+        if not biz_type:
+            return None
+
+        # Level 3
+        level3 = await self._get_categories(3, biz_type['path'])
+        specific = await self._llm_select(context, level3, "specific category", biz_type['name'])
+        if not specific:
+            return None
+
+        return ResolvedCategory(
+            category_id=specific['id'],
+            path=specific['path'],
+            name=specific['name'],
+            level=specific['level'],
+            method='hierarchical',
+            confidence=0.7
+        )
+
+    async def _get_categories(self, level: int, parent_path: str = None) -> list[dict]:
+        """Get categories at level, optionally under parent."""
+        async with self.pool.acquire() as conn:
+            if parent_path:
+                rows = await conn.fetch("""
+                    SELECT id, name, path::text as path, level
+                    FROM gbp_categories
+                    WHERE level = $1 AND path <@ $2::ltree
+                    ORDER BY name
+                """, level, parent_path)
+            else:
+                rows = await conn.fetch("""
+                    SELECT id, name, path::text as path, level
+                    FROM gbp_categories
+                    WHERE level = $1
+                    ORDER BY name
+                """, level)
+            return [dict(r) for r in rows]
+
+    async def _llm_select(
+        self,
+        context: str,
+        categories: list[dict],
+        level_name: str,
+        parent: str = None
+    ) -> Optional[dict]:
+        """Ask LLM to select best category."""
+        if not categories:
+            return None
+        if len(categories) == 1:
+            return categories[0]
+
+        cat_list = "\n".join([f"- {c['name']}" for c in categories])
+        parent_ctx = f" within {parent}" if parent else ""
+
+        prompt = f"""{context}
+
+Select the most appropriate {level_name}{parent_ctx}.
+
+Options:
+{cat_list}
+
+Respond with ONLY the exact name from the list."""
+
+        response = await self.llm.complete(prompt)
+        selected = response.strip().strip('"').strip("'")
+
+        for c in categories:
+            if c['name'].lower() == selected.lower():
+                return c
+
+        # Fuzzy fallback
+        for c in categories:
+            if selected.lower() in c['name'].lower():
+                return c
+
+        return categories[0] if categories else None
+
+async def main():
+    # Connect to database
+    pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=5)
+
+    # Initialize LLM client
+    llm = SimpleLLM()
+
+    try:
+        # Get jobs needing category resolution
+        async with pool.acquire() as conn:
+            jobs = await conn.fetch("""
+                SELECT job_id, business_name, business_category, business_address
+                FROM jobs
+                WHERE status = 'completed'
+                  AND gbp_category_path IS NULL
+                ORDER BY created_at DESC
+            """)
+
+        print(f"Found {len(jobs)} jobs needing category resolution\n")
+
+        resolver = CategoryResolver(pool, llm)
+
+        resolved = 0
+        failed = 0
+
+        for job in jobs:
+            job_id = str(job['job_id'])
+            name = job['business_name'] or 'Unknown'
+            google_cat = job['business_category']
+            address = job['business_address']
+
+            print(f"Processing: {name[:50]}...")
+            if google_cat:
+                print(f"  Google category: {google_cat}")
+
+            try:
+                result = await resolver.resolve(
+                    google_category=google_cat,
+                    business_name=name,
+                    business_address=address
+                )
+
+                if result:
+                    # Determine source: google if they had a category, inferred if we used business name
+                    category_source = 'google' if google_cat else 'inferred'
+
+                    # Save to database
+                    async with pool.acquire() as conn:
+                        await conn.execute("""
+                            UPDATE jobs
+                            SET gbp_category_id = $2,
+                                gbp_category_path = $3::ltree,
+                                category_resolution_method = $4,
+                                business_category_source = $5,
+                                updated_at = NOW()
+                            WHERE job_id = $1::uuid
+                        """, job_id, result.category_id, result.path, result.method, category_source)
+
+                    print(f"  ✓ Resolved: {result.path} ({result.method}, source={category_source})")
+                    resolved += 1
+                else:
+                    print(f"  ✗ Could not resolve")
+                    failed += 1
+
+            except Exception as e:
+                print(f"  ✗ Error: {e}")
+                failed += 1
+
+        print(f"\n{'='*50}")
+        print(f"Done! Resolved: {resolved}, Failed: {failed}")
+
+        # Show results
+        async with pool.acquire() as conn:
+            results = await conn.fetch("""
+                SELECT business_name, business_category,
+                       gbp_category_path::text as resolved_path,
+                       category_resolution_method,
+                       business_category_source
+                FROM jobs
+                WHERE status = 'completed' AND gbp_category_path IS NOT NULL
+                ORDER BY created_at DESC
+                LIMIT 10
+            """)
+
+        print(f"\n{'='*50}")
+        print("Recent resolved categories:")
+        for r in results:
+            source = r['business_category_source'] or '-'
+            print(f"  {r['business_name'][:30]:30} | {r['business_category'] or '-':20} | {source:8} -> {r['resolved_path']} ({r['category_resolution_method']})")
+
+    finally:
+        await pool.close()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())