Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -61,7 +61,9 @@ from api.routes import (
    dashboard_router, set_dashboard_db,
    admin_router, set_admin_db,
    pipelines_router, set_pipelines_db,
+    reviewiq_analytics_router, set_reviewiq_analytics_db,
 )
+from api.routes.sessions import router as sessions_router

 # Configure logging
 logging.basicConfig(
@@ -110,6 +112,7 @@ async def lifespan(app: FastAPI):
    set_dashboard_db(db)
    set_admin_db(db)
    set_pipelines_db(db.pool)  # Pipeline router uses raw asyncpg pool
+    set_reviewiq_analytics_db(db.pool)  # ReviewIQ analytics uses raw asyncpg pool

    # Initialize health check system with canary monitoring
    # DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
@@ -124,12 +127,15 @@ async def lifespan(app: FastAPI):

    # Start Chrome worker pools (1 for validation, 2 for scraping)
    # These pre-warm Chrome instances for instant availability
-    # headless=False because Docker uses Xvfb virtual display for better compatibility
+    # In Docker: headless=False with Xvfb virtual display for better compatibility
+    # Locally: use CHROME_HEADLESS env var to control (default: headed for scraping)
+    is_docker = os.path.exists("/.dockerenv") or os.environ.get("DOCKER_CONTAINER", "false").lower() == "true"
+    chrome_headless = os.environ.get("CHROME_HEADLESS", "false").lower() == "true"
    await asyncio.to_thread(
        start_worker_pools,
        validation_size=1,
        scraping_size=2,
-        headless=False
+        headless=chrome_headless if not is_docker else False
    )
    log.info("Chrome worker pools started (1 validation + 2 scraping)")

@@ -172,6 +178,8 @@ app.include_router(batches_router)
 app.include_router(dashboard_router)
 app.include_router(admin_router)
 app.include_router(pipelines_router)
+app.include_router(reviewiq_analytics_router)
+app.include_router(sessions_router)  # Session handoff for validation → scraping


 # ==================== Request/Response Models ====================
@@ -220,6 +228,10 @@ class ScrapeRequest(BaseModel):
    callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
    scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
    scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
+    # Testing options
+    max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
+    # Session handoff (v1.2.0) - reuse browser from validation
+    session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")


 class GoogleReviewsScrapeRequest(BaseModel):
@@ -236,6 +248,10 @@ class GoogleReviewsScrapeRequest(BaseModel):
    callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
    scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
    scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
+    # Testing options
+    max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
+    # Session handoff (v1.2.0) - reuse browser from validation
+    session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")


 class JobResponse(BaseModel):
@@ -548,16 +564,21 @@ async def get_job(job_id: UUID):
        except:
            review_topics = None

-    # Extract business info from metadata if available
-    metadata = job.get('metadata')
-    if isinstance(metadata, str):
-        try:
-            metadata = json.loads(metadata)
-        except:
-            metadata = None
+    # Read business info from dedicated columns (with fallback to metadata for older jobs)
+    business_name = job.get('business_name')
+    business_category = job.get('business_category')

-    business_name = metadata.get('business_name') if metadata else None
-    business_category = metadata.get('business_category') if metadata else None
+    # Fallback to metadata for jobs created before migration
+    if not business_name or not business_category:
+        metadata = job.get('metadata')
+        if isinstance(metadata, str):
+            try:
+                metadata = json.loads(metadata)
+            except:
+                metadata = None
+        if metadata:
+            business_name = business_name or metadata.get('business_name')
+            # Note: business_category was not previously stored in metadata

    return JobResponse(
        job_id=str(job['job_id']),
@@ -1051,17 +1072,22 @@ async def list_jobs(

    result = []
    for job in jobs:
-        # Extract business info from metadata if available
-        metadata = job.get('metadata')
-        if isinstance(metadata, str):
-            try:
-                metadata = json.loads(metadata)
-            except:
-                metadata = None
+        # Read business info from dedicated columns (with fallback to metadata for older jobs)
+        business_name = job.get('business_name')
+        business_address = job.get('business_address')
+        business_category = job.get('business_category')

-        business_name = metadata.get('business_name') if metadata else None
-        business_address = metadata.get('business_address') if metadata else None
-        business_category = metadata.get('business_category') if metadata else None
+        # Fallback to metadata for jobs created before migration
+        if not business_name:
+            metadata = job.get('metadata')
+            if isinstance(metadata, str):
+                try:
+                    metadata = json.loads(metadata)
+                except:
+                    metadata = None
+            if metadata:
+                business_name = business_name or metadata.get('business_name')
+                business_address = business_address or metadata.get('business_address')

        # Parse review_topics if it's a string
        review_topics = job.get('review_topics')
@@ -1191,6 +1217,193 @@ async def get_stats():
    return StatsResponse(**stats)


+# ==================== GBP Categories Endpoints ====================
+
+@app.get("/categories", summary="Get GBP Categories")
+async def get_categories(
+    search: Optional[str] = Query(None, description="Search term for category name"),
+    parent: Optional[str] = Query(None, description="Parent path (ltree) to filter children"),
+    level: Optional[int] = Query(None, description="Category level (1-4)", ge=1, le=4),
+    limit: int = Query(5000, description="Maximum number of results", ge=1, le=10000),
+    offset: int = Query(0, description="Offset for pagination", ge=0),
+):
+    """
+    Get Google Business Profile categories.
+
+    Supports filtering by:
+    - search: Text search in category name
+    - parent: Get children of a specific path
+    - level: Filter by hierarchy level (1=Sector, 2=Business Type, 3=Sub-category, 4=Category)
+    """
+    if not db or not db.pool:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    # Every active filter adds one bound value and one SQL fragment; the
+    # placeholder number is simply that value's 1-based position in `args`.
+    args = []
+    clauses = []
+
+    if search:
+        args.append(f"%{search}%")
+        clauses.append(f"name ILIKE ${len(args)}")
+
+    if parent:
+        # Strict descendants only: the parent row itself is excluded
+        args.append(parent)
+        slot = len(args)
+        clauses.append(f"path <@ ${slot}::ltree AND path != ${slot}::ltree")
+
+    if level:
+        args.append(level)
+        clauses.append(f"level = ${len(args)}")
+
+    where_clause = " AND ".join(clauses) or "TRUE"
+    limit_slot = len(args) + 1
+    offset_slot = limit_slot + 1
+
+    async with db.pool.acquire() as conn:
+        # The total is computed with the filters only, before LIMIT/OFFSET apply
+        total = await conn.fetchval(
+            f"SELECT COUNT(*) FROM gbp_categories WHERE {where_clause}",
+            *args,
+        )
+
+        # One page of matching rows, ordered by ltree path
+        page_query = f"""
+            SELECT id, name, slug, path::text as path, level, parent_id, category_count
+            FROM gbp_categories
+            WHERE {where_clause}
+            ORDER BY path
+            LIMIT ${limit_slot} OFFSET ${offset_slot}
+        """
+        rows = await conn.fetch(page_query, *args, limit, offset)
+
+    return {
+        "categories": [dict(row) for row in rows],
+        "total": total,
+        "limit": limit,
+        "offset": offset,
+    }
+
+
+@app.get("/categories/tree", summary="Get GBP Categories Tree")
+async def get_categories_tree(
+    root: Optional[str] = Query(None, description="Root path to start the tree from"),
+    max_depth: int = Query(4, description="Maximum depth of the tree", ge=1, le=4),
+):
+    """
+    Get categories as a hierarchical tree structure.
+
+    Returns nested categories starting from root (or all roots if not specified).
+    """
+    if not db or not db.pool:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    async with db.pool.acquire() as conn:
+        if root:
+            # Subtree: the root row itself plus every descendant
+            query = """
+                SELECT id, name, slug, path::text as path, level, parent_id, category_count
+                FROM gbp_categories
+                WHERE path <@ $1::ltree
+                ORDER BY path
+            """
+            rows = await conn.fetch(query, root)
+        else:
+            # Get all categories
+            query = """
+                SELECT id, name, slug, path::text as path, level, parent_id, category_count
+                FROM gbp_categories
+                ORDER BY path
+            """
+            rows = await conn.fetch(query)
+
+        categories = [dict(row) for row in rows]
+
+        # Group rows by parent path in a single pass so the tree is assembled
+        # in linear time instead of rescanning the full list for every node.
+        children_of = {}
+        for cat in categories:
+            parent_path = cat['path'].rpartition('.')[0]  # '' for single-segment paths
+            children_of.setdefault(parent_path, []).append(cat)
+
+        def build_nodes(cats, current_depth):
+            """Nest `cats` with their descendants, stopping below max_depth."""
+            if current_depth > max_depth:
+                return []
+            return [
+                {
+                    **cat,
+                    'children': build_nodes(
+                        children_of.get(cat['path'], []), current_depth + 1
+                    ) or None,
+                }
+                for cat in cats
+            ]
+
+        if root:
+            # Start at the requested row itself. Treating only single-segment
+            # paths as roots returned an empty tree for any nested root path.
+            top_level = [cat for cat in categories if cat['path'] == root]
+        else:
+            # Whole table: the roots are the single-segment paths
+            top_level = children_of.get('', [])
+
+        # Depth is counted from the top-level nodes, which sit at depth 1
+        tree = build_nodes(top_level, 1)
+
+        return {
+            "tree": tree,
+            "total": len(categories),
+        }
+
+
+@app.get("/categories/{path:path}", summary="Get Category by Path")
+async def get_category_by_path(path: str):
+    """
+    Get a specific category by its ltree path.
+
+    Also returns ancestors and direct children.
+    """
+    if not db or not db.pool:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    async with db.pool.acquire() as conn:
+        # The requested row; a missing row ends the request with a 404
+        match = await conn.fetchrow("""
+            SELECT id, name, slug, path::text as path, level, parent_id, category_count
+            FROM gbp_categories
+            WHERE path = $1::ltree
+        """, path)
+
+        if not match:
+            raise HTTPException(status_code=404, detail="Category not found")
+
+        # Ancestors: rows whose path contains the requested path (@>),
+        # excluding the requested row itself
+        ancestor_rows = await conn.fetch("""
+            SELECT id, name, slug, path::text as path, level, parent_id, category_count
+            FROM gbp_categories
+            WHERE path @> $1::ltree AND path != $1::ltree
+            ORDER BY path
+        """, path)
+
+        # Direct children: the lquery '<path>.*{1}' matches the path
+        # followed by exactly one more label
+        child_rows = await conn.fetch("""
+            SELECT id, name, slug, path::text as path, level, parent_id, category_count
+            FROM gbp_categories
+            WHERE path ~ ($1 || '.*{1}')::lquery
+            ORDER BY name
+        """, path)
+
+        return {
+            "category": dict(match),
+            "ancestors": [dict(row) for row in ancestor_rows],
+            "children": [dict(row) for row in child_rows],
+        }
+
+
@app.get("/pool-stats", summary="Get Worker Pool Statistics")
 async def pool_stats():
    """Get Chrome worker pool statistics"""
@@ -1331,10 +1544,82 @@ async def get_crash_report(job_id: UUID):
    )


+# Sort orders in the priority used by retry_job(next_sort=true): the first one not yet attempted is selected
+SORT_ORDERS = ["newest", "lowest", "highest", "relevant"]
+
+# Browser profiles get_rotated_fingerprint() chooses from; retry_job applies one when the previous attempt was flagged bot_detected
+import random  # NOTE(review): not referenced in the visible code — confirm it is still needed
+
+FINGERPRINT_PROFILES = [
+    {
+        "platform": "MacIntel",
+        "timezone": "Europe/Madrid",
+        "language": "es-ES",
+        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "viewport": {"width": 1440, "height": 900}
+    },
+    {
+        "platform": "Win32",
+        "timezone": "Europe/London",
+        "language": "en-GB",
+        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
+        "viewport": {"width": 1920, "height": 1080}
+    },
+    {
+        "platform": "MacIntel",
+        "timezone": "America/New_York",
+        "language": "en-US",
+        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+        "viewport": {"width": 1680, "height": 1050}
+    },
+    {
+        "platform": "Win32",
+        "timezone": "Europe/Paris",
+        "language": "fr-FR",
+        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "viewport": {"width": 1366, "height": 768}
+    },
+    {
+        "platform": "MacIntel",
+        "timezone": "Europe/Berlin",
+        "language": "de-DE",
+        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+        "viewport": {"width": 1512, "height": 982}
+    },
+]
+
+def get_rotated_fingerprint(retry_attempt: int = 0, previous_fingerprints: Optional[list] = None) -> dict:
+    """
+    Get a fingerprint profile for retry, avoiding previously used platforms.
+
+    Args:
+        retry_attempt: Which retry attempt this is (0-indexed)
+        previous_fingerprints: Platform names (e.g. "Win32") already used
+
+    Returns:
+        An independent copy of the chosen profile, safe for the caller to mutate
+    """
+    used_platforms = previous_fingerprints or []
+
+    # Skip profiles whose platform was already tried; once every platform has
+    # been used, fall back to the full list so a profile is always returned.
+    available = [fp for fp in FINGERPRINT_PROFILES
+                 if fp["platform"] not in used_platforms] or FINGERPRINT_PROFILES
+
+    # Select based on retry attempt (deterministic but varied)
+    selected = available[retry_attempt % len(available)]
+
+    # dict.copy() is shallow and would share the nested "viewport" dict with the
+    # module-level profile, so nested dicts are copied as well.
+    return {key: dict(value) if isinstance(value, dict) else value
+            for key, value in selected.items()}
+
+
@app.post("/jobs/{job_id}/retry", response_model=RetryJobResponse, summary="Retry Failed Job")
 async def retry_job(
    job_id: UUID,
-    apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis")
+    apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis"),
+    next_sort: bool = Query(False, description="Use a different sort order than the original job (for partial jobs)")
 ):
    """
    Retry a failed or partial job, optionally applying auto-fix parameters.
@@ -1344,6 +1629,11 @@ async def retry_job(
    - Applies recommended parameter adjustments (e.g., reduced batch size for memory issues)
    - Creates a new job with the adjusted parameters

+    When next_sort=true:
+    - Uses a different sort order than previously attempted
+    - Helps get different reviews when stuck at ~1000 limit
+    - Tracks sort_orders_attempted for review merging
+
    Returns the new job ID for tracking.
    """
    if not db:
@@ -1418,6 +1708,72 @@ async def retry_job(
            applied_fixes = analysis.auto_fix_params
            log.info(f"Applying auto-fix for pattern '{analysis.pattern}': {applied_fixes}")

+    # Handle next_sort: use a different sort order than previously attempted
+    selected_sort = None
+    if next_sort:
+        # Get previously attempted sort orders
+        sort_orders_attempted = original_metadata.get('sort_orders_attempted', [])
+
+        # If no sort was tracked, assume "newest" was used (default)
+        if not sort_orders_attempted:
+            initial_sort_used = original_metadata.get('initial_sort_used', 'newest')
+            sort_orders_attempted = [initial_sort_used]
+
+        # Find next unused sort order
+        for sort_order in SORT_ORDERS:
+            if sort_order not in sort_orders_attempted:
+                selected_sort = sort_order
+                break
+
+        if selected_sort:
+            # Set the new sort strategy
+            original_metadata['initial_sort'] = selected_sort
+            original_metadata['sort_strategy'] = 'single'  # Don't auto-trigger multi-sort
+
+            # Track all attempted sorts (including this one)
+            original_metadata['sort_orders_attempted'] = sort_orders_attempted + [selected_sort]
+
+            # Track retry chain for review merging
+            if 'retry_chain' not in original_metadata:
+                original_metadata['retry_chain'] = [str(job_id)]
+            else:
+                original_metadata['retry_chain'].append(str(job_id))
+
+            original_metadata['retry_info'] = original_metadata.get('retry_info', {})
+            original_metadata['retry_info']['original_job_id'] = str(job_id)
+            original_metadata['retry_info']['retry_reason'] = 'next_sort'
+            original_metadata['retry_info']['selected_sort'] = selected_sort
+
+            log.info(f"Retry with next_sort: using '{selected_sort}' (previously tried: {sort_orders_attempted})")
+        else:
+            log.warn(f"All sort orders already attempted: {sort_orders_attempted}")
+
+    # Fingerprint rotation: if bot was detected, use a different fingerprint
+    selected_fingerprint = None
+    if next_sort and original_metadata.get('bot_detected', False):
+        # Get previously used fingerprints
+        previous_fingerprints = original_metadata.get('fingerprints_used', [])
+        retry_count = len(original_metadata.get('retry_chain', []))
+
+        # Get a rotated fingerprint
+        selected_fingerprint = get_rotated_fingerprint(retry_count, previous_fingerprints)
+
+        # Store the fingerprint in metadata
+        original_metadata['browser_fingerprint'] = selected_fingerprint
+
+        # Track used fingerprints
+        if 'fingerprints_used' not in original_metadata:
+            original_metadata['fingerprints_used'] = []
+        original_metadata['fingerprints_used'].append(selected_fingerprint['platform'])
+
+        original_metadata['retry_info']['fingerprint_rotated'] = True
+        original_metadata['retry_info']['new_fingerprint'] = {
+            'platform': selected_fingerprint['platform'],
+            'timezone': selected_fingerprint['timezone']
+        }
+
+        log.info(f"Fingerprint rotated for retry: {selected_fingerprint['platform']}, {selected_fingerprint['timezone']}")
+
    # Create new job with same URL and (possibly modified) metadata
    new_job_id = await db.create_job(
        url=original_job['url'],
@@ -1431,11 +1787,28 @@ async def retry_job(

    log.info(f"Created retry job {new_job_id} for original job {job_id}")

+    # Build response message
+    message = f"Retry job created from original job {job_id}"
+    if selected_sort:
+        message += f" (using sort: {selected_sort})"
+    if selected_fingerprint:
+        message += f" (fingerprint: {selected_fingerprint['platform']}/{selected_fingerprint['timezone']})"
+
+    # Build applied_fixes response
+    retry_fixes = {}
+    if selected_sort:
+        retry_fixes["selected_sort"] = selected_sort
+    if selected_fingerprint:
+        retry_fixes["fingerprint"] = {
+            "platform": selected_fingerprint["platform"],
+            "timezone": selected_fingerprint["timezone"]
+        }
+
    return RetryJobResponse(
        job_id=str(new_job_id),
        status="started",
-        message=f"Retry job created from original job {job_id}",
-        applied_fixes=applied_fixes
+        message=message,
+        applied_fixes=applied_fixes if applied_fixes else (retry_fixes if retry_fixes else None)
    )


@@ -1529,8 +1902,9 @@ async def liveness():

    Use this for Kubernetes liveness probe - restart container if fails.
    """
+    # If health system is disabled, just return healthy (server is alive)
    if not health_system:
-        raise HTTPException(status_code=503, detail="Health system not initialized")
+        return {"status": "healthy", "message": "Server is alive (health system disabled)"}

    return await health_system.check_liveness()

@@ -1542,8 +1916,12 @@ async def readiness():

    Use this for Kubernetes readiness probe - remove from load balancer if fails.
    """
+    # If health system is disabled, check if DB is connected
    if not health_system:
-        raise HTTPException(status_code=503, detail="Health system not initialized")
+        if db and db.pool:
+            return {"status": "ready", "message": "Server is ready (health system disabled)"}
+        else:
+            raise HTTPException(status_code=503, detail="Database not connected")

    result = await health_system.check_readiness()

@@ -1728,17 +2106,67 @@ async def run_scraping_job(job_id: UUID):
            scraper_func, actual_version = get_scraper_for_version(requested_version)
            log.info(f"Using scraper version {actual_version} for job {job_id}")

-            # Run scraping with progress callback and shared log capture
-            # headless=False because Docker uses Xvfb virtual display
-            result = await asyncio.to_thread(
-                scraper_func,
-                url=url,
-                headless=False,
-                progress_callback=progress_callback,
-                log_capture=log_capture,
-                flush_callback=flush_callback,
-                browser_fingerprint=browser_fingerprint  # Pass user's browser fingerprint
-            )
+            # Get sort strategy parameters from metadata (for retry with different sort)
+            initial_sort = metadata.get('initial_sort') if metadata else None
+            sort_strategy = metadata.get('sort_strategy', 'auto') if metadata else 'auto'
+            max_reviews = metadata.get('max_reviews') if metadata else None
+            session_id = metadata.get('session_id') if metadata else None
+            if initial_sort:
+                log.info(f"Using initial_sort={initial_sort}, sort_strategy={sort_strategy} for job {job_id}")
+            if max_reviews:
+                log.info(f"Using max_reviews={max_reviews} limit for job {job_id} (testing mode)")
+
+            # Check if we have a session_id for browser reuse (session handoff from validation)
+            if session_id:
+                log.info(f"Using session handoff (session_id={session_id}) for job {job_id} - skipping navigation")
+                from scrapers.google_reviews.v1_2_0 import scrape_with_session
+                result = await asyncio.to_thread(
+                    scrape_with_session,
+                    session_id=session_id,
+                    max_reviews=max_reviews,
+                    progress_callback=progress_callback,
+                    flush_callback=flush_callback,
+                    sort_strategy=sort_strategy,
+                    initial_sort=initial_sort
+                )
+                # Add logs from session scraping
+                if 'logs' in result:
+                    for log_entry in result.get('logs', []):
+                        log_capture.entries.append(log_entry)
+            else:
+                # Run scraping with progress callback and shared log capture
+                # headless=False because Docker uses Xvfb virtual display
+                result = await asyncio.to_thread(
+                    scraper_func,
+                    url=url,
+                    headless=False,
+                    progress_callback=progress_callback,
+                    log_capture=log_capture,
+                    flush_callback=flush_callback,
+                    browser_fingerprint=browser_fingerprint,  # Pass user's browser fingerprint
+                    initial_sort=initial_sort,  # Sort order for retry strategy
+                    sort_strategy=sort_strategy,  # Sort strategy (auto, multi, single)
+                    max_reviews=max_reviews  # Optional limit for testing
+                )
+
+            # Update job metadata with tracking info from scraper result
+            tracking_metadata = {
+                'bot_detected': result.get('bot_detected', False),
+                'initial_sort_used': result.get('initial_sort_used', 'newest'),
+                'multi_sort': result.get('multi_sort', {}),
+            }
+            # Preserve existing sort_orders_attempted and add current sort
+            existing_sorts = metadata.get('sort_orders_attempted', []) if metadata else []
+            current_sort = result.get('initial_sort_used', 'newest')
+            if current_sort not in existing_sorts:
+                tracking_metadata['sort_orders_attempted'] = existing_sorts + [current_sort]
+            else:
+                tracking_metadata['sort_orders_attempted'] = existing_sorts
+
+            # Update metadata in database
+            await db.update_job_metadata(job_id, tracking_metadata)
+            if result.get('bot_detected'):
+                log.warn(f"Bot detection flagged for job {job_id} - sort button was hidden")

            if result['success']:
                # Save session fingerprint if captured
@@ -1746,6 +2174,18 @@ async def run_scraping_job(job_id: UUID):
                    await db.update_session_fingerprint(job_id, result['session_fingerprint'])
                    log.info(f"Saved session fingerprint for job {job_id}")

+                # Save business info to dedicated columns (queryable/indexable)
+                business_info = result.get('business_info', {})
+                if business_info:
+                    await db.update_business_info(
+                        job_id=job_id,
+                        business_name=business_info.get('name'),
+                        business_category=business_info.get('category'),
+                        business_address=business_info.get('address'),
+                        business_rating=business_info.get('rating')
+                    )
+                    log.info(f"Saved business info for job {job_id}: {business_info.get('name')} ({business_info.get('category')})")
+
                # Save results to database (including scraper logs and review topics)
                await db.save_job_result(
                    job_id=job_id,