Add browser fingerprint support and analytics metadata display

- Transfer user's browser fingerprint (user-agent, viewport, timezone, language, geolocation) to Chrome for more authentic scraping - Display review topics from Google Maps in analytics dashboard - Show business category badge in analytics header - Fix date_text null handling in analytics (handle undefined/timestamp fields) - Add review_topics and business_category to JobStatus interface Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 10:36:06 +00:00
parent 1bd30c0789
commit a540ab97b1
9 changed files with 1214 additions and 231 deletions
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -133,12 +133,36 @@ app.add_middleware(

 # ==================== Request/Response Models ====================

+class GeolocationModel(BaseModel):
+    """Geolocation coordinates"""
+    lat: float = Field(..., description="Latitude")
+    lng: float = Field(..., description="Longitude")
+
+
+class ViewportModel(BaseModel):
+    """Browser viewport size"""
+    width: int = Field(..., description="Viewport width")
+    height: int = Field(..., description="Viewport height")
+
+
+class BrowserFingerprintModel(BaseModel):
+    """Browser fingerprint to replicate user's browser"""
+    geolocation: Optional[GeolocationModel] = None
+    userAgent: Optional[str] = Field(None, description="User agent string")
+    viewport: Optional[ViewportModel] = Field(None, description="Screen resolution")
+    timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)")
+    language: Optional[str] = Field(None, description="Browser language (e.g., en-US)")
+    platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
+
+
 class ScrapeRequest(BaseModel):
    """Request model for starting a scrape job"""
    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
    webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
    webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
+    geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
+    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")


 class JobResponse(BaseModel):
@@ -149,6 +173,7 @@ class JobResponse(BaseModel):
    created_at: str
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
+    updated_at: Optional[str] = None  # Last update time for progress tracking
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available for this place
    scrape_time: Optional[float] = None
@@ -157,6 +182,8 @@ class JobResponse(BaseModel):
    # Business metadata
    business_name: Optional[str] = None
    business_address: Optional[str] = None
+    business_category: Optional[str] = None  # Category (e.g., "Barber shop")
+    review_topics: Optional[List[Dict[str, Any]]] = None  # Topic filters with mention counts


 class ReviewsResponse(BaseModel):
@@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest):
        raise HTTPException(status_code=500, detail="Database not initialized")

    try:
+        # Merge browser fingerprint into metadata if provided
+        metadata = request.metadata or {}
+        if request.browser_fingerprint:
+            fp = request.browser_fingerprint
+            metadata['browser_fingerprint'] = {
+                "userAgent": fp.userAgent,
+                "timezone": fp.timezone,
+                "language": fp.language,
+                "platform": fp.platform,
+            }
+            if fp.viewport:
+                metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
+            if fp.geolocation:
+                metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
+        elif request.geolocation:
+            metadata['geolocation'] = {
+                'lat': request.geolocation.lat,
+                'lng': request.geolocation.lng
+            }
+
        # Create job in database
        job_id = await db.create_job(
            url=str(request.url),
            webhook_url=str(request.webhook_url) if request.webhook_url else None,
            webhook_secret=request.webhook_secret,
-            metadata=request.metadata
+            metadata=metadata
        )

        # Start scraping job in background
@@ -240,6 +287,25 @@ async def get_job(job_id: UUID):
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

+    # Parse review_topics if it's a string (JSONB might be returned as string)
+    review_topics = job.get('review_topics')
+    if isinstance(review_topics, str):
+        try:
+            review_topics = json.loads(review_topics)
+        except:
+            review_topics = None
+
+    # Extract business info from metadata if available
+    metadata = job.get('metadata')
+    if isinstance(metadata, str):
+        try:
+            metadata = json.loads(metadata)
+        except:
+            metadata = None
+
+    business_name = metadata.get('business_name') if metadata else None
+    business_category = metadata.get('business_category') if metadata else None
+
    return JobResponse(
        job_id=str(job['job_id']),
        status=job['status'],
@@ -247,11 +313,15 @@ async def get_job(job_id: UUID):
        created_at=job['created_at'].isoformat(),
        started_at=job['started_at'].isoformat() if job['started_at'] else None,
        completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
+        updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None,
        reviews_count=job['reviews_count'],
        total_reviews=job.get('total_reviews'),
        scrape_time=job['scrape_time'],
        error_message=job['error_message'],
-        webhook_url=job.get('webhook_url')
+        webhook_url=job.get('webhook_url'),
+        business_name=business_name,
+        business_category=business_category,
+        review_topics=review_topics
    )


@@ -541,25 +611,32 @@ async def stream_all_jobs():
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
 async def get_job_reviews(job_id: UUID):
    """
-    Get the actual reviews data for a completed job.
+    Get reviews data for a job.

-    Returns 404 if job not found or not completed yet.
+    Returns reviews for completed, partial, or running jobs (if reviews have been collected).
+    Returns 404 if job not found or no reviews available yet.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

-    reviews = await db.get_job_reviews(job_id)
+    # Get reviews (includes completed, running, and partial jobs)
+    reviews = await db.get_job_reviews(job_id, include_partial=True)
    if reviews is None:
        job = await db.get_job(job_id)
        if not job:
            raise HTTPException(status_code=404, detail="Job not found")
-        elif job['status'] != 'completed':
+        elif job['status'] == 'pending':
            raise HTTPException(
                status_code=400,
-                detail=f"Job not completed yet (current status: {job['status']})"
+                detail="Job has not started yet"
+            )
+        elif job['status'] == 'failed':
+            raise HTTPException(
+                status_code=400,
+                detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}"
            )
        else:
-            raise HTTPException(status_code=404, detail="Reviews data not available")
+            raise HTTPException(status_code=404, detail="No reviews data available yet")

    return ReviewsResponse(
        job_id=str(job_id),
@@ -603,6 +680,15 @@ async def list_jobs(

        business_name = metadata.get('business_name') if metadata else None
        business_address = metadata.get('business_address') if metadata else None
+        business_category = metadata.get('business_category') if metadata else None
+
+        # Parse review_topics if it's a string
+        review_topics = job.get('review_topics')
+        if isinstance(review_topics, str):
+            try:
+                review_topics = json.loads(review_topics)
+            except:
+                review_topics = None

        result.append(JobResponse(
            job_id=str(job['job_id']),
@@ -615,7 +701,9 @@ async def list_jobs(
            scrape_time=job.get('scrape_time'),
            error_message=job.get('error_message'),
            business_name=business_name,
-            business_address=business_address
+            business_address=business_address,
+            business_category=business_category,
+            review_topics=review_topics
        ))

    return result
@@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest):
    Get business card information from Google Maps.
    Returns business name, address, rating, and review count.

-    Uses pre-warmed Chrome worker from pool for instant response.
+    Creates a fresh Chrome instance for reliable results (same as full scraper).
    This is used to show the business confirmation card in the UI.
    """
-    worker = None
-    recycle_worker = False
-
    try:
        url = str(request.url)

-        # Get pre-warmed worker from validation pool
-        worker = await asyncio.to_thread(get_validation_worker, timeout=10)
+        # Use the SAME scraper algorithm with validation_only=True for early return
+        # Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state
+        # Pooled browsers can have cookies/state that cause Google to render pages differently

-        if worker:
-            log.info(f"Using worker {worker.worker_id} for business card extraction")
-            # Use the pooled worker (don't close it)
-            result = await asyncio.to_thread(
-                get_business_card_info,
-                url=url,
-                driver=worker.driver,
-                return_driver=True
-            )
-
-            # Check if the result indicates a session error
-            if not result['success'] and result.get('error'):
-                error_msg = result.get('error', '').lower()
-                if 'invalid session' in error_msg or 'session' in error_msg:
-                    log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
-                    recycle_worker = True
+        # Build fingerprint dict from request
+        fingerprint = None
+        if request.browser_fingerprint:
+            fp = request.browser_fingerprint
+            fingerprint = {
+                "userAgent": fp.userAgent,
+                "timezone": fp.timezone,
+                "language": fp.language,
+                "platform": fp.platform,
+            }
+            if fp.viewport:
+                fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
+            if fp.geolocation:
+                fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
+            log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}")
+        elif request.geolocation:
+            fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}
+            log.info(f"Creating Chrome with geolocation only")
        else:
-            # Fallback: create temporary worker
-            log.warning("No pooled worker available, creating temporary instance")
-            result = await asyncio.to_thread(
-                get_business_card_info,
-                url=url
-            )
+            log.info(f"Creating Chrome with default settings")

-        # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
-        # Let the actual scraper determine if reviews exist
-        has_business = bool(result.get('name') and result.get('rating'))
+        result = await asyncio.to_thread(
+            fast_scrape_reviews,
+            url=url,
+            headless=False,  # Use Xvfb display
+            validation_only=True,  # Return early after getting total_reviews
+            browser_fingerprint=fingerprint  # Pass user's browser fingerprint
+        )
+
+        # Extract validation info from the result
+        validation_info = result.get('validation_info', {})
+        total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0
+        name = validation_info.get('name')
+        rating = validation_info.get('rating')
+        category = validation_info.get('category')
+        address = validation_info.get('address')
+
+        # Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0)
+        has_reviews = bool(name and total_reviews > 0)

        return {
-            "has_reviews": has_business,  # Boolean: true if business exists
-            "total_reviews": result.get('total_reviews') or 0,  # Show 0 if unknown
-            "name": result.get('name'),
-            "address": result.get('address'),
-            "rating": result.get('rating'),
-            "success": result['success'],
+            "has_reviews": has_reviews,  # True if business has reviews
+            "total_reviews": total_reviews,
+            "name": name,
+            "address": address,
+            "rating": rating,
+            "category": category,
+            "success": result.get('success', True),
            "error": result.get('error')
        }

    except Exception as e:
        log.error(f"Error checking reviews: {e}")
-        # If it's a session error, recycle the worker
-        if worker:
-            error_msg = str(e).lower()
-            if 'invalid session' in error_msg or 'session' in error_msg:
-                recycle_worker = True

        return {
            "has_reviews": False,
@@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest):
            "success": False,
            "error": str(e)
        }
-    finally:
-        # Release worker back to pool (or recycle if broken)
-        if worker:
-            await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)


@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
@@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID):
            job = await db.get_job(job_id)
            url = job['url']

+            # Extract browser fingerprint from metadata if available
+            browser_fingerprint = None
+            metadata = job.get('metadata')
+            if isinstance(metadata, str):
+                try:
+                    metadata = json.loads(metadata)
+                except:
+                    metadata = None
+            if metadata and 'browser_fingerprint' in metadata:
+                browser_fingerprint = metadata['browser_fingerprint']
+                log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}")
+            elif metadata and 'geolocation' in metadata:
+                browser_fingerprint = {'geolocation': metadata['geolocation']}
+                log.info(f"Using user geolocation only")
+
            # Broadcast job started via SSE
            await broadcast_job_update(job_id_str, "job_started", {
                "job_id": job_id_str,
@@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID):
            # Create log capture instance that we can access for real-time logs
            log_capture = LogCapture()

+            # Track total reviews for incremental saves
+            total_reviews_seen = [None]
+            # Accumulate all reviews for incremental saves (flush_callback receives batches)
+            all_reviews_collected = []
+
            # Progress callback to update job status with current/total counts AND logs
            def progress_callback(current_count: int, total_count: int):
                """Update job progress and logs from worker thread"""
+                if total_count:
+                    total_reviews_seen[0] = total_count
+
                async def update():
                    # Get current logs from the shared log_capture
                    current_logs = log_capture.get_logs()
@@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID):
                # Schedule the coroutine on the event loop
                asyncio.run_coroutine_threadsafe(update(), loop)

+            # Flush callback to save reviews incrementally (crash recovery)
+            # Note: flush_callback receives batches, so we accumulate them
+            def flush_callback(reviews_batch: list):
+                """Accumulate and save reviews to DB incrementally from worker thread"""
+                # Extend our collection with the new batch
+                all_reviews_collected.extend(reviews_batch)
+
+                async def save():
+                    await db.save_reviews_incremental(
+                        job_id=job_id,
+                        reviews=all_reviews_collected,  # Save ALL reviews so far
+                        total_reviews=total_reviews_seen[0]
+                    )
+                # Schedule the coroutine on the event loop
+                asyncio.run_coroutine_threadsafe(save(), loop)
+
            # Run scraping with progress callback and shared log capture
            # headless=False because Docker uses Xvfb virtual display
            result = await asyncio.to_thread(
@@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID):
                url=url,
                headless=False,
                progress_callback=progress_callback,
-                log_capture=log_capture
+                log_capture=log_capture,
+                flush_callback=flush_callback,
+                browser_fingerprint=browser_fingerprint  # Pass user's browser fingerprint
            )

            if result['success']:
-                # Save results to database (including scraper logs)
+                # Save results to database (including scraper logs and review topics)
                await db.save_job_result(
                    job_id=job_id,
                    reviews=result['reviews'],
                    scrape_time=result['time'],
                    total_reviews=result.get('total_reviews'),
-                    scrape_logs=result.get('logs')
+                    scrape_logs=result.get('logs'),
+                    review_topics=result.get('review_topics')
                )

                log.info(
@@ -898,68 +1030,142 @@ async def run_scraping_job(job_id: UUID):
                    )

            else:
-                # Job failed - save logs for debugging
-                await db.update_job_status(
-                    job_id,
-                    JobStatus.FAILED,
-                    error_message=result.get('error', 'Unknown error'),
-                    scrape_logs=result.get('logs')
-                )
+                # Job failed - check if we have partial reviews saved
+                current_job = await db.get_job(job_id)
+                partial_count = current_job.get('reviews_count', 0) if current_job else 0

-                log.error(f"Failed job {job_id}: {result.get('error')}")
-
-                # Broadcast job failed via SSE
-                await broadcast_job_update(job_id_str, "job_failed", {
-                    "job_id": job_id_str,
-                    "status": "failed",
-                    "error_message": result.get('error'),
-                    "logs": result.get('logs', [])
-                })
-
-                # Send failure webhook if configured
-                if job.get('webhook_url'):
-                    webhook_manager = WebhookManager()
-                    await webhook_manager.send_job_completed_webhook(
-                        webhook_url=job['webhook_url'],
-                        job_id=job_id,
-                        status='failed',
-                        error_message=result.get('error'),
-                        secret=job.get('webhook_secret'),
-                        db=db
+                if partial_count > 0:
+                    # Mark as partial - we have some reviews saved
+                    await db.mark_job_partial(
+                        job_id,
+                        error_message=result.get('error', 'Unknown error'),
+                        scrape_logs=result.get('logs')
                    )

+                    log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}")
+
+                    # Broadcast job partial via SSE
+                    await broadcast_job_update(job_id_str, "job_partial", {
+                        "job_id": job_id_str,
+                        "status": "partial",
+                        "reviews_count": partial_count,
+                        "total_reviews": current_job.get('total_reviews'),
+                        "error_message": result.get('error'),
+                        "logs": result.get('logs', [])
+                    })
+
+                    # Send partial webhook if configured
+                    if job.get('webhook_url'):
+                        webhook_manager = WebhookManager()
+                        await webhook_manager.send_job_completed_webhook(
+                            webhook_url=job['webhook_url'],
+                            job_id=job_id,
+                            status='partial',
+                            reviews_count=partial_count,
+                            error_message=result.get('error'),
+                            secret=job.get('webhook_secret'),
+                            db=db
+                        )
+                else:
+                    # No reviews saved - mark as failed
+                    await db.update_job_status(
+                        job_id,
+                        JobStatus.FAILED,
+                        error_message=result.get('error', 'Unknown error'),
+                        scrape_logs=result.get('logs')
+                    )
+
+                    log.error(f"Failed job {job_id}: {result.get('error')}")
+
+                    # Broadcast job failed via SSE
+                    await broadcast_job_update(job_id_str, "job_failed", {
+                        "job_id": job_id_str,
+                        "status": "failed",
+                        "error_message": result.get('error'),
+                        "logs": result.get('logs', [])
+                    })
+
+                    # Send failure webhook if configured
+                    if job.get('webhook_url'):
+                        webhook_manager = WebhookManager()
+                        await webhook_manager.send_job_completed_webhook(
+                            webhook_url=job['webhook_url'],
+                            job_id=job_id,
+                            status='failed',
+                            error_message=result.get('error'),
+                            secret=job.get('webhook_secret'),
+                            db=db
+                        )
+
        except Exception as e:
            log.error(f"Error in scraping job {job_id}: {e}")
            import traceback
            traceback.print_exc()

-            await db.update_job_status(
-                job_id,
-                JobStatus.FAILED,
-                error_message=str(e)
-            )
+            # Check if we have partial reviews saved
+            current_job = await db.get_job(job_id)
+            partial_count = current_job.get('reviews_count', 0) if current_job else 0

-            # Broadcast job failed via SSE
-            await broadcast_job_update(job_id_str, "job_failed", {
-                "job_id": job_id_str,
-                "status": "failed",
-                "error_message": str(e),
-                "logs": []
-            })
-
-            # Send failure webhook
-            job = await db.get_job(job_id)
-            if job and job.get('webhook_url'):
-                webhook_manager = WebhookManager()
-                await webhook_manager.send_job_completed_webhook(
-                    webhook_url=job['webhook_url'],
-                    job_id=job_id,
-                    status='failed',
+            if partial_count > 0:
+                # Mark as partial - we have some reviews saved
+                await db.mark_job_partial(
+                    job_id,
                    error_message=str(e),
-                    secret=job.get('webhook_secret'),
-                    db=db
+                    scrape_logs=log_capture.get_logs() if log_capture else None
                )

+                log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}")
+
+                # Broadcast job partial via SSE
+                await broadcast_job_update(job_id_str, "job_partial", {
+                    "job_id": job_id_str,
+                    "status": "partial",
+                    "reviews_count": partial_count,
+                    "total_reviews": current_job.get('total_reviews'),
+                    "error_message": str(e),
+                    "logs": log_capture.get_logs() if log_capture else []
+                })
+
+                # Send partial webhook
+                if current_job and current_job.get('webhook_url'):
+                    webhook_manager = WebhookManager()
+                    await webhook_manager.send_job_completed_webhook(
+                        webhook_url=current_job['webhook_url'],
+                        job_id=job_id,
+                        status='partial',
+                        reviews_count=partial_count,
+                        error_message=str(e),
+                        secret=current_job.get('webhook_secret'),
+                        db=db
+                    )
+            else:
+                # No reviews saved - mark as failed
+                await db.update_job_status(
+                    job_id,
+                    JobStatus.FAILED,
+                    error_message=str(e)
+                )
+
+                # Broadcast job failed via SSE
+                await broadcast_job_update(job_id_str, "job_failed", {
+                    "job_id": job_id_str,
+                    "status": "failed",
+                    "error_message": str(e),
+                    "logs": []
+                })
+
+                # Send failure webhook
+                if current_job and current_job.get('webhook_url'):
+                    webhook_manager = WebhookManager()
+                    await webhook_manager.send_job_completed_webhook(
+                        webhook_url=current_job['webhook_url'],
+                        job_id=job_id,
+                        status='failed',
+                        error_message=str(e),
+                        secret=current_job.get('webhook_secret'),
+                        db=db
+                    )
+

 if __name__ == "__main__":
    import uvicorn