Add browser fingerprint support and analytics metadata display

- Transfer user's browser fingerprint (user-agent, viewport, timezone, language, geolocation) to Chrome for more authentic scraping - Display review topics from Google Maps in analytics dashboard - Show business category badge in analytics header - Fix date_text null handling in analytics (handle undefined/timestamp fields) - Add review_topics and business_category to JobStatus interface Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 10:36:06 +00:00
parent 1bd30c0789
commit a540ab97b1
9 changed files with 1214 additions and 231 deletions
--- a/22
+++ b/22
@@ -39,6 +39,13 @@ RUN apt-get update \
    && apt-get install -y chromium chromium-driver \
    && rm -rf /var/lib/apt/lists/*
 # Install VNC server and noVNC (browser-based VNC viewer)
 RUN apt-get update && apt-get install -y \
    x11vnc \
    novnc \
    websockify \
    && rm -rf /var/lib/apt/lists/*
 # Set working directory
 WORKDIR /app
@@ -51,7 +58,7 @@ COPY modules/ ./modules/
 COPY api_server_production.py .
 COPY config.yaml .
-# Create startup script for Xvfb + API server
+# Create startup script for Xvfb + VNC + API server
 RUN echo '#!/bin/bash\n\
 # Start Xvfb (virtual display) in background\n\
 Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
@@ -60,6 +67,15 @@ export DISPLAY=:99\n\
 # Wait for Xvfb to start\n\
 sleep 2\n\
 \n\
 # Start VNC server (no password for local dev, binds to all interfaces)\n\
 x11vnc -display :99 -forever -shared -rfbport 5900 -nopw -bg\n\
 \n\
 # Start noVNC websocket proxy (browser access at http://localhost:6080/vnc.html)\n\
 websockify --web=/usr/share/novnc/ 6080 localhost:5900 &\n\
 \n\
 echo "VNC server running on port 5900"\n\
 echo "noVNC web interface at http://localhost:6080/vnc.html"\n\
 \n\
 # Start API server\n\
 exec python api_server_production.py\n\
 ' > /app/start.sh && chmod +x /app/start.sh
@@ -71,8 +87,8 @@ RUN useradd -m -u 1000 scraper && \
 USER scraper
-# Expose port
+# Expose ports: API (8000), VNC (5900), noVNC web (6080)
-EXPOSE 8000
+EXPOSE 8000 5900 6080
 # Environment variables for Chromium in container
 ENV DISPLAY=:99
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -133,12 +133,36 @@ app.add_middleware(
 # ==================== Request/Response Models ====================
 class GeolocationModel(BaseModel):
    """Geolocation coordinates"""
    lat: float = Field(..., description="Latitude")
    lng: float = Field(..., description="Longitude")
 class ViewportModel(BaseModel):
    """Browser viewport size"""
    width: int = Field(..., description="Viewport width")
    height: int = Field(..., description="Viewport height")
 class BrowserFingerprintModel(BaseModel):
    """Browser fingerprint to replicate user's browser"""
    geolocation: Optional[GeolocationModel] = None
    userAgent: Optional[str] = Field(None, description="User agent string")
    viewport: Optional[ViewportModel] = Field(None, description="Screen resolution")
    timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)")
    language: Optional[str] = Field(None, description="Browser language (e.g., en-US)")
    platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
 class ScrapeRequest(BaseModel):
    """Request model for starting a scrape job"""
    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
    webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
    webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
    geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
    browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
 class JobResponse(BaseModel):
@@ -149,6 +173,7 @@ class JobResponse(BaseModel):
    created_at: str
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    updated_at: Optional[str] = None  # Last update time for progress tracking
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available for this place
    scrape_time: Optional[float] = None
@@ -157,6 +182,8 @@ class JobResponse(BaseModel):
    # Business metadata
    business_name: Optional[str] = None
    business_address: Optional[str] = None
    business_category: Optional[str] = None  # Category (e.g., "Barber shop")
    review_topics: Optional[List[Dict[str, Any]]] = None  # Topic filters with mention counts
 class ReviewsResponse(BaseModel):
@@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest):
        raise HTTPException(status_code=500, detail="Database not initialized")
    try:
        # Merge browser fingerprint into metadata if provided
        metadata = request.metadata or {}
        if request.browser_fingerprint:
            fp = request.browser_fingerprint
            metadata['browser_fingerprint'] = {
                "userAgent": fp.userAgent,
                "timezone": fp.timezone,
                "language": fp.language,
                "platform": fp.platform,
            }
            if fp.viewport:
                metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
            if fp.geolocation:
                metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
        elif request.geolocation:
            metadata['geolocation'] = {
                'lat': request.geolocation.lat,
                'lng': request.geolocation.lng
            }
        # Create job in database
        job_id = await db.create_job(
            url=str(request.url),
            webhook_url=str(request.webhook_url) if request.webhook_url else None,
            webhook_secret=request.webhook_secret,
-            metadata=request.metadata
+            metadata=metadata
        )
        # Start scraping job in background
@@ -240,6 +287,25 @@ async def get_job(job_id: UUID):
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    # Parse review_topics if it's a string (JSONB might be returned as string)
    review_topics = job.get('review_topics')
    if isinstance(review_topics, str):
        try:
            review_topics = json.loads(review_topics)
        except:
            review_topics = None
    # Extract business info from metadata if available
    metadata = job.get('metadata')
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except:
            metadata = None
    business_name = metadata.get('business_name') if metadata else None
    business_category = metadata.get('business_category') if metadata else None
    return JobResponse(
        job_id=str(job['job_id']),
        status=job['status'],
@@ -247,11 +313,15 @@ async def get_job(job_id: UUID):
        created_at=job['created_at'].isoformat(),
        started_at=job['started_at'].isoformat() if job['started_at'] else None,
        completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
        updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None,
        reviews_count=job['reviews_count'],
        total_reviews=job.get('total_reviews'),
        scrape_time=job['scrape_time'],
        error_message=job['error_message'],
-        webhook_url=job.get('webhook_url')
+        webhook_url=job.get('webhook_url'),
        business_name=business_name,
        business_category=business_category,
        review_topics=review_topics
    )
@@ -541,25 +611,32 @@ async def stream_all_jobs():
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
 async def get_job_reviews(job_id: UUID):
    """
-    Get the actual reviews data for a completed job.
+    Get reviews data for a job.
-    Returns 404 if job not found or not completed yet.
+    Returns reviews for completed, partial, or running jobs (if reviews have been collected).
    Returns 404 if job not found or no reviews available yet.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")
-    reviews = await db.get_job_reviews(job_id)
+    # Get reviews (includes completed, running, and partial jobs)
    reviews = await db.get_job_reviews(job_id, include_partial=True)
    if reviews is None:
        job = await db.get_job(job_id)
        if not job:
            raise HTTPException(status_code=404, detail="Job not found")
-        elif job['status'] != 'completed':
+        elif job['status'] == 'pending':
            raise HTTPException(
                status_code=400,
-                detail=f"Job not completed yet (current status: {job['status']})"
+                detail="Job has not started yet"
            )
        elif job['status'] == 'failed':
            raise HTTPException(
                status_code=400,
                detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}"
            )
        else:
-            raise HTTPException(status_code=404, detail="Reviews data not available")
+            raise HTTPException(status_code=404, detail="No reviews data available yet")
    return ReviewsResponse(
        job_id=str(job_id),
@@ -603,6 +680,15 @@ async def list_jobs(
        business_name = metadata.get('business_name') if metadata else None
        business_address = metadata.get('business_address') if metadata else None
        business_category = metadata.get('business_category') if metadata else None
        # Parse review_topics if it's a string
        review_topics = job.get('review_topics')
        if isinstance(review_topics, str):
            try:
                review_topics = json.loads(review_topics)
            except:
                review_topics = None
        result.append(JobResponse(
            job_id=str(job['job_id']),
@@ -615,7 +701,9 @@ async def list_jobs(
            scrape_time=job.get('scrape_time'),
            error_message=job.get('error_message'),
            business_name=business_name,
-            business_address=business_address
+            business_address=business_address,
            business_category=business_category,
            review_topics=review_topics
        ))
    return result
@@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest):
    Get business card information from Google Maps.
    Returns business name, address, rating, and review count.
-    Uses pre-warmed Chrome worker from pool for instant response.
+    Creates a fresh Chrome instance for reliable results (same as full scraper).
    This is used to show the business confirmation card in the UI.
    """
    worker = None
    recycle_worker = False
    try:
        url = str(request.url)
-        # Get pre-warmed worker from validation pool
+        # Use the SAME scraper algorithm with validation_only=True for early return
-        worker = await asyncio.to_thread(get_validation_worker, timeout=10)
+        # Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state
        # Pooled browsers can have cookies/state that cause Google to render pages differently
-        if worker:
+        # Build fingerprint dict from request
-            log.info(f"Using worker {worker.worker_id} for business card extraction")
+        fingerprint = None
-            # Use the pooled worker (don't close it)
+        if request.browser_fingerprint:
-            result = await asyncio.to_thread(
+            fp = request.browser_fingerprint
-                get_business_card_info,
+            fingerprint = {
-                url=url,
+                "userAgent": fp.userAgent,
-                driver=worker.driver,
+                "timezone": fp.timezone,
-                return_driver=True
+                "language": fp.language,
-            )
+                "platform": fp.platform,
-
+            }
-            # Check if the result indicates a session error
+            if fp.viewport:
-            if not result['success'] and result.get('error'):
+                fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
-                error_msg = result.get('error', '').lower()
+            if fp.geolocation:
-                if 'invalid session' in error_msg or 'session' in error_msg:
+                fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
-                    log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
+            log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}")
-                    recycle_worker = True
+        elif request.geolocation:
            fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}
            log.info(f"Creating Chrome with geolocation only")
        else:
-            # Fallback: create temporary worker
+            log.info(f"Creating Chrome with default settings")
            log.warning("No pooled worker available, creating temporary instance")
            result = await asyncio.to_thread(
                get_business_card_info,
                url=url
            )
-        # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
+        result = await asyncio.to_thread(
-        # Let the actual scraper determine if reviews exist
+            fast_scrape_reviews,
-        has_business = bool(result.get('name') and result.get('rating'))
+            url=url,
            headless=False,  # Use Xvfb display
            validation_only=True,  # Return early after getting total_reviews
            browser_fingerprint=fingerprint  # Pass user's browser fingerprint
        )
        # Extract validation info from the result
        validation_info = result.get('validation_info', {})
        total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0
        name = validation_info.get('name')
        rating = validation_info.get('rating')
        category = validation_info.get('category')
        address = validation_info.get('address')
        # Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0)
        has_reviews = bool(name and total_reviews > 0)
        return {
-            "has_reviews": has_business,  # Boolean: true if business exists
+            "has_reviews": has_reviews,  # True if business has reviews
-            "total_reviews": result.get('total_reviews') or 0,  # Show 0 if unknown
+            "total_reviews": total_reviews,
-            "name": result.get('name'),
+            "name": name,
-            "address": result.get('address'),
+            "address": address,
-            "rating": result.get('rating'),
+            "rating": rating,
-            "success": result['success'],
+            "category": category,
            "success": result.get('success', True),
            "error": result.get('error')
        }
    except Exception as e:
        log.error(f"Error checking reviews: {e}")
        # If it's a session error, recycle the worker
        if worker:
            error_msg = str(e).lower()
            if 'invalid session' in error_msg or 'session' in error_msg:
                recycle_worker = True
        return {
            "has_reviews": False,
@@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest):
            "success": False,
            "error": str(e)
        }
    finally:
        # Release worker back to pool (or recycle if broken)
        if worker:
            await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
@@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID):
            job = await db.get_job(job_id)
            url = job['url']
            # Extract browser fingerprint from metadata if available
            browser_fingerprint = None
            metadata = job.get('metadata')
            if isinstance(metadata, str):
                try:
                    metadata = json.loads(metadata)
                except:
                    metadata = None
            if metadata and 'browser_fingerprint' in metadata:
                browser_fingerprint = metadata['browser_fingerprint']
                log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}")
            elif metadata and 'geolocation' in metadata:
                browser_fingerprint = {'geolocation': metadata['geolocation']}
                log.info(f"Using user geolocation only")
            # Broadcast job started via SSE
            await broadcast_job_update(job_id_str, "job_started", {
                "job_id": job_id_str,
@@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID):
            # Create log capture instance that we can access for real-time logs
            log_capture = LogCapture()
            # Track total reviews for incremental saves
            total_reviews_seen = [None]
            # Accumulate all reviews for incremental saves (flush_callback receives batches)
            all_reviews_collected = []
            # Progress callback to update job status with current/total counts AND logs
            def progress_callback(current_count: int, total_count: int):
                """Update job progress and logs from worker thread"""
                if total_count:
                    total_reviews_seen[0] = total_count
                async def update():
                    # Get current logs from the shared log_capture
                    current_logs = log_capture.get_logs()
@@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID):
                # Schedule the coroutine on the event loop
                asyncio.run_coroutine_threadsafe(update(), loop)
            # Flush callback to save reviews incrementally (crash recovery)
            # Note: flush_callback receives batches, so we accumulate them
            def flush_callback(reviews_batch: list):
                """Accumulate and save reviews to DB incrementally from worker thread"""
                # Extend our collection with the new batch
                all_reviews_collected.extend(reviews_batch)
                async def save():
                    await db.save_reviews_incremental(
                        job_id=job_id,
                        reviews=all_reviews_collected,  # Save ALL reviews so far
                        total_reviews=total_reviews_seen[0]
                    )
                # Schedule the coroutine on the event loop
                asyncio.run_coroutine_threadsafe(save(), loop)
            # Run scraping with progress callback and shared log capture
            # headless=False because Docker uses Xvfb virtual display
            result = await asyncio.to_thread(
@@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID):
                url=url,
                headless=False,
                progress_callback=progress_callback,
-                log_capture=log_capture
+                log_capture=log_capture,
                flush_callback=flush_callback,
                browser_fingerprint=browser_fingerprint  # Pass user's browser fingerprint
            )
            if result['success']:
-                # Save results to database (including scraper logs)
+                # Save results to database (including scraper logs and review topics)
                await db.save_job_result(
                    job_id=job_id,
                    reviews=result['reviews'],
                    scrape_time=result['time'],
                    total_reviews=result.get('total_reviews'),
-                    scrape_logs=result.get('logs')
+                    scrape_logs=result.get('logs'),
                    review_topics=result.get('review_topics')
                )
                log.info(
@@ -898,68 +1030,142 @@ async def run_scraping_job(job_id: UUID):
                    )
            else:
-                # Job failed - save logs for debugging
+                # Job failed - check if we have partial reviews saved
-                await db.update_job_status(
+                current_job = await db.get_job(job_id)
-                    job_id,
+                partial_count = current_job.get('reviews_count', 0) if current_job else 0
                    JobStatus.FAILED,
                    error_message=result.get('error', 'Unknown error'),
                    scrape_logs=result.get('logs')
                )
-                log.error(f"Failed job {job_id}: {result.get('error')}")
+                if partial_count > 0:
-
+                    # Mark as partial - we have some reviews saved
-                # Broadcast job failed via SSE
+                    await db.mark_job_partial(
-                await broadcast_job_update(job_id_str, "job_failed", {
+                        job_id,
-                    "job_id": job_id_str,
+                        error_message=result.get('error', 'Unknown error'),
-                    "status": "failed",
+                        scrape_logs=result.get('logs')
                    "error_message": result.get('error'),
                    "logs": result.get('logs', [])
                })
                # Send failure webhook if configured
                if job.get('webhook_url'):
                    webhook_manager = WebhookManager()
                    await webhook_manager.send_job_completed_webhook(
                        webhook_url=job['webhook_url'],
                        job_id=job_id,
                        status='failed',
                        error_message=result.get('error'),
                        secret=job.get('webhook_secret'),
                        db=db
                    )
                    log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}")
                    # Broadcast job partial via SSE
                    await broadcast_job_update(job_id_str, "job_partial", {
                        "job_id": job_id_str,
                        "status": "partial",
                        "reviews_count": partial_count,
                        "total_reviews": current_job.get('total_reviews'),
                        "error_message": result.get('error'),
                        "logs": result.get('logs', [])
                    })
                    # Send partial webhook if configured
                    if job.get('webhook_url'):
                        webhook_manager = WebhookManager()
                        await webhook_manager.send_job_completed_webhook(
                            webhook_url=job['webhook_url'],
                            job_id=job_id,
                            status='partial',
                            reviews_count=partial_count,
                            error_message=result.get('error'),
                            secret=job.get('webhook_secret'),
                            db=db
                        )
                else:
                    # No reviews saved - mark as failed
                    await db.update_job_status(
                        job_id,
                        JobStatus.FAILED,
                        error_message=result.get('error', 'Unknown error'),
                        scrape_logs=result.get('logs')
                    )
                    log.error(f"Failed job {job_id}: {result.get('error')}")
                    # Broadcast job failed via SSE
                    await broadcast_job_update(job_id_str, "job_failed", {
                        "job_id": job_id_str,
                        "status": "failed",
                        "error_message": result.get('error'),
                        "logs": result.get('logs', [])
                    })
                    # Send failure webhook if configured
                    if job.get('webhook_url'):
                        webhook_manager = WebhookManager()
                        await webhook_manager.send_job_completed_webhook(
                            webhook_url=job['webhook_url'],
                            job_id=job_id,
                            status='failed',
                            error_message=result.get('error'),
                            secret=job.get('webhook_secret'),
                            db=db
                        )
        except Exception as e:
            log.error(f"Error in scraping job {job_id}: {e}")
            import traceback
            traceback.print_exc()
-            await db.update_job_status(
+            # Check if we have partial reviews saved
-                job_id,
+            current_job = await db.get_job(job_id)
-                JobStatus.FAILED,
+            partial_count = current_job.get('reviews_count', 0) if current_job else 0
                error_message=str(e)
            )
-            # Broadcast job failed via SSE
+            if partial_count > 0:
-            await broadcast_job_update(job_id_str, "job_failed", {
+                # Mark as partial - we have some reviews saved
-                "job_id": job_id_str,
+                await db.mark_job_partial(
-                "status": "failed",
+                    job_id,
                "error_message": str(e),
                "logs": []
            })
            # Send failure webhook
            job = await db.get_job(job_id)
            if job and job.get('webhook_url'):
                webhook_manager = WebhookManager()
                await webhook_manager.send_job_completed_webhook(
                    webhook_url=job['webhook_url'],
                    job_id=job_id,
                    status='failed',
                    error_message=str(e),
-                    secret=job.get('webhook_secret'),
+                    scrape_logs=log_capture.get_logs() if log_capture else None
                    db=db
                )
                log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}")
                # Broadcast job partial via SSE
                await broadcast_job_update(job_id_str, "job_partial", {
                    "job_id": job_id_str,
                    "status": "partial",
                    "reviews_count": partial_count,
                    "total_reviews": current_job.get('total_reviews'),
                    "error_message": str(e),
                    "logs": log_capture.get_logs() if log_capture else []
                })
                # Send partial webhook
                if current_job and current_job.get('webhook_url'):
                    webhook_manager = WebhookManager()
                    await webhook_manager.send_job_completed_webhook(
                        webhook_url=current_job['webhook_url'],
                        job_id=job_id,
                        status='partial',
                        reviews_count=partial_count,
                        error_message=str(e),
                        secret=current_job.get('webhook_secret'),
                        db=db
                    )
            else:
                # No reviews saved - mark as failed
                await db.update_job_status(
                    job_id,
                    JobStatus.FAILED,
                    error_message=str(e)
                )
                # Broadcast job failed via SSE
                await broadcast_job_update(job_id_str, "job_failed", {
                    "job_id": job_id_str,
                    "status": "failed",
                    "error_message": str(e),
                    "logs": []
                })
                # Send failure webhook
                if current_job and current_job.get('webhook_url'):
                    webhook_manager = WebhookManager()
                    await webhook_manager.send_job_completed_webhook(
                        webhook_url=current_job['webhook_url'],
                        job_id=job_id,
                        status='failed',
                        error_message=str(e),
                        secret=current_job.get('webhook_secret'),
                        db=db
                    )
 if __name__ == "__main__":
    import uvicorn
--- a/docker-compose.production.yml
+++ b/docker-compose.production.yml
@@ -39,6 +39,8 @@ services:
      - CHROME_BIN=/usr/bin/chromium
    ports:
      - "8000:8000"
      - "5900:5900"   # VNC port (for VNC client)
      - "6080:6080"   # noVNC web interface (browser access)
    depends_on:
      db:
        condition: service_healthy
--- a/modules/database.py
+++ b/modules/database.py
@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    PARTIAL = "partial"  # Job crashed but has partial reviews saved
 class DatabaseManager:
@@ -69,6 +70,7 @@ class DatabaseManager:
                    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP,
                    updated_at TIMESTAMP,
                    reviews_count INTEGER,
                    total_reviews INTEGER,
@@ -79,7 +81,7 @@ class DatabaseManager:
                    metadata JSONB,
                    scrape_logs JSONB,
-                    CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
+                    CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
                );
            """)
@@ -88,6 +90,24 @@ class DatabaseManager:
                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
            """)
            # Add updated_at column if it doesn't exist (for incremental progress tracking)
            await conn.execute("""
                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
            """)
            # Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
            await conn.execute("""
                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
            """)
            # Update constraint to include 'partial' status (for existing databases)
            await conn.execute("""
                ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
            """)
            await conn.execute("""
                ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
            """)
            # Create indexes
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -187,13 +207,15 @@ class DatabaseManager:
                    created_at,
                    started_at,
                    completed_at,
                    updated_at,
                    reviews_count,
                    total_reviews,
                    reviews_data,
                    scrape_time,
                    error_message,
                    metadata,
-                    scrape_logs
+                    scrape_logs,
                    review_topics
                FROM jobs
                WHERE job_id = $1
            """, job_id)
@@ -203,22 +225,32 @@ class DatabaseManager:
            return dict(row)
-    async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
+    async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
        """
        Get reviews for a specific job.
        Args:
            job_id: Job UUID
            include_partial: If True, also return reviews for running and partial jobs
        Returns:
-            List of reviews or None if not found/not completed
+            List of reviews or None if not found/no reviews
        """
        async with self.pool.acquire() as conn:
-            reviews_data = await conn.fetchval("""
+            if include_partial:
-                SELECT reviews_data
+                # Return reviews for completed, running, or partial jobs
-                FROM jobs
+                reviews_data = await conn.fetchval("""
-                WHERE job_id = $1 AND status = 'completed'
+                    SELECT reviews_data
-            """, job_id)
+                    FROM jobs
                    WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
                """, job_id)
            else:
                # Only return reviews for completed jobs
                reviews_data = await conn.fetchval("""
                    SELECT reviews_data
                    FROM jobs
                    WHERE job_id = $1 AND status = 'completed'
                """, job_id)
            if not reviews_data:
                return None
@@ -278,7 +310,8 @@ class DatabaseManager:
        reviews: List[Dict[str, Any]],
        scrape_time: float,
        total_reviews: Optional[int] = None,
-        scrape_logs: Optional[List[Dict[str, Any]]] = None
+        scrape_logs: Optional[List[Dict[str, Any]]] = None,
        review_topics: Optional[List[Dict[str, Any]]] = None
    ):
        """
        Save scraping results to database.
@@ -289,8 +322,33 @@ class DatabaseManager:
            scrape_time: Time taken to scrape in seconds
            total_reviews: Total reviews available (from page counter)
            scrape_logs: List of log entries from the scraper
            review_topics: List of topic filter dictionaries with topic and count
        """
        async with self.pool.acquire() as conn:
            # If reviews list is empty, check if job already has reviews from incremental saves
            # This happens when flush_callback was used during scraping
            if not reviews:
                existing = await conn.fetchval(
                    "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
                )
                if existing and existing > 0:
                    # Job has reviews from incremental saves, don't overwrite reviews_data
                    await conn.execute("""
                        UPDATE jobs
                        SET
                            status = 'completed',
                            completed_at = NOW(),
                            total_reviews = COALESCE($2, total_reviews),
                            scrape_time = $3,
                            scrape_logs = $4::jsonb,
                            review_topics = $5::jsonb
                        WHERE job_id = $1
                    """, job_id, total_reviews, scrape_time,
                        json.dumps(scrape_logs) if scrape_logs else None,
                        json.dumps(review_topics) if review_topics else None)
                    log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
                    return
            await conn.execute("""
                UPDATE jobs
                SET
@@ -300,13 +358,70 @@ class DatabaseManager:
                    total_reviews = $3,
                    reviews_data = $4::jsonb,
                    scrape_time = $5,
-                    scrape_logs = $6::jsonb
+                    scrape_logs = $6::jsonb,
                    review_topics = $7::jsonb
                WHERE job_id = $1
            """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
-                json.dumps(scrape_logs) if scrape_logs else None)
+                json.dumps(scrape_logs) if scrape_logs else None,
                json.dumps(review_topics) if review_topics else None)
            log.info(f"Saved {len(reviews)} reviews for job {job_id}")
    async def save_reviews_incremental(
        self,
        job_id: UUID,
        reviews: List[Dict[str, Any]],
        total_reviews: Optional[int] = None
    ):
        """
        Save reviews incrementally during scraping.
        Called on each flush to preserve progress in case of crash.
        Args:
            job_id: Job UUID
            reviews: ALL reviews collected so far (not just new ones)
            total_reviews: Total reviews available (from page counter)
        """
        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE jobs
                SET
                    reviews_count = $2,
                    total_reviews = COALESCE($3, total_reviews),
                    reviews_data = $4::jsonb,
                    updated_at = NOW()
                WHERE job_id = $1 AND status = 'running'
            """, job_id, len(reviews), total_reviews, json.dumps(reviews))
            log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
    async def mark_job_partial(
        self,
        job_id: UUID,
        error_message: str,
        scrape_logs: Optional[List[Dict[str, Any]]] = None
    ):
        """
        Mark a job as partial (crashed but has some reviews saved).
        Args:
            job_id: Job UUID
            error_message: Error that caused the crash
            scrape_logs: Log entries from the scraper
        """
        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE jobs
                SET
                    status = 'partial',
                    completed_at = NOW(),
                    error_message = $2,
                    scrape_logs = $3::jsonb
                WHERE job_id = $1
            """, job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None)
            log.info(f"Marked job {job_id} as partial due to: {error_message}")
    async def list_jobs(
        self,
        status: Optional[JobStatus] = None,
@@ -337,7 +452,8 @@ class DatabaseManager:
                        total_reviews,
                        scrape_time,
                        error_message,
-                        metadata
+                        metadata,
                        review_topics
                    FROM jobs
                    WHERE status = $1
                    ORDER BY created_at DESC
@@ -355,7 +471,8 @@ class DatabaseManager:
                        total_reviews,
                        scrape_time,
                        error_message,
-                        metadata
+                        metadata,
                        review_topics
                    FROM jobs
                    ORDER BY created_at DESC
                    LIMIT $1 OFFSET $2
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
 def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
                   flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
-                   progress_callback=None) -> dict:
+                   progress_callback=None, validation_only: bool = False) -> dict:
    """
    Scrape Google Maps reviews.
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    # Track total reviews (persists across refreshes)
    total_reviews = [None]  # Use list for closure mutation
    # Store business info extracted from overview (before clicking reviews tab)
    business_info_cache = [None]
    # Hard refresh counter
    hard_refresh_count = [0]
    max_hard_refreshes = 3  # Max number of hard refreshes before giving up
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                pass
        return None
-    def setup_reviews_page(is_refresh=False):
+    def setup_reviews_page(is_refresh=False, validation_only_mode=False):
        """
        Setup the reviews page for scraping.
        Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
        Can be called after initial load or after a hard refresh.
        If validation_only_mode=True, returns early after extracting business info
        without clicking reviews tab or finding scroll container.
        """
        nonlocal total_reviews
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        # Navigate to URL (only on initial load or refresh)
        if not is_refresh:
            # Reset browser state by navigating to blank page first
            # This clears any stale state from pooled browser sessions
            try:
                driver.get("about:blank")
                time.sleep(0.1)
            except:
                pass
            log.info(f"🌐 Loading: {url[:80]}...")
        else:
            log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                            # Reload original URL after consent
                            log.info("  Reloading after consent...")
                            driver.get(url)
                            # Wait for page to settle after consent reload
                            time.sleep(1)
                            break
                except:
                    pass
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                break
            time.sleep(0.01)  # 10ms - responsive but low CPU
-        # Extract total review count BEFORE clicking reviews tab (it's on Overview)
+        # Extract business info and total review count BEFORE clicking reviews tab (on Overview)
        # This captures name, rating, category, address while they're visible
        # Only on first load (don't overwrite if we already have it)
-        if total_reviews[0] is None:
+        if total_reviews[0] is None or business_info_cache[0] is None:
            start = time.time()
            while time.time() - start < 5:
                try:
-                    count = driver.execute_script("""
+                    info = driver.execute_script("""
-                        var reviewSpans = document.querySelectorAll('span[role="img"]');
+                        var result = {
-                        for (var i = 0; i < reviewSpans.length; i++) {
+                            total_reviews: null,
-                            var label = reviewSpans[i].getAttribute('aria-label') || '';
+                            name: null,
-                            var match = label.match(/^([\\d,\\.]+)\\s*review/i);
+                            rating: null,
-                            if (match) {
+                            category: null,
-                                return parseInt(match[1].replace(/[,\\.]/g, ''));
+                            address: null
                        };
                        // Business name from h1
                        var h1 = document.querySelector('h1');
                        if (h1) result.name = h1.textContent.trim();
                        // Category - use jsaction attribute (robust selector)
                        var catBtn = document.querySelector('button[jsaction*="category"]');
                        if (catBtn) result.category = catBtn.textContent.trim();
                        // Rating and review count from span[role="img"] aria-labels
                        var spans = document.querySelectorAll('span[role="img"]');
                        for (var i = 0; i < spans.length; i++) {
                            var label = spans[i].getAttribute('aria-label') || '';
                            // Rating: "4.8 stars"
                            var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
                            if (rMatch && !result.rating) {
                                result.rating = parseFloat(rMatch[1].replace(',', '.'));
                            }
                            // Reviews: "79 reviews"
                            var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
                            if (revMatch && !result.total_reviews) {
                                result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
                            }
                        }
-                        return null;
+
                        // Address from button
                        var addrBtn = document.querySelector('button[data-item-id="address"]');
                        if (addrBtn) {
                            var label = addrBtn.getAttribute('aria-label');
                            if (label) result.address = label.replace(/^Address:\\s*/i, '');
                        }
                        return result;
                    """)
-                    if count:
+
-                        total_reviews[0] = count
+                    if info:
-                        log.info(f"📊 Total reviews on page: {count}")
+                        if info.get('total_reviews') and total_reviews[0] is None:
-                        break
+                            total_reviews[0] = info['total_reviews']
                            log.info(f"📊 Total reviews on page: {total_reviews[0]}")
                        if info.get('name') and business_info_cache[0] is None:
                            business_info_cache[0] = info
                            log.info(f"📍 Business: {info.get('name')}")
                        if total_reviews[0] and business_info_cache[0]:
                            break
                except:
                    pass
                time.sleep(0.1)
        # VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
        if validation_only_mode:
            log.info("📋 Validation mode: returning early (skipping reviews tab)")
            return ("validation_done", None)
        # Click reviews tab - poll until found
        review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
        start = time.time()
        tab_clicked = False
        tabs_logged = False
        while time.time() - start < 5:  # Max 5s for tabs
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
                # Log available tabs once for debugging
                if not tabs_logged and tabs:
                    tabs_logged = True
                    tab_texts = [t.text for t in tabs]
                    log.info(f"  Available tabs: {tab_texts}")
                for tab in tabs:
                    tab_text = tab.text.lower()
                    if any(kw in tab_text for kw in review_keywords):
                        if not is_refresh:
                            log.info(f"  Clicking reviews tab: '{tab.text}'")
                        # Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
                        if total_reviews[0] is None:
                            import re
                            # Try pattern with parentheses: "Reviews (79)"
                            match = re.search(r'\((\d+)\)', tab.text)
                            if match:
                                total_reviews[0] = int(match.group(1))
                                log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
                            else:
                                # Try pattern with newline: "Reviews\n79"
                                match = re.search(r'(\d+)', tab.text)
                                if match:
                                    total_reviews[0] = int(match.group(1))
                                    log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
                        tab.click()
                        tab_clicked = True
                        break
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        return scroll_container, stop_scrolling
-    # Initial page setup
+    # Helper to extract review topics from the reviews tab
-    scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
+    def extract_review_topics():
        """Extract review topic filters from radiogroup (robust selectors)."""
        try:
            topics = driver.execute_script("""
                var topics = [];
                // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
                var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
                if (!container) {
                    // Fallback: any radiogroup in the reviews area
                    container = document.querySelector('div[role="radiogroup"]');
                }
                if (container) {
                    var buttons = container.querySelectorAll('button[role="radio"]');
                    for (var btn of buttons) {
                        var label = btn.getAttribute('aria-label') || '';
                        // Parse "hair salon, mentioned in 4 reviews" format
                        var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
                        if (match) {
                            topics.push({
                                topic: match[1].trim(),
                                count: parseInt(match[2])
                            });
                        } else if (label && !label.toLowerCase().includes('all review')) {
                            // Fallback: try to extract from child spans
                            var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
                            var nameSpan = btn.querySelector('.uEubGf, span:first-child');
                            if (nameSpan) {
                                var name = nameSpan.textContent.trim();
                                var count = countSpan ? parseInt(countSpan.textContent) : 0;
                                if (name && name.toLowerCase() !== 'all') {
                                    topics.push({topic: name, count: count || 0});
                                }
                            }
                        }
                    }
                }
                return topics;
            """)
            return topics or []
        except:
            return []
    # Initial page setup (pass validation_only to skip unnecessary steps)
    scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
    # VALIDATION_ONLY MODE: Return early with just total_reviews and business info
    # setup_reviews_page returns ("validation_done", None) in this case
    if validation_only or scroll_container == "validation_done":
        # Use the business info captured from Overview (before clicking reviews tab)
        business_info = business_info_cache[0] or {}
        return {
            "reviews": [],
            "total": total_reviews[0] or 0,
            "scrolls": 0,
            "error": None,
            "validation_info": {
                "name": business_info.get("name"),
                "rating": business_info.get("rating"),
                "category": business_info.get("category"),
                "address": business_info.get("address"),
                "total_reviews": total_reviews[0]
            }
        }
    if not scroll_container:
        return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
    # Extract review topics after reviews tab is loaded (before scrolling begins)
    time.sleep(0.5)  # Brief wait for topic filters to render
    review_topics = extract_review_topics()
    if review_topics:
        log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
    def get_api_reviews():
        """Get reviews from intercepted API responses."""
        api_revs = []
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        "total_flushed": total_flushed[0],
        "checks": check_num,
        "url": url,
-        "logs": log.get_logs()
+        "logs": log.get_logs(),
        "review_topics": review_topics  # Topic filters with mention counts
    }
 def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False,
-                        log_capture: LogCapture = None):
+                        log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
                        browser_fingerprint: dict = None):
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        driver: Existing driver instance to reuse
        return_driver: If True, return driver in result
        log_capture: Optional LogCapture instance for real-time log access
        browser_fingerprint: Optional dict with user's browser fingerprint:
            - geolocation: {lat, lng}
            - userAgent: string
            - viewport: {width, height}
            - timezone: string (e.g., "Europe/Madrid")
            - language: string (e.g., "en-US")
            - platform: string (e.g., "MacIntel")
    Returns:
        Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
    log_capture = log_capture or LogCapture()
    try:
        # Extract fingerprint settings
        fp = browser_fingerprint or {}
        user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
        geolocation = fp.get('geolocation')
        timezone = fp.get('timezone')
        language = fp.get('language', 'en-US')
        # Create driver if not provided
        if not driver:
            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
-                agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+                agent=user_agent  # Use user's actual user agent
            )
-            driver.set_window_size(1200, 900)  # Proper viewport for Google Maps
+            # Set viewport to match user's screen
            driver.set_window_size(viewport['width'], viewport['height'])
-        # Set Chrome geolocation to US (Boston, MA) using CDP
+        # Apply browser fingerprint settings via CDP
        # This ensures Google Maps shows US results regardless of server location
        try:
-            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+            # Set timezone if provided
-                'latitude': 42.3601,
+            if timezone:
-                'longitude': -71.0589,
+                driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
-                'accuracy': 100
+                log_capture.info(f"Set timezone to {timezone}")
-            })
+
-            log_capture.info("Set geolocation to US (Boston, MA)")
+            # Set locale/language
            driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
            # Set geolocation
            if geolocation and 'lat' in geolocation and 'lng' in geolocation:
                driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': geolocation['lat'],
                    'longitude': geolocation['lng'],
                    'accuracy': 1000  # ~1km accuracy for IP-based location
                })
                log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
            else:
                # Default to US (Boston, MA) if no geolocation provided
                driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': 42.3601,
                    'longitude': -71.0589,
                    'accuracy': 100
                })
                log_capture.info("Set geolocation to US (Boston, MA) [default]")
            if fp:
                log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
        except Exception as e:
-            log_capture.warning(f"Could not set geolocation: {e}")
+            log_capture.warning(f"Could not apply fingerprint settings: {e}")
        # Add URL parameters for consistent results
        if 'hl=' not in url:
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        if 'gl=' not in url:
            url = f"{url}&gl=us"
-        # Create progress wrapper if callback provided
+        # Create combined flush callback for progress + external handler
-        flush_callback = None
+        external_flush = flush_callback  # Save external callback
-        if progress_callback:
+        internal_flush = None
        if progress_callback or external_flush:
            collected = [0]
-            def flush_with_progress(reviews_batch):
+            def combined_flush(reviews_batch):
-                collected[0] += len(reviews_batch)
+                collected[0] = len(reviews_batch)  # reviews_batch is ALL reviews so far
-                progress_callback(collected[0], None)
+                if progress_callback:
-            flush_callback = flush_with_progress
+                    progress_callback(collected[0], None)
                if external_flush:
                    external_flush(reviews_batch)  # Pass reviews to external handler
            internal_flush = combined_flush
        # Run the scraper with progress callback for real-time updates
        result = scrape_reviews(
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
-            flush_callback=flush_callback,
+            flush_callback=internal_flush,
            flush_batch_size=100,  # Smaller batches for more frequent progress
            log_capture=log_capture,
-            progress_callback=progress_callback  # Pass through for real-time log updates
+            progress_callback=progress_callback,  # Pass through for real-time log updates
            validation_only=validation_only  # Return early if just validating
        )
        elapsed = time.time() - start_time
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "time": elapsed,
            "success": True,
            "error": None,
-            "logs": result.get("logs", [])
+            "logs": result.get("logs", []),
            "review_topics": result.get("review_topics", [])  # Topic filters with mention counts
        }
        # Include validation_info if in validation_only mode
        if validation_only and "validation_info" in result:
            response["validation_info"] = result["validation_info"]
        if return_driver:
            response["driver"] = driver
        elif should_close_driver:
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        }
 def extract_about_info(driver, url: str = None) -> dict:
    """
    Extract About section info from Google Maps (Accessibility, Amenities, etc.).
    This function should be called AFTER reviews are scraped if about info is needed,
    as it navigates to a different tab.
    Args:
        driver: Selenium WebDriver instance (already on the business page)
        url: Optional URL to navigate to first (if not already on the page)
    Returns:
        dict with section names as keys, each containing list of features
    """
    try:
        # Navigate if URL provided
        if url:
            # Force English
            if 'hl=' not in url:
                separator = '&' if '?' in url else '?'
                url = f"{url}{separator}hl=en"
            if 'gl=' not in url:
                url = f"{url}&gl=us"
            driver.get(url)
            time.sleep(1)
        # Click About tab using robust selectors
        clicked = driver.execute_script("""
            // Try multiple selectors for about tab
            var selectors = [
                'button[aria-label*="About"]',
                'button[data-tab-index="2"]',
                'div[role="tablist"] button:nth-child(3)',
                'button[jsaction*="about"]'
            ];
            for (var sel of selectors) {
                var btn = document.querySelector(sel);
                if (btn && btn.textContent.toLowerCase().includes('about')) {
                    btn.click();
                    return true;
                }
            }
            // Fallback: find by text content
            var buttons = document.querySelectorAll('button');
            for (var btn of buttons) {
                if (btn.textContent.trim().toLowerCase() === 'about') {
                    btn.click();
                    return true;
                }
            }
            return false;
        """)
        if not clicked:
            return {}
        time.sleep(1.5)  # Wait for about tab to load
        # Extract about sections using aria-labels (robust)
        about = driver.execute_script("""
            var about = {};
            // Find the about region by aria-label or role
            var container = document.querySelector('div[role="region"][aria-label*="About"]');
            if (!container) {
                // Fallback: look for the scrollable area with sections
                container = document.querySelector('.m6QErb[aria-label*="About"]');
            }
            if (!container) {
                // Last resort: find sections by h2 headers
                container = document;
            }
            // Find all section headers (h2 elements)
            var sections = container.querySelectorAll('h2');
            for (var h2 of sections) {
                var sectionName = h2.textContent.trim();
                var items = [];
                // Find the ul list following this h2
                var parent = h2.closest('.iP2t7d, div');
                if (parent) {
                    var listItems = parent.querySelectorAll('li span[aria-label]');
                    for (var li of listItems) {
                        var label = li.getAttribute('aria-label');
                        if (label) {
                            // Parse "Has toilet" or "No wheelchair-accessible car park"
                            var hasFeature = !label.toLowerCase().startsWith('no ');
                            var featureName = label.replace(/^(Has |No )/i, '');
                            items.push({
                                feature: featureName,
                                available: hasFeature
                            });
                        }
                    }
                }
                if (sectionName && items.length > 0) {
                    about[sectionName] = items;
                }
            }
            return about;
        """)
        return about or {}
    except Exception as e:
        return {"error": str(e)}
 # Test function
 if __name__ == "__main__":
    from seleniumbase import Driver
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
        dict with: name, address, rating, total_reviews, success, error, time
    """
    from seleniumbase import Driver
    import logging
    log = logging.getLogger(__name__)
    start_time = time.time()
    driver_provided = driver is not None
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
        except:
            pass
-        # Clear state if reusing a pooled driver (ensures clean page load)
+        # Don't clear state - Google may serve different content based on session history
-        if driver_provided:
+        # The scraper doesn't reset state, so validation shouldn't either
-            try:
+
-                driver.delete_all_cookies()
+        # Force English interface for consistent parsing
-                driver.get("about:blank")
+        if 'hl=' not in url:
-            except:
+            separator = '&' if '?' in url else '?'
-                pass
+            url = f"{url}{separator}hl=en"
        if 'gl=' not in url:
            url = f"{url}&gl=us"
        # Navigate to URL
        driver.get(url)
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
        while time.time() - start < 5:
            if "consent.google" in driver.current_url:
                try:
-                    for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
+                    # Try multiple approaches to find and click accept button
-                        txt = btn.text.lower()
+                    clicked = False
-                        if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
+
-                            btn.click()
+                    # Method 1: Find by aria-label (most reliable for Google consent)
-                            driver.get(url)
+                    for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
-                            break
+                        btn.click()
-                except:
+                        clicked = True
                        break
                    # Method 2: Find by text content
                    if not clicked:
                        for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                            txt = btn.text.lower()
                            if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
                                btn.click()
                                clicked = True
                                break
                    if clicked:
                        time.sleep(0.5)  # Brief wait for consent to process
                        driver.get(url)  # Reload the target URL
                        time.sleep(0.5)  # Wait for reload
                except Exception as e:
                    pass
                break
            if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
                break
            time.sleep(0.01)  # 10ms - responsive but low CPU
        # Log current URL after consent handling
        try:
            current_url = driver.current_url
            log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
        except:
            pass
        # Wait for page to fully render before polling (tabs may load dynamically)
        time.sleep(2)
        # Poll for business info (same pattern as total_reviews extraction)
-        info = {"name": None, "rating": None, "total_reviews": None, "address": None}
+        # Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
        info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
        start = time.time()
-        while time.time() - start < 5:
+        debug_logged = False
        while time.time() - start < 10:
            try:
                info = driver.execute_script("""
-                    var result = {name: null, rating: null, total_reviews: null, address: null};
+                    var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
                    // Business name from h1
                    var h1 = document.querySelector('h1');
                    if (h1) result.name = h1.textContent.trim();
-                    // Rating and reviews from span[role="img"] aria-labels
+                    // Category - use jsaction attribute (robust, survives class changes)
-                    // Same pattern as scrape_reviews for consistency
+                    var catBtn = document.querySelector('button[jsaction*="category"]');
                    if (catBtn) result.category = catBtn.textContent.trim();
                    // Fallback: look for button after rating that's not a link
                    if (!result.category) {
                        var buttons = document.querySelectorAll('button');
                        for (var btn of buttons) {
                            var text = btn.textContent.trim();
                            // Categories are short words, no numbers, not navigation
                            if (text && text.length < 50 && !text.match(/^[0-9]/) &&
                                !text.match(/review|star|direction|save|share|photo/i)) {
                                // Check if it's near the rating area
                                var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
                                if (parent) {
                                    result.category = text;
                                    break;
                                }
                            }
                        }
                    }
                    // Rating from span[role="img"] aria-labels
                    var spans = document.querySelectorAll('span[role="img"]');
                    for (var i = 0; i < spans.length; i++) {
                        var label = spans[i].getAttribute('aria-label') || '';
-                        // Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
+                        // Collect debug info for all aria-labels
-                        var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
+                        if (label) {
                            result.debug.push('img-aria: ' + label);
                        }
                        // Rating: "4.8 stars" (English forced via hl=en)
                        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
                        if (rMatch && !result.rating) {
                            result.rating = parseFloat(rMatch[1].replace(',', '.'));
                        }
-                        // Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
+                        // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
-                        // Plus Spanish "reseña" which doesn't contain "review"
+                        // Try direct format first: "79 reviews"
-                        var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
+                        var revMatch = label.match(/^([\\d,]+)\\s*review/i);
                        if (revMatch && !result.total_reviews) {
-                            result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
+                            result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
                        }
                        // Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
                        if (!result.total_reviews) {
                            var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
                            if (combinedMatch) {
                                var countStr = combinedMatch[1].replace(/,/g, '');
                                if (countStr.includes('k')) {
                                    // Handle "9k+" format
                                    result.total_reviews = parseInt(countStr) * 1000;
                                } else {
                                    result.total_reviews = parseInt(countStr);
                                }
                            }
                        }
                    }
                    // Also collect tab button texts for debugging (include full text including numbers)
                    var tabs = document.querySelectorAll('button[role="tab"]');
                    for (var j = 0; j < tabs.length; j++) {
                        var tabText = tabs[j].textContent.trim();
                        result.debug.push('tab: ' + tabText);
                        // Also try to extract review count from tab text like "Reviews (79)"
                        if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
                            var tabMatch = tabText.match(/\\((\\d+)\\)/);
                            if (tabMatch) {
                                result.total_reviews = parseInt(tabMatch[1]);
                                result.debug.push('Found reviews in tab: ' + tabText);
                            }
                        }
                    }
                    // Also check ALL buttons for reviews count
                    var allButtons = document.querySelectorAll('button');
                    for (var b = 0; b < allButtons.length; b++) {
                        var btnText = allButtons[b].textContent || '';
                        if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
                            var numMatch = btnText.match(/\\((\\d+)\\)/);
                            if (numMatch && !result.total_reviews) {
                                result.total_reviews = parseInt(numMatch[1]);
                                result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
                            }
                        }
                    }
                    // Check if we're on search results vs place page
                    result.debug.push('title: ' + document.title);
                    result.debug.push('url: ' + window.location.href.substring(0, 80));
                    // Check for search results list
                    var searchResults = document.querySelectorAll('div[role="feed"] > div');
                    result.debug.push('search_results_count: ' + searchResults.length);
                    // Fallback: Get review count from Reviews tab button "Reviews (79)"
                    // Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
                    if (!result.total_reviews) {
                        var tabs = document.querySelectorAll('button[role="tab"]');
                        for (var tab of tabs) {
                            var text = tab.textContent.toLowerCase();
                            if (text.includes('review')) {
                                var match = tab.textContent.match(/\\((\\d+)\\)/);
                                if (match) {
                                    result.total_reviews = parseInt(match[1]);
                                    break;
                                }
                            }
                        }
                    }
                    // Fallback 2: Look for any button with "Reviews" and a number
                    if (!result.total_reviews) {
                        var buttons = document.querySelectorAll('button');
                        for (var btn of buttons) {
                            var text = btn.textContent;
                            if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
                                var numMatch = text.match(/\\((\\d+)\\)/);
                                if (numMatch) {
                                    result.total_reviews = parseInt(numMatch[1]);
                                    break;
                                }
                            }
                        }
                    }
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
                    var addrBtn = document.querySelector('button[data-item-id="address"]');
                    if (addrBtn) {
                        var label = addrBtn.getAttribute('aria-label');
-                        if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
+                        if (label) result.address = label.replace(/^Address:\\s*/i, '');
                    }
                    return result;
                """)
-                # Exit early if we have the essentials
+                # Exit early if we have the essentials (name found AND reviews count > 0)
-                if info.get("name") and info.get("total_reviews") is not None:
+                if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
                    break
                # Log debug info once after 3 seconds
                if not debug_logged and time.time() - start > 3:
                    debug_logged = True
                    debug_info = info.get("debug", [])
                    if debug_info:
                        log.info(f"🔍 Validation debug - URL: {url[:50]}...")
                        log.info(f"   Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
                        for d in debug_info[:10]:  # First 10 debug items
                            log.info(f"   {d}")
            except:
                pass
            time.sleep(0.1)  # 100ms between polls
        # Final debug log if still no reviews
        if not info.get("total_reviews"):
            debug_info = info.get("debug", [])
            log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
            if debug_info:
                log.warning(f"   Debug items: {debug_info[:10]}")
        return {
            "name": info.get("name"),
            "address": info.get("address"),
            "rating": info.get("rating"),
            "total_reviews": info.get("total_reviews"),
            "category": info.get("category"),
            "success": bool(info.get("name")),
            "error": None,
            "time": time.time() - start_time
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
            "address": None,
            "rating": None,
            "total_reviews": None,
            "category": None,
            "success": False,
            "error": str(e),
            "time": time.time() - start_time
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -27,6 +27,8 @@ interface SelectedJob {
  jobId: string;
  newCount?: number;
  previousJobId?: string;
  businessCategory?: string;
  reviewTopics?: { topic: string; count: number }[];
 }
 type ViewType = 'newScrape' | 'jobs' | 'reports';
@@ -106,6 +108,8 @@ export default function Home() {
          jobId: job.job_id,
          newCount: data.new_count,
          previousJobId: previousJob?.job_id,
          businessCategory: job.business_category || undefined,
          reviewTopics: job.review_topics || undefined,
        });
        setActiveView('reports');
      }
@@ -155,7 +159,7 @@ export default function Home() {
                Back to Reports
              </button>
            </div>
-            <ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} />
+            <ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} businessCategory={selectedJob.businessCategory} reviewTopics={selectedJob.reviewTopics} />
          </div>
        ) : (
          <div className="h-full overflow-y-auto p-6">
--- a/web/components/ReviewAnalytics.tsx
+++ b/web/components/ReviewAnalytics.tsx
@@ -22,14 +22,21 @@ interface ReviewWithNew extends Review {
  photo_urls?: string[] | null;
 }
 interface ReviewTopic {
  topic: string;
  count: number;
 }
 interface ReviewAnalyticsProps {
  reviews: ReviewWithNew[];
  businessName?: string;
  businessUrl?: string;
  newCount?: number;
  businessCategory?: string;
  reviewTopics?: ReviewTopic[];
 }
-export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount }: ReviewAnalyticsProps) {
+export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount, businessCategory, reviewTopics }: ReviewAnalyticsProps) {
  const [sorting, setSorting] = useState<SortingState>([{ id: 'date', desc: true }]); // Default: newest first
  const [columnFilters, setColumnFiltersState] = useState<ColumnFiltersState>([]);
  const [globalFilter, setGlobalFilter] = useState('');
@@ -476,9 +483,16 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
      {/* Header */}
      <div className="flex items-center justify-between">
        <div>
-          <h2 className="text-3xl font-bold text-gray-900">
+          <div className="flex items-center gap-3">
-            {businessName || 'Review Analytics'}
+            <h2 className="text-3xl font-bold text-gray-900">
-          </h2>
+              {businessName || 'Review Analytics'}
            </h2>
            {businessCategory && (
              <span className="px-3 py-1 bg-purple-100 text-purple-800 text-sm font-medium rounded-full border border-purple-300">
                {businessCategory}
              </span>
            )}
          </div>
          {businessUrl && (
            <a
              href={businessUrl}
@@ -821,6 +835,33 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
        </div>
      </div>
      {/* Review Topics - from Google Maps */}
      {reviewTopics && reviewTopics.length > 0 && (
        <div className="bg-white border-2 border-gray-300 rounded-xl p-5 shadow-md">
          <div className="flex items-center gap-2 mb-4">
            <MessageSquare className="w-6 h-6 text-indigo-600" />
            <h3 className="text-lg font-bold text-gray-900">What People Talk About</h3>
            <span className="text-sm text-gray-500">({reviewTopics.length} topics from Google)</span>
          </div>
          <div className="flex flex-wrap gap-2">
            {reviewTopics.slice(0, 15).map((topic, idx) => (
              <div
                key={idx}
                className="px-3 py-1.5 bg-gradient-to-r from-indigo-50 to-purple-50 border border-indigo-200 rounded-full flex items-center gap-2"
              >
                <span className="text-sm font-medium text-indigo-800">{topic.topic}</span>
                <span className="text-xs bg-indigo-200 text-indigo-900 px-1.5 py-0.5 rounded-full font-bold">
                  {topic.count}
                </span>
              </div>
            ))}
          </div>
          {reviewTopics.length > 15 && (
            <p className="text-sm text-gray-500 mt-3">+{reviewTopics.length - 15} more topics</p>
          )}
        </div>
      )}
      {/* Rating & Volume Timeline */}
      {timelineData.length > 0 && (
        <div className={`bg-white rounded-xl p-6 shadow-md transition-all ${
--- a/web/components/ScraperTest.tsx
+++ b/web/components/ScraperTest.tsx
@@ -15,7 +15,7 @@ interface Review {
 export interface JobStatus {
  job_id: string;
-  status: 'pending' | 'running' | 'completed' | 'failed';
+  status: 'pending' | 'running' | 'completed' | 'failed' | 'partial';
  url: string;
  created_at: string;
  started_at: string | null;
@@ -28,8 +28,11 @@ export interface JobStatus {
  // Business metadata for tracking and comparison
  business_name: string | null;
  business_address: string | null;
  business_category: string | null;
  rating_snapshot: number | null;
  total_reviews_snapshot: number | null;
  // Review topics extracted from Google Maps
  review_topics: { topic: string; count: number }[] | null;
 }
 interface ScraperTestProps {
@@ -56,7 +59,64 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
  const [businessRating, setBusinessRating] = useState<number | null>(null);
  const [businessImage, setBusinessImage] = useState<string | null>(null);
  const [businessCategory, setBusinessCategory] = useState<string | null>(null);
  const [userFingerprint, setUserFingerprint] = useState<{
    geolocation?: {lat: number, lng: number},
    userAgent?: string,
    viewport?: {width: number, height: number},
    timezone?: string,
    language?: string,
    platform?: string
  }>({});
  const debounceRef = useRef<NodeJS.Timeout | null>(null);
  // Collect browser fingerprint on mount (no permissions needed)
  useEffect(() => {
    const collectFingerprint = async () => {
      const fingerprint: typeof userFingerprint = {};
      // User agent
      fingerprint.userAgent = navigator.userAgent;
      // Screen/viewport size
      fingerprint.viewport = {
        width: window.screen.width,
        height: window.screen.height
      };
      // Timezone
      fingerprint.timezone = Intl.DateTimeFormat().resolvedOptions().timeZone;
      // Language
      fingerprint.language = navigator.language;
      // Platform
      fingerprint.platform = navigator.platform;
      // Get approximate location from IP (no permission needed)
      try {
        const response = await fetch('https://ipapi.co/json/', {
          signal: AbortSignal.timeout(3000)
        });
        if (response.ok) {
          const data = await response.json();
          if (data.latitude && data.longitude) {
            fingerprint.geolocation = {
              lat: data.latitude,
              lng: data.longitude
            };
            console.log('IP location:', data.city, data.country_name);
          }
        }
      } catch (error) {
        console.log('IP geolocation not available');
      }
      setUserFingerprint(fingerprint);
      console.log('Browser fingerprint:', fingerprint);
    };
    collectFingerprint();
  }, []);
  const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
  const abortControllerRef = useRef<AbortController | null>(null);
@@ -121,18 +181,23 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
    setBusinessCategory(null);
    setError('');
-    // Create new abort controller with 30 second timeout
+    // Create new abort controller with 60 second timeout (validation can be slow)
    const controller = new AbortController();
    abortControllerRef.current = controller;
-    const timeoutId = setTimeout(() => controller.abort(), 30000);
+    const timeoutId = setTimeout(() => controller.abort(), 60000);
    try {
-      const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}`;
+      // Force English with hl=en parameter
      const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}&hl=en`;
      const response = await fetch('/api/check-reviews', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ url }),
+        body: JSON.stringify({
          url,
          geolocation: userFingerprint.geolocation,
          browser_fingerprint: userFingerprint  // Pass full fingerprint
        }),
        signal: controller.signal,
      });
@@ -157,21 +222,30 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
    } catch (err) {
      clearTimeout(timeoutId);
-      // Ignore AbortError (happens when user starts a new validation)
+      // Check if this is a timeout abort vs user-initiated abort
      if (err instanceof Error && err.name === 'AbortError') {
-        console.log('Validation cancelled (new validation started)');
+        // Check if it was a timeout (controller still matches) or user started new search
-        return;
+        if (abortControllerRef.current === controller) {
          // Timeout - show error
          console.error('Validation timed out');
          setError('Validation timed out. Please try again.');
          setHasReviews(false);
          setAvailableReviewCount(0);
        } else {
          // User started a new search - just return silently
          console.log('Validation cancelled (new validation started)');
          return;
        }
      } else {
        console.error('Error getting business info:', err);
        // Error occurred
        setHasReviews(false);
        setAvailableReviewCount(0);
      }
      console.error('Error getting business info:', err);
      // Error occurred
      setHasReviews(false);
      setAvailableReviewCount(0);
    } finally {
-      // Only clear loading state if this controller wasn't aborted
+      clearTimeout(timeoutId);
-      if (!controller.signal.aborted) {
+      // Always clear loading state (even on timeout)
-        setIsCheckingReviews(false);
+      setIsCheckingReviews(false);
      }
    }
  };
@@ -192,8 +266,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
          return newMap;
        });
-        // Stop polling if job is done
+        // Stop polling if job is done (completed, failed, or partial)
-        if (data.status === 'completed' || data.status === 'failed') {
+        if (data.status === 'completed' || data.status === 'failed' || data.status === 'partial') {
          const interval = pollingIntervals.current.get(jobId);
          if (interval) {
            clearInterval(interval);
@@ -244,8 +318,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
    setIsSubmitting(true);
    setShowConfirmModal(false);
-    // Use the search query to create a Google Maps search URL
+    // Use the search query to create a Google Maps search URL (force English)
-    const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}`;
+    const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}&hl=en`;
    try {
      const response = await fetch('/api/scrape', {
@@ -257,6 +331,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
          business_address: businessAddress,
          rating_snapshot: businessRating,
          total_reviews_snapshot: availableReviewCount,
          geolocation: userFingerprint.geolocation,
          browser_fingerprint: userFingerprint,  // Pass full fingerprint
        }),
      });
@@ -283,8 +359,10 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
          error_message: null,
          business_name: businessName,
          business_address: businessAddress,
          business_category: businessCategory,
          rating_snapshot: businessRating,
          total_reviews_snapshot: availableReviewCount,
          review_topics: null,  // Will be populated when job completes
        });
        return newMap;
      });
@@ -305,6 +383,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
      case 'completed': return 'text-green-700';
      case 'running': return 'text-blue-700';
      case 'failed': return 'text-red-700';
      case 'partial': return 'text-orange-700';
      default: return 'text-gray-800';
    }
  };
@@ -325,6 +404,12 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
            <path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clipRule="evenodd" />
          </svg>
        );
      case 'partial':
        return (
          <svg className="w-5 h-5 text-orange-500" fill="currentColor" viewBox="0 0 20 20">
            <path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
          </svg>
        );
      default:
        return (
          <svg className="w-5 h-5 text-gray-400" fill="currentColor" viewBox="0 0 20 20">
@@ -776,8 +861,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
                )}
              </div>
-              {/* Action Buttons - Show when completed and has reviews */}
+              {/* Action Buttons - Show when completed, partial, or running with reviews */}
-              {job.status === 'completed' && job.reviews_count && job.reviews_count > 0 && (
+              {(job.status === 'completed' || job.status === 'partial' || (job.status === 'running' && job.reviews_count && job.reviews_count > 0)) && job.reviews_count && job.reviews_count > 0 && (
                <div className="flex gap-3">
                  <button
                    onClick={async () => {
@@ -818,7 +903,13 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
                      }
                    }}
                    disabled={isLoadingReviews}
-                    className="flex-1 py-4 bg-gradient-to-r from-blue-600 to-indigo-700 text-white rounded-xl font-bold hover:from-blue-700 hover:to-indigo-800 transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 border-blue-500"
+                    className={`flex-1 py-4 text-white rounded-xl font-bold transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 ${
                      job.status === 'partial'
                        ? 'bg-gradient-to-r from-orange-500 to-amber-600 hover:from-orange-600 hover:to-amber-700 border-orange-400'
                        : job.status === 'running'
                        ? 'bg-gradient-to-r from-blue-500 to-cyan-600 hover:from-blue-600 hover:to-cyan-700 border-blue-400'
                        : 'bg-gradient-to-r from-blue-600 to-indigo-700 hover:from-blue-700 hover:to-indigo-800 border-blue-500'
                    }`}
                  >
                    {isLoadingReviews ? (
                      <>
@@ -830,7 +921,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
                        <svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                          <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
                        </svg>
-                        📊 Open Analytics Dashboard
+                        📊 {job.status === 'running' ? 'Preview Analytics' : job.status === 'partial' ? 'View Partial Data' : 'Open Analytics Dashboard'}
                      </>
                    )}
                  </button>
@@ -845,7 +936,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
                          const url = URL.createObjectURL(blob);
                          const a = document.createElement('a');
                          a.href = url;
-                          a.download = `reviews-${job.job_id}.json`;
+                          a.download = `reviews-${job.job_id}${job.status === 'partial' ? '-partial' : ''}.json`;
                          a.click();
                        }
                      } catch (err) {
@@ -862,6 +953,24 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
                </div>
              )}
              {/* Partial Job Warning */}
              {job.status === 'partial' && (
                <div className="mt-4 p-4 bg-orange-100 border-2 border-orange-300 rounded-lg">
                  <div className="flex items-start gap-2">
                    <svg className="w-5 h-5 text-orange-700 flex-shrink-0 mt-0.5" fill="currentColor" viewBox="0 0 20 20">
                      <path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
                    </svg>
                    <div>
                      <p className="font-bold text-orange-900">Partial Results</p>
                      <p className="text-sm text-orange-800 mt-1">
                        This job was interrupted but {job.reviews_count} reviews were saved.
                        {job.error_message && <span className="block mt-1 text-orange-700">Reason: {job.error_message}</span>}
                      </p>
                    </div>
                  </div>
                </div>
              )}
              {/* Error Message */}
              {job.status === 'failed' && job.error_message && (
                <div className="mt-4 p-4 bg-red-100 border-2 border-red-300 rounded-lg">
--- a/web/lib/analytics.ts
+++ b/web/lib/analytics.ts
@@ -66,7 +66,9 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
  // Populate minDate/maxDate/centerDate on reviews for display
  reviews.forEach(r => {
    if (!r.minDate || !r.maxDate || !r.centerDate) {
-      const range = parseDateTextToRange(r.date_text);
+      // Handle both date_text and timestamp field names
      const dateText = r.date_text || (r as any).timestamp || '';
      const range = parseDateTextToRange(dateText);
      r.minDate = range.minDate;
      r.maxDate = range.maxDate;
      // Calculate centerDate as midpoint
@@ -96,8 +98,8 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
  // Recent reviews (last 30 days - simplified check)
  const recentReviews = reviews.filter(r => {
-    const text = r.date_text.toLowerCase();
+    const text = (r.date_text || (r as any).timestamp || '').toLowerCase();
-    return text.includes('day') || text.includes('week') || text.includes('hour');
+    return text.includes('day') || text.includes('week') || text.includes('hour') || text.includes('minute') || text.includes('second');
  }).length;
  // Rating distribution
@@ -278,6 +280,14 @@ function extractNumber(text: string): number {
 */
 export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } {
  const now = new Date();
  // Handle undefined/null dateText
  if (!dateText) {
    // Return a default range (assume recent - within last month)
    const daysAgo = (days: number) => new Date(now.getTime() - days * 24 * 60 * 60 * 1000);
    return { minDate: daysAgo(30), maxDate: now };
  }
  const text = dateText.toLowerCase();
  // Remove "Edited " prefix if present
@@ -396,7 +406,8 @@ export function filterReviewsByDateRange(reviews: Review[], range: DateRange): R
  // Filter range: [filterStart, filterEnd]
  // Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart
  return reviews.filter(r => {
-    const { minDate, maxDate } = parseDateTextToRange(r.date_text);
+    const dateText = r.date_text || (r as any).timestamp || '';
    const { minDate, maxDate } = parseDateTextToRange(dateText);
    return minDate <= filterEnd && maxDate >= filterStart;
  });
 }
@@ -405,7 +416,8 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
  if (!fromDate && !toDate) return reviews;
  return reviews.filter(r => {
-    const reviewDate = parseDateText(r.date_text);
+    const dateText = r.date_text || (r as any).timestamp || '';
    const reviewDate = parseDateText(dateText);
    // If only fromDate is set, filter reviews >= fromDate
    if (fromDate && !toDate) {
@@ -429,7 +441,7 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
 export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
  // Sort reviews by date (newest first)
  const sortedReviews = [...reviews]
-    .map(r => ({ ...r, parsedDate: parseDateText(r.date_text) }))
+    .map(r => ({ ...r, parsedDate: parseDateText(r.date_text || (r as any).timestamp || '') }))
    .sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime());
  // Group by month