diff --git a/Dockerfile b/Dockerfile index c3bbf89..ee8b8fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,6 +39,13 @@ RUN apt-get update \ && apt-get install -y chromium chromium-driver \ && rm -rf /var/lib/apt/lists/* +# Install VNC server and noVNC (browser-based VNC viewer) +RUN apt-get update && apt-get install -y \ + x11vnc \ + novnc \ + websockify \ + && rm -rf /var/lib/apt/lists/* + # Set working directory WORKDIR /app @@ -51,7 +58,7 @@ COPY modules/ ./modules/ COPY api_server_production.py . COPY config.yaml . -# Create startup script for Xvfb + API server +# Create startup script for Xvfb + VNC + API server RUN echo '#!/bin/bash\n\ # Start Xvfb (virtual display) in background\n\ Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\ @@ -60,6 +67,15 @@ export DISPLAY=:99\n\ # Wait for Xvfb to start\n\ sleep 2\n\ \n\ +# Start VNC server (no password for local dev, binds to all interfaces)\n\ +x11vnc -display :99 -forever -shared -rfbport 5900 -nopw -bg\n\ +\n\ +# Start noVNC websocket proxy (browser access at http://localhost:6080/vnc.html)\n\ +websockify --web=/usr/share/novnc/ 6080 localhost:5900 &\n\ +\n\ +echo "VNC server running on port 5900"\n\ +echo "noVNC web interface at http://localhost:6080/vnc.html"\n\ +\n\ # Start API server\n\ exec python api_server_production.py\n\ ' > /app/start.sh && chmod +x /app/start.sh @@ -71,8 +87,8 @@ RUN useradd -m -u 1000 scraper && \ USER scraper -# Expose port -EXPOSE 8000 +# Expose ports: API (8000), VNC (5900), noVNC web (6080) +EXPOSE 8000 5900 6080 # Environment variables for Chromium in container ENV DISPLAY=:99 diff --git a/api_server_production.py b/api_server_production.py index 133ae8b..62b96e8 100644 --- a/api_server_production.py +++ b/api_server_production.py @@ -133,12 +133,36 @@ app.add_middleware( # ==================== Request/Response Models ==================== +class GeolocationModel(BaseModel): + """Geolocation coordinates""" + lat: float = Field(..., description="Latitude") + lng: float = Field(..., description="Longitude") + + +class ViewportModel(BaseModel): + """Browser viewport size""" + width: int = Field(..., description="Viewport width") + height: int = Field(..., description="Viewport height") + + +class BrowserFingerprintModel(BaseModel): + """Browser fingerprint to replicate user's browser""" + geolocation: Optional[GeolocationModel] = None + userAgent: Optional[str] = Field(None, description="User agent string") + viewport: Optional[ViewportModel] = Field(None, description="Screen resolution") + timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)") + language: Optional[str] = Field(None, description="Browser language (e.g., en-US)") + platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)") + + class ScrapeRequest(BaseModel): """Request model for starting a scrape job""" url: HttpUrl = Field(..., description="Google Maps URL to scrape") webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications") webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature") metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata") + geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome") + browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint") class JobResponse(BaseModel): @@ -149,6 +173,7 @@ class JobResponse(BaseModel): created_at: str started_at: Optional[str] = None completed_at: Optional[str] = None + updated_at: Optional[str] = None # Last update time for progress tracking reviews_count: Optional[int] = None total_reviews: Optional[int] = None # Total reviews available for this place scrape_time: Optional[float] = None @@ -157,6 +182,8 @@ class JobResponse(BaseModel): # Business metadata business_name: Optional[str] = None business_address: Optional[str] = None + business_category: Optional[str] = None # Category (e.g., "Barber shop") + review_topics: Optional[List[Dict[str, Any]]] = None # Topic filters with mention counts class ReviewsResponse(BaseModel): @@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest): raise HTTPException(status_code=500, detail="Database not initialized") try: + # Merge browser fingerprint into metadata if provided + metadata = request.metadata or {} + if request.browser_fingerprint: + fp = request.browser_fingerprint + metadata['browser_fingerprint'] = { + "userAgent": fp.userAgent, + "timezone": fp.timezone, + "language": fp.language, + "platform": fp.platform, + } + if fp.viewport: + metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height} + if fp.geolocation: + metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng} + elif request.geolocation: + metadata['geolocation'] = { + 'lat': request.geolocation.lat, + 'lng': request.geolocation.lng + } + # Create job in database job_id = await db.create_job( url=str(request.url), webhook_url=str(request.webhook_url) if request.webhook_url else None, webhook_secret=request.webhook_secret, - metadata=request.metadata + metadata=metadata ) # Start scraping job in background @@ -240,6 +287,25 @@ async def get_job(job_id: UUID): if not job: raise HTTPException(status_code=404, detail="Job not found") + # Parse review_topics if it's a string (JSONB might be returned as string) + review_topics = job.get('review_topics') + if isinstance(review_topics, str): + try: + review_topics = json.loads(review_topics) + except: + review_topics = None + + # Extract business info from metadata if available + metadata = job.get('metadata') + if isinstance(metadata, str): + try: + metadata = json.loads(metadata) + except: + metadata = None + + business_name = metadata.get('business_name') if metadata else None + business_category = metadata.get('business_category') if metadata else None + return JobResponse( job_id=str(job['job_id']), status=job['status'], @@ -247,11 +313,15 @@ async def get_job(job_id: UUID): created_at=job['created_at'].isoformat(), started_at=job['started_at'].isoformat() if job['started_at'] else None, completed_at=job['completed_at'].isoformat() if job['completed_at'] else None, + updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None, reviews_count=job['reviews_count'], total_reviews=job.get('total_reviews'), scrape_time=job['scrape_time'], error_message=job['error_message'], - webhook_url=job.get('webhook_url') + webhook_url=job.get('webhook_url'), + business_name=business_name, + business_category=business_category, + review_topics=review_topics ) @@ -541,25 +611,32 @@ async def stream_all_jobs(): @app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews") async def get_job_reviews(job_id: UUID): """ - Get the actual reviews data for a completed job. + Get reviews data for a job. - Returns 404 if job not found or not completed yet. + Returns reviews for completed, partial, or running jobs (if reviews have been collected). + Returns 404 if job not found or no reviews available yet. """ if not db: raise HTTPException(status_code=500, detail="Database not initialized") - reviews = await db.get_job_reviews(job_id) + # Get reviews (includes completed, running, and partial jobs) + reviews = await db.get_job_reviews(job_id, include_partial=True) if reviews is None: job = await db.get_job(job_id) if not job: raise HTTPException(status_code=404, detail="Job not found") - elif job['status'] != 'completed': + elif job['status'] == 'pending': raise HTTPException( status_code=400, - detail=f"Job not completed yet (current status: {job['status']})" + detail="Job has not started yet" + ) + elif job['status'] == 'failed': + raise HTTPException( + status_code=400, + detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}" ) else: - raise HTTPException(status_code=404, detail="Reviews data not available") + raise HTTPException(status_code=404, detail="No reviews data available yet") return ReviewsResponse( job_id=str(job_id), @@ -603,6 +680,15 @@ async def list_jobs( business_name = metadata.get('business_name') if metadata else None business_address = metadata.get('business_address') if metadata else None + business_category = metadata.get('business_category') if metadata else None + + # Parse review_topics if it's a string + review_topics = job.get('review_topics') + if isinstance(review_topics, str): + try: + review_topics = json.loads(review_topics) + except: + review_topics = None result.append(JobResponse( job_id=str(job['job_id']), @@ -615,7 +701,9 @@ async def list_jobs( scrape_time=job.get('scrape_time'), error_message=job.get('error_message'), business_name=business_name, - business_address=business_address + business_address=business_address, + business_category=business_category, + review_topics=review_topics )) return result @@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest): Get business card information from Google Maps. Returns business name, address, rating, and review count. - Uses pre-warmed Chrome worker from pool for instant response. + Creates a fresh Chrome instance for reliable results (same as full scraper). This is used to show the business confirmation card in the UI. """ - worker = None - recycle_worker = False - try: url = str(request.url) - # Get pre-warmed worker from validation pool - worker = await asyncio.to_thread(get_validation_worker, timeout=10) + # Use the SAME scraper algorithm with validation_only=True for early return + # Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state + # Pooled browsers can have cookies/state that cause Google to render pages differently - if worker: - log.info(f"Using worker {worker.worker_id} for business card extraction") - # Use the pooled worker (don't close it) - result = await asyncio.to_thread( - get_business_card_info, - url=url, - driver=worker.driver, - return_driver=True - ) - - # Check if the result indicates a session error - if not result['success'] and result.get('error'): - error_msg = result.get('error', '').lower() - if 'invalid session' in error_msg or 'session' in error_msg: - log.warning(f"Worker {worker.worker_id} has invalid session, will recycle") - recycle_worker = True + # Build fingerprint dict from request + fingerprint = None + if request.browser_fingerprint: + fp = request.browser_fingerprint + fingerprint = { + "userAgent": fp.userAgent, + "timezone": fp.timezone, + "language": fp.language, + "platform": fp.platform, + } + if fp.viewport: + fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height} + if fp.geolocation: + fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng} + log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}") + elif request.geolocation: + fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}} + log.info(f"Creating Chrome with geolocation only") else: - # Fallback: create temporary worker - log.warning("No pooled worker available, creating temporary instance") - result = await asyncio.to_thread( - get_business_card_info, - url=url - ) + log.info(f"Creating Chrome with default settings") - # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews - # Let the actual scraper determine if reviews exist - has_business = bool(result.get('name') and result.get('rating')) + result = await asyncio.to_thread( + fast_scrape_reviews, + url=url, + headless=False, # Use Xvfb display + validation_only=True, # Return early after getting total_reviews + browser_fingerprint=fingerprint # Pass user's browser fingerprint + ) + + # Extract validation info from the result + validation_info = result.get('validation_info', {}) + total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0 + name = validation_info.get('name') + rating = validation_info.get('rating') + category = validation_info.get('category') + address = validation_info.get('address') + + # Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0) + has_reviews = bool(name and total_reviews > 0) return { - "has_reviews": has_business, # Boolean: true if business exists - "total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown - "name": result.get('name'), - "address": result.get('address'), - "rating": result.get('rating'), - "success": result['success'], + "has_reviews": has_reviews, # True if business has reviews + "total_reviews": total_reviews, + "name": name, + "address": address, + "rating": rating, + "category": category, + "success": result.get('success', True), "error": result.get('error') } except Exception as e: log.error(f"Error checking reviews: {e}") - # If it's a session error, recycle the worker - if worker: - error_msg = str(e).lower() - if 'invalid session' in error_msg or 'session' in error_msg: - recycle_worker = True return { "has_reviews": False, @@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest): "success": False, "error": str(e) } - finally: - # Release worker back to pool (or recycle if broken) - if worker: - await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker) @app.get("/stats", response_model=StatsResponse, summary="Get Statistics") @@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID): job = await db.get_job(job_id) url = job['url'] + # Extract browser fingerprint from metadata if available + browser_fingerprint = None + metadata = job.get('metadata') + if isinstance(metadata, str): + try: + metadata = json.loads(metadata) + except: + metadata = None + if metadata and 'browser_fingerprint' in metadata: + browser_fingerprint = metadata['browser_fingerprint'] + log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}") + elif metadata and 'geolocation' in metadata: + browser_fingerprint = {'geolocation': metadata['geolocation']} + log.info(f"Using user geolocation only") + # Broadcast job started via SSE await broadcast_job_update(job_id_str, "job_started", { "job_id": job_id_str, @@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID): # Create log capture instance that we can access for real-time logs log_capture = LogCapture() + # Track total reviews for incremental saves + total_reviews_seen = [None] + # Accumulate all reviews for incremental saves (flush_callback receives batches) + all_reviews_collected = [] + # Progress callback to update job status with current/total counts AND logs def progress_callback(current_count: int, total_count: int): """Update job progress and logs from worker thread""" + if total_count: + total_reviews_seen[0] = total_count + async def update(): # Get current logs from the shared log_capture current_logs = log_capture.get_logs() @@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID): # Schedule the coroutine on the event loop asyncio.run_coroutine_threadsafe(update(), loop) + # Flush callback to save reviews incrementally (crash recovery) + # Note: flush_callback receives batches, so we accumulate them + def flush_callback(reviews_batch: list): + """Accumulate and save reviews to DB incrementally from worker thread""" + # Extend our collection with the new batch + all_reviews_collected.extend(reviews_batch) + + async def save(): + await db.save_reviews_incremental( + job_id=job_id, + reviews=all_reviews_collected, # Save ALL reviews so far + total_reviews=total_reviews_seen[0] + ) + # Schedule the coroutine on the event loop + asyncio.run_coroutine_threadsafe(save(), loop) + # Run scraping with progress callback and shared log capture # headless=False because Docker uses Xvfb virtual display result = await asyncio.to_thread( @@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID): url=url, headless=False, progress_callback=progress_callback, - log_capture=log_capture + log_capture=log_capture, + flush_callback=flush_callback, + browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint ) if result['success']: - # Save results to database (including scraper logs) + # Save results to database (including scraper logs and review topics) await db.save_job_result( job_id=job_id, reviews=result['reviews'], scrape_time=result['time'], total_reviews=result.get('total_reviews'), - scrape_logs=result.get('logs') + scrape_logs=result.get('logs'), + review_topics=result.get('review_topics') ) log.info( @@ -898,68 +1030,142 @@ async def run_scraping_job(job_id: UUID): ) else: - # Job failed - save logs for debugging - await db.update_job_status( - job_id, - JobStatus.FAILED, - error_message=result.get('error', 'Unknown error'), - scrape_logs=result.get('logs') - ) + # Job failed - check if we have partial reviews saved + current_job = await db.get_job(job_id) + partial_count = current_job.get('reviews_count', 0) if current_job else 0 - log.error(f"Failed job {job_id}: {result.get('error')}") - - # Broadcast job failed via SSE - await broadcast_job_update(job_id_str, "job_failed", { - "job_id": job_id_str, - "status": "failed", - "error_message": result.get('error'), - "logs": result.get('logs', []) - }) - - # Send failure webhook if configured - if job.get('webhook_url'): - webhook_manager = WebhookManager() - await webhook_manager.send_job_completed_webhook( - webhook_url=job['webhook_url'], - job_id=job_id, - status='failed', - error_message=result.get('error'), - secret=job.get('webhook_secret'), - db=db + if partial_count > 0: + # Mark as partial - we have some reviews saved + await db.mark_job_partial( + job_id, + error_message=result.get('error', 'Unknown error'), + scrape_logs=result.get('logs') ) + log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}") + + # Broadcast job partial via SSE + await broadcast_job_update(job_id_str, "job_partial", { + "job_id": job_id_str, + "status": "partial", + "reviews_count": partial_count, + "total_reviews": current_job.get('total_reviews'), + "error_message": result.get('error'), + "logs": result.get('logs', []) + }) + + # Send partial webhook if configured + if job.get('webhook_url'): + webhook_manager = WebhookManager() + await webhook_manager.send_job_completed_webhook( + webhook_url=job['webhook_url'], + job_id=job_id, + status='partial', + reviews_count=partial_count, + error_message=result.get('error'), + secret=job.get('webhook_secret'), + db=db + ) + else: + # No reviews saved - mark as failed + await db.update_job_status( + job_id, + JobStatus.FAILED, + error_message=result.get('error', 'Unknown error'), + scrape_logs=result.get('logs') + ) + + log.error(f"Failed job {job_id}: {result.get('error')}") + + # Broadcast job failed via SSE + await broadcast_job_update(job_id_str, "job_failed", { + "job_id": job_id_str, + "status": "failed", + "error_message": result.get('error'), + "logs": result.get('logs', []) + }) + + # Send failure webhook if configured + if job.get('webhook_url'): + webhook_manager = WebhookManager() + await webhook_manager.send_job_completed_webhook( + webhook_url=job['webhook_url'], + job_id=job_id, + status='failed', + error_message=result.get('error'), + secret=job.get('webhook_secret'), + db=db + ) + except Exception as e: log.error(f"Error in scraping job {job_id}: {e}") import traceback traceback.print_exc() - await db.update_job_status( - job_id, - JobStatus.FAILED, - error_message=str(e) - ) + # Check if we have partial reviews saved + current_job = await db.get_job(job_id) + partial_count = current_job.get('reviews_count', 0) if current_job else 0 - # Broadcast job failed via SSE - await broadcast_job_update(job_id_str, "job_failed", { - "job_id": job_id_str, - "status": "failed", - "error_message": str(e), - "logs": [] - }) - - # Send failure webhook - job = await db.get_job(job_id) - if job and job.get('webhook_url'): - webhook_manager = WebhookManager() - await webhook_manager.send_job_completed_webhook( - webhook_url=job['webhook_url'], - job_id=job_id, - status='failed', + if partial_count > 0: + # Mark as partial - we have some reviews saved + await db.mark_job_partial( + job_id, error_message=str(e), - secret=job.get('webhook_secret'), - db=db + scrape_logs=log_capture.get_logs() if log_capture else None ) + log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}") + + # Broadcast job partial via SSE + await broadcast_job_update(job_id_str, "job_partial", { + "job_id": job_id_str, + "status": "partial", + "reviews_count": partial_count, + "total_reviews": current_job.get('total_reviews'), + "error_message": str(e), + "logs": log_capture.get_logs() if log_capture else [] + }) + + # Send partial webhook + if current_job and current_job.get('webhook_url'): + webhook_manager = WebhookManager() + await webhook_manager.send_job_completed_webhook( + webhook_url=current_job['webhook_url'], + job_id=job_id, + status='partial', + reviews_count=partial_count, + error_message=str(e), + secret=current_job.get('webhook_secret'), + db=db + ) + else: + # No reviews saved - mark as failed + await db.update_job_status( + job_id, + JobStatus.FAILED, + error_message=str(e) + ) + + # Broadcast job failed via SSE + await broadcast_job_update(job_id_str, "job_failed", { + "job_id": job_id_str, + "status": "failed", + "error_message": str(e), + "logs": [] + }) + + # Send failure webhook + if current_job and current_job.get('webhook_url'): + webhook_manager = WebhookManager() + await webhook_manager.send_job_completed_webhook( + webhook_url=current_job['webhook_url'], + job_id=job_id, + status='failed', + error_message=str(e), + secret=current_job.get('webhook_secret'), + db=db + ) + if __name__ == "__main__": import uvicorn diff --git a/docker-compose.production.yml b/docker-compose.production.yml index 856e585..5e35140 100644 --- a/docker-compose.production.yml +++ b/docker-compose.production.yml @@ -39,6 +39,8 @@ services: - CHROME_BIN=/usr/bin/chromium ports: - "8000:8000" + - "5900:5900" # VNC port (for VNC client) + - "6080:6080" # noVNC web interface (browser access) depends_on: db: condition: service_healthy diff --git a/modules/database.py b/modules/database.py index 8f112a1..7a660f8 100644 --- a/modules/database.py +++ b/modules/database.py @@ -21,6 +21,7 @@ class JobStatus(str, Enum): COMPLETED = "completed" FAILED = "failed" CANCELLED = "cancelled" + PARTIAL = "partial" # Job crashed but has partial reviews saved class DatabaseManager: @@ -69,6 +70,7 @@ class DatabaseManager: created_at TIMESTAMP NOT NULL DEFAULT NOW(), started_at TIMESTAMP, completed_at TIMESTAMP, + updated_at TIMESTAMP, reviews_count INTEGER, total_reviews INTEGER, @@ -79,7 +81,7 @@ class DatabaseManager: metadata JSONB, scrape_logs JSONB, - CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')) + CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial')) ); """) @@ -88,6 +90,24 @@ class DatabaseManager: ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB; """) + # Add updated_at column if it doesn't exist (for incremental progress tracking) + await conn.execute(""" + ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP; + """) + + # Add review_topics column if it doesn't exist (extracted topic filters with mention counts) + await conn.execute(""" + ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB; + """) + + # Update constraint to include 'partial' status (for existing databases) + await conn.execute(""" + ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status; + """) + await conn.execute(""" + ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial')); + """) + # Create indexes await conn.execute(""" CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); @@ -187,13 +207,15 @@ class DatabaseManager: created_at, started_at, completed_at, + updated_at, reviews_count, total_reviews, reviews_data, scrape_time, error_message, metadata, - scrape_logs + scrape_logs, + review_topics FROM jobs WHERE job_id = $1 """, job_id) @@ -203,22 +225,32 @@ class DatabaseManager: return dict(row) - async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]: + async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]: """ Get reviews for a specific job. Args: job_id: Job UUID + include_partial: If True, also return reviews for running and partial jobs Returns: - List of reviews or None if not found/not completed + List of reviews or None if not found/no reviews """ async with self.pool.acquire() as conn: - reviews_data = await conn.fetchval(""" - SELECT reviews_data - FROM jobs - WHERE job_id = $1 AND status = 'completed' - """, job_id) + if include_partial: + # Return reviews for completed, running, or partial jobs + reviews_data = await conn.fetchval(""" + SELECT reviews_data + FROM jobs + WHERE job_id = $1 AND status IN ('completed', 'running', 'partial') + """, job_id) + else: + # Only return reviews for completed jobs + reviews_data = await conn.fetchval(""" + SELECT reviews_data + FROM jobs + WHERE job_id = $1 AND status = 'completed' + """, job_id) if not reviews_data: return None @@ -278,7 +310,8 @@ class DatabaseManager: reviews: List[Dict[str, Any]], scrape_time: float, total_reviews: Optional[int] = None, - scrape_logs: Optional[List[Dict[str, Any]]] = None + scrape_logs: Optional[List[Dict[str, Any]]] = None, + review_topics: Optional[List[Dict[str, Any]]] = None ): """ Save scraping results to database. @@ -289,8 +322,33 @@ class DatabaseManager: scrape_time: Time taken to scrape in seconds total_reviews: Total reviews available (from page counter) scrape_logs: List of log entries from the scraper + review_topics: List of topic filter dictionaries with topic and count """ async with self.pool.acquire() as conn: + # If reviews list is empty, check if job already has reviews from incremental saves + # This happens when flush_callback was used during scraping + if not reviews: + existing = await conn.fetchval( + "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id + ) + if existing and existing > 0: + # Job has reviews from incremental saves, don't overwrite reviews_data + await conn.execute(""" + UPDATE jobs + SET + status = 'completed', + completed_at = NOW(), + total_reviews = COALESCE($2, total_reviews), + scrape_time = $3, + scrape_logs = $4::jsonb, + review_topics = $5::jsonb + WHERE job_id = $1 + """, job_id, total_reviews, scrape_time, + json.dumps(scrape_logs) if scrape_logs else None, + json.dumps(review_topics) if review_topics else None) + log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)") + return + await conn.execute(""" UPDATE jobs SET @@ -300,13 +358,70 @@ class DatabaseManager: total_reviews = $3, reviews_data = $4::jsonb, scrape_time = $5, - scrape_logs = $6::jsonb + scrape_logs = $6::jsonb, + review_topics = $7::jsonb WHERE job_id = $1 """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time, - json.dumps(scrape_logs) if scrape_logs else None) + json.dumps(scrape_logs) if scrape_logs else None, + json.dumps(review_topics) if review_topics else None) log.info(f"Saved {len(reviews)} reviews for job {job_id}") + async def save_reviews_incremental( + self, + job_id: UUID, + reviews: List[Dict[str, Any]], + total_reviews: Optional[int] = None + ): + """ + Save reviews incrementally during scraping. + Called on each flush to preserve progress in case of crash. + + Args: + job_id: Job UUID + reviews: ALL reviews collected so far (not just new ones) + total_reviews: Total reviews available (from page counter) + """ + async with self.pool.acquire() as conn: + await conn.execute(""" + UPDATE jobs + SET + reviews_count = $2, + total_reviews = COALESCE($3, total_reviews), + reviews_data = $4::jsonb, + updated_at = NOW() + WHERE job_id = $1 AND status = 'running' + """, job_id, len(reviews), total_reviews, json.dumps(reviews)) + + log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}") + + async def mark_job_partial( + self, + job_id: UUID, + error_message: str, + scrape_logs: Optional[List[Dict[str, Any]]] = None + ): + """ + Mark a job as partial (crashed but has some reviews saved). + + Args: + job_id: Job UUID + error_message: Error that caused the crash + scrape_logs: Log entries from the scraper + """ + async with self.pool.acquire() as conn: + await conn.execute(""" + UPDATE jobs + SET + status = 'partial', + completed_at = NOW(), + error_message = $2, + scrape_logs = $3::jsonb + WHERE job_id = $1 + """, job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None) + + log.info(f"Marked job {job_id} as partial due to: {error_message}") + async def list_jobs( self, status: Optional[JobStatus] = None, @@ -337,7 +452,8 @@ class DatabaseManager: total_reviews, scrape_time, error_message, - metadata + metadata, + review_topics FROM jobs WHERE status = $1 ORDER BY created_at DESC @@ -355,7 +471,8 @@ class DatabaseManager: total_reviews, scrape_time, error_message, - metadata + metadata, + review_topics FROM jobs ORDER BY created_at DESC LIMIT $1 OFFSET $2 diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index a60d114..e364359 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict: def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None, - progress_callback=None) -> dict: + progress_callback=None, validation_only: bool = False) -> dict: """ Scrape Google Maps reviews. @@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Track total reviews (persists across refreshes) total_reviews = [None] # Use list for closure mutation + # Store business info extracted from overview (before clicking reviews tab) + business_info_cache = [None] + # Hard refresh counter hard_refresh_count = [0] max_hard_refreshes = 3 # Max number of hard refreshes before giving up @@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in pass return None - def setup_reviews_page(is_refresh=False): + def setup_reviews_page(is_refresh=False, validation_only_mode=False): """ Setup the reviews page for scraping. Returns (scroll_container, stop_scrolling_event) or (None, None) on failure. Can be called after initial load or after a hard refresh. + + If validation_only_mode=True, returns early after extracting business info + without clicking reviews tab or finding scroll container. """ nonlocal total_reviews @@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Navigate to URL (only on initial load or refresh) if not is_refresh: + # Reset browser state by navigating to blank page first + # This clears any stale state from pooled browser sessions + try: + driver.get("about:blank") + time.sleep(0.1) + except: + pass log.info(f"🌐 Loading: {url[:80]}...") else: log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...") @@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Reload original URL after consent log.info(" Reloading after consent...") driver.get(url) + # Wait for page to settle after consent reload + time.sleep(1) break except: pass @@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in break time.sleep(0.01) # 10ms - responsive but low CPU - # Extract total review count BEFORE clicking reviews tab (it's on Overview) + # Extract business info and total review count BEFORE clicking reviews tab (on Overview) + # This captures name, rating, category, address while they're visible # Only on first load (don't overwrite if we already have it) - if total_reviews[0] is None: + if total_reviews[0] is None or business_info_cache[0] is None: start = time.time() while time.time() - start < 5: try: - count = driver.execute_script(""" - var reviewSpans = document.querySelectorAll('span[role="img"]'); - for (var i = 0; i < reviewSpans.length; i++) { - var label = reviewSpans[i].getAttribute('aria-label') || ''; - var match = label.match(/^([\\d,\\.]+)\\s*review/i); - if (match) { - return parseInt(match[1].replace(/[,\\.]/g, '')); + info = driver.execute_script(""" + var result = { + total_reviews: null, + name: null, + rating: null, + category: null, + address: null + }; + + // Business name from h1 + var h1 = document.querySelector('h1'); + if (h1) result.name = h1.textContent.trim(); + + // Category - use jsaction attribute (robust selector) + var catBtn = document.querySelector('button[jsaction*="category"]'); + if (catBtn) result.category = catBtn.textContent.trim(); + + // Rating and review count from span[role="img"] aria-labels + var spans = document.querySelectorAll('span[role="img"]'); + for (var i = 0; i < spans.length; i++) { + var label = spans[i].getAttribute('aria-label') || ''; + + // Rating: "4.8 stars" + var rMatch = label.match(/^([\\d,.]+)\\s*star/i); + if (rMatch && !result.rating) { + result.rating = parseFloat(rMatch[1].replace(',', '.')); + } + + // Reviews: "79 reviews" + var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i); + if (revMatch && !result.total_reviews) { + result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, '')); } } - return null; + + // Address from button + var addrBtn = document.querySelector('button[data-item-id="address"]'); + if (addrBtn) { + var label = addrBtn.getAttribute('aria-label'); + if (label) result.address = label.replace(/^Address:\\s*/i, ''); + } + + return result; """) - if count: - total_reviews[0] = count - log.info(f"📊 Total reviews on page: {count}") - break + + if info: + if info.get('total_reviews') and total_reviews[0] is None: + total_reviews[0] = info['total_reviews'] + log.info(f"📊 Total reviews on page: {total_reviews[0]}") + if info.get('name') and business_info_cache[0] is None: + business_info_cache[0] = info + log.info(f"📍 Business: {info.get('name')}") + if total_reviews[0] and business_info_cache[0]: + break except: pass time.sleep(0.1) + # VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc. + if validation_only_mode: + log.info("📋 Validation mode: returning early (skipping reviews tab)") + return ("validation_done", None) + # Click reviews tab - poll until found review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] start = time.time() tab_clicked = False + tabs_logged = False while time.time() - start < 5: # Max 5s for tabs try: tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']") + # Log available tabs once for debugging + if not tabs_logged and tabs: + tabs_logged = True + tab_texts = [t.text for t in tabs] + log.info(f" Available tabs: {tab_texts}") for tab in tabs: tab_text = tab.text.lower() if any(kw in tab_text for kw in review_keywords): if not is_refresh: log.info(f" Clicking reviews tab: '{tab.text}'") + # Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79" + if total_reviews[0] is None: + import re + # Try pattern with parentheses: "Reviews (79)" + match = re.search(r'\((\d+)\)', tab.text) + if match: + total_reviews[0] = int(match.group(1)) + log.info(f"📊 Total reviews from tab: {total_reviews[0]}") + else: + # Try pattern with newline: "Reviews\n79" + match = re.search(r'(\d+)', tab.text) + if match: + total_reviews[0] = int(match.group(1)) + log.info(f"📊 Total reviews from tab: {total_reviews[0]}") tab.click() tab_clicked = True break @@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in return scroll_container, stop_scrolling - # Initial page setup - scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False) + # Helper to extract review topics from the reviews tab + def extract_review_topics(): + """Extract review topic filters from radiogroup (robust selectors).""" + try: + topics = driver.execute_script(""" + var topics = []; + + // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust) + var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]'); + + if (!container) { + // Fallback: any radiogroup in the reviews area + container = document.querySelector('div[role="radiogroup"]'); + } + + if (container) { + var buttons = container.querySelectorAll('button[role="radio"]'); + for (var btn of buttons) { + var label = btn.getAttribute('aria-label') || ''; + // Parse "hair salon, mentioned in 4 reviews" format + var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i); + if (match) { + topics.push({ + topic: match[1].trim(), + count: parseInt(match[2]) + }); + } else if (label && !label.toLowerCase().includes('all review')) { + // Fallback: try to extract from child spans + var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall'); + var nameSpan = btn.querySelector('.uEubGf, span:first-child'); + if (nameSpan) { + var name = nameSpan.textContent.trim(); + var count = countSpan ? parseInt(countSpan.textContent) : 0; + if (name && name.toLowerCase() !== 'all') { + topics.push({topic: name, count: count || 0}); + } + } + } + } + } + + return topics; + """) + return topics or [] + except: + return [] + + # Initial page setup (pass validation_only to skip unnecessary steps) + scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only) + + # VALIDATION_ONLY MODE: Return early with just total_reviews and business info + # setup_reviews_page returns ("validation_done", None) in this case + if validation_only or scroll_container == "validation_done": + # Use the business info captured from Overview (before clicking reviews tab) + business_info = business_info_cache[0] or {} + + return { + "reviews": [], + "total": total_reviews[0] or 0, + "scrolls": 0, + "error": None, + "validation_info": { + "name": business_info.get("name"), + "rating": business_info.get("rating"), + "category": business_info.get("category"), + "address": business_info.get("address"), + "total_reviews": total_reviews[0] + } + } + if not scroll_container: return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"} + # Extract review topics after reviews tab is loaded (before scrolling begins) + time.sleep(0.5) # Brief wait for topic filters to render + review_topics = extract_review_topics() + if review_topics: + log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...") + def get_api_reviews(): """Get reviews from intercepted API responses.""" api_revs = [] @@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in "total_flushed": total_flushed[0], "checks": check_num, "url": url, - "logs": log.get_logs() + "logs": log.get_logs(), + "review_topics": review_topics # Topic filters with mention counts } def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, progress_callback=None, driver=None, return_driver: bool = False, - log_capture: LogCapture = None): + log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False, + browser_fingerprint: dict = None): """ Production-compatible wrapper for scrape_reviews. Matches the API expected by job_manager.py. @@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 driver: Existing driver instance to reuse return_driver: If True, return driver in result log_capture: Optional LogCapture instance for real-time log access + browser_fingerprint: Optional dict with user's browser fingerprint: + - geolocation: {lat, lng} + - userAgent: string + - viewport: {width, height} + - timezone: string (e.g., "Europe/Madrid") + - language: string (e.g., "en-US") + - platform: string (e.g., "MacIntel") Returns: Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs @@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 log_capture = log_capture or LogCapture() try: + # Extract fingerprint settings + fp = browser_fingerprint or {} + user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + viewport = fp.get('viewport') or {'width': 1200, 'height': 900} + geolocation = fp.get('geolocation') + timezone = fp.get('timezone') + language = fp.get('language', 'en-US') + # Create driver if not provided if not driver: driver = Driver( uc=True, headless=headless, page_load_strategy="normal", - agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + agent=user_agent # Use user's actual user agent ) - driver.set_window_size(1200, 900) # Proper viewport for Google Maps + # Set viewport to match user's screen + driver.set_window_size(viewport['width'], viewport['height']) - # Set Chrome geolocation to US (Boston, MA) using CDP - # This ensures Google Maps shows US results regardless of server location + # Apply browser fingerprint settings via CDP try: - driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { - 'latitude': 42.3601, - 'longitude': -71.0589, - 'accuracy': 100 - }) - log_capture.info("Set geolocation to US (Boston, MA)") + # Set timezone if provided + if timezone: + driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone}) + log_capture.info(f"Set timezone to {timezone}") + + # Set locale/language + driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language}) + + # Set geolocation + if geolocation and 'lat' in geolocation and 'lng' in geolocation: + driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { + 'latitude': geolocation['lat'], + 'longitude': geolocation['lng'], + 'accuracy': 1000 # ~1km accuracy for IP-based location + }) + log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})") + else: + # Default to US (Boston, MA) if no geolocation provided + driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { + 'latitude': 42.3601, + 'longitude': -71.0589, + 'accuracy': 100 + }) + log_capture.info("Set geolocation to US (Boston, MA) [default]") + + if fp: + log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}") except Exception as e: - log_capture.warning(f"Could not set geolocation: {e}") + log_capture.warning(f"Could not apply fingerprint settings: {e}") # Add URL parameters for consistent results if 'hl=' not in url: @@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 if 'gl=' not in url: url = f"{url}&gl=us" - # Create progress wrapper if callback provided - flush_callback = None - if progress_callback: + # Create combined flush callback for progress + external handler + external_flush = flush_callback # Save external callback + internal_flush = None + if progress_callback or external_flush: collected = [0] - def flush_with_progress(reviews_batch): - collected[0] += len(reviews_batch) - progress_callback(collected[0], None) - flush_callback = flush_with_progress + def combined_flush(reviews_batch): + collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far + if progress_callback: + progress_callback(collected[0], None) + if external_flush: + external_flush(reviews_batch) # Pass reviews to external handler + internal_flush = combined_flush # Run the scraper with progress callback for real-time updates result = scrape_reviews( @@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 url=url, max_reviews=999999, # Effectively unlimited timeout_no_new=15, - flush_callback=flush_callback, + flush_callback=internal_flush, flush_batch_size=100, # Smaller batches for more frequent progress log_capture=log_capture, - progress_callback=progress_callback # Pass through for real-time log updates + progress_callback=progress_callback, # Pass through for real-time log updates + validation_only=validation_only # Return early if just validating ) elapsed = time.time() - start_time @@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 "time": elapsed, "success": True, "error": None, - "logs": result.get("logs", []) + "logs": result.get("logs", []), + "review_topics": result.get("review_topics", []) # Topic filters with mention counts } + # Include validation_info if in validation_only mode + if validation_only and "validation_info" in result: + response["validation_info"] = result["validation_info"] + if return_driver: response["driver"] = driver elif should_close_driver: @@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 } +def extract_about_info(driver, url: str = None) -> dict: + """ + Extract About section info from Google Maps (Accessibility, Amenities, etc.). + + This function should be called AFTER reviews are scraped if about info is needed, + as it navigates to a different tab. + + Args: + driver: Selenium WebDriver instance (already on the business page) + url: Optional URL to navigate to first (if not already on the page) + + Returns: + dict with section names as keys, each containing list of features + """ + try: + # Navigate if URL provided + if url: + # Force English + if 'hl=' not in url: + separator = '&' if '?' in url else '?' + url = f"{url}{separator}hl=en" + if 'gl=' not in url: + url = f"{url}&gl=us" + driver.get(url) + time.sleep(1) + + # Click About tab using robust selectors + clicked = driver.execute_script(""" + // Try multiple selectors for about tab + var selectors = [ + 'button[aria-label*="About"]', + 'button[data-tab-index="2"]', + 'div[role="tablist"] button:nth-child(3)', + 'button[jsaction*="about"]' + ]; + + for (var sel of selectors) { + var btn = document.querySelector(sel); + if (btn && btn.textContent.toLowerCase().includes('about')) { + btn.click(); + return true; + } + } + + // Fallback: find by text content + var buttons = document.querySelectorAll('button'); + for (var btn of buttons) { + if (btn.textContent.trim().toLowerCase() === 'about') { + btn.click(); + return true; + } + } + return false; + """) + + if not clicked: + return {} + + time.sleep(1.5) # Wait for about tab to load + + # Extract about sections using aria-labels (robust) + about = driver.execute_script(""" + var about = {}; + + // Find the about region by aria-label or role + var container = document.querySelector('div[role="region"][aria-label*="About"]'); + + if (!container) { + // Fallback: look for the scrollable area with sections + container = document.querySelector('.m6QErb[aria-label*="About"]'); + } + + if (!container) { + // Last resort: find sections by h2 headers + container = document; + } + + // Find all section headers (h2 elements) + var sections = container.querySelectorAll('h2'); + + for (var h2 of sections) { + var sectionName = h2.textContent.trim(); + var items = []; + + // Find the ul list following this h2 + var parent = h2.closest('.iP2t7d, div'); + if (parent) { + var listItems = parent.querySelectorAll('li span[aria-label]'); + for (var li of listItems) { + var label = li.getAttribute('aria-label'); + if (label) { + // Parse "Has toilet" or "No wheelchair-accessible car park" + var hasFeature = !label.toLowerCase().startsWith('no '); + var featureName = label.replace(/^(Has |No )/i, ''); + items.push({ + feature: featureName, + available: hasFeature + }); + } + } + } + + if (sectionName && items.length > 0) { + about[sectionName] = items; + } + } + + return about; + """) + + return about or {} + + except Exception as e: + return {"error": str(e)} + + # Test function if __name__ == "__main__": from seleniumbase import Driver @@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ dict with: name, address, rating, total_reviews, success, error, time """ from seleniumbase import Driver + import logging + log = logging.getLogger(__name__) start_time = time.time() driver_provided = driver is not None @@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ except: pass - # Clear state if reusing a pooled driver (ensures clean page load) - if driver_provided: - try: - driver.delete_all_cookies() - driver.get("about:blank") - except: - pass + # Don't clear state - Google may serve different content based on session history + # The scraper doesn't reset state, so validation shouldn't either + + # Force English interface for consistent parsing + if 'hl=' not in url: + separator = '&' if '?' in url else '?' + url = f"{url}{separator}hl=en" + if 'gl=' not in url: + url = f"{url}&gl=us" # Navigate to URL driver.get(url) @@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ while time.time() - start < 5: if "consent.google" in driver.current_url: try: - for btn in driver.find_elements(By.CSS_SELECTOR, "button"): - txt = btn.text.lower() - if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: - btn.click() - driver.get(url) - break - except: + # Try multiple approaches to find and click accept button + clicked = False + + # Method 1: Find by aria-label (most reliable for Google consent) + for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"): + btn.click() + clicked = True + break + + # Method 2: Find by text content + if not clicked: + for btn in driver.find_elements(By.CSS_SELECTOR, "button"): + txt = btn.text.lower() + if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: + btn.click() + clicked = True + break + + if clicked: + time.sleep(0.5) # Brief wait for consent to process + driver.get(url) # Reload the target URL + time.sleep(0.5) # Wait for reload + except Exception as e: pass break if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url): break time.sleep(0.01) # 10ms - responsive but low CPU + # Log current URL after consent handling + try: + current_url = driver.current_url + log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...") + except: + pass + + # Wait for page to fully render before polling (tabs may load dynamically) + time.sleep(2) + # Poll for business info (same pattern as total_reviews extraction) - info = {"name": None, "rating": None, "total_reviews": None, "address": None} + # Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent + info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None} start = time.time() - while time.time() - start < 5: + debug_logged = False + while time.time() - start < 10: try: info = driver.execute_script(""" - var result = {name: null, rating: null, total_reviews: null, address: null}; + var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []}; // Business name from h1 var h1 = document.querySelector('h1'); if (h1) result.name = h1.textContent.trim(); - // Rating and reviews from span[role="img"] aria-labels - // Same pattern as scrape_reviews for consistency + // Category - use jsaction attribute (robust, survives class changes) + var catBtn = document.querySelector('button[jsaction*="category"]'); + if (catBtn) result.category = catBtn.textContent.trim(); + + // Fallback: look for button after rating that's not a link + if (!result.category) { + var buttons = document.querySelectorAll('button'); + for (var btn of buttons) { + var text = btn.textContent.trim(); + // Categories are short words, no numbers, not navigation + if (text && text.length < 50 && !text.match(/^[0-9]/) && + !text.match(/review|star|direction|save|share|photo/i)) { + // Check if it's near the rating area + var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium'); + if (parent) { + result.category = text; + break; + } + } + } + } + + // Rating from span[role="img"] aria-labels var spans = document.querySelectorAll('span[role="img"]'); for (var i = 0; i < spans.length; i++) { var label = spans[i].getAttribute('aria-label') || ''; - // Rating: "4.8 stars", "4,8 estrellas", etc (partial match) - var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i); + // Collect debug info for all aria-labels + if (label) { + result.debug.push('img-aria: ' + label); + } + + // Rating: "4.8 stars" (English forced via hl=en) + var rMatch = label.match(/^([\\d,.]+)\\s*star/i); if (rMatch && !result.rating) { result.rating = parseFloat(rMatch[1].replace(',', '.')); } - // Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i - // Plus Spanish "reseña" which doesn't contain "review" - var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i); + // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en) + // Try direct format first: "79 reviews" + var revMatch = label.match(/^([\\d,]+)\\s*review/i); if (revMatch && !result.total_reviews) { - result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, '')); + result.total_reviews = parseInt(revMatch[1].replace(/,/g, '')); + } + + // Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews" + if (!result.total_reviews) { + var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i); + if (combinedMatch) { + var countStr = combinedMatch[1].replace(/,/g, ''); + if (countStr.includes('k')) { + // Handle "9k+" format + result.total_reviews = parseInt(countStr) * 1000; + } else { + result.total_reviews = parseInt(countStr); + } + } + } + } + + // Also collect tab button texts for debugging (include full text including numbers) + var tabs = document.querySelectorAll('button[role="tab"]'); + for (var j = 0; j < tabs.length; j++) { + var tabText = tabs[j].textContent.trim(); + result.debug.push('tab: ' + tabText); + // Also try to extract review count from tab text like "Reviews (79)" + if (tabText.toLowerCase().includes('review') && !result.total_reviews) { + var tabMatch = tabText.match(/\\((\\d+)\\)/); + if (tabMatch) { + result.total_reviews = parseInt(tabMatch[1]); + result.debug.push('Found reviews in tab: ' + tabText); + } + } + } + + // Also check ALL buttons for reviews count + var allButtons = document.querySelectorAll('button'); + for (var b = 0; b < allButtons.length; b++) { + var btnText = allButtons[b].textContent || ''; + if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) { + var numMatch = btnText.match(/\\((\\d+)\\)/); + if (numMatch && !result.total_reviews) { + result.total_reviews = parseInt(numMatch[1]); + result.debug.push('Found reviews in button: ' + btnText.substring(0, 50)); + } + } + } + + // Check if we're on search results vs place page + result.debug.push('title: ' + document.title); + result.debug.push('url: ' + window.location.href.substring(0, 80)); + + // Check for search results list + var searchResults = document.querySelectorAll('div[role="feed"] > div'); + result.debug.push('search_results_count: ' + searchResults.length); + + // Fallback: Get review count from Reviews tab button "Reviews (79)" + // Search ALL tab buttons for one containing "review" text (same as scrape_reviews) + if (!result.total_reviews) { + var tabs = document.querySelectorAll('button[role="tab"]'); + for (var tab of tabs) { + var text = tab.textContent.toLowerCase(); + if (text.includes('review')) { + var match = tab.textContent.match(/\\((\\d+)\\)/); + if (match) { + result.total_reviews = parseInt(match[1]); + break; + } + } + } + } + + // Fallback 2: Look for any button with "Reviews" and a number + if (!result.total_reviews) { + var buttons = document.querySelectorAll('button'); + for (var btn of buttons) { + var text = btn.textContent; + if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) { + var numMatch = text.match(/\\((\\d+)\\)/); + if (numMatch) { + result.total_reviews = parseInt(numMatch[1]); + break; + } + } } } @@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ var addrBtn = document.querySelector('button[data-item-id="address"]'); if (addrBtn) { var label = addrBtn.getAttribute('aria-label'); - if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, ''); + if (label) result.address = label.replace(/^Address:\\s*/i, ''); } return result; """) - # Exit early if we have the essentials - if info.get("name") and info.get("total_reviews") is not None: + # Exit early if we have the essentials (name found AND reviews count > 0) + if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0: break + + # Log debug info once after 3 seconds + if not debug_logged and time.time() - start > 3: + debug_logged = True + debug_info = info.get("debug", []) + if debug_info: + log.info(f"🔍 Validation debug - URL: {url[:50]}...") + log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}") + for d in debug_info[:10]: # First 10 debug items + log.info(f" {d}") except: pass time.sleep(0.1) # 100ms between polls + # Final debug log if still no reviews + if not info.get("total_reviews"): + debug_info = info.get("debug", []) + log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling") + if debug_info: + log.warning(f" Debug items: {debug_info[:10]}") + return { "name": info.get("name"), "address": info.get("address"), "rating": info.get("rating"), "total_reviews": info.get("total_reviews"), + "category": info.get("category"), "success": bool(info.get("name")), "error": None, "time": time.time() - start_time @@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ "address": None, "rating": None, "total_reviews": None, + "category": None, "success": False, "error": str(e), "time": time.time() - start_time diff --git a/web/app/page.tsx b/web/app/page.tsx index a2478af..834865a 100644 --- a/web/app/page.tsx +++ b/web/app/page.tsx @@ -27,6 +27,8 @@ interface SelectedJob { jobId: string; newCount?: number; previousJobId?: string; + businessCategory?: string; + reviewTopics?: { topic: string; count: number }[]; } type ViewType = 'newScrape' | 'jobs' | 'reports'; @@ -106,6 +108,8 @@ export default function Home() { jobId: job.job_id, newCount: data.new_count, previousJobId: previousJob?.job_id, + businessCategory: job.business_category || undefined, + reviewTopics: job.review_topics || undefined, }); setActiveView('reports'); } @@ -155,7 +159,7 @@ export default function Home() { Back to Reports - + ) : (
diff --git a/web/components/ReviewAnalytics.tsx b/web/components/ReviewAnalytics.tsx index aef4bb0..5ec4c12 100644 --- a/web/components/ReviewAnalytics.tsx +++ b/web/components/ReviewAnalytics.tsx @@ -22,14 +22,21 @@ interface ReviewWithNew extends Review { photo_urls?: string[] | null; } +interface ReviewTopic { + topic: string; + count: number; +} + interface ReviewAnalyticsProps { reviews: ReviewWithNew[]; businessName?: string; businessUrl?: string; newCount?: number; + businessCategory?: string; + reviewTopics?: ReviewTopic[]; } -export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount }: ReviewAnalyticsProps) { +export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount, businessCategory, reviewTopics }: ReviewAnalyticsProps) { const [sorting, setSorting] = useState([{ id: 'date', desc: true }]); // Default: newest first const [columnFilters, setColumnFiltersState] = useState([]); const [globalFilter, setGlobalFilter] = useState(''); @@ -476,9 +483,16 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne {/* Header */}
-

- {businessName || 'Review Analytics'} -

+
+

+ {businessName || 'Review Analytics'} +

+ {businessCategory && ( + + {businessCategory} + + )} +
{businessUrl && (
+ {/* Review Topics - from Google Maps */} + {reviewTopics && reviewTopics.length > 0 && ( +
+
+ +

What People Talk About

+ ({reviewTopics.length} topics from Google) +
+
+ {reviewTopics.slice(0, 15).map((topic, idx) => ( +
+ {topic.topic} + + {topic.count} + +
+ ))} +
+ {reviewTopics.length > 15 && ( +

+{reviewTopics.length - 15} more topics

+ )} +
+ )} + {/* Rating & Volume Timeline */} {timelineData.length > 0 && (
(null); const [businessImage, setBusinessImage] = useState(null); const [businessCategory, setBusinessCategory] = useState(null); + const [userFingerprint, setUserFingerprint] = useState<{ + geolocation?: {lat: number, lng: number}, + userAgent?: string, + viewport?: {width: number, height: number}, + timezone?: string, + language?: string, + platform?: string + }>({}); const debounceRef = useRef(null); + + // Collect browser fingerprint on mount (no permissions needed) + useEffect(() => { + const collectFingerprint = async () => { + const fingerprint: typeof userFingerprint = {}; + + // User agent + fingerprint.userAgent = navigator.userAgent; + + // Screen/viewport size + fingerprint.viewport = { + width: window.screen.width, + height: window.screen.height + }; + + // Timezone + fingerprint.timezone = Intl.DateTimeFormat().resolvedOptions().timeZone; + + // Language + fingerprint.language = navigator.language; + + // Platform + fingerprint.platform = navigator.platform; + + // Get approximate location from IP (no permission needed) + try { + const response = await fetch('https://ipapi.co/json/', { + signal: AbortSignal.timeout(3000) + }); + if (response.ok) { + const data = await response.json(); + if (data.latitude && data.longitude) { + fingerprint.geolocation = { + lat: data.latitude, + lng: data.longitude + }; + console.log('IP location:', data.city, data.country_name); + } + } + } catch (error) { + console.log('IP geolocation not available'); + } + + setUserFingerprint(fingerprint); + console.log('Browser fingerprint:', fingerprint); + }; + + collectFingerprint(); + }, []); const pollingIntervals = useRef>(new Map()); const abortControllerRef = useRef(null); @@ -121,18 +181,23 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe setBusinessCategory(null); setError(''); - // Create new abort controller with 30 second timeout + // Create new abort controller with 60 second timeout (validation can be slow) const controller = new AbortController(); abortControllerRef.current = controller; - const timeoutId = setTimeout(() => controller.abort(), 30000); + const timeoutId = setTimeout(() => controller.abort(), 60000); try { - const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}`; + // Force English with hl=en parameter + const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}&hl=en`; const response = await fetch('/api/check-reviews', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ url }), + body: JSON.stringify({ + url, + geolocation: userFingerprint.geolocation, + browser_fingerprint: userFingerprint // Pass full fingerprint + }), signal: controller.signal, }); @@ -157,21 +222,30 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe } catch (err) { clearTimeout(timeoutId); - // Ignore AbortError (happens when user starts a new validation) + // Check if this is a timeout abort vs user-initiated abort if (err instanceof Error && err.name === 'AbortError') { - console.log('Validation cancelled (new validation started)'); - return; + // Check if it was a timeout (controller still matches) or user started new search + if (abortControllerRef.current === controller) { + // Timeout - show error + console.error('Validation timed out'); + setError('Validation timed out. Please try again.'); + setHasReviews(false); + setAvailableReviewCount(0); + } else { + // User started a new search - just return silently + console.log('Validation cancelled (new validation started)'); + return; + } + } else { + console.error('Error getting business info:', err); + // Error occurred + setHasReviews(false); + setAvailableReviewCount(0); } - - console.error('Error getting business info:', err); - // Error occurred - setHasReviews(false); - setAvailableReviewCount(0); } finally { - // Only clear loading state if this controller wasn't aborted - if (!controller.signal.aborted) { - setIsCheckingReviews(false); - } + clearTimeout(timeoutId); + // Always clear loading state (even on timeout) + setIsCheckingReviews(false); } }; @@ -192,8 +266,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe return newMap; }); - // Stop polling if job is done - if (data.status === 'completed' || data.status === 'failed') { + // Stop polling if job is done (completed, failed, or partial) + if (data.status === 'completed' || data.status === 'failed' || data.status === 'partial') { const interval = pollingIntervals.current.get(jobId); if (interval) { clearInterval(interval); @@ -244,8 +318,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe setIsSubmitting(true); setShowConfirmModal(false); - // Use the search query to create a Google Maps search URL - const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}`; + // Use the search query to create a Google Maps search URL (force English) + const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}&hl=en`; try { const response = await fetch('/api/scrape', { @@ -257,6 +331,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe business_address: businessAddress, rating_snapshot: businessRating, total_reviews_snapshot: availableReviewCount, + geolocation: userFingerprint.geolocation, + browser_fingerprint: userFingerprint, // Pass full fingerprint }), }); @@ -283,8 +359,10 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe error_message: null, business_name: businessName, business_address: businessAddress, + business_category: businessCategory, rating_snapshot: businessRating, total_reviews_snapshot: availableReviewCount, + review_topics: null, // Will be populated when job completes }); return newMap; }); @@ -305,6 +383,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe case 'completed': return 'text-green-700'; case 'running': return 'text-blue-700'; case 'failed': return 'text-red-700'; + case 'partial': return 'text-orange-700'; default: return 'text-gray-800'; } }; @@ -325,6 +404,12 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe ); + case 'partial': + return ( + + + + ); default: return ( @@ -776,8 +861,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe )}
- {/* Action Buttons - Show when completed and has reviews */} - {job.status === 'completed' && job.reviews_count && job.reviews_count > 0 && ( + {/* Action Buttons - Show when completed, partial, or running with reviews */} + {(job.status === 'completed' || job.status === 'partial' || (job.status === 'running' && job.reviews_count && job.reviews_count > 0)) && job.reviews_count && job.reviews_count > 0 && (
@@ -845,7 +936,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe const url = URL.createObjectURL(blob); const a = document.createElement('a'); a.href = url; - a.download = `reviews-${job.job_id}.json`; + a.download = `reviews-${job.job_id}${job.status === 'partial' ? '-partial' : ''}.json`; a.click(); } } catch (err) { @@ -862,6 +953,24 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
)} + {/* Partial Job Warning */} + {job.status === 'partial' && ( +
+
+ + + +
+

Partial Results

+

+ This job was interrupted but {job.reviews_count} reviews were saved. + {job.error_message && Reason: {job.error_message}} +

+
+
+
+ )} + {/* Error Message */} {job.status === 'failed' && job.error_message && (
diff --git a/web/lib/analytics.ts b/web/lib/analytics.ts index 52424e3..bccc2ba 100644 --- a/web/lib/analytics.ts +++ b/web/lib/analytics.ts @@ -66,7 +66,9 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats { // Populate minDate/maxDate/centerDate on reviews for display reviews.forEach(r => { if (!r.minDate || !r.maxDate || !r.centerDate) { - const range = parseDateTextToRange(r.date_text); + // Handle both date_text and timestamp field names + const dateText = r.date_text || (r as any).timestamp || ''; + const range = parseDateTextToRange(dateText); r.minDate = range.minDate; r.maxDate = range.maxDate; // Calculate centerDate as midpoint @@ -96,8 +98,8 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats { // Recent reviews (last 30 days - simplified check) const recentReviews = reviews.filter(r => { - const text = r.date_text.toLowerCase(); - return text.includes('day') || text.includes('week') || text.includes('hour'); + const text = (r.date_text || (r as any).timestamp || '').toLowerCase(); + return text.includes('day') || text.includes('week') || text.includes('hour') || text.includes('minute') || text.includes('second'); }).length; // Rating distribution @@ -278,6 +280,14 @@ function extractNumber(text: string): number { */ export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } { const now = new Date(); + + // Handle undefined/null dateText + if (!dateText) { + // Return a default range (assume recent - within last month) + const daysAgo = (days: number) => new Date(now.getTime() - days * 24 * 60 * 60 * 1000); + return { minDate: daysAgo(30), maxDate: now }; + } + const text = dateText.toLowerCase(); // Remove "Edited " prefix if present @@ -396,7 +406,8 @@ export function filterReviewsByDateRange(reviews: Review[], range: DateRange): R // Filter range: [filterStart, filterEnd] // Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart return reviews.filter(r => { - const { minDate, maxDate } = parseDateTextToRange(r.date_text); + const dateText = r.date_text || (r as any).timestamp || ''; + const { minDate, maxDate } = parseDateTextToRange(dateText); return minDate <= filterEnd && maxDate >= filterStart; }); } @@ -405,7 +416,8 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date if (!fromDate && !toDate) return reviews; return reviews.filter(r => { - const reviewDate = parseDateText(r.date_text); + const dateText = r.date_text || (r as any).timestamp || ''; + const reviewDate = parseDateText(dateText); // If only fromDate is set, filter reviews >= fromDate if (fromDate && !toDate) { @@ -429,7 +441,7 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] { // Sort reviews by date (newest first) const sortedReviews = [...reviews] - .map(r => ({ ...r, parsedDate: parseDateText(r.date_text) })) + .map(r => ({ ...r, parsedDate: parseDateText(r.date_text || (r as any).timestamp || '') })) .sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime()); // Group by month