Optimize scraper performance and add fallback selectors for robustness

Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -0,0 +1,613 @@
+#!/usr/bin/env python3
+"""
+Production Google Reviews Scraper API Server with Phase 1 features:
+- PostgreSQL storage with JSONB
+- Webhook delivery with retries
+- Smart health checks with canary testing
+"""
+import asyncio
+import logging
+import os
+from contextlib import asynccontextmanager
+from typing import Optional, List, Dict, Any
+from uuid import UUID
+
+from fastapi import FastAPI, HTTPException, Query, Header
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, HttpUrl, Field
+from fastapi.responses import JSONResponse
+
+from modules.database import DatabaseManager, JobStatus
+from modules.webhooks import WebhookDispatcher, WebhookManager
+from modules.health_checks import HealthCheckSystem
+from modules.fast_scraper import fast_scrape_reviews, check_reviews_available, get_business_card_info
+from modules.chrome_pool import (
+    start_worker_pools,
+    stop_worker_pools,
+    get_validation_worker,
+    release_validation_worker,
+    get_scraping_worker,
+    release_scraping_worker,
+    get_pool_stats
+)
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+log = logging.getLogger("api_server")
+
+# Global instances
+db: Optional[DatabaseManager] = None
+webhook_dispatcher: Optional[WebhookDispatcher] = None
+health_system: Optional[HealthCheckSystem] = None
+
+# Concurrent job limiter (prevent too many Chrome instances)
+MAX_CONCURRENT_JOBS = int(os.getenv('MAX_CONCURRENT_JOBS', '5'))
+job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Lifespan context manager for startup and shutdown"""
+    global db, webhook_dispatcher, health_system
+
+    # Startup
+    log.info("Starting Google Reviews Scraper API Server (Production)")
+
+    # Get database URL from environment
+    database_url = os.getenv(
+        'DATABASE_URL',
+        'postgresql://scraper:scraper@localhost:5432/scraper'
+    )
+
+    # Initialize database
+    db = DatabaseManager(database_url)
+    await db.connect()
+    await db.initialize_schema()
+    log.info("Database initialized")
+
+    # Initialize health check system with canary monitoring
+    # DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
+    # health_system = HealthCheckSystem(db)
+    # await health_system.start()
+    log.info("Health check system DISABLED (canary tests disabled to avoid rate limiting)")
+
+    # Initialize webhook dispatcher
+    webhook_dispatcher = WebhookDispatcher(db, interval_seconds=30)
+    asyncio.create_task(webhook_dispatcher.start())
+    log.info("Webhook dispatcher started")
+
+    # Start Chrome worker pools (1 for validation, 2 for scraping)
+    # These pre-warm Chrome instances for instant availability
+    await asyncio.to_thread(
+        start_worker_pools,
+        validation_size=1,
+        scraping_size=2,
+        headless=True
+    )
+    log.info("Chrome worker pools started (1 validation + 2 scraping)")
+
+    yield
+
+    # Shutdown
+    log.info("Shutting down Google Reviews Scraper API Server")
+    if webhook_dispatcher:
+        webhook_dispatcher.stop()
+    # if health_system:
+    #     health_system.stop()
+
+    # Stop worker pools
+    await asyncio.to_thread(stop_worker_pools)
+    log.info("Chrome worker pools stopped")
+
+    if db:
+        await db.disconnect()
+
+
+# Initialize FastAPI app
+app = FastAPI(
+    title="Google Reviews Scraper API - Production",
+    description="Production-ready REST API for Google Maps review scraping with webhooks and health monitoring",
+    version="2.0.0",
+    lifespan=lifespan
+)
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Configure for production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+# ==================== Request/Response Models ====================
+
+class ScrapeRequest(BaseModel):
+    """Request model for starting a scrape job"""
+    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
+    webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
+    webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
+    metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
+
+
+class JobResponse(BaseModel):
+    """Response model for job information"""
+    job_id: str
+    status: str
+    url: str
+    created_at: str
+    started_at: Optional[str] = None
+    completed_at: Optional[str] = None
+    reviews_count: Optional[int] = None
+    total_reviews: Optional[int] = None  # Total reviews available for this place
+    scrape_time: Optional[float] = None
+    error_message: Optional[str] = None
+    webhook_url: Optional[str] = None
+
+
+class ReviewsResponse(BaseModel):
+    """Response model for reviews data"""
+    job_id: str
+    reviews: List[Dict[str, Any]]
+    count: int
+
+
+class StatsResponse(BaseModel):
+    """Response model for statistics"""
+    total_jobs: int
+    pending: int
+    running: int
+    completed: int
+    failed: int
+    cancelled: int
+    avg_scrape_time: Optional[float] = None
+    total_reviews: Optional[int] = None
+
+
+# ==================== API Endpoints ====================
+
+@app.get("/", summary="API Health Check")
+async def root():
+    """Basic health check endpoint"""
+    return {
+        "message": "Google Reviews Scraper API (Production)",
+        "status": "healthy",
+        "version": "2.0.0",
+        "features": ["postgresql", "webhooks", "canary-testing"]
+    }
+
+
+@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
+async def start_scrape(request: ScrapeRequest):
+    """
+    Start a new scraping job.
+
+    The job runs asynchronously in the background. You can:
+    - Poll GET /jobs/{job_id} for status
+    - Provide webhook_url for automatic notification when complete
+
+    Returns the job ID for tracking.
+    """
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    try:
+        # Create job in database
+        job_id = await db.create_job(
+            url=str(request.url),
+            webhook_url=str(request.webhook_url) if request.webhook_url else None,
+            webhook_secret=request.webhook_secret,
+            metadata=request.metadata
+        )
+
+        # Start scraping job in background
+        asyncio.create_task(run_scraping_job(job_id))
+
+        log.info(f"Created and started job {job_id}")
+
+        return {
+            "job_id": str(job_id),
+            "status": "started",
+            "message": "Scraping job started successfully"
+        }
+
+    except Exception as e:
+        log.error(f"Error creating scraping job: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
+
+
+@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")
+async def get_job(job_id: UUID):
+    """Get detailed information about a specific job"""
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    job = await db.get_job(job_id)
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    return JobResponse(
+        job_id=str(job['job_id']),
+        status=job['status'],
+        url=job['url'],
+        created_at=job['created_at'].isoformat(),
+        started_at=job['started_at'].isoformat() if job['started_at'] else None,
+        completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
+        reviews_count=job['reviews_count'],
+        scrape_time=job['scrape_time'],
+        error_message=job['error_message'],
+        webhook_url=job.get('webhook_url')
+    )
+
+
+@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
+async def get_job_reviews(job_id: UUID):
+    """
+    Get the actual reviews data for a completed job.
+
+    Returns 404 if job not found or not completed yet.
+    """
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    reviews = await db.get_job_reviews(job_id)
+    if reviews is None:
+        job = await db.get_job(job_id)
+        if not job:
+            raise HTTPException(status_code=404, detail="Job not found")
+        elif job['status'] != 'completed':
+            raise HTTPException(
+                status_code=400,
+                detail=f"Job not completed yet (current status: {job['status']})"
+            )
+        else:
+            raise HTTPException(status_code=404, detail="Reviews data not available")
+
+    return ReviewsResponse(
+        job_id=str(job_id),
+        reviews=reviews,
+        count=len(reviews)
+    )
+
+
+@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
+async def list_jobs(
+    status: Optional[str] = Query(None, description="Filter by job status"),
+    limit: int = Query(100, description="Maximum number of jobs to return", ge=1, le=1000),
+    offset: int = Query(0, description="Number of jobs to skip", ge=0)
+):
+    """List all jobs, optionally filtered by status"""
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    # Validate status if provided
+    job_status = None
+    if status:
+        try:
+            job_status = JobStatus(status.lower())
+        except ValueError:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid status. Must be one of: {[s.value for s in JobStatus]}"
+            )
+
+    jobs = await db.list_jobs(status=job_status, limit=limit, offset=offset)
+
+    return [
+        JobResponse(
+            job_id=str(job['job_id']),
+            status=job['status'],
+            url=job['url'],
+            created_at=job['created_at'].isoformat(),
+            completed_at=job['completed_at'].isoformat() if job.get('completed_at') else None,
+            reviews_count=job.get('reviews_count'),
+            scrape_time=job.get('scrape_time'),
+            error_message=job.get('error_message')
+        )
+        for job in jobs
+    ]
+
+
+@app.delete("/jobs/{job_id}", summary="Delete Job")
+async def delete_job(job_id: UUID):
+    """Delete a job from the system"""
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    deleted = await db.delete_job(job_id)
+    if not deleted:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    return {"message": "Job deleted successfully"}
+
+
+@app.post("/check-reviews", summary="Check if Reviews Exist")
+async def check_reviews(request: ScrapeRequest):
+    """
+    Get business card information from Google Maps.
+    Returns business name, address, rating, and review count.
+
+    Uses pre-warmed Chrome worker from pool for instant response.
+    This is used to show the business confirmation card in the UI.
+    """
+    worker = None
+    recycle_worker = False
+
+    try:
+        url = str(request.url)
+
+        # Get pre-warmed worker from validation pool
+        worker = await asyncio.to_thread(get_validation_worker, timeout=10)
+
+        if worker:
+            log.info(f"Using worker {worker.worker_id} for business card extraction")
+            # Use the pooled worker (don't close it)
+            result = await asyncio.to_thread(
+                get_business_card_info,
+                url=url,
+                driver=worker.driver,
+                return_driver=True
+            )
+
+            # Check if the result indicates a session error
+            if not result['success'] and result.get('error'):
+                error_msg = result.get('error', '').lower()
+                if 'invalid session' in error_msg or 'session' in error_msg:
+                    log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
+                    recycle_worker = True
+        else:
+            # Fallback: create temporary worker
+            log.warning("No pooled worker available, creating temporary instance")
+            result = await asyncio.to_thread(
+                get_business_card_info,
+                url=url
+            )
+
+        # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
+        # Let the actual scraper determine if reviews exist
+        has_business = result.get('name') and result.get('rating')
+
+        return {
+            "has_reviews": has_business,  # Assume true if business exists
+            "total_reviews": result['total_reviews'] or 0,  # Show 0 if unknown
+            "name": result.get('name'),
+            "address": result.get('address'),
+            "rating": result.get('rating'),
+            "success": result['success'],
+            "error": result.get('error')
+        }
+
+    except Exception as e:
+        log.error(f"Error checking reviews: {e}")
+        # If it's a session error, recycle the worker
+        if worker:
+            error_msg = str(e).lower()
+            if 'invalid session' in error_msg or 'session' in error_msg:
+                recycle_worker = True
+
+        return {
+            "has_reviews": False,
+            "review_count": 0,
+            "success": False,
+            "error": str(e)
+        }
+    finally:
+        # Release worker back to pool (or recycle if broken)
+        if worker:
+            await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
+
+
+@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
+async def get_stats():
+    """Get job statistics"""
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    stats = await db.get_stats()
+    return StatsResponse(**stats)
+
+
+@app.get("/pool-stats", summary="Get Worker Pool Statistics")
+async def pool_stats():
+    """Get Chrome worker pool statistics"""
+    return await asyncio.to_thread(get_pool_stats)
+
+
+# ==================== Health Check Endpoints ====================
+
+@app.get("/health/live", summary="Liveness Probe")
+async def liveness():
+    """
+    Liveness check: Is the server alive?
+
+    Use this for Kubernetes liveness probe - restart container if fails.
+    """
+    if not health_system:
+        raise HTTPException(status_code=503, detail="Health system not initialized")
+
+    return await health_system.check_liveness()
+
+
+@app.get("/health/ready", summary="Readiness Probe")
+async def readiness():
+    """
+    Readiness check: Can the server handle traffic?
+
+    Use this for Kubernetes readiness probe - remove from load balancer if fails.
+    """
+    if not health_system:
+        raise HTTPException(status_code=503, detail="Health system not initialized")
+
+    result = await health_system.check_readiness()
+
+    if result["status"] != "ready":
+        return JSONResponse(status_code=503, content=result)
+
+    return result
+
+
+@app.get("/health/canary", summary="Canary Health Check")
+async def canary():
+    """
+    Canary check: Does scraping actually work?
+
+    Returns the latest canary test result (runs every 4 hours in background).
+    Use this for external monitoring (PagerDuty, DataDog) - alerts if fails.
+    """
+    if not health_system:
+        raise HTTPException(status_code=503, detail="Health system not initialized")
+
+    result = await health_system.check_canary()
+
+    if result["status"] not in ["healthy", "unknown"]:
+        return JSONResponse(status_code=503, content=result)
+
+    return result
+
+
+@app.get("/health/detailed", summary="Detailed Health Status")
+async def detailed_health():
+    """Get detailed health status of all components"""
+    if not health_system:
+        raise HTTPException(status_code=503, detail="Health system not initialized")
+
+    return await health_system.get_detailed_health()
+
+
+# ==================== Background Job Runner ====================
+
+async def run_scraping_job(job_id: UUID):
+    """
+    Run scraping job in background with concurrency limit.
+
+    Args:
+        job_id: Job UUID
+    """
+    async with job_semaphore:  # Limit concurrent Chrome instances
+        try:
+            # Update status to running
+            await db.update_job_status(job_id, JobStatus.RUNNING)
+            log.info(f"Starting scraping job {job_id}")
+
+            # Get job details
+            job = await db.get_job(job_id)
+            url = job['url']
+
+            # Get the event loop for progress updates from worker thread
+            loop = asyncio.get_running_loop()
+
+            # Progress callback to update job status with current/total counts
+            def progress_callback(current_count: int, total_count: int):
+                """Update job progress from worker thread"""
+                async def update():
+                    await db.update_job_status(
+                        job_id,
+                        JobStatus.RUNNING,
+                        reviews_count=current_count,
+                        total_reviews=total_count
+                    )
+
+                # Schedule the coroutine on the event loop
+                asyncio.run_coroutine_threadsafe(update(), loop)
+
+            # Run scraping with progress callback
+            result = await asyncio.to_thread(
+                fast_scrape_reviews,
+                url=url,
+                headless=True,
+                progress_callback=progress_callback
+            )
+
+            if result['success']:
+                # Save results to database
+                await db.save_job_result(
+                    job_id=job_id,
+                    reviews=result['reviews'],
+                    scrape_time=result['time'],
+                    total_reviews=result.get('total_reviews')
+                )
+
+                log.info(
+                    f"Completed job {job_id}: {result['count']} reviews in {result['time']:.1f}s"
+                )
+
+                # Send webhook if configured
+                if job.get('webhook_url'):
+                    webhook_manager = WebhookManager()
+                    api_base_url = os.getenv('API_BASE_URL', 'http://localhost:8000')
+
+                    await webhook_manager.send_job_completed_webhook(
+                        webhook_url=job['webhook_url'],
+                        job_id=job_id,
+                        status='completed',
+                        reviews_count=result['count'],
+                        scrape_time=result['time'],
+                        reviews_url=f"{api_base_url}/jobs/{job_id}/reviews",
+                        secret=job.get('webhook_secret'),
+                        db=db
+                    )
+
+            else:
+                # Job failed
+                await db.update_job_status(
+                    job_id,
+                    JobStatus.FAILED,
+                    error_message=result.get('error', 'Unknown error')
+                )
+
+                log.error(f"Failed job {job_id}: {result.get('error')}")
+
+                # Send failure webhook if configured
+                if job.get('webhook_url'):
+                    webhook_manager = WebhookManager()
+                    await webhook_manager.send_job_completed_webhook(
+                        webhook_url=job['webhook_url'],
+                        job_id=job_id,
+                        status='failed',
+                        error_message=result.get('error'),
+                        secret=job.get('webhook_secret'),
+                        db=db
+                    )
+
+        except Exception as e:
+            log.error(f"Error in scraping job {job_id}: {e}")
+            import traceback
+            traceback.print_exc()
+
+            await db.update_job_status(
+                job_id,
+                JobStatus.FAILED,
+                error_message=str(e)
+            )
+
+            # Send failure webhook
+            job = await db.get_job(job_id)
+            if job and job.get('webhook_url'):
+                webhook_manager = WebhookManager()
+                await webhook_manager.send_job_completed_webhook(
+                    webhook_url=job['webhook_url'],
+                    job_id=job_id,
+                    status='failed',
+                    error_message=str(e),
+                    secret=job.get('webhook_secret'),
+                    db=db
+                )
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    port = int(os.getenv('PORT', 8000))
+
+    log.info(f"Starting production server on port {port}...")
+    uvicorn.run(
+        "api_server_production:app",
+        host="0.0.0.0",
+        port=port,
+        reload=False,  # Disable reload in production
+        log_level="info"
+    )