Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
614 lines
20 KiB
Python
614 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Production Google Reviews Scraper API Server with Phase 1 features:
|
|
- PostgreSQL storage with JSONB
|
|
- Webhook delivery with retries
|
|
- Smart health checks with canary testing
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from contextlib import asynccontextmanager
|
|
from typing import Optional, List, Dict, Any
|
|
from uuid import UUID
|
|
|
|
from fastapi import FastAPI, HTTPException, Query, Header
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from pydantic import BaseModel, HttpUrl, Field
|
|
from fastapi.responses import JSONResponse
|
|
|
|
from modules.database import DatabaseManager, JobStatus
|
|
from modules.webhooks import WebhookDispatcher, WebhookManager
|
|
from modules.health_checks import HealthCheckSystem
|
|
from modules.fast_scraper import fast_scrape_reviews, check_reviews_available, get_business_card_info
|
|
from modules.chrome_pool import (
|
|
start_worker_pools,
|
|
stop_worker_pools,
|
|
get_validation_worker,
|
|
release_validation_worker,
|
|
get_scraping_worker,
|
|
release_scraping_worker,
|
|
get_pool_stats
|
|
)
|
|
|
|
# Process-wide logging: INFO and above, each record stamped with the time,
# the emitting logger's name and the severity.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used throughout this module.
log = logging.getLogger("api_server")
|
|
|
|
# Shared service singletons.
# All three start as None; lifespan() assigns `db` and `webhook_dispatcher`
# during startup. `health_system` is currently never assigned because its
# start-up code in lifespan() is commented out.
db: Optional[DatabaseManager] = None
webhook_dispatcher: Optional[WebhookDispatcher] = None
health_system: Optional[HealthCheckSystem] = None
|
|
|
|
# Concurrent job limiter (prevent too many Chrome instances).
#
# MAX_CONCURRENT_JOBS is read from the environment. A non-numeric value falls
# back to the default instead of crashing the import, and anything below 1 is
# raised to 1: a semaphore created with 0 permits would make every scraping
# job wait forever, and a negative value makes asyncio.Semaphore raise.
_DEFAULT_MAX_CONCURRENT_JOBS = 5

try:
    MAX_CONCURRENT_JOBS = int(
        os.getenv('MAX_CONCURRENT_JOBS', str(_DEFAULT_MAX_CONCURRENT_JOBS))
    )
except ValueError:
    logging.getLogger("api_server").warning(
        "Invalid MAX_CONCURRENT_JOBS value %r; using default %d",
        os.getenv('MAX_CONCURRENT_JOBS'),
        _DEFAULT_MAX_CONCURRENT_JOBS,
    )
    MAX_CONCURRENT_JOBS = _DEFAULT_MAX_CONCURRENT_JOBS

MAX_CONCURRENT_JOBS = max(1, MAX_CONCURRENT_JOBS)

# Acquired by run_scraping_job() for the whole duration of a scrape.
job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Lifespan context manager for startup and shutdown.

    Startup connects the database and initializes its schema, starts the
    webhook dispatcher as a background task and starts the Chrome worker
    pools. Shutdown stops the dispatcher and the pools and disconnects the
    database; it runs in a ``finally`` block so resources are released even
    if an exception is thrown into the context manager.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global db, webhook_dispatcher, health_system

    # Startup
    log.info("Starting Google Reviews Scraper API Server (Production)")

    # Get database URL from environment
    database_url = os.getenv(
        'DATABASE_URL',
        'postgresql://scraper:scraper@localhost:5432/scraper'
    )

    # Initialize database
    db = DatabaseManager(database_url)
    await db.connect()
    await db.initialize_schema()
    log.info("Database initialized")

    # Health check system with canary monitoring.
    # DISABLED: Canary tests consume Google Maps requests and trigger rate limiting.
    # To re-enable:
    #   health_system = HealthCheckSystem(db)
    #   await health_system.start()
    # and call health_system.stop() during shutdown.
    log.info("Health check system DISABLED (canary tests disabled to avoid rate limiting)")

    # Initialize webhook dispatcher
    webhook_dispatcher = WebhookDispatcher(db, interval_seconds=30)

    def _report_dispatcher_exit(task: asyncio.Task) -> None:
        """Log the dispatcher task's exception, if it died with one."""
        if task.cancelled():
            return
        failure = task.exception()
        if failure is not None:
            log.error("Webhook dispatcher stopped unexpectedly: %r", failure)

    # Keep a strong reference for the lifetime of the app: the event loop only
    # holds weak references to tasks, so an unreferenced task may be
    # garbage-collected before it finishes.
    dispatcher_task = asyncio.create_task(webhook_dispatcher.start())
    dispatcher_task.add_done_callback(_report_dispatcher_exit)
    log.info("Webhook dispatcher started")

    # Start Chrome worker pools (1 for validation, 2 for scraping).
    # Run in a thread because pool start-up is a synchronous call.
    await asyncio.to_thread(
        start_worker_pools,
        validation_size=1,
        scraping_size=2,
        headless=True
    )
    log.info("Chrome worker pools started (1 validation + 2 scraping)")

    try:
        yield
    finally:
        # Shutdown
        log.info("Shutting down Google Reviews Scraper API Server")
        if webhook_dispatcher:
            webhook_dispatcher.stop()

        # Stop worker pools
        await asyncio.to_thread(stop_worker_pools)
        log.info("Chrome worker pools stopped")

        if db:
            await db.disconnect()
|
|
|
|
|
|
# Initialize FastAPI app
app = FastAPI(
    title="Google Reviews Scraper API - Production",
    description="Production-ready REST API for Google Maps review scraping with webhooks and health monitoring",
    version="2.0.0",
    lifespan=lifespan
)

# CORS configuration.
# Allowed origins come from CORS_ALLOW_ORIGINS (comma-separated list); when
# the variable is unset or empty every origin is allowed, as before.
# Credentials are enabled only for an explicit origin list: a wildcard origin
# combined with credentials is not a valid CORS configuration and would let
# any site make credentialed requests.
_cors_origins = [
    origin.strip()
    for origin in os.getenv('CORS_ALLOW_ORIGINS', '*').split(',')
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
|
|
# ==================== Request/Response Models ====================
|
|
|
|
class ScrapeRequest(BaseModel):
    """Payload accepted by the endpoints that take a Google Maps URL."""

    # Required target page.
    url: HttpUrl = Field(
        default=...,
        description="Google Maps URL to scrape",
    )
    # Optional callback endpoint and the secret used to sign its payload.
    webhook_url: Optional[HttpUrl] = Field(
        default=None,
        description="Webhook URL for async notifications",
    )
    webhook_secret: Optional[str] = Field(
        default=None,
        description="Secret for webhook HMAC signature",
    )
    # Free-form data stored alongside the job.
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional custom metadata",
    )
|
|
|
|
|
|
class JobResponse(BaseModel):
    """Serialized view of a scraping job returned by the job endpoints."""

    # Identity and state (always present).
    job_id: str
    status: str
    url: str

    # Timestamps, populated by the endpoints as ISO-8601 strings.
    created_at: str
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    # Progress and outcome.
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available for this place
    scrape_time: Optional[float] = None
    error_message: Optional[str] = None

    # Callback configured when the job was created, if any.
    webhook_url: Optional[str] = None
|
|
|
|
|
|
class ReviewsResponse(BaseModel):
    """Reviews collected by one job, together with their count."""

    job_id: str
    # One dictionary per review, as stored by the database layer.
    reviews: List[Dict[str, Any]]
    # Number of entries in `reviews`.
    count: int
|
|
|
|
|
|
class StatsResponse(BaseModel):
    """Aggregate job statistics returned by GET /stats."""

    total_jobs: int

    # Job counts per status.
    pending: int
    running: int
    completed: int
    failed: int
    cancelled: int

    # Aggregates that may be absent.
    avg_scrape_time: Optional[float] = None
    total_reviews: Optional[int] = None
|
|
|
|
|
|
# ==================== API Endpoints ====================
|
|
|
|
@app.get("/", summary="API Health Check")
async def root():
    """Return a static banner describing the service (no dependencies are checked)."""
    banner = dict(
        message="Google Reviews Scraper API (Production)",
        status="healthy",
        version="2.0.0",
        features=["postgresql", "webhooks", "canary-testing"],
    )
    return banner
|
|
|
|
|
|
# Strong references to in-flight scraping tasks. The event loop keeps only
# weak references to tasks, so a task nobody else references can be
# garbage-collected before it finishes; each task removes itself when done.
_background_scrape_tasks: set = set()


@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
async def start_scrape(request: ScrapeRequest):
    """
    Start a new scraping job.

    The job runs asynchronously in the background. You can:
    - Poll GET /jobs/{job_id} for status
    - Provide webhook_url for automatic notification when complete

    Returns the job ID for tracking.

    Raises:
        HTTPException: 500 when the database is not initialized or the job
            record could not be created.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    try:
        # Create job in database
        job_id = await db.create_job(
            url=str(request.url),
            webhook_url=str(request.webhook_url) if request.webhook_url else None,
            webhook_secret=request.webhook_secret,
            metadata=request.metadata
        )
    except Exception as e:
        # log.exception records the traceback as well as the message.
        log.exception(f"Error creating scraping job: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to create scraping job: {str(e)}"
        ) from e

    # Start scraping job in background, keeping a reference until it finishes.
    task = asyncio.create_task(run_scraping_job(job_id))
    _background_scrape_tasks.add(task)
    task.add_done_callback(_background_scrape_tasks.discard)

    log.info(f"Created and started job {job_id}")

    return {
        "job_id": str(job_id),
        "status": "started",
        "message": "Scraping job started successfully"
    }
|
|
|
|
|
|
@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")
async def get_job(job_id: UUID):
    """
    Get detailed information about a specific job.

    Args:
        job_id: UUID of the job to look up.

    Returns:
        JobResponse built from the stored job record, including the
        ``total_reviews`` value written by progress updates when present.

    Raises:
        HTTPException: 500 when the database is not initialized, 404 when no
            job with this ID exists.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    job = await db.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    started_at = job.get('started_at')
    completed_at = job.get('completed_at')

    return JobResponse(
        job_id=str(job['job_id']),
        status=job['status'],
        url=job['url'],
        created_at=job['created_at'].isoformat(),
        started_at=started_at.isoformat() if started_at else None,
        completed_at=completed_at.isoformat() if completed_at else None,
        reviews_count=job.get('reviews_count'),
        # Previously never returned although the model declares it.
        total_reviews=job.get('total_reviews'),
        scrape_time=job.get('scrape_time'),
        error_message=job.get('error_message'),
        webhook_url=job.get('webhook_url')
    )
|
|
|
|
|
|
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: UUID):
    """
    Return the stored reviews of a job.

    Responds with 500 when the database is not initialized, 404 when the job
    does not exist or is completed without stored reviews, and 400 when the
    job exists but has not completed yet.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    stored_reviews = await db.get_job_reviews(job_id)
    if stored_reviews is not None:
        return ReviewsResponse(
            job_id=str(job_id),
            reviews=stored_reviews,
            count=len(stored_reviews),
        )

    # No reviews stored: look at the job itself to pick the right error.
    job = await db.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job['status'] != 'completed':
        raise HTTPException(
            status_code=400,
            detail=f"Job not completed yet (current status: {job['status']})",
        )

    raise HTTPException(status_code=404, detail="Reviews data not available")
|
|
|
|
|
|
@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
async def list_jobs(
    status: Optional[str] = Query(None, description="Filter by job status"),
    limit: int = Query(100, description="Maximum number of jobs to return", ge=1, le=1000),
    offset: int = Query(0, description="Number of jobs to skip", ge=0)
):
    """
    List all jobs, optionally filtered by status.

    Args:
        status: Case-insensitive JobStatus value to filter on, or None for all.
        limit: Maximum number of jobs to return (1-1000).
        offset: Number of jobs to skip.

    Returns:
        A list of JobResponse objects carrying the same fields as
        GET /jobs/{job_id} whenever the stored record provides them.

    Raises:
        HTTPException: 500 when the database is not initialized, 400 when
            ``status`` is not a valid JobStatus value.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    # Validate status if provided
    job_status = None
    if status:
        try:
            job_status = JobStatus(status.lower())
        except ValueError:
            # `from None`: the ValueError adds nothing to the client-facing error.
            raise HTTPException(
                status_code=400,
                detail=f"Invalid status. Must be one of: {[s.value for s in JobStatus]}"
            ) from None

    jobs = await db.list_jobs(status=job_status, limit=limit, offset=offset)

    return [
        JobResponse(
            job_id=str(job['job_id']),
            status=job['status'],
            url=job['url'],
            created_at=job['created_at'].isoformat(),
            started_at=job['started_at'].isoformat() if job.get('started_at') else None,
            completed_at=job['completed_at'].isoformat() if job.get('completed_at') else None,
            reviews_count=job.get('reviews_count'),
            total_reviews=job.get('total_reviews'),
            scrape_time=job.get('scrape_time'),
            error_message=job.get('error_message'),
            webhook_url=job.get('webhook_url')
        )
        for job in jobs
    ]
|
|
|
|
|
|
@app.delete("/jobs/{job_id}", summary="Delete Job")
async def delete_job(job_id: UUID):
    """Remove a job; 404 if the database layer reports nothing was deleted."""
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    was_removed = await db.delete_job(job_id)
    if was_removed:
        return {"message": "Job deleted successfully"}

    raise HTTPException(status_code=404, detail="Job not found")
|
|
|
|
|
|
def _is_session_error(message: Any) -> bool:
    """Return True when an error message mentions a browser session problem."""
    return 'session' in str(message or '').lower()


@app.post("/check-reviews", summary="Check if Reviews Exist")
async def check_reviews(request: ScrapeRequest):
    """
    Get business card information from Google Maps.
    Returns business name, address, rating, and review count.

    Uses a pre-warmed Chrome worker from the validation pool when one becomes
    available within 10 seconds; otherwise falls back to a temporary instance.
    This is used to show the business confirmation card in the UI.

    Returns:
        A dict with ``has_reviews`` (bool), ``total_reviews`` (int), ``name``,
        ``address``, ``rating``, ``success`` and ``error``. Failures are
        reported through ``success``/``error`` in the same shape rather than
        as an HTTP error; the legacy ``review_count`` key is still included
        in the failure payload for existing clients.
    """
    worker = None
    recycle_worker = False

    try:
        url = str(request.url)

        # Get pre-warmed worker from validation pool
        worker = await asyncio.to_thread(get_validation_worker, timeout=10)

        if worker:
            log.info(f"Using worker {worker.worker_id} for business card extraction")
            # Use the pooled worker (don't close it)
            result = await asyncio.to_thread(
                get_business_card_info,
                url=url,
                driver=worker.driver,
                return_driver=True
            )

            # A failed result that mentions the session means the pooled
            # browser is no longer usable and must be replaced.
            if not result.get('success') and _is_session_error(result.get('error')):
                log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
                recycle_worker = True
        else:
            # Fallback: create temporary worker
            log.warning("No pooled worker available, creating temporary instance")
            result = await asyncio.to_thread(
                get_business_card_info,
                url=url
            )

        # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews.
        # Let the actual scraper determine if reviews exist.
        # bool(): `a and b` alone would return the rating/name value (or None).
        has_business = bool(result.get('name') and result.get('rating'))

        return {
            "has_reviews": has_business,  # Assume true if business exists
            "total_reviews": result.get('total_reviews') or 0,  # Show 0 if unknown
            "name": result.get('name'),
            "address": result.get('address'),
            "rating": result.get('rating'),
            "success": result.get('success', False),
            "error": result.get('error')
        }

    except Exception as e:
        log.exception(f"Error checking reviews: {e}")
        # If it's a session error, recycle the worker
        if worker and _is_session_error(e):
            recycle_worker = True

        return {
            "has_reviews": False,
            "total_reviews": 0,
            "review_count": 0,  # legacy key, kept for backward compatibility
            "name": None,
            "address": None,
            "rating": None,
            "success": False,
            "error": str(e)
        }
    finally:
        # Release worker back to pool (or recycle if broken)
        if worker:
            await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
|
|
|
|
|
|
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
async def get_stats():
    """Return aggregate job statistics as reported by the database layer."""
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    # The mapping returned by the database layer is expanded into the model's fields.
    return StatsResponse(**(await db.get_stats()))
|
|
|
|
|
|
@app.get("/pool-stats", summary="Get Worker Pool Statistics")
async def pool_stats():
    """Report the Chrome worker pool statistics."""
    # get_pool_stats is a synchronous call, so run it off the event loop.
    snapshot = await asyncio.to_thread(get_pool_stats)
    return snapshot
|
|
|
|
|
|
# ==================== Health Check Endpoints ====================
|
|
|
|
@app.get("/health/live", summary="Liveness Probe")
async def liveness():
    """
    Liveness check: Is the server alive?

    Use this for Kubernetes liveness probe - restart container if fails.

    When the health check system is not running (it is currently disabled at
    startup to avoid canary-induced rate limiting), the fact that this handler
    executes is itself treated as proof of liveness and a minimal payload is
    returned. Answering 503 here would make an orchestrator restart a healthy
    process in a loop.
    """
    if not health_system:
        # NOTE(review): minimal payload; its shape is not guaranteed to match
        # what HealthCheckSystem.check_liveness() returns.
        return {"status": "alive", "health_system": "disabled"}

    return await health_system.check_liveness()
|
|
|
|
|
|
@app.get("/health/ready", summary="Readiness Probe")
async def readiness():
    """
    Readiness check: Can the server handle traffic?

    Intended for a Kubernetes readiness probe. Answers 503 when the health
    system is not initialized or when its reported status is anything other
    than "ready"; otherwise returns the readiness report unchanged.
    """
    if not health_system:
        raise HTTPException(status_code=503, detail="Health system not initialized")

    report = await health_system.check_readiness()
    if report["status"] == "ready":
        return report

    return JSONResponse(status_code=503, content=report)
|
|
|
|
|
|
@app.get("/health/canary", summary="Canary Health Check")
async def canary():
    """
    Canary check: Does scraping actually work?

    Returns the canary result supplied by the health system. Statuses
    "healthy" and "unknown" are passed through; any other status is returned
    with HTTP 503 so external monitoring (PagerDuty, DataDog) can alert on it.
    """
    if not health_system:
        raise HTTPException(status_code=503, detail="Health system not initialized")

    report = await health_system.check_canary()
    if report["status"] in ("healthy", "unknown"):
        return report

    return JSONResponse(status_code=503, content=report)
|
|
|
|
|
|
@app.get("/health/detailed", summary="Detailed Health Status")
async def detailed_health():
    """Return the health system's detailed report; 503 if it is not initialized."""
    if not health_system:
        raise HTTPException(status_code=503, detail="Health system not initialized")

    full_report = await health_system.get_detailed_health()
    return full_report
|
|
|
|
|
|
# ==================== Background Job Runner ====================
|
|
|
|
async def _send_job_webhook(job, job_id: UUID, status: str, **payload) -> None:
    """Send the job-finished webhook if the job has a webhook_url; never raises."""
    if not job or not job.get('webhook_url'):
        return
    try:
        await WebhookManager().send_job_completed_webhook(
            webhook_url=job['webhook_url'],
            job_id=job_id,
            status=status,
            secret=job.get('webhook_secret'),
            db=db,
            **payload
        )
    except Exception:
        # Notification is best-effort: a delivery problem must not change the
        # outcome already recorded for the job.
        log.exception(f"Failed to send '{status}' webhook for job {job_id}")


async def _drain_progress_updates(pending_updates, job_id: UUID) -> None:
    """Wait for every scheduled progress write to finish, logging any failure."""
    if not pending_updates:
        return
    outcomes = await asyncio.gather(
        *(asyncio.wrap_future(update) for update in pending_updates),
        return_exceptions=True
    )
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            log.warning(f"Progress update for job {job_id} failed: {outcome!r}")


async def run_scraping_job(job_id: UUID):
    """
    Run scraping job in background with concurrency limit.

    Marks the job RUNNING, scrapes in a worker thread while recording
    progress, then stores the result (or the failure) and sends the webhook
    when the job has one configured. This coroutine is started as a detached
    task, so no exception is allowed to escape it.

    Args:
        job_id: Job UUID
    """
    async with job_semaphore:  # Limit concurrent Chrome instances
        try:
            # Update status to running
            await db.update_job_status(job_id, JobStatus.RUNNING)
            log.info(f"Starting scraping job {job_id}")

            # Get job details
            job = await db.get_job(job_id)
            if not job:
                # Nothing left to scrape or to record a result against.
                log.warning(f"Job {job_id} no longer exists; skipping scrape")
                return
            url = job['url']

            # Event loop that progress updates from the worker thread are scheduled on
            loop = asyncio.get_running_loop()
            pending_updates = []

            def progress_callback(current_count: int, total_count: int):
                """Update job progress from worker thread"""
                async def update():
                    await db.update_job_status(
                        job_id,
                        JobStatus.RUNNING,
                        reviews_count=current_count,
                        total_reviews=total_count
                    )

                # Schedule the coroutine on the event loop and remember the
                # future so it can be awaited before the final status write.
                pending_updates.append(
                    asyncio.run_coroutine_threadsafe(update(), loop)
                )

            try:
                # Run scraping with progress callback
                result = await asyncio.to_thread(
                    fast_scrape_reviews,
                    url=url,
                    headless=True,
                    progress_callback=progress_callback
                )
            finally:
                # A RUNNING progress write still in flight could otherwise land
                # after, and overwrite, the COMPLETED/FAILED status below.
                await _drain_progress_updates(pending_updates, job_id)

            if result['success']:
                reviews = result['reviews']
                reviews_count = result.get('count', len(reviews))

                # Save results to database
                await db.save_job_result(
                    job_id=job_id,
                    reviews=reviews,
                    scrape_time=result['time'],
                    total_reviews=result.get('total_reviews')
                )

                log.info(
                    f"Completed job {job_id}: {reviews_count} reviews in {result['time']:.1f}s"
                )

                # Send webhook if configured
                api_base_url = os.getenv('API_BASE_URL', 'http://localhost:8000')
                await _send_job_webhook(
                    job,
                    job_id,
                    'completed',
                    reviews_count=reviews_count,
                    scrape_time=result['time'],
                    reviews_url=f"{api_base_url}/jobs/{job_id}/reviews"
                )
            else:
                # Job failed
                await db.update_job_status(
                    job_id,
                    JobStatus.FAILED,
                    error_message=result.get('error', 'Unknown error')
                )

                log.error(f"Failed job {job_id}: {result.get('error')}")

                # Send failure webhook if configured
                await _send_job_webhook(
                    job,
                    job_id,
                    'failed',
                    error_message=result.get('error')
                )

        except Exception as e:
            # log.exception records the traceback through the logging system.
            log.exception(f"Error in scraping job {job_id}: {e}")

            try:
                await db.update_job_status(
                    job_id,
                    JobStatus.FAILED,
                    error_message=str(e)
                )
                # Re-read the job: the failure may have happened before it was loaded.
                job = await db.get_job(job_id)
            except Exception:
                log.exception(f"Could not record failure of job {job_id}")
                return

            # Send failure webhook
            await _send_job_webhook(job, job_id, 'failed', error_message=str(e))
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import uvicorn

    port = int(os.getenv('PORT', 8000))
    log.info(f"Starting production server on port {port}...")

    server_options = dict(
        host="0.0.0.0",
        port=port,
        reload=False,  # Disable reload in production
        log_level="info",
    )
    uvicorn.run("api_server_production:app", **server_options)
|