Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector matched, to aid debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

View File

@@ -14,6 +14,8 @@ from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl, Field
from modules.job_manager import JobManager, JobStatus, ScrapingJob
from modules.chrome_pool import start_worker_pools, stop_worker_pools, get_pool_stats, get_validation_worker, release_validation_worker
from modules.fast_scraper import check_reviews_available, get_business_card_info
# Configure logging
logging.basicConfig(
@@ -30,21 +32,35 @@ job_manager: Optional[JobManager] = None
async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown.

    Startup starts the Chrome worker pools, creates the global ``job_manager``
    and schedules the periodic job-cleanup task. Shutdown cancels that task,
    shuts the job manager down and stops the worker pools.

    Args:
        app: The FastAPI application this lifespan is attached to (unused here).
    """
    global job_manager

    # Startup
    log.info("Starting Google Reviews Scraper API Server")

    # Start Chrome worker pools
    log.info("Initializing Chrome worker pools...")
    start_worker_pools(
        validation_size=1,  # 1 pre-warmed worker for validation
        scraping_size=2,  # 2 pre-warmed workers for scraping
        headless=True
    )

    job_manager = JobManager(max_concurrent_jobs=3)

    # Start auto-cleanup task. Keep a strong reference: the event loop only
    # holds weak references to tasks, so an unreferenced task may be
    # garbage-collected before it finishes.
    cleanup_task = asyncio.create_task(cleanup_jobs_periodically())

    try:
        yield
    finally:
        # Shutdown (in ``finally`` so it also runs when an error is thrown in
        # at the ``yield``)
        log.info("Shutting down Google Reviews Scraper API Server")

        # Stop the periodic cleanup before tearing down what it operates on.
        cleanup_task.cancel()
        try:
            await cleanup_task
        except asyncio.CancelledError:
            # Expected: this is the cancellation requested just above.
            pass
        except Exception:
            # The task had already failed on its own; report it but keep
            # shutting down.
            log.exception("Periodic cleanup task terminated with an error")

        if job_manager:
            job_manager.shutdown()

        # Stop Chrome worker pools
        log.info("Stopping Chrome worker pools...")
        stop_worker_pools()
# Initialize FastAPI app
app = FastAPI(
@@ -68,7 +84,8 @@ app.add_middleware(
class ScrapeRequest(BaseModel):
"""Request model for starting a scrape job"""
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
headless: Optional[bool] = Field(None, description="Run Chrome in headless mode")
headless: Optional[bool] = Field(None, description="Run Chrome in headless mode (default: True)")
max_scrolls: Optional[int] = Field(None, description="Maximum scrolls (default: unlimited - stops via idle detection)")
sort_by: Optional[str] = Field(None, description="Sort order: newest, highest, lowest, relevance")
stop_on_match: Optional[bool] = Field(None, description="Stop when first already-seen review is encountered")
overwrite_existing: Optional[bool] = Field(None, description="Overwrite existing reviews instead of appending")
@@ -85,10 +102,13 @@ class JobResponse(BaseModel):
created_at: str
started_at: Optional[str] = None
completed_at: Optional[str] = None
updated_at: Optional[str] = None # Last update time for progress tracking
error_message: Optional[str] = None
reviews_count: Optional[int] = None
total_reviews: Optional[int] = None # Total reviews available for this place
images_count: Optional[int] = None
progress: Optional[Dict[str, Any]] = None
scrape_time: Optional[float] = None # Time taken to scrape in seconds
class JobStatsResponse(BaseModel):
@@ -99,6 +119,13 @@ class JobStatsResponse(BaseModel):
max_concurrent_jobs: int
class ReviewsResponse(BaseModel):
    """Response model for reviews data"""

    # Identifier of the job the reviews belong to.
    job_id: str
    # Review records as free-form dictionaries.
    reviews: List[Dict[str, Any]]
    # Number of entries in ``reviews`` (the endpoint sets it to len(reviews)).
    count: int
# Background task for periodic cleanup
async def cleanup_jobs_periodically():
"""Periodically clean up old jobs"""
@@ -166,14 +193,44 @@ async def get_job(job_id: str):
"""Get detailed information about a specific job"""
if not job_manager:
raise HTTPException(status_code=500, detail="Job manager not initialized")
job = job_manager.get_job(job_id)
if not job:
raise HTTPException(status_code=404, detail="Job not found")
return JobResponse(**job.to_dict())
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: str):
    """
    Get the actual reviews data for a completed job.

    Returns 404 if the job is not found or its reviews data is not available,
    400 if the job has not completed yet, and 500 if the job manager is not
    initialized.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    reviews = job_manager.get_job_reviews(job_id)

    if reviews is None:
        # No reviews came back: look the job up so the most specific error
        # can be reported to the caller.
        job = job_manager.get_job(job_id)
        if not job:
            raise HTTPException(status_code=404, detail="Job not found")
        if job.status != JobStatus.COMPLETED:
            raise HTTPException(
                status_code=400,
                detail=f"Job not completed yet (current status: {job.status})"
            )
        # The job exists and is completed, yet no reviews are stored for it.
        raise HTTPException(status_code=404, detail="Reviews data not available")

    return ReviewsResponse(
        job_id=job_id,
        reviews=reviews,
        count=len(reviews)
    )
@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
async def list_jobs(
status: Optional[JobStatus] = Query(None, description="Filter by job status"),
@@ -246,12 +303,69 @@ async def get_stats():
return JobStatsResponse(**stats)
@app.post("/check-reviews", summary="Check if Business Has Reviews")
async def check_reviews(request: Dict[str, str]):
    """
    Lightweight validation endpoint to check if a business has reviews.
    Uses the Chrome validation pool for fast response.
    Returns business name, rating, address, and review count.

    Responds with 400 when no URL is supplied, 503 when no validation worker
    is available, and 500 when the validation itself fails.
    """
    url = request.get("url")
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    log.info(f"Validating business at: {url}")

    # Get a worker from validation pool
    worker = get_validation_worker(timeout=10)
    if not worker:
        raise HTTPException(
            status_code=503,
            detail="No validation workers available. Please try again in a few seconds."
        )

    # Set when the validation failed, so the worker is recycled rather than
    # reused. The release itself happens exactly once, in ``finally``.
    recycle_worker = False
    try:
        # Use the worker's driver to get business card info (faster than check_reviews_available)
        result = get_business_card_info(
            url=url,
            headless=True,
            driver=worker.driver,
            return_driver=True  # Don't close the driver
        )

        # Pop the driver from result before returning
        result.pop('driver', None)

        log.info(f"Validation result: name={result.get('name')}, rating={result.get('rating')}, reviews={result.get('total_reviews')}")

        return result

    except Exception as e:
        log.error(f"Error during validation: {e}")
        recycle_worker = True
        raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}") from e

    finally:
        if recycle_worker:
            # Recycle worker if there was an error. Doing it here, and not
            # also in ``except``, guarantees the worker is never handed back
            # to the pool twice.
            release_validation_worker(worker, recycle=True)
        elif worker.driver:
            # Release worker back to pool.
            # NOTE(review): a worker whose driver is falsy is not released on
            # the success path (unchanged from before) — confirm intended.
            release_validation_worker(worker, recycle=False)
@app.get("/pool-stats", summary="Get Chrome Pool Statistics")
async def pool_stats():
    """Get statistics about Chrome worker pools"""
    # Hand back whatever the pool module reports, unmodified.
    return get_pool_stats()
@app.post("/cleanup", summary="Manual Job Cleanup")
async def cleanup_jobs(max_age_hours: int = Query(24, description="Maximum age in hours", ge=1)):
    """Manually trigger cleanup of old completed/failed jobs"""
    if job_manager:
        # Delegate the actual cleanup to the job manager, then confirm the
        # threshold that was applied.
        job_manager.cleanup_old_jobs(max_age_hours=max_age_hours)
        return {"message": f"Cleaned up jobs older than {max_age_hours} hours"}

    # The manager is only created during application startup.
    raise HTTPException(status_code=500, detail="Job manager not initialized")