Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
130
api_server.py
130
api_server.py
@@ -14,6 +14,8 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel, HttpUrl, Field
|
||||
|
||||
from modules.job_manager import JobManager, JobStatus, ScrapingJob
|
||||
from modules.chrome_pool import start_worker_pools, stop_worker_pools, get_pool_stats, get_validation_worker, release_validation_worker
|
||||
from modules.fast_scraper import check_reviews_available, get_business_card_info
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
@@ -30,21 +32,35 @@ job_manager: Optional[JobManager] = None
|
||||
async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown.

    Startup starts the Chrome worker pools, creates the module-level
    ``job_manager`` and launches the periodic job-cleanup task.
    Shutdown cancels that task, shuts the job manager down and stops the
    Chrome worker pools.
    """
    global job_manager

    # Startup
    log.info("Starting Google Reviews Scraper API Server")

    # Start Chrome worker pools
    log.info("Initializing Chrome worker pools...")
    start_worker_pools(
        validation_size=1,  # 1 pre-warmed worker for validation
        scraping_size=2,  # 2 pre-warmed workers for scraping
        headless=True,
    )

    job_manager = JobManager(max_concurrent_jobs=3)

    # Start auto-cleanup task. Keep a strong reference: the event loop only
    # holds weak references to tasks, so an unreferenced task may be
    # garbage-collected before it finishes.
    cleanup_task = asyncio.create_task(cleanup_jobs_periodically())

    try:
        yield
    finally:
        # Shutdown (also runs when an exception is thrown in at the yield)
        log.info("Shutting down Google Reviews Scraper API Server")

        # Stop the background cleanup loop and wait for it to unwind.
        # return_exceptions=True absorbs the task's own CancelledError or
        # failure so it cannot abort the rest of the shutdown sequence.
        cleanup_task.cancel()
        await asyncio.gather(cleanup_task, return_exceptions=True)

        if job_manager:
            job_manager.shutdown()

        # Stop Chrome worker pools
        log.info("Stopping Chrome worker pools...")
        stop_worker_pools()
|
||||
|
||||
|
||||
# Initialize FastAPI app
|
||||
app = FastAPI(
|
||||
@@ -68,7 +84,8 @@ app.add_middleware(
|
||||
class ScrapeRequest(BaseModel):
|
||||
"""Request model for starting a scrape job"""
|
||||
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
|
||||
headless: Optional[bool] = Field(None, description="Run Chrome in headless mode")
|
||||
headless: Optional[bool] = Field(None, description="Run Chrome in headless mode (default: True)")
|
||||
max_scrolls: Optional[int] = Field(None, description="Maximum scrolls (default: unlimited - stops via idle detection)")
|
||||
sort_by: Optional[str] = Field(None, description="Sort order: newest, highest, lowest, relevance")
|
||||
stop_on_match: Optional[bool] = Field(None, description="Stop when first already-seen review is encountered")
|
||||
overwrite_existing: Optional[bool] = Field(None, description="Overwrite existing reviews instead of appending")
|
||||
@@ -85,10 +102,13 @@ class JobResponse(BaseModel):
|
||||
created_at: str
|
||||
started_at: Optional[str] = None
|
||||
completed_at: Optional[str] = None
|
||||
updated_at: Optional[str] = None # Last update time for progress tracking
|
||||
error_message: Optional[str] = None
|
||||
reviews_count: Optional[int] = None
|
||||
total_reviews: Optional[int] = None # Total reviews available for this place
|
||||
images_count: Optional[int] = None
|
||||
progress: Optional[Dict[str, Any]] = None
|
||||
scrape_time: Optional[float] = None # Time taken to scrape in seconds
|
||||
|
||||
|
||||
class JobStatsResponse(BaseModel):
|
||||
@@ -99,6 +119,13 @@ class JobStatsResponse(BaseModel):
|
||||
max_concurrent_jobs: int
|
||||
|
||||
|
||||
class ReviewsResponse(BaseModel):
    """Response model for reviews data"""

    # Identifier of the job the reviews belong to.
    job_id: str
    # One free-form dict per review.
    reviews: List[Dict[str, Any]]
    # Number of reviews; the endpoint in this file sets it to len(reviews).
    count: int
|
||||
|
||||
|
||||
# Background task for periodic cleanup
|
||||
async def cleanup_jobs_periodically():
|
||||
"""Periodically clean up old jobs"""
|
||||
@@ -166,14 +193,44 @@ async def get_job(job_id: str):
|
||||
"""Get detailed information about a specific job"""
|
||||
if not job_manager:
|
||||
raise HTTPException(status_code=500, detail="Job manager not initialized")
|
||||
|
||||
|
||||
job = job_manager.get_job(job_id)
|
||||
if not job:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
|
||||
return JobResponse(**job.to_dict())
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: str):
    """
    Get the actual reviews data for a completed job.

    Returns 404 if the job is not found or its reviews data is not available,
    and 400 if the job exists but has not completed yet.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    reviews = job_manager.get_job_reviews(job_id)
    if reviews is not None:
        return ReviewsResponse(
            job_id=job_id,
            reviews=reviews,
            count=len(reviews),
        )

    # No reviews came back: look the job up to report the precise reason.
    job = job_manager.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.status != JobStatus.COMPLETED:
        raise HTTPException(
            status_code=400,
            detail=f"Job not completed yet (current status: {job.status})",
        )

    # The job is completed but the manager has no reviews stored for it.
    raise HTTPException(status_code=404, detail="Reviews data not available")
|
||||
|
||||
|
||||
@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
|
||||
async def list_jobs(
|
||||
status: Optional[JobStatus] = Query(None, description="Filter by job status"),
|
||||
@@ -246,12 +303,69 @@ async def get_stats():
|
||||
return JobStatsResponse(**stats)
|
||||
|
||||
|
||||
@app.post("/check-reviews", summary="Check if Business Has Reviews")
async def check_reviews(request: Dict[str, str]):
    """
    Lightweight validation endpoint to check if a business has reviews.
    Uses the Chrome validation pool for fast response.

    Returns business name, rating, address, and review count.
    """
    url = request.get("url")
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    log.info(f"Validating business at: {url}")

    # Get a worker from validation pool
    # NOTE(review): this call and get_business_card_info() below look
    # synchronous, so they would block the event loop while running. Confirm
    # and consider moving them to a thread executor.
    worker = get_validation_worker(timeout=10)

    if not worker:
        raise HTTPException(
            status_code=503,
            detail="No validation workers available. Please try again in a few seconds."
        )

    # Set when validation raised, so the worker is recycled instead of reused.
    failed = False
    try:
        # Use the worker's driver to get business card info (faster than check_reviews_available)
        result = get_business_card_info(
            url=url,
            headless=True,
            driver=worker.driver,
            return_driver=True  # Don't close the driver
        )

        # Pop the driver from result before returning
        result.pop('driver', None)

        log.info(f"Validation result: name={result.get('name')}, rating={result.get('rating')}, reviews={result.get('total_reviews')}")
        return result

    except Exception as e:
        log.error(f"Error during validation: {e}")
        failed = True
        raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}") from e

    finally:
        # Hand the worker back exactly once. Previously the error path
        # released it here a second time whenever worker.driver was still
        # set after the recycle call in the except branch.
        if failed:
            # Recycle worker if there was an error
            release_validation_worker(worker, recycle=True)
        elif worker.driver:
            release_validation_worker(worker, recycle=False)
|
||||
|
||||
|
||||
@app.get("/pool-stats", summary="Get Chrome Pool Statistics")
async def pool_stats():
    """Get statistics about Chrome worker pools"""
    # Pass through whatever the pool module reports, unmodified.
    return get_pool_stats()
|
||||
|
||||
|
||||
@app.post("/cleanup", summary="Manual Job Cleanup")
async def cleanup_jobs(max_age_hours: int = Query(24, description="Maximum age in hours", ge=1)):
    """Manually trigger cleanup of old completed/failed jobs"""
    # The manager is created during application startup; without it there is
    # nothing to clean.
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    job_manager.cleanup_old_jobs(max_age_hours=max_age_hours)

    confirmation = f"Cleaned up jobs older than {max_age_hours} hours"
    return {"message": confirmation}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user