Added api support, now the scrapper can be triggered from 3rd party services

2025-06-03 00:23:22 +07:00
parent 0b561f7618
commit dddf388422
4 changed files with 628 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@
  - Hoards local paths or swaps URLs to your domain like a boss
  - Multi-threaded downloading that would make NASA jealous
  - **S3 Cloud Storage**: Auto-upload images to AWS S3 with custom folder structure
 - **REST API Server**: Trigger scraping jobs via HTTP endpoints with background processing
 - **Time-Bending Magic**: Transforms Google's vague "2 weeks ago" garbage into precise ISO timestamps
 - **Sort Any Damn Way**: Newest, highest, lowest, relevance - we've got you covered
 - **Metadata on Steroids**: Inject custom parameters into every review record
@@ -126,13 +127,54 @@ custom_params:
 ## 🖥️ Unleashing Hell
-### No-Frills, Get-It-Done Usage
+### Command Line Usage
 ```bash
 python start.py --url "https://maps.app.goo.gl/YOUR_URL"
 # Boom. That's it. Now go grab a coffee while the magic happens.
 ```
 ### 🚀 API Server Mode (NEW!)
 Want to trigger scraping jobs via REST API? We've got you covered:
 ```bash
 # Start the API server
 python api_server.py
 # Server runs on http://localhost:8000
 ```
 #### API Endpoints:
 **Start a scraping job:**
 ```bash
 curl -X POST "http://localhost:8000/scrape" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://maps.app.goo.gl/YOUR_URL",
    "headless": true,
    "sort_by": "newest",
    "download_images": true
  }'
 ```
 **Check job status:**
 ```bash
 curl "http://localhost:8000/jobs/{job_id}"
 ```
 **List all jobs:**
 ```bash
 curl "http://localhost:8000/jobs"
 ```
 **Get job statistics:**
 ```bash
 curl "http://localhost:8000/stats"
 ```
 **Interactive API docs available at:** `http://localhost:8000/docs`
 ### Battle-Tested Recipes
 1. Stealth Mode + Fresh Stuff First:
--- a/api_server.py
+++ b/api_server.py
@@ -0,0 +1,269 @@
 #!/usr/bin/env python3
 """
 FastAPI server for Google Reviews Scraper.
 Provides REST API endpoints to trigger and manage scraping jobs.
 """
 import logging
 import asyncio
 from contextlib import asynccontextmanager
 from typing import Dict, Any, List, Optional
 from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, HttpUrl, Field
 from modules.job_manager import JobManager, JobStatus, ScrapingJob
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 log = logging.getLogger("api_server")
 # Global job manager instance
 job_manager: Optional[JobManager] = None
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown"""
    global job_manager
    # Startup
    log.info("Starting Google Reviews Scraper API Server")
    job_manager = JobManager(max_concurrent_jobs=3)
    # Start auto-cleanup task
    asyncio.create_task(cleanup_jobs_periodically())
    yield
    # Shutdown
    log.info("Shutting down Google Reviews Scraper API Server")
    if job_manager:
        job_manager.shutdown()
 # Initialize FastAPI app
 app = FastAPI(
    title="Google Reviews Scraper API",
    description="REST API for triggering and managing Google Maps review scraping jobs",
    version="1.0.0",
    lifespan=lifespan
 )
 # Add CORS middleware
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Pydantic models for API
 class ScrapeRequest(BaseModel):
    """Request model for starting a scrape job"""
    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
    headless: Optional[bool] = Field(None, description="Run Chrome in headless mode")
    sort_by: Optional[str] = Field(None, description="Sort order: newest, highest, lowest, relevance")
    stop_on_match: Optional[bool] = Field(None, description="Stop when first already-seen review is encountered")
    overwrite_existing: Optional[bool] = Field(None, description="Overwrite existing reviews instead of appending")
    download_images: Optional[bool] = Field(None, description="Download images from reviews")
    use_s3: Optional[bool] = Field(None, description="Upload images to S3")
    custom_params: Optional[Dict[str, Any]] = Field(None, description="Custom parameters to add to each document")
 class JobResponse(BaseModel):
    """Response model for job information"""
    job_id: str
    status: JobStatus
    url: str
    created_at: str
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    error_message: Optional[str] = None
    reviews_count: Optional[int] = None
    images_count: Optional[int] = None
    progress: Optional[Dict[str, Any]] = None
 class JobStatsResponse(BaseModel):
    """Response model for job statistics"""
    total_jobs: int
    by_status: Dict[str, int]
    running_jobs: int
    max_concurrent_jobs: int
 # Background task for periodic cleanup
 async def cleanup_jobs_periodically():
    """Periodically clean up old jobs"""
    while True:
        await asyncio.sleep(3600)  # Run every hour
        if job_manager:
            job_manager.cleanup_old_jobs(max_age_hours=24)
 # API Endpoints
@app.get("/", summary="API Health Check")
 async def root():
    """Health check endpoint"""
    return {
        "message": "Google Reviews Scraper API is running",
        "status": "healthy",
        "version": "1.0.0"
    }
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
 async def start_scrape(request: ScrapeRequest, background_tasks: BackgroundTasks):
    """
    Start a new scraping job in the background.
    Returns the job ID that can be used to check status.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    # Prepare config overrides
    config_overrides = {}
    # Only include non-None values
    for field, value in request.dict().items():
        if value is not None and field != "url":
            config_overrides[field] = value
    # Convert URL to string
    url = str(request.url)
    try:
        # Create job
        job_id = job_manager.create_job(url, config_overrides)
        # Start job immediately if possible
        started = job_manager.start_job(job_id)
        log.info(f"Created scraping job {job_id} for URL: {url}")
        return {
            "job_id": job_id,
            "status": "started" if started else "queued",
            "message": f"Scraping job {'started' if started else 'queued'} successfully"
        }
    except Exception as e:
        log.error(f"Error creating scraping job: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")
 async def get_job(job_id: str):
    """Get detailed information about a specific job"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    job = job_manager.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    return JobResponse(**job.to_dict())
@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
 async def list_jobs(
    status: Optional[JobStatus] = Query(None, description="Filter by job status"),
    limit: int = Query(100, description="Maximum number of jobs to return", ge=1, le=1000)
 ):
    """List all jobs, optionally filtered by status"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    jobs = job_manager.list_jobs(status=status, limit=limit)
    return [JobResponse(**job.to_dict()) for job in jobs]
@app.post("/jobs/{job_id}/start", summary="Start Pending Job")
 async def start_job(job_id: str):
    """Start a pending job manually"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    started = job_manager.start_job(job_id)
    if not started:
        job = job_manager.get_job(job_id)
        if not job:
            raise HTTPException(status_code=404, detail="Job not found")
        if job.status != JobStatus.PENDING:
            raise HTTPException(status_code=400, detail=f"Job is not pending (current status: {job.status})")
        raise HTTPException(status_code=429, detail="Maximum concurrent jobs reached")
    return {"message": "Job started successfully"}
@app.post("/jobs/{job_id}/cancel", summary="Cancel Job")
 async def cancel_job(job_id: str):
    """Cancel a pending or running job"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    cancelled = job_manager.cancel_job(job_id)
    if not cancelled:
        job = job_manager.get_job(job_id)
        if not job:
            raise HTTPException(status_code=404, detail="Job not found")
        raise HTTPException(status_code=400, detail="Job cannot be cancelled (already completed, failed, or cancelled)")
    return {"message": "Job cancelled successfully"}
@app.delete("/jobs/{job_id}", summary="Delete Job")
 async def delete_job(job_id: str):
    """Delete a job from the system"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    deleted = job_manager.delete_job(job_id)
    if not deleted:
        raise HTTPException(status_code=404, detail="Job not found")
    return {"message": "Job deleted successfully"}
@app.get("/stats", response_model=JobStatsResponse, summary="Get Job Statistics")
 async def get_stats():
    """Get job manager statistics"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    stats = job_manager.get_stats()
    return JobStatsResponse(**stats)
@app.post("/cleanup", summary="Manual Job Cleanup")
 async def cleanup_jobs(max_age_hours: int = Query(24, description="Maximum age in hours", ge=1)):
    """Manually trigger cleanup of old completed/failed jobs"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")
    job_manager.cleanup_old_jobs(max_age_hours=max_age_hours)
    return {"message": f"Cleaned up jobs older than {max_age_hours} hours"}
 if __name__ == "__main__":
    import uvicorn
    log.info("Starting FastAPI server...")
    uvicorn.run(
        "api_server:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
        log_level="info"
    )
--- a/modules/job_manager.py
+++ b/modules/job_manager.py
@@ -0,0 +1,311 @@
 """
 Background job manager for Google Reviews Scraper.
 """
 import asyncio
 import logging
 import threading
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from enum import Enum
 from typing import Dict, Any, Optional, List
 from dataclasses import dataclass, asdict
 from modules.config import load_config
 from modules.scraper import GoogleReviewsScraper
 log = logging.getLogger("scraper")
 class JobStatus(str, Enum):
    """Job status enumeration"""
    PENDING = "pending"
    RUNNING = "running" 
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
@dataclass
 class ScrapingJob:
    """Scraping job data class"""
    job_id: str
    status: JobStatus
    url: str
    config: Dict[str, Any]
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None
    reviews_count: Optional[int] = None
    images_count: Optional[int] = None
    progress: Dict[str, Any] = None
    def to_dict(self) -> Dict[str, Any]:
        """Convert job to dictionary for JSON serialization"""
        data = asdict(self)
        # Convert datetime objects to ISO strings
        for field in ['created_at', 'started_at', 'completed_at']:
            if data[field]:
                data[field] = data[field].isoformat()
        return data
 class JobManager:
    """Manager for background scraping jobs"""
    def __init__(self, max_concurrent_jobs: int = 3):
        """Initialize job manager"""
        self.max_concurrent_jobs = max_concurrent_jobs
        self.jobs: Dict[str, ScrapingJob] = {}
        self.executor = ThreadPoolExecutor(max_workers=max_concurrent_jobs)
        self.lock = threading.Lock()
    def create_job(self, url: str, config_overrides: Dict[str, Any] = None) -> str:
        """
        Create a new scraping job.
        Args:
            url: Google Maps URL to scrape
            config_overrides: Optional config overrides
        Returns:
            Job ID
        """
        job_id = str(uuid.uuid4())
        # Load base config
        config = load_config()
        # Apply URL
        config["url"] = url
        # Apply any overrides
        if config_overrides:
            config.update(config_overrides)
        job = ScrapingJob(
            job_id=job_id,
            status=JobStatus.PENDING,
            url=url,
            config=config,
            created_at=datetime.now(),
            progress={"stage": "created", "message": "Job created and queued"}
        )
        with self.lock:
            self.jobs[job_id] = job
        log.info(f"Created scraping job {job_id} for URL: {url}")
        return job_id
    def start_job(self, job_id: str) -> bool:
        """
        Start a pending job.
        Args:
            job_id: Job ID to start
        Returns:
            True if job was started, False otherwise
        """
        with self.lock:
            if job_id not in self.jobs:
                return False
            job = self.jobs[job_id]
            if job.status != JobStatus.PENDING:
                return False
            # Check if we can start more jobs
            running_count = sum(1 for j in self.jobs.values() if j.status == JobStatus.RUNNING)
            if running_count >= self.max_concurrent_jobs:
                return False
            job.status = JobStatus.RUNNING
            job.started_at = datetime.now()
            job.progress = {"stage": "starting", "message": "Initializing scraper"}
        # Submit job to thread pool
        future = self.executor.submit(self._run_scraping_job, job_id)
        log.info(f"Started scraping job {job_id}")
        return True
    def _run_scraping_job(self, job_id: str):
        """
        Run the actual scraping job in background thread.
        Args:
            job_id: Job ID to run
        """
        try:
            with self.lock:
                job = self.jobs[job_id]
                job.progress = {"stage": "initializing", "message": "Setting up scraper"}
            # Create scraper with job config
            scraper = GoogleReviewsScraper(job.config)
            # Hook into scraper progress (if available)
            # This would require modifying the scraper to report progress
            with self.lock:
                job.progress = {"stage": "scraping", "message": "Scraping reviews in progress"}
            # Run the scraping
            scraper.scrape()
            # Mark job as completed
            with self.lock:
                job.status = JobStatus.COMPLETED
                job.completed_at = datetime.now()
                job.progress = {"stage": "completed", "message": "Scraping completed successfully"}
                # Try to get results count if available
                # This would require scraper to return results
                job.reviews_count = getattr(scraper, 'total_reviews', None)
                job.images_count = getattr(scraper, 'total_images', None)
            log.info(f"Completed scraping job {job_id}")
        except Exception as e:
            log.error(f"Error in scraping job {job_id}: {e}")
            with self.lock:
                job = self.jobs[job_id]
                job.status = JobStatus.FAILED
                job.completed_at = datetime.now()
                job.error_message = str(e)
                job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}
    def get_job(self, job_id: str) -> Optional[ScrapingJob]:
        """
        Get job by ID.
        Args:
            job_id: Job ID
        Returns:
            Job object or None if not found
        """
        with self.lock:
            return self.jobs.get(job_id)
    def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
        """
        List jobs, optionally filtered by status.
        Args:
            status: Optional status filter
            limit: Maximum number of jobs to return
        Returns:
            List of jobs
        """
        with self.lock:
            jobs = list(self.jobs.values())
        if status:
            jobs = [job for job in jobs if job.status == status]
        # Sort by creation time (newest first)
        jobs.sort(key=lambda x: x.created_at, reverse=True)
        return jobs[:limit]
    def cancel_job(self, job_id: str) -> bool:
        """
        Cancel a pending or running job.
        Args:
            job_id: Job ID to cancel
        Returns:
            True if job was cancelled, False otherwise
        """
        with self.lock:
            if job_id not in self.jobs:
                return False
            job = self.jobs[job_id]
            if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
                return False
            job.status = JobStatus.CANCELLED
            job.completed_at = datetime.now()
            job.progress = {"stage": "cancelled", "message": "Job was cancelled"}
        log.info(f"Cancelled scraping job {job_id}")
        return True
    def delete_job(self, job_id: str) -> bool:
        """
        Delete a job from the manager.
        Args:
            job_id: Job ID to delete
        Returns:
            True if job was deleted, False otherwise
        """
        with self.lock:
            if job_id not in self.jobs:
                return False
            del self.jobs[job_id]
        log.info(f"Deleted scraping job {job_id}")
        return True
    def get_stats(self) -> Dict[str, Any]:
        """
        Get job manager statistics.
        Returns:
            Statistics dictionary
        """
        with self.lock:
            jobs = list(self.jobs.values())
        stats = {
            "total_jobs": len(jobs),
            "by_status": {},
            "running_jobs": 0,
            "max_concurrent_jobs": self.max_concurrent_jobs
        }
        for status in JobStatus:
            count = sum(1 for job in jobs if job.status == status)
            stats["by_status"][status.value] = count
        stats["running_jobs"] = stats["by_status"].get(JobStatus.RUNNING.value, 0)
        return stats
    def cleanup_old_jobs(self, max_age_hours: int = 24):
        """
        Clean up old completed/failed jobs.
        Args:
            max_age_hours: Maximum age in hours before cleanup
        """
        cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)
        with self.lock:
            to_delete = []
            for job_id, job in self.jobs.items():
                if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
                    if job.completed_at and job.completed_at.timestamp() < cutoff_time:
                        to_delete.append(job_id)
            for job_id in to_delete:
                del self.jobs[job_id]
        if to_delete:
            log.info(f"Cleaned up {len(to_delete)} old jobs")
    def shutdown(self):
        """Shutdown the job manager"""
        log.info("Shutting down job manager")
        self.executor.shutdown(wait=True)
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,7 @@ webdriver-manager==4.0.2
 setuptools==79.0.1
 boto3==1.35.1
 pytest==7.4.3
 fastapi==0.104.1
 uvicorn==0.24.0
 botocore~=1.35.99
 pydantic~=2.11.5