Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,8 @@ from dataclasses import dataclass, asdict
|
||||
|
||||
from modules.config import load_config
|
||||
from modules.scraper import GoogleReviewsScraper
|
||||
from modules.fast_scraper import fast_scrape_reviews
|
||||
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
|
||||
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
@@ -38,18 +40,32 @@ class ScrapingJob:
|
||||
created_at: datetime
|
||||
started_at: Optional[datetime] = None
|
||||
completed_at: Optional[datetime] = None
|
||||
updated_at: Optional[datetime] = None # Last update time (for progress tracking)
|
||||
error_message: Optional[str] = None
|
||||
reviews_count: Optional[int] = None
|
||||
total_reviews: Optional[int] = None # Total reviews available (from page counter)
|
||||
images_count: Optional[int] = None
|
||||
progress: Dict[str, Any] = None
|
||||
reviews_data: Optional[List[Dict[str, Any]]] = None # Store actual review data
|
||||
scrape_time: Optional[float] = None # Time taken to scrape
|
||||
|
||||
def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
    """
    Convert job to dictionary for JSON serialization

    All timestamp fields, including ``updated_at``, are emitted as ISO 8601
    strings; timestamps that are unset stay ``None``.

    Args:
        include_reviews: Whether to include the full reviews data (default: False)

    Returns:
        A plain dictionary built with ``dataclasses.asdict``.
    """
    data = asdict(self)

    # Convert datetime objects to ISO strings. ``updated_at`` must be listed
    # here too, otherwise it would be left as a raw datetime in the output.
    for field_name in ('created_at', 'started_at', 'completed_at', 'updated_at'):
        value = data[field_name]
        if value is not None:
            data[field_name] = value.isoformat()

    # Exclude reviews_data by default (can be large)
    if not include_reviews:
        data.pop('reviews_data', None)

    return data
|
||||
|
||||
|
||||
@@ -126,6 +142,7 @@ class JobManager:
|
||||
|
||||
job.status = JobStatus.RUNNING
|
||||
job.started_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.progress = {"stage": "starting", "message": "Initializing scraper"}
|
||||
|
||||
# Submit job to thread pool
|
||||
@@ -137,61 +154,139 @@ class JobManager:
|
||||
def _run_scraping_job(self, job_id: str):
    """
    Run the actual scraping job in background thread.

    Acquires a Chrome worker from the scraping pool, runs
    ``fast_scrape_reviews`` with the job's configuration, and records the
    outcome (status, timestamps, counts, review data or error message) on
    the job while holding ``self.lock``. The worker is recycled when an
    exception occurs and otherwise released back to the pool as-is.

    Args:
        job_id: Job ID to run
    """
    def progress_callback(current_count: int, total_count: int):
        """Update job progress during scraping"""
        # Calculate percentage for better UX. Guard against a missing (None),
        # zero or negative total so the callback can never raise mid-scrape.
        if total_count and total_count > 0:
            percentage = int(current_count / total_count * 100)
        else:
            percentage = 0

        with self.lock:
            job = self.jobs.get(job_id)
            if job:
                job.reviews_count = current_count
                job.total_reviews = total_count
                job.updated_at = datetime.now()  # Update last update time
                job.progress = {
                    "stage": "scraping",
                    "message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
                    "percentage": percentage
                }

    worker = None
    try:
        with self.lock:
            job = self.jobs[job_id]
            job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}

        # Get a worker from the scraping pool
        worker = get_scraping_worker(timeout=30)

        if not worker:
            raise Exception("No Chrome workers available. Pool may be at capacity.")

        log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")

        # Get config
        url = job.config.get('url')
        headless = job.config.get('headless', True)  # Default to headless
        max_scrolls = job.config.get('max_scrolls', 999999)  # Effectively unlimited - relies on idle detection

        with self.lock:
            job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}

        # Run the FAST scraping with progress callback using pooled worker
        result = fast_scrape_reviews(
            url=url,
            headless=headless,
            max_scrolls=max_scrolls,
            progress_callback=progress_callback,
            driver=worker.driver,  # Use worker's driver
            return_driver=True  # Don't close the driver
        )

        # Pop the driver from result before storing
        result.pop('driver', None)

        # Mark job as completed or failed
        with self.lock:
            # One timestamp so completed_at and updated_at agree exactly.
            finished_at = datetime.now()
            job.completed_at = finished_at
            job.updated_at = finished_at

            if result['success']:
                job.status = JobStatus.COMPLETED
                job.reviews_count = result['count']
                job.total_reviews = result.get('total_reviews')  # Store total review count from page
                job.reviews_data = result['reviews']  # Store the actual reviews
                job.scrape_time = result['time']
                job.progress = {
                    "stage": "completed",
                    "message": f"Scraping completed successfully in {result['time']:.1f}s",
                    "scroll_time": result.get('scroll_time'),
                    "extract_time": result.get('extract_time')
                }
                log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
            else:
                job.status = JobStatus.FAILED
                job.error_message = result.get('error', 'Unknown error')
                job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
                log.error(f"Failed scraping job {job_id}: {result.get('error')}")

    except Exception as e:
        # log.exception records the traceback through the module logger
        # rather than printing it to stderr.
        log.exception(f"Error in scraping job {job_id}: {e}")

        with self.lock:
            # Use .get(): if the job id is unknown, indexing here would raise
            # a second KeyError from inside the handler, masking the original
            # error and skipping the worker cleanup below.
            job = self.jobs.get(job_id)
            if job:
                failed_at = datetime.now()
                job.status = JobStatus.FAILED
                job.completed_at = failed_at
                job.updated_at = failed_at
                job.error_message = str(e)
                job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}

        # Recycle worker on error
        if worker:
            log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
            release_scraping_worker(worker, recycle=True)
            worker = None  # Mark as released

    finally:
        # Release worker back to pool if not already released
        if worker:
            log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
            release_scraping_worker(worker, recycle=False)
|
||||
|
||||
def get_job(self, job_id: str) -> Optional[ScrapingJob]:
    """
    Look up a single job by its ID.

    The lookup is performed while holding ``self.lock``.

    Args:
        job_id: Job ID

    Returns:
        Job object or None if not found
    """
    with self.lock:
        found = self.jobs.get(job_id)
    return found
|
||||
|
||||
def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
    """
    Return the stored reviews for a job that has completed.

    The job is read while holding ``self.lock``.

    Args:
        job_id: Job ID

    Returns:
        List of reviews or None if not found/not completed
    """
    with self.lock:
        candidate = self.jobs.get(job_id)
        is_completed = bool(candidate) and candidate.status == JobStatus.COMPLETED
        return candidate.reviews_data if is_completed else None
|
||||
|
||||
def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
|
||||
"""
|
||||
@@ -235,6 +330,7 @@ class JobManager:
|
||||
|
||||
job.status = JobStatus.CANCELLED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.progress = {"stage": "cancelled", "message": "Job was cancelled"}
|
||||
|
||||
log.info(f"Cancelled scraping job {job_id}")
|
||||
|
||||
Reference in New Issue
Block a user