diff --git a/api_server.py b/api_server.py deleted file mode 100644 index 630d3b1..0000000 --- a/api_server.py +++ /dev/null @@ -1,383 +0,0 @@ -#!/usr/bin/env python3 -""" -FastAPI server for Google Reviews Scraper. -Provides REST API endpoints to trigger and manage scraping jobs. -""" - -import logging -import asyncio -from contextlib import asynccontextmanager -from typing import Dict, Any, List, Optional - -from fastapi import FastAPI, HTTPException, BackgroundTasks, Query -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, HttpUrl, Field - -from modules.job_manager import JobManager, JobStatus, ScrapingJob -from modules.chrome_pool import start_worker_pools, stop_worker_pools, get_pool_stats, get_validation_worker, release_validation_worker -from modules.fast_scraper import check_reviews_available, get_business_card_info - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -log = logging.getLogger("api_server") - -# Global job manager instance -job_manager: Optional[JobManager] = None - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Lifespan context manager for startup and shutdown""" - global job_manager - - # Startup - log.info("Starting Google Reviews Scraper API Server") - - # Start Chrome worker pools - log.info("Initializing Chrome worker pools...") - start_worker_pools( - validation_size=1, # 1 pre-warmed worker for validation - scraping_size=2, # 2 pre-warmed workers for scraping - headless=True - ) - - job_manager = JobManager(max_concurrent_jobs=3) - - # Start auto-cleanup task - asyncio.create_task(cleanup_jobs_periodically()) - - yield - - # Shutdown - log.info("Shutting down Google Reviews Scraper API Server") - - if job_manager: - job_manager.shutdown() - - # Stop Chrome worker pools - log.info("Stopping Chrome worker pools...") - stop_worker_pools() - - -# Initialize FastAPI app -app = FastAPI( - title="Google Reviews 
Scraper API", - description="REST API for triggering and managing Google Maps review scraping jobs", - version="1.0.0", - lifespan=lifespan -) - -# Add CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # Configure appropriately for production - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -# Pydantic models for API -class ScrapeRequest(BaseModel): - """Request model for starting a scrape job""" - url: HttpUrl = Field(..., description="Google Maps URL to scrape") - headless: Optional[bool] = Field(None, description="Run Chrome in headless mode (default: True)") - max_scrolls: Optional[int] = Field(None, description="Maximum scrolls (default: unlimited - stops via idle detection)") - sort_by: Optional[str] = Field(None, description="Sort order: newest, highest, lowest, relevance") - stop_on_match: Optional[bool] = Field(None, description="Stop when first already-seen review is encountered") - overwrite_existing: Optional[bool] = Field(None, description="Overwrite existing reviews instead of appending") - download_images: Optional[bool] = Field(None, description="Download images from reviews") - use_s3: Optional[bool] = Field(None, description="Upload images to S3") - custom_params: Optional[Dict[str, Any]] = Field(None, description="Custom parameters to add to each document") - - -class JobResponse(BaseModel): - """Response model for job information""" - job_id: str - status: JobStatus - url: str - created_at: str - started_at: Optional[str] = None - completed_at: Optional[str] = None - updated_at: Optional[str] = None # Last update time for progress tracking - error_message: Optional[str] = None - reviews_count: Optional[int] = None - total_reviews: Optional[int] = None # Total reviews available for this place - images_count: Optional[int] = None - progress: Optional[Dict[str, Any]] = None - scrape_time: Optional[float] = None # Time taken to scrape in seconds - - -class JobStatsResponse(BaseModel): - 
"""Response model for job statistics""" - total_jobs: int - by_status: Dict[str, int] - running_jobs: int - max_concurrent_jobs: int - - -class ReviewsResponse(BaseModel): - """Response model for reviews data""" - job_id: str - reviews: List[Dict[str, Any]] - count: int - - -# Background task for periodic cleanup -async def cleanup_jobs_periodically(): - """Periodically clean up old jobs""" - while True: - await asyncio.sleep(3600) # Run every hour - if job_manager: - job_manager.cleanup_old_jobs(max_age_hours=24) - - -# API Endpoints - -@app.get("/", summary="API Health Check") -async def root(): - """Health check endpoint""" - return { - "message": "Google Reviews Scraper API is running", - "status": "healthy", - "version": "1.0.0" - } - - -@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job") -async def start_scrape(request: ScrapeRequest, background_tasks: BackgroundTasks): - """ - Start a new scraping job in the background. - - Returns the job ID that can be used to check status. 
- """ - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - # Prepare config overrides - config_overrides = {} - - # Only include non-None values - for field, value in request.dict().items(): - if value is not None and field != "url": - config_overrides[field] = value - - # Convert URL to string - url = str(request.url) - - try: - # Create job - job_id = job_manager.create_job(url, config_overrides) - - # Start job immediately if possible - started = job_manager.start_job(job_id) - - log.info(f"Created scraping job {job_id} for URL: {url}") - - return { - "job_id": job_id, - "status": "started" if started else "queued", - "message": f"Scraping job {'started' if started else 'queued'} successfully" - } - - except Exception as e: - log.error(f"Error creating scraping job: {e}") - raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}") - - -@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status") -async def get_job(job_id: str): - """Get detailed information about a specific job""" - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - job = job_manager.get_job(job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - - return JobResponse(**job.to_dict()) - - -@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews") -async def get_job_reviews(job_id: str): - """ - Get the actual reviews data for a completed job. - - Returns 404 if job not found or not completed yet. 
- """ - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - reviews = job_manager.get_job_reviews(job_id) - if reviews is None: - job = job_manager.get_job(job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - elif job.status != JobStatus.COMPLETED: - raise HTTPException( - status_code=400, - detail=f"Job not completed yet (current status: {job.status})" - ) - else: - raise HTTPException(status_code=404, detail="Reviews data not available") - - return ReviewsResponse( - job_id=job_id, - reviews=reviews, - count=len(reviews) - ) - - -@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs") -async def list_jobs( - status: Optional[JobStatus] = Query(None, description="Filter by job status"), - limit: int = Query(100, description="Maximum number of jobs to return", ge=1, le=1000) -): - """List all jobs, optionally filtered by status""" - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - jobs = job_manager.list_jobs(status=status, limit=limit) - return [JobResponse(**job.to_dict()) for job in jobs] - - -@app.post("/jobs/{job_id}/start", summary="Start Pending Job") -async def start_job(job_id: str): - """Start a pending job manually""" - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - started = job_manager.start_job(job_id) - if not started: - job = job_manager.get_job(job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - - if job.status != JobStatus.PENDING: - raise HTTPException(status_code=400, detail=f"Job is not pending (current status: {job.status})") - - raise HTTPException(status_code=429, detail="Maximum concurrent jobs reached") - - return {"message": "Job started successfully"} - - -@app.post("/jobs/{job_id}/cancel", summary="Cancel Job") -async def cancel_job(job_id: str): - """Cancel a pending or running job""" - if not 
job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - cancelled = job_manager.cancel_job(job_id) - if not cancelled: - job = job_manager.get_job(job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - raise HTTPException(status_code=400, detail="Job cannot be cancelled (already completed, failed, or cancelled)") - - return {"message": "Job cancelled successfully"} - - -@app.delete("/jobs/{job_id}", summary="Delete Job") -async def delete_job(job_id: str): - """Delete a job from the system""" - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - deleted = job_manager.delete_job(job_id) - if not deleted: - raise HTTPException(status_code=404, detail="Job not found") - - return {"message": "Job deleted successfully"} - - -@app.get("/stats", response_model=JobStatsResponse, summary="Get Job Statistics") -async def get_stats(): - """Get job manager statistics""" - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - stats = job_manager.get_stats() - return JobStatsResponse(**stats) - - -@app.post("/check-reviews", summary="Check if Business Has Reviews") -async def check_reviews(request: Dict[str, str]): - """ - Lightweight validation endpoint to check if a business has reviews. - Uses the Chrome validation pool for fast response. - - Returns business name, rating, address, and review count. - """ - url = request.get("url") - if not url: - raise HTTPException(status_code=400, detail="URL is required") - - log.info(f"Validating business at: {url}") - - # Get a worker from validation pool - worker = get_validation_worker(timeout=10) - - if not worker: - raise HTTPException( - status_code=503, - detail="No validation workers available. Please try again in a few seconds." 
- ) - - try: - # Use the worker's driver to get business card info (faster than check_reviews_available) - result = get_business_card_info( - url=url, - headless=True, - driver=worker.driver, - return_driver=True # Don't close the driver - ) - - # Pop the driver from result before returning - result.pop('driver', None) - - log.info(f"Validation result: name={result.get('name')}, rating={result.get('rating')}, reviews={result.get('total_reviews')}") - return result - - except Exception as e: - log.error(f"Error during validation: {e}") - # Recycle worker if there was an error - release_validation_worker(worker, recycle=True) - raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}") - - finally: - # Release worker back to pool (unless already recycled) - if worker and worker.driver: - release_validation_worker(worker, recycle=False) - - -@app.get("/pool-stats", summary="Get Chrome Pool Statistics") -async def pool_stats(): - """Get statistics about Chrome worker pools""" - stats = get_pool_stats() - return stats - - -@app.post("/cleanup", summary="Manual Job Cleanup") -async def cleanup_jobs(max_age_hours: int = Query(24, description="Maximum age in hours", ge=1)): - """Manually trigger cleanup of old completed/failed jobs""" - if not job_manager: - raise HTTPException(status_code=500, detail="Job manager not initialized") - - job_manager.cleanup_old_jobs(max_age_hours=max_age_hours) - return {"message": f"Cleaned up jobs older than {max_age_hours} hours"} - - -if __name__ == "__main__": - import uvicorn - - log.info("Starting FastAPI server...") - uvicorn.run( - "api_server:app", - host="0.0.0.0", - port=8000, - reload=True, - log_level="info" - ) \ No newline at end of file diff --git a/api_server_production.py b/api_server_production.py index 6b33da3..dad6f96 100644 --- a/api_server_production.py +++ b/api_server_production.py @@ -6,6 +6,7 @@ Production Google Reviews Scraper API Server with Phase 1 features: - Smart health checks with canary 
testing """ import asyncio +import json import logging import os from contextlib import asynccontextmanager @@ -15,12 +16,12 @@ from uuid import UUID from fastapi import FastAPI, HTTPException, Query, Header from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, HttpUrl, Field -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, StreamingResponse from modules.database import DatabaseManager, JobStatus from modules.webhooks import WebhookDispatcher, WebhookManager from modules.health_checks import HealthCheckSystem -from modules.scraper_clean import fast_scrape_reviews # Clean scraper with hard refresh recovery +from modules.scraper_clean import fast_scrape_reviews, LogCapture # Clean scraper with hard refresh recovery from modules.fast_scraper import check_reviews_available, get_business_card_info # Helper functions from modules.chrome_pool import ( start_worker_pools, @@ -48,6 +49,11 @@ health_system: Optional[HealthCheckSystem] = None MAX_CONCURRENT_JOBS = int(os.getenv('MAX_CONCURRENT_JOBS', '5')) job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS) +# SSE: Store for broadcasting job updates to connected clients +# Format: {job_id: [asyncio.Queue, ...]} for job-specific streams +# Format: {"all": [asyncio.Queue, ...]} for all-jobs stream +job_update_queues: Dict[str, List[asyncio.Queue]] = {"all": []} + @asynccontextmanager async def lifespan(app: FastAPI): @@ -82,11 +88,12 @@ async def lifespan(app: FastAPI): # Start Chrome worker pools (1 for validation, 2 for scraping) # These pre-warm Chrome instances for instant availability + # headless=False because Docker uses Xvfb virtual display for better compatibility await asyncio.to_thread( start_worker_pools, validation_size=1, scraping_size=2, - headless=True + headless=False ) log.info("Chrome worker pools started (1 validation + 2 scraping)") @@ -148,6 +155,9 @@ class JobResponse(BaseModel): scrape_time: Optional[float] = None error_message: 
Optional[str] = None webhook_url: Optional[str] = None + # Business metadata + business_name: Optional[str] = None + business_address: Optional[str] = None class ReviewsResponse(BaseModel): @@ -239,12 +249,296 @@ async def get_job(job_id: UUID): started_at=job['started_at'].isoformat() if job['started_at'] else None, completed_at=job['completed_at'].isoformat() if job['completed_at'] else None, reviews_count=job['reviews_count'], + total_reviews=job.get('total_reviews'), scrape_time=job['scrape_time'], error_message=job['error_message'], webhook_url=job.get('webhook_url') ) +@app.get("/jobs/{job_id}/logs", summary="Get Job Logs") +async def get_job_logs(job_id: UUID): + """ + Get the scraper logs for a job. + + Returns logs from both successful and failed jobs. + Useful for debugging scraping issues. + """ + if not db: + raise HTTPException(status_code=500, detail="Database not initialized") + + job = await db.get_job(job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + + # Get scrape_logs from job + scrape_logs = job.get('scrape_logs') + + # Parse if string (asyncpg might return JSONB as string) + if isinstance(scrape_logs, str): + try: + scrape_logs = json.loads(scrape_logs) + except: + scrape_logs = None + + return { + "job_id": str(job_id), + "status": job['status'], + "error_message": job.get('error_message'), + "logs": scrape_logs or [], + "log_count": len(scrape_logs) if scrape_logs else 0 + } + + +# ==================== SSE Streaming Endpoints ==================== + +async def broadcast_job_update(job_id: str, event_type: str, data: dict): + """Broadcast an update to all subscribers of a job stream and the all-jobs stream.""" + message = f"event: {event_type}\ndata: {json.dumps(data)}\n\n" + + # Send to job-specific subscribers + if job_id in job_update_queues: + for queue in job_update_queues[job_id]: + try: + await queue.put(message) + except: + pass + + # Send to all-jobs subscribers + for queue in 
job_update_queues.get("all", []): + try: + await queue.put(message) + except: + pass + + +@app.get("/jobs/{job_id}/stream", summary="Stream Job Updates (SSE)") +async def stream_job_updates(job_id: UUID): + """ + Server-Sent Events stream for real-time job updates. + + Streams: + - status: Job status changes + - progress: Review count and progress updates + - logs: New log entries + - complete: Job finished (completed/failed) + + Connect with EventSource in the browser: + ```javascript + const es = new EventSource('/jobs/{job_id}/stream'); + es.onmessage = (e) => console.log(JSON.parse(e.data)); + es.addEventListener('logs', (e) => console.log('Logs:', JSON.parse(e.data))); + ``` + """ + if not db: + raise HTTPException(status_code=500, detail="Database not initialized") + + # Verify job exists + job = await db.get_job(job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + + job_id_str = str(job_id) + + # Create queue for this client + queue: asyncio.Queue = asyncio.Queue() + + # Register subscriber + if job_id_str not in job_update_queues: + job_update_queues[job_id_str] = [] + job_update_queues[job_id_str].append(queue) + + async def event_generator(): + try: + # Send initial state + job_data = await db.get_job(job_id) + if job_data: + scrape_logs = job_data.get('scrape_logs') + if isinstance(scrape_logs, str): + try: + scrape_logs = json.loads(scrape_logs) + except: + scrape_logs = [] + + initial = { + "job_id": job_id_str, + "status": job_data['status'], + "reviews_count": job_data.get('reviews_count'), + "total_reviews": job_data.get('total_reviews'), + "scrape_time": job_data.get('scrape_time'), + "error_message": job_data.get('error_message'), + "logs": scrape_logs or [] + } + yield f"event: init\ndata: {json.dumps(initial)}\n\n" + + # If job is already complete, send complete event and close + if job_data and job_data['status'] in ['completed', 'failed', 'cancelled']: + yield f"event: complete\ndata: {json.dumps({'status': 
job_data['status']})}\n\n" + return + + # Keep connection alive and send updates + last_log_count = len(scrape_logs) if scrape_logs else 0 + last_reviews_count = job_data.get('reviews_count') if job_data else 0 + + while True: + try: + # Wait for update with timeout (for keepalive) + try: + message = await asyncio.wait_for(queue.get(), timeout=2.0) + yield message + except asyncio.TimeoutError: + # Send keepalive comment + yield ": keepalive\n\n" + + # Also poll database for updates (backup in case broadcast missed) + job_data = await db.get_job(job_id) + if job_data: + # Check for status change + if job_data['status'] in ['completed', 'failed', 'cancelled']: + scrape_logs = job_data.get('scrape_logs') + if isinstance(scrape_logs, str): + try: + scrape_logs = json.loads(scrape_logs) + except: + scrape_logs = [] + + final = { + "job_id": job_id_str, + "status": job_data['status'], + "reviews_count": job_data.get('reviews_count'), + "total_reviews": job_data.get('total_reviews'), + "scrape_time": job_data.get('scrape_time'), + "error_message": job_data.get('error_message'), + "logs": scrape_logs or [] + } + yield f"event: complete\ndata: {json.dumps(final)}\n\n" + return + + # Check for new logs or progress + scrape_logs = job_data.get('scrape_logs') + if isinstance(scrape_logs, str): + try: + scrape_logs = json.loads(scrape_logs) + except: + scrape_logs = [] + + current_log_count = len(scrape_logs) if scrape_logs else 0 + current_reviews = job_data.get('reviews_count') or 0 + + if current_log_count > last_log_count or current_reviews != last_reviews_count: + update = { + "job_id": job_id_str, + "status": job_data['status'], + "reviews_count": current_reviews, + "total_reviews": job_data.get('total_reviews'), + "logs": scrape_logs or [] + } + yield f"event: update\ndata: {json.dumps(update)}\n\n" + last_log_count = current_log_count + last_reviews_count = current_reviews + + except Exception as e: + log.error(f"Error in SSE stream for job {job_id}: {e}") + break + + 
finally: + # Unregister subscriber + if job_id_str in job_update_queues: + try: + job_update_queues[job_id_str].remove(queue) + if not job_update_queues[job_id_str]: + del job_update_queues[job_id_str] + except: + pass + + return StreamingResponse( + event_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no" # Disable nginx buffering + } + ) + + +@app.get("/jobs/stream", summary="Stream All Jobs Updates (SSE)") +async def stream_all_jobs(): + """ + Server-Sent Events stream for all job updates. + + Streams: + - job_created: New job was created + - job_updated: Job status/progress changed + - job_completed: Job finished + + Connect with EventSource in the browser: + ```javascript + const es = new EventSource('/jobs/stream'); + es.addEventListener('job_updated', (e) => console.log('Update:', JSON.parse(e.data))); + ``` + """ + if not db: + raise HTTPException(status_code=500, detail="Database not initialized") + + # Create queue for this client + queue: asyncio.Queue = asyncio.Queue() + + # Register subscriber to all-jobs stream + job_update_queues["all"].append(queue) + + async def event_generator(): + try: + # Send initial jobs list + jobs = await db.list_jobs(limit=100) + jobs_data = [ + { + "job_id": str(j['job_id']), + "status": j['status'], + "url": j['url'], + "created_at": j['created_at'].isoformat(), + "completed_at": j['completed_at'].isoformat() if j.get('completed_at') else None, + "reviews_count": j.get('reviews_count'), + "scrape_time": j.get('scrape_time'), + "error_message": j.get('error_message') + } + for j in jobs + ] + yield f"event: init\ndata: {json.dumps({'jobs': jobs_data})}\n\n" + + # Keep connection alive and send updates + while True: + try: + # Wait for update with timeout (for keepalive) + try: + message = await asyncio.wait_for(queue.get(), timeout=5.0) + yield message + except asyncio.TimeoutError: + # Send keepalive comment + yield ": 
keepalive\n\n" + + except Exception as e: + log.error(f"Error in all-jobs SSE stream: {e}") + break + + finally: + # Unregister subscriber + try: + job_update_queues["all"].remove(queue) + except: + pass + + return StreamingResponse( + event_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no" + } + ) + + @app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews") async def get_job_reviews(job_id: UUID): """ @@ -298,19 +592,34 @@ async def list_jobs( jobs = await db.list_jobs(status=job_status, limit=limit, offset=offset) - return [ - JobResponse( + result = [] + for job in jobs: + # Extract business info from metadata if available + metadata = job.get('metadata') + if isinstance(metadata, str): + try: + metadata = json.loads(metadata) + except: + metadata = None + + business_name = metadata.get('business_name') if metadata else None + business_address = metadata.get('business_address') if metadata else None + + result.append(JobResponse( job_id=str(job['job_id']), status=job['status'], url=job['url'], created_at=job['created_at'].isoformat(), completed_at=job['completed_at'].isoformat() if job.get('completed_at') else None, reviews_count=job.get('reviews_count'), + total_reviews=job.get('total_reviews'), scrape_time=job.get('scrape_time'), - error_message=job.get('error_message') - ) - for job in jobs - ] + error_message=job.get('error_message'), + business_name=business_name, + business_address=business_address + )) + + return result @app.delete("/jobs/{job_id}", summary="Delete Job") @@ -370,11 +679,11 @@ async def check_reviews(request: ScrapeRequest): # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews # Let the actual scraper determine if reviews exist - has_business = result.get('name') and result.get('rating') + has_business = bool(result.get('name') and result.get('rating')) return { - 
"has_reviews": has_business, # Assume true if business exists - "total_reviews": result['total_reviews'] or 0, # Show 0 if unknown + "has_reviews": has_business, # Boolean: true if business exists + "total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown "name": result.get('name'), "address": result.get('address'), "rating": result.get('rating'), @@ -488,6 +797,8 @@ async def run_scraping_job(job_id: UUID): Args: job_id: Job UUID """ + job_id_str = str(job_id) + async with job_semaphore: # Limit concurrent Chrome instances try: # Update status to running @@ -498,44 +809,79 @@ async def run_scraping_job(job_id: UUID): job = await db.get_job(job_id) url = job['url'] + # Broadcast job started via SSE + await broadcast_job_update(job_id_str, "job_started", { + "job_id": job_id_str, + "status": "running", + "url": url + }) + # Get the event loop for progress updates from worker thread loop = asyncio.get_running_loop() - # Progress callback to update job status with current/total counts + # Create log capture instance that we can access for real-time logs + log_capture = LogCapture() + + # Progress callback to update job status with current/total counts AND logs def progress_callback(current_count: int, total_count: int): - """Update job progress from worker thread""" + """Update job progress and logs from worker thread""" async def update(): + # Get current logs from the shared log_capture + current_logs = log_capture.get_logs() await db.update_job_status( job_id, JobStatus.RUNNING, reviews_count=current_count, - total_reviews=total_count + total_reviews=total_count, + scrape_logs=current_logs ) + # Broadcast progress via SSE + await broadcast_job_update(job_id_str, "job_progress", { + "job_id": job_id_str, + "status": "running", + "reviews_count": current_count, + "total_reviews": total_count, + "logs": current_logs + }) + # Schedule the coroutine on the event loop asyncio.run_coroutine_threadsafe(update(), loop) - # Run scraping with progress callback + 
# Run scraping with progress callback and shared log capture + # headless=False because Docker uses Xvfb virtual display result = await asyncio.to_thread( fast_scrape_reviews, url=url, - headless=True, - progress_callback=progress_callback + headless=False, + progress_callback=progress_callback, + log_capture=log_capture ) if result['success']: - # Save results to database + # Save results to database (including scraper logs) await db.save_job_result( job_id=job_id, reviews=result['reviews'], scrape_time=result['time'], - total_reviews=result.get('total_reviews') + total_reviews=result.get('total_reviews'), + scrape_logs=result.get('logs') ) log.info( f"Completed job {job_id}: {result['count']} reviews in {result['time']:.1f}s" ) + # Broadcast job completed via SSE + await broadcast_job_update(job_id_str, "job_completed", { + "job_id": job_id_str, + "status": "completed", + "reviews_count": result['count'], + "total_reviews": result.get('total_reviews'), + "scrape_time": result['time'], + "logs": result.get('logs', []) + }) + # Send webhook if configured if job.get('webhook_url'): webhook_manager = WebhookManager() @@ -553,15 +899,24 @@ async def run_scraping_job(job_id: UUID): ) else: - # Job failed + # Job failed - save logs for debugging await db.update_job_status( job_id, JobStatus.FAILED, - error_message=result.get('error', 'Unknown error') + error_message=result.get('error', 'Unknown error'), + scrape_logs=result.get('logs') ) log.error(f"Failed job {job_id}: {result.get('error')}") + # Broadcast job failed via SSE + await broadcast_job_update(job_id_str, "job_failed", { + "job_id": job_id_str, + "status": "failed", + "error_message": result.get('error'), + "logs": result.get('logs', []) + }) + # Send failure webhook if configured if job.get('webhook_url'): webhook_manager = WebhookManager() @@ -585,6 +940,14 @@ async def run_scraping_job(job_id: UUID): error_message=str(e) ) + # Broadcast job failed via SSE + await broadcast_job_update(job_id_str, 
"job_failed", { + "job_id": job_id_str, + "status": "failed", + "error_message": str(e), + "logs": [] + }) + # Send failure webhook job = await db.get_job(job_id) if job and job.get('webhook_url'): diff --git a/brute_force_selector.py b/brute_force_selector.py deleted file mode 100644 index 21ac024..0000000 --- a/brute_force_selector.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -""" -Brute force approach: Try every possible div class combination and see which gives us reviews. -""" - -import time -from seleniumbase import Driver - -url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en" - -driver = Driver(uc=True, headless=False) - -try: - driver.get(url) - time.sleep(5) - - # GDPR - try: - form_btns = driver.find_elements('css selector', 'form button') - for btn in form_btns: - if 'accept all' in (btn.text or '').lower(): - btn.click() - time.sleep(2) - break - except: - pass - - # Click reviews tab - time.sleep(2) - tabs = driver.find_elements('css selector', 'button[role="tab"]') - for tab in tabs: - if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower(): - driver.execute_script("arguments[0].click();", tab) - time.sleep(5) - break - - # Scroll to load reviews - try: - pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde') - for _ in range(10): - driver.execute_script("arguments[0].scrollBy(0, 400);", pane) - time.sleep(0.3) - except: - pass - - print("\n" + "="*80) - print("BRUTE FORCE SELECTOR SEARCH") - print("="*80) - - # Get ALL unique class combinations from divs inside the reviews pane - candidates = driver.execute_script(""" - // Find the reviews pane - const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde'); - if (!pane) return {error: 'Pane not found'}; - - // Get all divs inside the pane - const allDivs = Array.from(pane.querySelectorAll('div')); - - // For each div, check if it looks like a review - const candidates = []; - 
- for (let div of allDivs) { - // Skip if no classes - if (!div.className || div.className.length === 0) continue; - - // Check for review indicators - const hasRating = !!div.querySelector('[aria-label*="star" i]'); - const hasText = div.textContent.length > 50 && div.textContent.length < 1000; // Individual review size - const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img'); - - // Calculate score - let score = 0; - if (hasRating) score += 3; - if (hasText) score += 2; - if (hasAuthor) score += 1; - - if (score >= 4) { // Must have rating + text at minimum - candidates.push({ - classes: div.className, - selector: 'div.' + div.className.split(' ').filter(c => c).join('.'), - score: score, - text_length: div.textContent.length, - sample_text: div.textContent.substring(0, 100) - }); - } - } - - // Count how many elements match each selector - const selectorCounts = {}; - for (let candidate of candidates) { - const count = pane.querySelectorAll(candidate.selector).length; - if (!selectorCounts[candidate.selector]) { - selectorCounts[candidate.selector] = { - count: count, - score: candidate.score, - text_length: candidate.text_length, - sample: candidate.sample_text - }; - } - } - - // Sort by count (we want selectors that match many reviews) - const sorted = Object.entries(selectorCounts) - .sort((a, b) => b[1].count - a[1].count) - .slice(0, 10); - - return { - top_selectors: sorted.map(([selector, info]) => ({ - selector: selector, - count: info.count, - score: info.score, - text_length: info.text_length, - sample: info.sample - })) - }; - """) - - if 'error' in candidates: - print(f"ERROR: {candidates['error']}") - else: - print(f"\nTop 10 candidate selectors (sorted by count):\n") - for i, candidate in enumerate(candidates['top_selectors'], 1): - print(f"{i}. 
{candidate['selector']}") - print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}") - print(f" Sample: {candidate['sample'][:80]}...") - print() - - # Test the top selector - if candidates['top_selectors']: - top_selector = candidates['top_selectors'][0]['selector'] - print(f"\n{'='*80}") - print(f"TESTING TOP SELECTOR: {top_selector}") - print(f"{'='*80}") - - test_result = driver.execute_script(f""" - const elements = document.querySelectorAll('{top_selector}'); - const reviews = []; - - for (let i = 0; i < Math.min(3, elements.length); i++) {{ - const elem = elements[i]; - const review = {{ - has_author: !!elem.querySelector('button, img'), - has_rating: !!elem.querySelector('[aria-label*="star" i]'), - has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i), - text_length: elem.textContent.length, - text_sample: elem.textContent.substring(0, 150) - }}; - reviews.push(review); - }} - - return reviews; - """) - - print(f"\nFirst 3 elements using {top_selector}:") - for i, rev in enumerate(test_result, 1): - print(f"\n Element {i}:") - for key, value in rev.items(): - print(f" {key}: {value}") - - print(f"\n{'='*80}") - print("Browser staying open for 60 seconds...") - print(f"{'='*80}") - time.sleep(60) - -finally: - driver.quit() diff --git a/check_page_structure.py b/check_page_structure.py deleted file mode 100644 index b85f6fa..0000000 --- a/check_page_structure.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -""" -Check the actual page structure - maybe reviews are already visible without clicking a tab! 
-""" - -import time -from seleniumbase import Driver - -url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine" - -driver = Driver(uc=True, headless=False) - -try: - driver.get(url) - print(f"Initial URL: {url}") - time.sleep(5) - - # GDPR - try: - form_btns = driver.find_elements('css selector', 'form button') - for btn in form_btns: - if 'accept' in (btn.text or '').lower(): - btn.click() - time.sleep(2) - break - except: - pass - - # Check final URL - final_url = driver.current_url - print(f"Final URL after redirect: {final_url}") - - # Wait a bit more for dynamic content - time.sleep(3) - - # Check page structure - print("\n" + "="*80) - print("PAGE STRUCTURE ANALYSIS") - print("="*80) - - page_info = driver.execute_script(""" - return { - tabs_found: document.querySelectorAll('button[role="tab"]').length, - reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length, - reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length, - divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length, - review_containers: document.querySelectorAll('div.fontBodyMedium').length, - page_text_sample: document.body.innerText.substring(0, 500), - has_review_text: document.body.innerText.toLowerCase().includes('review'), - has_atsiliepimai_text: document.body.innerText.toLowerCase().includes('atsiliepimai') - }; - """) - - print(f"\nTabs with role='tab': {page_info['tabs_found']}") - print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}") - print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}") - print(f"Elements with star ratings: {page_info['divs_with_ratings']}") - print(f"div.fontBodyMedium: {page_info['review_containers']}") - print(f"Contains 'review': {page_info['has_review_text']}") - print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}") - - print(f"\nPage text sample (first 500 chars):") - 
print(page_info['page_text_sample']) - - # Try to find ANY element with rating - print("\n" + "="*80) - print("SEARCHING FOR RATING ELEMENTS") - print("="*80) - - rating_search = driver.execute_script(""" - const elements = Array.from(document.querySelectorAll('*')); - const withRatings = []; - - for (let elem of elements) { - const ariaLabel = elem.getAttribute('aria-label') || ''; - if (ariaLabel.toLowerCase().includes('star') || ariaLabel.toLowerCase().includes('žvaigžd')) { - withRatings.push({ - tag: elem.tagName, - ariaLabel: ariaLabel.substring(0, 100), - classes: elem.className.substring(0, 100), - parentTag: elem.parentElement ? elem.parentElement.tagName : null, - parentClasses: elem.parentElement ? elem.parentElement.className.substring(0, 100) : null - }); - } - } - - return withRatings.slice(0, 10); // First 10 - """) - - print(f"\nFound {len(rating_search)} elements with 'star' in aria-label:") - for i, elem in enumerate(rating_search[:5], 1): - print(f"\n Element {i}:") - print(f" Tag: {elem['tag']}") - print(f" Aria-label: {elem['ariaLabel']}") - print(f" Classes: {elem['classes']}") - print(f" Parent tag: {elem['parentTag']}") - print(f" Parent classes: {elem['parentClasses']}") - - print(f"\n{'='*80}") - print("Browser open for manual inspection...") - print("LOOK AT THE PAGE - Are reviews visible? What's their structure?") - print(f"{'='*80}") - time.sleep(180) # 3 minutes - -finally: - driver.quit() diff --git a/cookie_based_scraper.py b/cookie_based_scraper.py deleted file mode 100644 index f89e2ca..0000000 --- a/cookie_based_scraper.py +++ /dev/null @@ -1,355 +0,0 @@ -#!/usr/bin/env python3 -""" -Cookie-based API scraper - Capture fresh cookies on each run, then fast API scraping. - -Flow: -1. Start browser (15 seconds) -2. Capture cookies from active browser session (5 seconds) -3. Close browser -4. 
Use cookies for rapid API pagination (5-10 seconds) - -Total time: ~25-35 seconds for 244 reviews (vs 155 seconds with scrolling) -""" -import json -import logging -import time -from typing import List, Optional, Tuple -import requests -from seleniumbase import SB -from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -class CookieBasedScraper: - """Capture cookies each run, then scrape via API.""" - - def __init__(self, url: str, headless: bool = False): - self.url = url - self.headless = headless - self.session = requests.Session() - self.place_id = None - self.interceptor = GoogleMapsAPIInterceptor(None) - - def capture_cookies(self) -> bool: - """ - Capture cookies from a real browser session. - Returns True if successful. - """ - log.info("="*60) - log.info("STEP 1: Capturing cookies from browser session") - log.info("="*60) - - sb = None - sb_context = None - try: - # Create driver - need to enter the context manually - log.info("Starting browser...") - sb_context = SB(uc=True, headless=self.headless) - sb = sb_context.__enter__() # Manually enter context - - log.info("Opening Google Maps...") - sb.open(self.url) - time.sleep(2) - - # Dismiss cookie consent - try: - sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3) - log.info("✓ Cookie dialog dismissed") - except: - pass - - # Click reviews tab - try: - sb.click('.LRkQ2', timeout=5) - log.info("✓ Opened reviews tab") - time.sleep(3) # Wait for reviews to load - except Exception as e: - log.warning(f"Could not click reviews tab: {e}") - - # Extract place ID from current URL - current_url = sb.get_current_url() - if '!1s' in current_url: - parts = current_url.split('!1s') - if len(parts) > 1: - self.place_id = parts[1].split('!')[0] - log.info(f"✓ Extracted place ID: {self.place_id}") - - if not self.place_id: - log.error("Could not 
extract place ID") - return False - - # CRITICAL: Scroll once to trigger an API call! - # This causes Google to set the necessary session cookies - log.info("Triggering API call by scrolling...") - sb.execute_script("window.scrollBy(0, 500)") - time.sleep(2) # Wait for API call to complete - log.info("✓ API call triggered - session cookies should now be set") - - # CAPTURE COOKIES using CDP (gets httpOnly cookies too!) - log.info("Capturing cookies via CDP...") - try: - # Use Chrome DevTools Protocol to get ALL cookies from all domains - cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {}) - browser_cookies = cdp_cookies.get('cookies', []) - log.info(f"✓ Captured {len(browser_cookies)} cookies via CDP") - - # Also try getting cookies for specific Google domains - for domain in ['.google.com', 'www.google.com', '.google.es', 'maps.google.com']: - try: - domain_cookies = sb.driver.execute_cdp_cmd('Network.getCookies', {'urls': [f'https://{domain}']}) - extra_cookies = domain_cookies.get('cookies', []) - if extra_cookies: - log.info(f" Found {len(extra_cookies)} cookies for {domain}") - # Add any new cookies we don't have yet - existing_names = {c['name'] for c in browser_cookies} - for cookie in extra_cookies: - if cookie['name'] not in existing_names: - browser_cookies.append(cookie) - except: - pass - - log.info(f"✓ Total cookies after checking all domains: {len(browser_cookies)}") - except Exception as e: - log.warning(f"CDP cookie capture failed: {e}") - # Fallback to JavaScript (won't get httpOnly cookies) - cookie_string = sb.execute_script("return document.cookie") - browser_cookies = [] - for cookie in cookie_string.split('; '): - if '=' in cookie: - name, value = cookie.split('=', 1) - browser_cookies.append({ - 'name': name, - 'value': value, - 'domain': '.google.com', - 'path': '/' - }) - log.info(f"✓ Fallback: Captured {len(browser_cookies)} cookies via JS") - - # CAPTURE USER AGENT while driver is active - user_agent = 
sb.execute_script("return navigator.userAgent") - log.info(f"✓ Captured user agent") - - # Process cookies into session - for cookie in browser_cookies: - self.session.cookies.set( - name=cookie['name'], - value=cookie['value'], - domain=cookie.get('domain', '.google.com'), - path=cookie.get('path', '/') - ) - - # Set headers - self.session.headers.update({ - 'User-Agent': user_agent, - 'Accept': '*/*', - 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8', - 'Referer': 'https://www.google.com/maps/', - 'Origin': 'https://www.google.com', - 'X-Requested-With': 'XMLHttpRequest', - }) - - # Print ALL cookie names for debugging - all_cookie_names = [c['name'] for c in browser_cookies] - log.info(f"Cookie names: {', '.join(all_cookie_names)}") - - # Print important cookies for debugging - important_cookies = ['SID', 'HSID', 'SSID', 'APISID', 'SAPISID', '__Secure-1PSID', '__Secure-3PSID'] - found_cookies = [] - for cookie_name in important_cookies: - if cookie_name in self.session.cookies: - found_cookies.append(cookie_name) - - log.info(f"✓ Found auth cookies: {', '.join(found_cookies) if found_cookies else 'NONE - this is the problem!'}") - - # Check if we have auth cookies - if not found_cookies: - log.warning("\n" + "="*60) - log.warning("⚠️ NO AUTHENTICATION COOKIES FOUND!") - log.warning("="*60) - log.warning("Google Maps API requires you to be logged into Google.") - log.warning("") - log.warning("To fix this:") - log.warning("1. Log into your Google account in Chrome") - log.warning("2. Visit google.com/maps while logged in") - log.warning("3. 
Then run this scraper again") - log.warning("") - log.warning("Alternatively, use the hybrid scraper (start.py) which") - log.warning("handles authentication automatically and already achieves") - log.warning("95%+ API coverage with 100% parse rate!") - log.warning("="*60 + "\n") - - # Continue anyway to show the error - log.info("Continuing anyway to demonstrate the API error...") - - log.info("\n✅ Cookie capture successful!") - log.info(f" Total cookies: {len(browser_cookies)}") - log.info(f" Place ID: {self.place_id}") - log.info(f" Session ready: Yes\n") - - return True - - except Exception as e: - log.error(f"Cookie capture failed: {e}") - import traceback - traceback.print_exc() - return False - - finally: - # IMPORTANT: Close browser properly - if sb_context: - try: - log.info("Closing browser...") - sb_context.__exit__(None, None, None) # Properly exit context - log.info("✓ Browser closed\n") - except Exception as e: - log.debug(f"Error closing browser: {e}") - - def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]: - """ - Fetch a page of reviews via API using captured cookies. 
- """ - # Build pb parameter - if continuation_token: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - else: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - - params = { - 'authuser': '0', - 'hl': 'es', - 'gl': 'es', - 'pb': pb - } - - try: - url = 'https://www.google.com/maps/rpc/listugcposts' - response = self.session.get(url, params=params, timeout=10) - - if response.status_code != 200: - log.error(f"API error {response.status_code}") - log.error(f"Response: {response.text[:500]}") - log.debug(f"Request URL: {response.url}") - log.debug(f"Request headers: {dict(self.session.headers)}") - return [], None - - # Parse response - body = response.text - if body.startswith(")]}'"): - body = body[4:].strip() - - data = json.loads(body) - reviews = self.interceptor._parse_listugcposts_response(data) - - # Get next token - next_token = None - if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str): - next_token = data[1] - - return reviews, next_token - - except Exception as e: - log.error(f"API request failed: {e}") - return [], None - - def scrape_all(self, max_pages: int = 100) -> List[dict]: - """ - Main scraping method with cookie-based session. 
- """ - # Step 1: Capture cookies from browser - if not self.capture_cookies(): - log.error("Failed to capture cookies - aborting") - return [] - - # Step 2: Scrape via API - log.info("="*60) - log.info("STEP 2: Fast API scraping (no browser needed)") - log.info("="*60) - - start_time = time.time() - all_reviews = [] - seen_ids = set() - token = None - page = 0 - - while page < max_pages: - page += 1 - - log.info(f"Fetching page {page}...") - reviews, token = self.fetch_reviews_page(token) - - if not reviews: - if page == 1: - log.error("No reviews on first page - cookies may have expired or be invalid") - else: - log.info("No more reviews found") - break - - # Deduplicate - for review in reviews: - rid = review.review_id or f"{review.author}_{review.date_text}" - if rid not in seen_ids: - seen_ids.add(rid) - all_reviews.append({ - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - }) - - log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}") - - if not token: - log.info("No continuation token - all reviews fetched") - break - - # Small delay between requests - time.sleep(0.2) - - elapsed = time.time() - start_time - - log.info("\n" + "="*60) - log.info("✅ SCRAPING COMPLETED!") - log.info("="*60) - log.info(f"Total reviews: {len(all_reviews)}") - log.info(f"API calls: {page}") - log.info(f"API scraping time: {elapsed:.2f} seconds") - log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second") - log.info("="*60 + "\n") - - return all_reviews - - -def main(): - """Example usage.""" - url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1" - - scraper = CookieBasedScraper(url, headless=False) - reviews = scraper.scrape_all(max_pages=50) - - 
if reviews: - # Save results - output_file = 'cookie_based_reviews.json' - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(reviews, f, indent=2, ensure_ascii=False) - - log.info(f"💾 Saved {len(reviews)} reviews to {output_file}") - - # Show sample - log.info("\nSample review:") - sample = reviews[0] - log.info(f" Author: {sample['author']}") - log.info(f" Rating: {sample['rating']}★") - log.info(f" Date: {sample['date_text']}") - if sample['text']: - log.info(f" Text: {sample['text'][:80]}...") - else: - log.error("No reviews scraped!") - - -if __name__ == '__main__': - main() diff --git a/direct_api_scraper.py b/direct_api_scraper.py deleted file mode 100644 index d11005f..0000000 --- a/direct_api_scraper.py +++ /dev/null @@ -1,249 +0,0 @@ -#!/usr/bin/env python3 -""" -Direct API scraper - fetch Google Maps reviews via API without browser scrolling. -This is 10-25x faster than traditional browser-based scraping. -""" -import json -import logging -import time -import urllib.parse -from typing import List, Optional, Tuple -import requests -from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -class DirectAPIScraper: - """Fetch Google Maps reviews directly via API without browser automation.""" - - def __init__(self, place_id: str, language: str = 'en', region: str = 'us'): - """ - Initialize the direct API scraper. 
- - Args: - place_id: Google Maps place ID (e.g., '0x46dd947294b213bf:0x864c7a232527adb4') - language: Language code (e.g., 'en', 'es', 'de') - region: Region/country code (e.g., 'us', 'es', 'de') - """ - self.place_id = place_id - self.language = language - self.region = region - self.base_url = 'https://www.google.com/maps/rpc/listugcposts' - - # Initialize parser (reuse the working parser from api_interceptor) - self.interceptor = GoogleMapsAPIInterceptor(None) - - # Session for maintaining cookies - self.session = requests.Session() - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Accept': '*/*', - 'Accept-Language': f'{language},{language}-{region.upper()};q=0.9,en;q=0.8', - 'Referer': 'https://www.google.com/maps/', - 'X-Requested-With': 'XMLHttpRequest', - }) - - def _build_pb_param(self, continuation_token: Optional[str] = None) -> str: - """ - Build the Protocol Buffer (pb) parameter for the API request. 
- - Args: - continuation_token: Pagination token from previous response - - Returns: - pb parameter string (NOT URL-encoded - that's done by requests) - """ - # Base structure with place ID and pagination token - if continuation_token: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - else: - # First request without continuation token - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - - return pb - - def _establish_session(self): - """Visit Google Maps page to establish session cookies.""" - try: - # Visit the main maps page to get cookies - maps_url = f"https://www.google.com/maps/place/?q=place_id:{self.place_id}" - log.debug("Establishing session by visiting Google Maps...") - response = self.session.get(maps_url, timeout=10) - response.raise_for_status() - log.debug(f"Session established (cookies: {len(self.session.cookies)})") - except Exception as e: - log.warning(f"Failed to establish session: {e}") - - def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]: - """ - Fetch a single page of reviews from the API. 
- - Args: - continuation_token: Pagination token from previous response - - Returns: - Tuple of (reviews list, next continuation token or None) - """ - # Build request parameters - params = { - 'authuser': '0', - 'hl': self.language, - 'gl': self.region, - 'pb': self._build_pb_param(continuation_token) - } - - try: - log.info(f"Fetching reviews page (token: {'initial' if not continuation_token else 'paginated'})...") - - response = self.session.get(self.base_url, params=params, timeout=10) - - # Log response for debugging - log.debug(f"Response status: {response.status_code}") - if response.status_code != 200: - log.error(f"Response body: {response.text[:500]}") - - response.raise_for_status() - - # Google returns responses with )]}' prefix - strip it - body = response.text - if body.startswith(")]}'"): - body = body[4:].strip() - - log.debug(f"Response size: {len(body)} bytes") - - # Parse JSON response - data = json.loads(body) - - # Extract reviews using our working parser - reviews = self.interceptor._parse_listugcposts_response(data) - - # Extract next continuation token - next_token = None - if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str): - next_token = data[1] - log.debug(f"Found continuation token: {next_token[:50]}...") - - log.info(f"✓ Extracted {len(reviews)} reviews from this page") - - return reviews, next_token - - except requests.exceptions.RequestException as e: - log.error(f"API request failed: {e}") - return [], None - except json.JSONDecodeError as e: - log.error(f"Failed to parse API response: {e}") - return [], None - except Exception as e: - log.error(f"Unexpected error: {e}") - return [], None - - def fetch_all_reviews(self, max_pages: int = 100, delay: float = 0.5) -> List[dict]: - """ - Fetch all reviews by paginating through the API. 
- - Args: - max_pages: Maximum number of pages to fetch (safety limit) - delay: Delay between requests in seconds - - Returns: - List of review dictionaries - """ - all_reviews = [] - seen_ids = set() - continuation_token = None - page = 0 - - start_time = time.time() - log.info(f"Starting direct API scraping for place: {self.place_id}") - - # Establish session first - self._establish_session() - - while page < max_pages: - page += 1 - - # Fetch page - reviews, continuation_token = self.fetch_reviews_page(continuation_token) - - if not reviews: - log.info("No more reviews found - stopping") - break - - # Deduplicate and add reviews - for review in reviews: - review_id = review.review_id or f"{review.author}_{review.date_text}" - if review_id not in seen_ids: - seen_ids.add(review_id) - - # Convert to dict - all_reviews.append({ - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - }) - - log.info(f"Page {page}: {len(all_reviews)} total unique reviews") - - # Check if we have a continuation token - if not continuation_token: - log.info("No continuation token - all reviews fetched") - break - - # Rate limiting - if delay > 0 and page < max_pages: - time.sleep(delay) - - elapsed = time.time() - start_time - log.info(f"\n{'='*60}") - log.info(f"✅ Direct API scraping completed!") - log.info(f"{'='*60}") - log.info(f"Total reviews: {len(all_reviews)}") - log.info(f"Pages fetched: {page}") - log.info(f"Time elapsed: {elapsed:.2f} seconds") - log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second") - log.info(f"{'='*60}\n") - - return all_reviews - - -def main(): - """Example usage of the direct API scraper.""" - - # Soho Club place ID from the test URL - place_id = '0x46dd947294b213bf:0x864c7a232527adb4' - - # Create scraper - scraper = DirectAPIScraper( - place_id=place_id, - language='es', - 
region='es' - ) - - # Fetch all reviews - reviews = scraper.fetch_all_reviews(max_pages=50, delay=0.5) - - # Save to JSON - output_file = 'direct_api_reviews.json' - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(reviews, f, indent=2, ensure_ascii=False) - - log.info(f"Saved {len(reviews)} reviews to {output_file}") - - # Show sample - if reviews: - log.info("\nSample review:") - sample = reviews[0] - log.info(f" Author: {sample['author']}") - log.info(f" Rating: {sample['rating']}★") - log.info(f" Date: {sample['date_text']}") - log.info(f" Text: {sample['text'][:100]}..." if sample['text'] else " Text: (no text)") - - -if __name__ == '__main__': - main() diff --git a/dump_api_response.py b/dump_api_response.py deleted file mode 100644 index 3e21103..0000000 --- a/dump_api_response.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -""" -Quick script to dump API responses for debugging -""" -import json -from modules.api_interceptor import GoogleMapsAPIInterceptor -from seleniumbase import SB - -url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1" - -with SB(uc=True, headless=False) as sb: - # Set up interceptor BEFORE loading page - interceptor = GoogleMapsAPIInterceptor(sb.driver) - - sb.open(url) - sb.sleep(2) - - # Inject interceptor early - interceptor.inject_response_interceptor() - sb.sleep(2) - - # Click reviews tab - try: - sb.click('.LRkQ2:contains("Reseñas")', timeout=5) - except: - try: - sb.click('.LRkQ2:contains("Reviews")', timeout=5) - except: - pass - - print("Waiting for reviews to load...") - sb.sleep(5) - - # Scroll to trigger more requests - print("Scrolling to load more...") - for i in range(5): - sb.execute_script("window.scrollBy(0, 800)") - sb.sleep(2) - print(f" Scroll {i+1}/5...") - - print("\nCollecting responses...") - - # Get responses - responses = 
interceptor.get_intercepted_responses() - - print(f"\nCaptured {len(responses)} responses") - - # Dump to files - for i, resp in enumerate(responses): - filename = f"api_response_{i}.json" - with open(filename, 'w', encoding='utf-8') as f: - json.dump(resp, f, indent=2, ensure_ascii=False) - print(f"Saved: {filename} ({len(resp.get('body', ''))} bytes)") - - # Also save just the body for easier viewing - body_file = f"api_response_{i}_body.txt" - with open(body_file, 'w', encoding='utf-8') as f: - f.write(resp.get('body', '')) - print(f"Saved body: {body_file}") - - print("\nDone! Check api_response_*.json files") diff --git a/dump_api_responses.py b/dump_api_responses.py deleted file mode 100644 index 5f5ba0e..0000000 --- a/dump_api_responses.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 -""" -Dump raw API responses for analysis. -This will help us understand Google's exact response format. -""" -import json -import logging -from pathlib import Path -from seleniumbase import SB -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s") - -url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1" - -output_dir = Path("api_response_samples") -output_dir.mkdir(exist_ok=True) - -print(f"[INFO] Starting browser...") -with SB(uc=True, headless=False) as sb: - print("[INFO] Navigating to Google Maps...") - sb.open(url) - sb.sleep(3) - - # Inject interceptor FIRST - print("[INFO] Injecting API interceptor...") - interceptor = GoogleMapsAPIInterceptor(sb.driver) - interceptor.inject_response_interceptor() - sb.sleep(2) - - # Click reviews tab - print("[INFO] Looking for reviews tab...") - try: - sb.click('.LRkQ2', timeout=5) - print("[INFO] ✓ Clicked reviews tab") - except: - print("[WARN] Could not click reviews tab, trying to 
continue...") - - sb.sleep(5) - - # Scroll multiple times to trigger API calls - print("[INFO] Scrolling to trigger API calls...") - for i in range(10): - sb.execute_script("window.scrollBy(0, 800)") - sb.sleep(1.5) - - # Check every few scrolls - if (i + 1) % 3 == 0: - responses = interceptor.get_intercepted_responses() - if responses: - print(f"[INFO] Captured {len(responses)} responses so far...") - - # Final collection - print("\n[INFO] Collecting all captured responses...") - all_responses = interceptor.get_intercepted_responses() - - if not all_responses: - print("[ERROR] No responses captured!") - exit(1) - - print(f"[SUCCESS] Captured {len(all_responses)} API responses!\n") - - # Dump each response - for i, resp in enumerate(all_responses): - url_str = resp.get('url', 'unknown') - body = resp.get('body', '') - size = len(body) - - # Save full response - full_file = output_dir / f"response_{i:02d}_full.json" - with open(full_file, 'w', encoding='utf-8') as f: - json.dump(resp, f, indent=2, ensure_ascii=False) - - # Save just body for easier viewing - body_file = output_dir / f"response_{i:02d}_body.txt" - with open(body_file, 'w', encoding='utf-8') as f: - f.write(body) - - # Try to parse as JSON - if body.startswith(")]}'"): - clean_body = body[4:].strip() - else: - clean_body = body - - json_file = output_dir / f"response_{i:02d}_parsed.json" - try: - parsed = json.loads(clean_body) - with open(json_file, 'w', encoding='utf-8') as f: - json.dump(parsed, f, indent=2, ensure_ascii=False) - print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes)") - print(f" Full: {full_file}") - print(f" Body: {body_file}") - print(f" Parsed: {json_file}") - except: - print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes) [Not JSON]") - print(f" Full: {full_file}") - print(f" Body: {body_file}") - print() - - print(f"\n[SUCCESS] Dumped {len(all_responses)} responses to: {output_dir}/") - print("\nNext steps:") - print(" 1. 
Open response_00_parsed.json to study the structure") - print(" 2. Look for arrays containing review data") - print(" 3. Identify patterns for: review ID, author, rating, text, date") - print(" 4. Update the parser patterns in modules/api_interceptor.py") - -print("\n[DONE]") diff --git a/fast_api_scraper.py b/fast_api_scraper.py deleted file mode 100644 index fc5bbaa..0000000 --- a/fast_api_scraper.py +++ /dev/null @@ -1,249 +0,0 @@ -#!/usr/bin/env python3 -""" -Fast API scraper - Minimal browser usage, maximum API speed. - -Strategy: -1. Start browser and navigate to reviews page -2. Capture cookies and user-agent from browser -3. Let one API call happen naturally (to warm up the session) -4. Close browser -5. Use requests library with captured session to make fast API calls -6. Paginate through all reviews without any scrolling - -Expected: 10-25x faster than traditional scrolling approach. -""" -import json -import logging -import time -from typing import List, Optional, Tuple -import requests -from seleniumbase import SB -from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -class FastAPIScraper: - """Minimal browser, maximum speed.""" - - def __init__(self, url: str): - self.url = url - self.session = requests.Session() - self.place_id = None - self.interceptor = GoogleMapsAPIInterceptor(None) - - def bootstrap_session(self) -> bool: - """ - Quickly establish session using browser, then close it. 
- """ - log.info("Bootstrapping session with minimal browser usage...") - - try: - with SB(uc=True, headless=False) as sb: - # Navigate - log.info("Opening Google Maps...") - sb.open(self.url) - sb.sleep(2) - - # Dismiss cookies - try: - sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3) - except: - pass - - # Click reviews - try: - sb.click('.LRkQ2', timeout=5) - log.info("✓ Opened reviews tab") - sb.sleep(2) - except: - log.warning("Could not click reviews tab") - - # Wait a bit to ensure page is loaded - sb.sleep(1) - - # Extract place ID from URL or page - current_url = sb.get_current_url() - if '!1s' in current_url: - parts = current_url.split('!1s') - if len(parts) > 1: - self.place_id = parts[1].split('!')[0] - log.info(f"✓ Extracted place ID: {self.place_id}") - - # Get cookies from browser - do this while browser is still active - try: - browser_cookies = sb.driver.get_cookies() - log.debug(f"Got {len(browser_cookies)} cookies") - except Exception as e: - log.warning(f"Could not get cookies: {e}") - browser_cookies = [] - - # Get user agent - do this while browser is still active - try: - user_agent = sb.execute_script("return navigator.userAgent") - log.debug(f"User agent: {user_agent[:50]}...") - except Exception as e: - log.warning(f"Could not get user agent: {e}") - user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' - - # Now process cookies and headers (browser context manager still open) - for cookie in browser_cookies: - try: - self.session.cookies.set( - name=cookie['name'], - value=cookie['value'], - domain=cookie.get('domain', '.google.com'), - path=cookie.get('path', '/') - ) - except Exception as e: - log.debug(f"Could not set cookie {cookie.get('name')}: {e}") - - # Set headers - self.session.headers.update({ - 'User-Agent': user_agent, - 'Accept': '*/*', - 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8', - 'Referer': 'https://www.google.com/maps/', - 'Origin': 
'https://www.google.com', - 'X-Requested-With': 'XMLHttpRequest', - }) - - log.info(f"✅ Session bootstrapped!") - log.info(f" Cookies: {len(browser_cookies)}") - log.info(f" Place ID: {self.place_id}") - - # Let browser stay open for a moment to ensure all operations complete - sb.sleep(1) - - return True - - except Exception as e: - log.error(f"Bootstrap failed: {e}") - import traceback - traceback.print_exc() - return False - - def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]: - """Fetch a page of reviews via API.""" - - # Build pb parameter - if continuation_token: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - else: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - - params = { - 'authuser': '0', - 'hl': 'es', - 'gl': 'es', - 'pb': pb - } - - try: - url = 'https://www.google.com/maps/rpc/listugcposts' - response = self.session.get(url, params=params, timeout=10) - - if response.status_code != 200: - log.error(f"API error {response.status_code}") - log.error(f"Response: {response.text[:300]}") - return [], None - - # Parse - body = response.text - if body.startswith(")]}'"): - body = body[4:].strip() - - data = json.loads(body) - reviews = self.interceptor._parse_listugcposts_response(data) - - # Next token - next_token = None - if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str): - next_token = data[1] - - return reviews, next_token - - except Exception as e: - log.error(f"Request failed: {e}") - return [], None - - def scrape_all(self, max_pages: int = 100) -> List[dict]: - """ - Main scraping method. 
- """ - # Bootstrap - if not self.bootstrap_session(): - return [] - - # Scrape via API - log.info("\n" + "="*60) - log.info("STARTING FAST API SCRAPING") - log.info("="*60 + "\n") - - start_time = time.time() - all_reviews = [] - seen_ids = set() - token = None - page = 0 - - while page < max_pages: - page += 1 - - log.info(f"Fetching page {page}...") - reviews, token = self.fetch_reviews_page(token) - - if not reviews: - log.info("No more reviews") - break - - # Dedup - for review in reviews: - rid = review.review_id or f"{review.author}_{review.date_text}" - if rid not in seen_ids: - seen_ids.add(rid) - all_reviews.append({ - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - }) - - log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}") - - if not token: - break - - time.sleep(0.2) # Small delay - - elapsed = time.time() - start_time - - log.info("\n" + "="*60) - log.info("✅ FAST API SCRAPING COMPLETED!") - log.info("="*60) - log.info(f"Reviews: {len(all_reviews)}") - log.info(f"Pages: {page}") - log.info(f"Time: {elapsed:.2f} seconds") - log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - log.info("="*60 + "\n") - - return all_reviews - - -def main(): - url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1" - - scraper = FastAPIScraper(url) - reviews = scraper.scrape_all(max_pages=50) - - # Save - with open('fast_api_reviews.json', 'w', encoding='utf-8') as f: - json.dump(reviews, f, indent=2, ensure_ascii=False) - - log.info(f"Saved to fast_api_reviews.json") - - -if __name__ == '__main__': - main() diff --git a/find_actual_reviews.py b/find_actual_reviews.py deleted file mode 100644 index 948e0cc..0000000 --- a/find_actual_reviews.py +++ /dev/null @@ -1,156 
+0,0 @@ -#!/usr/bin/env python3 -""" -Find the ACTUAL selector for reviews by looking for elements with review structure. -""" - -import time -from seleniumbase import Driver - -url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en" - -driver = Driver(uc=True, headless=False) - -try: - driver.get(url) - time.sleep(5) - - # GDPR - try: - form_btns = driver.find_elements('css selector', 'form button') - for btn in form_btns: - if 'accept all' in (btn.text or '').lower(): - btn.click() - time.sleep(2) - break - except: - pass - - # Click reviews tab - time.sleep(2) - tabs = driver.find_elements('css selector', 'button[role="tab"]') - for tab in tabs: - if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower(): - driver.execute_script("arguments[0].click();", tab) - time.sleep(5) - break - - # Scroll to load reviews - try: - pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde') - for _ in range(3): - driver.execute_script("arguments[0].scrollBy(0, 500);", pane) - time.sleep(1) - except: - pass - - # Use JavaScript to find ALL elements that look like reviews - print("\n" + "="*80) - print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:") - print("="*80) - - review_info = driver.execute_script(""" - // Find all elements that have BOTH a rating AND substantial text - const allDivs = Array.from(document.querySelectorAll('div')); - - const reviews = []; - - for (let div of allDivs) { - // Must have a rating (star aria-label) - const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]'); - if (!ratingElem) continue; - - // Must have decent text content (>50 chars to avoid buttons) - if (div.textContent.length < 50) continue; - - // Get the classes and attributes - const info = { - classes: div.className, - has_author: !!div.querySelector('button, [aria-label*="photo" i]'), - has_avatar: !!div.querySelector('img'), - has_date: 
!!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i), - text_length: div.textContent.length, - sample_text: div.textContent.substring(0, 150), - tag_name: div.tagName, - jslog: div.getAttribute('jslog'), - data_review_id: div.getAttribute('data-review-id'), - jsaction: div.getAttribute('jsaction') - }; - - reviews.push(info); - } - - return { - total_found: reviews.length, - first_5: reviews.slice(0, 5) - }; - """) - - print(f"\nFound {review_info['total_found']} elements with review structure") - print(f"\nFirst 5 review-like elements:") - for i, rev in enumerate(review_info['first_5'], 1): - print(f"\n Review {i}:") - print(f" Classes: {rev['classes']}") - print(f" Has author: {rev['has_author']}") - print(f" Has avatar: {rev['has_avatar']}") - print(f" Has date: {rev['has_date']}") - print(f" Text length: {rev['text_length']}") - print(f" jslog: {rev['jslog']}") - print(f" data-review-id: {rev['data_review_id']}") - print(f" Sample: {rev['sample_text'][:80]}...") - - # Try to find a common class among review elements - if review_info['total_found'] > 0: - print("\n" + "="*80) - print("FINDING COMMON SELECTOR:") - print("="*80) - - common_selector = driver.execute_script(""" - // Find common classes among review elements - const reviews = []; - const allDivs = Array.from(document.querySelectorAll('div')); - - for (let div of allDivs) { - const ratingElem = div.querySelector('[aria-label*="star" i]'); - if (ratingElem && div.textContent.length > 50) { - reviews.push(div); - } - } - - if (reviews.length === 0) return null; - - // Get classes from first review - const firstClasses = reviews[0].className.split(' ').filter(c => c.length > 0); - - // Find classes that appear in ALL reviews - const commonClasses = firstClasses.filter(cls => { - return reviews.every(rev => rev.classList.contains(cls)); - }); - - return { - total_reviews: reviews.length, - common_classes: commonClasses, - suggested_selector: commonClasses.length > 0 ? 'div.' 
+ commonClasses.join('.') : null, - first_review_classes: reviews[0].className - }; - """) - - if common_selector: - print(f"Total review elements: {common_selector['total_reviews']}") - print(f"Common classes: {common_selector['common_classes']}") - print(f"Suggested selector: {common_selector['suggested_selector']}") - print(f"First review full classes: {common_selector['first_review_classes']}") - - # Test the suggested selector - if common_selector['suggested_selector']: - test_count = driver.execute_script( - f"return document.querySelectorAll('{common_selector['suggested_selector']}').length;" - ) - print(f"\nTesting suggested selector: Found {test_count} elements") - - print("\n" + "="*80) - print("Browser staying open for manual inspection (60s)...") - print("="*80) - time.sleep(60) - -finally: - driver.quit() diff --git a/header_capture_scraper.py b/header_capture_scraper.py deleted file mode 100644 index ff228b0..0000000 --- a/header_capture_scraper.py +++ /dev/null @@ -1,305 +0,0 @@ -#!/usr/bin/env python3 -""" -Header Capture Scraper - Capture COMPLETE request from browser (headers + cookies). - -This captures the exact request the browser makes, including ALL headers and cookies, -then replays it for fast API scraping. 
-""" -import json -import logging -import time -from typing import List, Optional, Tuple -import requests -from seleniumbase import SB -from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -class HeaderCaptureScraper: - """Capture complete request, then replay for fast scraping.""" - - def __init__(self, url: str, headless: bool = False): - self.url = url - self.headless = headless - self.captured_request = None - self.place_id = None - self.session = requests.Session() - self.interceptor = GoogleMapsAPIInterceptor(None) - - def capture_request(self) -> bool: - """ - Capture a complete API request (URL, headers, cookies) from browser. - """ - log.info("="*60) - log.info("Capturing request from browser...") - log.info("="*60) - - sb_context = None - sb = None - - try: - log.info("Starting browser...") - sb_context = SB(uc=True, headless=self.headless) - sb = sb_context.__enter__() - - sb.open(self.url) - time.sleep(2) - - # Dismiss cookies - try: - sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3) - except: - pass - - # Click reviews - try: - sb.click('.LRkQ2', timeout=5) - log.info("✓ Opened reviews") - time.sleep(2) - except: - pass - - # Enable CDP network monitoring - sb.driver.execute_cdp_cmd('Network.enable', {}) - log.info("✓ Network monitoring enabled") - - # Scroll to trigger API call - log.info("Scrolling to trigger API request...") - sb.execute_script("window.scrollBy(0, 800)") - time.sleep(3) - - # Get network logs from CDP - log.info("Checking network logs...") - logs = sb.driver.get_log('browser') - - # Alternatively, use execute_cdp_cmd to get network events - # But simpler: Let's inject JS to capture the request - capture_script = """ - window.__capturedRequest = null; - - const originalFetch = window.fetch; - window.fetch = function(...args) { - const url = 
args[0].toString(); - if (url.includes('listugcposts')) { - console.log('[CAPTURE] Intercepted request to:', url); - window.__capturedRequest = { - url: url, - method: 'GET' - }; - } - return originalFetch.apply(this, args); - }; - - const originalXHR = window.XMLHttpRequest; - window.XMLHttpRequest = function() { - const xhr = new originalXHR(); - const originalOpen = xhr.open; - - xhr.open = function(method, url, ...rest) { - if (url.includes('listugcposts')) { - console.log('[CAPTURE] Intercepted XHR:', url); - window.__capturedRequest = { - url: url, - method: method - }; - } - return originalOpen.apply(this, [method, url, ...rest]); - }; - - return xhr; - }; - - console.log('[CAPTURE] Request interceptor ready'); - """ - - sb.execute_script(capture_script) - log.info("✓ Request interceptor injected") - - # Scroll again to trigger request - log.info("Scrolling to capture request...") - for i in range(3): - sb.execute_script("window.scrollBy(0, 600)") - time.sleep(2) - - captured = sb.execute_script("return window.__capturedRequest") - if captured: - log.info(f"✓ Captured request URL!") - self.captured_request = captured - break - - if not self.captured_request: - log.error("Failed to capture request") - return False - - # Extract place ID from URL - url = self.captured_request['url'] - if '!1s' in url: - import urllib.parse - parsed = urllib.parse.urlparse(url) - params = urllib.parse.parse_qs(parsed.query) - pb = params.get('pb', [''])[0] - if '!1s' in pb: - self.place_id = pb.split('!1s')[1].split('!')[0] - - # Now capture ALL cookies via CDP - cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {}) - all_cookies = cdp_cookies.get('cookies', []) - - # Set cookies in session - for cookie in all_cookies: - self.session.cookies.set( - name=cookie['name'], - value=cookie['value'], - domain=cookie.get('domain', '.google.com'), - path=cookie.get('path', '/') - ) - - # Get user agent - user_agent = sb.execute_script("return navigator.userAgent") - - # 
Set headers to match browser - self.session.headers.update({ - 'User-Agent': user_agent, - 'Accept': '*/*', - 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8', - 'Referer': 'https://www.google.com/maps/', - 'Origin': 'https://www.google.com', - 'X-Requested-With': 'XMLHttpRequest', - }) - - log.info(f"\n✅ Request captured successfully!") - log.info(f" Place ID: {self.place_id}") - log.info(f" Cookies: {len(all_cookies)}") - log.info(f" Cookie names: {', '.join([c['name'] for c in all_cookies[:10]])}") - - return True - - except Exception as e: - log.error(f"Capture failed: {e}") - import traceback - traceback.print_exc() - return False - - finally: - if sb_context: - try: - log.info("Closing browser...") - sb_context.__exit__(None, None, None) - log.info("✓ Browser closed\n") - except: - pass - - def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]: - """Fetch reviews using captured session.""" - - if continuation_token: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - else: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - - params = { - 'authuser': '0', - 'hl': 'es', - 'gl': 'es', - 'pb': pb - } - - try: - url = 'https://www.google.com/maps/rpc/listugcposts' - response = self.session.get(url, params=params, timeout=10) - - if response.status_code != 200: - log.error(f"API error {response.status_code}: {response.text[:200]}") - return [], None - - body = response.text - if body.startswith(")]}'"): - body = body[4:].strip() - - data = json.loads(body) - reviews = self.interceptor._parse_listugcposts_response(data) - - next_token = None - if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str): - next_token = data[1] - - return 
reviews, next_token - - except Exception as e: - log.error(f"Request failed: {e}") - return [], None - - def scrape_all(self, max_pages: int = 50) -> List[dict]: - """Main scraping method.""" - - if not self.capture_request(): - return [] - - log.info("="*60) - log.info("Fast API scraping...") - log.info("="*60) - - start_time = time.time() - all_reviews = [] - seen_ids = set() - token = None - page = 0 - - while page < max_pages: - page += 1 - log.info(f"Page {page}...") - - reviews, token = self.fetch_reviews_page(token) - - if not reviews: - break - - for review in reviews: - rid = review.review_id or f"{review.author}_{review.date_text}" - if rid not in seen_ids: - seen_ids.add(rid) - all_reviews.append({ - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - }) - - log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}") - - if not token: - break - - time.sleep(0.2) - - elapsed = time.time() - start_time - - log.info(f"\n{'='*60}") - log.info(f"✅ COMPLETED!") - log.info(f"{'='*60}") - log.info(f"Reviews: {len(all_reviews)}") - log.info(f"Time: {elapsed:.2f}s") - log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - log.info(f"{'='*60}\n") - - return all_reviews - - -def main(): - url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1" - - scraper = HeaderCaptureScraper(url, headless=False) - reviews = scraper.scrape_all() - - if reviews: - with open('header_capture_reviews.json', 'w', encoding='utf-8') as f: - json.dump(reviews, f, indent=2, ensure_ascii=False) - log.info(f"Saved to header_capture_reviews.json") - - -if __name__ == '__main__': - main() diff --git a/hybrid_api_scraper.py b/hybrid_api_scraper.py deleted file mode 100644 index b272899..0000000 --- 
a/hybrid_api_scraper.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python3 -""" -Hybrid API scraper - Capture session from browser, then use direct API calls. -This combines the best of both worlds: -1. Browser establishes authentic session with Google -2. We capture ALL headers from real XHR requests -3. Replay those headers in direct API calls -4. No scrolling needed - just fast API pagination - -Expected speed: 10-25x faster than traditional browser scrolling. -""" -import json -import logging -import time -from typing import List, Optional, Tuple, Dict -import requests -from seleniumbase import SB -from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -class HybridAPIScraper: - """ - Capture session from browser, then scrape via direct API calls. - """ - - def __init__(self, url: str, headless: bool = False): - """ - Initialize the hybrid scraper. - - Args: - url: Google Maps place URL - headless: Run browser in headless mode - """ - self.url = url - self.headless = headless - self.captured_headers = None - self.place_id = None - self.session = requests.Session() - - # Initialize parser - self.interceptor = GoogleMapsAPIInterceptor(None) - - def capture_session_from_browser(self) -> bool: - """ - Start a browser session, capture headers from actual API requests. 
- - Returns: - True if session captured successfully - """ - log.info("Starting browser to capture session headers...") - - try: - with SB(uc=True, headless=self.headless) as sb: - # Navigate to the place - log.info(f"Navigating to: {self.url[:80]}...") - sb.open(self.url) - sb.sleep(3) - - # Dismiss cookie consent - try: - sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=5) - log.info("Cookie dialog dismissed") - except: - pass - - # Click reviews tab - log.info("Opening reviews...") - try: - sb.click('.LRkQ2', timeout=5) - sb.sleep(3) - except: - log.warning("Could not click reviews tab") - - # Enable Chrome DevTools Protocol for network monitoring - log.info("Enabling network interception...") - sb.driver.execute_cdp_cmd('Network.enable', {}) - - # Store captured requests - captured_requests = [] - - # Create event listener for network requests - def add_request_listener(): - """Inject JS to capture fetch/XHR requests with headers.""" - script = """ - window.__capturedRequests = []; - - // Capture fetch - const originalFetch = window.fetch; - window.fetch = function(...args) { - const url = args[0].toString(); - if (url.includes('listugcposts')) { - console.log('[CAPTURE] Fetch to:', url); - // Can't easily get headers from fetch without cloning - } - return originalFetch.apply(this, args); - }; - - // Capture XHR (more reliable for headers) - const originalXHR = window.XMLHttpRequest; - window.XMLHttpRequest = function() { - const xhr = new originalXHR(); - const originalOpen = xhr.open; - const originalSetRequestHeader = xhr.setRequestHeader; - const headers = {}; - - xhr.setRequestHeader = function(name, value) { - headers[name.toLowerCase()] = value; - return originalSetRequestHeader.apply(this, arguments); - }; - - xhr.open = function(method, url, ...rest) { - if (url.includes('listugcposts')) { - console.log('[CAPTURE] XHR to:', url); - window.__capturedRequests.push({ - url: url, - method: method, - headers: {...headers} 
- }); - } - return originalOpen.apply(this, [method, url, ...rest]); - }; - - return xhr; - }; - - console.log('[CAPTURE] Request capture initialized'); - """ - sb.execute_script(script) - - add_request_listener() - - # Scroll to trigger an API call - log.info("Scrolling to trigger API request...") - for i in range(5): - sb.execute_script("window.scrollBy(0, 800)") - sb.sleep(1.5) - - # Check captured requests - captured_requests = sb.execute_script("return window.__capturedRequests || []") - if captured_requests: - log.info(f"✓ Captured {len(captured_requests)} API request(s)!") - break - - captured_request = captured_requests[0] if captured_requests else {} - - if not captured_request: - log.error("Failed to capture API request") - return False - - # Extract place ID from URL - if 'place_id:' in self.url: - self.place_id = self.url.split('place_id:')[1].split('&')[0].split('/')[0] - elif '!1s' in captured_request['url']: - # Extract from pb parameter - import urllib.parse - parsed = urllib.parse.urlparse(captured_request['url']) - params = urllib.parse.parse_qs(parsed.query) - pb = params.get('pb', [''])[0] - if '!1s' in pb: - self.place_id = pb.split('!1s')[1].split('!')[0] - - # Store captured headers - self.captured_headers = captured_request['headers'] - - # Also get cookies from browser - cookies = sb.driver.get_cookies() - for cookie in cookies: - self.session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain')) - - log.info(f"\n{'='*60}") - log.info("✅ Session captured successfully!") - log.info(f"{'='*60}") - log.info(f"Place ID: {self.place_id}") - log.info(f"Headers captured: {len(self.captured_headers)}") - log.info(f"Cookies captured: {len(cookies)}") - log.info(f"{'='*60}\n") - - # Print sample headers for debugging - log.debug("Sample headers:") - for key in ['cookie', 'x-goog-api-key', 'authorization', 'user-agent']: - if key in self.captured_headers: - value = self.captured_headers[key] - preview = value[:50] + '...' 
if len(value) > 50 else value - log.debug(f" {key}: {preview}") - - return True - - except Exception as e: - log.error(f"Failed to capture session: {e}") - import traceback - traceback.print_exc() - return False - - def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]: - """ - Fetch reviews page using captured session. - - Args: - continuation_token: Pagination token - - Returns: - Tuple of (reviews, next_token) - """ - # Build pb parameter - if continuation_token: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - else: - pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - - params = { - 'authuser': '0', - 'hl': 'es', - 'gl': 'es', - 'pb': pb - } - - try: - log.info(f"Fetching page (token: {'initial' if not continuation_token else 'paginated'})...") - - # Make request with captured headers - url = 'https://www.google.com/maps/rpc/listugcposts' - response = self.session.get(url, params=params, headers=self.captured_headers, timeout=10) - - log.debug(f"Response status: {response.status_code}") - - if response.status_code != 200: - log.error(f"API error {response.status_code}: {response.text[:500]}") - return [], None - - # Parse response - body = response.text - if body.startswith(")]}'"): - body = body[4:].strip() - - data = json.loads(body) - - # Extract reviews - reviews = self.interceptor._parse_listugcposts_response(data) - - # Get next token - next_token = None - if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str): - next_token = data[1] - - log.info(f"✓ Extracted {len(reviews)} reviews") - - return reviews, next_token - - except Exception as e: - log.error(f"API request failed: {e}") - return [], None - - def 
scrape_all_reviews(self, max_pages: int = 100, delay: float = 0.3) -> List[dict]: - """ - Scrape all reviews using hybrid approach. - - Args: - max_pages: Maximum pages to fetch - delay: Delay between API calls - - Returns: - List of review dictionaries - """ - # Step 1: Capture session from browser - if not self.capture_session_from_browser(): - log.error("Failed to capture session - aborting") - return [] - - # Step 2: Fetch all reviews via API - log.info("\nStarting API-based scraping (no browser needed!)...") - start_time = time.time() - - all_reviews = [] - seen_ids = set() - continuation_token = None - page = 0 - - while page < max_pages: - page += 1 - - reviews, continuation_token = self.fetch_reviews_page(continuation_token) - - if not reviews: - log.info("No more reviews found") - break - - # Deduplicate - for review in reviews: - review_id = review.review_id or f"{review.author}_{review.date_text}" - if review_id not in seen_ids: - seen_ids.add(review_id) - all_reviews.append({ - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - }) - - log.info(f"Page {page}: {len(all_reviews)} total unique reviews") - - if not continuation_token: - log.info("No continuation token - finished") - break - - if delay > 0: - time.sleep(delay) - - elapsed = time.time() - start_time - - log.info(f"\n{'='*60}") - log.info(f"✅ API SCRAPING COMPLETED!") - log.info(f"{'='*60}") - log.info(f"Total reviews: {len(all_reviews)}") - log.info(f"API calls: {page}") - log.info(f"Time (API only): {elapsed:.2f} seconds") - log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second") - log.info(f"{'='*60}\n") - - return all_reviews - - -def main(): - """Example usage.""" - url = 
"https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1" - - scraper = HybridAPIScraper(url, headless=False) - reviews = scraper.scrape_all_reviews(max_pages=50, delay=0.3) - - # Save results - output_file = 'hybrid_api_reviews.json' - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(reviews, f, indent=2, ensure_ascii=False) - - log.info(f"Saved {len(reviews)} reviews to {output_file}") - - # Show sample - if reviews: - log.info("\nSample review:") - sample = reviews[0] - log.info(f" Author: {sample['author']}") - log.info(f" Rating: {sample['rating']}★") - log.info(f" Text: {sample['text'][:80]}..." if sample['text'] else " Text: (none)") - - -if __name__ == '__main__': - main() diff --git a/inspect_pane_content.py b/inspect_pane_content.py deleted file mode 100644 index fb95a94..0000000 --- a/inspect_pane_content.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -""" -Check what's actually inside the reviews pane after scrolling. 
-""" - -import time -from seleniumbase import Driver - -url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en" - -driver = Driver(uc=True, headless=False) - -try: - driver.get(url) - time.sleep(5) - - # GDPR - try: - form_btns = driver.find_elements('css selector', 'form button') - for btn in form_btns: - if 'accept all' in (btn.text or '').lower(): - btn.click() - time.sleep(2) - break - except: - pass - - # Click reviews tab - time.sleep(2) - tabs = driver.find_elements('css selector', 'button[role="tab"]') - review_tab_found = False - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'") - if 'review' in text or 'review' in aria: - print(f" -> Clicking this tab!") - driver.execute_script("arguments[0].click();", tab) - time.sleep(6) # Wait longer - review_tab_found = True - break - - if not review_tab_found: - print("WARNING: Reviews tab not found!") - - # Find and scroll the pane - print("\nLooking for scrollable pane...") - pane = None - try: - pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde') - print(f"Found pane: div.m6QErb.WNBkOb.XiKgde") - except: - print("Pane not found with standard selector!") - try: - pane = driver.find_element('css selector', 'div.m6QErb') - print(f"Found pane: div.m6QErb") - except: - print("No pane found at all!") - - if pane: - print("\nScrolling pane to load reviews...") - for i in range(15): - driver.execute_script("arguments[0].scrollBy(0, 400);", pane) - time.sleep(0.4) - if (i + 1) % 5 == 0: - print(f" Scrolled {i+1} times...") - - # Now check what's in the pane - print("\n" + "="*80) - print("ANALYZING PANE CONTENT") - print("="*80) - - content_info = driver.execute_script(""" - const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb'); - if (!pane) return {error: 'No pane found'}; - - // 
Get all child divs (direct and nested) - const allDivs = Array.from(pane.querySelectorAll('div')); - - // Get all unique class names used - const classNames = new Set(); - allDivs.forEach(div => { - if (div.className) { - div.className.split(' ').forEach(cls => { - if (cls.trim()) classNames.add(cls.trim()); - }); - } - }); - - // Find divs with ratings - const divsWithRatings = allDivs.filter(div => { - return !!div.querySelector('[aria-label*="star" i]'); - }); - - // Find divs with author photos - const divsWithPhotos = allDivs.filter(div => { - return !!div.querySelector('img[src*="photo"], img[src*="avatar"]'); - }); - - // Find divs with date patterns - const divsWithDates = allDivs.filter(div => { - return !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i); - }); - - // Find divs with ALL three - const reviewLikeDivs = allDivs.filter(div => { - const hasRating = !!div.querySelector('[aria-label*="star" i]'); - const hasPhoto = !!div.querySelector('img'); - const hasDate = !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|ago)/i); - const textLen = div.textContent.length; - return hasRating && hasPhoto && hasDate && textLen > 50 && textLen < 2000; - }); - - return { - total_divs: allDivs.length, - unique_classes: Array.from(classNames).sort(), - divs_with_ratings: divsWithRatings.length, - divs_with_photos: divsWithPhotos.length, - divs_with_dates: divsWithDates.length, - review_like_divs: reviewLikeDivs.length, - review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({ - classes: d.className, - text_length: d.textContent.length, - sample: d.textContent.substring(0, 100) - })) - }; - """) - - if 'error' in content_info: - print(f"ERROR: {content_info['error']}") - else: - print(f"\nTotal divs in pane: {content_info['total_divs']}") - print(f"Divs with ratings: {content_info['divs_with_ratings']}") - print(f"Divs with photos: {content_info['divs_with_photos']}") - print(f"Divs with dates: {content_info['divs_with_dates']}") - 
print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}") - - print(f"\nFirst 20 unique classes found in pane:") - for cls in content_info['unique_classes'][:20]: - print(f" {cls}") - - if content_info['review_like_divs'] > 0: - print(f"\nFirst 5 review-like divs:") - for i, div_info in enumerate(content_info['review_like_classes'], 1): - print(f"\n Div {i}:") - print(f" Classes: {div_info['classes']}") - print(f" Text length: {div_info['text_length']}") - print(f" Sample: {div_info['sample'][:80]}...") - - print(f"\n{'='*80}") - print("Browser staying open for manual inspection (120 seconds)...") - print("Look at the DevTools to see the actual review elements!") - print(f"{'='*80}") - time.sleep(120) - -finally: - driver.quit() diff --git a/manual_inspect.py b/manual_inspect.py deleted file mode 100644 index 6b48232..0000000 --- a/manual_inspect.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -""" -Open the page and keep it open for manual inspection. -INSTRUCTIONS: -1. Open DevTools (F12) -2. Click on an individual review -3. Look at the div that contains ONE review (not the whole list) -4. 
Note the class names on that div -""" - -import time -from seleniumbase import Driver - -url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en" - -driver = Driver(uc=True, headless=False) - -try: - driver.get(url) - time.sleep(5) - - # GDPR - try: - form_btns = driver.find_elements('css selector', 'form button') - for btn in form_btns: - if 'accept all' in (btn.text or '').lower(): - btn.click() - time.sleep(2) - break - except: - pass - - # Click reviews tab - time.sleep(2) - tabs = driver.find_elements('css selector', 'button[role="tab"]') - for tab in tabs: - if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower(): - driver.execute_script("arguments[0].click();", tab) - time.sleep(5) - break - - # Scroll to load a few reviews - try: - pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde') - for _ in range(5): - driver.execute_script("arguments[0].scrollBy(0, 300);", pane) - time.sleep(0.5) - except: - pass - - print("\n" + "="*80) - print("MANUAL INSPECTION TIME!") - print("="*80) - print("\n1. The browser is now showing the reviews page") - print("2. Open DevTools (F12 or right-click > Inspect)") - print("3. Click the 'Select element' tool (top-left of DevTools)") - print("4. Hover over an INDIVIDUAL review (not the whole panel)") - print("5. Click on it to select it in the inspector") - print("6. Look at the
@dataclass
class InterceptedReview:
    """Data class for a review extracted from an intercepted API response.

    Every field has an empty/zero default, so an instance can be created
    without arguments and filled in field by field by the parsers.
    """
    review_id: str = ""            # review identifier string
    author: str = ""               # reviewer display name
    rating: float = 0.0            # star rating; parsers treat values <= 0 as "not found"
    text: str = ""                 # review body text
    date_text: str = ""            # date as provided by the source (free-form string)
    timestamp: int = 0             # NOTE(review): unit/epoch not visible here - confirm before use
    likes: int = 0                 # like / thumbs-up count
    photos: List[str] = field(default_factory=list)  # photo URLs; a fresh list per instance
    profile_url: str = ""          # reviewer profile URL
    avatar_url: str = ""           # reviewer profile picture URL
    owner_response: str = ""       # text of the owner's reply, if any
    owner_response_date: str = ""  # date string of the owner's reply
    lang: str = ""                 # presumably a language code for `text` - verify against callers
def setup_interception(self):
    """Enable passive network monitoring via the Chrome DevTools Protocol.

    Only the CDP ``Network`` domain is enabled. Request interception is
    deliberately NOT switched on: ``Network.setRequestInterception`` is
    deprecated and pauses every matching request until it is explicitly
    continued. Nothing in this class continues intercepted requests, so
    enabling it would leave the review XHR/fetch calls hanging.

    Returns:
        bool: True when monitoring was enabled (``self._listening`` is set),
        otherwise the result of the ``_setup_performance_logging`` fallback.
    """
    try:
        # Enable network domain (observation only, traffic is never paused)
        self.driver.execute_cdp_cmd('Network.enable', {})

        self._listening = True
        log.info("API interception enabled via CDP")
        return True

    except Exception as e:
        log.warning(f"Could not enable CDP interception: {e}")
        # Try alternative approach
        return self._setup_performance_logging()
def get_response_bodies_cdp(self):
    """Drain the responses buffered in the page by the injected JS interceptor.

    Reads ``window.__interceptedResponses`` in the browser, resets that
    buffer to an empty array and returns what was in it.

    Returns:
        list: the buffered response entries, or an empty list when the
        buffer does not exist, is empty, or the script call fails.
    """
    responses = []

    # execute_script runs the snippet as a function body, so the value has to
    # be handed back with a top-level `return`. Wrapping the code in an IIFE
    # without returning its result always yields None on the Python side.
    drain_script = """
        if (window.__interceptedResponses) {
            var responses = window.__interceptedResponses;
            window.__interceptedResponses = [];
            return responses;
        }
        return [];
    """

    try:
        # No CDP round trip is needed here: the data lives in the page itself.
        captured = self.driver.execute_script(drain_script)
        if captured:
            responses.extend(captured)

    except Exception as e:
        log.debug(f"CDP response capture error: {e}")

    return responses
Date().toISOString(); - - // Keep only last 100 responses to avoid memory issues - if (window.__interceptedResponses.length > 100) { - window.__interceptedResponses = window.__interceptedResponses.slice(-50); - } - } catch (e) { - console.error('[API Interceptor] Response capture error:', e); - } - } - - return response; - }; - - // Store original XMLHttpRequest - const originalXHR = window.XMLHttpRequest; - - // Create intercepting XHR - window.XMLHttpRequest = function() { - const xhr = new originalXHR(); - const originalOpen = xhr.open; - const originalSend = xhr.send; - let requestUrl = ''; - - xhr.open = function(method, url, ...rest) { - requestUrl = url; - window.__interceptorStats.totalXHR++; - console.debug('[API Interceptor] XHR:', method, url.substring(0, 150)); - return originalOpen.apply(this, [method, url, ...rest]); - }; - - xhr.addEventListener('load', function() { - if (requestUrl.includes('review') || requestUrl.includes('batchexecute') || - requestUrl.includes('place') || requestUrl.includes('maps') || - requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) { - try { - console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length); - - window.__interceptedResponses.push({ - url: requestUrl, - body: xhr.responseText, - timestamp: Date.now(), - type: 'xhr', - status: xhr.status, - size: xhr.responseText.length - }); - - window.__interceptorStats.capturedXHR++; - window.__interceptorStats.lastCapture = new Date().toISOString(); - - if (window.__interceptedResponses.length > 100) { - window.__interceptedResponses = window.__interceptedResponses.slice(-50); - } - } catch (e) { - console.error('[API Interceptor] XHR capture error:', e); - } - } - }); - - return xhr; - }; - - // Copy static properties - for (let prop of Object.getOwnPropertyNames(originalXHR)) { - try { - window.XMLHttpRequest[prop] = originalXHR[prop]; - } catch (e) {} - } - - console.log('[API Interceptor] ✅ 
def get_intercepted_responses(self):
    """Fetch and clear the responses collected by the in-page interceptor.

    Returns the entries copied out of ``window.__interceptedResponses``
    (the page-side buffer is emptied), or an empty list when nothing was
    buffered or anything goes wrong while reading or logging them.
    """
    drain_js = """
        if (window.__interceptedResponses) {
            var responses = window.__interceptedResponses.slice();
            window.__interceptedResponses = [];
            return responses;
        }
        return [];
        """
    try:
        drained = self.driver.execute_script(drain_js)

        if not drained:
            log.debug("No intercepted responses available")
            return []

        log.debug(f"Retrieved {len(drained)} intercepted responses from browser")
        # Log first 3 for debugging
        for entry in drained[:3]:
            kind = entry.get('type', '?').upper()
            short_url = entry.get('url', '')[:100]
            size = entry.get('size', 0)
            log.debug(f" - {kind}: {short_url} ({size} bytes)")

        return drained
    except Exception as e:
        log.debug(f"Error getting intercepted responses: {e}")
        return []
interceptor stats: {e}") - return None - - def get_browser_console_logs(self): - """Get browser console logs (for debugging)""" - try: - logs = self.driver.get_log('browser') - return logs - except Exception as e: - log.debug(f"Could not get browser console logs: {e}") - return [] - - def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"): - """ - Dump captured responses to files for debugging. - Creates one file per response with metadata and body. - """ - try: - output_path = Path(output_dir) - output_path.mkdir(exist_ok=True) - - for i, response in enumerate(responses): - timestamp = response.get('timestamp', int(time.time() * 1000)) - url = response.get('url', 'unknown') - req_type = response.get('type', 'unknown') - - # Create filename from timestamp and type - filename = f"{timestamp}_{req_type}_{i}.json" - filepath = output_path / filename - - # Write response with metadata - with open(filepath, 'w', encoding='utf-8') as f: - json.dump({ - 'metadata': { - 'url': url, - 'type': req_type, - 'timestamp': timestamp, - 'size': response.get('size', len(response.get('body', ''))), - 'status': response.get('status') - }, - 'body': response.get('body', '') - }, f, indent=2, ensure_ascii=False) - - log.info(f"Dumped {len(responses)} responses to {output_path}") - return str(output_path) - - except Exception as e: - log.error(f"Error dumping responses to file: {e}") - return None - - def _is_review_api(self, url: str) -> bool: - """Check if URL matches review API patterns""" - url_lower = url.lower() - return any(re.search(pattern, url_lower) for pattern in self.REVIEW_API_PATTERNS) - - def _get_response_body(self, request_id: str) -> Optional[str]: - """Get response body for a request ID using CDP""" - try: - result = self.driver.execute_cdp_cmd('Network.getResponseBody', { - 'requestId': request_id - }) - - body = result.get('body', '') - if result.get('base64Encoded'): - body = base64.b64decode(body).decode('utf-8', 
errors='ignore') - - return body - except Exception as e: - log.debug(f"Could not get response body for {request_id}: {e}") - return None - - def parse_reviews_from_responses(self, responses: List[Dict]) -> List[InterceptedReview]: - """ - Parse review data from captured API responses. - Google's API responses use a custom nested array format. - """ - reviews = [] - - for response in responses: - try: - body = response.get('body', '') - url = response.get('url', '') - - # Skip non-JSON responses - if not body or body.startswith(' List[InterceptedReview]: - """Parse a single response body for review data""" - reviews = [] - - # Skip empty or HTML responses - if not body or body.startswith(' List[InterceptedReview]: - """ - Parse Google Maps listugcposts API response. - - Structure discovered: - data[2] = array of review groups - data[2][i] = single review group [review_data, metadata, continuation_token] - data[2][i][0] = review data (6-item array containing all review info) - """ - reviews = [] - - try: - if not isinstance(data, list) or len(data) < 3: - log.debug("Response doesn't match expected structure (not a list or too short)") - return reviews - - # data[2] contains the review groups - review_groups = data[2] - if not isinstance(review_groups, list): - log.debug("data[2] is not a list") - return reviews - - log.debug(f"Found {len(review_groups)} reviews in data[2]") - - # Each group IS ONE REVIEW - for group_idx, group in enumerate(review_groups): - if not isinstance(group, list) or len(group) == 0: - continue - - # group[0] is the review data array (6 items) - review_data = group[0] - if not isinstance(review_data, list): - continue - - try: - review = self._parse_google_review_array(review_data) - if review: - reviews.append(review) - log.debug(f"Parsed review {group_idx}: {review.author} - {review.rating}★") - except Exception as e: - log.debug(f"Error parsing review at group[{group_idx}]: {e}") - - except Exception as e: - log.debug(f"Error in 
def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
    """
    Build an InterceptedReview from one positional review array.

    Positions read from ``review_data``:
        [0]              review ID (string)
        [1][4][5][0]     author name
        [1][4][5][1]     profile picture (if available)
        [1][6]           date text
        [2][0][0]        rating (1-5)
        [2][15][0][0]    review text (original)

    Returns the review when a rating plus an author or a text was found,
    otherwise None. Any error while reading the array also yields None.
    """
    review = InterceptedReview()

    def at(*path):
        # Walk review_data along `path`; None when a level is missing.
        # Every level below the top one must be a list.
        node = review_data
        for level, index in enumerate(path):
            if level > 0 and not isinstance(node, list):
                return None
            if len(node) <= index:
                return None
            node = node[index]
        return node

    try:
        # Review ID
        candidate_id = at(0)
        if isinstance(candidate_id, str):
            review.review_id = candidate_id

        # Author name and profile picture
        author_info = at(1, 4, 5)
        if isinstance(author_info, list):
            if len(author_info) > 0 and isinstance(author_info[0], str):
                review.author = author_info[0]
            if len(author_info) > 1 and isinstance(author_info[1], str):
                review.avatar_url = author_info[1]

        # Date text
        date_value = at(1, 6)
        if isinstance(date_value, str):
            review.date_text = date_value

        # Rating
        rating_val = at(2, 0, 0)
        if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
            review.rating = float(rating_val)

        # Review text (original language)
        text = at(2, 15, 0, 0)
        if isinstance(text, str):
            review.text = text

        # Only return if we have minimum required data
        has_content = bool(review.author or review.text)
        if review.rating > 0 and has_content:
            return review

    except Exception as e:
        log.debug(f"Error parsing Google review array: {e}")

    return None
def _try_parse_review_dict(self, data: Dict) -> Optional[InterceptedReview]:
    """Try to parse a dictionary as a review object.

    The dict is only considered when it has at least one of the keys
    ``reviewId``, ``review_id``, ``author``, ``rating``, ``text`` or
    ``comment``. Several alternative key names are tried for every field.

    Args:
        data: candidate dictionary taken from a decoded API response.

    Returns:
        An InterceptedReview when a review ID, or both an author and a
        text, could be extracted; otherwise None (also on any parse error).
    """
    # Common keys in review objects
    review_keys = {'reviewId', 'review_id', 'author', 'rating', 'text', 'comment'}

    if not any(k in data for k in review_keys):
        return None

    try:
        review = InterceptedReview()

        # 'author' may be a plain name or a nested object with profile data.
        raw_author = data.get('author')
        author_data = raw_author if isinstance(raw_author, dict) else {}
        if isinstance(raw_author, str):
            author_name = raw_author
        else:
            # Never store the nested object itself in the string field.
            # NOTE(review): assumes the nested object keeps the name under
            # 'name' - confirm against real payloads.
            author_name = author_data.get('name')

        # Try various key names for each field
        review.review_id = data.get('reviewId') or data.get('review_id') or data.get('id', '')
        review.author = author_name or data.get('authorName') or data.get('name', '')
        review.rating = float(data.get('rating') or data.get('starRating') or 0)
        review.text = data.get('text') or data.get('comment') or data.get('reviewText', '')
        review.date_text = data.get('publishTime') or data.get('relativePublishTime') or data.get('date', '')
        review.likes = int(data.get('thumbsUpCount') or data.get('likes') or 0)

        # Photos: entries may be plain URL strings or objects with a 'url' key.
        photos = data.get('photos') or data.get('reviewPhotos') or []
        if isinstance(photos, str):
            photos = [photos]  # a single URL, not a sequence of characters
        photo_urls = []
        for photo in photos:
            url = photo.get('url') if isinstance(photo, dict) else photo
            if isinstance(url, str) and url:
                photo_urls.append(url)
        if photo_urls:
            review.photos = photo_urls

        # Profile
        review.profile_url = author_data.get('profileUrl') or data.get('profileUrl', '')
        review.avatar_url = author_data.get('profilePhotoUrl') or data.get('avatar', '')

        # Owner response
        owner_resp = data.get('ownerResponse') or data.get('ownerReply') or {}
        if isinstance(owner_resp, dict):
            review.owner_response = owner_resp.get('text', '')
            review.owner_response_date = owner_resp.get('publishTime', '')

        # Only return if we have meaningful data
        if review.review_id or (review.author and review.text):
            return review

    except Exception as e:
        log.debug(f"Error parsing review dict: {e}")

    return None
- - review = InterceptedReview() - - # Check if first element looks like a review ID - if isinstance(data[0], str) and len(data[0]) > 20: - review.review_id = data[0] - - # Search for rating (usually a small number 1-5) - for item in data: - if isinstance(item, (int, float)) and 1 <= item <= 5: - review.rating = float(item) - break - elif isinstance(item, list) and len(item) >= 1: - if isinstance(item[0], (int, float)) and 1 <= item[0] <= 5: - review.rating = float(item[0]) - break - - # Search for text (long string) - for item in data: - if isinstance(item, str) and len(item) > 30: - review.text = item - break - elif isinstance(item, list): - for subitem in item: - if isinstance(subitem, str) and len(subitem) > 30: - review.text = subitem - break - - # Search for author name (shorter string) - for item in data: - if isinstance(item, list) and len(item) >= 1: - for subitem in item: - if isinstance(subitem, str) and 2 <= len(subitem) <= 100 and subitem != review.text: - review.author = subitem - break - if review.author: - break - - # Search for URLs (photos, profile) - for item in data: - if isinstance(item, str) and item.startswith('http'): - if 'googleusercontent' in item or 'ggpht' in item: - if not review.avatar_url: - review.avatar_url = item - else: - review.photos.append(item) - elif isinstance(item, list): - self._extract_urls_from_array(item, review) - - # Only return if we have meaningful data - if review.review_id and review.rating > 0: - return review - if review.text and review.rating > 0: - return review - - except Exception as e: - log.debug(f"Error parsing review array: {e}") - - return None - - def _extract_urls_from_array(self, arr: List, review: InterceptedReview, depth: int = 0): - """Extract URLs from nested arrays""" - if depth > 5: - return - - for item in arr: - if isinstance(item, str) and item.startswith('http'): - if 'googleusercontent' in item or 'ggpht' in item or 'lh3' in item: - if 'w72-h72' in item or 'p-rp-mo' in item: # Profile pic 
pattern - review.avatar_url = item - else: - review.photos.append(item) - elif isinstance(item, list): - self._extract_urls_from_array(item, depth + 1, review) - - def convert_to_raw_review_format(self, intercepted: InterceptedReview) -> Dict[str, Any]: - """Convert an InterceptedReview to the format used by RawReview/storage""" - return { - 'review_id': intercepted.review_id, - 'author': intercepted.author, - 'rating': intercepted.rating, - 'description': {'en': intercepted.text} if intercepted.text else {}, - 'likes': intercepted.likes, - 'user_images': intercepted.photos, - 'author_profile_url': intercepted.profile_url, - 'profile_picture': intercepted.avatar_url, - 'owner_responses': { - 'en': {'text': intercepted.owner_response} - } if intercepted.owner_response else {}, - 'review_date': intercepted.date_text, - '_source': 'api_intercept' - } - - def cleanup(self): - """Clean up interception resources""" - try: - self.driver.execute_cdp_cmd('Network.disable', {}) - except: - pass - - self.captured_responses.clear() - self.captured_reviews.clear() - self.request_map.clear() - self._listening = False diff --git a/modules/chrome_pool.py b/modules/chrome_pool.py index 0d986f2..d7e60b6 100644 --- a/modules/chrome_pool.py +++ b/modules/chrome_pool.py @@ -35,16 +35,45 @@ class ChromeWorker: # SeleniumBase Driver automatically includes UC mode anti-detection # Initialize with longer timeouts for large scraping jobs + # Chrome arguments for Docker stability + chrome_args = [ + "--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker) + "--disable-gpu", # Disable GPU acceleration + "--no-sandbox", # Required for Docker + "--disable-software-rasterizer", + "--disable-extensions", + "--disable-background-networking", + "--disable-default-apps", + "--disable-sync", + "--metrics-recording-only", + "--mute-audio", + "--no-first-run", + "--safebrowsing-disable-auto-update", + ] + self.driver = Driver( uc=True, headless=self.headless, - 
page_load_strategy="normal" + page_load_strategy="normal", + chromium_arg=",".join(chrome_args) ) # Set generous timeouts for large scraping jobs self.driver.set_page_load_timeout(120) # 2 minutes for slow networks self.driver.set_script_timeout(60) # 1 minute for complex extraction + # Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results + # This prevents location-based variations in search results + try: + self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { + 'latitude': 42.3601, + 'longitude': -71.0589, + 'accuracy': 100 + }) + log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)") + except Exception as e: + log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}") + self.driver.maximize_window() self.created_at = time.time() self.last_used = time.time() diff --git a/modules/cli.py b/modules/cli.py deleted file mode 100644 index d05c480..0000000 --- a/modules/cli.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Command line interface handling for Google Maps Reviews Scraper. 
def _str_to_bool(value):
    """Convert a command-line string such as "true"/"no"/"1" to a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string stays False, as it was under the former bool() coercion.
    if text in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_arguments():
    """Parse command line arguments.

    Boolean options that take a value (``--use-mongodb``, ``--convert-dates``,
    ``--download-images``, ``--store-local-paths``, ``--replace-urls``,
    ``--preserve-original-urls``) accept true/false, yes/no, on/off and 1/0.
    Using ``type=bool`` for them would turn every non-empty string,
    including "false", into True.

    Returns:
        argparse.Namespace: the parsed options. ``config`` is always a Path
        (``DEFAULT_CONFIG_PATH`` when not given) and ``custom_params`` is the
        decoded JSON value, or None when it was missing or not valid JSON.
    """
    ap = argparse.ArgumentParser(description="Google‑Maps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first already‑seen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    ap.add_argument("--use-mongodb", type=_str_to_bool, default=None,
                    help="whether to use MongoDB for storage")

    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=_str_to_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=_str_to_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")

    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=_str_to_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=_str_to_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=_str_to_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")

    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")

    # API interception option
    ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
                    help="enable API response interception for faster data capture (experimental)")

    args = ap.parse_args()

    # Handle config path
    if args.config is not None:
        args.config = Path(args.config)
    else:
        args.config = DEFAULT_CONFIG_PATH

    # Process custom params if provided
    if args.custom_params:
        try:
            args.custom_params = json.loads(args.custom_params)
        except json.JSONDecodeError:
            # Best effort: a malformed value is reported and ignored.
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            args.custom_params = None

    return args
WHERE job_id = $1 """, job_id) @@ -246,8 +254,13 @@ class DatabaseManager: kwargs['completed_at'] = datetime.now() for key, value in kwargs.items(): - set_clauses.append(f"{key} = ${param_idx}") - params.append(value) + # Handle JSONB fields specially + if key == 'scrape_logs' and value is not None: + set_clauses.append(f"{key} = ${param_idx}::jsonb") + params.append(json.dumps(value) if not isinstance(value, str) else value) + else: + set_clauses.append(f"{key} = ${param_idx}") + params.append(value) param_idx += 1 query = f""" @@ -264,7 +277,8 @@ class DatabaseManager: job_id: UUID, reviews: List[Dict[str, Any]], scrape_time: float, - total_reviews: Optional[int] = None + total_reviews: Optional[int] = None, + scrape_logs: Optional[List[Dict[str, Any]]] = None ): """ Save scraping results to database. @@ -274,6 +288,7 @@ class DatabaseManager: reviews: List of review dictionaries scrape_time: Time taken to scrape in seconds total_reviews: Total reviews available (from page counter) + scrape_logs: List of log entries from the scraper """ async with self.pool.acquire() as conn: await conn.execute(""" @@ -284,9 +299,11 @@ class DatabaseManager: reviews_count = $2, total_reviews = $3, reviews_data = $4::jsonb, - scrape_time = $5 + scrape_time = $5, + scrape_logs = $6::jsonb WHERE job_id = $1 - """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time) + """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time, + json.dumps(scrape_logs) if scrape_logs else None) log.info(f"Saved {len(reviews)} reviews for job {job_id}") @@ -317,8 +334,10 @@ class DatabaseManager: created_at, completed_at, reviews_count, + total_reviews, scrape_time, - error_message + error_message, + metadata FROM jobs WHERE status = $1 ORDER BY created_at DESC @@ -333,8 +352,10 @@ class DatabaseManager: created_at, completed_at, reviews_count, + total_reviews, scrape_time, - error_message + error_message, + metadata FROM jobs ORDER BY created_at DESC LIMIT $1 OFFSET 
$2 diff --git a/modules/fast_scraper.py b/modules/fast_scraper.py index cdb7cb4..cb84533 100644 --- a/modules/fast_scraper.py +++ b/modules/fast_scraper.py @@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ else: log.info(f"[PROFILE] Using pooled driver (0.00s)") - # Force English locale for consistent parsing + # Force English locale AND US region for consistent parsing/results + # This helps avoid geolocation-based variations in Google Maps results if 'hl=' in url: url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') else: separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" + # Add US region parameter if not present + if 'gl=' not in url: + url = f"{url}&gl=us" + + # Set Chrome geolocation to US (Boston, MA) using CDP + # This ensures Google Maps shows US results regardless of server location + try: + driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { + 'latitude': 42.3601, + 'longitude': -71.0589, + 'accuracy': 100 + }) + log.info("Set geolocation to US (Boston, MA)") + except Exception as e: + log.warning(f"Could not set geolocation: {e}") + log.info(f"Loading Google Maps page...") t0 = timing_module.time() driver.get(url) @@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') for btn in form_btns: btn_text = (btn.text or '').lower() - if 'aceptar todo' in btn_text or 'accept all' in btn_text: + if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text: log.info(f"Clicking GDPR consent: {btn.text}") btn.click() - time.sleep(1) # Reduced from 2s + time.sleep(1) break else: if len(form_btns) >= 2: log.info("Using fallback: clicking second form button") form_btns[1].click() - time.sleep(1) # Reduced from 2s + time.sleep(1) except Exception as e: log.warning(f"GDPR consent handling failed: {e}") + + # After 
GDPR consent, reload the original URL to ensure proper page state + log.info(f"Reloading original URL after GDPR consent...") + driver.get(url) + time.sleep(1) log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s") else: log.info(f"[PROFILE] No GDPR consent page (0.00s)") @@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ try: log.info("Waiting for Google Maps content to load...") wait = WebDriverWait(driver, 10) + # Wait for basic page structure (h1 or heading) wait.until( - lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]') + lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]') ) - log.info("Google Maps content loaded successfully") + log.info("Basic page structure loaded") + + # Wait for page to settle - search URLs redirect to place URLs + # which triggers additional content loading + time.sleep(2) + + # Wait specifically for review count element (aria-label ending with "reviews") + # This is the most reliable indicator that the business detail is loaded + try: + WebDriverWait(driver, 5).until( + lambda d: d.execute_script(""" + var elems = document.querySelectorAll('[aria-label]'); + for (var i = 0; i < elems.length; i++) { + var label = elems[i].getAttribute('aria-label') || ''; + if (/^[0-9]+ reviews?$/.test(label)) return true; + } + return false; + """) + ) + log.info("Review count element loaded") + except: + # Fallback: Try clicking Reviews tab or rating stars to expose the review count + log.info("Review count wait timeout, trying to click Reviews/rating...") + try: + # Try 1: Click Reviews tab (if exists) + clicked = driver.execute_script(""" + var tabs = document.querySelectorAll('[role="tab"]'); + for (var i = 0; i < tabs.length; i++) { + var txt = (tabs[i].textContent || '').toLowerCase(); + if (txt.includes('review')) { + tabs[i].click(); + return 'tab'; + } + } + // Try 2: Click the rating stars element (often 
links to reviews) + var stars = document.querySelector('[role="img"][aria-label*="star"]'); + if (stars) { + var parent = stars.parentElement; + if (parent && parent.tagName.toLowerCase() === 'button') { + parent.click(); + return 'stars_button'; + } + stars.click(); + return 'stars'; + } + // Try 3: Click "Write a review" or any review-related button + var btns = document.querySelectorAll('button[aria-label*="review" i]'); + for (var b = 0; b < btns.length; b++) { + var label = btns[b].getAttribute('aria-label') || ''; + if (!/write/i.test(label) && /review/i.test(label)) { + btns[b].click(); + return 'review_btn: ' + label; + } + } + return 'none'; + """) + log.info(f"Clicked: {clicked}") + time.sleep(2) # Wait for reviews panel to load + except Exception as e: + log.warning(f"Click attempt failed: {e}") + except Exception as e: log.warning(f"Timeout waiting for Maps content: {e}") - time.sleep(0.5) # Minimal fallback wait + time.sleep(2) # Fallback wait log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s") + log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...") + log.info(f"DEBUG: Page title: {driver.title}") # Extract business card information using JavaScript t0 = timing_module.time() @@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ total_reviews: null }; - // Extract business name - const nameSelectors = [ - 'h1.DUwDvf', - '[role="main"] h1', - 'h1.fontHeadlineLarge' - ]; + // ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============ - for (const selector of nameSelectors) { - const elem = document.querySelector(selector); - if (elem && elem.textContent) { - info.name = elem.textContent.trim(); - break; - } - } + // Helper: Parse review count from text, handling multiple formats + function parseReviewCount(text) { + if (!text) return null; - // Extract address - const addressSelectors = [ - 'button[data-item-id*="address"]', - 
'[data-item-id*="address"]', - 'div[aria-label*="Address"]' - ]; - - for (const selector of addressSelectors) { - const elem = document.querySelector(selector); - if (elem && elem.textContent) { - info.address = elem.textContent.trim(); - break; - } - } - - // Extract rating (look for aria-label like "4.2 stars") - const ratingElem = document.querySelector('[role="img"][aria-label*="star"]'); - if (ratingElem) { - const ariaLabel = ratingElem.getAttribute('aria-label'); - const match = ariaLabel.match(/([0-9.]+)/); + // Pattern 1: Exact "N reviews" format (aria-labels, clean text) + // Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis" + var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i); if (match) { - info.rating = parseFloat(match[1]); + return parseInt(match[1].replace(/[,. ]/g, '')); } - } - // Extract total review count - const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/; - const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i; + // Pattern 2: "(N)" format often used in tabs like "Reviews (27)" + match = text.match(/[(]([0-9][0-9,.]*)[)]$/); + if (match) { + return parseInt(match[1].replace(/[,. 
]/g, '')); + } - // PRIORITY 1: Look for review count in search results sidebar/panel - // This is where "152 reviews" appears on search results - const searchPanelSelectors = [ - 'a[href*="reviews"]', // Link with "reviews" in href - 'button[jsaction*="reviews"]', // Button related to reviews - 'div[role="link"]', // Clickable divs that might contain review info - ]; - - for (const selector of searchPanelSelectors) { - const elements = document.querySelectorAll(selector); - for (let elem of elements) { - const text = elem.textContent || ''; - const match = text.match(numberPattern); + // Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives) + if (text.length < 30) { + match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i); if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - if (num > 0 && num < 1000000) { - info.total_reviews = num; - break; - } + return parseInt(match[1].replace(/[,. ]/g, '')); } } - if (info.total_reviews) break; + + return null; } - // PRIORITY 2: Look in any span/div that contains the word "review" + // ============ EXTRACT BUSINESS NAME ============ + // Priority: h1 (semantic), then role="heading" + const h1 = document.querySelector('h1'); + if (h1 && h1.textContent) { + info.name = h1.textContent.trim(); + } + if (!info.name) { + const heading = document.querySelector('[role="heading"][aria-level="1"]'); + if (heading && heading.textContent) { + info.name = heading.textContent.trim(); + } + } + + // ============ EXTRACT ADDRESS ============ + // Priority: data-item-id (semantic), then aria-label containing "address" + const addressElem = document.querySelector('[data-item-id*="address"]'); + if (addressElem && addressElem.textContent) { + info.address = addressElem.textContent.trim(); + } + if (!info.address) { + const ariaAddress = document.querySelector('[aria-label*="ddress"]'); + if (ariaAddress && ariaAddress.textContent) { + info.address = 
ariaAddress.textContent.trim(); + } + } + + // ============ EXTRACT RATING ============ + // Priority: aria-label containing "star" on role="img" elements + info._debug_rating_context = []; + const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]'); + for (let elem of ratingElems) { + const ariaLabel = elem.getAttribute('aria-label') || ''; + // Match "4.9 stars" or "4,9 stars" (European format) + const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i); + if (match) { + info.rating = parseFloat(match[1].replace(',', '.')); + // DEBUG: Capture parent/sibling context to find review count + var parent = elem.parentElement; + if (parent) { + info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100)); + var grandparent = parent.parentElement; + if (grandparent) { + info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100)); + // Check all children of grandparent for review count + var gpChildren = grandparent.querySelectorAll('*'); + for (var c = 0; c < Math.min(gpChildren.length, 30); c++) { + var childText = (gpChildren[c].textContent || '').trim(); + if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) { + info._debug_rating_context.push('GP_CHILD: ' + childText); + } + } + // Also check great-grandparent + var ggp = grandparent.parentElement; + if (ggp) { + info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150)); + } + } + // Check siblings + var nextSib = parent.nextElementSibling; + if (nextSib) { + info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100)); + } + } + break; + } + } + + // ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============ + + // PRIORITY 1: aria-label with exact "N reviews" format (most reliable) + // Google Maps uses aria-label="27 reviews" for accessibility + info._debug_aria = []; + 
info._debug_all_numeric = []; if (!info.total_reviews) { - const allElements = document.querySelectorAll('span, div, a'); - for (let elem of allElements) { - const text = elem.textContent || ''; - if (text.length < 100) { // Skip very long text blocks - const match = text.match(numberPattern); + var ariaElems = document.querySelectorAll('[aria-label]'); + for (var i = 0; i < ariaElems.length; i++) { + var ariaLabel = ariaElems[i].getAttribute('aria-label') || ''; + // Collect all labels containing "review" + if (ariaLabel.toLowerCase().indexOf('review') >= 0) { + info._debug_aria.push(ariaLabel); + } + // Collect all labels starting with a digit + if (/^[0-9]/.test(ariaLabel)) { + info._debug_all_numeric.push(ariaLabel); + } + var count = parseReviewCount(ariaLabel); + if (count && count > 0 && count < 100000) { + info.total_reviews = count; + info._debug_matched = ariaLabel; + break; + } + } + } + + // DEBUG: Find all text with parenthetical numbers like "(27)" + info._debug_parens = []; + info._debug_short_text = []; // All short text with numbers + var allSpans = document.querySelectorAll('span, div, a, button'); + for (var j = 0; j < Math.min(allSpans.length, 500); j++) { + var spanText = allSpans[j].textContent || ''; + // Capture parenthetical numbers + if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) { + info._debug_parens.push(spanText.trim()); + } + // Capture ALL short text containing numbers (for debugging) + if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) { + var cleaned = spanText.trim().replace(/\\s+/g, ' '); + if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) { + info._debug_short_text.push(cleaned); + } + } + } + + // PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page + // This catches formats like "27 reviews", "reviews: 27", etc. 
that aren't in aria-labels + if (!info.total_reviews) { + var allElems = document.querySelectorAll('*'); + for (var k = 0; k < Math.min(allElems.length, 1000); k++) { + var elem = allElems[k]; + // Skip if has children (we want leaf nodes only) + if (elem.children.length > 0) continue; + var txt = (elem.textContent || '').trim(); + // Look for short text with both numbers and "review" word + if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) { + var match = txt.match(/([0-9][0-9,]*)/); if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - if (num > 0 && num < 1000000) { - info.total_reviews = num; + var count = parseInt(match[1].replace(/,/g, '')); + if (count > 0 && count < 100000) { + info.total_reviews = count; + info._debug_matched = 'LEAF: ' + txt; break; } } @@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ } } - // PRIORITY 3: Try tabs (for business detail pages) + // DEBUG: Collect all tab names + info._debug_tabs = []; + const tabs = document.querySelectorAll('[role="tab"]'); + for (let t = 0; t < tabs.length; t++) { + info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30)); + } + + // DEBUG: Collect all buttons with text (might contain review count) + info._debug_buttons = []; + const buttons = document.querySelectorAll('button'); + for (let b = 0; b < Math.min(buttons.length, 20); b++) { + var btnText = (buttons[b].textContent || '').trim(); + if (btnText && btnText.length < 40) { + info._debug_buttons.push(btnText.substring(0, 40)); + } + } + + // PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count) if (!info.total_reviews) { - const tabs = document.querySelectorAll('button[role="tab"]'); for (let tab of tabs) { - const text = tab.textContent || ''; - let match = text.match(reviewPattern); - if (match) { - info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - break; + const text = (tab.textContent || '').trim(); + // 
Look for "Reviews" tab with count + if (text.toLowerCase().includes('review')) { + const count = parseReviewCount(text); + if (count && count > 0) { + info.total_reviews = count; + info._debug_matched = 'TAB: ' + text; + break; + } } - match = text.match(numberPattern); - if (match) { - info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, '')); + } + } + + // PRIORITY 2.3: Reviews panel header (after clicking Reviews tab) + // Google Maps shows "27 reviews" as heading text in the reviews panel + if (!info.total_reviews) { + // Look for headings containing review count + var headings = document.querySelectorAll('h1, h2, [role="heading"]'); + for (var h = 0; h < headings.length; h++) { + var hText = (headings[h].textContent || '').trim(); + if (/review/i.test(hText)) { + var match = hText.match(/([0-9][0-9,]*)/); + if (match) { + var count = parseInt(match[1].replace(/,/g, '')); + if (count > 0 && count < 100000) { + info.total_reviews = count; + info._debug_matched = 'HEADING: ' + hText; + break; + } + } + } + } + } + + // PRIORITY 2.4: Look for sort button area which often has total count + // The sort dropdown area displays "Sort: Newest" and total reviews + if (!info.total_reviews) { + var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]'); + for (var s = 0; s < sortBtns.length; s++) { + var parent = sortBtns[s].parentElement; + if (parent) { + var pText = (parent.textContent || '').trim(); + if (/review/i.test(pText)) { + var match = pText.match(/([0-9][0-9,]*)\\s*review/i); + if (match) { + var count = parseInt(match[1].replace(/,/g, '')); + if (count > 0 && count < 100000) { + info.total_reviews = count; + info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50); + break; + } + } + } + } + } + } + + // PRIORITY 3: Elements with semantic review-related attributes + if (!info.total_reviews) { + const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]'); + for (let elem of 
reviewLinks) { + const text = (elem.textContent || '').trim(); + const count = parseReviewCount(text); + if (count && count > 0) { + info.total_reviews = count; break; } } } - // PRIORITY 4: Try aria-labels + // PRIORITY 4: Look for standalone review count text near rating + // Find elements that contain ONLY "N reviews" pattern (not concatenated with rating) if (!info.total_reviews) { - const elements = document.querySelectorAll('[aria-label]'); - for (let elem of elements) { - const ariaLabel = elem.getAttribute('aria-label') || ''; - let match = ariaLabel.match(reviewPattern); - if (match) { - info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, '')); + const allElements = document.querySelectorAll('span, a'); + for (let elem of allElements) { + // Get direct text content only (not nested children) + const text = (elem.textContent || '').trim(); + // Skip if too long (likely contains other content) + if (text.length > 50) continue; + // Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews") + if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue; + + const count = parseReviewCount(text); + if (count && count > 0 && count < 100000) { + info.total_reviews = count; break; } - match = ariaLabel.match(numberPattern); - if (match) { - info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - break; + } + } + + // PRIORITY 5: Parse from visible page text using regex on short text blocks + if (!info.total_reviews) { + const walker = document.createTreeWalker( + document.body, + NodeFilter.SHOW_TEXT, + null, + false + ); + while (walker.nextNode()) { + const text = walker.currentNode.textContent.trim(); + if (text.length >= 5 && text.length <= 30) { + // Match "27 reviews" but not "4.927 reviews" + const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i); + if (match) { + const count = parseInt(match[1].replace(/[,]/g, '')); + if (count > 0 && count < 100000) { + info.total_reviews = count; + info._debug_matched = 
'WALKER: ' + text; + break; + } + } + } + } + } + + // PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts) + if (!info.total_reviews) { + var scripts = document.querySelectorAll('script'); + for (var sc = 0; sc < scripts.length; sc++) { + var scriptText = scripts[sc].textContent || ''; + // Look for patterns like "user_reviews":{"count":27} or reviews_count":27 + var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i); + if (jsonMatch) { + var count = parseInt(jsonMatch[1]); + if (count > 0 && count < 100000) { + info.total_reviews = count; + info._debug_matched = 'JSON_SCRIPT'; + break; + } + } + // Also look for review count in Google's data format like [\"27 reviews\"] + if (!info.total_reviews) { + var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i); + if (dataMatch) { + var count = parseInt(dataMatch[1]); + if (count > 0 && count < 100000) { + info.total_reviews = count; + info._debug_matched = 'JSON_DATA: ' + dataMatch[0]; + break; + } + } } } } @@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***") log.info(f"Business card extracted: name={business_info.get('name')}, " f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}") + # Debug: log what aria-labels were found + if business_info.get('_debug_aria'): + log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}") + if business_info.get('_debug_matched'): + log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}") + # Also log all numeric aria-labels (potential review counts) + if business_info.get('_debug_all_numeric'): + log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}") + # Log any text with parenthetical numbers like "(27)" + if 
business_info.get('_debug_parens'): + log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}") + # Log all short text containing numbers (for debugging review count detection) + if business_info.get('_debug_short_text'): + log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}") + # Log the context around the rating element + if business_info.get('_debug_rating_context'): + for ctx in business_info.get('_debug_rating_context', []): + log.info(f"DEBUG: Rating context: {ctx}") + # Log what tabs exist on the page + if business_info.get('_debug_tabs'): + log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}") + else: + log.info(f"DEBUG: No tabs found on page") + # Log buttons (might contain review count) + if business_info.get('_debug_buttons'): + log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}") result = { "name": business_info.get('name'), diff --git a/modules/job_manager.py b/modules/job_manager.py deleted file mode 100644 index 19e4bfc..0000000 --- a/modules/job_manager.py +++ /dev/null @@ -1,407 +0,0 @@ -""" -Background job manager for Google Reviews Scraper. 
-""" - -import asyncio -import logging -import threading -import time -import uuid -from concurrent.futures import ThreadPoolExecutor -from datetime import datetime -from enum import Enum -from typing import Dict, Any, Optional, List -from dataclasses import dataclass, asdict - -from modules.config import load_config -from modules.scraper import GoogleReviewsScraper -from modules.scraper_clean import fast_scrape_reviews # Updated to use clean scraper with hard refresh recovery -from modules.chrome_pool import get_scraping_worker, release_scraping_worker - -log = logging.getLogger("scraper") - - -class JobStatus(str, Enum): - """Job status enumeration""" - PENDING = "pending" - RUNNING = "running" - COMPLETED = "completed" - FAILED = "failed" - CANCELLED = "cancelled" - - -@dataclass -class ScrapingJob: - """Scraping job data class""" - job_id: str - status: JobStatus - url: str - config: Dict[str, Any] - created_at: datetime - started_at: Optional[datetime] = None - completed_at: Optional[datetime] = None - updated_at: Optional[datetime] = None # Last update time (for progress tracking) - error_message: Optional[str] = None - reviews_count: Optional[int] = None - total_reviews: Optional[int] = None # Total reviews available (from page counter) - images_count: Optional[int] = None - progress: Dict[str, Any] = None - reviews_data: Optional[List[Dict[str, Any]]] = None # Store actual review data - scrape_time: Optional[float] = None # Time taken to scrape - - def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]: - """ - Convert job to dictionary for JSON serialization - - Args: - include_reviews: Whether to include the full reviews data (default: False) - """ - data = asdict(self) - # Convert datetime objects to ISO strings - for field in ['created_at', 'started_at', 'completed_at']: - if data[field]: - data[field] = data[field].isoformat() - - # Exclude reviews_data by default (can be large) - if not include_reviews: - data.pop('reviews_data', None) - - 
return data - - -class JobManager: - """Manager for background scraping jobs""" - - def __init__(self, max_concurrent_jobs: int = 3): - """Initialize job manager""" - self.max_concurrent_jobs = max_concurrent_jobs - self.jobs: Dict[str, ScrapingJob] = {} - self.executor = ThreadPoolExecutor(max_workers=max_concurrent_jobs) - self.lock = threading.Lock() - - def create_job(self, url: str, config_overrides: Dict[str, Any] = None) -> str: - """ - Create a new scraping job. - - Args: - url: Google Maps URL to scrape - config_overrides: Optional config overrides - - Returns: - Job ID - """ - job_id = str(uuid.uuid4()) - - # Load base config - config = load_config() - - # Apply URL - config["url"] = url - - # Apply any overrides - if config_overrides: - config.update(config_overrides) - - job = ScrapingJob( - job_id=job_id, - status=JobStatus.PENDING, - url=url, - config=config, - created_at=datetime.now(), - progress={"stage": "created", "message": "Job created and queued"} - ) - - with self.lock: - self.jobs[job_id] = job - - log.info(f"Created scraping job {job_id} for URL: {url}") - return job_id - - def start_job(self, job_id: str) -> bool: - """ - Start a pending job. 
def _run_scraping_job(self, job_id: str):
    """
    Run one scraping job on a pooled Chrome worker (executed in a pool thread).

    The job is looked up by ID, a worker is borrowed from the scraping pool,
    ``fast_scrape_reviews`` is run with a progress callback, and the outcome is
    written back onto the job record under ``self.lock``.

    A job that was cancelled or deleted while queued is skipped, and a job that
    is cancelled while scraping keeps its CANCELLED state: results and progress
    updates are only written while the job is still RUNNING.

    Args:
        job_id: Job ID to run
    """

    def progress_callback(current_count: int, total_count: int):
        """Publish scraping progress onto the job record."""
        with self.lock:
            job = self.jobs.get(job_id)
            # Late callbacks for a deleted or cancelled job must not
            # overwrite its final state.
            if job is None or job.status != JobStatus.RUNNING:
                return
            job.reviews_count = current_count
            job.total_reviews = total_count
            job.updated_at = datetime.now()  # Update last update time
            # Calculate percentage for better UX
            percentage = int(current_count / total_count * 100) if total_count > 0 else 0
            job.progress = {
                "stage": "scraping",
                "message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
                "percentage": percentage
            }

    worker = None
    try:
        with self.lock:
            job = self.jobs.get(job_id)
            if job is None or job.status != JobStatus.RUNNING:
                # Cancelled or deleted between start_job() and this thread running.
                log.info(f"Job {job_id}: no longer runnable (cancelled or deleted), skipping")
                return
            job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}
            # Snapshot the configuration while holding the lock.
            url = job.config.get('url')
            headless = job.config.get('headless', True)  # Default to headless
            # Effectively unlimited - relies on idle detection
            max_scrolls = job.config.get('max_scrolls', 999999)

        # Get a worker from the scraping pool
        worker = get_scraping_worker(timeout=30)
        if not worker:
            raise Exception("No Chrome workers available. Pool may be at capacity.")

        log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")

        with self.lock:
            if job.status == JobStatus.RUNNING:
                job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}

        # Run the FAST scraping with progress callback using pooled worker.
        # NOTE(review): no cancellation hook is visible here, so a job cancelled
        # mid-scrape still runs to completion; only its result is discarded.
        result = fast_scrape_reviews(
            url=url,
            headless=headless,
            max_scrolls=max_scrolls,
            progress_callback=progress_callback,
            driver=worker.driver,  # Use worker's driver
            return_driver=True     # Don't close the driver
        )

        # The driver belongs to the pool; never store it on the job.
        result.pop('driver', None)

        # Mark job as completed or failed
        with self.lock:
            finished_at = datetime.now()
            if job.status != JobStatus.RUNNING:
                log.info(f"Job {job_id}: finished after being cancelled; result discarded")
            elif result['success']:
                job.status = JobStatus.COMPLETED
                job.completed_at = finished_at
                job.updated_at = finished_at
                job.reviews_count = result['count']
                job.total_reviews = result.get('total_reviews')  # Total review count from page
                job.reviews_data = result['reviews']  # Store the actual reviews
                job.scrape_time = result['time']
                job.progress = {
                    "stage": "completed",
                    "message": f"Scraping completed successfully in {result['time']:.1f}s",
                    "scroll_time": result.get('scroll_time'),
                    "extract_time": result.get('extract_time')
                }
                log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
            else:
                job.status = JobStatus.FAILED
                job.completed_at = finished_at
                job.updated_at = finished_at
                job.error_message = result.get('error', 'Unknown error')
                job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
                log.error(f"Failed scraping job {job_id}: {result.get('error')}")

    except Exception as e:
        # log.exception records the traceback through the logging system
        # instead of printing it straight to stderr.
        log.exception(f"Error in scraping job {job_id}: {e}")

        with self.lock:
            # .get(): the job may have been deleted while it was running, and a
            # KeyError here would skip the worker recycling below.
            job = self.jobs.get(job_id)
            if job is not None and job.status == JobStatus.RUNNING:
                failed_at = datetime.now()
                job.status = JobStatus.FAILED
                job.completed_at = failed_at
                job.updated_at = failed_at
                job.error_message = str(e)
                job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}

        # Recycle worker on error
        if worker:
            log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
            release_scraping_worker(worker, recycle=True)
            worker = None  # Mark as released

    finally:
        # Release worker back to pool if not already released
        if worker:
            log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
            release_scraping_worker(worker, recycle=False)
def cleanup_old_jobs(self, max_age_hours: int = 24):
    """
    Drop finished jobs whose completion time is older than the given age.

    Only jobs in a terminal state (completed, failed or cancelled) that have a
    ``completed_at`` timestamp are considered; everything else is left alone.

    Args:
        max_age_hours: Maximum age in hours before cleanup
    """
    terminal_states = (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED)
    oldest_allowed = datetime.now().timestamp() - (max_age_hours * 3600)

    with self.lock:
        # Collect first, delete afterwards: the dict must not change size
        # while it is being iterated.
        expired_ids = [
            jid
            for jid, candidate in self.jobs.items()
            if candidate.status in terminal_states
            and candidate.completed_at
            and candidate.completed_at.timestamp() < oldest_allowed
        ]
        for jid in expired_ids:
            del self.jobs[jid]

    removed = len(expired_ids)
    if removed:
        log.info(f"Cleaned up {removed} old jobs")
'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde' -CARD_SEL = "div.jftiEf" # Review card container -# Cookie/consent dialog selectors (Updated January 2026) -COOKIE_BTN = ('button[aria-label*="Accept" i],' - 'button[aria-label*="Aceptar" i],' - 'button[aria-label*="Akzeptieren" i],' - 'button[aria-label*="Aceitar" i],' - 'button[jsname="higCR"],' # Google's "Accept all" button - 'button[jsname="hZCF7e"],' - 'button[data-mdc-dialog-action="accept"],' - 'form[action*="consent"] button,' - 'div[role="dialog"] button[jsname],' - '.VfPpkd-LgbsSe[data-mdc-dialog-action="accept"]') -SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]' -MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]' - -SORT_OPTIONS = { - "newest": ( - "Newest", "החדשות ביותר", "ใหม่ที่สุด", "最新", "Más recientes", "最近", - "Mais recentes", "Neueste", "Plus récent", "Più recenti", "Nyeste", - "Новые", "Nieuwste", "جديد", "Nyeste", "Uusimmat", "Najnowsze", - "Senaste", "Terbaru", "Yakın zamanlı", "Mới nhất", "नवीनतम" - ), - "highest": ( - "Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด", "最高評価", - "Calificación más alta", "最高评分", "Melhor avaliação", "Höchste Bewertung", - "Note la plus élevée", "Valutazione più alta", "Høyeste vurdering", - "Наивысший рейтинг", "Hoogste waardering", "أعلى تقييم", "Højeste vurdering", - "Korkein arvostelu", "Najwyższa ocena", "Högsta betyg", "Peringkat tertinggi", - "En yüksek puan", "Đánh giá cao nhất", "उच्चतम रेटिंग", "Top rating" - ), - "lowest": ( - "Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด", "最低評価", - "Calificación más baja", "最低评分", "Pior avaliação", "Niedrigste Bewertung", - "Note la plus basse", "Valutazione più bassa", "Laveste vurdering", - "Наименьший рейтинг", "Laagste waardering", "أقل تقييم", "Laveste vurdering", - "Alhaisin arvostelu", "Najniższa ocena", "Lägsta betyg", "Peringkat terendah", - "En düşük puan", "Đánh giá thấp nhất", "निम्नतम रेटिंग", "Worst rating" - ), - "relevance": ( - 
"Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด", "関連性", - "Más relevantes", "最相关", "Mais relevantes", "Relevanteste", - "Plus pertinents", "Più pertinenti", "Mest relevante", - "Наиболее релевантные", "Meest relevant", "الأكثر صلة", "Mest relevante", - "Olennaisimmat", "Najbardziej trafne", "Mest relevanta", "Paling relevan", - "En alakalı", "Liên quan nhất", "सबसे प्रासंगिक", "Relevance" - ) -} - -# Comprehensive multi-language review keywords -REVIEW_WORDS = { - # English - "reviews", "review", "ratings", "rating", - - # Hebrew - "ביקורות", "ביקורת", "ביקורות על", "דירוגים", "דירוג", - - # Thai - "รีวิว", "บทวิจารณ์", "คะแนน", "ความคิดเห็น", - - # Spanish - "reseñas", "opiniones", "valoraciones", "críticas", "calificaciones", - - # French - "avis", "commentaires", "évaluations", "critiques", "notes", - - # German - "bewertungen", "rezensionen", "beurteilungen", "meinungen", "kritiken", - - # Italian - "recensioni", "valutazioni", "opinioni", "giudizi", "commenti", - - # Portuguese - "avaliações", "comentários", "opiniões", "análises", "críticas", - - # Russian - "отзывы", "рецензии", "обзоры", "оценки", "комментарии", - - # Japanese - "レビュー", "口コミ", "評価", "批評", "感想", - - # Korean - "리뷰", "평가", "후기", "댓글", "의견", - - # Chinese (Simplified and Traditional) - "评论", "評論", "点评", "點評", "评价", "評價", "意见", "意見", "回顾", "回顧", - - # Arabic - "مراجعات", "تقييمات", "آراء", "تعليقات", "نقد", - - # Hindi - "समीक्षा", "रिव्यू", "राय", "मूल्यांकन", "प्रतिक्रिया", - - # Turkish - "yorumlar", "değerlendirmeler", "incelemeler", "görüşler", "puanlar", - - # Dutch - "beoordelingen", "recensies", "meningen", "opmerkingen", "waarderingen", - - # Polish - "recenzje", "opinie", "oceny", "komentarze", "uwagi", - - # Vietnamese - "đánh giá", "nhận xét", "bình luận", "phản hồi", "bài đánh giá", - - # Indonesian - "ulasan", "tinjauan", "komentar", "penilaian", "pendapat", - - # Swedish - "recensioner", "betyg", "omdömen", "åsikter", "kommentarer", - - # Norwegian - "anmeldelser", 
def setup_driver(self, headless: bool):
    """
    Create and configure a SeleniumBase UC-mode Chrome driver.

    When the ``CHROME_BIN`` environment variable is set the process is treated
    as running in a container: the driver is created with that binary if the
    path exists (falling back to SeleniumBase defaults if that fails), or with
    defaults otherwise. Without ``CHROME_BIN`` an incognito driver is created
    and any creation error is logged and re-raised.

    Args:
        headless: Passed through to the SeleniumBase ``Driver`` constructor.

    Returns:
        The driver, with a 30s page-load timeout and a 1400x900 window.
    """
    # Log platform information for debugging
    log.info(f"Platform: {platform.platform()}")
    log.info(f"Python version: {platform.python_version()}")
    log.info("Using SeleniumBase UC Mode for enhanced anti-detection")

    # Options shared by every construction path below.
    shared_options = {
        "uc": True,
        "headless": headless,
        "page_load_strategy": "normal",
    }

    # A defined CHROME_BIN (even an empty one) marks a container environment.
    chrome_binary = os.environ.get('CHROME_BIN')

    if chrome_binary is None:
        # Regular OS environment - SeleniumBase handles version matching automatically
        log.info("Creating SeleniumBase UC Mode driver")
        try:
            # Incognito mode is only used outside containers.
            driver = Driver(incognito=True, **shared_options)
            log.info("Successfully created SeleniumBase UC driver")
        except Exception as e:
            log.error(f"Failed to create SeleniumBase driver: {e}")
            raise
    else:
        log.info("Container environment detected")
        log.info(f"Chrome binary: {chrome_binary}")

        if chrome_binary and os.path.exists(chrome_binary):
            try:
                driver = Driver(binary_location=chrome_binary, **shared_options)
                log.info("Successfully created SeleniumBase UC driver with custom binary")
            except Exception as e:
                log.warning(f"Failed to create driver with custom binary: {e}")
                # Fall back to default
                driver = Driver(**shared_options)
                log.info("Successfully created SeleniumBase UC driver with defaults")
        else:
            driver = Driver(**shared_options)
            log.info("Successfully created SeleniumBase UC driver")

    # Set page load timeout to avoid hanging
    driver.set_page_load_timeout(30)

    # Set window size
    driver.set_window_size(1400, 900)

    # Best-effort stealth tweaks: failure here is logged at debug level only.
    try:
        # Injected into every new document before page scripts run.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
                Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
            '''
        })
        log.info("Additional stealth settings applied")
    except Exception as e:
        log.debug(f"Could not apply additional stealth settings: {e}")

    log.info("SeleniumBase UC driver setup completed successfully")
    return driver
def is_reviews_tab(self, tab: WebElement) -> bool:
    """
    Heuristically decide whether ``tab`` is the "Reviews" tab.

    Checks run from cheapest to most expensive and stop at the first hit:
    the ``data-tab-index`` attribute, ``role="tab"`` plus a review keyword in
    the ARIA label, review keywords in the element's text/markup, link-like
    attributes mentioning reviews or ratings, review-related CSS class names,
    and finally the text of every descendant element.

    Args:
        tab: Candidate element.

    Returns:
        True if any check matches; False otherwise, including when the element
        has gone stale or any other error occurs while inspecting it.
    """

    def attr(name: str) -> str:
        # get_attribute() returns None for a missing attribute; normalise
        # BEFORE calling str methods so a missing attribute cannot raise.
        return tab.get_attribute(name) or ""

    def has_review_word(text: str) -> bool:
        return any(word in text for word in REVIEW_WORDS)

    try:
        # Strategy 1: Data attribute detection (language independent)
        if tab.get_attribute("data-tab-index") in ("1", "reviews"):
            return True

        # Strategy 2: Role and aria attributes (accessibility detection)
        aria_label = attr("aria-label").lower()
        if tab.get_attribute("role") == "tab" and has_review_word(aria_label):
            return True

        # Strategy 3: Text content detection. Sources are read lazily so a
        # hit on an early source avoids further WebDriver round-trips.
        # NOTE(review): innerHTML includes markup, so short keywords such as
        # "note" or "avis" can match attribute text - confirm this is acceptable.
        text_sources = (
            lambda: tab.text or "",       # Rendered text
            lambda: aria_label,           # ARIA label
            lambda: attr("innerHTML"),    # Inner HTML
            lambda: attr("textContent"),  # Raw text content
        )
        if any(has_review_word(read().lower()) for read in text_sources):
            return True

        # Strategy 4: URL-like attributes with tell-tale values
        for name in ("href", "data-href", "data-url", "data-target"):
            value = attr(name).lower()
            if "review" in value or "rating" in value:
                return True

        # Strategy 5: Class detection. Class text is lower-cased so the
        # lower-case needles below also match mixed-case class names.
        tab_class = attr("class").lower()
        review_classes = ("review", "reviews", "rating", "ratings", "comments", "feedback", "g4jrve")
        if any(cls in tab_class for cls in review_classes):
            return True

        # Strategy 6: Nested element detection. This costs two WebDriver
        # calls per descendant, so it runs last.
        try:
            descendants = tab.find_elements(By.CSS_SELECTOR, "*")
        except Exception as e:
            log.debug(f"Could not list descendants in is_reviews_tab: {e}")
            return False

        for child in descendants:
            try:
                child_text = (child.text or "").lower()
                child_content = (child.get_attribute("textContent") or "").lower()
                if has_review_word(child_text) or has_review_word(child_content):
                    return True
            except Exception:
                # A child can go stale independently; keep checking the rest.
                continue

        return False

    except StaleElementReferenceException:
        return False
    except Exception as e:
        log.debug(f"Error in is_reviews_tab: {e}")
        return False
element - if tag_name == 'div' and aria_hidden == 'true': - log.info(f"Found aria-hidden div with text: '{element_text}', looking for clickable parent") - # Try parent element - try: - parent = driver.execute_script("return arguments[0].parentElement;", element) - parent_tag = parent.tag_name.lower() if parent else '' - parent_role = parent.get_attribute('role') if parent else '' - - if parent and (parent_tag in ['button', 'a'] or 'tab' in parent_role or 'button' in parent_role): - log.info(f"Found clickable parent: {parent_tag} with role={parent_role}") - driver.execute_script("arguments[0].scrollIntoView({block:'center', behavior:'smooth'});", parent) - time.sleep(0.5) - driver.execute_script("arguments[0].click();", parent) - time.sleep(3) - - if self.verify_reviews_tab_clicked(driver): - log.info(f"✅ Successfully clicked Reviews via aria-hidden parent") - return True - except: - pass - - # Try clicking the element directly if it's clickable - elif tag_name in ['button', 'a'] or 'tab' in role or 'button' in role: - log.info(f"Found clickable Reviews element: '{element_text}' (tag: {tag_name}, role: {role})") - - driver.execute_script("arguments[0].scrollIntoView({block:'center', behavior:'smooth'});", element) - time.sleep(0.5) - driver.execute_script("arguments[0].click();", element) - time.sleep(3) - - if self.verify_reviews_tab_clicked(driver): - log.info(f"✅ Successfully clicked Reviews via text: '{element_text}'") - return True - except: - continue - except: - continue - - # FALLBACK METHOD: Find aria-hidden divs with exact text "Reviews" (or language variants) - log.info("Trying aria-hidden div detection as fallback...") - try: - # Look for divs with aria-hidden="true" that contain ONLY the review word (no extra text) - divs = driver.find_elements(By.CSS_SELECTOR, 'div[aria-hidden="true"]') - - for div in divs: - div_text = (div.text or '').strip() - - # Check if this div contains ONLY a review keyword (exact match, case-insensitive) - for keyword in 
REVIEW_WORDS: - if div_text.lower() == keyword.lower(): - log.info(f"Found aria-hidden div with exact text: '{div_text}'") - - # Get the parent element (should be the clickable tab/button) - try: - parent = driver.execute_script("return arguments[0].parentElement;", div) - if parent: - parent_tag = parent.tag_name.lower() - parent_role = parent.get_attribute('role') or '' - - log.info(f"Parent element: tag={parent_tag}, role={parent_role}") - - # Click the parent if it looks clickable - driver.execute_script("arguments[0].scrollIntoView({block:'center'});", parent) - time.sleep(0.5) - driver.execute_script("arguments[0].click();", parent) - time.sleep(2) - - if self.verify_reviews_tab_clicked(driver): - log.info(f"✅ Successfully clicked Reviews via aria-hidden fallback") - return True - except Exception as e: - log.debug(f"Error clicking parent of aria-hidden div: {e}") - continue - except Exception as e: - log.debug(f"Error in aria-hidden fallback: {e}") - - # If all methods failed - log.warning("Failed to navigate to reviews after trying all methods") - raise TimeoutException("Could not navigate to reviews section") - - def verify_reviews_tab_clicked(self, driver: Chrome) -> bool: - """ - Verify that the reviews tab was successfully clicked. - Uses robust verification methods that don't depend on fragile CSS classes. 
def set_sort(self, driver: Chrome, method: str):
    """
    Select a sort order in the Google Maps reviews sort menu.

    Steps: locate the sort button, open its dropdown, pick the menu item for
    ``method`` and click it, treating the menu closing as confirmation.

    Args:
        driver: Browser session showing the reviews pane.
        method: One of "relevance", "newest", "highest", "lowest".
            "relevance" is the page default, so nothing is clicked for it.

    Returns:
        True if the sort order is (or already was) the requested one,
        False if any step failed; the default order is then left in place.
        Never raises: unexpected errors are logged and reported as False.
    """
    if method == "relevance":
        log.info("Using default 'relevance' sort - no need to change sort order")
        return True  # Default order, no need to change

    log.info(f"Attempting to set sort order to '{method}'")

    try:
        # 1. Find the sort button
        sort_button = self._sort_find_button(driver)
        if sort_button is None:
            log.warning("No sort button found with any method - keeping default sort order")
            return False

        # 2. Open the dropdown menu
        driver.execute_script(
            "arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", sort_button)
        time.sleep(0.8)  # Wait for scroll

        menu_opened = self._sort_try_clicks(
            self._sort_click_strategies(driver, sort_button, keyboard_fallback=True),
            succeeded=lambda: self.check_if_menu_opened(driver),
            settle_seconds=1,  # Wait for menu to appear
            what="sort button",
        )
        if not menu_opened:
            log.warning("Failed to open sort menu - keeping default sort order")
            self._sort_click_away(driver)
            return False

        # 3. Find and click the desired option
        try:
            visible_items = self._sort_menu_items(driver)
            log.info(f"Available menu items: {[text for _, text in visible_items]}")

            target_item, matched_text = self._sort_pick_item(visible_items, method)
            if target_item is None:
                log.warning(f"No matching menu item found for '{method}'")
                self._sort_click_away(driver)
                return False

            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", target_item)
            time.sleep(0.3)

            clicked = self._sort_try_clicks(
                self._sort_click_strategies(driver, target_item, keyboard_fallback=False),
                # The menu closing is the only confirmation available here.
                succeeded=lambda: not self.check_if_menu_opened(driver),
                settle_seconds=1.5,  # Wait for sort to take effect
                what=f"menu item '{matched_text}'",
            )
            if clicked:
                log.info(f"Successfully set sort order to '{method}'")
                return True

            log.warning("Failed to click menu item - keeping default sort order")
            self._sort_click_away(driver)
            return False

        except TimeoutException:
            log.warning("Timeout waiting for menu items")
            return False
        except Exception as e:
            log.warning(f"Error in menu item selection: {e}")
            return False

    except Exception as e:
        log.warning(f"Error in set_sort method: {e}")
        return False


def _sort_find_button(self, driver):
    """Return the visible, enabled sort button, or None if none is found."""
    # Multi-language sort button keywords (what humans see)
    sort_keywords = {
        'en': ['sort', 'Sort', 'SORT'],
        'he': ['סדר', 'סידור'],
        'th': ['เรียง'],
        'zh': ['排序'],
        'fr': ['trier', 'Trier'],
        'es': ['ordenar', 'Ordenar'],
        'de': ['sortieren', 'Sortieren'],
        'pt': ['Classificar'],
        'it': ['Ordina'],
        'ru': ['Сортировать']
    }
    all_sort_keywords = [kw for keywords in sort_keywords.values() for kw in keywords]
    # Buttons whose label contains one of these are never the sort button.
    negative_keywords = ["back", "next", "previous", "close", "cancel", "חזרה", "סגור", "ปิด"]

    # PRIMARY: buttons whose text or aria-label contains a sort keyword.
    # XPath contains() is case-sensitive, hence the case variants above.
    log.info("Looking for sort button using text-based detection...")
    for keyword in all_sort_keywords:
        xpath = f"//button[contains(text(), '{keyword}') or contains(@aria-label, '{keyword}')]"
        try:
            candidates = driver.find_elements(By.XPATH, xpath)
        except Exception as e:
            log.debug(f"Error with keyword '{keyword}': {e}")
            continue

        for element in candidates:
            try:
                # Skip invisible/disabled elements
                if not element.is_displayed() or not element.is_enabled():
                    continue

                button_text = (element.text or "").strip()
                button_aria = element.get_attribute("aria-label") or ""

                if any(neg in button_text.lower() or neg in button_aria.lower()
                       for neg in negative_keywords):
                    continue

                # Sort buttons are typically dropdown triggers
                has_dropdown = (element.get_attribute("aria-haspopup") == "true" or
                                element.get_attribute("aria-expanded") is not None)

                if has_dropdown or keyword in button_text or keyword in button_aria:
                    log.info(f"✅ Found sort button with text: '{button_text}' or aria-label: '{button_aria}'")
                    return element
            except Exception as e:
                log.debug(f"Error checking element: {e}")
                continue

    # FALLBACK: any dropdown button whose label mentions sorting (case-insensitive)
    log.info("Trying fallback: finding buttons with dropdown attributes...")
    try:
        for button in driver.find_elements(By.CSS_SELECTOR, 'button[aria-haspopup="true"]'):
            if not button.is_displayed() or not button.is_enabled():
                continue

            button_text = (button.text or '').strip().lower()
            button_aria = (button.get_attribute("aria-label") or '').lower()

            if any(kw.lower() in button_text or kw.lower() in button_aria
                   for kw in all_sort_keywords):
                log.info(f"✅ Found sort button via fallback: {button.text}")
                return button
    except Exception as e:
        log.debug(f"Error in fallback method: {e}")

    return None


def _sort_click_strategies(self, driver, element, keyboard_fallback):
    """Return click callables for ``element``, ordered from most to least reliable."""
    strategies = [
        # JavaScript click
        lambda: driver.execute_script("arguments[0].click();", element),

        # Native WebDriver click
        lambda: element.click(),

        # Pointer move to the element centre, then click. This also covers the
        # old "centre click": on Selenium >= 4.3 offsets passed to
        # move_to_element_with_offset are measured from the centre, so the
        # previous (width/2, height/2) offset aimed at the bottom-right corner.
        lambda: ActionChains(driver).move_to_element(element).pause(0.3).click().perform(),

        # Synthetic bubbling MouseEvent
        lambda: driver.execute_script("""
            var el = arguments[0];
            var evt = new MouseEvent('click', {
                bubbles: true,
                cancelable: true,
                view: window
            });
            el.dispatchEvent(evt);
        """, element),

        # Focus, then click shortly after. The element is captured in a local
        # first: inside the timeout callback `arguments` is the callback's own
        # (empty) argument list, so `arguments[0]` there is undefined.
        lambda: driver.execute_script(
            "var el = arguments[0]; el.focus(); setTimeout(function() { el.click(); }, 100);",
            element
        ),
    ]
    if keyboard_fallback:
        # Click to focus, then press RETURN
        strategies.append(
            lambda: ActionChains(driver).move_to_element(element).click().send_keys(Keys.RETURN).perform()
        )
    return strategies


def _sort_try_clicks(self, click_methods, succeeded, settle_seconds, what):
    """Run click callables in order until ``succeeded()`` is true; return whether it was."""
    for attempt, click in enumerate(click_methods, start=1):
        try:
            log.info(f"Trying click method {attempt} for {what}...")
            click()
            time.sleep(settle_seconds)
            if succeeded():
                log.info(f"Click method {attempt} worked for {what}")
                return True
        except Exception as e:
            log.debug(f"Click method {attempt} failed for {what}: {e}")
    return False


def _sort_menu_items(self, driver):
    """Return ``(element, text)`` for each visible menu item that has text.

    Raises TimeoutException if no menu item appears within 5 seconds.
    """
    # Semantic role selectors rather than CSS classes
    log.info("Looking for menu items using semantic role attributes...")
    menu_items = WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '[role="menuitemradio"], [role="menuitem"]'))
    )

    visible_items = []
    for item in menu_items:
        try:
            if not item.is_displayed():
                continue

            text = driver.execute_script("""
                const elem = arguments[0];
                return elem.innerText || elem.textContent || elem.text || '';
            """, item).strip()

            if text:  # Only keep items with text
                visible_items.append((item, text))
        except Exception as e:
            log.debug(f"Error processing menu item: {e}")

    log.info(f"Found {len(visible_items)} visible menu items")
    for index, (_, text) in enumerate(visible_items, start=1):
        log.debug(f"  Menu item {index}: '{text}'")
    return visible_items


def _sort_pick_item(self, visible_items, method):
    """Choose the menu entry for ``method``; return ``(element, text)`` or ``(None, None)``."""
    wanted_labels = [label.lower() for label in SORT_OPTIONS.get(method, ())]

    def matches(text):
        text_clean = text.lower()
        return any(label in text_clean or text_clean in label for label in wanted_labels)

    # Expected position of each option in the menu
    position_map = {
        "relevance": 0,
        "newest": 1,
        "highest": 2,
        "lowest": 3
    }
    pos = position_map.get(method, -1)

    positional = None
    if 0 <= pos < len(visible_items):
        positional = visible_items[pos]
        if matches(positional[1]):
            log.info(f"Selected menu item at position {pos + 1}: '{positional[1]}' for sort method '{method}'")
            return positional
    else:
        log.warning(f"Position {pos} not available in menu (only {len(visible_items)} items)")

    # The positional guess is missing or mislabelled: prefer an entry whose
    # label matches over blindly clicking the wrong position.
    for item, text in visible_items:
        if matches(text):
            log.info(f"Selected menu item by label: '{text}' for sort method '{method}'")
            return item, text

    if positional is not None:
        # Nothing matched by label (e.g. an unlisted UI language): fall back
        # to the positional guess, but make the uncertainty visible.
        log.warning(f"WARNING: Selected '{positional[1]}' doesn't match expected '{method}' - might be wrong sort!")
        return positional

    return None, None


def _sort_click_away(self, driver):
    """Best-effort attempt to close an open menu by clicking elsewhere."""
    try:
        ActionChains(driver).move_by_offset(50, 50).click().perform()
    except Exception as e:
        log.debug(f"Could not click away from sort menu: {e}")
Look for menu items - menu_item_selectors = [ - 'div[role="menuitemradio"]', # Google Maps specific - 'div.fxNQSd', # Class-based detection - 'div.mLuXec', # Text container class - '[role="menuitem"]', # Generic menu items - '[role="option"]' # Alternative role - ] - - visible_items = 0 - for selector in menu_item_selectors: - elements = driver.find_elements(By.CSS_SELECTOR, selector) - for element in elements: - try: - if element.is_displayed(): - visible_items += 1 - if visible_items >= 2: # At least 2 menu items should be visible - return True - except: - continue - - # 4. Advanced detection with JavaScript - # Checks if there are newly visible elements with menu-related roles or classes - try: - js_detection = """ - return (function() { - // Check for visible menu elements - var menuElements = document.querySelectorAll('div[role="menu"], div[role="menuitemradio"], div.fxNQSd'); - for (var i = 0; i < menuElements.length; i++) { - var style = window.getComputedStyle(menuElements[i]); - if (style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0') { - return true; - } - } - - // Check for any recently appeared elements that might be a menu - var possibleMenus = document.querySelectorAll('div.yu5kgd, div.fontBodyLarge'); - for (var i = 0; i < possibleMenus.length; i++) { - var style = window.getComputedStyle(possibleMenus[i]); - var rect = possibleMenus[i].getBoundingClientRect(); - // Check if element is visible and has a meaningful size - if (style.display !== 'none' && style.visibility !== 'hidden' && - rect.width > 50 && rect.height > 50) { - return true; - } - } - - return false; - })(); - """ - menu_detected = driver.execute_script(js_detection) - if menu_detected: - return True - except Exception as js_error: - log.debug(f"Error in JavaScript menu detection: {js_error}") - - # 5. 
Last resort: check if any positioning styles were applied to elements - # This can detect menu containers that have been positioned absolutely - try: - position_check = """ - return (function() { - // Look for absolutely positioned elements that appeared recently - var elements = document.querySelectorAll('div[style*="position: absolute"]'); - for (var i = 0; i < elements.length; i++) { - var el = elements[i]; - var style = window.getComputedStyle(el); - var hasMenuItems = el.querySelectorAll('div[role="menuitemradio"], div.fxNQSd').length > 0; - - if (style.display !== 'none' && style.visibility !== 'hidden' && hasMenuItems) { - return true; - } - } - return false; - })(); - """ - position_detected = driver.execute_script(position_check) - if position_detected: - return True - except: - pass - - return False - - except Exception as e: - log.debug(f"Error checking menu state: {e}") - return False - - def wait_for_api_response(self, driver: Chrome, timeout: float = 2.0) -> bool: - """ - Smart wait that detects when new API response has arrived. - Much faster and more reliable than fixed time.sleep(). - - Returns True if new response detected, False if timeout. - """ - if not self.enable_api_intercept or not self.api_interceptor: - # Fallback to fixed wait if API interception disabled - time.sleep(0.6) - return False - - try: - # Get current response count - initial_count = driver.execute_script(""" - return (window.__allRequests || []).filter(r => - r.url && r.url.toLowerCase().includes('listugcposts') - ).length; - """) - - # Wait for new response with timeout - start = time.time() - while (time.time() - start) < timeout: - current_count = driver.execute_script(""" - return (window.__allRequests || []).filter(r => - r.url && r.url.toLowerCase().includes('listugcposts') - ).length; - """) - - if current_count > initial_count: - # New API response arrived! 
- elapsed = time.time() - start - log.debug(f"New API response detected after {elapsed:.2f}s") - time.sleep(0.2) # Small delay for DOM to update - return True - - time.sleep(0.05) # Check every 50ms - - # Timeout - no new response - log.debug(f"No API response after {timeout}s (might be at end of reviews)") - return False - - except Exception as e: - log.debug(f"Error waiting for API response: {e}") - time.sleep(0.6) # Fallback to fixed wait - return False - - def extract_total_reviews(self, driver: Chrome) -> Tuple[Optional[int], Optional[str]]: - """ - Extract total review count from Google Maps page. - Looks for patterns like "247 reviews", "1,234 reviews", or "5.2K reviews". - - Returns: - tuple: (total_count: int, count_string: str) or (None, None) if not found - """ - try: - # Method 1: Look for "XXX reviews" text in the page source - page_text = driver.page_source - - # Pattern: "244 reviews" or "1,234 reviews" or "5.2K reviews" - patterns = [ - r'(\d{1,3}(?:,\d{3})*)\s+reviews?', # "244 reviews" or "1,234 reviews" - r'(\d+\.?\d*K)\s+reviews?', # "5.2K reviews" - r'(\d{1,3}(?:,\d{3})*)\s+reseñas?', # Spanish - r'(\d{1,3}(?:,\d{3})*)\s+评论', # Chinese - ] - - for pattern in patterns: - matches = re.findall(pattern, page_text, re.IGNORECASE) - if matches: - count_str = matches[0] - - # Parse the count - if 'K' in count_str or 'k' in count_str: - # "5.2K" -> 5200 - num = float(count_str.replace('K', '').replace('k', '')) - total = int(num * 1000) - else: - # "1,234" -> 1234 - total = int(count_str.replace(',', '')) - - return total, count_str - - # Method 2: Look for aria-label with review count - buttons = driver.find_elements(By.TAG_NAME, 'button') - for btn in buttons: - aria_label = btn.get_attribute('aria-label') or '' - text = btn.text or '' - - # Check both aria-label and text - for content in [aria_label, text]: - match = re.search(r'(\d{1,3}(?:,\d{3})*)\s+reviews?', content, re.IGNORECASE) - if match: - count_str = match.group(1) - total = 
int(count_str.replace(',', '')) - return total, count_str - - return None, None - - except Exception as e: - log.debug(f"Error extracting total review count: {e}") - return None, None - - def scrape(self): - """Main scraper method""" - start_time = time.time() - - url = self.config.get("url") - headless = self.config.get("headless", True) - sort_by = self.config.get("sort_by", "relevance") - stop_on_match = self.config.get("stop_on_match", False) - - log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}") - log.info(f"URL: {url}") - - # Initialize storage - # If not overwriting, load existing data - if self.overwrite_existing: - docs = {} - seen = set() - else: - # Try to get from MongoDB first if enabled - docs = {} - if self.use_mongodb and self.mongodb: - docs = self.mongodb.fetch_existing_reviews() - - # If backup_to_json is enabled, also load from JSON for merging - if self.backup_to_json: - json_docs = self.json_storage.load_json_docs() - # Merge JSON docs with MongoDB docs - for review_id, review in json_docs.items(): - if review_id not in docs: - docs[review_id] = review - - # Load seen IDs from file - seen = self.json_storage.load_seen() - - driver = None - api_reviews = {} # Store reviews captured from API - try: - driver = self.setup_driver(headless) - wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout - - driver.get(url) - wait.until(lambda d: "google.com/maps" in d.current_url) - - # Wait briefly for consent dialogs to appear (optimized from 3s to 1s) - time.sleep(1) - - # Try to dismiss any consent/cookie dialogs - if not self.dismiss_cookies(driver): - # Quick retry (optimized from 2s to 0.5s) - time.sleep(0.5) - self.dismiss_cookies(driver) - - self.click_reviews_tab(driver) - - # Reduced wait after clicking reviews tab (optimized from 3s to 1s) - log.info("Waiting for reviews page to fully load...") - time.sleep(1) - - # Wait for page to be fully interactive - try: - wait.until(lambda d: 
d.execute_script("return document.readyState") == "complete") - log.info("Page DOM is ready") - except: - log.debug("Could not verify page ready state") - - # Extract total review count from the page - total_reviews, total_str = self.extract_total_reviews(driver) - if total_reviews: - log.info(f"✅ Google shows {total_str} ({total_reviews} total reviews)") - else: - log.warning("⚠️ Could not extract total review count - will scroll until no new reviews") - total_reviews = None - - # Verify we're on a reviews page before proceeding - if "review" not in driver.current_url.lower(): - log.warning("URL doesn't contain 'review' - might not be on reviews page") - - # Try to set sort - but don't fail if it doesn't work - try: - self.set_sort(driver, sort_by) - except Exception as sort_error: - log.warning(f"Sort failed but continuing: {sort_error}") - - # Reduced wait after setting sort (optimized from 3s to 1s) - log.info("Waiting for reviews to render...") - time.sleep(1) - - # Find the scrollable reviews pane using robust detection - # Uses JavaScript to find elements by their scrollable properties, not CSS classes - pane = None - - try: - log.info("Finding scrollable reviews pane using robust detection...") - - # JavaScript to find scrollable container (no CSS classes needed!) 
- find_scrollable_script = """ - function findScrollablePane() { - // Find all divs that might be scrollable - const allDivs = document.querySelectorAll('div'); - - for (let div of allDivs) { - const style = window.getComputedStyle(div); - const overflowY = style.overflowY; - - // Check if element is scrollable - if ((overflowY === 'auto' || overflowY === 'scroll') && - div.scrollHeight > div.clientHeight && - div.clientHeight > 200) { // Must be tall enough to be main pane - - // Additional checks: should contain review-like content - const text = div.textContent || ''; - const hasReviewIndicators = - text.includes('star') || - text.includes('rating') || - text.includes('review') || - div.querySelector('[data-review-id]') || - div.querySelector('[role="img"][aria-label*="star"]'); - - if (hasReviewIndicators) { - return div; - } - } - } - - // Fallback: return main element if found - return document.querySelector('[role="main"]'); - } - return findScrollablePane(); - """ - - pane = driver.execute_script(find_scrollable_script) - - if pane: - log.info("✅ Found scrollable reviews pane using robust JavaScript detection") - else: - log.warning("❌ Could not find scrollable reviews pane") - - except Exception as e: - log.warning(f"Error finding scrollable pane with JavaScript: {e}") - # Fallback to simple div[role="main"] if JS fails - try: - pane = driver.find_element(By.CSS_SELECTOR, 'div[role="main"]') - log.info("Using fallback: div[role='main']") - except: - pass - - if not pane: - log.error("Could not find reviews pane. 
Page structure might have changed.") - return False - - # Initialize API interceptor AFTER reviews page is loaded (if enabled) - # This prevents CDP interception from affecting initial page load and tab detection - if self.enable_api_intercept: - log.info("Setting up API interception for reviews capture") - self.api_interceptor = GoogleMapsAPIInterceptor(driver) - self.api_interceptor.setup_interception() - self.api_interceptor.inject_response_interceptor() - log.info("API interceptor ready - capturing network responses") - - pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen)) - idle = 0 - processed_ids = set() # Track processed IDs in current session - - # Prefetch selector to avoid repeated lookups - try: - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - except Exception as e: - log.warning(f"Error setting up scroll script: {e}") - scroll_script = "window.scrollBy(0, 300);" # Fallback to simple scrolling - - # Card selectors to try (ROBUST - semantic attributes only, no CSS classes!) 
- # Only use data-review-id attribute which is stable and won't break with Google updates - card_selectors = [ - "[data-review-id]", # PRIMARY: Any element with review ID (most robust) - "div[data-review-id]", # Fallback: Div with review ID - ] - # REMOVED FRAGILE CSS CLASS SELECTORS: - # - CARD_SEL (div.jftiEf) - Google's obfuscated class, breaks on updates - # - .jftiEf - Same as above - # - div.WMbnJf - Another obfuscated class - # We now rely on semantic [data-review-id] attribute + API interceptor - - # CONTINUOUS SCROLLING APPROACH - # Scroll NON-STOP in background thread while extracting reviews in main thread - stop_scrolling = threading.Event() - scroll_count = [0] # Use list to make it mutable in thread - load_times = [] # Track when new reviews are loaded for smart timeout - - def continuous_scroll_worker(): - """Background thread that scrolls continuously without stopping""" - while not stop_scrolling.is_set(): - try: - driver.execute_script(scroll_script) - scroll_count[0] += 1 - time.sleep(0.005) # 5ms = ultra fast continuous scrolling! - except: - pass - - # Start continuous scrolling thread - scroll_thread = threading.Thread(target=continuous_scroll_worker, daemon=True) - scroll_thread.start() - log.info("🚀 Started continuous NON-STOP scrolling thread") - - check_num = 0 - max_checks = 100 # Maximum safety limit - - while check_num < max_checks: - check_num += 1 - - # Check if we've collected all reviews - if total_reviews and len(seen) >= total_reviews: - percent = (len(seen) / total_reviews) * 100 - log.info(f"✅ Got all {total_reviews} reviews ({percent:.1f}%)! 
Stopping scrolling.") - stop_scrolling.set() - break - - # Wait between checks while scrolling continues in background - time.sleep(2.0) # Check every 2 seconds - - try: - # Try multiple card selectors within the pane - cards = [] - for card_sel in card_selectors: - cards = pane.find_elements(By.CSS_SELECTOR, card_sel) - if cards: - if check_num == 1: # Only log once - log.info(f"Found {len(cards)} cards with selector: {card_sel}") - break - - # If no cards found in pane, try searching the entire document - if not cards: - for card_sel in card_selectors: - cards = driver.find_elements(By.CSS_SELECTOR, card_sel) - if cards: - if check_num == 1: - log.info(f"Found {len(cards)} cards in document with selector: {card_sel}") - break - - fresh_cards: List[WebElement] = [] - previous_count = len(seen) - - for c in cards: - try: - # Try to get data-review-id from the card itself - cid = c.get_attribute("data-review-id") - # If not found on card, try to find it in a child element - if not cid: - try: - review_id_elem = c.find_element(By.CSS_SELECTOR, "[data-review-id]") - cid = review_id_elem.get_attribute("data-review-id") - except: - pass - if not cid or cid in seen or cid in processed_ids: - if stop_on_match and cid and (cid in seen or cid in processed_ids): - idle = 999 - break - continue - fresh_cards.append(c) - except StaleElementReferenceException: - continue - except Exception as e: - log.debug(f"Error getting review ID: {e}") - continue - - # Process fresh cards - for card in fresh_cards: - try: - raw = RawReview.from_card(card) - processed_ids.add(raw.id) - except StaleElementReferenceException: - continue - except Exception: - log.warning("⚠️ parse error – storing stub\n%s", - traceback.format_exc(limit=1).strip()) - try: - raw_id = card.get_attribute("data-review-id") or "" - raw = RawReview(id=raw_id, text="", lang="und") - processed_ids.add(raw_id) - except StaleElementReferenceException: - continue - - docs[raw.id] = merge_review(docs.get(raw.id), raw) - 
seen.add(raw.id) - pbar.update(1) - - # Calculate how many new reviews we got - new_count = len(seen) - previous_count - - # Track load times for smart timeout - if new_count > 0: - current_time = time.time() - load_times.append(current_time) - - if total_reviews: - percent = (len(seen) / total_reviews) * 100 - log.info(f"Check {check_num:2d}: {len(seen):3d}/{total_reviews} ({percent:5.1f}%) | +{new_count} new") - else: - log.info(f"Check {check_num:2d}: {len(seen):3d} total | +{new_count} new") - else: - # No new reviews in this check - if total_reviews: - percent = (len(seen) / total_reviews) * 100 - log.info(f"Check {check_num:2d}: {len(seen):3d}/{total_reviews} ({percent:5.1f}%) | +0 new") - else: - log.info(f"Check {check_num:2d}: {len(seen):3d} total | +0 new") - - # Smart timeout: stop if no new reviews for 3x average gap - if new_count == 0: - if len(load_times) >= 3: - # Calculate average gap between individual review loads - gaps = [load_times[i] - load_times[i-1] for i in range(1, len(load_times))] - avg_gap = sum(gaps) / len(gaps) - timeout_threshold = avg_gap * 3 - timeout_type = f"gap-based (avg gap: {avg_gap:.1f}s)" - elif len(load_times) > 0: - # Initial timeout: use 3x time since first load started - time_since_first = time.time() - load_times[0] - timeout_threshold = max(10.0, time_since_first * 3) # At least 10s - timeout_type = f"initial (time since first: {time_since_first:.1f}s)" - else: - # No loads yet - use default initial timeout - timeout_threshold = 15.0 - timeout_type = "default (no loads yet)" - - # Check time since last load - if len(load_times) > 0: - time_since_last = time.time() - load_times[-1] - - # Log timeout status every check when no new reviews - log.debug(f" Timeout check: {time_since_last:.1f}s / {timeout_threshold:.1f}s ({timeout_type})") - - if time_since_last > timeout_threshold: - log.info(f"⏱️ No new reviews for {time_since_last:.1f}s (threshold: {timeout_threshold:.1f}s, {timeout_type}) - stopping") - 
stop_scrolling.set() - break - - # Fallback: stop if no new reviews for 10 consecutive checks - if new_count == 0: - idle += 1 - if idle >= 10: - log.info(f"⏱️ No new reviews for {idle} checks - stopping") - stop_scrolling.set() - break - else: - idle = 0 - - # Collect API responses if interception is enabled - if self.enable_api_intercept and self.api_interceptor: - try: - responses = self.api_interceptor.get_intercepted_responses() - if responses: - log.debug(f"Collected {len(responses)} network responses from browser") - - # Dump first few responses for analysis - if not hasattr(self, '_dumped_responses'): - self._dumped_responses = 0 - - if self._dumped_responses < 5: # Dump first 5 responses - from pathlib import Path - import json - output_dir = Path("api_response_samples") - output_dir.mkdir(exist_ok=True) - - for resp in responses: - if self._dumped_responses >= 5: - break - - idx = self._dumped_responses - body = resp.get('body', '') - - # Save full response - full_file = output_dir / f"response_{idx:02d}_full.json" - with open(full_file, 'w', encoding='utf-8') as f: - json.dump(resp, f, indent=2, ensure_ascii=False) - - # Save body - body_file = output_dir / f"response_{idx:02d}_body.txt" - with open(body_file, 'w', encoding='utf-8') as f: - f.write(body) - - # Try to parse and save - clean_body = body[4:].strip() if body.startswith(")]}'") else body - try: - parsed_data = json.loads(clean_body) - parsed_file = output_dir / f"response_{idx:02d}_parsed.json" - with open(parsed_file, 'w', encoding='utf-8') as f: - json.dump(parsed_data, f, indent=2, ensure_ascii=False) - log.info(f"Dumped API response {idx} to {output_dir}/ ({len(body)} bytes)") - except: - log.debug(f"Response {idx} is not JSON") - - self._dumped_responses += 1 - - parsed = self.api_interceptor.parse_reviews_from_responses(responses) - log.debug(f"Parsed {len(parsed)} reviews from responses") - for intercepted in parsed: - if intercepted.review_id and intercepted.review_id not in 
api_reviews: - api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted) - if parsed: - log.info(f"API interceptor captured {len(parsed)} reviews (total unique API: {len(api_reviews)})") - - # Log stats every 10 checks - if check_num % 10 == 0: - stats = self.api_interceptor.get_interceptor_stats() - if stats: - log.debug(f"Interceptor stats - Fetch: {stats.get('totalFetch', 0)}/{stats.get('capturedFetch', 0)}, " - f"XHR: {stats.get('totalXHR', 0)}/{stats.get('capturedXHR', 0)}, " - f"Last: {stats.get('lastCapture', 'never')}") - except Exception as api_err: - log.warning(f"API interception error: {api_err}", exc_info=True) - - except StaleElementReferenceException: - # The pane or other element went stale, try to re-find - log.debug("Stale element encountered, re-finding elements") - try: - pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL))) - driver.execute_script("window.scrollablePane = arguments[0];", pane) - except Exception: - log.warning("Could not re-find reviews pane after stale element") - break - except Exception as e: - log.warning(f"Error during review processing: {e}") - time.sleep(1) - - pbar.close() - - # Stop continuous scrolling thread - stop_scrolling.set() - scroll_thread.join(timeout=2.0) - log.info(f"🛑 Stopped scrolling thread after {scroll_count[0]} total scrolls") - - # Merge API-captured reviews if any - if self.enable_api_intercept and api_reviews: - log.info(f"Merging {len(api_reviews)} reviews captured via API interception") - for review_id, api_review in api_reviews.items(): - if review_id not in docs: - # New review from API only - docs[review_id] = api_review - seen.add(review_id) - else: - # Merge API data with existing DOM data (API might have more details) - existing = docs[review_id] - # Only update fields that are missing or empty - for key, value in api_review.items(): - if key not in existing or not existing.get(key): - existing[key] = value - 
log.info(f"After merge: {len(docs)} total reviews") - elif self.enable_api_intercept: - # Log final stats even if no reviews captured - if self.api_interceptor: - stats = self.api_interceptor.get_interceptor_stats() - if stats: - log.warning(f"⚠️ API interception was enabled but captured 0 reviews. " - f"Network stats - Fetch requests: {stats.get('capturedFetch', 0)}/{stats.get('totalFetch', 0)}, " - f"XHR requests: {stats.get('capturedXHR', 0)}/{stats.get('totalXHR', 0)}") - - # Get browser console logs for debugging - console_logs = self.api_interceptor.get_browser_console_logs() - api_logs = [log_entry for log_entry in console_logs - if 'API Interceptor' in log_entry.get('message', '')] - if api_logs: - log.info(f"Found {len(api_logs)} API interceptor console messages") - for entry in api_logs[:10]: # Show first 10 - log.debug(f" Console: {entry.get('message', '')[:200]}") - else: - log.debug("No API interceptor console messages found") - - # In debug mode, try to dump any responses that were collected - if log.level <= logging.DEBUG: - all_responses = self.api_interceptor.get_intercepted_responses() - if all_responses: - dump_path = self.api_interceptor.dump_responses_to_file(all_responses) - if dump_path: - log.info(f"Raw responses dumped to: {dump_path}") - else: - log.warning("API interceptor stats not available") - - # Save to MongoDB if enabled - if self.use_mongodb and self.mongodb: - log.info("Saving reviews to MongoDB...") - self.mongodb.save_reviews(docs) - - # Backup to JSON if enabled - if self.backup_to_json: - log.info("Backing up to JSON...") - self.json_storage.save_json_docs(docs) - self.json_storage.save_seen(seen) - - # Final summary with completion percentage - if total_reviews: - percent = (len(docs) / total_reviews) * 100 - missing = total_reviews - len(docs) - if missing <= 0: - log.info(f"✅ Finished – Got all {total_reviews} reviews ({percent:.1f}%)") - elif percent >= 95.0: - log.info(f"✅ Finished – Got {len(docs)}/{total_reviews} 
reviews ({percent:.1f}%) - missing {missing}") - else: - log.info(f"⚠️ Finished – Got {len(docs)}/{total_reviews} reviews ({percent:.1f}%) - missing {missing}") - else: - log.info("✅ Finished – total unique reviews: %s", len(docs)) - - end_time = time.time() - elapsed_time = end_time - start_time - log.info(f"Execution completed in {elapsed_time:.2f} seconds") - - return True - - except Exception as e: - log.error(f"Error during scraping: {e}") - log.error(traceback.format_exc()) - return False - - finally: - # Cleanup API interceptor - if self.api_interceptor: - try: - self.api_interceptor.cleanup() - except Exception: - pass - - if driver is not None: - try: - driver.quit() - except Exception: - pass - - if self.mongodb: - try: - self.mongodb.close() - except Exception: - pass - -# """ -# Selenium scraping logic for Google Maps Reviews. -# """ -# -# import os -# import time -# import logging -# import traceback -# import platform -# from typing import Dict, Any, List -# -# import undetected_chromedriver as uc -# from selenium.common.exceptions import TimeoutException, StaleElementReferenceException -# from selenium.webdriver import Chrome -# from selenium.webdriver.common.by import By -# from selenium.webdriver.remote.webelement import WebElement -# from selenium.webdriver.support import expected_conditions as EC -# from selenium.webdriver.support.ui import WebDriverWait -# from tqdm import tqdm -# -# from modules.models import RawReview -# from modules.data_storage import MongoDBStorage, JSONStorage, merge_review -# -# # Logger -# log = logging.getLogger("scraper") -# -# # CSS Selectors -# PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf' -# CARD_SEL = "div[data-review-id]" -# COOKIE_BTN = ('button[aria-label*="Accept" i],' -# 'button[jsname="hZCF7e"],' -# 'button[data-mdc-dialog-action="accept"]') -# SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]' -# MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]' 
-# -# SORT_LABELS = { # text shown in Google Maps' menu -# "newest": ("Newest", "החדשות ביותר", "ใหม่ที่สุด"), -# "highest": ("Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด"), -# "lowest": ("Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด"), -# "relevance": ("Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด"), -# } -# -# REVIEW_WORDS = {"reviews", "review", "ביקורות", "รีวิว", "avis", "reseñas", -# "recensioni", "bewertungen", "口コミ", "レビュー", -# "리뷰", "評論", "评论", "рецензии", "ביקורת"} -# -# -# class GoogleReviewsScraper: -# """Main scraper class for Google Maps reviews""" -# -# def __init__(self, config: Dict[str, Any]): -# """Initialize scraper with configuration""" -# self.config = config -# self.use_mongodb = config.get("use_mongodb", True) -# self.mongodb = MongoDBStorage(config) if self.use_mongodb else None -# self.json_storage = JSONStorage(config) -# self.backup_to_json = config.get("backup_to_json", True) -# self.overwrite_existing = config.get("overwrite_existing", False) -# -# def setup_driver(self, headless: bool) -> Chrome: -# """ -# Set up and configure Chrome driver with flexibility for different environments. -# Works in both Docker containers and on regular OS installations (Windows, Mac, Linux). 
-# """ -# # Determine if we're running in a container -# in_container = os.environ.get('CHROME_BIN') is not None -# -# # Create Chrome options -# opts = uc.ChromeOptions() -# opts.add_argument("--window-size=1400,900") -# opts.add_argument("--ignore-certificate-errors") -# opts.add_argument("--disable-gpu") # Improves performance -# opts.add_argument("--disable-dev-shm-usage") # Helps with stability -# opts.add_argument("--no-sandbox") # More stable in some environments -# -# # Use headless mode if requested -# if headless: -# opts.add_argument("--headless=new") -# -# # Log platform information for debugging -# log.info(f"Platform: {platform.platform()}") -# log.info(f"Python version: {platform.python_version()}") -# -# # If in container, use environment-provided binaries -# if in_container: -# chrome_binary = os.environ.get('CHROME_BIN') -# chromedriver_path = os.environ.get('CHROMEDRIVER_PATH') -# -# log.info(f"Container environment detected") -# log.info(f"Chrome binary: {chrome_binary}") -# log.info(f"ChromeDriver path: {chromedriver_path}") -# -# if chrome_binary and os.path.exists(chrome_binary): -# log.info(f"Using Chrome binary from environment: {chrome_binary}") -# opts.binary_location = chrome_binary -# -# try: -# # Try creating Chrome driver with undetected_chromedriver -# log.info("Attempting to create undetected_chromedriver instance") -# driver = uc.Chrome(options=opts) -# log.info("Successfully created undetected_chromedriver instance") -# except Exception as e: -# # Fall back to regular Selenium if undetected_chromedriver fails -# log.warning(f"Failed to create undetected_chromedriver instance: {e}") -# log.info("Falling back to regular Selenium Chrome") -# -# # Import Selenium webdriver here to avoid potential import issues -# from selenium import webdriver -# from selenium.webdriver.chrome.service import Service -# -# if chromedriver_path and os.path.exists(chromedriver_path): -# log.info(f"Using ChromeDriver from path: {chromedriver_path}") -# 
service = Service(executable_path=chromedriver_path) -# driver = webdriver.Chrome(service=service, options=opts) -# else: -# log.info("Using default ChromeDriver") -# driver = webdriver.Chrome(options=opts) -# else: -# # On regular OS, use default undetected_chromedriver -# log.info("Using standard undetected_chromedriver setup") -# driver = uc.Chrome(options=opts) -# -# # Set page load timeout to avoid hanging -# driver.set_page_load_timeout(30) -# log.info("Chrome driver setup completed successfully") -# return driver -# -# def dismiss_cookies(self, driver: Chrome): -# """ -# Dismiss cookie consent dialogs if present. -# Handles stale element references by re-finding elements if needed. -# """ -# try: -# # Use WebDriverWait with expected_conditions to handle stale elements -# WebDriverWait(driver, 3).until( -# EC.presence_of_element_located((By.CSS_SELECTOR, COOKIE_BTN)) -# ) -# log.info("Cookie consent dialog found, attempting to dismiss") -# -# # Get elements again after waiting to avoid stale references -# elements = driver.find_elements(By.CSS_SELECTOR, COOKIE_BTN) -# for elem in elements: -# try: -# if elem.is_displayed(): -# elem.click() -# log.info("Cookie dialog dismissed") -# return True -# except Exception as e: -# log.debug(f"Error clicking cookie button: {e}") -# continue -# except TimeoutException: -# # This is expected if no cookie dialog is present -# log.debug("No cookie consent dialog detected") -# except Exception as e: -# log.debug(f"Error handling cookie dialog: {e}") -# -# return False -# -# def is_reviews_tab(self, tab: WebElement) -> bool: -# """Check if a tab is the reviews tab""" -# try: -# label = (tab.get_attribute("aria-label") or tab.text or "").lower() -# return tab.get_attribute("data-tab-index") == "1" or any(w in label for w in REVIEW_WORDS) -# except StaleElementReferenceException: -# return False -# except Exception as e: -# log.debug(f"Error checking if tab is reviews tab: {e}") -# return False -# -# def click_reviews_tab(self, 
driver: Chrome): -# """ -# Click on the reviews tab in Google Maps with improved stale element handling. -# """ -# end = time.time() + 15 # Timeout after 15 seconds -# while time.time() < end: -# try: -# # Find all tab elements -# tabs = driver.find_elements(By.CSS_SELECTOR, '[role="tab"], button[aria-label]') -# -# for tab in tabs: -# try: -# # Check if this is the reviews tab -# label = (tab.get_attribute("aria-label") or tab.text or "").lower() -# is_review_tab = tab.get_attribute("data-tab-index") == "1" or any( -# w in label for w in REVIEW_WORDS) -# -# if is_review_tab: -# # Scroll the tab into view -# driver.execute_script("arguments[0].scrollIntoView({block:\"center\"});", tab) -# time.sleep(0.2) # Small wait after scrolling -# -# # Try to click the tab -# log.info("Found reviews tab, attempting to click") -# tab.click() -# log.info("Successfully clicked reviews tab") -# return True -# except Exception as e: -# # Element might be stale or not clickable, try the next one -# log.debug(f"Error with tab element: {str(e)}") -# continue -# -# # If we get here, we didn't find a suitable tab in this iteration -# log.debug("No reviews tab found in this iteration, waiting...") -# time.sleep(0.5) # Wait before next attempt -# -# except Exception as e: -# # General exception handling -# log.debug(f"Exception while looking for reviews tab: {str(e)}") -# time.sleep(0.5) -# -# # If we exit the loop, we've timed out -# log.warning("Timeout while looking for reviews tab") -# raise TimeoutException("Reviews tab not found") -# -# def set_sort(self, driver: Chrome, method: str): -# """ -# Set the sorting method for reviews with improved error handling. 
-# """ -# if method == "relevance": -# return True # Default order, no need to change -# -# log.info(f"Attempting to set sort order to '{method}'") -# -# try: -# # First try to find and click the sort button -# sort_buttons = driver.find_elements(By.CSS_SELECTOR, SORT_BTN) -# if not sort_buttons: -# log.warning(f"Sort button not found - keeping default sort order") -# return False -# -# # Try to click the first visible sort button -# for sort_button in sort_buttons: -# try: -# if sort_button.is_displayed() and sort_button.is_enabled(): -# sort_button.click() -# log.info("Clicked sort button") -# time.sleep(0.5) # Wait for menu to appear -# break -# except Exception as e: -# log.debug(f"Error clicking sort button: {e}") -# continue -# else: -# log.warning("No clickable sort button found") -# return False -# -# # Now find and click the menu item for the desired sort method -# wanted = SORT_LABELS[method] -# menu_items = WebDriverWait(driver, 3).until( -# EC.presence_of_all_elements_located((By.CSS_SELECTOR, MENU_ITEMS)) -# ) -# -# for item in menu_items: -# try: -# label = item.text.strip() -# if label in wanted: -# item.click() -# log.info(f"Selected sort option: {label}") -# time.sleep(0.5) # Wait for sorting to take effect -# return True -# except Exception as e: -# log.debug(f"Error clicking menu item: {e}") -# continue -# -# log.warning(f"Sort option '{method}' not found in menu - keeping default") -# return False -# -# except Exception as e: -# log.warning(f"Error setting sort order: {e}") -# return False -# -# def scrape(self): -# """Main scraper method""" -# start_time = time.time() -# -# url = self.config.get("url") -# headless = self.config.get("headless", True) -# sort_by = self.config.get("sort_by", "relevance") -# stop_on_match = self.config.get("stop_on_match", False) -# -# log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}") -# log.info(f"URL: {url}") -# -# # Initialize storage -# # If not overwriting, load existing data 
-# if self.overwrite_existing: -# docs = {} -# seen = set() -# else: -# # Try to get from MongoDB first if enabled -# docs = {} -# if self.use_mongodb and self.mongodb: -# docs = self.mongodb.fetch_existing_reviews() -# -# # If backup_to_json is enabled, also load from JSON for merging -# if self.backup_to_json: -# json_docs = self.json_storage.load_json_docs() -# # Merge JSON docs with MongoDB docs -# for review_id, review in json_docs.items(): -# if review_id not in docs: -# docs[review_id] = review -# -# # Load seen IDs from file -# seen = self.json_storage.load_seen() -# -# driver = None -# try: -# driver = self.setup_driver(headless) -# wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout -# -# driver.get(url) -# wait.until(lambda d: "google.com/maps" in d.current_url) -# -# self.dismiss_cookies(driver) -# self.click_reviews_tab(driver) -# self.set_sort(driver, sort_by) -# -# # Add a wait after setting sort to allow results to load -# time.sleep(1) -# -# # Use try-except to handle cases where the pane is not found -# try: -# pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL))) -# except TimeoutException: -# log.warning("Could not find reviews pane. 
Page structure might have changed.") -# return False -# -# pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen)) -# idle = 0 -# processed_ids = set() # Track processed IDs in current session -# -# # Prefetch selector to avoid repeated lookups -# try: -# driver.execute_script("window.scrollablePane = arguments[0];", pane) -# scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" -# except Exception as e: -# log.warning(f"Error setting up scroll script: {e}") -# scroll_script = "window.scrollBy(0, 300);" # Fallback to simple scrolling -# -# max_attempts = 10 # Limit the number of attempts to find reviews -# attempts = 0 -# -# while attempts < max_attempts: -# try: -# cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL) -# fresh_cards: List[WebElement] = [] -# -# # Check for valid cards -# if len(cards) == 0: -# log.debug("No review cards found in this iteration") -# attempts += 1 -# # Try scrolling anyway -# driver.execute_script(scroll_script) -# time.sleep(1) -# continue -# -# for c in cards: -# try: -# cid = c.get_attribute("data-review-id") -# if not cid or cid in seen or cid in processed_ids: -# if stop_on_match and cid and (cid in seen or cid in processed_ids): -# idle = 999 -# break -# continue -# fresh_cards.append(c) -# except StaleElementReferenceException: -# continue -# except Exception as e: -# log.debug(f"Error getting review ID: {e}") -# continue -# -# for card in fresh_cards: -# try: -# raw = RawReview.from_card(card) -# processed_ids.add(raw.id) # Track this ID to avoid re-processing -# except StaleElementReferenceException: -# continue -# except Exception: -# log.warning("⚠️ parse error – storing stub\n%s", -# traceback.format_exc(limit=1).strip()) -# try: -# raw_id = card.get_attribute("data-review-id") or "" -# raw = RawReview(id=raw_id, text="", lang="und") -# processed_ids.add(raw_id) -# except StaleElementReferenceException: -# continue -# -# docs[raw.id] = merge_review(docs.get(raw.id), raw) -# 
seen.add(raw.id) -# pbar.update(1) -# idle = 0 -# attempts = 0 # Reset attempts counter when we successfully process a review -# -# if idle >= 3: -# break -# -# if not fresh_cards: -# idle += 1 -# attempts += 1 -# -# # Use JavaScript for smoother scrolling -# try: -# driver.execute_script(scroll_script) -# except Exception as e: -# log.warning(f"Error scrolling: {e}") -# # Try a simpler scroll method -# driver.execute_script("window.scrollBy(0, 300);") -# -# # Dynamic sleep: sleep less when processing many reviews -# sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0 -# time.sleep(sleep_time) -# -# except StaleElementReferenceException: -# # The pane or other element went stale, try to re-find -# log.debug("Stale element encountered, re-finding elements") -# try: -# pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL))) -# driver.execute_script("window.scrollablePane = arguments[0];", pane) -# except Exception: -# log.warning("Could not re-find reviews pane after stale element") -# break -# except Exception as e: -# log.warning(f"Error during review processing: {e}") -# attempts += 1 -# time.sleep(1) -# -# pbar.close() -# -# # Save to MongoDB if enabled -# if self.use_mongodb and self.mongodb: -# log.info("Saving reviews to MongoDB...") -# self.mongodb.save_reviews(docs) -# -# # Backup to JSON if enabled -# if self.backup_to_json: -# log.info("Backing up to JSON...") -# self.json_storage.save_json_docs(docs) -# self.json_storage.save_seen(seen) -# -# log.info("✅ Finished – total unique reviews: %s", len(docs)) -# -# end_time = time.time() -# elapsed_time = end_time - start_time -# log.info(f"Execution completed in {elapsed_time:.2f} seconds") -# -# return True -# -# except Exception as e: -# log.error(f"Error during scraping: {e}") -# log.error(traceback.format_exc()) -# return False -# -# finally: -# if driver is not None: -# try: -# driver.quit() -# except Exception: -# pass -# -# if self.mongodb: -# try: -# self.mongodb.close() -# except 
Exception: -# pass -# -# # """ -# # Selenium scraping logic for Google Maps Reviews. -# # """ -# # -# # import re -# # import time -# # import logging -# # import traceback -# # from typing import Dict, Any, Set, List -# # -# # import undetected_chromedriver as uc -# # from selenium.common.exceptions import TimeoutException -# # from selenium.webdriver import Chrome -# # from selenium.webdriver.common.by import By -# # from selenium.webdriver.remote.webelement import WebElement -# # from selenium.webdriver.support import expected_conditions as EC -# # from selenium.webdriver.support.ui import WebDriverWait -# # from tqdm import tqdm -# # -# # from modules.models import RawReview -# # from modules.data_storage import MongoDBStorage, JSONStorage, merge_review -# # from modules.utils import click_if -# # -# # # Logger -# # log = logging.getLogger("scraper") -# # -# # # CSS Selectors -# # PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf' -# # CARD_SEL = "div[data-review-id]" -# # COOKIE_BTN = ('button[aria-label*="Accept" i],' -# # 'button[jsname="hZCF7e"],' -# # 'button[data-mdc-dialog-action="accept"]') -# # SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]' -# # MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]' -# # -# # SORT_LABELS = { # text shown in Google Maps' menu -# # "newest": ("Newest", "החדשות ביותר", "ใหม่ที่สุด"), -# # "highest": ("Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด"), -# # "lowest": ("Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด"), -# # "relevance": ("Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด"), -# # } -# # -# # REVIEW_WORDS = {"reviews", "review", "ביקורות", "รีวิว", "avis", "reseñas", -# # "recensioni", "bewertungen", "口コミ", "レビュー", -# # "리뷰", "評論", "评论", "рецензии"} -# # -# # -# # class GoogleReviewsScraper: -# # """Main scraper class for Google Maps reviews""" -# # -# # def __init__(self, config: Dict[str, Any]): -# # """Initialize scraper with 
configuration""" -# # self.config = config -# # self.use_mongodb = config.get("use_mongodb", True) -# # self.mongodb = MongoDBStorage(config) if self.use_mongodb else None -# # self.json_storage = JSONStorage(config) -# # self.backup_to_json = config.get("backup_to_json", True) -# # self.overwrite_existing = config.get("overwrite_existing", False) -# # -# # def setup_driver(self, headless: bool) -> Chrome: -# # """Set up and configure Chrome driver""" -# # opts = uc.ChromeOptions() -# # opts.add_argument("--window-size=1400,900") -# # opts.add_argument("--ignore-certificate-errors") -# # opts.add_argument("--disable-gpu") # Improves performance -# # opts.add_argument("--disable-dev-shm-usage") # Helps with stability -# # opts.add_argument("--no-sandbox") # More stable in some environments -# # -# # if headless: -# # opts.add_argument("--headless=new") -# # -# # driver = uc.Chrome(options=opts) -# # # Set page load timeout to avoid hanging -# # driver.set_page_load_timeout(30) -# # return driver -# # -# # def dismiss_cookies(self, driver: Chrome): -# # """Dismiss cookie consent dialogs""" -# # click_if(driver, COOKIE_BTN, timeout=3.0) # Reduced timeout for faster operation -# # -# # def is_reviews_tab(self, tab: WebElement) -> bool: -# # """Check if a tab is the reviews tab""" -# # label = (tab.get_attribute("aria-label") or tab.text or "").lower() -# # return tab.get_attribute("data-tab-index") == "1" or any(w in label for w in REVIEW_WORDS) -# # -# # def click_reviews_tab(self, driver: Chrome): -# # """Click on the reviews tab in Google Maps""" -# # end = time.time() + 15 # Reduced timeout from 30 to 15 seconds -# # while time.time() < end: -# # for tab in driver.find_elements(By.CSS_SELECTOR, -# # '[role="tab"], button[aria-label]'): -# # if self.is_reviews_tab(tab): -# # driver.execute_script("arguments[0].scrollIntoView({block:\"center\"});", tab) -# # try: -# # tab.click() -# # return -# # except Exception: -# # continue -# # time.sleep(.2) # Reduced sleep 
time from 0.4 to 0.2 -# # raise TimeoutException("Reviews tab not found") -# # -# # def set_sort(self, driver: Chrome, method: str): -# # """Set the sorting method for reviews""" -# # if method == "relevance": -# # return # default order -# # if not click_if(driver, SORT_BTN): -# # return -# # -# # wanted = SORT_LABELS[method] -# # -# # for item in driver.find_elements(By.CSS_SELECTOR, MENU_ITEMS): -# # label = item.text.strip() -# # if label in wanted: -# # item.click() -# # time.sleep(0.5) # Reduced wait time from 1.0 to 0.5 -# # return -# # log.warning("⚠️ sort option %s not found – keeping default", method) -# # -# # def scrape(self): -# # """Main scraper method""" -# # start_time = time.time() -# # -# # url = self.config.get("url") -# # headless = self.config.get("headless", True) -# # sort_by = self.config.get("sort_by", "relevance") -# # stop_on_match = self.config.get("stop_on_match", False) -# # -# # log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}") -# # log.info(f"URL: {url}") -# # -# # # Initialize storage -# # # If not overwriting, load existing data -# # if self.overwrite_existing: -# # docs = {} -# # seen = set() -# # else: -# # # Try to get from MongoDB first if enabled -# # docs = {} -# # if self.use_mongodb and self.mongodb: -# # docs = self.mongodb.fetch_existing_reviews() -# # -# # # If backup_to_json is enabled, also load from JSON for merging -# # if self.backup_to_json: -# # json_docs = self.json_storage.load_json_docs() -# # # Merge JSON docs with MongoDB docs -# # for review_id, review in json_docs.items(): -# # if review_id not in docs: -# # docs[review_id] = review -# # -# # # Load seen IDs from file -# # seen = self.json_storage.load_seen() -# # -# # driver = self.setup_driver(headless) -# # wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout -# # -# # try: -# # driver.get(url) -# # wait.until(lambda d: "google.com/maps" in d.current_url) -# # -# # self.dismiss_cookies(driver) -# # 
self.click_reviews_tab(driver) -# # self.set_sort(driver, sort_by) -# # -# # pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL))) -# # pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen)) -# # idle = 0 -# # processed_ids = set() # Track processed IDs in current session -# # -# # # Prefetch selector to avoid repeated lookups -# # driver.execute_script("window.scrollablePane = arguments[0];", pane) -# # scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" -# # -# # while True: -# # cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL) -# # fresh_cards: List[WebElement] = [] -# # -# # for c in cards: -# # cid = c.get_attribute("data-review-id") -# # if cid in seen or cid in processed_ids: -# # if stop_on_match: -# # idle = 999 -# # break -# # continue -# # fresh_cards.append(c) -# # -# # for card in fresh_cards: -# # try: -# # raw = RawReview.from_card(card) -# # processed_ids.add(raw.id) # Track this ID to avoid re-processing -# # except Exception: -# # log.warning("⚠️ parse error – storing stub\n%s", -# # traceback.format_exc(limit=1).strip()) -# # raw_id = card.get_attribute("data-review-id") or "" -# # raw = RawReview(id=raw_id, text="", lang="und") -# # processed_ids.add(raw_id) -# # -# # docs[raw.id] = merge_review(docs.get(raw.id), raw) -# # seen.add(raw.id) -# # pbar.update(1) -# # idle = 0 -# # -# # if idle >= 3: -# # break -# # -# # if not fresh_cards: -# # idle += 1 -# # -# # # Use JavaScript for smoother scrolling -# # driver.execute_script(scroll_script) -# # -# # # Dynamic sleep: sleep less when processing many reviews -# # sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0 -# # time.sleep(sleep_time) -# # -# # pbar.close() -# # -# # # Save to MongoDB if enabled -# # if self.use_mongodb and self.mongodb: -# # log.info("Saving reviews to MongoDB...") -# # self.mongodb.save_reviews(docs) -# # -# # # Backup to JSON if enabled -# # if self.backup_to_json: -# # log.info("Backing up to JSON...") 
-# # self.json_storage.save_json_docs(docs) -# # self.json_storage.save_seen(seen) -# # -# # log.info("✅ Finished – total unique reviews: %s", len(docs)) -# # -# # end_time = time.time() -# # elapsed_time = end_time - start_time -# # log.info(f"Execution completed in {elapsed_time:.2f} seconds") -# # -# # finally: -# # driver.quit() -# # if self.mongodb: -# # self.mongodb.close() diff --git a/reverse_engineer_date_formatter.py b/reverse_engineer_date_formatter.py deleted file mode 100644 index f1ffb41..0000000 --- a/reverse_engineer_date_formatter.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python3 -""" -Reverse-engineer Google's date formatting library to understand: -1. What library they use -2. All possible date format patterns -3. Time range boundaries for each pattern -""" -import json -import re -from seleniumbase import Driver -import time - -url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1" - -print("Starting browser...") -driver = Driver(uc=True, headless=False) - -try: - print(f"Loading URL: {url}") - driver.get(url) - time.sleep(8) - - # Script to find date formatting function - find_formatter_script = """ - const results = { - scripts: [], - potential_formatters: [], - date_strings: [] - }; - - // 1. Search all script tags for date-related code - const scriptTags = document.querySelectorAll('script'); - let scriptContent = ''; - - scriptTags.forEach((script, idx) => { - const content = script.textContent || script.innerText; - if (content) { - scriptContent += content + '\\n'; - - // Look for date formatting patterns - if (content.includes('ago') || content.includes('month') || content.includes('year')) { - const snippet = content.substring(0, 500); - results.scripts.push({ - index: idx, - snippet: snippet, - length: content.length - }); - } - } - }); - - // 2. 
Search for common date formatting library signatures - const librarySignatures = [ - 'moment', - 'date-fns', - 'dayjs', - 'luxon', - 'timeago', - 'formatRelative', - 'relativeTime', - 'fromNow' - ]; - - librarySignatures.forEach(sig => { - if (scriptContent.includes(sig)) { - results.potential_formatters.push(sig); - } - }); - - // 3. Try to find the actual formatting function by injecting test dates - // Look for Google's internal date formatter - const googleFormatters = []; - for (let key in window) { - if (typeof window[key] === 'function') { - const funcStr = window[key].toString(); - if (funcStr.includes('ago') && funcStr.includes('month')) { - googleFormatters.push({ - name: key, - signature: funcStr.substring(0, 200) - }); - } - } - } - results.google_formatters = googleFormatters; - - // 4. Extract all "X ago" patterns from the page - const pageText = document.body.innerText; - const agoPatterns = pageText.match(/\\d+\\s+(second|minute|hour|day|week|month|year)s?\\s+ago/gi) || []; - const singlePatterns = pageText.match(/a\\s+(second|minute|hour|day|week|month|year)\\s+ago/gi) || []; - - results.date_strings = [...new Set([...agoPatterns, ...singlePatterns])]; - - return results; - """ - - print("Searching for date formatting code...") - formatter_info = driver.execute_script(find_formatter_script) - - print("\n" + "="*80) - print("FINDINGS:") - print("="*80) - - print(f"\n1. Scripts with date-related code: {len(formatter_info.get('scripts', []))}") - - print(f"\n2. Potential libraries detected: {formatter_info.get('potential_formatters', [])}") - - print(f"\n3. Google formatter functions found: {len(formatter_info.get('google_formatters', []))}") - for gf in formatter_info.get('google_formatters', [])[:3]: - print(f" - {gf['name']}: {gf['signature'][:100]}...") - - print(f"\n4. 
Date patterns found on page:") - date_strings = formatter_info.get('date_strings', []) - for ds in sorted(set(date_strings))[:20]: - print(f" - '{ds}'") - - # Now let's test different timestamps to understand the boundaries - print("\n" + "="*80) - print("TESTING TIME RANGE BOUNDARIES:") - print("="*80) - - # We need to inject JavaScript that can format dates like Google does - # Let's search the actual DOM for the pattern - boundary_test_script = """ - // Collect all unique date strings from reviews - const dateElements = document.querySelectorAll('span.rsqaWe'); - const dateStrings = new Set(); - - dateElements.forEach(elem => { - const text = elem.textContent.trim(); - if (text) { - dateStrings.add(text); - } - }); - - return Array.from(dateStrings).sort(); - """ - - all_date_strings = driver.execute_script(boundary_test_script) - - print(f"\nFound {len(all_date_strings)} unique date formats:") - for ds in all_date_strings[:30]: - print(f" - '{ds}'") - - # Analyze the patterns - print("\n" + "="*80) - print("PATTERN ANALYSIS:") - print("="*80) - - patterns = { - 'seconds': [], - 'minutes': [], - 'hours': [], - 'days': [], - 'weeks': [], - 'months': [], - 'years': [] - } - - for ds in all_date_strings: - ds_lower = ds.lower() - if 'second' in ds_lower: - patterns['seconds'].append(ds) - elif 'minute' in ds_lower: - patterns['minutes'].append(ds) - elif 'hour' in ds_lower: - patterns['hours'].append(ds) - elif 'day' in ds_lower: - patterns['days'].append(ds) - elif 'week' in ds_lower: - patterns['weeks'].append(ds) - elif 'month' in ds_lower: - patterns['months'].append(ds) - elif 'year' in ds_lower: - patterns['years'].append(ds) - - for unit, examples in patterns.items(): - if examples: - print(f"\n{unit.upper()}:") - for ex in examples[:5]: - print(f" - '{ex}'") - - # Save all data - output = { - 'formatter_info': formatter_info, - 'all_date_strings': all_date_strings, - 'pattern_analysis': {k: v for k, v in patterns.items() if v} - } - - with 
open('/tmp/google_date_formatter_analysis.json', 'w') as f: - json.dump(output, f, indent=2) - - print("\n" + "="*80) - print("Full analysis saved to: /tmp/google_date_formatter_analysis.json") - print("="*80) - -finally: - driver.quit() - print("\nBrowser closed") diff --git a/reverse_engineer_date_formatter_v2.py b/reverse_engineer_date_formatter_v2.py deleted file mode 100644 index ba95f8e..0000000 --- a/reverse_engineer_date_formatter_v2.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python3 -""" -Reverse-engineer Google's date formatting patterns by scraping reviews in English -""" -import json -from modules.fast_scraper import fast_scrape_reviews - -url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1" - -print("Scraping reviews in English...") -result = fast_scrape_reviews(url, headless=True) - -reviews = result.get('reviews', []) -print(f"\nExtracted {len(reviews)} reviews") - -if reviews: - # Collect all unique date strings - date_strings = set() - for rev in reviews: - date_text = rev.get('date_text') - if date_text: - date_strings.add(date_text) - - print(f"\nFound {len(date_strings)} unique date formats:") - for ds in sorted(date_strings): - print(f" '{ds}'") - - # Analyze patterns - print("\n" + "="*80) - print("PATTERN ANALYSIS:") - print("="*80) - - patterns = { - 'seconds': [], - 'minutes': [], - 'hours': [], - 'days': [], - 'weeks': [], - 'months': [], - 'years': [] - } - - for ds in date_strings: - ds_lower = ds.lower() - if 'second' in ds_lower: - patterns['seconds'].append(ds) - elif 'minute' in ds_lower: - patterns['minutes'].append(ds) - elif 'hour' in ds_lower: - patterns['hours'].append(ds) - elif 'day' in ds_lower: - patterns['days'].append(ds) - elif 'week' in ds_lower: - patterns['weeks'].append(ds) - elif 'month' in ds_lower: - patterns['months'].append(ds) - elif 'year' in 
ds_lower: - patterns['years'].append(ds) - - for unit, examples in sorted(patterns.items()): - if examples: - print(f"\n{unit.upper()} ({len(examples)} patterns):") - for ex in sorted(examples): - print(f" '{ex}'") - - # Identify the specific patterns - print("\n" + "="*80) - print("GOOGLE MAPS DATE FORMAT PATTERNS (English):") - print("="*80) - - print("\nPattern Structure:") - print("-" * 80) - - single_unit_patterns = [] # "a month ago" - plural_patterns = [] # "3 months ago" - - for ds in sorted(date_strings): - if ds.startswith('a '): - single_unit_patterns.append(ds) - elif ds.split()[0].isdigit(): - plural_patterns.append(ds) - - print(f"\nSingular (a X ago): {len(single_unit_patterns)} patterns") - for p in sorted(single_unit_patterns): - print(f" '{p}'") - - print(f"\nPlural (N Xs ago): {len(plural_patterns)} patterns") - for p in sorted(plural_patterns): - print(f" '{p}'") - - # Determine time ranges - print("\n" + "="*80) - print("TIME RANGE BOUNDARIES:") - print("="*80) - - # Extract numbers from plural patterns - import re - from collections import defaultdict - - unit_values = defaultdict(list) - for ds in date_strings: - match = re.match(r'(\d+)\s+(\w+)\s+ago', ds.lower()) - if match: - number = int(match.group(1)) - unit = match.group(2).rstrip('s') # Remove plural 's' - unit_values[unit].append(number) - - for unit, values in sorted(unit_values.items()): - if values: - print(f"\n{unit.upper()}:") - print(f" Range: {min(values)} - {max(values)}") - print(f" Values found: {sorted(set(values))}") - - # Save analysis - output = { - 'total_reviews': len(reviews), - 'unique_date_formats': len(date_strings), - 'all_date_strings': sorted(list(date_strings)), - 'patterns_by_unit': {k: sorted(v) for k, v in patterns.items() if v}, - 'singular_patterns': sorted(single_unit_patterns), - 'plural_patterns': sorted(plural_patterns), - 'value_ranges': {unit: {'min': min(values), 'max': max(values), 'values': sorted(set(values))} - for unit, values in 
unit_values.items() if values} - } - - with open('/tmp/google_date_patterns_english.json', 'w') as f: - json.dump(output, f, indent=2) - - print("\n" + "="*80) - print("Analysis saved to: /tmp/google_date_patterns_english.json") - print("="*80) - - # Now let's determine the EXACT library/algorithm Google uses - print("\n" + "="*80) - print("REVERSE-ENGINEERING GOOGLE'S ALGORITHM:") - print("="*80) - - print("\nBased on the patterns, Google's relative date formatter:") - print("-" * 80) - - print("\n1. FORMAT STRUCTURE:") - print(" Single unit: 'a {unit} ago'") - print(" Multiple: '{number} {unit}s ago'") - - print("\n2. UNIT SELECTION (hypothesis):") - if 'second' in unit_values: - print(f" - Seconds: Used for 0-59 seconds ago") - if 'minute' in unit_values: - print(f" - Minutes: Used for 1-59 minutes ago") - if 'hour' in unit_values: - print(f" - Hours: Used for 1-23 hours ago") - if 'day' in unit_values: - print(f" - Days: Used for 1-6 days ago") - if 'week' in unit_values: - print(f" - Weeks: Used for 1-3 weeks ago") - if 'month' in unit_values: - print(f" - Months: Used for 1-11 months ago") - if 'year' in unit_values: - print(f" - Years: Used for 1+ years ago") - - print("\n3. BOUNDARY THRESHOLDS (estimated):") - print(" 60 seconds = switch to minutes") - print(" 60 minutes = switch to hours") - print(" 24 hours = switch to days") - print(" 7 days = switch to weeks") - print(" ~30 days (4 weeks) = switch to months") - print(" 12 months = switch to years") - - print("\n4. 
UNCERTAINTY RANGES:") - print(" 'a month ago' = 30-59 days ago (±15 days)") - print(" '2 months ago' = 60-89 days ago (±15 days)") - print(" 'a year ago' = 365-729 days ago (±6 months)") - -else: - print("No reviews extracted!") diff --git a/start.py b/start.py deleted file mode 100644 index 87cc4bf..0000000 --- a/start.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -""" -Google‑Maps review scraper with MongoDB integration -================================================= - -Main entry point for the scraper. -""" - -from modules.cli import parse_arguments -from modules.config import load_config -from modules.scraper import GoogleReviewsScraper - - -def main(): - """Main function to initialize and run the scraper""" - # Parse command line arguments - args = parse_arguments() - - # Load configuration - config = load_config(args.config) - - # Override config with command line arguments if provided - if args.headless: - config["headless"] = True - if args.sort_by is not None: - config["sort_by"] = args.sort_by - if args.stop_on_match: - config["stop_on_match"] = True - if args.url is not None: - config["url"] = args.url - if args.overwrite_existing: - config["overwrite_existing"] = True - if args.use_mongodb is not None: - config["use_mongodb"] = args.use_mongodb - - # Handle arguments for date conversion and image downloading - if args.convert_dates is not None: - config["convert_dates"] = args.convert_dates - if args.download_images is not None: - config["download_images"] = args.download_images - if args.image_dir is not None: - config["image_dir"] = args.image_dir - if args.download_threads is not None: - config["download_threads"] = args.download_threads - - # Handle arguments for local image paths and URL replacement - if args.store_local_paths is not None: - config["store_local_paths"] = args.store_local_paths - if args.replace_urls is not None: - config["replace_urls"] = args.replace_urls - if args.custom_url_base is not None: - 
config["custom_url_base"] = args.custom_url_base - if args.custom_url_profiles is not None: - config["custom_url_profiles"] = args.custom_url_profiles - if args.custom_url_reviews is not None: - config["custom_url_reviews"] = args.custom_url_reviews - if args.preserve_original_urls is not None: - config["preserve_original_urls"] = args.preserve_original_urls - - # Handle custom parameters - if args.custom_params is not None: - if "custom_params" not in config: - config["custom_params"] = {} - # Update config with the provided custom parameters - config["custom_params"].update(args.custom_params) - - # Handle API interception option - if args.enable_api_intercept: - config["enable_api_intercept"] = True - - # Initialize and run scraper - scraper = GoogleReviewsScraper(config) - scraper.scrape() - - -if __name__ == "__main__": - main() diff --git a/start_api_244.py b/start_api_244.py deleted file mode 100644 index cf9c0a4..0000000 --- a/start_api_244.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python3 -""" -API-Only 244 Scraper - Attempt to get ALL 244 reviews via API alone. - -Strategy: -1. More patient scrolling (more scrolls, longer waits) -2. Collect responses more frequently -3. Extra end-of-list collection -4. 
Slower timing near the end to ensure API completes - -Goal: Get all 244 reviews via API without DOM parsing -""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def api_244_scrape(): - """Get all 244 reviews purely via API with aggressive collection.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - print("API-244 SCRAPER - Getting ALL 244 reviews via API...") - print(f"URL: {url[:80]}...") - - start_time = time.time() - api_reviews = {} - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Step 1: Navigate - driver.get(url) - time.sleep(1.5) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.4) - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - for selector in ['.LRkQ2', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(0.4) - break - except: - continue - - # Wait for page stability - time.sleep(1.0) - - # Find pane - pane = 
None - try: - wait = WebDriverWait(driver, 3) - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'))) - except TimeoutException: - try: - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde'))) - except: - print("ERROR: Could not find pane") - return [] - - # Setup API interceptor - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(1.0) # Longer wait to ensure interceptor is ready - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll - driver.execute_script(scroll_script) - time.sleep(1.0) # Wait for first API response - - print("Scrolling with extended collection strategy...") - - # Extended scrolling - MORE scrolls, SLOWER timing - max_scrolls = 50 # More scrolls to ensure we catch everything - idle_scrolls = 0 - max_idle = 15 # Even more patience - last_count = 0 - last_scroll_pos = 0 - scroll_stuck_count = 0 - - for i in range(max_scrolls): - # Scroll - driver.execute_script(scroll_script) - - # Progressive timing - slower and slower - if len(api_reviews) < 50: - time.sleep(0.30) # Start moderate - elif len(api_reviews) < 100: - time.sleep(0.35) - elif len(api_reviews) < 150: - time.sleep(0.40) - elif len(api_reviews) < 200: - time.sleep(0.50) - elif len(api_reviews) < 230: - time.sleep(0.60) # Much slower near end - else: - time.sleep(0.80) # Very slow for final reviews - - # Collect responses - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': 
review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - # Check if we got new reviews - current_count = len(api_reviews) - if current_count == last_count: - idle_scrolls += 1 - else: - idle_scrolls = 0 - if (i + 1) % 10 == 0: - print(f" {current_count} reviews...") - - last_count = current_count - - # Check scroll position - try: - current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane) - if current_scroll == last_scroll_pos: - scroll_stuck_count += 1 - else: - scroll_stuck_count = 0 - last_scroll_pos = current_scroll - except: - pass - - # Stop conditions - but only if we have at least 240 reviews - if idle_scrolls >= max_idle and scroll_stuck_count >= 5 and current_count >= 240: - print(f" Reached end (no new reviews for {idle_scrolls} scrolls)") - break - - # AGGRESSIVE final collection phase - print(f" Aggressive final collection (currently have {len(api_reviews)})...") - - # Do 10 more scrolls with very long waits - for extra in range(10): - driver.execute_script(scroll_script) - time.sleep(1.2) # Very long wait - - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - new_count = 0 - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - new_count += 1 - - if new_count > 0: - print(f" +{new_count} more reviews (total: {len(api_reviews)})") - except: - pass - - # Ultra-final wait and collect - time.sleep(2.0) - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = 
interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - elapsed = time.time() - start_time - all_reviews = list(api_reviews.values()) - - print(f"\n{'='*50}") - print(f"✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)") - print(f"Time: {elapsed:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - - if elapsed > 0: - print(f"Speedup: {155/elapsed:.1f}x faster! 🚀") - - print(f"{'='*50}") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL 244 reviews via API!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews - may need DOM parsing") - else: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - - print() - - # Save - with open('google_reviews_api_244.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_api_244.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = api_244_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_complete.py b/start_complete.py deleted file mode 100644 index 05178b2..0000000 --- a/start_complete.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python3 -""" -Complete Scraper - Gets ALL reviews while staying fast. - -Strategy: -1. 
Scroll until no new reviews for 5 consecutive scrolls -2. Check scroll position to detect end -3. Do extra scrolls at the end to catch stragglers -4. Adaptive timing - faster at start, slower at end - -Target: Get all 244 reviews in ~22-25 seconds -""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def complete_scrape(): - """Get ALL reviews with intelligent scrolling.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - print("COMPLETE SCRAPER - Getting ALL reviews...") - print(f"URL: {url[:80]}...") - - start_time = time.time() - api_reviews = {} - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Step 1: Navigate - driver.get(url) - time.sleep(1.5) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.4) - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - for selector in ['.LRkQ2', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - 
time.sleep(0.4) - break - except: - continue - - # Wait for page stability - time.sleep(1.0) - - # Find pane - pane = None - try: - wait = WebDriverWait(driver, 3) - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'))) - except TimeoutException: - try: - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde'))) - except: - print("ERROR: Could not find pane") - return [] - - # Wait for initial reviews to load - time.sleep(1.5) - - # Setup API interceptor - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(1.0) # Important: wait for interceptor to be ready - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll to get first API response - driver.execute_script(scroll_script) - time.sleep(1.0) # Wait for first API response - - print("Scrolling with intelligent stopping...") - - # Intelligent scrolling - max_scrolls = 60 # Higher limit to ensure we get everything - idle_scrolls = 0 # Count scrolls with no new reviews - max_idle = 12 # More patience - stop after 12 scrolls with no new reviews - last_count = 0 - last_scroll_pos = 0 - scroll_stuck_count = 0 - - for i in range(max_scrolls): - # Scroll - driver.execute_script(scroll_script) - - # Adaptive timing - faster at start, slower near end - if len(api_reviews) < 100: - time.sleep(0.27) # Fast at beginning - elif len(api_reviews) < 200: - time.sleep(0.30) # Medium in middle - elif len(api_reviews) < 235: - time.sleep(0.40) # Slower near end - else: - time.sleep(0.50) # Very slow at the very end to catch stragglers - - # Collect responses - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = 
interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - # Check if we got new reviews - current_count = len(api_reviews) - if current_count == last_count: - idle_scrolls += 1 - else: - idle_scrolls = 0 - if (i + 1) % 10 == 0: - print(f" {current_count} reviews...") - - last_count = current_count - - # Check scroll position to detect if stuck at bottom - try: - current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane) - if current_scroll == last_scroll_pos: - scroll_stuck_count += 1 - else: - scroll_stuck_count = 0 - last_scroll_pos = current_scroll - except: - pass - - # Stop conditions - if idle_scrolls >= max_idle and scroll_stuck_count >= 3: - print(f" Reached end (no new reviews for {idle_scrolls} scrolls)") - break - - # Extra thorough collection at the end - print(f" Final collection sweep (currently have {len(api_reviews)})...") - - # Do a few more scrolls with longer waits - for extra in range(5): - driver.execute_script(scroll_script) - time.sleep(0.8) # Longer wait to ensure API completes - - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - new_count = 0 - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - new_count += 1 - - if new_count > 0: - print(f" +{new_count} more reviews (total: {len(api_reviews)})") 
- except: - pass - - # Final wait and collect - time.sleep(1.0) - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - elapsed = time.time() - start_time - all_reviews = list(api_reviews.values()) - - print(f"\n✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)} (target: 244)") - print(f"Time: {elapsed:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - print(f"Speedup: {155/elapsed:.1f}x faster! 🚀") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL reviews!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - - print() - - # Save - with open('google_reviews_complete.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_complete.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = complete_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_dom_only_fast.py b/start_dom_only_fast.py deleted file mode 100644 index ab806a4..0000000 --- a/start_dom_only_fast.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env python3 -""" -DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction. - -Strategy: -1. 
Scroll to load all reviews -2. Extract ALL data using JavaScript in one shot (no slow Selenium queries) -3. Should be faster and simpler than API + DOM hybrid - -Target: ~20-25 seconds for all 244 reviews with simpler code -""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException - -logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def extract_all_reviews_js(driver): - """Extract ALL reviews using JavaScript - single fast operation.""" - - extract_script = """ - const reviews = []; - const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium'); - - for (let i = 0; i < elements.length; i++) { - const elem = elements[i]; - const review = {}; - - try { - // Author - const authorElem = elem.querySelector('div.d4r55'); - review.author = authorElem ? authorElem.textContent.trim() : null; - - // Rating - const ratingElem = elem.querySelector('span.kvMYJc'); - if (ratingElem) { - const ariaLabel = ratingElem.getAttribute('aria-label'); - if (ariaLabel) { - const match = ariaLabel.match(/\\d+/); - review.rating = match ? parseFloat(match[0]) : null; - } - } - - // Text - const textElem = elem.querySelector('span.wiI7pd'); - review.text = textElem ? textElem.textContent.trim() : null; - - // Date - const dateElem = elem.querySelector('span.rsqaWe'); - review.date_text = dateElem ? dateElem.textContent.trim() : null; - - // Avatar - const avatarElem = elem.querySelector('img.NBa7we'); - review.avatar_url = avatarElem ? 
avatarElem.src : null; - - // Profile URL - const profileElem = elem.querySelector('button.WEBjve'); - review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null; - - if (review.author && review.date_text) { - reviews.push(review); - } - } catch (e) { - // Skip this review - } - } - - return reviews; - """ - - try: - reviews_data = driver.execute_script(extract_script) - - # Add review IDs - reviews = [] - for review_data in reviews_data: - review_id = f"review_{hash(review_data['author'] + review_data['date_text'])}" - review_data['review_id'] = review_id - reviews.append(review_data) - - return reviews - - except Exception as e: - print(f" Error in JavaScript extraction: {e}") - return [] - - -def dom_only_fast_scrape(): - """Ultra-fast DOM-only scraping with JavaScript extraction.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - print("DOM-ONLY FAST SCRAPER - JavaScript extraction...") - print(f"URL: {url[:80]}...") - - start_time = time.time() - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Navigate - driver.get(url) - time.sleep(1.5) # Reduced from 2.0 - - # Handle GDPR consent page (CRITICAL FIX!) 
- if 'consent.google.com' in driver.current_url: - try: - # Click "Accept all" / "Aceptar todo" - consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]') - if not consent_btns: - consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept"]') - if consent_btns: - consent_btns[0].click() - time.sleep(1.5) # Reduced from 2.0 - except: - pass - - # Dismiss cookie banner on Maps page - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.3) # Reduced from 0.4 - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - for selector in ['.LRkQ2', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(0.3) # Reduced from 0.4 - break - except: - continue - - # Wait for page stability - time.sleep(0.8) # Reduced from 1.0 - - # Find pane - pane = None - try: - wait = WebDriverWait(driver, 3) - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'))) - except TimeoutException: - try: - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde'))) - except: - print("ERROR: Could not find pane") - return [] - - # CRITICAL: Wait for initial reviews to load - time.sleep(1.2) # Reduced from 1.5 - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll and VERIFY reviews are loading - driver.execute_script(scroll_script) - 
time.sleep(0.8) # Reduced from 1.0 - - # Check if reviews are actually loading - initial_count = driver.execute_script( - "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;" - ) - - if initial_count < 5: - # Reviews not loaded yet, wait more - print(f" Waiting for reviews to load (found {initial_count})...") - time.sleep(1.5) # Reduced from 2.0 - driver.execute_script(scroll_script) - time.sleep(0.8) - initial_count = driver.execute_script( - "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;" - ) - - print(f"Scrolling to load all reviews (starting with {initial_count})...") - - # Fast scrolling to load all DOM elements - # No hard limit - stops automatically via idle detection - max_scrolls = 999999 - last_count = 0 - idle_count = 0 - last_scroll_pos = 0 - - for i in range(max_scrolls): - # Get current review count - current_count = driver.execute_script( - "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;" - ) - - # Scroll to load more - prev_count = current_count - driver.execute_script(scroll_script) - - # SMART WAIT: Wait until new reviews actually load (instead of fixed delay!) - max_wait = 1.0 # Maximum 1 second - wait_step = 0.05 # Check every 50ms - waited = 0 - - while waited < max_wait: - time.sleep(wait_step) - waited += wait_step - - new_count = driver.execute_script( - "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;" - ) - - # If reviews loaded, continue immediately! 
- if new_count > prev_count: - break - - # If at bottom and no new reviews after 0.3s, we're done - if waited >= 0.3 and new_count == prev_count: - scroll_pos = driver.execute_script("return arguments[0].scrollTop;", pane) - if scroll_pos == last_scroll_pos: - idle_count += 1 - if idle_count >= 3: - print(f" Reached end at {new_count} reviews") - break - last_scroll_pos = scroll_pos - break - - current_count = new_count - - # Progress logging every 10 scrolls - if (i + 1) % 10 == 0: - print(f" {current_count} review elements loaded...") - - # Track for idle detection - if current_count == prev_count: - idle_count += 1 - if idle_count >= 3: - break - else: - idle_count = 0 - - last_count = current_count - - # Shorter final scroll - for _ in range(2): # Reduced from 3 - driver.execute_script(scroll_script) - time.sleep(0.3) # Reduced from 0.4 - - scroll_time = time.time() - start_time - print(f" Scrolling complete in {scroll_time:.2f}s") - - # Extract ALL reviews using JavaScript (fast!) - print("Extracting reviews with JavaScript...") - extract_start = time.time() - - all_reviews = extract_all_reviews_js(driver) - - extract_time = time.time() - extract_start - print(f" Extraction complete in {extract_time:.2f}s") - - elapsed = time.time() - start_time - - print(f"\n{'='*50}") - print(f"✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)") - print(f"Time: {elapsed:.2f}s") - print(f" - Scrolling: {scroll_time:.2f}s") - print(f" - Extraction: {extract_time:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - print(f"Speedup: {155/elapsed:.1f}x faster! 
🚀") - print(f"{'='*50}") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL 244 reviews!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - - print() - - # Save - with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_dom_only_fast.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = dom_only_fast_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_fast.py b/start_fast.py deleted file mode 100644 index fa0bcac..0000000 --- a/start_fast.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python3 -""" -Fast API-First Scraper - Optimized version of start.py - -Strategy: -1. Open browser and navigate to reviews (~15 seconds) -2. Scroll rapidly JUST to trigger API calls (~15 seconds) -3. Collect all API responses during scrolling -4. Parse reviews from API responses -5. Skip DOM parsing entirely -6. Exit immediately - -Expected time: ~30-40 seconds for 244 reviews (vs 155 seconds) -Speed improvement: ~4-5x faster! 
-""" -import sys -import yaml -import logging -import time -import json -from pathlib import Path -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -def load_config(): - """Load configuration from config.yaml""" - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def fast_scrape(): - """Fast API-first scraping.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - log.info("="*60) - log.info("FAST API-FIRST SCRAPER") - log.info("="*60) - log.info(f"URL: {url[:80]}...") - log.info(f"Mode: API-first (skip DOM parsing)") - log.info("="*60 + "\n") - - start_time = time.time() - api_reviews = {} - - # Create driver using SeleniumBase UC Mode (like original scraper) - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Step 1: Navigate to reviews - log.info("Step 1: Opening Google Maps...") - driver.get(url) - time.sleep(2) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - log.info("✓ Cookie dialog dismissed") - time.sleep(1) - except: - pass - - # Click reviews tab - comprehensive approach - log.info("Step 2: Opening reviews tab...") - - # Review keywords for multiple languages - review_keywords = [ - 'reviews', 'review', 'reseñas', 'reseña', 'opiniones', 'avis', - 'bewertungen', 'recensioni', 'avaliações', 'ביקורות' - ] - - clicked = False - tab_selectors = [ - '.LRkQ2', # Primary - '.hh2c6', # Alternative - '[data-tab-index="1"]', # Tab index - 
'button[role="tab"]', # Button tabs - 'div[role="tab"]', # Div tabs - ] - - # Try each selector - for selector in tab_selectors: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - try: - # Check if this is the reviews tab - text = (tab.text or '').lower() - aria_label = (tab.get_attribute('aria-label') or '').lower() - - if any(keyword in text or keyword in aria_label for keyword in review_keywords): - log.info(f"Found reviews tab with selector {selector}: '{tab.text}'") - # Scroll into view - driver.execute_script("arguments[0].scrollIntoView({block:'center'});", tab) - time.sleep(0.5) - # Click with JavaScript (most reliable) - driver.execute_script("arguments[0].click();", tab) - time.sleep(1.5) - log.info("✓ Reviews tab clicked") - clicked = True - break - except: - continue - if clicked: - break - except: - continue - - if not clicked: - log.warning("Could not find/click reviews tab - may already be on reviews or page structure changed") - - # CRITICAL: Wait after clicking reviews tab for page to load - log.info("Waiting for reviews page to fully load...") - time.sleep(3) - - # Find reviews pane - log.info("Step 3: Finding reviews pane...") - log.info(f"Current URL: {driver.current_url}") - - pane = None - pane_selectors = [ - 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Primary - 'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Without role="main" - 'div.m6QErb.WNBkOb.XiKgde', # Alternative class combination - 'div[role="main"] div.m6QErb.XiKgde', # Simplified with XiKgde - 'div.m6QErb.DxyBCb.XiKgde', # Another variant - 'div[role="main"] div.m6QErb', # Simplified version - 'div.m6QErb.DxyBCb', # Even more simplified - 'div[role="main"]', # Most generic - ] - - for selector in pane_selectors: - try: - log.info(f"Trying selector: {selector}") - wait = WebDriverWait(driver, 5) - pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector))) - log.info(f"✓ Found reviews pane with: {selector}") - break 
- except TimeoutException: - log.debug(f"Pane not found with selector: {selector}") - continue - - if not pane: - log.error("Could not find reviews pane after all attempts!") - log.error(f"Final URL: {driver.current_url}") - # Save screenshot for debugging - try: - screenshot_path = 'pane_not_found.png' - driver.save_screenshot(screenshot_path) - log.info(f"Screenshot saved to {screenshot_path}") - except: - pass - return [] - - # Wait for initial reviews to load - log.info("Waiting for initial reviews to render...") - time.sleep(3) - - # Check if any review cards are present - try: - cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf') - log.info(f"Found {len(cards)} initial review cards") - except: - log.warning("Could not find initial review cards") - - # Step 4: Setup API interceptor (AFTER finding pane) - log.info("Step 4: Setting up API interception...") - interceptor = GoogleMapsAPIInterceptor(driver) - try: - interceptor.setup_interception() - interceptor.inject_response_interceptor() - log.info("✓ API interceptor ready - capturing network responses") - except Exception as e: - log.warning(f"Failed to setup interceptor: {e}") - import traceback - traceback.print_exc() - time.sleep(2) # Extra wait for interception to be fully active - log.info("") - - # Step 5: Rapid scrolling to trigger API calls - log.info("="*60) - log.info("Step 5: Rapid scrolling to trigger API calls") - log.info("="*60) - - # Setup scroll script (same as original scraper) - try: - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - log.info("✓ Scroll script setup complete") - except Exception as e: - log.warning(f"Error setting up scroll script: {e}") - scroll_script = "window.scrollBy(0, 300);" # Fallback - - # Verify interceptor is active - try: - is_injected = driver.execute_script("return window.__reviewInterceptorInjected === true;") - stats = 
driver.execute_script("return window.__interceptorStats;") - queue_length = driver.execute_script("return window.__interceptedResponses ? window.__interceptedResponses.length : -1;") - log.info(f"Interceptor status: injected={is_injected}, queue={queue_length}, stats={stats}") - except Exception as e: - log.warning(f"Could not check interceptor status: {e}") - - # Trigger initial API call - log.info("Triggering initial API call...") - driver.execute_script(scroll_script) - time.sleep(2) # Wait for first API response - log.info("") - - # We need about 25 API calls for 244 reviews (10 per call) - # Scroll rapidly - no DOM parsing! - target_reviews = 240 - max_scrolls = 30 - - for i in range(max_scrolls): - # Fast scroll - driver.execute_script(scroll_script) - time.sleep(0.3) # Optimal timing - fast but captures all responses - - # Collect API responses - try: - responses = interceptor.get_intercepted_responses() - if i == 5: # Debug on scroll 5 - log.info(f"DEBUG: Got {len(responses)} responses from interceptor") - - # Check browser console - try: - console_logs = driver.get_log('browser') - interceptor_logs = [l for l in console_logs if 'API Interceptor' in l.get('message', '')] - if interceptor_logs: - log.info(f"DEBUG: Interceptor console logs:") - for l in interceptor_logs[-10:]: # Last 10 - log.info(f" {l['message']}") - else: - log.info("DEBUG: No interceptor logs in console") - except Exception as e: - log.warning(f"Could not get console logs: {e}") - - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - if i == 5: # Debug on scroll 5 - log.info(f"DEBUG: Parsed {len(parsed)} reviews from responses") - - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, 
- } - - if parsed: - log.info(f"Scroll {i+1}: +{len(parsed)} reviews | Total: {len(api_reviews)}") - - # Exit early if we have enough - if len(api_reviews) >= target_reviews: - log.info(f"\n✓ Reached target of {target_reviews} reviews!") - break - except Exception as e: - log.error(f"Error collecting API responses: {e}") - import traceback - traceback.print_exc() - - # Quick progress update - if (i + 1) % 5 == 0 and i > 0: - log.info(f"Progress: {i+1}/{max_scrolls} scrolls, {len(api_reviews)} reviews collected") - - elapsed = time.time() - start_time - - # Convert to list - all_reviews = list(api_reviews.values()) - - log.info("\n" + "="*60) - log.info("✅ FAST SCRAPING COMPLETED!") - log.info("="*60) - log.info(f"Total reviews: {len(all_reviews)}") - log.info(f"Scrolls performed: {i+1}") - log.info(f"Time elapsed: {elapsed:.2f} seconds") - if all_reviews: - log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second") - log.info("="*60 + "\n") - - # Save results - output_file = 'google_reviews_fast.json' - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - log.info(f"💾 Saved {len(all_reviews)} reviews to {output_file}") - - # Show sample - if all_reviews: - log.info("\n📝 Sample review:") - sample = all_reviews[0] - log.info(f" Author: {sample['author']}") - log.info(f" Rating: {sample['rating']}★") - log.info(f" Date: {sample['date_text']}") - if sample['text']: - log.info(f" Text: {sample['text'][:80]}...") - - # Stats comparison - log.info("\n" + "="*60) - log.info("SPEED COMPARISON") - log.info("="*60) - log.info(f"Old approach: ~155 seconds for 244 reviews") - log.info(f"Fast approach: ~{elapsed:.0f} seconds for {len(all_reviews)} reviews") - if elapsed > 0: - log.info(f"Improvement: {155/elapsed:.1f}x faster! 
🚀") - log.info("="*60 + "\n") - - return all_reviews - - finally: - # Always close the driver - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = fast_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - log.info("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - log.error(f"Fatal error: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_fastest_stable.py b/start_fastest_stable.py deleted file mode 100644 index af91fe0..0000000 --- a/start_fastest_stable.py +++ /dev/null @@ -1,307 +0,0 @@ -#!/usr/bin/env python3 -""" -FASTEST STABLE Scraper - Best of both worlds. - -Strategy: -1. Ultra-fast API scrolling (proven stable) → 234 reviews in ~19s -2. Instant JavaScript DOM extraction → 10 missing reviews in ~0.5s -3. Total: ~20 seconds for all 244 reviews with 100% stability - -Combines stability of API approach with speed of JavaScript extraction. -""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def extract_missing_reviews_js(driver, max_reviews=25): - """Ultra-fast JavaScript extraction for missing reviews.""" - - extract_script = """ - const reviews = []; - const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium'); - const maxCount = Math.min(arguments[0], elements.length); - - for (let i = 0; i < maxCount; i++) { - const elem = elements[i]; - const review = {}; - - try { - 
const authorElem = elem.querySelector('div.d4r55'); - review.author = authorElem ? authorElem.textContent.trim() : null; - - const ratingElem = elem.querySelector('span.kvMYJc'); - if (ratingElem) { - const ariaLabel = ratingElem.getAttribute('aria-label'); - if (ariaLabel) { - const match = ariaLabel.match(/\\d+/); - review.rating = match ? parseFloat(match[0]) : null; - } - } - - const textElem = elem.querySelector('span.wiI7pd'); - review.text = textElem ? textElem.textContent.trim() : null; - - const dateElem = elem.querySelector('span.rsqaWe'); - review.date_text = dateElem ? dateElem.textContent.trim() : null; - - const avatarElem = elem.querySelector('img.NBa7we'); - review.avatar_url = avatarElem ? avatarElem.src : null; - - const profileElem = elem.querySelector('button.WEBjve'); - review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null; - - if (review.author && review.date_text) { - reviews.push(review); - } - } catch (e) { - // Skip - } - } - return reviews; - """ - - try: - reviews_data = driver.execute_script(extract_script, max_reviews) - - reviews = [] - for review_data in reviews_data: - review_id = f"dom_{hash(review_data['author'] + review_data['date_text'])}" - review_data['review_id'] = review_id - reviews.append(review_data) - - return reviews - except Exception as e: - return [] - - -def fastest_stable_scrape(): - """Get ALL 244 reviews with ultra-fast API + instant JS extraction.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - print("FASTEST STABLE SCRAPER - Ultra-fast API + instant JS...") - print(f"URL: {url[:80]}...") - - start_time = time.time() - api_reviews = {} - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Navigate - driver.get(url) - time.sleep(1.5) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if 
cookie_btns: - cookie_btns[0].click() - time.sleep(0.4) - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - for selector in ['.LRkQ2', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(0.4) - break - except: - continue - - # Wait for stability - time.sleep(1.0) - - # Find pane - pane = None - try: - wait = WebDriverWait(driver, 3) - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'))) - except TimeoutException: - try: - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde'))) - except: - print("ERROR: Could not find pane") - return [] - - # Wait for initial reviews to load (critical for stability) - time.sleep(1.5) - - # Setup API interceptor - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(1.0) # Important: wait for interceptor to be ready - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll to get first API response - driver.execute_script(scroll_script) - time.sleep(1.0) # Wait for first API response - - print("[Phase 1] Ultra-fast API scrolling...") - - # Ultra-fast API scrolling - target_reviews = 240 - max_scrolls = 35 - - for i in range(max_scrolls): - driver.execute_script(scroll_script) - time.sleep(0.27) # Optimal timing - - # API collection - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = 
interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - - if (i + 1) % 10 == 0: - print(f" {len(api_reviews)} reviews...") - - if len(api_reviews) >= target_reviews: - break - except: - pass - - # Final API collection - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - api_time = time.time() - start_time - print(f" ✅ Phase 1: {len(api_reviews)} reviews in {api_time:.2f}s") - - # [Phase 2] Instant JavaScript extraction for missing reviews - missing = 244 - len(api_reviews) - if missing > 0: - print(f"\n[Phase 2] Fast JS extraction for {missing} missing reviews...") - - # Scroll to top (missing reviews likely at top) - driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane) - time.sleep(0.3) - - # Extract with JavaScript - dom_reviews = extract_missing_reviews_js(driver, max_reviews=min(missing + 10, 25)) - - # Build API keys for deduplication - api_keys = set() - for api_review in api_reviews.values(): - key = (api_review.get('author', ''), (api_review.get('date_text', '') or '')[:20]) - api_keys.add(key) - - # Add unique DOM reviews - dom_added = 0 - for dom_review in dom_reviews: - dom_key = (dom_review.get('author', ''), (dom_review.get('date_text', '') or '')[:20]) - if 
dom_key not in api_keys: - api_reviews[dom_review['review_id']] = dom_review - dom_added += 1 - - dom_time = time.time() - start_time - api_time - print(f" ✅ Phase 2: +{dom_added} reviews in {dom_time:.2f}s") - - elapsed = time.time() - start_time - all_reviews = list(api_reviews.values()) - - print(f"\n{'='*50}") - print(f"✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)") - print(f"Time: {elapsed:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - print(f"Speedup: {155/elapsed:.1f}x faster! 🚀") - print(f"{'='*50}") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL 244 reviews!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - - print() - - # Save - with open('google_reviews_fastest_stable.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_fastest_stable.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = fastest_stable_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_hybrid_parallel.py b/start_hybrid_parallel.py deleted file mode 100644 index c9e432c..0000000 --- a/start_hybrid_parallel.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python3 -""" -Hybrid Parallel Scraper - Best of both worlds. - -Strategy: -1. Open browser and get to reviews page (~15s) -2. Scroll quickly to collect ~5-10 continuation tokens (~5s) -3. Make parallel API calls in browser using JavaScript (~2-3s) -4. 
Total: ~22-25 seconds for 244 reviews - -This approach: -- Uses browser's active session (no auth issues) -- Collects tokens sequentially (required by API) -- Makes parallel calls for remaining pages (fast!) -""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def hybrid_parallel_scrape(): - """Hybrid approach: Sequential token collection + Parallel fetch.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - log.info("="*60) - log.info("HYBRID PARALLEL SCRAPER") - log.info("="*60) - log.info(f"URL: {url[:80]}...") - log.info(f"Mode: Sequential tokens + Parallel fetch") - log.info("="*60 + "\n") - - start_time = time.time() - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # PHASE 1: Setup (~15s) - log.info("Phase 1: Browser setup...") - driver.get(url) - time.sleep(2) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(1) - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas'] - for selector in ['.LRkQ2', '.hh2c6', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in 
review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(2) - break - except: - continue - - time.sleep(3) - - # Find pane - pane = None - for selector in ['div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', - 'div.m6QErb.WNBkOb.XiKgde']: - try: - wait = WebDriverWait(driver, 5) - pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector))) - break - except: - continue - - if not pane: - log.error("Could not find pane") - return [] - - time.sleep(2) - - # Extract place ID - place_id = None - current_url = driver.current_url - if '!1s' in current_url: - parts = current_url.split('!1s') - if len(parts) > 1: - place_id = parts[1].split('!')[0] - - if not place_id: - log.error("Could not extract place ID") - return [] - - log.info(f"✓ Setup complete (place_id: {place_id})\n") - - # PHASE 2: Collect tokens via scrolling (~5s) - log.info("Phase 2: Collecting continuation tokens...") - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(1) - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Collect tokens by scrolling quickly - tokens = [] - all_reviews = {} - - for i in range(8): # 8 scrolls to get ~8 tokens - driver.execute_script(scroll_script) - time.sleep(0.2) # Very fast scrolling - - # Collect responses - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in all_reviews: - all_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - - # Extract continuation 
token from raw response - for resp in responses: - try: - body = resp.get('body', '') - if body.startswith(")]}'"): - body = body[4:] - data = json.loads(body) - if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str): - token = data[1] - if token and token not in tokens: - tokens.append(token) - except: - pass - - log.info(f"✓ Collected {len(tokens)} continuation tokens") - log.info(f"✓ Got {len(all_reviews)} reviews from scrolling\n") - - # PHASE 3: Parallel fetch remaining pages (~2-3s) - if len(tokens) > 0: - log.info("Phase 3: Parallel fetch of remaining pages...") - - parallel_script = """ - async function fetchPages(placeId, tokens) { - const baseUrl = 'https://www.google.com/maps/rpc/listugcposts'; - const results = []; - - const promises = tokens.map((token, idx) => { - const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`; - const params = new URLSearchParams({ - authuser: '0', - hl: 'es', - gl: 'es', - pb: pb - }); - - return fetch(`${baseUrl}?${params}`) - .then(r => r.text()) - .then(text => { - const body = text.startsWith(")]}'") ? 
text.substring(4) : text; - return {idx, data: JSON.parse(body)}; - }) - .catch(e => null); - }); - - const settled = await Promise.all(promises); - return settled.filter(r => r !== null); - } - - return await fetchPages(arguments[0], arguments[1]); - """ - - try: - parallel_start = time.time() - results = driver.execute_async_script(parallel_script, place_id, tokens[:15]) # Limit to 15 parallel - parallel_time = time.time() - parallel_start - - log.info(f"✓ Parallel fetch completed in {parallel_time:.2f}s") - log.info(f" Received {len(results)} responses") - - # Parse parallel results - for result in results: - if result and 'data' in result: - try: - parsed = interceptor._parse_listugcposts_response(result['data']) - for review in parsed: - if review.review_id and review.review_id not in all_reviews: - all_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except Exception as e: - log.debug(f"Parse error: {e}") - - log.info(f"✓ Total reviews after parallel fetch: {len(all_reviews)}\n") - - except Exception as e: - log.warning(f"Parallel fetch failed: {e}") - - reviews_list = list(all_reviews.values()) - elapsed = time.time() - start_time - - log.info("="*60) - log.info("✅ HYBRID PARALLEL SCRAPING COMPLETED!") - log.info("="*60) - log.info(f"Total reviews: {len(reviews_list)}") - log.info(f"Total time: {elapsed:.2f} seconds") - log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second") - log.info("="*60 + "\n") - - # Save - with open('google_reviews_hybrid.json', 'w', encoding='utf-8') as f: - json.dump(reviews_list, f, indent=2, ensure_ascii=False) - - log.info(f"💾 Saved {len(reviews_list)} reviews to google_reviews_hybrid.json") - - if reviews_list: - log.info("\n📝 Sample:") - s = reviews_list[0] - log.info(f" {s['author']} - {s['rating']}★ - 
{s['date_text']}") - - log.info("\n" + "="*60) - log.info("SPEED COMPARISON") - log.info("="*60) - log.info(f"Old DOM: ~155s for 244 reviews (1.0x)") - log.info(f"Fast scrolling: ~29s for 234 reviews (5.3x)") - log.info(f"Hybrid parallel: ~{elapsed:.0f}s for {len(reviews_list)} reviews ({155/elapsed:.1f}x)! 🚀") - log.info("="*60 + "\n") - - return reviews_list - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = hybrid_parallel_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - log.info("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - log.error(f"Fatal error: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_optimized_hybrid.py b/start_optimized_hybrid.py deleted file mode 100644 index 529c583..0000000 --- a/start_optimized_hybrid.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env python3 -""" -OPTIMIZED HYBRID Scraper - True parallel with minimal overhead. - -Strategy: -1. Ultra-fast API scrolling (no DOM parsing during scroll!) -2. Quick DOM count check near end (minimal overhead) -3. If needed, targeted DOM parse at very end for missing reviews -4. Goal: ~22-25s for all 244 reviews - -Key: Keep scroll loop FAST, only parse DOM if absolutely needed at the very end. 
-""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def quick_dom_parse_top_reviews(driver, count=15): - """Quick parse of just the top N reviews from DOM.""" - dom_reviews = [] - - try: - # Get only first N review elements (the ones most likely to be missing from API) - review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:count] - - for elem in review_elements: - try: - review_data = {} - - # Author - try: - author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55') - review_data['author'] = author_elem.text - except: - review_data['author'] = None - - # Rating - try: - rating_elem = elem.find_element(By.CSS_SELECTOR, 'span.kvMYJc') - rating_attr = rating_elem.get_attribute('aria-label') - if rating_attr: - rating_parts = rating_attr.split() - if rating_parts: - review_data['rating'] = float(rating_parts[0]) - except: - review_data['rating'] = None - - # Text - try: - text_elem = elem.find_element(By.CSS_SELECTOR, 'span.wiI7pd') - review_data['text'] = text_elem.text - except: - review_data['text'] = None - - # Date - try: - date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe') - review_data['date_text'] = date_elem.text - except: - review_data['date_text'] = None - - # Avatar - try: - avatar_elem = elem.find_element(By.CSS_SELECTOR, 'img.NBa7we') - review_data['avatar_url'] = avatar_elem.get_attribute('src') - except: - review_data['avatar_url'] = 
None - - # Profile URL - try: - profile_elem = elem.find_element(By.CSS_SELECTOR, 'button.WEBjve') - review_data['profile_url'] = profile_elem.get_attribute('data-review-id') - except: - review_data['profile_url'] = None - - # Generate ID - if review_data.get('author'): - review_id = f"dom_{hash(str(review_data.get('author', '')) + str(review_data.get('date_text', '')))}" - review_data['review_id'] = review_id - dom_reviews.append(review_data) - - except: - continue - - except Exception as e: - pass - - return dom_reviews - - -def optimized_hybrid_scrape(): - """Ultra-fast API scrolling + minimal targeted DOM parse.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - print("OPTIMIZED HYBRID SCRAPER - Ultra-fast API + minimal DOM...") - print(f"URL: {url[:80]}...") - - start_time = time.time() - api_reviews = {} - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Navigate - driver.get(url) - time.sleep(1.5) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.4) - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - for selector in ['.LRkQ2', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(0.4) - break - except: - continue - - # Brief wait for reviews page (balance speed vs stability) - time.sleep(1.0) # Reduced from 3s but needed for stability - - # Find pane - use most common selector directly - pane = None - try: - wait = WebDriverWait(driver, 3) # Reduced from 5s - pane = 
wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'))) - except TimeoutException: - try: - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde'))) - except: - print("ERROR: Could not find pane") - return [] - - # Setup API interceptor immediately - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(0.3) # Minimal wait for interceptor - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll - driver.execute_script(scroll_script) - time.sleep(0.3) # Minimal initial trigger wait - - print("Ultra-fast API scrolling...") - - # FAST API-only scrolling (NO DOM parsing overhead!) - max_scrolls = 35 - for i in range(max_scrolls): - driver.execute_script(scroll_script) - time.sleep(0.27) - - # API collection only - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - if (i + 1) % 10 == 0: - print(f" {len(api_reviews)} reviews...") - - # Final API collection - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': 
review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - api_time = time.time() - start_time - print(f" ✅ API complete: {len(api_reviews)} reviews in {api_time:.2f}s") - - # Targeted DOM parse ONLY if we're missing reviews - missing = 244 - len(api_reviews) - if missing > 0: - print(f"\nQuick DOM parse for {missing} missing reviews...") - - # Scroll to top - driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane) - time.sleep(0.5) - - # Quick parse of top reviews (most likely to be missing) - dom_reviews = quick_dom_parse_top_reviews(driver, count=min(missing + 5, 20)) - - # Build API keys - api_keys = set() - for api_review in api_reviews.values(): - key = ( - api_review.get('author', ''), - (api_review.get('date_text', '') or '')[:20] - ) - api_keys.add(key) - - # Add unique DOM reviews - dom_added = 0 - for dom_review in dom_reviews: - dom_key = ( - dom_review.get('author', ''), - (dom_review.get('date_text', '') or '')[:20] - ) - if dom_key not in api_keys and dom_review.get('review_id'): - api_reviews[dom_review['review_id']] = dom_review - dom_added += 1 - - dom_time = time.time() - start_time - api_time - print(f" ✅ DOM complete: +{dom_added} reviews in {dom_time:.2f}s") - - elapsed = time.time() - start_time - all_reviews = list(api_reviews.values()) - - print(f"\n{'='*50}") - print(f"✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)") - print(f"Time: {elapsed:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - print(f"Speedup: {155/elapsed:.1f}x faster! 
🚀") - print(f"{'='*50}") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL 244 reviews!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - - print() - - # Save - with open('google_reviews_optimized_hybrid.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_optimized_hybrid.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = optimized_hybrid_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_parallel.py b/start_parallel.py deleted file mode 100644 index 6d9b6df..0000000 --- a/start_parallel.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/usr/bin/env python3 -""" -Parallel API Scraper - Capture session, then parallel API calls. - -Strategy: -1. Open browser and navigate to reviews (~15 seconds) -2. Capture cookies and place ID from active session (~2 seconds) -3. Make parallel API calls using requests (~5-10 seconds) -4. Close browser immediately - -Expected time: ~20-30 seconds for 244 reviews (vs 155 seconds) -Speed improvement: ~5-7x faster! 
-""" -import sys -import yaml -import logging -import time -import json -from pathlib import Path -from concurrent.futures import ThreadPoolExecutor, as_completed -import requests -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) - - -def load_config(): - """Load configuration from config.yaml""" - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def capture_session(url: str, headless: bool = False): - """ - Capture cookies and place ID from browser session. - Returns (session, place_id, interceptor) - """ - log.info("="*60) - log.info("STEP 1: Capturing session from browser") - log.info("="*60) - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Navigate to place - log.info("Opening Google Maps...") - driver.get(url) - time.sleep(2) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - log.info("✓ Cookie dialog dismissed") - time.sleep(1) - except: - pass - - # Click reviews tab - log.info("Opening reviews tab...") - review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones'] - clicked = False - - for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria_label = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria_label for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(2) - log.info("✓ Reviews tab clicked") - clicked = True - break - if clicked: - break - except: - continue - - # Wait for reviews to load - time.sleep(3) - - # Extract 
place ID from URL - current_url = driver.current_url - place_id = None - if '!1s' in current_url: - parts = current_url.split('!1s') - if len(parts) > 1: - place_id = parts[1].split('!')[0] - log.info(f"✓ Extracted place ID: {place_id}") - - if not place_id: - log.error("Could not extract place ID from URL") - return None, None, None - - # Capture ALL cookies using CDP - log.info("Capturing cookies via CDP...") - cdp_cookies = driver.execute_cdp_cmd('Network.getAllCookies', {}) - browser_cookies = cdp_cookies.get('cookies', []) - log.info(f"✓ Captured {len(browser_cookies)} cookies") - - # Get user agent - user_agent = driver.execute_script("return navigator.userAgent") - - # Create session with cookies - session = requests.Session() - for cookie in browser_cookies: - session.cookies.set( - name=cookie['name'], - value=cookie['value'], - domain=cookie.get('domain', '.google.com'), - path=cookie.get('path', '/') - ) - - # Set headers - session.headers.update({ - 'User-Agent': user_agent, - 'Accept': '*/*', - 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8', - 'Referer': 'https://www.google.com/maps/', - 'Origin': 'https://www.google.com', - }) - - # Create interceptor for parsing - interceptor = GoogleMapsAPIInterceptor(None) - - log.info("✓ Session captured successfully\n") - return session, place_id, interceptor - - finally: - # Close browser immediately - we don't need it anymore! 
- try: - driver.quit() - log.info("✓ Browser closed\n") - except: - pass - - -def fetch_reviews_page(session, place_id, interceptor, continuation_token=None): - """Fetch a single page of reviews via API.""" - if continuation_token: - pb = f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - else: - pb = f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1" - - params = { - 'authuser': '0', - 'hl': 'es', - 'gl': 'es', - 'pb': pb - } - - try: - url = 'https://www.google.com/maps/rpc/listugcposts' - response = session.get(url, params=params, timeout=10) - - if response.status_code != 200: - log.error(f"API error {response.status_code}") - return [], None - - body = response.text - if body.startswith(")]}'"): - body = body[4:].strip() - - data = json.loads(body) - reviews = interceptor._parse_listugcposts_response(data) - - # Get next token - next_token = None - if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str): - next_token = data[1] - - return reviews, next_token - - except Exception as e: - log.error(f"Request failed: {e}") - return [], None - - -def scrape_all_parallel(session, place_id, interceptor, max_workers=5): - """ - Main scraping method with parallel API calls. 
- """ - log.info("="*60) - log.info("STEP 2: Parallel API scraping") - log.info("="*60) - - start_time = time.time() - all_reviews = [] - seen_ids = set() - - # Fetch first page to get continuation token - log.info("Fetching first page...") - reviews, token = fetch_reviews_page(session, place_id, interceptor, None) - for review in reviews: - rid = review.review_id or f"{review.author}_{review.date_text}" - if rid not in seen_ids: - seen_ids.add(rid) - all_reviews.append({ - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - }) - - log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}") - - if not token: - log.info("No continuation token - only one page of reviews") - return all_reviews - - # Collect continuation tokens by fetching a few sequential pages - # (We need to do this sequentially to get the tokens) - tokens = [token] - log.info("Collecting continuation tokens...") - for i in range(4): # Get 5 total tokens - reviews, next_token = fetch_reviews_page(session, place_id, interceptor, token) - if next_token: - tokens.append(next_token) - token = next_token - else: - break - - for review in reviews: - rid = review.review_id or f"{review.author}_{review.date_text}" - if rid not in seen_ids: - seen_ids.add(rid) - all_reviews.append({ - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - }) - - log.info(f"Collected {len(tokens)} tokens, {len(all_reviews)} reviews so far") - log.info(f"Starting parallel fetch with {max_workers} workers...\n") - - # Now fetch remaining pages in parallel - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [] - for token in tokens: - future = executor.submit(fetch_reviews_page, 
def main():
    """Run the parallel API scraper end to end and save the result as JSON.

    Reads ``url`` and ``headless`` from the configuration, captures a browser
    session, fetches reviews through ``scrape_all_parallel`` and writes them to
    ``google_reviews_parallel.json``.

    Returns:
        The list of review dicts. An empty list is returned when no URL is
        configured or the session could not be captured; the script entry
        point maps an empty result to a non-zero exit code.
    """
    # An empty YAML file loads as None; treat it as an empty mapping.
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)

    # Without this guard ``url[:80]`` below raises TypeError on a missing key.
    if not url:
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("PARALLEL API SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info(f"Mode: Parallel API calls (no scrolling)")
    log.info("="*60 + "\n")

    total_start = time.time()

    # Step 1: Capture session from browser
    session, place_id, interceptor = capture_session(url, headless)
    if not session or not place_id:
        log.error("Failed to capture session")
        return []

    # Step 2: Parallel API scraping
    reviews = scrape_all_parallel(session, place_id, interceptor, max_workers=5)

    total_elapsed = time.time() - total_start

    # Save results
    output_file = 'google_reviews_parallel.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info(f"💾 Saved {len(reviews)} reviews to {output_file}")

    # Show sample
    if reviews:
        log.info("\n📝 Sample review:")
        sample = reviews[0]
        log.info(f" Author: {sample['author']}")
        log.info(f" Rating: {sample['rating']}★")
        log.info(f" Date: {sample['date_text']}")
        if sample['text']:
            log.info(f" Text: {sample['text'][:80]}...")

    # Stats comparison (155 s is the baseline quoted for the old DOM scraper).
    speedup = 155 / total_elapsed if total_elapsed > 0 else float('inf')
    log.info("\n" + "="*60)
    log.info("SPEED COMPARISON")
    log.info("="*60)
    log.info(f"Old DOM scraping: ~155 seconds for 244 reviews")
    log.info(f"Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
    log.info(f"Parallel API calls: ~{total_elapsed:.0f} seconds for {len(reviews)} reviews ({speedup:.1f}x faster!) 🚀")
    log.info("="*60 + "\n")

    return reviews
def _dom_text(elem, selector):
    """Return the text of the first descendant matching *selector*, or None."""
    try:
        return elem.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        # Missing or stale nodes are expected; the field is simply absent.
        return None


def _dom_attr(elem, selector, attribute):
    """Return *attribute* of the first descendant matching *selector*, or None."""
    try:
        return elem.find_element(By.CSS_SELECTOR, selector).get_attribute(attribute)
    except Exception:
        return None


def parse_dom_review_element(elem):
    """Parse a single review element from the DOM into a plain dict.

    Args:
        elem: Review container element exposing ``find_element`` (a Selenium
            ``WebElement`` in this script).

    Returns:
        A dict with the keys ``author``, ``rating``, ``text``, ``date_text``,
        ``avatar_url``, ``profile_url`` and ``review_id`` -- every key is
        always present, with ``None`` for fields that could not be read -- or
        ``None`` when no author could be extracted (including when the element
        has gone stale).
    """
    # Function-scope import so this block does not depend on the file's import list.
    import hashlib

    author = _dom_text(elem, 'div.d4r55')
    if not author:
        return None

    # The rating is the leading number of the aria-label.  Previously the key
    # was left unset when the label was missing or empty, which broke callers
    # that index ``review['rating']`` directly.
    rating = None
    label_parts = (_dom_attr(elem, 'span.kvMYJc', 'aria-label') or '').split()
    if label_parts:
        try:
            rating = float(label_parts[0])
        except ValueError:
            rating = None

    date_text = _dom_text(elem, 'span.rsqaWe')

    review_data = {
        'author': author,
        'rating': rating,
        'text': _dom_text(elem, 'span.wiI7pd'),
        'date_text': date_text,
        'avatar_url': _dom_attr(elem, 'img.NBa7we', 'src'),
        # NOTE(review): this stores the button's ``data-review-id`` attribute
        # under ``profile_url`` -- kept as-is, confirm the intended field.
        'profile_url': _dom_attr(elem, 'button.WEBjve', 'data-review-id'),
    }

    # Built-in hash() of a str is salted per interpreter process, so it cannot
    # produce IDs that stay the same between runs; use a content digest.
    digest = hashlib.sha1(f"{author}{date_text}{rating}".encode('utf-8')).hexdigest()[:16]
    review_data['review_id'] = f"dom_{digest}"
    return review_data
Could not find pane") - return [] - - # Wait for reviews to start loading - time.sleep(1.5) - - # Setup API interceptor - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(1.0) # Important: wait for interceptor to be ready - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll to get first API response - driver.execute_script(scroll_script) - time.sleep(1.0) # Wait for first API response - - print("Parallel collection (API + DOM simultaneously)...") - - # Scrolling with PARALLEL API + DOM collection - max_scrolls = 35 - dom_parse_start = 25 # Only start DOM parsing after 25 scrolls (when near end) - - for i in range(max_scrolls): - # Scroll - driver.execute_script(scroll_script) - time.sleep(0.27) # Optimal scroll timing - - # PARALLEL COLLECTION 1: API Responses (always) - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - # PARALLEL COLLECTION 2: DOM Elements (only near the end, lightweight) - # Only parse DOM in the last scrolls when we know we're near 234 API reviews - if i >= dom_parse_start and len(api_reviews) >= 220: - try: - # Lightweight: Just get author + date as unique key, don't parse everything - review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium') - for elem in review_elements[:min(len(review_elements), 250)]: # Limit to first 250 for 
speed - try: - # Quick parse - just essentials - author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55') - author = author_elem.text if author_elem else None - - date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe') - date_text = date_elem.text if date_elem else None - - if author and date_text: - dom_key = (author, date_text[:20]) - if dom_key not in dom_reviews: - # Full parse only if needed - dom_review = parse_dom_review_element(elem) - if dom_review: - dom_reviews[dom_key] = dom_review - except: - continue - except: - pass - - # Progress logging - if (i + 1) % 10 == 0: - print(f" API: {len(api_reviews)}, DOM: {len(dom_reviews)} unique keys...") - - # Final collections - print("Final collection sweep...") - - # Final API collection - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - # Final DOM parse (quick sweep) - try: - review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium') - for elem in review_elements[:min(len(review_elements), 250)]: - try: - author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55') - author = author_elem.text if author_elem else None - - date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe') - date_text = date_elem.text if date_elem else None - - if author and date_text: - dom_key = (author, date_text[:20]) - if dom_key not in dom_reviews: - dom_review = parse_dom_review_element(elem) - if dom_review: - dom_reviews[dom_key] = dom_review - except: - continue - except: - pass - - # Merge: Start with API reviews, add DOM reviews that aren't 
duplicates - print("\nMerging API + DOM reviews...") - - # Build set of API keys for deduplication (author + date) - api_keys = set() - for api_review in api_reviews.values(): - key = ( - api_review.get('author', ''), - (api_review.get('date_text', '') or '')[:20] - ) - api_keys.add(key) - - # Add unique DOM reviews - dom_added = 0 - for dom_key, dom_review in dom_reviews.items(): - if dom_key not in api_keys and dom_review.get('review_id'): - api_reviews[dom_review['review_id']] = dom_review - dom_added += 1 - - elapsed = time.time() - start_time - all_reviews = list(api_reviews.values()) - - print(f"\n{'='*50}") - print(f"✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)") - print(f" - API: {len(api_reviews) - dom_added}") - print(f" - DOM: {dom_added} unique") - print(f"Time: {elapsed:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - print(f"Speedup: {155/elapsed:.1f}x faster! 🚀") - print(f"{'='*50}") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL 244 reviews!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - - print() - - # Save - with open('google_reviews_parallel_hybrid.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_parallel_hybrid.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = parallel_hybrid_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_parallel_v2.py b/start_parallel_v2.py deleted file mode 100644 index 714638f..0000000 --- a/start_parallel_v2.py +++ /dev/null @@ -1,319 +0,0 @@ 
# JavaScript run through ``execute_async_script``.  WebDriver appends a
# completion callback as the LAST argument and waits until it is invoked, so
# the script must call it explicitly; a top-level ``return await`` is neither
# valid in the (non-async) wrapper function nor a completion signal.
# Pages are fetched one after another because each continuation token is read
# from the previous response.
_FETCH_REVIEW_PAGES_JS = """
const done = arguments[arguments.length - 1];

async function fetchReviewPages(placeId, numPages) {
    const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
    const results = [];
    let token = null;

    console.log('[Parallel Fetch] Starting fetch for up to', numPages, 'pages');

    for (let i = 0; i < numPages; i++) {
        const tokenPart = token ? `!2s${token}` : '';
        const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10${tokenPart}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;

        const params = new URLSearchParams({
            authuser: '0',
            hl: 'es',
            gl: 'es',
            pb: pb
        });

        try {
            const response = await fetch(`${baseUrl}?${params}`);
            const text = await response.text();
            const body = text.startsWith(")]}'") ? text.substring(4) : text;
            const data = JSON.parse(body);

            results.push({index: i, data: data});

            // A non-empty string at index 1 is the token for the next page.
            if (data && data.length > 1 && typeof data[1] === 'string' && data[1]) {
                token = data[1];
            } else {
                break;  // No more pages
            }
        } catch (e) {
            console.error('[Parallel Fetch] Error fetching page', i, e);
            break;  // Do not hammer the endpoint with the same failing request
        }
    }

    console.log('[Parallel Fetch] Completed! Total responses:', results.length);
    return results;
}

fetchReviewPages(arguments[0], arguments[1])
    .then(done)
    .catch(e => {
        console.error('[Parallel Fetch] Fatal error', e);
        done([]);
    });
"""


def _review_to_dict(review):
    """Convert a parsed review object into the JSON-serialisable dict we store."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def parallel_scrape(max_pages=25):
    """Scrape reviews by calling the reviews RPC from inside the browser.

    Opens the configured URL, dismisses the cookie dialog, opens the reviews
    tab, extracts the place id from the current URL and then lets the page
    itself ``fetch`` up to ``max_pages`` result pages.  The responses are parsed
    with ``GoogleMapsAPIInterceptor`` and saved to
    ``google_reviews_parallel.json``.

    Args:
        max_pages: Upper bound on the number of result pages requested
            (default ``25``, the value previously hard-coded).

    Returns:
        List of review dicts, de-duplicated by ``review_id``.  An empty list
        is returned when the URL is not configured, the reviews pane or place
        id cannot be found, or the in-browser fetch fails.
    """
    # An empty YAML file loads as None; treat it as an empty mapping.
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)

    # Without this guard ``url[:80]`` below raises TypeError on a missing key.
    if not url:
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("PARALLEL API SCRAPER V2")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info(f"Mode: Parallel browser fetch calls")
    log.info("="*60 + "\n")

    start_time = time.time()

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate and setup
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort: the dialog is not always shown)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                log.info("✓ Cookie dialog dismissed")
                time.sleep(1)
        except Exception as e:
            log.debug(f"Cookie dialog not dismissed: {e}")

        # Click reviews tab
        log.info("Step 2: Opening reviews tab...")
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
        clicked = False

        for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    text = (tab.text or '').lower()
                    aria_label = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria_label for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(2)
                        log.info("✓ Reviews tab clicked")
                        clicked = True
                        break
                if clicked:
                    break
            except Exception:
                # Try the next selector; tabs may have been re-rendered.
                continue

        # Wait for reviews to load
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Find reviews pane
        log.info("Step 3: Finding reviews pane...")
        pane = None
        pane_selectors = [
            'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
            'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
            'div.m6QErb.WNBkOb.XiKgde',
        ]

        for selector in pane_selectors:
            try:
                wait = WebDriverWait(driver, 5)
                pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                log.info(f"✓ Found reviews pane with: {selector}")
                break
            except TimeoutException:
                continue

        if not pane:
            log.error("Could not find reviews pane")
            return []

        # Wait for initial reviews
        time.sleep(2)

        # Extract place ID from URL: the segment between '!1s' and the next '!'
        current_url = driver.current_url
        place_id = None
        if '!1s' in current_url:
            parts = current_url.split('!1s')
            if len(parts) > 1:
                place_id = parts[1].split('!')[0]
                log.info(f"✓ Extracted place ID: {place_id}")

        if not place_id:
            log.error("Could not extract place ID from URL")
            return []

        # Step 4: Make API calls using browser's fetch
        # (log wording kept unchanged so existing output stays recognisable)
        log.info("\n" + "="*60)
        log.info("Step 4: Making parallel API calls via browser fetch")
        log.info("="*60)

        log.info(f"Fetching up to {max_pages} pages in parallel...")
        api_start = time.time()

        try:
            # Give the chained requests room: the WebDriver script timeout
            # would otherwise abort a long page sequence.
            driver.set_script_timeout(max(30, max_pages * 5))
            # A null script result is normalised to an empty list.
            results = driver.execute_async_script(
                _FETCH_REVIEW_PAGES_JS, place_id, max_pages) or []
            api_elapsed = time.time() - api_start
            log.info(f"✓ Parallel fetch completed in {api_elapsed:.2f} seconds")
            log.info(f" Received {len(results)} API responses")
        except Exception as e:
            log.error(f"Parallel fetch failed: {e}")
            return []

        # Parse results
        log.info("\nStep 5: Parsing reviews from API responses...")
        interceptor = GoogleMapsAPIInterceptor(None)
        all_reviews = {}

        for result in results:
            if result and 'data' in result:
                try:
                    parsed = interceptor._parse_listugcposts_response(result['data'])
                    for review in parsed:
                        if review.review_id and review.review_id not in all_reviews:
                            all_reviews[review.review_id] = _review_to_dict(review)
                except Exception as e:
                    log.debug(f"Error parsing response: {e}")

        reviews_list = list(all_reviews.values())
        elapsed = time.time() - start_time

        log.info(f"\n{'='*60}")
        log.info(f"✅ PARALLEL SCRAPING COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"API responses: {len(results)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f" - Setup: {api_start - start_time:.2f}s")
        log.info(f" - Parallel API: {api_elapsed:.2f}s")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info(f"{'='*60}\n")

        # Save results
        output_file = 'google_reviews_parallel.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)

        log.info(f"💾 Saved {len(reviews_list)} reviews to {output_file}")

        # Show sample
        if reviews_list:
            log.info("\n📝 Sample review:")
            sample = reviews_list[0]
            log.info(f" Author: {sample['author']}")
            log.info(f" Rating: {sample['rating']}★")
            log.info(f" Date: {sample['date_text']}")
            if sample['text']:
                log.info(f" Text: {sample['text'][:80]}...")

        # Stats comparison
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info(f"Old DOM scraping: ~155 seconds for 244 reviews (1.0x)")
        log.info(f"Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
        log.info(f"Parallel browser fetch: ~{elapsed:.0f} seconds for {len(reviews_list)} reviews ({155/elapsed:.1f}x faster!) 🚀")
        log.info("="*60 + "\n")

        return reviews_list

    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"driver.quit() failed: {e}")
def _collect_api_reviews(interceptor, api_reviews):
    """Drain the interceptor buffer into *api_reviews*; return True if it held responses."""
    responses = interceptor.get_intercepted_responses()
    if not responses:
        return False
    for review in interceptor.parse_reviews_from_responses(responses):
        if review.review_id and review.review_id not in api_reviews:
            api_reviews[review.review_id] = {
                'review_id': review.review_id,
                'author': review.author,
                'rating': review.rating,
                'text': review.text,
                'date_text': review.date_text,
                'avatar_url': review.avatar_url,
                'profile_url': review.profile_url,
            }
    return True


def _dom_value(elem, selector, attribute=None):
    """Return the text (or *attribute*) of the first match of *selector* under *elem*, else None."""
    try:
        node = elem.find_element(By.CSS_SELECTOR, selector)
        return node.get_attribute(attribute) if attribute else node.text
    except Exception:
        # A missing sub-element only blanks that field, not the whole review.
        return None


def _parse_dom_review(elem):
    """Build a review dict (all keys present, None when unreadable) from a DOM review element."""
    rating = None
    label_parts = (_dom_value(elem, 'span.kvMYJc', 'aria-label') or '').split()
    if label_parts:
        try:
            rating = float(label_parts[0])
        except ValueError:
            rating = None
    return {
        'author': _dom_value(elem, 'div.d4r55'),
        'rating': rating,
        'text': _dom_value(elem, 'span.wiI7pd'),
        'date_text': _dom_value(elem, 'span.rsqaWe'),
        'avatar_url': _dom_value(elem, 'img.NBa7we', 'src'),
        'profile_url': _dom_value(elem, 'button.WEBjve', 'data-review-id'),
    }


def _stable_dom_review_id(author, date_text):
    """Return a run-independent ID for a DOM-only review (built-in hash() is salted per process)."""
    import hashlib
    digest = hashlib.sha1(f"{author}{date_text}".encode('utf-8')).hexdigest()[:16]
    return f"dom_{digest}"


def ultra_fast_scrape(expected_total=244, target_reviews=240, max_scrolls=35):
    """Scrape reviews by scrolling the reviews pane and reading intercepted API responses.

    After the scroll phase, a short DOM pass over the top of the list adds
    reviews that the API capture missed.  The result is written to
    ``google_reviews_ultra_fast.json``.

    Args:
        expected_total: Number of reviews the listing is expected to have; the
            DOM fallback runs only while fewer have been collected
            (default ``244``, the value previously hard-coded).
        target_reviews: Stop scrolling once this many API reviews are captured
            (default ``240``).
        max_scrolls: Maximum number of scroll iterations (default ``35``).

    Returns:
        List of review dicts; an empty list when the URL is not configured or
        the reviews pane cannot be found.
    """
    # An empty YAML file loads as None; treat it as an empty mapping.
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)

    # Without this guard ``url[:80]`` below raises TypeError on a missing key.
    if not url:
        print("ERROR: No 'url' configured in config.yaml")
        return []

    print("ULTRA-FAST SCRAPER - Starting...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate (minimal waits)
        driver.get(url)
        time.sleep(1.5)

        # Dismiss cookies (best effort: the dialog is not always shown)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as e:
            log.debug(f"Cookie dialog not dismissed: {e}")

        # Click reviews tab; stop at the first successful click instead of
        # falling through to the next selector and clicking again.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        clicked = True
                        break
            except Exception:
                continue
            if clicked:
                break

        # Brief wait for reviews page (balance speed vs stability)
        time.sleep(1.0)

        # Find pane: most common selector first, then the fallback layout
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Setup API interceptor immediately (no wait for initial reviews)
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(0.3)

        print("Fast scrolling...")

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)

            # Collect after every scroll; per the original note the
            # interceptor buffer is cleared if a scroll is skipped.
            try:
                got_responses = _collect_api_reviews(interceptor, api_reviews)
            except Exception as e:
                log.debug(f"API collection failed on scroll {i + 1}: {e}")
                continue

            if got_responses:
                # Only log every 10 scrolls to reduce I/O
                if (i + 1) % 10 == 0:
                    print(f" {len(api_reviews)} reviews...")

                if len(api_reviews) >= target_reviews:
                    break

        # Final collection
        try:
            _collect_api_reviews(interceptor, api_reviews)
        except Exception as e:
            log.debug(f"Final API collection failed: {e}")

        # Quick DOM parse for missing reviews (only if needed)
        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\nQuick DOM parse for {missing} missing reviews...")
            try:
                # Scroll to top
                driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
                time.sleep(0.3)

                # Parse top reviews (most likely to be missing)
                review_elements = driver.find_elements(
                    By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:min(missing + 5, 20)]

                # (author, first 20 chars of date) identifies a review across API and DOM
                api_keys = {
                    (r.get('author', ''), (r.get('date_text', '') or '')[:20])
                    for r in api_reviews.values()
                }

                dom_added = 0
                for elem in review_elements:
                    review_data = _parse_dom_review(elem)
                    author = review_data['author']
                    if not author:
                        continue
                    dom_key = (author, (review_data['date_text'] or '')[:20])
                    if dom_key in api_keys:
                        continue
                    review_id = _stable_dom_review_id(author, review_data['date_text'])
                    review_data['review_id'] = review_id
                    api_reviews[review_id] = review_data
                    api_keys.add(dom_key)
                    dom_added += 1

                print(f" +{dom_added} reviews from DOM")
            except Exception as e:
                print(f" DOM parse failed: {e}")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print(f"\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀\n")

        # Save
        with open('google_reviews_ultra_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to google_reviews_ultra_fast.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"driver.quit() failed: {e}")
-""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def parse_dom_reviews_fast(driver, max_reviews=20): - """Fast DOM parsing using JavaScript - extracts data in bulk.""" - - # JavaScript to extract review data from first N reviews - extract_script = """ - const reviews = []; - const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium'); - const maxCount = Math.min(arguments[0], elements.length); - - for (let i = 0; i < maxCount; i++) { - const elem = elements[i]; - const review = {}; - - try { - // Author - const authorElem = elem.querySelector('div.d4r55'); - review.author = authorElem ? authorElem.textContent : null; - - // Rating - const ratingElem = elem.querySelector('span.kvMYJc'); - if (ratingElem) { - const ariaLabel = ratingElem.getAttribute('aria-label'); - if (ariaLabel) { - const match = ariaLabel.match(/\\d+/); - review.rating = match ? parseFloat(match[0]) : null; - } - } - - // Text - const textElem = elem.querySelector('span.wiI7pd'); - review.text = textElem ? textElem.textContent : null; - - // Date - const dateElem = elem.querySelector('span.rsqaWe'); - review.date_text = dateElem ? dateElem.textContent : null; - - // Avatar - const avatarElem = elem.querySelector('img.NBa7we'); - review.avatar_url = avatarElem ? avatarElem.src : null; - - // Profile URL - const profileElem = elem.querySelector('button.WEBjve'); - review.profile_url = profileElem ? 
profileElem.getAttribute('data-review-id') : null; - - if (review.author) { - reviews.push(review); - } - } catch (e) { - // Skip this review - } - } - - return reviews; - """ - - try: - # Execute JavaScript to get all review data at once - dom_reviews_data = driver.execute_script(extract_script, max_reviews) - - # Convert to our format - dom_reviews = [] - for review_data in dom_reviews_data: - if review_data.get('author') and review_data.get('date_text'): - review_id = f"dom_{hash(review_data['author'] + review_data['date_text'])}" - review_data['review_id'] = review_id - dom_reviews.append(review_data) - - return dom_reviews - - except Exception as e: - print(f" Error in fast DOM parse: {e}") - return [] - - -def ultra_fast_complete_scrape(): - """Get ALL reviews with ultra-fast API + DOM fallback.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - print("ULTRA-FAST COMPLETE SCRAPER - Getting ALL 244 reviews...") - print(f"URL: {url[:80]}...") - - start_time = time.time() - api_reviews = {} - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # ====== PHASE 1: ULTRA-FAST API SCROLLING ====== - print("\n[Phase 1] Ultra-fast API scrolling...") - - # Step 1: Navigate - driver.get(url) - time.sleep(1.5) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.4) - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - for selector in ['.LRkQ2', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(0.4) - break - 
except: - continue - - # Wait for page stability - time.sleep(1.0) - - # Find pane - pane = None - try: - wait = WebDriverWait(driver, 3) - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'))) - except TimeoutException: - try: - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde'))) - except: - print("ERROR: Could not find pane") - return [] - - # Setup API interceptor - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(0.3) - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll - driver.execute_script(scroll_script) - time.sleep(0.3) - - print(" Fast scrolling for API reviews...") - - # Rapid scrolling - target_reviews = 240 - max_scrolls = 35 - - for i in range(max_scrolls): - driver.execute_script(scroll_script) - time.sleep(0.27) - - # Collect responses - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - - if (i + 1) % 10 == 0: - print(f" {len(api_reviews)} reviews...") - - if len(api_reviews) >= target_reviews: - break - except: - pass - - # Final API collection - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in 
api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - phase1_time = time.time() - start_time - print(f" ✅ Phase 1 complete: {len(api_reviews)} reviews in {phase1_time:.2f}s") - - # ====== PHASE 2: DOM PARSING FOR MISSING REVIEWS ====== - missing_count = 244 - len(api_reviews) - - if missing_count > 0: - print(f"\n[Phase 2] Fast DOM parsing for {missing_count} missing reviews...") - - # Scroll to top (missing reviews likely at top) - driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane) - time.sleep(0.5) # Brief wait for scroll - - # Fast JavaScript-based parsing (only first 20 reviews) - dom_reviews = parse_dom_reviews_fast(driver, max_reviews=min(missing_count + 10, 25)) - - # Add DOM reviews that aren't in API reviews - # Use author + rating + date as key for better duplicate detection - api_keys = set() - for api_review in api_reviews.values(): - key = ( - api_review.get('author', ''), - api_review.get('rating', 0), - (api_review.get('date_text', '') or '')[:20] # First 20 chars of date - ) - api_keys.add(key) - - dom_added = 0 - for dom_review in dom_reviews: - # Create key for this DOM review - dom_key = ( - dom_review.get('author', ''), - dom_review.get('rating', 0), - (dom_review.get('date_text', '') or '')[:20] - ) - - # Only add if not already in API reviews - if dom_key not in api_keys and dom_review.get('review_id'): - api_reviews[dom_review['review_id']] = dom_review - api_keys.add(dom_key) # Track this to avoid duplicates within DOM too - dom_added += 1 - - phase2_time = time.time() - start_time - phase1_time - print(f" ✅ Phase 2 complete: +{dom_added} reviews from DOM in {phase2_time:.2f}s") - - # ====== RESULTS ====== - elapsed = time.time() - start_time - all_reviews = list(api_reviews.values()) - - 
print(f"\n{'='*50}") - print(f"✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)") - print(f"Time: {elapsed:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - print(f"Speedup: {155/elapsed:.1f}x faster! 🚀") - print(f"{'='*50}") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL 244 reviews!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - else: - print(f"⚠️ Missing {244-len(all_reviews)} reviews - may need more DOM parsing") - - print() - - # Save - with open('google_reviews_ultra_fast_complete.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_ultra_fast_complete.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = ultra_fast_complete_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1) diff --git a/start_ultra_fast_v2.py b/start_ultra_fast_v2.py deleted file mode 100644 index 05178b2..0000000 --- a/start_ultra_fast_v2.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python3 -""" -Complete Scraper - Gets ALL reviews while staying fast. - -Strategy: -1. Scroll until no new reviews for 5 consecutive scrolls -2. Check scroll position to detect end -3. Do extra scrolls at the end to catch stragglers -4. 
Adaptive timing - faster at start, slower at end - -Target: Get all 244 reviews in ~22-25 seconds -""" -import sys -import yaml -import logging -import time -import json -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException -from modules.api_interceptor import GoogleMapsAPIInterceptor - -logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s') -log = logging.getLogger(__name__) -log.setLevel(logging.INFO) - - -def load_config(): - with open('config.yaml', 'r') as f: - return yaml.safe_load(f) - - -def complete_scrape(): - """Get ALL reviews with intelligent scrolling.""" - - config = load_config() - url = config.get('url') - headless = config.get('headless', False) - - print("COMPLETE SCRAPER - Getting ALL reviews...") - print(f"URL: {url[:80]}...") - - start_time = time.time() - api_reviews = {} - - driver = Driver(uc=True, headless=headless, page_load_strategy="normal") - - try: - # Step 1: Navigate - driver.get(url) - time.sleep(1.5) - - # Dismiss cookies - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.4) - except: - pass - - # Click reviews tab - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - for selector in ['.LRkQ2', 'button[role="tab"]']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - if any(kw in text or kw in aria for kw in review_keywords): - driver.execute_script("arguments[0].click();", tab) - time.sleep(0.4) - break - except: - continue - - # Wait for page stability - time.sleep(1.0) - - # Find pane - pane = None - try: - wait = WebDriverWait(driver, 
3) - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'))) - except TimeoutException: - try: - pane = wait.until(EC.presence_of_element_located( - (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde'))) - except: - print("ERROR: Could not find pane") - return [] - - # Wait for initial reviews to load - time.sleep(1.5) - - # Setup API interceptor - interceptor = GoogleMapsAPIInterceptor(driver) - interceptor.setup_interception() - interceptor.inject_response_interceptor() - time.sleep(1.0) # Important: wait for interceptor to be ready - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll to get first API response - driver.execute_script(scroll_script) - time.sleep(1.0) # Wait for first API response - - print("Scrolling with intelligent stopping...") - - # Intelligent scrolling - max_scrolls = 60 # Higher limit to ensure we get everything - idle_scrolls = 0 # Count scrolls with no new reviews - max_idle = 12 # More patience - stop after 12 scrolls with no new reviews - last_count = 0 - last_scroll_pos = 0 - scroll_stuck_count = 0 - - for i in range(max_scrolls): - # Scroll - driver.execute_script(scroll_script) - - # Adaptive timing - faster at start, slower near end - if len(api_reviews) < 100: - time.sleep(0.27) # Fast at beginning - elif len(api_reviews) < 200: - time.sleep(0.30) # Medium in middle - elif len(api_reviews) < 235: - time.sleep(0.40) # Slower near end - else: - time.sleep(0.50) # Very slow at the very end to catch stragglers - - # Collect responses - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': 
review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - # Check if we got new reviews - current_count = len(api_reviews) - if current_count == last_count: - idle_scrolls += 1 - else: - idle_scrolls = 0 - if (i + 1) % 10 == 0: - print(f" {current_count} reviews...") - - last_count = current_count - - # Check scroll position to detect if stuck at bottom - try: - current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane) - if current_scroll == last_scroll_pos: - scroll_stuck_count += 1 - else: - scroll_stuck_count = 0 - last_scroll_pos = current_scroll - except: - pass - - # Stop conditions - if idle_scrolls >= max_idle and scroll_stuck_count >= 3: - print(f" Reached end (no new reviews for {idle_scrolls} scrolls)") - break - - # Extra thorough collection at the end - print(f" Final collection sweep (currently have {len(api_reviews)})...") - - # Do a few more scrolls with longer waits - for extra in range(5): - driver.execute_script(scroll_script) - time.sleep(0.8) # Longer wait to ensure API completes - - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = interceptor.parse_reviews_from_responses(responses) - new_count = 0 - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - new_count += 1 - - if new_count > 0: - print(f" +{new_count} more reviews (total: {len(api_reviews)})") - except: - pass - - # Final wait and collect - time.sleep(1.0) - try: - responses = interceptor.get_intercepted_responses() - if responses: - parsed = 
interceptor.parse_reviews_from_responses(responses) - for review in parsed: - if review.review_id and review.review_id not in api_reviews: - api_reviews[review.review_id] = { - 'review_id': review.review_id, - 'author': review.author, - 'rating': review.rating, - 'text': review.text, - 'date_text': review.date_text, - 'avatar_url': review.avatar_url, - 'profile_url': review.profile_url, - } - except: - pass - - elapsed = time.time() - start_time - all_reviews = list(api_reviews.values()) - - print(f"\n✅ COMPLETED!") - print(f"Reviews: {len(all_reviews)} (target: 244)") - print(f"Time: {elapsed:.2f}s") - print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec") - print(f"Speedup: {155/elapsed:.1f}x faster! 🚀") - - if len(all_reviews) >= 244: - print(f"🎯 Got ALL reviews!") - elif len(all_reviews) >= 240: - print(f"⚠️ Missing {244-len(all_reviews)} reviews") - - print() - - # Save - with open('google_reviews_complete.json', 'w', encoding='utf-8') as f: - json.dump(all_reviews, f, indent=2, ensure_ascii=False) - - print(f"💾 Saved to google_reviews_complete.json") - - if all_reviews: - print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★") - - return all_reviews - - finally: - try: - driver.quit() - except: - pass - - -if __name__ == '__main__': - try: - reviews = complete_scrape() - sys.exit(0 if reviews else 1) - except KeyboardInterrupt: - print("\n\nInterrupted by user") - sys.exit(1) - except Exception as e: - print(f"ERROR: {e}") - import traceback - traceback.print_exc() - sys.exit(1)