diff --git a/api_server.py b/api_server.py
deleted file mode 100644
index 630d3b1..0000000
--- a/api_server.py
+++ /dev/null
@@ -1,383 +0,0 @@
-#!/usr/bin/env python3
-"""
-FastAPI server for Google Reviews Scraper.
-Provides REST API endpoints to trigger and manage scraping jobs.
-"""
-
-import logging
-import asyncio
-from contextlib import asynccontextmanager
-from typing import Dict, Any, List, Optional
-
-from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, HttpUrl, Field
-
-from modules.job_manager import JobManager, JobStatus, ScrapingJob
-from modules.chrome_pool import start_worker_pools, stop_worker_pools, get_pool_stats, get_validation_worker, release_validation_worker
-from modules.fast_scraper import check_reviews_available, get_business_card_info
-
-# Configure logging
-logging.basicConfig(
- level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-log = logging.getLogger("api_server")
-
-# Global job manager instance
-job_manager: Optional[JobManager] = None
-
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
- """Lifespan context manager for startup and shutdown"""
- global job_manager
-
- # Startup
- log.info("Starting Google Reviews Scraper API Server")
-
- # Start Chrome worker pools
- log.info("Initializing Chrome worker pools...")
- start_worker_pools(
- validation_size=1, # 1 pre-warmed worker for validation
- scraping_size=2, # 2 pre-warmed workers for scraping
- headless=True
- )
-
- job_manager = JobManager(max_concurrent_jobs=3)
-
- # Start auto-cleanup task
- asyncio.create_task(cleanup_jobs_periodically())
-
- yield
-
- # Shutdown
- log.info("Shutting down Google Reviews Scraper API Server")
-
- if job_manager:
- job_manager.shutdown()
-
- # Stop Chrome worker pools
- log.info("Stopping Chrome worker pools...")
- stop_worker_pools()
-
-
-# Initialize FastAPI app
-app = FastAPI(
- title="Google Reviews Scraper API",
- description="REST API for triggering and managing Google Maps review scraping jobs",
- version="1.0.0",
- lifespan=lifespan
-)
-
-# Add CORS middleware
-app.add_middleware(
- CORSMiddleware,
- allow_origins=["*"], # Configure appropriately for production
- allow_credentials=True,
- allow_methods=["*"],
- allow_headers=["*"],
-)
-
-
-# Pydantic models for API
-class ScrapeRequest(BaseModel):
- """Request model for starting a scrape job"""
- url: HttpUrl = Field(..., description="Google Maps URL to scrape")
- headless: Optional[bool] = Field(None, description="Run Chrome in headless mode (default: True)")
- max_scrolls: Optional[int] = Field(None, description="Maximum scrolls (default: unlimited - stops via idle detection)")
- sort_by: Optional[str] = Field(None, description="Sort order: newest, highest, lowest, relevance")
- stop_on_match: Optional[bool] = Field(None, description="Stop when first already-seen review is encountered")
- overwrite_existing: Optional[bool] = Field(None, description="Overwrite existing reviews instead of appending")
- download_images: Optional[bool] = Field(None, description="Download images from reviews")
- use_s3: Optional[bool] = Field(None, description="Upload images to S3")
- custom_params: Optional[Dict[str, Any]] = Field(None, description="Custom parameters to add to each document")
-
-
-class JobResponse(BaseModel):
- """Response model for job information"""
- job_id: str
- status: JobStatus
- url: str
- created_at: str
- started_at: Optional[str] = None
- completed_at: Optional[str] = None
- updated_at: Optional[str] = None # Last update time for progress tracking
- error_message: Optional[str] = None
- reviews_count: Optional[int] = None
- total_reviews: Optional[int] = None # Total reviews available for this place
- images_count: Optional[int] = None
- progress: Optional[Dict[str, Any]] = None
- scrape_time: Optional[float] = None # Time taken to scrape in seconds
-
-
-class JobStatsResponse(BaseModel):
- """Response model for job statistics"""
- total_jobs: int
- by_status: Dict[str, int]
- running_jobs: int
- max_concurrent_jobs: int
-
-
-class ReviewsResponse(BaseModel):
- """Response model for reviews data"""
- job_id: str
- reviews: List[Dict[str, Any]]
- count: int
-
-
-# Background task for periodic cleanup
-async def cleanup_jobs_periodically():
- """Periodically clean up old jobs"""
- while True:
- await asyncio.sleep(3600) # Run every hour
- if job_manager:
- job_manager.cleanup_old_jobs(max_age_hours=24)
-
-
-# API Endpoints
-
-@app.get("/", summary="API Health Check")
-async def root():
- """Health check endpoint"""
- return {
- "message": "Google Reviews Scraper API is running",
- "status": "healthy",
- "version": "1.0.0"
- }
-
-
-@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
-async def start_scrape(request: ScrapeRequest, background_tasks: BackgroundTasks):
- """
- Start a new scraping job in the background.
-
- Returns the job ID that can be used to check status.
- """
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- # Prepare config overrides
- config_overrides = {}
-
- # Only include non-None values
- for field, value in request.dict().items():
- if value is not None and field != "url":
- config_overrides[field] = value
-
- # Convert URL to string
- url = str(request.url)
-
- try:
- # Create job
- job_id = job_manager.create_job(url, config_overrides)
-
- # Start job immediately if possible
- started = job_manager.start_job(job_id)
-
- log.info(f"Created scraping job {job_id} for URL: {url}")
-
- return {
- "job_id": job_id,
- "status": "started" if started else "queued",
- "message": f"Scraping job {'started' if started else 'queued'} successfully"
- }
-
- except Exception as e:
- log.error(f"Error creating scraping job: {e}")
- raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
-
-
-@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")
-async def get_job(job_id: str):
- """Get detailed information about a specific job"""
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- job = job_manager.get_job(job_id)
- if not job:
- raise HTTPException(status_code=404, detail="Job not found")
-
- return JobResponse(**job.to_dict())
-
-
-@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
-async def get_job_reviews(job_id: str):
- """
- Get the actual reviews data for a completed job.
-
- Returns 404 if job not found or not completed yet.
- """
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- reviews = job_manager.get_job_reviews(job_id)
- if reviews is None:
- job = job_manager.get_job(job_id)
- if not job:
- raise HTTPException(status_code=404, detail="Job not found")
- elif job.status != JobStatus.COMPLETED:
- raise HTTPException(
- status_code=400,
- detail=f"Job not completed yet (current status: {job.status})"
- )
- else:
- raise HTTPException(status_code=404, detail="Reviews data not available")
-
- return ReviewsResponse(
- job_id=job_id,
- reviews=reviews,
- count=len(reviews)
- )
-
-
-@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
-async def list_jobs(
- status: Optional[JobStatus] = Query(None, description="Filter by job status"),
- limit: int = Query(100, description="Maximum number of jobs to return", ge=1, le=1000)
-):
- """List all jobs, optionally filtered by status"""
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- jobs = job_manager.list_jobs(status=status, limit=limit)
- return [JobResponse(**job.to_dict()) for job in jobs]
-
-
-@app.post("/jobs/{job_id}/start", summary="Start Pending Job")
-async def start_job(job_id: str):
- """Start a pending job manually"""
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- started = job_manager.start_job(job_id)
- if not started:
- job = job_manager.get_job(job_id)
- if not job:
- raise HTTPException(status_code=404, detail="Job not found")
-
- if job.status != JobStatus.PENDING:
- raise HTTPException(status_code=400, detail=f"Job is not pending (current status: {job.status})")
-
- raise HTTPException(status_code=429, detail="Maximum concurrent jobs reached")
-
- return {"message": "Job started successfully"}
-
-
-@app.post("/jobs/{job_id}/cancel", summary="Cancel Job")
-async def cancel_job(job_id: str):
- """Cancel a pending or running job"""
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- cancelled = job_manager.cancel_job(job_id)
- if not cancelled:
- job = job_manager.get_job(job_id)
- if not job:
- raise HTTPException(status_code=404, detail="Job not found")
- raise HTTPException(status_code=400, detail="Job cannot be cancelled (already completed, failed, or cancelled)")
-
- return {"message": "Job cancelled successfully"}
-
-
-@app.delete("/jobs/{job_id}", summary="Delete Job")
-async def delete_job(job_id: str):
- """Delete a job from the system"""
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- deleted = job_manager.delete_job(job_id)
- if not deleted:
- raise HTTPException(status_code=404, detail="Job not found")
-
- return {"message": "Job deleted successfully"}
-
-
-@app.get("/stats", response_model=JobStatsResponse, summary="Get Job Statistics")
-async def get_stats():
- """Get job manager statistics"""
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- stats = job_manager.get_stats()
- return JobStatsResponse(**stats)
-
-
-@app.post("/check-reviews", summary="Check if Business Has Reviews")
-async def check_reviews(request: Dict[str, str]):
- """
- Lightweight validation endpoint to check if a business has reviews.
- Uses the Chrome validation pool for fast response.
-
- Returns business name, rating, address, and review count.
- """
- url = request.get("url")
- if not url:
- raise HTTPException(status_code=400, detail="URL is required")
-
- log.info(f"Validating business at: {url}")
-
- # Get a worker from validation pool
- worker = get_validation_worker(timeout=10)
-
- if not worker:
- raise HTTPException(
- status_code=503,
- detail="No validation workers available. Please try again in a few seconds."
- )
-
- try:
- # Use the worker's driver to get business card info (faster than check_reviews_available)
- result = get_business_card_info(
- url=url,
- headless=True,
- driver=worker.driver,
- return_driver=True # Don't close the driver
- )
-
- # Pop the driver from result before returning
- result.pop('driver', None)
-
- log.info(f"Validation result: name={result.get('name')}, rating={result.get('rating')}, reviews={result.get('total_reviews')}")
- return result
-
- except Exception as e:
- log.error(f"Error during validation: {e}")
- # Recycle worker if there was an error
- release_validation_worker(worker, recycle=True)
- raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
-
- finally:
- # Release worker back to pool (unless already recycled)
- if worker and worker.driver:
- release_validation_worker(worker, recycle=False)
-
-
-@app.get("/pool-stats", summary="Get Chrome Pool Statistics")
-async def pool_stats():
- """Get statistics about Chrome worker pools"""
- stats = get_pool_stats()
- return stats
-
-
-@app.post("/cleanup", summary="Manual Job Cleanup")
-async def cleanup_jobs(max_age_hours: int = Query(24, description="Maximum age in hours", ge=1)):
- """Manually trigger cleanup of old completed/failed jobs"""
- if not job_manager:
- raise HTTPException(status_code=500, detail="Job manager not initialized")
-
- job_manager.cleanup_old_jobs(max_age_hours=max_age_hours)
- return {"message": f"Cleaned up jobs older than {max_age_hours} hours"}
-
-
-if __name__ == "__main__":
- import uvicorn
-
- log.info("Starting FastAPI server...")
- uvicorn.run(
- "api_server:app",
- host="0.0.0.0",
- port=8000,
- reload=True,
- log_level="info"
- )
\ No newline at end of file
diff --git a/api_server_production.py b/api_server_production.py
index 6b33da3..dad6f96 100644
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -6,6 +6,7 @@ Production Google Reviews Scraper API Server with Phase 1 features:
- Smart health checks with canary testing
"""
import asyncio
+import json
import logging
import os
from contextlib import asynccontextmanager
@@ -15,12 +16,12 @@ from uuid import UUID
from fastapi import FastAPI, HTTPException, Query, Header
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl, Field
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, StreamingResponse
from modules.database import DatabaseManager, JobStatus
from modules.webhooks import WebhookDispatcher, WebhookManager
from modules.health_checks import HealthCheckSystem
-from modules.scraper_clean import fast_scrape_reviews # Clean scraper with hard refresh recovery
+from modules.scraper_clean import fast_scrape_reviews, LogCapture # Clean scraper with hard refresh recovery
from modules.fast_scraper import check_reviews_available, get_business_card_info # Helper functions
from modules.chrome_pool import (
start_worker_pools,
@@ -48,6 +49,11 @@ health_system: Optional[HealthCheckSystem] = None
MAX_CONCURRENT_JOBS = int(os.getenv('MAX_CONCURRENT_JOBS', '5'))
job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
+# SSE: Store for broadcasting job updates to connected clients
+# Format: {job_id: [asyncio.Queue, ...]} for job-specific streams
+# Format: {"all": [asyncio.Queue, ...]} for all-jobs stream
+job_update_queues: Dict[str, List[asyncio.Queue]] = {"all": []}
+
@asynccontextmanager
async def lifespan(app: FastAPI):
@@ -82,11 +88,12 @@ async def lifespan(app: FastAPI):
# Start Chrome worker pools (1 for validation, 2 for scraping)
# These pre-warm Chrome instances for instant availability
+ # headless=False because Docker uses Xvfb virtual display for better compatibility
await asyncio.to_thread(
start_worker_pools,
validation_size=1,
scraping_size=2,
- headless=True
+ headless=False
)
log.info("Chrome worker pools started (1 validation + 2 scraping)")
@@ -148,6 +155,9 @@ class JobResponse(BaseModel):
scrape_time: Optional[float] = None
error_message: Optional[str] = None
webhook_url: Optional[str] = None
+ # Business metadata
+ business_name: Optional[str] = None
+ business_address: Optional[str] = None
class ReviewsResponse(BaseModel):
@@ -239,12 +249,296 @@ async def get_job(job_id: UUID):
started_at=job['started_at'].isoformat() if job['started_at'] else None,
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
reviews_count=job['reviews_count'],
+ total_reviews=job.get('total_reviews'),
scrape_time=job['scrape_time'],
error_message=job['error_message'],
webhook_url=job.get('webhook_url')
)
+@app.get("/jobs/{job_id}/logs", summary="Get Job Logs")
+async def get_job_logs(job_id: UUID):
+    """
+    Return the stored scraper logs for a job (successful or failed).
+
+    Responds 500 when the database is not initialized and 404 when the
+    job does not exist. Stored logs that are not a JSON list yield [].
+    """
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    job = await db.get_job(job_id)
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    scrape_logs = job.get('scrape_logs')
+
+    # The driver may hand JSONB back as text; decode it before use.
+    if isinstance(scrape_logs, str):
+        try:
+            scrape_logs = json.loads(scrape_logs)
+        except ValueError:  # json.JSONDecodeError subclasses ValueError
+            scrape_logs = None
+    if not isinstance(scrape_logs, list):
+        scrape_logs = []  # keep "logs" a list and len() safe for any payload
+    return {
+        "job_id": str(job_id),
+        "status": job['status'],
+        "error_message": job.get('error_message'),
+        "logs": scrape_logs,
+        "log_count": len(scrape_logs)
+    }
+
+
+# ==================== SSE Streaming Endpoints ====================
+
+async def broadcast_job_update(job_id: str, event_type: str, data: dict):
+    """Fan one SSE frame out to the job's subscribers and the all-jobs stream.
+
+    Best-effort: errors are logged, never raised, so a broadcast problem
+    cannot fail the scraping job; non-JSON values are sent as str().
+    """
+    try:
+        message = f"event: {event_type}\ndata: {json.dumps(data, default=str)}\n\n"
+    except (TypeError, ValueError) as exc:
+        log.warning(f"Dropping SSE event {event_type} for job {job_id}: {exc}")
+        return
+    # Concatenation copies the lists: subscribers may unregister meanwhile.
+    subscribers = job_update_queues.get(job_id, []) + job_update_queues.get("all", [])
+    for queue in subscribers:
+        try:
+            queue.put_nowait(message)  # queues are unbounded: never blocks
+        except asyncio.QueueFull:
+            log.warning(f"SSE subscriber queue full; dropped {event_type} for job {job_id}")
+
+
+@app.get("/jobs/{job_id}/stream", summary="Stream Job Updates (SSE)")
+async def stream_job_updates(job_id: UUID):
+    """
+    Server-Sent Events stream for real-time updates of a single job.
+
+    Events produced by this endpoint:
+    - init: current job state and logs, sent once on connect
+    - update: database polling found new log entries or a new review count
+    - complete: job is completed/failed/cancelled; the stream then closes
+
+    Events relayed from broadcast_job_update() (emitted by the job runner):
+    - job_started, job_progress, job_completed, job_failed
+
+    A ": keepalive" comment is sent whenever 2 seconds pass without a
+    broadcast. All events are named, so use addEventListener (onmessage
+    only fires for unnamed events):
+    ```javascript
+    const es = new EventSource('/jobs/{job_id}/stream');
+    es.addEventListener('update', (e) => console.log(JSON.parse(e.data)));
+    es.addEventListener('complete', () => es.close());
+    ```
+
+    Returns:
+        StreamingResponse with media type text/event-stream.
+
+    Raises:
+        HTTPException: 500 if the database is not initialized, 404 if the
+            job does not exist.
+    """
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    # Verify job exists
+    job = await db.get_job(job_id)
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    job_id_str = str(job_id)
+    # Statuses that end the stream
+    terminal_states = ('completed', 'failed', 'cancelled')
+
+    # Per-client queue that broadcast_job_update() fills
+    queue: asyncio.Queue = asyncio.Queue()
+
+    def parse_logs(raw):
+        """Return scrape_logs as a list, decoding JSON text when needed."""
+        if isinstance(raw, str):
+            try:
+                raw = json.loads(raw)
+            except ValueError:  # json.JSONDecodeError is a ValueError
+                return []
+        return raw or []
+
+    def snapshot(job_data, logs):
+        """Build the full-state payload shared by init and complete events."""
+        return {
+            "job_id": job_id_str,
+            "status": job_data['status'],
+            "reviews_count": job_data.get('reviews_count'),
+            "total_reviews": job_data.get('total_reviews'),
+            "scrape_time": job_data.get('scrape_time'),
+            "error_message": job_data.get('error_message'),
+            "logs": logs
+        }
+
+    async def event_generator():
+        # Register here rather than in the handler so the finally block
+        # always pairs with it, even if the response body is never iterated.
+        job_update_queues.setdefault(job_id_str, []).append(queue)
+        try:
+            # Send initial state
+            job_data = await db.get_job(job_id)
+            if not job_data:
+                return  # job vanished after the existence check above
+
+            logs = parse_logs(job_data.get('scrape_logs'))
+            yield f"event: init\ndata: {json.dumps(snapshot(job_data, logs))}\n\n"
+
+            # If job is already complete, send complete event and close
+            if job_data['status'] in terminal_states:
+                yield f"event: complete\ndata: {json.dumps({'status': job_data['status']})}\n\n"
+                return
+
+            # Change-detection baselines; "or 0" so a missing count equals 0
+            last_log_count = len(logs)
+            last_reviews_count = job_data.get('reviews_count') or 0
+
+            # Keep connection alive and send updates
+            while True:
+                try:
+                    # Wait for update with timeout (for keepalive)
+                    try:
+                        message = await asyncio.wait_for(queue.get(), timeout=2.0)
+                        yield message
+                    except asyncio.TimeoutError:
+                        # Send keepalive comment
+                        yield ": keepalive\n\n"
+
+                    # Also poll database for updates (backup in case broadcast missed)
+                    job_data = await db.get_job(job_id)
+                    if not job_data:
+                        # No row returned this round; keep the stream open and poll again
+                        continue
+
+                    logs = parse_logs(job_data.get('scrape_logs'))
+
+                    # Terminal status: send the final state and end the stream
+                    if job_data['status'] in terminal_states:
+                        yield f"event: complete\ndata: {json.dumps(snapshot(job_data, logs))}\n\n"
+                        return
+
+                    # Check for new logs or progress
+                    current_log_count = len(logs)
+                    current_reviews = job_data.get('reviews_count') or 0
+
+                    if current_log_count > last_log_count or current_reviews != last_reviews_count:
+                        update = {
+                            "job_id": job_id_str,
+                            "status": job_data['status'],
+                            "reviews_count": current_reviews,
+                            "total_reviews": job_data.get('total_reviews'),
+                            "logs": logs
+                        }
+                        yield f"event: update\ndata: {json.dumps(update)}\n\n"
+                        last_log_count = current_log_count
+                        last_reviews_count = current_reviews
+
+                except Exception as e:
+                    log.error(f"Error in SSE stream for job {job_id}: {e}")
+                    break
+
+        finally:
+            # Unregister subscriber and drop the job's entry once it is empty
+            subscribers = job_update_queues.get(job_id_str, [])
+            if queue in subscribers:
+                subscribers.remove(queue)
+            if not subscribers:
+                job_update_queues.pop(job_id_str, None)
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no"  # Disable nginx buffering
+        }
+    )
+
+
+@app.get("/jobs/stream", summary="Stream All Jobs Updates (SSE)")
+async def stream_all_jobs():
+    """
+    Server-Sent Events stream carrying updates for every job.
+
+    Events: "init" (up to 100 jobs, sent once on connect), then whatever
+    broadcast_job_update() relays: job_started, job_progress, job_completed,
+    job_failed. A ": keepalive" comment is sent after 5 idle seconds.
+
+    NOTE(review): if "/jobs/{job_id}" (UUID) is registered earlier, it will
+    likely capture "/jobs/stream" first - confirm this route is reachable.
+
+    Connect with EventSource in the browser:
+    ```javascript
+    const es = new EventSource('/jobs/stream');
+    es.addEventListener('job_progress', (e) => console.log('Update:', JSON.parse(e.data)));
+    ```
+    """
+    if not db:
+        raise HTTPException(status_code=500, detail="Database not initialized")
+
+    # Create queue for this client
+    queue: asyncio.Queue = asyncio.Queue()
+
+    async def event_generator():
+        # Subscribe here so the finally block below always pairs with it
+        job_update_queues.setdefault("all", []).append(queue)
+        try:
+            # Send initial jobs list
+            jobs = await db.list_jobs(limit=100)
+            jobs_data = [
+                {
+                    "job_id": str(j['job_id']),
+                    "status": j['status'],
+                    "url": j['url'],
+                    "created_at": j['created_at'].isoformat(),
+                    "completed_at": j['completed_at'].isoformat() if j.get('completed_at') else None,
+                    "reviews_count": j.get('reviews_count'),
+                    "scrape_time": j.get('scrape_time'),
+                    "error_message": j.get('error_message')
+                }
+                for j in jobs
+            ]
+            yield f"event: init\ndata: {json.dumps({'jobs': jobs_data})}\n\n"
+
+            # Keep connection alive and send updates
+            while True:
+                try:
+                    # Wait for update with timeout (for keepalive)
+                    try:
+                        message = await asyncio.wait_for(queue.get(), timeout=5.0)
+                        yield message
+                    except asyncio.TimeoutError:
+                        # Send keepalive comment
+                        yield ": keepalive\n\n"
+
+                except Exception as e:
+                    log.error(f"Error in all-jobs SSE stream: {e}")
+                    break
+
+        finally:
+            # Unregister subscriber (tolerates an already-removed queue)
+            subscribers = job_update_queues.get("all", [])
+            if queue in subscribers:
+                subscribers.remove(queue)
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no"
+        }
+    )
+
+
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: UUID):
"""
@@ -298,19 +592,34 @@ async def list_jobs(
jobs = await db.list_jobs(status=job_status, limit=limit, offset=offset)
- return [
- JobResponse(
+ result = []
+ for job in jobs:
+ # Extract business info from metadata if available
+ metadata = job.get('metadata')
+ if isinstance(metadata, str):
+ try:
+ metadata = json.loads(metadata)
+ except:
+ metadata = None
+
+ business_name = metadata.get('business_name') if metadata else None
+ business_address = metadata.get('business_address') if metadata else None
+
+ result.append(JobResponse(
job_id=str(job['job_id']),
status=job['status'],
url=job['url'],
created_at=job['created_at'].isoformat(),
completed_at=job['completed_at'].isoformat() if job.get('completed_at') else None,
reviews_count=job.get('reviews_count'),
+ total_reviews=job.get('total_reviews'),
scrape_time=job.get('scrape_time'),
- error_message=job.get('error_message')
- )
- for job in jobs
- ]
+ error_message=job.get('error_message'),
+ business_name=business_name,
+ business_address=business_address
+ ))
+
+ return result
@app.delete("/jobs/{job_id}", summary="Delete Job")
@@ -370,11 +679,11 @@ async def check_reviews(request: ScrapeRequest):
# SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
# Let the actual scraper determine if reviews exist
- has_business = result.get('name') and result.get('rating')
+ has_business = bool(result.get('name') and result.get('rating'))
return {
- "has_reviews": has_business, # Assume true if business exists
- "total_reviews": result['total_reviews'] or 0, # Show 0 if unknown
+ "has_reviews": has_business, # Boolean: true if business exists
+ "total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown
"name": result.get('name'),
"address": result.get('address'),
"rating": result.get('rating'),
@@ -488,6 +797,8 @@ async def run_scraping_job(job_id: UUID):
Args:
job_id: Job UUID
"""
+ job_id_str = str(job_id)
+
async with job_semaphore: # Limit concurrent Chrome instances
try:
# Update status to running
@@ -498,44 +809,79 @@ async def run_scraping_job(job_id: UUID):
job = await db.get_job(job_id)
url = job['url']
+ # Broadcast job started via SSE
+ await broadcast_job_update(job_id_str, "job_started", {
+ "job_id": job_id_str,
+ "status": "running",
+ "url": url
+ })
+
# Get the event loop for progress updates from worker thread
loop = asyncio.get_running_loop()
- # Progress callback to update job status with current/total counts
+ # Create log capture instance that we can access for real-time logs
+ log_capture = LogCapture()
+
+ # Progress callback to update job status with current/total counts AND logs
def progress_callback(current_count: int, total_count: int):
- """Update job progress from worker thread"""
+ """Update job progress and logs from worker thread"""
async def update():
+ # Get current logs from the shared log_capture
+ current_logs = log_capture.get_logs()
await db.update_job_status(
job_id,
JobStatus.RUNNING,
reviews_count=current_count,
- total_reviews=total_count
+ total_reviews=total_count,
+ scrape_logs=current_logs
)
+ # Broadcast progress via SSE
+ await broadcast_job_update(job_id_str, "job_progress", {
+ "job_id": job_id_str,
+ "status": "running",
+ "reviews_count": current_count,
+ "total_reviews": total_count,
+ "logs": current_logs
+ })
+
# Schedule the coroutine on the event loop
asyncio.run_coroutine_threadsafe(update(), loop)
- # Run scraping with progress callback
+ # Run scraping with progress callback and shared log capture
+ # headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread(
fast_scrape_reviews,
url=url,
- headless=True,
- progress_callback=progress_callback
+ headless=False,
+ progress_callback=progress_callback,
+ log_capture=log_capture
)
if result['success']:
- # Save results to database
+ # Save results to database (including scraper logs)
await db.save_job_result(
job_id=job_id,
reviews=result['reviews'],
scrape_time=result['time'],
- total_reviews=result.get('total_reviews')
+ total_reviews=result.get('total_reviews'),
+ scrape_logs=result.get('logs')
)
log.info(
f"Completed job {job_id}: {result['count']} reviews in {result['time']:.1f}s"
)
+ # Broadcast job completed via SSE
+ await broadcast_job_update(job_id_str, "job_completed", {
+ "job_id": job_id_str,
+ "status": "completed",
+ "reviews_count": result['count'],
+ "total_reviews": result.get('total_reviews'),
+ "scrape_time": result['time'],
+ "logs": result.get('logs', [])
+ })
+
# Send webhook if configured
if job.get('webhook_url'):
webhook_manager = WebhookManager()
@@ -553,15 +899,24 @@ async def run_scraping_job(job_id: UUID):
)
else:
- # Job failed
+ # Job failed - save logs for debugging
await db.update_job_status(
job_id,
JobStatus.FAILED,
- error_message=result.get('error', 'Unknown error')
+ error_message=result.get('error', 'Unknown error'),
+ scrape_logs=result.get('logs')
)
log.error(f"Failed job {job_id}: {result.get('error')}")
+ # Broadcast job failed via SSE
+ await broadcast_job_update(job_id_str, "job_failed", {
+ "job_id": job_id_str,
+ "status": "failed",
+ "error_message": result.get('error'),
+ "logs": result.get('logs', [])
+ })
+
# Send failure webhook if configured
if job.get('webhook_url'):
webhook_manager = WebhookManager()
@@ -585,6 +940,14 @@ async def run_scraping_job(job_id: UUID):
error_message=str(e)
)
+ # Broadcast job failed via SSE
+ await broadcast_job_update(job_id_str, "job_failed", {
+ "job_id": job_id_str,
+ "status": "failed",
+ "error_message": str(e),
+ "logs": []
+ })
+
# Send failure webhook
job = await db.get_job(job_id)
if job and job.get('webhook_url'):
diff --git a/brute_force_selector.py b/brute_force_selector.py
deleted file mode 100644
index 21ac024..0000000
--- a/brute_force_selector.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python3
-"""
-Brute force approach: Try every possible div class combination and see which gives us reviews.
-"""
-
-import time
-from seleniumbase import Driver
-
-url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
-
-driver = Driver(uc=True, headless=False)
-
-try:
- driver.get(url)
- time.sleep(5)
-
- # GDPR
- try:
- form_btns = driver.find_elements('css selector', 'form button')
- for btn in form_btns:
- if 'accept all' in (btn.text or '').lower():
- btn.click()
- time.sleep(2)
- break
- except:
- pass
-
- # Click reviews tab
- time.sleep(2)
- tabs = driver.find_elements('css selector', 'button[role="tab"]')
- for tab in tabs:
- if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(5)
- break
-
- # Scroll to load reviews
- try:
- pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
- for _ in range(10):
- driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
- time.sleep(0.3)
- except:
- pass
-
- print("\n" + "="*80)
- print("BRUTE FORCE SELECTOR SEARCH")
- print("="*80)
-
- # Get ALL unique class combinations from divs inside the reviews pane
- candidates = driver.execute_script("""
- // Find the reviews pane
- const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde');
- if (!pane) return {error: 'Pane not found'};
-
- // Get all divs inside the pane
- const allDivs = Array.from(pane.querySelectorAll('div'));
-
- // For each div, check if it looks like a review
- const candidates = [];
-
- for (let div of allDivs) {
- // Skip if no classes
- if (!div.className || div.className.length === 0) continue;
-
- // Check for review indicators
- const hasRating = !!div.querySelector('[aria-label*="star" i]');
- const hasText = div.textContent.length > 50 && div.textContent.length < 1000; // Individual review size
- const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img');
-
- // Calculate score
- let score = 0;
- if (hasRating) score += 3;
- if (hasText) score += 2;
- if (hasAuthor) score += 1;
-
- if (score >= 4) { // Must have rating + text at minimum
- candidates.push({
- classes: div.className,
- selector: 'div.' + div.className.split(' ').filter(c => c).join('.'),
- score: score,
- text_length: div.textContent.length,
- sample_text: div.textContent.substring(0, 100)
- });
- }
- }
-
- // Count how many elements match each selector
- const selectorCounts = {};
- for (let candidate of candidates) {
- const count = pane.querySelectorAll(candidate.selector).length;
- if (!selectorCounts[candidate.selector]) {
- selectorCounts[candidate.selector] = {
- count: count,
- score: candidate.score,
- text_length: candidate.text_length,
- sample: candidate.sample_text
- };
- }
- }
-
- // Sort by count (we want selectors that match many reviews)
- const sorted = Object.entries(selectorCounts)
- .sort((a, b) => b[1].count - a[1].count)
- .slice(0, 10);
-
- return {
- top_selectors: sorted.map(([selector, info]) => ({
- selector: selector,
- count: info.count,
- score: info.score,
- text_length: info.text_length,
- sample: info.sample
- }))
- };
- """)
-
- if 'error' in candidates:
- print(f"ERROR: {candidates['error']}")
- else:
- print(f"\nTop 10 candidate selectors (sorted by count):\n")
- for i, candidate in enumerate(candidates['top_selectors'], 1):
- print(f"{i}. {candidate['selector']}")
- print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}")
- print(f" Sample: {candidate['sample'][:80]}...")
- print()
-
- # Test the top selector
- if candidates['top_selectors']:
- top_selector = candidates['top_selectors'][0]['selector']
- print(f"\n{'='*80}")
- print(f"TESTING TOP SELECTOR: {top_selector}")
- print(f"{'='*80}")
-
- test_result = driver.execute_script(f"""
- const elements = document.querySelectorAll('{top_selector}');
- const reviews = [];
-
- for (let i = 0; i < Math.min(3, elements.length); i++) {{
- const elem = elements[i];
- const review = {{
- has_author: !!elem.querySelector('button, img'),
- has_rating: !!elem.querySelector('[aria-label*="star" i]'),
- has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
- text_length: elem.textContent.length,
- text_sample: elem.textContent.substring(0, 150)
- }};
- reviews.push(review);
- }}
-
- return reviews;
- """)
-
- print(f"\nFirst 3 elements using {top_selector}:")
- for i, rev in enumerate(test_result, 1):
- print(f"\n Element {i}:")
- for key, value in rev.items():
- print(f" {key}: {value}")
-
- print(f"\n{'='*80}")
- print("Browser staying open for 60 seconds...")
- print(f"{'='*80}")
- time.sleep(60)
-
-finally:
- driver.quit()
diff --git a/check_page_structure.py b/check_page_structure.py
deleted file mode 100644
index b85f6fa..0000000
--- a/check_page_structure.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-"""
-Check the actual page structure - maybe reviews are already visible without clicking a tab!
-"""
-
-import time
-from seleniumbase import Driver
-
-url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
-
-driver = Driver(uc=True, headless=False)
-
-try:
- driver.get(url)
- print(f"Initial URL: {url}")
- time.sleep(5)
-
- # GDPR
- try:
- form_btns = driver.find_elements('css selector', 'form button')
- for btn in form_btns:
- if 'accept' in (btn.text or '').lower():
- btn.click()
- time.sleep(2)
- break
- except:
- pass
-
- # Check final URL
- final_url = driver.current_url
- print(f"Final URL after redirect: {final_url}")
-
- # Wait a bit more for dynamic content
- time.sleep(3)
-
- # Check page structure
- print("\n" + "="*80)
- print("PAGE STRUCTURE ANALYSIS")
- print("="*80)
-
- page_info = driver.execute_script("""
- return {
- tabs_found: document.querySelectorAll('button[role="tab"]').length,
- reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length,
- reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length,
- divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length,
- review_containers: document.querySelectorAll('div.fontBodyMedium').length,
- page_text_sample: document.body.innerText.substring(0, 500),
- has_review_text: document.body.innerText.toLowerCase().includes('review'),
- has_atsiliepimai_text: document.body.innerText.toLowerCase().includes('atsiliepimai')
- };
- """)
-
- print(f"\nTabs with role='tab': {page_info['tabs_found']}")
- print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}")
- print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}")
- print(f"Elements with star ratings: {page_info['divs_with_ratings']}")
- print(f"div.fontBodyMedium: {page_info['review_containers']}")
- print(f"Contains 'review': {page_info['has_review_text']}")
- print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}")
-
- print(f"\nPage text sample (first 500 chars):")
- print(page_info['page_text_sample'])
-
- # Try to find ANY element with rating
- print("\n" + "="*80)
- print("SEARCHING FOR RATING ELEMENTS")
- print("="*80)
-
- rating_search = driver.execute_script("""
- const elements = Array.from(document.querySelectorAll('*'));
- const withRatings = [];
-
- for (let elem of elements) {
- const ariaLabel = elem.getAttribute('aria-label') || '';
- if (ariaLabel.toLowerCase().includes('star') || ariaLabel.toLowerCase().includes('žvaigžd')) {
- withRatings.push({
- tag: elem.tagName,
- ariaLabel: ariaLabel.substring(0, 100),
- classes: elem.className.substring(0, 100),
- parentTag: elem.parentElement ? elem.parentElement.tagName : null,
- parentClasses: elem.parentElement ? elem.parentElement.className.substring(0, 100) : null
- });
- }
- }
-
- return withRatings.slice(0, 10); // First 10
- """)
-
- print(f"\nFound {len(rating_search)} elements with 'star' in aria-label:")
- for i, elem in enumerate(rating_search[:5], 1):
- print(f"\n Element {i}:")
- print(f" Tag: {elem['tag']}")
- print(f" Aria-label: {elem['ariaLabel']}")
- print(f" Classes: {elem['classes']}")
- print(f" Parent tag: {elem['parentTag']}")
- print(f" Parent classes: {elem['parentClasses']}")
-
- print(f"\n{'='*80}")
- print("Browser open for manual inspection...")
- print("LOOK AT THE PAGE - Are reviews visible? What's their structure?")
- print(f"{'='*80}")
- time.sleep(180) # 3 minutes
-
-finally:
- driver.quit()
diff --git a/cookie_based_scraper.py b/cookie_based_scraper.py
deleted file mode 100644
index f89e2ca..0000000
--- a/cookie_based_scraper.py
+++ /dev/null
@@ -1,355 +0,0 @@
-#!/usr/bin/env python3
-"""
-Cookie-based API scraper - Capture fresh cookies on each run, then fast API scraping.
-
-Flow:
-1. Start browser (15 seconds)
-2. Capture cookies from active browser session (5 seconds)
-3. Close browser
-4. Use cookies for rapid API pagination (5-10 seconds)
-
-Total time: ~25-35 seconds for 244 reviews (vs 155 seconds with scrolling)
-"""
-import json
-import logging
-import time
-from typing import List, Optional, Tuple
-import requests
-from seleniumbase import SB
-from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-class CookieBasedScraper:
- """Capture cookies each run, then scrape via API."""
-
- def __init__(self, url: str, headless: bool = False):
- self.url = url
- self.headless = headless
- self.session = requests.Session()
- self.place_id = None
- self.interceptor = GoogleMapsAPIInterceptor(None)
-
- def capture_cookies(self) -> bool:
- """
- Capture cookies from a real browser session.
- Returns True if successful.
- """
- log.info("="*60)
- log.info("STEP 1: Capturing cookies from browser session")
- log.info("="*60)
-
- sb = None
- sb_context = None
- try:
- # Create driver - need to enter the context manually
- log.info("Starting browser...")
- sb_context = SB(uc=True, headless=self.headless)
- sb = sb_context.__enter__() # Manually enter context
-
- log.info("Opening Google Maps...")
- sb.open(self.url)
- time.sleep(2)
-
- # Dismiss cookie consent
- try:
- sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
- log.info("✓ Cookie dialog dismissed")
- except:
- pass
-
- # Click reviews tab
- try:
- sb.click('.LRkQ2', timeout=5)
- log.info("✓ Opened reviews tab")
- time.sleep(3) # Wait for reviews to load
- except Exception as e:
- log.warning(f"Could not click reviews tab: {e}")
-
- # Extract place ID from current URL
- current_url = sb.get_current_url()
- if '!1s' in current_url:
- parts = current_url.split('!1s')
- if len(parts) > 1:
- self.place_id = parts[1].split('!')[0]
- log.info(f"✓ Extracted place ID: {self.place_id}")
-
- if not self.place_id:
- log.error("Could not extract place ID")
- return False
-
- # CRITICAL: Scroll once to trigger an API call!
- # This causes Google to set the necessary session cookies
- log.info("Triggering API call by scrolling...")
- sb.execute_script("window.scrollBy(0, 500)")
- time.sleep(2) # Wait for API call to complete
- log.info("✓ API call triggered - session cookies should now be set")
-
- # CAPTURE COOKIES using CDP (gets httpOnly cookies too!)
- log.info("Capturing cookies via CDP...")
- try:
- # Use Chrome DevTools Protocol to get ALL cookies from all domains
- cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
- browser_cookies = cdp_cookies.get('cookies', [])
- log.info(f"✓ Captured {len(browser_cookies)} cookies via CDP")
-
- # Also try getting cookies for specific Google domains
- for domain in ['.google.com', 'www.google.com', '.google.es', 'maps.google.com']:
- try:
- domain_cookies = sb.driver.execute_cdp_cmd('Network.getCookies', {'urls': [f'https://{domain}']})
- extra_cookies = domain_cookies.get('cookies', [])
- if extra_cookies:
- log.info(f" Found {len(extra_cookies)} cookies for {domain}")
- # Add any new cookies we don't have yet
- existing_names = {c['name'] for c in browser_cookies}
- for cookie in extra_cookies:
- if cookie['name'] not in existing_names:
- browser_cookies.append(cookie)
- except:
- pass
-
- log.info(f"✓ Total cookies after checking all domains: {len(browser_cookies)}")
- except Exception as e:
- log.warning(f"CDP cookie capture failed: {e}")
- # Fallback to JavaScript (won't get httpOnly cookies)
- cookie_string = sb.execute_script("return document.cookie")
- browser_cookies = []
- for cookie in cookie_string.split('; '):
- if '=' in cookie:
- name, value = cookie.split('=', 1)
- browser_cookies.append({
- 'name': name,
- 'value': value,
- 'domain': '.google.com',
- 'path': '/'
- })
- log.info(f"✓ Fallback: Captured {len(browser_cookies)} cookies via JS")
-
- # CAPTURE USER AGENT while driver is active
- user_agent = sb.execute_script("return navigator.userAgent")
- log.info(f"✓ Captured user agent")
-
- # Process cookies into session
- for cookie in browser_cookies:
- self.session.cookies.set(
- name=cookie['name'],
- value=cookie['value'],
- domain=cookie.get('domain', '.google.com'),
- path=cookie.get('path', '/')
- )
-
- # Set headers
- self.session.headers.update({
- 'User-Agent': user_agent,
- 'Accept': '*/*',
- 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
- 'Referer': 'https://www.google.com/maps/',
- 'Origin': 'https://www.google.com',
- 'X-Requested-With': 'XMLHttpRequest',
- })
-
- # Print ALL cookie names for debugging
- all_cookie_names = [c['name'] for c in browser_cookies]
- log.info(f"Cookie names: {', '.join(all_cookie_names)}")
-
- # Print important cookies for debugging
- important_cookies = ['SID', 'HSID', 'SSID', 'APISID', 'SAPISID', '__Secure-1PSID', '__Secure-3PSID']
- found_cookies = []
- for cookie_name in important_cookies:
- if cookie_name in self.session.cookies:
- found_cookies.append(cookie_name)
-
- log.info(f"✓ Found auth cookies: {', '.join(found_cookies) if found_cookies else 'NONE - this is the problem!'}")
-
- # Check if we have auth cookies
- if not found_cookies:
- log.warning("\n" + "="*60)
- log.warning("⚠️ NO AUTHENTICATION COOKIES FOUND!")
- log.warning("="*60)
- log.warning("Google Maps API requires you to be logged into Google.")
- log.warning("")
- log.warning("To fix this:")
- log.warning("1. Log into your Google account in Chrome")
- log.warning("2. Visit google.com/maps while logged in")
- log.warning("3. Then run this scraper again")
- log.warning("")
- log.warning("Alternatively, use the hybrid scraper (start.py) which")
- log.warning("handles authentication automatically and already achieves")
- log.warning("95%+ API coverage with 100% parse rate!")
- log.warning("="*60 + "\n")
-
- # Continue anyway to show the error
- log.info("Continuing anyway to demonstrate the API error...")
-
- log.info("\n✅ Cookie capture successful!")
- log.info(f" Total cookies: {len(browser_cookies)}")
- log.info(f" Place ID: {self.place_id}")
- log.info(f" Session ready: Yes\n")
-
- return True
-
- except Exception as e:
- log.error(f"Cookie capture failed: {e}")
- import traceback
- traceback.print_exc()
- return False
-
- finally:
- # IMPORTANT: Close browser properly
- if sb_context:
- try:
- log.info("Closing browser...")
- sb_context.__exit__(None, None, None) # Properly exit context
- log.info("✓ Browser closed\n")
- except Exception as e:
- log.debug(f"Error closing browser: {e}")
-
- def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
- """
- Fetch a page of reviews via API using captured cookies.
- """
- # Build pb parameter
- if continuation_token:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
- else:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
-
- params = {
- 'authuser': '0',
- 'hl': 'es',
- 'gl': 'es',
- 'pb': pb
- }
-
- try:
- url = 'https://www.google.com/maps/rpc/listugcposts'
- response = self.session.get(url, params=params, timeout=10)
-
- if response.status_code != 200:
- log.error(f"API error {response.status_code}")
- log.error(f"Response: {response.text[:500]}")
- log.debug(f"Request URL: {response.url}")
- log.debug(f"Request headers: {dict(self.session.headers)}")
- return [], None
-
- # Parse response
- body = response.text
- if body.startswith(")]}'"):
- body = body[4:].strip()
-
- data = json.loads(body)
- reviews = self.interceptor._parse_listugcposts_response(data)
-
- # Get next token
- next_token = None
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
- next_token = data[1]
-
- return reviews, next_token
-
- except Exception as e:
- log.error(f"API request failed: {e}")
- return [], None
-
- def scrape_all(self, max_pages: int = 100) -> List[dict]:
- """
- Main scraping method with cookie-based session.
- """
- # Step 1: Capture cookies from browser
- if not self.capture_cookies():
- log.error("Failed to capture cookies - aborting")
- return []
-
- # Step 2: Scrape via API
- log.info("="*60)
- log.info("STEP 2: Fast API scraping (no browser needed)")
- log.info("="*60)
-
- start_time = time.time()
- all_reviews = []
- seen_ids = set()
- token = None
- page = 0
-
- while page < max_pages:
- page += 1
-
- log.info(f"Fetching page {page}...")
- reviews, token = self.fetch_reviews_page(token)
-
- if not reviews:
- if page == 1:
- log.error("No reviews on first page - cookies may have expired or be invalid")
- else:
- log.info("No more reviews found")
- break
-
- # Deduplicate
- for review in reviews:
- rid = review.review_id or f"{review.author}_{review.date_text}"
- if rid not in seen_ids:
- seen_ids.add(rid)
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- })
-
- log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")
-
- if not token:
- log.info("No continuation token - all reviews fetched")
- break
-
- # Small delay between requests
- time.sleep(0.2)
-
- elapsed = time.time() - start_time
-
- log.info("\n" + "="*60)
- log.info("✅ SCRAPING COMPLETED!")
- log.info("="*60)
- log.info(f"Total reviews: {len(all_reviews)}")
- log.info(f"API calls: {page}")
- log.info(f"API scraping time: {elapsed:.2f} seconds")
- log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
- log.info("="*60 + "\n")
-
- return all_reviews
-
-
-def main():
- """Example usage."""
- url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
-
- scraper = CookieBasedScraper(url, headless=False)
- reviews = scraper.scrape_all(max_pages=50)
-
- if reviews:
- # Save results
- output_file = 'cookie_based_reviews.json'
- with open(output_file, 'w', encoding='utf-8') as f:
- json.dump(reviews, f, indent=2, ensure_ascii=False)
-
- log.info(f"💾 Saved {len(reviews)} reviews to {output_file}")
-
- # Show sample
- log.info("\nSample review:")
- sample = reviews[0]
- log.info(f" Author: {sample['author']}")
- log.info(f" Rating: {sample['rating']}★")
- log.info(f" Date: {sample['date_text']}")
- if sample['text']:
- log.info(f" Text: {sample['text'][:80]}...")
- else:
- log.error("No reviews scraped!")
-
-
-if __name__ == '__main__':
- main()
diff --git a/direct_api_scraper.py b/direct_api_scraper.py
deleted file mode 100644
index d11005f..0000000
--- a/direct_api_scraper.py
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/usr/bin/env python3
-"""
-Direct API scraper - fetch Google Maps reviews via API without browser scrolling.
-This is 10-25x faster than traditional browser-based scraping.
-"""
-import json
-import logging
-import time
-import urllib.parse
-from typing import List, Optional, Tuple
-import requests
-from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-class DirectAPIScraper:
- """Fetch Google Maps reviews directly via API without browser automation."""
-
- def __init__(self, place_id: str, language: str = 'en', region: str = 'us'):
- """
- Initialize the direct API scraper.
-
- Args:
- place_id: Google Maps place ID (e.g., '0x46dd947294b213bf:0x864c7a232527adb4')
- language: Language code (e.g., 'en', 'es', 'de')
- region: Region/country code (e.g., 'us', 'es', 'de')
- """
- self.place_id = place_id
- self.language = language
- self.region = region
- self.base_url = 'https://www.google.com/maps/rpc/listugcposts'
-
- # Initialize parser (reuse the working parser from api_interceptor)
- self.interceptor = GoogleMapsAPIInterceptor(None)
-
- # Session for maintaining cookies
- self.session = requests.Session()
- self.session.headers.update({
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Accept': '*/*',
- 'Accept-Language': f'{language},{language}-{region.upper()};q=0.9,en;q=0.8',
- 'Referer': 'https://www.google.com/maps/',
- 'X-Requested-With': 'XMLHttpRequest',
- })
-
- def _build_pb_param(self, continuation_token: Optional[str] = None) -> str:
- """
- Build the Protocol Buffer (pb) parameter for the API request.
-
- Args:
- continuation_token: Pagination token from previous response
-
- Returns:
- pb parameter string (NOT URL-encoded - that's done by requests)
- """
- # Base structure with place ID and pagination token
- if continuation_token:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
- else:
- # First request without continuation token
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
-
- return pb
-
- def _establish_session(self):
- """Visit Google Maps page to establish session cookies."""
- try:
- # Visit the main maps page to get cookies
- maps_url = f"https://www.google.com/maps/place/?q=place_id:{self.place_id}"
- log.debug("Establishing session by visiting Google Maps...")
- response = self.session.get(maps_url, timeout=10)
- response.raise_for_status()
- log.debug(f"Session established (cookies: {len(self.session.cookies)})")
- except Exception as e:
- log.warning(f"Failed to establish session: {e}")
-
- def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
- """
- Fetch a single page of reviews from the API.
-
- Args:
- continuation_token: Pagination token from previous response
-
- Returns:
- Tuple of (reviews list, next continuation token or None)
- """
- # Build request parameters
- params = {
- 'authuser': '0',
- 'hl': self.language,
- 'gl': self.region,
- 'pb': self._build_pb_param(continuation_token)
- }
-
- try:
- log.info(f"Fetching reviews page (token: {'initial' if not continuation_token else 'paginated'})...")
-
- response = self.session.get(self.base_url, params=params, timeout=10)
-
- # Log response for debugging
- log.debug(f"Response status: {response.status_code}")
- if response.status_code != 200:
- log.error(f"Response body: {response.text[:500]}")
-
- response.raise_for_status()
-
- # Google returns responses with )]}' prefix - strip it
- body = response.text
- if body.startswith(")]}'"):
- body = body[4:].strip()
-
- log.debug(f"Response size: {len(body)} bytes")
-
- # Parse JSON response
- data = json.loads(body)
-
- # Extract reviews using our working parser
- reviews = self.interceptor._parse_listugcposts_response(data)
-
- # Extract next continuation token
- next_token = None
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
- next_token = data[1]
- log.debug(f"Found continuation token: {next_token[:50]}...")
-
- log.info(f"✓ Extracted {len(reviews)} reviews from this page")
-
- return reviews, next_token
-
- except requests.exceptions.RequestException as e:
- log.error(f"API request failed: {e}")
- return [], None
- except json.JSONDecodeError as e:
- log.error(f"Failed to parse API response: {e}")
- return [], None
- except Exception as e:
- log.error(f"Unexpected error: {e}")
- return [], None
-
- def fetch_all_reviews(self, max_pages: int = 100, delay: float = 0.5) -> List[dict]:
- """
- Fetch all reviews by paginating through the API.
-
- Args:
- max_pages: Maximum number of pages to fetch (safety limit)
- delay: Delay between requests in seconds
-
- Returns:
- List of review dictionaries
- """
- all_reviews = []
- seen_ids = set()
- continuation_token = None
- page = 0
-
- start_time = time.time()
- log.info(f"Starting direct API scraping for place: {self.place_id}")
-
- # Establish session first
- self._establish_session()
-
- while page < max_pages:
- page += 1
-
- # Fetch page
- reviews, continuation_token = self.fetch_reviews_page(continuation_token)
-
- if not reviews:
- log.info("No more reviews found - stopping")
- break
-
- # Deduplicate and add reviews
- for review in reviews:
- review_id = review.review_id or f"{review.author}_{review.date_text}"
- if review_id not in seen_ids:
- seen_ids.add(review_id)
-
- # Convert to dict
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- })
-
- log.info(f"Page {page}: {len(all_reviews)} total unique reviews")
-
- # Check if we have a continuation token
- if not continuation_token:
- log.info("No continuation token - all reviews fetched")
- break
-
- # Rate limiting
- if delay > 0 and page < max_pages:
- time.sleep(delay)
-
- elapsed = time.time() - start_time
- log.info(f"\n{'='*60}")
- log.info(f"✅ Direct API scraping completed!")
- log.info(f"{'='*60}")
- log.info(f"Total reviews: {len(all_reviews)}")
- log.info(f"Pages fetched: {page}")
- log.info(f"Time elapsed: {elapsed:.2f} seconds")
- log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
- log.info(f"{'='*60}\n")
-
- return all_reviews
-
-
-def main():
- """Example usage of the direct API scraper."""
-
- # Soho Club place ID from the test URL
- place_id = '0x46dd947294b213bf:0x864c7a232527adb4'
-
- # Create scraper
- scraper = DirectAPIScraper(
- place_id=place_id,
- language='es',
- region='es'
- )
-
- # Fetch all reviews
- reviews = scraper.fetch_all_reviews(max_pages=50, delay=0.5)
-
- # Save to JSON
- output_file = 'direct_api_reviews.json'
- with open(output_file, 'w', encoding='utf-8') as f:
- json.dump(reviews, f, indent=2, ensure_ascii=False)
-
- log.info(f"Saved {len(reviews)} reviews to {output_file}")
-
- # Show sample
- if reviews:
- log.info("\nSample review:")
- sample = reviews[0]
- log.info(f" Author: {sample['author']}")
- log.info(f" Rating: {sample['rating']}★")
- log.info(f" Date: {sample['date_text']}")
- log.info(f" Text: {sample['text'][:100]}..." if sample['text'] else " Text: (no text)")
-
-
-if __name__ == '__main__':
- main()
diff --git a/dump_api_response.py b/dump_api_response.py
deleted file mode 100644
index 3e21103..0000000
--- a/dump_api_response.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quick script to dump API responses for debugging
-"""
-import json
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-from seleniumbase import SB
-
-url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
-
-with SB(uc=True, headless=False) as sb:
- # Set up interceptor BEFORE loading page
- interceptor = GoogleMapsAPIInterceptor(sb.driver)
-
- sb.open(url)
- sb.sleep(2)
-
- # Inject interceptor early
- interceptor.inject_response_interceptor()
- sb.sleep(2)
-
- # Click reviews tab
- try:
- sb.click('.LRkQ2:contains("Reseñas")', timeout=5)
- except:
- try:
- sb.click('.LRkQ2:contains("Reviews")', timeout=5)
- except:
- pass
-
- print("Waiting for reviews to load...")
- sb.sleep(5)
-
- # Scroll to trigger more requests
- print("Scrolling to load more...")
- for i in range(5):
- sb.execute_script("window.scrollBy(0, 800)")
- sb.sleep(2)
- print(f" Scroll {i+1}/5...")
-
- print("\nCollecting responses...")
-
- # Get responses
- responses = interceptor.get_intercepted_responses()
-
- print(f"\nCaptured {len(responses)} responses")
-
- # Dump to files
- for i, resp in enumerate(responses):
- filename = f"api_response_{i}.json"
- with open(filename, 'w', encoding='utf-8') as f:
- json.dump(resp, f, indent=2, ensure_ascii=False)
- print(f"Saved: {filename} ({len(resp.get('body', ''))} bytes)")
-
- # Also save just the body for easier viewing
- body_file = f"api_response_{i}_body.txt"
- with open(body_file, 'w', encoding='utf-8') as f:
- f.write(resp.get('body', ''))
- print(f"Saved body: {body_file}")
-
- print("\nDone! Check api_response_*.json files")
diff --git a/dump_api_responses.py b/dump_api_responses.py
deleted file mode 100644
index 5f5ba0e..0000000
--- a/dump_api_responses.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-"""
-Dump raw API responses for analysis.
-This will help us understand Google's exact response format.
-"""
-import json
-import logging
-from pathlib import Path
-from seleniumbase import SB
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
-
-url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
-
-output_dir = Path("api_response_samples")
-output_dir.mkdir(exist_ok=True)
-
-print(f"[INFO] Starting browser...")
-with SB(uc=True, headless=False) as sb:
- print("[INFO] Navigating to Google Maps...")
- sb.open(url)
- sb.sleep(3)
-
- # Inject interceptor FIRST
- print("[INFO] Injecting API interceptor...")
- interceptor = GoogleMapsAPIInterceptor(sb.driver)
- interceptor.inject_response_interceptor()
- sb.sleep(2)
-
- # Click reviews tab
- print("[INFO] Looking for reviews tab...")
- try:
- sb.click('.LRkQ2', timeout=5)
- print("[INFO] ✓ Clicked reviews tab")
- except:
- print("[WARN] Could not click reviews tab, trying to continue...")
-
- sb.sleep(5)
-
- # Scroll multiple times to trigger API calls
- print("[INFO] Scrolling to trigger API calls...")
- for i in range(10):
- sb.execute_script("window.scrollBy(0, 800)")
- sb.sleep(1.5)
-
- # Check every few scrolls
- if (i + 1) % 3 == 0:
- responses = interceptor.get_intercepted_responses()
- if responses:
- print(f"[INFO] Captured {len(responses)} responses so far...")
-
- # Final collection
- print("\n[INFO] Collecting all captured responses...")
- all_responses = interceptor.get_intercepted_responses()
-
- if not all_responses:
- print("[ERROR] No responses captured!")
- exit(1)
-
- print(f"[SUCCESS] Captured {len(all_responses)} API responses!\n")
-
- # Dump each response
- for i, resp in enumerate(all_responses):
- url_str = resp.get('url', 'unknown')
- body = resp.get('body', '')
- size = len(body)
-
- # Save full response
- full_file = output_dir / f"response_{i:02d}_full.json"
- with open(full_file, 'w', encoding='utf-8') as f:
- json.dump(resp, f, indent=2, ensure_ascii=False)
-
- # Save just body for easier viewing
- body_file = output_dir / f"response_{i:02d}_body.txt"
- with open(body_file, 'w', encoding='utf-8') as f:
- f.write(body)
-
- # Try to parse as JSON
- if body.startswith(")]}'"):
- clean_body = body[4:].strip()
- else:
- clean_body = body
-
- json_file = output_dir / f"response_{i:02d}_parsed.json"
- try:
- parsed = json.loads(clean_body)
- with open(json_file, 'w', encoding='utf-8') as f:
- json.dump(parsed, f, indent=2, ensure_ascii=False)
- print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes)")
- print(f" Full: {full_file}")
- print(f" Body: {body_file}")
- print(f" Parsed: {json_file}")
- except:
- print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes) [Not JSON]")
- print(f" Full: {full_file}")
- print(f" Body: {body_file}")
- print()
-
- print(f"\n[SUCCESS] Dumped {len(all_responses)} responses to: {output_dir}/")
- print("\nNext steps:")
- print(" 1. Open response_00_parsed.json to study the structure")
- print(" 2. Look for arrays containing review data")
- print(" 3. Identify patterns for: review ID, author, rating, text, date")
- print(" 4. Update the parser patterns in modules/api_interceptor.py")
-
-print("\n[DONE]")
diff --git a/fast_api_scraper.py b/fast_api_scraper.py
deleted file mode 100644
index fc5bbaa..0000000
--- a/fast_api_scraper.py
+++ /dev/null
@@ -1,249 +0,0 @@
-#!/usr/bin/env python3
-"""
-Fast API scraper - Minimal browser usage, maximum API speed.
-
-Strategy:
-1. Start browser and navigate to reviews page
-2. Capture cookies and user-agent from browser
-3. Let one API call happen naturally (to warm up the session)
-4. Close browser
-5. Use requests library with captured session to make fast API calls
-6. Paginate through all reviews without any scrolling
-
-Expected: 10-25x faster than traditional scrolling approach.
-"""
-import json
-import logging
-import time
-from typing import List, Optional, Tuple
-import requests
-from seleniumbase import SB
-from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-class FastAPIScraper:
- """Minimal browser, maximum speed."""
-
- def __init__(self, url: str):
- self.url = url
- self.session = requests.Session()
- self.place_id = None
- self.interceptor = GoogleMapsAPIInterceptor(None)
-
- def bootstrap_session(self) -> bool:
- """
- Quickly establish session using browser, then close it.
- """
- log.info("Bootstrapping session with minimal browser usage...")
-
- try:
- with SB(uc=True, headless=False) as sb:
- # Navigate
- log.info("Opening Google Maps...")
- sb.open(self.url)
- sb.sleep(2)
-
- # Dismiss cookies
- try:
- sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
- except:
- pass
-
- # Click reviews
- try:
- sb.click('.LRkQ2', timeout=5)
- log.info("✓ Opened reviews tab")
- sb.sleep(2)
- except:
- log.warning("Could not click reviews tab")
-
- # Wait a bit to ensure page is loaded
- sb.sleep(1)
-
- # Extract place ID from URL or page
- current_url = sb.get_current_url()
- if '!1s' in current_url:
- parts = current_url.split('!1s')
- if len(parts) > 1:
- self.place_id = parts[1].split('!')[0]
- log.info(f"✓ Extracted place ID: {self.place_id}")
-
- # Get cookies from browser - do this while browser is still active
- try:
- browser_cookies = sb.driver.get_cookies()
- log.debug(f"Got {len(browser_cookies)} cookies")
- except Exception as e:
- log.warning(f"Could not get cookies: {e}")
- browser_cookies = []
-
- # Get user agent - do this while browser is still active
- try:
- user_agent = sb.execute_script("return navigator.userAgent")
- log.debug(f"User agent: {user_agent[:50]}...")
- except Exception as e:
- log.warning(f"Could not get user agent: {e}")
- user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
-
- # Now process cookies and headers (browser context manager still open)
- for cookie in browser_cookies:
- try:
- self.session.cookies.set(
- name=cookie['name'],
- value=cookie['value'],
- domain=cookie.get('domain', '.google.com'),
- path=cookie.get('path', '/')
- )
- except Exception as e:
- log.debug(f"Could not set cookie {cookie.get('name')}: {e}")
-
- # Set headers
- self.session.headers.update({
- 'User-Agent': user_agent,
- 'Accept': '*/*',
- 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
- 'Referer': 'https://www.google.com/maps/',
- 'Origin': 'https://www.google.com',
- 'X-Requested-With': 'XMLHttpRequest',
- })
-
- log.info(f"✅ Session bootstrapped!")
- log.info(f" Cookies: {len(browser_cookies)}")
- log.info(f" Place ID: {self.place_id}")
-
- # Let browser stay open for a moment to ensure all operations complete
- sb.sleep(1)
-
- return True
-
- except Exception as e:
- log.error(f"Bootstrap failed: {e}")
- import traceback
- traceback.print_exc()
- return False
-
- def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
- """Fetch a page of reviews via API."""
-
- # Build pb parameter
- if continuation_token:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
- else:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
-
- params = {
- 'authuser': '0',
- 'hl': 'es',
- 'gl': 'es',
- 'pb': pb
- }
-
- try:
- url = 'https://www.google.com/maps/rpc/listugcposts'
- response = self.session.get(url, params=params, timeout=10)
-
- if response.status_code != 200:
- log.error(f"API error {response.status_code}")
- log.error(f"Response: {response.text[:300]}")
- return [], None
-
- # Parse
- body = response.text
- if body.startswith(")]}'"):
- body = body[4:].strip()
-
- data = json.loads(body)
- reviews = self.interceptor._parse_listugcposts_response(data)
-
- # Next token
- next_token = None
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
- next_token = data[1]
-
- return reviews, next_token
-
- except Exception as e:
- log.error(f"Request failed: {e}")
- return [], None
-
- def scrape_all(self, max_pages: int = 100) -> List[dict]:
- """
- Main scraping method.
- """
- # Bootstrap
- if not self.bootstrap_session():
- return []
-
- # Scrape via API
- log.info("\n" + "="*60)
- log.info("STARTING FAST API SCRAPING")
- log.info("="*60 + "\n")
-
- start_time = time.time()
- all_reviews = []
- seen_ids = set()
- token = None
- page = 0
-
- while page < max_pages:
- page += 1
-
- log.info(f"Fetching page {page}...")
- reviews, token = self.fetch_reviews_page(token)
-
- if not reviews:
- log.info("No more reviews")
- break
-
- # Dedup
- for review in reviews:
- rid = review.review_id or f"{review.author}_{review.date_text}"
- if rid not in seen_ids:
- seen_ids.add(rid)
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- })
-
- log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")
-
- if not token:
- break
-
- time.sleep(0.2) # Small delay
-
- elapsed = time.time() - start_time
-
- log.info("\n" + "="*60)
- log.info("✅ FAST API SCRAPING COMPLETED!")
- log.info("="*60)
- log.info(f"Reviews: {len(all_reviews)}")
- log.info(f"Pages: {page}")
- log.info(f"Time: {elapsed:.2f} seconds")
- log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- log.info("="*60 + "\n")
-
- return all_reviews
-
-
-def main():
- url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
-
- scraper = FastAPIScraper(url)
- reviews = scraper.scrape_all(max_pages=50)
-
- # Save
- with open('fast_api_reviews.json', 'w', encoding='utf-8') as f:
- json.dump(reviews, f, indent=2, ensure_ascii=False)
-
- log.info(f"Saved to fast_api_reviews.json")
-
-
-if __name__ == '__main__':
- main()
diff --git a/find_actual_reviews.py b/find_actual_reviews.py
deleted file mode 100644
index 948e0cc..0000000
--- a/find_actual_reviews.py
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env python3
-"""
-Find the ACTUAL selector for reviews by looking for elements with review structure.
-"""
-
-import time
-from seleniumbase import Driver
-
-url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
-
-driver = Driver(uc=True, headless=False)
-
-try:
- driver.get(url)
- time.sleep(5)
-
- # GDPR
- try:
- form_btns = driver.find_elements('css selector', 'form button')
- for btn in form_btns:
- if 'accept all' in (btn.text or '').lower():
- btn.click()
- time.sleep(2)
- break
- except:
- pass
-
- # Click reviews tab
- time.sleep(2)
- tabs = driver.find_elements('css selector', 'button[role="tab"]')
- for tab in tabs:
- if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(5)
- break
-
- # Scroll to load reviews
- try:
- pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
- for _ in range(3):
- driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
- time.sleep(1)
- except:
- pass
-
- # Use JavaScript to find ALL elements that look like reviews
- print("\n" + "="*80)
- print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:")
- print("="*80)
-
- review_info = driver.execute_script("""
- // Find all elements that have BOTH a rating AND substantial text
- const allDivs = Array.from(document.querySelectorAll('div'));
-
- const reviews = [];
-
- for (let div of allDivs) {
- // Must have a rating (star aria-label)
- const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]');
- if (!ratingElem) continue;
-
- // Must have decent text content (>50 chars to avoid buttons)
- if (div.textContent.length < 50) continue;
-
- // Get the classes and attributes
- const info = {
- classes: div.className,
- has_author: !!div.querySelector('button, [aria-label*="photo" i]'),
- has_avatar: !!div.querySelector('img'),
- has_date: !!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
- text_length: div.textContent.length,
- sample_text: div.textContent.substring(0, 150),
- tag_name: div.tagName,
- jslog: div.getAttribute('jslog'),
- data_review_id: div.getAttribute('data-review-id'),
- jsaction: div.getAttribute('jsaction')
- };
-
- reviews.push(info);
- }
-
- return {
- total_found: reviews.length,
- first_5: reviews.slice(0, 5)
- };
- """)
-
- print(f"\nFound {review_info['total_found']} elements with review structure")
- print(f"\nFirst 5 review-like elements:")
- for i, rev in enumerate(review_info['first_5'], 1):
- print(f"\n Review {i}:")
- print(f" Classes: {rev['classes']}")
- print(f" Has author: {rev['has_author']}")
- print(f" Has avatar: {rev['has_avatar']}")
- print(f" Has date: {rev['has_date']}")
- print(f" Text length: {rev['text_length']}")
- print(f" jslog: {rev['jslog']}")
- print(f" data-review-id: {rev['data_review_id']}")
- print(f" Sample: {rev['sample_text'][:80]}...")
-
- # Try to find a common class among review elements
- if review_info['total_found'] > 0:
- print("\n" + "="*80)
- print("FINDING COMMON SELECTOR:")
- print("="*80)
-
- common_selector = driver.execute_script("""
- // Find common classes among review elements
- const reviews = [];
- const allDivs = Array.from(document.querySelectorAll('div'));
-
- for (let div of allDivs) {
- const ratingElem = div.querySelector('[aria-label*="star" i]');
- if (ratingElem && div.textContent.length > 50) {
- reviews.push(div);
- }
- }
-
- if (reviews.length === 0) return null;
-
- // Get classes from first review
- const firstClasses = reviews[0].className.split(' ').filter(c => c.length > 0);
-
- // Find classes that appear in ALL reviews
- const commonClasses = firstClasses.filter(cls => {
- return reviews.every(rev => rev.classList.contains(cls));
- });
-
- return {
- total_reviews: reviews.length,
- common_classes: commonClasses,
- suggested_selector: commonClasses.length > 0 ? 'div.' + commonClasses.join('.') : null,
- first_review_classes: reviews[0].className
- };
- """)
-
- if common_selector:
- print(f"Total review elements: {common_selector['total_reviews']}")
- print(f"Common classes: {common_selector['common_classes']}")
- print(f"Suggested selector: {common_selector['suggested_selector']}")
- print(f"First review full classes: {common_selector['first_review_classes']}")
-
- # Test the suggested selector
- if common_selector['suggested_selector']:
- test_count = driver.execute_script(
- f"return document.querySelectorAll('{common_selector['suggested_selector']}').length;"
- )
- print(f"\nTesting suggested selector: Found {test_count} elements")
-
- print("\n" + "="*80)
- print("Browser staying open for manual inspection (60s)...")
- print("="*80)
- time.sleep(60)
-
-finally:
- driver.quit()
diff --git a/header_capture_scraper.py b/header_capture_scraper.py
deleted file mode 100644
index ff228b0..0000000
--- a/header_capture_scraper.py
+++ /dev/null
@@ -1,305 +0,0 @@
-#!/usr/bin/env python3
-"""
-Header Capture Scraper - Capture COMPLETE request from browser (headers + cookies).
-
-This captures the exact request the browser makes, including ALL headers and cookies,
-then replays it for fast API scraping.
-"""
-import json
-import logging
-import time
-from typing import List, Optional, Tuple
-import requests
-from seleniumbase import SB
-from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-class HeaderCaptureScraper:
- """Capture complete request, then replay for fast scraping."""
-
- def __init__(self, url: str, headless: bool = False):
- self.url = url
- self.headless = headless
- self.captured_request = None
- self.place_id = None
- self.session = requests.Session()
- self.interceptor = GoogleMapsAPIInterceptor(None)
-
- def capture_request(self) -> bool:
- """
- Capture a complete API request (URL, headers, cookies) from browser.
- """
- log.info("="*60)
- log.info("Capturing request from browser...")
- log.info("="*60)
-
- sb_context = None
- sb = None
-
- try:
- log.info("Starting browser...")
- sb_context = SB(uc=True, headless=self.headless)
- sb = sb_context.__enter__()
-
- sb.open(self.url)
- time.sleep(2)
-
- # Dismiss cookies
- try:
- sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
- except:
- pass
-
- # Click reviews
- try:
- sb.click('.LRkQ2', timeout=5)
- log.info("✓ Opened reviews")
- time.sleep(2)
- except:
- pass
-
- # Enable CDP network monitoring
- sb.driver.execute_cdp_cmd('Network.enable', {})
- log.info("✓ Network monitoring enabled")
-
- # Scroll to trigger API call
- log.info("Scrolling to trigger API request...")
- sb.execute_script("window.scrollBy(0, 800)")
- time.sleep(3)
-
- # Get network logs from CDP
- log.info("Checking network logs...")
- logs = sb.driver.get_log('browser')
-
- # Alternatively, use execute_cdp_cmd to get network events
- # But simpler: Let's inject JS to capture the request
- capture_script = """
- window.__capturedRequest = null;
-
- const originalFetch = window.fetch;
- window.fetch = function(...args) {
- const url = args[0].toString();
- if (url.includes('listugcposts')) {
- console.log('[CAPTURE] Intercepted request to:', url);
- window.__capturedRequest = {
- url: url,
- method: 'GET'
- };
- }
- return originalFetch.apply(this, args);
- };
-
- const originalXHR = window.XMLHttpRequest;
- window.XMLHttpRequest = function() {
- const xhr = new originalXHR();
- const originalOpen = xhr.open;
-
- xhr.open = function(method, url, ...rest) {
- if (url.includes('listugcposts')) {
- console.log('[CAPTURE] Intercepted XHR:', url);
- window.__capturedRequest = {
- url: url,
- method: method
- };
- }
- return originalOpen.apply(this, [method, url, ...rest]);
- };
-
- return xhr;
- };
-
- console.log('[CAPTURE] Request interceptor ready');
- """
-
- sb.execute_script(capture_script)
- log.info("✓ Request interceptor injected")
-
- # Scroll again to trigger request
- log.info("Scrolling to capture request...")
- for i in range(3):
- sb.execute_script("window.scrollBy(0, 600)")
- time.sleep(2)
-
- captured = sb.execute_script("return window.__capturedRequest")
- if captured:
- log.info(f"✓ Captured request URL!")
- self.captured_request = captured
- break
-
- if not self.captured_request:
- log.error("Failed to capture request")
- return False
-
- # Extract place ID from URL
- url = self.captured_request['url']
- if '!1s' in url:
- import urllib.parse
- parsed = urllib.parse.urlparse(url)
- params = urllib.parse.parse_qs(parsed.query)
- pb = params.get('pb', [''])[0]
- if '!1s' in pb:
- self.place_id = pb.split('!1s')[1].split('!')[0]
-
- # Now capture ALL cookies via CDP
- cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
- all_cookies = cdp_cookies.get('cookies', [])
-
- # Set cookies in session
- for cookie in all_cookies:
- self.session.cookies.set(
- name=cookie['name'],
- value=cookie['value'],
- domain=cookie.get('domain', '.google.com'),
- path=cookie.get('path', '/')
- )
-
- # Get user agent
- user_agent = sb.execute_script("return navigator.userAgent")
-
- # Set headers to match browser
- self.session.headers.update({
- 'User-Agent': user_agent,
- 'Accept': '*/*',
- 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
- 'Referer': 'https://www.google.com/maps/',
- 'Origin': 'https://www.google.com',
- 'X-Requested-With': 'XMLHttpRequest',
- })
-
- log.info(f"\n✅ Request captured successfully!")
- log.info(f" Place ID: {self.place_id}")
- log.info(f" Cookies: {len(all_cookies)}")
- log.info(f" Cookie names: {', '.join([c['name'] for c in all_cookies[:10]])}")
-
- return True
-
- except Exception as e:
- log.error(f"Capture failed: {e}")
- import traceback
- traceback.print_exc()
- return False
-
- finally:
- if sb_context:
- try:
- log.info("Closing browser...")
- sb_context.__exit__(None, None, None)
- log.info("✓ Browser closed\n")
- except:
- pass
-
- def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
- """Fetch reviews using captured session."""
-
- if continuation_token:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
- else:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
-
- params = {
- 'authuser': '0',
- 'hl': 'es',
- 'gl': 'es',
- 'pb': pb
- }
-
- try:
- url = 'https://www.google.com/maps/rpc/listugcposts'
- response = self.session.get(url, params=params, timeout=10)
-
- if response.status_code != 200:
- log.error(f"API error {response.status_code}: {response.text[:200]}")
- return [], None
-
- body = response.text
- if body.startswith(")]}'"):
- body = body[4:].strip()
-
- data = json.loads(body)
- reviews = self.interceptor._parse_listugcposts_response(data)
-
- next_token = None
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
- next_token = data[1]
-
- return reviews, next_token
-
- except Exception as e:
- log.error(f"Request failed: {e}")
- return [], None
-
- def scrape_all(self, max_pages: int = 50) -> List[dict]:
- """Main scraping method."""
-
- if not self.capture_request():
- return []
-
- log.info("="*60)
- log.info("Fast API scraping...")
- log.info("="*60)
-
- start_time = time.time()
- all_reviews = []
- seen_ids = set()
- token = None
- page = 0
-
- while page < max_pages:
- page += 1
- log.info(f"Page {page}...")
-
- reviews, token = self.fetch_reviews_page(token)
-
- if not reviews:
- break
-
- for review in reviews:
- rid = review.review_id or f"{review.author}_{review.date_text}"
- if rid not in seen_ids:
- seen_ids.add(rid)
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- })
-
- log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")
-
- if not token:
- break
-
- time.sleep(0.2)
-
- elapsed = time.time() - start_time
-
- log.info(f"\n{'='*60}")
- log.info(f"✅ COMPLETED!")
- log.info(f"{'='*60}")
- log.info(f"Reviews: {len(all_reviews)}")
- log.info(f"Time: {elapsed:.2f}s")
- log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- log.info(f"{'='*60}\n")
-
- return all_reviews
-
-
-def main():
- url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
-
- scraper = HeaderCaptureScraper(url, headless=False)
- reviews = scraper.scrape_all()
-
- if reviews:
- with open('header_capture_reviews.json', 'w', encoding='utf-8') as f:
- json.dump(reviews, f, indent=2, ensure_ascii=False)
- log.info(f"Saved to header_capture_reviews.json")
-
-
-if __name__ == '__main__':
- main()
diff --git a/hybrid_api_scraper.py b/hybrid_api_scraper.py
deleted file mode 100644
index b272899..0000000
--- a/hybrid_api_scraper.py
+++ /dev/null
@@ -1,352 +0,0 @@
-#!/usr/bin/env python3
-"""
-Hybrid API scraper - Capture session from browser, then use direct API calls.
-This combines the best of both worlds:
-1. Browser establishes authentic session with Google
-2. We capture ALL headers from real XHR requests
-3. Replay those headers in direct API calls
-4. No scrolling needed - just fast API pagination
-
-Expected speed: 10-25x faster than traditional browser scrolling.
-"""
-import json
-import logging
-import time
-from typing import List, Optional, Tuple, Dict
-import requests
-from seleniumbase import SB
-from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-class HybridAPIScraper:
- """
- Capture session from browser, then scrape via direct API calls.
- """
-
- def __init__(self, url: str, headless: bool = False):
- """
- Initialize the hybrid scraper.
-
- Args:
- url: Google Maps place URL
- headless: Run browser in headless mode
- """
- self.url = url
- self.headless = headless
- self.captured_headers = None
- self.place_id = None
- self.session = requests.Session()
-
- # Initialize parser
- self.interceptor = GoogleMapsAPIInterceptor(None)
-
- def capture_session_from_browser(self) -> bool:
- """
- Start a browser session, capture headers from actual API requests.
-
- Returns:
- True if session captured successfully
- """
- log.info("Starting browser to capture session headers...")
-
- try:
- with SB(uc=True, headless=self.headless) as sb:
- # Navigate to the place
- log.info(f"Navigating to: {self.url[:80]}...")
- sb.open(self.url)
- sb.sleep(3)
-
- # Dismiss cookie consent
- try:
- sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=5)
- log.info("Cookie dialog dismissed")
- except:
- pass
-
- # Click reviews tab
- log.info("Opening reviews...")
- try:
- sb.click('.LRkQ2', timeout=5)
- sb.sleep(3)
- except:
- log.warning("Could not click reviews tab")
-
- # Enable Chrome DevTools Protocol for network monitoring
- log.info("Enabling network interception...")
- sb.driver.execute_cdp_cmd('Network.enable', {})
-
- # Store captured requests
- captured_requests = []
-
- # Create event listener for network requests
- def add_request_listener():
- """Inject JS to capture fetch/XHR requests with headers."""
- script = """
- window.__capturedRequests = [];
-
- // Capture fetch
- const originalFetch = window.fetch;
- window.fetch = function(...args) {
- const url = args[0].toString();
- if (url.includes('listugcposts')) {
- console.log('[CAPTURE] Fetch to:', url);
- // Can't easily get headers from fetch without cloning
- }
- return originalFetch.apply(this, args);
- };
-
- // Capture XHR (more reliable for headers)
- const originalXHR = window.XMLHttpRequest;
- window.XMLHttpRequest = function() {
- const xhr = new originalXHR();
- const originalOpen = xhr.open;
- const originalSetRequestHeader = xhr.setRequestHeader;
- const headers = {};
-
- xhr.setRequestHeader = function(name, value) {
- headers[name.toLowerCase()] = value;
- return originalSetRequestHeader.apply(this, arguments);
- };
-
- xhr.open = function(method, url, ...rest) {
- if (url.includes('listugcposts')) {
- console.log('[CAPTURE] XHR to:', url);
- window.__capturedRequests.push({
- url: url,
- method: method,
- headers: {...headers}
- });
- }
- return originalOpen.apply(this, [method, url, ...rest]);
- };
-
- return xhr;
- };
-
- console.log('[CAPTURE] Request capture initialized');
- """
- sb.execute_script(script)
-
- add_request_listener()
-
- # Scroll to trigger an API call
- log.info("Scrolling to trigger API request...")
- for i in range(5):
- sb.execute_script("window.scrollBy(0, 800)")
- sb.sleep(1.5)
-
- # Check captured requests
- captured_requests = sb.execute_script("return window.__capturedRequests || []")
- if captured_requests:
- log.info(f"✓ Captured {len(captured_requests)} API request(s)!")
- break
-
- captured_request = captured_requests[0] if captured_requests else {}
-
- if not captured_request:
- log.error("Failed to capture API request")
- return False
-
- # Extract place ID from URL
- if 'place_id:' in self.url:
- self.place_id = self.url.split('place_id:')[1].split('&')[0].split('/')[0]
- elif '!1s' in captured_request['url']:
- # Extract from pb parameter
- import urllib.parse
- parsed = urllib.parse.urlparse(captured_request['url'])
- params = urllib.parse.parse_qs(parsed.query)
- pb = params.get('pb', [''])[0]
- if '!1s' in pb:
- self.place_id = pb.split('!1s')[1].split('!')[0]
-
- # Store captured headers
- self.captured_headers = captured_request['headers']
-
- # Also get cookies from browser
- cookies = sb.driver.get_cookies()
- for cookie in cookies:
- self.session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
-
- log.info(f"\n{'='*60}")
- log.info("✅ Session captured successfully!")
- log.info(f"{'='*60}")
- log.info(f"Place ID: {self.place_id}")
- log.info(f"Headers captured: {len(self.captured_headers)}")
- log.info(f"Cookies captured: {len(cookies)}")
- log.info(f"{'='*60}\n")
-
- # Print sample headers for debugging
- log.debug("Sample headers:")
- for key in ['cookie', 'x-goog-api-key', 'authorization', 'user-agent']:
- if key in self.captured_headers:
- value = self.captured_headers[key]
- preview = value[:50] + '...' if len(value) > 50 else value
- log.debug(f" {key}: {preview}")
-
- return True
-
- except Exception as e:
- log.error(f"Failed to capture session: {e}")
- import traceback
- traceback.print_exc()
- return False
-
- def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
- """
- Fetch reviews page using captured session.
-
- Args:
- continuation_token: Pagination token
-
- Returns:
- Tuple of (reviews, next_token)
- """
- # Build pb parameter
- if continuation_token:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
- else:
- pb = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
-
- params = {
- 'authuser': '0',
- 'hl': 'es',
- 'gl': 'es',
- 'pb': pb
- }
-
- try:
- log.info(f"Fetching page (token: {'initial' if not continuation_token else 'paginated'})...")
-
- # Make request with captured headers
- url = 'https://www.google.com/maps/rpc/listugcposts'
- response = self.session.get(url, params=params, headers=self.captured_headers, timeout=10)
-
- log.debug(f"Response status: {response.status_code}")
-
- if response.status_code != 200:
- log.error(f"API error {response.status_code}: {response.text[:500]}")
- return [], None
-
- # Parse response
- body = response.text
- if body.startswith(")]}'"):
- body = body[4:].strip()
-
- data = json.loads(body)
-
- # Extract reviews
- reviews = self.interceptor._parse_listugcposts_response(data)
-
- # Get next token
- next_token = None
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
- next_token = data[1]
-
- log.info(f"✓ Extracted {len(reviews)} reviews")
-
- return reviews, next_token
-
- except Exception as e:
- log.error(f"API request failed: {e}")
- return [], None
-
- def scrape_all_reviews(self, max_pages: int = 100, delay: float = 0.3) -> List[dict]:
- """
- Scrape all reviews using hybrid approach.
-
- Args:
- max_pages: Maximum pages to fetch
- delay: Delay between API calls
-
- Returns:
- List of review dictionaries
- """
- # Step 1: Capture session from browser
- if not self.capture_session_from_browser():
- log.error("Failed to capture session - aborting")
- return []
-
- # Step 2: Fetch all reviews via API
- log.info("\nStarting API-based scraping (no browser needed!)...")
- start_time = time.time()
-
- all_reviews = []
- seen_ids = set()
- continuation_token = None
- page = 0
-
- while page < max_pages:
- page += 1
-
- reviews, continuation_token = self.fetch_reviews_page(continuation_token)
-
- if not reviews:
- log.info("No more reviews found")
- break
-
- # Deduplicate
- for review in reviews:
- review_id = review.review_id or f"{review.author}_{review.date_text}"
- if review_id not in seen_ids:
- seen_ids.add(review_id)
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- })
-
- log.info(f"Page {page}: {len(all_reviews)} total unique reviews")
-
- if not continuation_token:
- log.info("No continuation token - finished")
- break
-
- if delay > 0:
- time.sleep(delay)
-
- elapsed = time.time() - start_time
-
- log.info(f"\n{'='*60}")
- log.info(f"✅ API SCRAPING COMPLETED!")
- log.info(f"{'='*60}")
- log.info(f"Total reviews: {len(all_reviews)}")
- log.info(f"API calls: {page}")
- log.info(f"Time (API only): {elapsed:.2f} seconds")
- log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
- log.info(f"{'='*60}\n")
-
- return all_reviews
-
-
-def main():
- """Example usage."""
- url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
-
- scraper = HybridAPIScraper(url, headless=False)
- reviews = scraper.scrape_all_reviews(max_pages=50, delay=0.3)
-
- # Save results
- output_file = 'hybrid_api_reviews.json'
- with open(output_file, 'w', encoding='utf-8') as f:
- json.dump(reviews, f, indent=2, ensure_ascii=False)
-
- log.info(f"Saved {len(reviews)} reviews to {output_file}")
-
- # Show sample
- if reviews:
- log.info("\nSample review:")
- sample = reviews[0]
- log.info(f" Author: {sample['author']}")
- log.info(f" Rating: {sample['rating']}★")
- log.info(f" Text: {sample['text'][:80]}..." if sample['text'] else " Text: (none)")
-
-
-if __name__ == '__main__':
- main()
diff --git a/inspect_pane_content.py b/inspect_pane_content.py
deleted file mode 100644
index fb95a94..0000000
--- a/inspect_pane_content.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env python3
-"""
-Check what's actually inside the reviews pane after scrolling.
-"""
-
-import time
-from seleniumbase import Driver
-
-url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
-
-driver = Driver(uc=True, headless=False)
-
-try:
- driver.get(url)
- time.sleep(5)
-
- # GDPR
- try:
- form_btns = driver.find_elements('css selector', 'form button')
- for btn in form_btns:
- if 'accept all' in (btn.text or '').lower():
- btn.click()
- time.sleep(2)
- break
- except:
- pass
-
- # Click reviews tab
- time.sleep(2)
- tabs = driver.find_elements('css selector', 'button[role="tab"]')
- review_tab_found = False
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'")
- if 'review' in text or 'review' in aria:
- print(f" -> Clicking this tab!")
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(6) # Wait longer
- review_tab_found = True
- break
-
- if not review_tab_found:
- print("WARNING: Reviews tab not found!")
-
- # Find and scroll the pane
- print("\nLooking for scrollable pane...")
- pane = None
- try:
- pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
- print(f"Found pane: div.m6QErb.WNBkOb.XiKgde")
- except:
- print("Pane not found with standard selector!")
- try:
- pane = driver.find_element('css selector', 'div.m6QErb')
- print(f"Found pane: div.m6QErb")
- except:
- print("No pane found at all!")
-
- if pane:
- print("\nScrolling pane to load reviews...")
- for i in range(15):
- driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
- time.sleep(0.4)
- if (i + 1) % 5 == 0:
- print(f" Scrolled {i+1} times...")
-
- # Now check what's in the pane
- print("\n" + "="*80)
- print("ANALYZING PANE CONTENT")
- print("="*80)
-
- content_info = driver.execute_script("""
- const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb');
- if (!pane) return {error: 'No pane found'};
-
- // Get all child divs (direct and nested)
- const allDivs = Array.from(pane.querySelectorAll('div'));
-
- // Get all unique class names used
- const classNames = new Set();
- allDivs.forEach(div => {
- if (div.className) {
- div.className.split(' ').forEach(cls => {
- if (cls.trim()) classNames.add(cls.trim());
- });
- }
- });
-
- // Find divs with ratings
- const divsWithRatings = allDivs.filter(div => {
- return !!div.querySelector('[aria-label*="star" i]');
- });
-
- // Find divs with author photos
- const divsWithPhotos = allDivs.filter(div => {
- return !!div.querySelector('img[src*="photo"], img[src*="avatar"]');
- });
-
- // Find divs with date patterns
- const divsWithDates = allDivs.filter(div => {
- return !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i);
- });
-
- // Find divs with ALL three
- const reviewLikeDivs = allDivs.filter(div => {
- const hasRating = !!div.querySelector('[aria-label*="star" i]');
- const hasPhoto = !!div.querySelector('img');
- const hasDate = !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|ago)/i);
- const textLen = div.textContent.length;
- return hasRating && hasPhoto && hasDate && textLen > 50 && textLen < 2000;
- });
-
- return {
- total_divs: allDivs.length,
- unique_classes: Array.from(classNames).sort(),
- divs_with_ratings: divsWithRatings.length,
- divs_with_photos: divsWithPhotos.length,
- divs_with_dates: divsWithDates.length,
- review_like_divs: reviewLikeDivs.length,
- review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({
- classes: d.className,
- text_length: d.textContent.length,
- sample: d.textContent.substring(0, 100)
- }))
- };
- """)
-
- if 'error' in content_info:
- print(f"ERROR: {content_info['error']}")
- else:
- print(f"\nTotal divs in pane: {content_info['total_divs']}")
- print(f"Divs with ratings: {content_info['divs_with_ratings']}")
- print(f"Divs with photos: {content_info['divs_with_photos']}")
- print(f"Divs with dates: {content_info['divs_with_dates']}")
- print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}")
-
- print(f"\nFirst 20 unique classes found in pane:")
- for cls in content_info['unique_classes'][:20]:
- print(f" {cls}")
-
- if content_info['review_like_divs'] > 0:
- print(f"\nFirst 5 review-like divs:")
- for i, div_info in enumerate(content_info['review_like_classes'], 1):
- print(f"\n Div {i}:")
- print(f" Classes: {div_info['classes']}")
- print(f" Text length: {div_info['text_length']}")
- print(f" Sample: {div_info['sample'][:80]}...")
-
- print(f"\n{'='*80}")
- print("Browser staying open for manual inspection (120 seconds)...")
- print("Look at the DevTools to see the actual review elements!")
- print(f"{'='*80}")
- time.sleep(120)
-
-finally:
- driver.quit()
diff --git a/manual_inspect.py b/manual_inspect.py
deleted file mode 100644
index 6b48232..0000000
--- a/manual_inspect.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python3
-"""
-Open the page and keep it open for manual inspection.
-INSTRUCTIONS:
-1. Open DevTools (F12)
-2. Click on an individual review
-3. Look at the div that contains ONE review (not the whole list)
-4. Note the class names on that div
-"""
-
-import time
-from seleniumbase import Driver
-
-url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
-
-driver = Driver(uc=True, headless=False)
-
-try:
- driver.get(url)
- time.sleep(5)
-
- # GDPR
- try:
- form_btns = driver.find_elements('css selector', 'form button')
- for btn in form_btns:
- if 'accept all' in (btn.text or '').lower():
- btn.click()
- time.sleep(2)
- break
- except:
- pass
-
- # Click reviews tab
- time.sleep(2)
- tabs = driver.find_elements('css selector', 'button[role="tab"]')
- for tab in tabs:
- if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(5)
- break
-
- # Scroll to load a few reviews
- try:
- pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
- for _ in range(5):
- driver.execute_script("arguments[0].scrollBy(0, 300);", pane)
- time.sleep(0.5)
- except:
- pass
-
- print("\n" + "="*80)
- print("MANUAL INSPECTION TIME!")
- print("="*80)
- print("\n1. The browser is now showing the reviews page")
- print("2. Open DevTools (F12 or right-click > Inspect)")
- print("3. Click the 'Select element' tool (top-left of DevTools)")
- print("4. Hover over an INDIVIDUAL review (not the whole panel)")
- print("5. Click on it to select it in the inspector")
- print("6. Look at the <div> that wraps ONE SINGLE review")
- print("7. Note the 'class' attribute value")
- print("\n8. The class might look like: class=\"MyWpvb fontBodyMedium\" or similar")
- print("\n9. Write down the full class name(s) - we'll use this as the selector!")
- print("\n" + "="*80)
- print("Browser will stay open for 5 minutes...")
- print("="*80)
-
- time.sleep(300) # 5 minutes
-
-finally:
- driver.quit()
diff --git a/modules/api_interceptor.py b/modules/api_interceptor.py
deleted file mode 100644
index e789801..0000000
--- a/modules/api_interceptor.py
+++ /dev/null
@@ -1,923 +0,0 @@
-"""
-API Interceptor for Google Maps Reviews.
-Uses Chrome DevTools Protocol (CDP) to intercept network requests and capture
-Google's internal API responses for faster, more reliable data extraction.
-"""
-
-import base64
-import json
-import logging
-import os
-import re
-import threading
-import time
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional
-from urllib.parse import parse_qs, urlparse
-
-log = logging.getLogger("api_interceptor")
-
-
-@dataclass
-class InterceptedReview:
- """Data class for a review extracted from API response"""
- review_id: str = ""
- author: str = ""
- rating: float = 0.0
- text: str = ""
- date_text: str = ""
- timestamp: int = 0
- likes: int = 0
- photos: List[str] = field(default_factory=list)
- profile_url: str = ""
- avatar_url: str = ""
- owner_response: str = ""
- owner_response_date: str = ""
- lang: str = ""
-
-
-class GoogleMapsAPIInterceptor:
- """
- Intercepts Google Maps internal API calls to capture review data directly.
-
- Google Maps uses several internal endpoints for reviews:
- - /maps/preview/review/listentitiesreviews - Main reviews endpoint
- - /maps/rpc/placereview - Alternative review endpoint
- - /maps/preview/reviewsdata - Review data endpoint
-
- The responses are often in a custom protobuf-like JSON format that needs parsing.
- """
-
- # Patterns for review-related API endpoints
- REVIEW_API_PATTERNS = [
- r'maps/preview/review',
- r'maps/rpc/placereview',
- r'maps/preview/reviewsdata',
- r'maps/preview/place',
- r'maps/api/place',
- r'/locationhistory/preview',
- r'batchexecute.*review',
- ]
-
- def __init__(self, driver):
- """Initialize the interceptor with a Selenium driver"""
- self.driver = driver
- self.captured_responses: List[Dict[str, Any]] = []
- self.captured_reviews: List[InterceptedReview] = []
- self.request_map: Dict[str, Dict] = {} # Map request IDs to URLs
- self._lock = threading.Lock()
- self._listening = False
- self._response_callback: Optional[Callable] = None
-
- def setup_interception(self):
- """Enable network interception via CDP"""
- try:
- # Enable network domain
- self.driver.execute_cdp_cmd('Network.enable', {})
-
- # Set up request interception patterns
- self.driver.execute_cdp_cmd('Network.setRequestInterception', {
- 'patterns': [
- {'urlPattern': '*maps*review*', 'resourceType': 'XHR'},
- {'urlPattern': '*maps*review*', 'resourceType': 'Fetch'},
- {'urlPattern': '*batchexecute*', 'resourceType': 'XHR'},
- {'urlPattern': '*batchexecute*', 'resourceType': 'Fetch'},
- ]
- })
-
- self._listening = True
- log.info("API interception enabled via CDP")
- return True
-
- except Exception as e:
- log.warning(f"Could not enable CDP interception: {e}")
- # Try alternative approach
- return self._setup_performance_logging()
-
- def _setup_performance_logging(self):
- """Alternative approach using Performance logging"""
- try:
- self.driver.execute_cdp_cmd('Network.enable', {
- 'maxTotalBufferSize': 10000000,
- 'maxResourceBufferSize': 5000000
- })
- self._listening = True
- log.info("API interception enabled via performance logging")
- return True
- except Exception as e:
- log.error(f"Failed to setup performance logging: {e}")
- return False
-
- def capture_network_responses(self, duration: float = 5.0):
- """
- Capture network responses for a specified duration.
- Call this while scrolling/loading more reviews.
- """
- if not self._listening:
- log.warning("Interception not set up, call setup_interception() first")
- return []
-
- captured = []
- start_time = time.time()
-
- while time.time() - start_time < duration:
- try:
- # Get performance logs which contain network events
- logs = self.driver.get_log('performance')
-
- for entry in logs:
- try:
- log_data = json.loads(entry['message'])
- message = log_data.get('message', {})
- method = message.get('method', '')
- params = message.get('params', {})
-
- # Capture response received events
- if method == 'Network.responseReceived':
- response = params.get('response', {})
- url = response.get('url', '')
-
- if self._is_review_api(url):
- request_id = params.get('requestId')
- self.request_map[request_id] = {
- 'url': url,
- 'status': response.get('status'),
- 'headers': response.get('headers', {})
- }
-
- # Capture response body when loading is finished
- elif method == 'Network.loadingFinished':
- request_id = params.get('requestId')
- if request_id in self.request_map:
- body = self._get_response_body(request_id)
- if body:
- captured.append({
- 'url': self.request_map[request_id]['url'],
- 'body': body,
- 'timestamp': time.time()
- })
-
- except Exception as parse_error:
- log.debug(f"Error parsing log entry: {parse_error}")
- continue
-
- except Exception as e:
- # Performance logs might not be available
- log.debug(f"Could not get performance logs: {e}")
- break
-
- time.sleep(0.1)
-
- with self._lock:
- self.captured_responses.extend(captured)
-
- return captured
-
- def get_response_bodies_cdp(self):
- """Get response bodies using CDP directly (more reliable method)"""
- responses = []
-
- try:
- # Use CDP to get all responses
- result = self.driver.execute_cdp_cmd('Network.getAllCookies', {})
-
- # Execute JavaScript to intercept fetch/XHR responses
- intercept_script = """
- (function() {
- if (window.__interceptedResponses) {
- var responses = window.__interceptedResponses;
- window.__interceptedResponses = [];
- return responses;
- }
- return [];
- })();
- """
-
- captured = self.driver.execute_script(intercept_script)
- if captured:
- responses.extend(captured)
-
- except Exception as e:
- log.debug(f"CDP response capture error: {e}")
-
- return responses
-
- def inject_response_interceptor(self):
- """
- Inject JavaScript to intercept XHR/Fetch responses at the browser level.
- This is the most reliable method for capturing API responses.
- """
- intercept_script = """
- (function() {
- // Skip if already injected
- if (window.__reviewInterceptorInjected) {
- console.log('[API Interceptor] Already injected, skipping');
- return;
- }
- window.__reviewInterceptorInjected = true;
- window.__interceptedResponses = [];
- window.__interceptorStats = {
- totalFetch: 0,
- totalXHR: 0,
- capturedFetch: 0,
- capturedXHR: 0,
- lastCapture: null
- };
-
- console.log('[API Interceptor] Initializing...');
-
- // Store original fetch
- const originalFetch = window.fetch;
-
- // Override fetch
- window.fetch = async function(...args) {
- window.__interceptorStats.totalFetch++;
- const url = args[0].toString();
-
- // Log ALL fetch requests for debugging
- console.debug('[API Interceptor] FETCH:', url.substring(0, 150));
-
- const response = await originalFetch.apply(this, args);
-
- // Check if this is a review-related API call
- if (url.includes('review') || url.includes('batchexecute') ||
- url.includes('place') || url.includes('maps') ||
- url.includes('listugcposts') || url.includes('getreviews')) {
- try {
- const clone = response.clone();
- const text = await clone.text();
-
- console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);
-
- window.__interceptedResponses.push({
- url: url,
- body: text,
- timestamp: Date.now(),
- type: 'fetch',
- size: text.length
- });
-
- window.__interceptorStats.capturedFetch++;
- window.__interceptorStats.lastCapture = new Date().toISOString();
-
- // Keep only last 100 responses to avoid memory issues
- if (window.__interceptedResponses.length > 100) {
- window.__interceptedResponses = window.__interceptedResponses.slice(-50);
- }
- } catch (e) {
- console.error('[API Interceptor] Response capture error:', e);
- }
- }
-
- return response;
- };
-
- // Store original XMLHttpRequest
- const originalXHR = window.XMLHttpRequest;
-
- // Create intercepting XHR
- window.XMLHttpRequest = function() {
- const xhr = new originalXHR();
- const originalOpen = xhr.open;
- const originalSend = xhr.send;
- let requestUrl = '';
-
- xhr.open = function(method, url, ...rest) {
- requestUrl = url;
- window.__interceptorStats.totalXHR++;
- console.debug('[API Interceptor] XHR:', method, url.substring(0, 150));
- return originalOpen.apply(this, [method, url, ...rest]);
- };
-
- xhr.addEventListener('load', function() {
- if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
- requestUrl.includes('place') || requestUrl.includes('maps') ||
- requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
- try {
- console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);
-
- window.__interceptedResponses.push({
- url: requestUrl,
- body: xhr.responseText,
- timestamp: Date.now(),
- type: 'xhr',
- status: xhr.status,
- size: xhr.responseText.length
- });
-
- window.__interceptorStats.capturedXHR++;
- window.__interceptorStats.lastCapture = new Date().toISOString();
-
- if (window.__interceptedResponses.length > 100) {
- window.__interceptedResponses = window.__interceptedResponses.slice(-50);
- }
- } catch (e) {
- console.error('[API Interceptor] XHR capture error:', e);
- }
- }
- });
-
- return xhr;
- };
-
- // Copy static properties
- for (let prop of Object.getOwnPropertyNames(originalXHR)) {
- try {
- window.XMLHttpRequest[prop] = originalXHR[prop];
- } catch (e) {}
- }
-
- console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');
-
- // Log stats every 10 seconds
- setInterval(() => {
- if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
- console.log('[API Interceptor] Stats:',
- 'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
- 'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
- 'Queue:', window.__interceptedResponses.length);
- }
- }, 10000);
-
- return true;
- })();
- """
-
- try:
- result = self.driver.execute_script(intercept_script)
- log.info("JavaScript response interceptor injected with enhanced debugging")
-
- # Get initial stats
- stats = self.get_interceptor_stats()
- log.debug(f"Interceptor stats: {stats}")
-
- return True
- except Exception as e:
- log.warning(f"Failed to inject interceptor: {e}")
- return False
-
- def get_intercepted_responses(self):
- """Retrieve intercepted responses from the browser"""
- try:
- script = """
- if (window.__interceptedResponses) {
- var responses = window.__interceptedResponses.slice();
- window.__interceptedResponses = [];
- return responses;
- }
- return [];
- """
- responses = self.driver.execute_script(script)
-
- if responses:
- log.debug(f"Retrieved {len(responses)} intercepted responses from browser")
- for resp in responses[:3]: # Log first 3 for debugging
- log.debug(f" - {resp.get('type', '?').upper()}: {resp.get('url', '')[:100]} ({resp.get('size', 0)} bytes)")
- else:
- log.debug("No intercepted responses available")
-
- return responses or []
- except Exception as e:
- log.debug(f"Error getting intercepted responses: {e}")
- return []
-
- def get_interceptor_stats(self):
- """Get statistics from the JavaScript interceptor"""
- try:
- script = """
- if (window.__interceptorStats) {
- return window.__interceptorStats;
- }
- return null;
- """
- stats = self.driver.execute_script(script)
- return stats
- except Exception as e:
- log.debug(f"Error getting interceptor stats: {e}")
- return None
-
- def get_browser_console_logs(self):
- """Get browser console logs (for debugging)"""
- try:
- logs = self.driver.get_log('browser')
- return logs
- except Exception as e:
- log.debug(f"Could not get browser console logs: {e}")
- return []
-
- def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
- """
- Dump captured responses to files for debugging.
- Creates one file per response with metadata and body.
- """
- try:
- output_path = Path(output_dir)
- output_path.mkdir(exist_ok=True)
-
- for i, response in enumerate(responses):
- timestamp = response.get('timestamp', int(time.time() * 1000))
- url = response.get('url', 'unknown')
- req_type = response.get('type', 'unknown')
-
- # Create filename from timestamp and type
- filename = f"{timestamp}_{req_type}_{i}.json"
- filepath = output_path / filename
-
- # Write response with metadata
- with open(filepath, 'w', encoding='utf-8') as f:
- json.dump({
- 'metadata': {
- 'url': url,
- 'type': req_type,
- 'timestamp': timestamp,
- 'size': response.get('size', len(response.get('body', ''))),
- 'status': response.get('status')
- },
- 'body': response.get('body', '')
- }, f, indent=2, ensure_ascii=False)
-
- log.info(f"Dumped {len(responses)} responses to {output_path}")
- return str(output_path)
-
- except Exception as e:
- log.error(f"Error dumping responses to file: {e}")
- return None
-
- def _is_review_api(self, url: str) -> bool:
- """Check if URL matches review API patterns"""
- url_lower = url.lower()
- return any(re.search(pattern, url_lower) for pattern in self.REVIEW_API_PATTERNS)
-
- def _get_response_body(self, request_id: str) -> Optional[str]:
- """Get response body for a request ID using CDP"""
- try:
- result = self.driver.execute_cdp_cmd('Network.getResponseBody', {
- 'requestId': request_id
- })
-
- body = result.get('body', '')
- if result.get('base64Encoded'):
- body = base64.b64decode(body).decode('utf-8', errors='ignore')
-
- return body
- except Exception as e:
- log.debug(f"Could not get response body for {request_id}: {e}")
- return None
-
- def parse_reviews_from_responses(self, responses: List[Dict]) -> List[InterceptedReview]:
- """
- Parse review data from captured API responses.
- Google's API responses use a custom nested array format.
- """
- reviews = []
-
- for response in responses:
- try:
- body = response.get('body', '')
- url = response.get('url', '')
-
- # Skip non-JSON responses
- if not body or body.startswith(' List[InterceptedReview]:
- """Parse a single response body for review data"""
- reviews = []
-
- # Skip empty or HTML responses
- if not body or body.startswith(' List[InterceptedReview]:
- """
- Parse Google Maps listugcposts API response.
-
- Structure discovered:
- data[2] = array of review groups
- data[2][i] = single review group [review_data, metadata, continuation_token]
- data[2][i][0] = review data (6-item array containing all review info)
- """
- reviews = []
-
- try:
- if not isinstance(data, list) or len(data) < 3:
- log.debug("Response doesn't match expected structure (not a list or too short)")
- return reviews
-
- # data[2] contains the review groups
- review_groups = data[2]
- if not isinstance(review_groups, list):
- log.debug("data[2] is not a list")
- return reviews
-
- log.debug(f"Found {len(review_groups)} reviews in data[2]")
-
- # Each group IS ONE REVIEW
- for group_idx, group in enumerate(review_groups):
- if not isinstance(group, list) or len(group) == 0:
- continue
-
- # group[0] is the review data array (6 items)
- review_data = group[0]
- if not isinstance(review_data, list):
- continue
-
- try:
- review = self._parse_google_review_array(review_data)
- if review:
- reviews.append(review)
- log.debug(f"Parsed review {group_idx}: {review.author} - {review.rating}★")
- except Exception as e:
- log.debug(f"Error parsing review at group[{group_idx}]: {e}")
-
- except Exception as e:
- log.debug(f"Error in _parse_listugcposts_response: {e}")
-
- return reviews
-
- def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
- """
- Parse a single review from Google's 6-item array format.
-
- Discovered structure (review_data is a 6-item array):
- review_data[0] = Review ID (string)
- review_data[1][4][5][0] = Author Name
- review_data[1][4][5][3] = User ID
- review_data[1][6] = Date Text
- review_data[2][0][0] = Rating (1-5)
- review_data[2][15][0][0] = Review Text (original)
- review_data[2][15][1][0] = Review Text (translated)
- """
- review = InterceptedReview()
-
- try:
- # Extract review ID from review_data[0]
- if len(review_data) > 0 and isinstance(review_data[0], str):
- review.review_id = review_data[0]
-
- # Extract author info from review_data[1][4][5]
- if (len(review_data) > 1 and
- isinstance(review_data[1], list) and
- len(review_data[1]) > 4 and
- isinstance(review_data[1][4], list) and
- len(review_data[1][4]) > 5 and
- isinstance(review_data[1][4][5], list)):
-
- author_info = review_data[1][4][5]
-
- # Author name at [1][4][5][0]
- if len(author_info) > 0 and isinstance(author_info[0], str):
- review.author = author_info[0]
-
- # Profile picture at [1][4][5][1] (if available)
- if len(author_info) > 1 and isinstance(author_info[1], str):
- review.avatar_url = author_info[1]
-
- # Extract date from review_data[1][6]
- if (len(review_data) > 1 and
- isinstance(review_data[1], list) and
- len(review_data[1]) > 6 and
- isinstance(review_data[1][6], str)):
- review.date_text = review_data[1][6]
-
- # Extract rating from review_data[2][0][0]
- if (len(review_data) > 2 and
- isinstance(review_data[2], list) and
- len(review_data[2]) > 0 and
- isinstance(review_data[2][0], list) and
- len(review_data[2][0]) > 0):
- rating_val = review_data[2][0][0]
- if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
- review.rating = float(rating_val)
-
- # Extract review text from review_data[2][15][0][0]
- if (len(review_data) > 2 and
- isinstance(review_data[2], list) and
- len(review_data[2]) > 15 and
- isinstance(review_data[2][15], list) and
- len(review_data[2][15]) > 0 and
- isinstance(review_data[2][15][0], list) and
- len(review_data[2][15][0]) > 0):
- text = review_data[2][15][0][0]
- if isinstance(text, str):
- review.text = text
-
- # Only return if we have minimum required data
- if review.rating > 0 and (review.author or review.text):
- return review
-
- except Exception as e:
- log.debug(f"Error parsing Google review array: {e}")
-
- return None
-
- def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
- """
- Parse review from Google's nested array format.
- Improved version with better field detection.
- """
- review = InterceptedReview()
-
- try:
- # Extract review ID (usually a long string in first few elements)
- for i, item in enumerate(arr[:5]):
- if isinstance(item, str) and len(item) > 30 and not item.startswith('http'):
- review.review_id = item
- break
-
- # Extract rating (number between 1-5)
- for item in arr:
- if isinstance(item, (int, float)) and 1 <= item <= 5:
- review.rating = float(item)
- break
- elif isinstance(item, list):
- for subitem in item:
- if isinstance(subitem, (int, float)) and 1 <= subitem <= 5:
- review.rating = float(subitem)
- break
- if review.rating > 0:
- break
-
- # Extract review text (long string, not a URL)
- for item in arr:
- if isinstance(item, str) and len(item) > 50 and not item.startswith('http'):
- if not review.review_id or item != review.review_id:
- review.text = item
- break
-
- # Extract author name (shorter string, not ID or text)
- for item in arr:
- if isinstance(item, str) and 3 <= len(item) <= 100:
- if item != review.review_id and item != review.text and not item.startswith('http'):
- review.author = item
- break
- elif isinstance(item, list):
- for subitem in item:
- if isinstance(subitem, str) and 3 <= len(subitem) <= 100:
- if subitem != review.text and not subitem.startswith('http'):
- review.author = subitem
- break
- if review.author:
- break
-
- # Extract dates (strings that look like dates)
- date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
- for item in arr:
- if isinstance(item, str):
- for pattern in date_patterns:
- if re.search(pattern, item, re.IGNORECASE):
- review.date_text = item
- break
- if review.date_text:
- break
-
- # Only return if we have meaningful data
- if (review.review_id or review.author) and review.rating > 0:
- return review
-
- except Exception as e:
- log.debug(f"Error in _parse_review_array_v2: {e}")
-
- return None
-
- def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
- """Recursively search for review data in nested structures"""
- reviews = []
-
- if depth > 20: # Prevent infinite recursion
- return reviews
-
- # Skip if data is already an InterceptedReview object
- if isinstance(data, InterceptedReview):
- return [data]
-
- if isinstance(data, dict):
- # Check if this looks like a review object
- review = self._try_parse_review_dict(data)
- if review:
- reviews.append(review)
-
- # Recurse into dict values
- for value in data.values():
- if not isinstance(value, InterceptedReview):
- reviews.extend(self._extract_reviews_recursive(value, depth + 1))
-
- elif isinstance(data, list):
- # Check if this array looks like a review array
- review = self._try_parse_review_array(data)
- if review:
- reviews.append(review)
-
- # Recurse into list items
- for item in data:
- if not isinstance(item, InterceptedReview):
- reviews.extend(self._extract_reviews_recursive(item, depth + 1))
-
- return reviews
-
- def _try_parse_review_dict(self, data: Dict) -> Optional[InterceptedReview]:
- """Try to parse a dictionary as a review object"""
- # Common keys in review objects
- review_keys = {'reviewId', 'review_id', 'author', 'rating', 'text', 'comment'}
-
- if not any(k in data for k in review_keys):
- return None
-
- try:
- review = InterceptedReview()
-
- # Try various key names for each field
- review.review_id = data.get('reviewId') or data.get('review_id') or data.get('id', '')
- review.author = data.get('author') or data.get('authorName') or data.get('name', '')
- review.rating = float(data.get('rating') or data.get('starRating') or 0)
- review.text = data.get('text') or data.get('comment') or data.get('reviewText', '')
- review.date_text = data.get('publishTime') or data.get('relativePublishTime') or data.get('date', '')
- review.likes = int(data.get('thumbsUpCount') or data.get('likes') or 0)
-
- # Photos
- photos = data.get('photos') or data.get('reviewPhotos') or []
- if photos:
- review.photos = [p.get('url') or p for p in photos if p]
-
- # Profile
- author_data = data.get('author') if isinstance(data.get('author'), dict) else {}
- review.profile_url = author_data.get('profileUrl') or data.get('profileUrl', '')
- review.avatar_url = author_data.get('profilePhotoUrl') or data.get('avatar', '')
-
- # Owner response
- owner_resp = data.get('ownerResponse') or data.get('ownerReply') or {}
- if isinstance(owner_resp, dict):
- review.owner_response = owner_resp.get('text', '')
- review.owner_response_date = owner_resp.get('publishTime', '')
-
- # Only return if we have meaningful data
- if review.review_id or (review.author and review.text):
- return review
-
- except Exception as e:
- log.debug(f"Error parsing review dict: {e}")
-
- return None
-
- def _try_parse_review_array(self, data: List) -> Optional[InterceptedReview]:
- """
- Try to parse a nested array as a review (Google's protobuf-like format).
- Google often uses positional arrays like: [id, author, [rating], text, ...]
- """
- if not data or len(data) < 3:
- return None
-
- try:
- # Look for patterns that indicate this is a review array
- # Pattern 1: [review_id, [author_info], rating_array, text, ...]
-
- review = InterceptedReview()
-
- # Check if first element looks like a review ID
- if isinstance(data[0], str) and len(data[0]) > 20:
- review.review_id = data[0]
-
- # Search for rating (usually a small number 1-5)
- for item in data:
- if isinstance(item, (int, float)) and 1 <= item <= 5:
- review.rating = float(item)
- break
- elif isinstance(item, list) and len(item) >= 1:
- if isinstance(item[0], (int, float)) and 1 <= item[0] <= 5:
- review.rating = float(item[0])
- break
-
- # Search for text (long string)
- for item in data:
- if isinstance(item, str) and len(item) > 30:
- review.text = item
- break
- elif isinstance(item, list):
- for subitem in item:
- if isinstance(subitem, str) and len(subitem) > 30:
- review.text = subitem
- break
-
- # Search for author name (shorter string)
- for item in data:
- if isinstance(item, list) and len(item) >= 1:
- for subitem in item:
- if isinstance(subitem, str) and 2 <= len(subitem) <= 100 and subitem != review.text:
- review.author = subitem
- break
- if review.author:
- break
-
- # Search for URLs (photos, profile)
- for item in data:
- if isinstance(item, str) and item.startswith('http'):
- if 'googleusercontent' in item or 'ggpht' in item:
- if not review.avatar_url:
- review.avatar_url = item
- else:
- review.photos.append(item)
- elif isinstance(item, list):
- self._extract_urls_from_array(item, review)
-
- # Only return if we have meaningful data
- if review.review_id and review.rating > 0:
- return review
- if review.text and review.rating > 0:
- return review
-
- except Exception as e:
- log.debug(f"Error parsing review array: {e}")
-
- return None
-
- def _extract_urls_from_array(self, arr: List, review: InterceptedReview, depth: int = 0):
- """Extract URLs from nested arrays"""
- if depth > 5:
- return
-
- for item in arr:
- if isinstance(item, str) and item.startswith('http'):
- if 'googleusercontent' in item or 'ggpht' in item or 'lh3' in item:
- if 'w72-h72' in item or 'p-rp-mo' in item: # Profile pic pattern
- review.avatar_url = item
- else:
- review.photos.append(item)
- elif isinstance(item, list):
- self._extract_urls_from_array(item, depth + 1, review)
-
- def convert_to_raw_review_format(self, intercepted: InterceptedReview) -> Dict[str, Any]:
- """Convert an InterceptedReview to the format used by RawReview/storage"""
- return {
- 'review_id': intercepted.review_id,
- 'author': intercepted.author,
- 'rating': intercepted.rating,
- 'description': {'en': intercepted.text} if intercepted.text else {},
- 'likes': intercepted.likes,
- 'user_images': intercepted.photos,
- 'author_profile_url': intercepted.profile_url,
- 'profile_picture': intercepted.avatar_url,
- 'owner_responses': {
- 'en': {'text': intercepted.owner_response}
- } if intercepted.owner_response else {},
- 'review_date': intercepted.date_text,
- '_source': 'api_intercept'
- }
-
- def cleanup(self):
- """Clean up interception resources"""
- try:
- self.driver.execute_cdp_cmd('Network.disable', {})
- except:
- pass
-
- self.captured_responses.clear()
- self.captured_reviews.clear()
- self.request_map.clear()
- self._listening = False
diff --git a/modules/chrome_pool.py b/modules/chrome_pool.py
index 0d986f2..d7e60b6 100644
--- a/modules/chrome_pool.py
+++ b/modules/chrome_pool.py
@@ -35,16 +35,45 @@ class ChromeWorker:
# SeleniumBase Driver automatically includes UC mode anti-detection
# Initialize with longer timeouts for large scraping jobs
+ # Chrome arguments for Docker stability
+ chrome_args = [
+ "--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker)
+ "--disable-gpu", # Disable GPU acceleration
+ "--no-sandbox", # Required for Docker
+ "--disable-software-rasterizer",
+ "--disable-extensions",
+ "--disable-background-networking",
+ "--disable-default-apps",
+ "--disable-sync",
+ "--metrics-recording-only",
+ "--mute-audio",
+ "--no-first-run",
+ "--safebrowsing-disable-auto-update",
+ ]
+
self.driver = Driver(
uc=True,
headless=self.headless,
- page_load_strategy="normal"
+ page_load_strategy="normal",
+ chromium_arg=",".join(chrome_args)
)
# Set generous timeouts for large scraping jobs
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
self.driver.set_script_timeout(60) # 1 minute for complex extraction
+ # Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
+ # This prevents location-based variations in search results
+ try:
+ self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+ 'latitude': 42.3601,
+ 'longitude': -71.0589,
+ 'accuracy': 100
+ })
+ log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
+ except Exception as e:
+ log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")
+
self.driver.maximize_window()
self.created_at = time.time()
self.last_used = time.time()
diff --git a/modules/cli.py b/modules/cli.py
deleted file mode 100644
index d05c480..0000000
--- a/modules/cli.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""
-Command line interface handling for Google Maps Reviews Scraper.
-"""
-
-import argparse
-import json
-from pathlib import Path
-
-from modules.config import DEFAULT_CONFIG_PATH
-
-
-def parse_arguments():
- """Parse command line arguments"""
- ap = argparse.ArgumentParser(description="Google‑Maps review scraper with MongoDB integration")
- ap.add_argument("-q", "--headless", action="store_true",
- help="run Chrome in the background")
- ap.add_argument("-s", "--sort", dest="sort_by",
- choices=("newest", "highest", "lowest", "relevance"),
- default=None, help="sorting order for reviews")
- ap.add_argument("--stop-on-match", action="store_true",
- help="stop scrolling when first already‑seen id is met "
- "(useful with --sort newest)")
- ap.add_argument("--url", type=str, default=None,
- help="custom Google Maps URL to scrape")
- ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
- help="overwrite existing reviews instead of appending")
- ap.add_argument("--config", type=str, default=None,
- help="path to custom configuration file")
- ap.add_argument("--use-mongodb", type=bool, default=None,
- help="whether to use MongoDB for storage")
-
- # Arguments for date conversion and image downloading
- ap.add_argument("--convert-dates", type=bool, default=None,
- help="convert string dates to MongoDB Date objects")
- ap.add_argument("--download-images", type=bool, default=None,
- help="download images from reviews")
- ap.add_argument("--image-dir", type=str, default=None,
- help="directory to store downloaded images")
- ap.add_argument("--download-threads", type=int, default=None,
- help="number of threads for downloading images")
-
- # Arguments for local image paths and URL replacement
- ap.add_argument("--store-local-paths", type=bool, default=None,
- help="whether to store local image paths in documents")
- ap.add_argument("--replace-urls", type=bool, default=None,
- help="whether to replace original URLs with custom ones")
- ap.add_argument("--custom-url-base", type=str, default=None,
- help="base URL for replacement")
- ap.add_argument("--custom-url-profiles", type=str, default=None,
- help="path for profile images")
- ap.add_argument("--custom-url-reviews", type=str, default=None,
- help="path for review images")
- ap.add_argument("--preserve-original-urls", type=bool, default=None,
- help="whether to preserve original URLs in original_* fields")
-
- # Arguments for custom parameters
- ap.add_argument("--custom-params", type=str, default=None,
- help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")
-
- # API interception option
- ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
- help="enable API response interception for faster data capture (experimental)")
-
- args = ap.parse_args()
-
- # Handle config path
- if args.config is not None:
- args.config = Path(args.config)
- else:
- args.config = DEFAULT_CONFIG_PATH
-
- # Process custom params if provided
- if args.custom_params:
- try:
- args.custom_params = json.loads(args.custom_params)
- except json.JSONDecodeError:
- print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
- args.custom_params = None
-
- return args
diff --git a/modules/database.py b/modules/database.py
index 576bf01..8f112a1 100644
--- a/modules/database.py
+++ b/modules/database.py
@@ -77,11 +77,17 @@ class DatabaseManager:
error_message TEXT,
metadata JSONB,
+ scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
""")
+ # Add scrape_logs column if it doesn't exist (for existing databases)
+ await conn.execute("""
+ ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
+ """)
+
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -182,10 +188,12 @@ class DatabaseManager:
started_at,
completed_at,
reviews_count,
+ total_reviews,
reviews_data,
scrape_time,
error_message,
- metadata
+ metadata,
+ scrape_logs
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -246,8 +254,13 @@ class DatabaseManager:
kwargs['completed_at'] = datetime.now()
for key, value in kwargs.items():
- set_clauses.append(f"{key} = ${param_idx}")
- params.append(value)
+ # Handle JSONB fields specially
+ if key == 'scrape_logs' and value is not None:
+ set_clauses.append(f"{key} = ${param_idx}::jsonb")
+ params.append(json.dumps(value) if not isinstance(value, str) else value)
+ else:
+ set_clauses.append(f"{key} = ${param_idx}")
+ params.append(value)
param_idx += 1
query = f"""
@@ -264,7 +277,8 @@ class DatabaseManager:
job_id: UUID,
reviews: List[Dict[str, Any]],
scrape_time: float,
- total_reviews: Optional[int] = None
+ total_reviews: Optional[int] = None,
+ scrape_logs: Optional[List[Dict[str, Any]]] = None
):
"""
Save scraping results to database.
@@ -274,6 +288,7 @@ class DatabaseManager:
reviews: List of review dictionaries
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
+ scrape_logs: List of log entries from the scraper
"""
async with self.pool.acquire() as conn:
await conn.execute("""
@@ -284,9 +299,11 @@ class DatabaseManager:
reviews_count = $2,
total_reviews = $3,
reviews_data = $4::jsonb,
- scrape_time = $5
+ scrape_time = $5,
+ scrape_logs = $6::jsonb
WHERE job_id = $1
- """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
+ """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
+ json.dumps(scrape_logs) if scrape_logs else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
@@ -317,8 +334,10 @@ class DatabaseManager:
created_at,
completed_at,
reviews_count,
+ total_reviews,
scrape_time,
- error_message
+ error_message,
+ metadata
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
@@ -333,8 +352,10 @@ class DatabaseManager:
created_at,
completed_at,
reviews_count,
+ total_reviews,
scrape_time,
- error_message
+ error_message,
+ metadata
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
diff --git a/modules/fast_scraper.py b/modules/fast_scraper.py
index cdb7cb4..cb84533 100644
--- a/modules/fast_scraper.py
+++ b/modules/fast_scraper.py
@@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
else:
log.info(f"[PROFILE] Using pooled driver (0.00s)")
- # Force English locale for consistent parsing
+ # Force English locale AND US region for consistent parsing/results
+ # This helps avoid geolocation-based variations in Google Maps results
if 'hl=' in url:
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
else:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
+ # Add US region parameter if not present
+ if 'gl=' not in url:
+ url = f"{url}&gl=us"
+
+ # Set Chrome geolocation to US (Boston, MA) using CDP
+ # This ensures Google Maps shows US results regardless of server location
+ try:
+ driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+ 'latitude': 42.3601,
+ 'longitude': -71.0589,
+ 'accuracy': 100
+ })
+ log.info("Set geolocation to US (Boston, MA)")
+ except Exception as e:
+ log.warning(f"Could not set geolocation: {e}")
+
log.info(f"Loading Google Maps page...")
t0 = timing_module.time()
driver.get(url)
@@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
for btn in form_btns:
btn_text = (btn.text or '').lower()
- if 'aceptar todo' in btn_text or 'accept all' in btn_text:
+ if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text:
log.info(f"Clicking GDPR consent: {btn.text}")
btn.click()
- time.sleep(1) # Reduced from 2s
+ time.sleep(1)
break
else:
if len(form_btns) >= 2:
log.info("Using fallback: clicking second form button")
form_btns[1].click()
- time.sleep(1) # Reduced from 2s
+ time.sleep(1)
except Exception as e:
log.warning(f"GDPR consent handling failed: {e}")
+
+ # After GDPR consent, reload the original URL to ensure proper page state
+ log.info(f"Reloading original URL after GDPR consent...")
+ driver.get(url)
+ time.sleep(1)
log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
else:
log.info(f"[PROFILE] No GDPR consent page (0.00s)")
@@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
try:
log.info("Waiting for Google Maps content to load...")
wait = WebDriverWait(driver, 10)
+ # Wait for basic page structure (h1 or heading)
wait.until(
- lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
+ lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]')
)
- log.info("Google Maps content loaded successfully")
+ log.info("Basic page structure loaded")
+
+ # Wait for page to settle - search URLs redirect to place URLs
+ # which triggers additional content loading
+ time.sleep(2)
+
+ # Wait specifically for review count element (aria-label ending with "reviews")
+ # This is the most reliable indicator that the business detail is loaded
+ try:
+ WebDriverWait(driver, 5).until(
+ lambda d: d.execute_script("""
+ var elems = document.querySelectorAll('[aria-label]');
+ for (var i = 0; i < elems.length; i++) {
+ var label = elems[i].getAttribute('aria-label') || '';
+ if (/^[0-9]+ reviews?$/.test(label)) return true;
+ }
+ return false;
+ """)
+ )
+ log.info("Review count element loaded")
+ except:
+ # Fallback: Try clicking Reviews tab or rating stars to expose the review count
+ log.info("Review count wait timeout, trying to click Reviews/rating...")
+ try:
+ # Try 1: Click Reviews tab (if exists)
+ clicked = driver.execute_script("""
+ var tabs = document.querySelectorAll('[role="tab"]');
+ for (var i = 0; i < tabs.length; i++) {
+ var txt = (tabs[i].textContent || '').toLowerCase();
+ if (txt.includes('review')) {
+ tabs[i].click();
+ return 'tab';
+ }
+ }
+ // Try 2: Click the rating stars element (often links to reviews)
+ var stars = document.querySelector('[role="img"][aria-label*="star"]');
+ if (stars) {
+ var parent = stars.parentElement;
+ if (parent && parent.tagName.toLowerCase() === 'button') {
+ parent.click();
+ return 'stars_button';
+ }
+ stars.click();
+ return 'stars';
+ }
+ // Try 3: Click "Write a review" or any review-related button
+ var btns = document.querySelectorAll('button[aria-label*="review" i]');
+ for (var b = 0; b < btns.length; b++) {
+ var label = btns[b].getAttribute('aria-label') || '';
+ if (!/write/i.test(label) && /review/i.test(label)) {
+ btns[b].click();
+ return 'review_btn: ' + label;
+ }
+ }
+ return 'none';
+ """)
+ log.info(f"Clicked: {clicked}")
+ time.sleep(2) # Wait for reviews panel to load
+ except Exception as e:
+ log.warning(f"Click attempt failed: {e}")
+
except Exception as e:
log.warning(f"Timeout waiting for Maps content: {e}")
- time.sleep(0.5) # Minimal fallback wait
+ time.sleep(2) # Fallback wait
log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")
+ log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...")
+ log.info(f"DEBUG: Page title: {driver.title}")
# Extract business card information using JavaScript
t0 = timing_module.time()
@@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
total_reviews: null
};
- // Extract business name
- const nameSelectors = [
- 'h1.DUwDvf',
- '[role="main"] h1',
- 'h1.fontHeadlineLarge'
- ];
+ // ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============
- for (const selector of nameSelectors) {
- const elem = document.querySelector(selector);
- if (elem && elem.textContent) {
- info.name = elem.textContent.trim();
- break;
- }
- }
+ // Helper: Parse review count from text, handling multiple formats
+ function parseReviewCount(text) {
+ if (!text) return null;
- // Extract address
- const addressSelectors = [
- 'button[data-item-id*="address"]',
- '[data-item-id*="address"]',
- 'div[aria-label*="Address"]'
- ];
-
- for (const selector of addressSelectors) {
- const elem = document.querySelector(selector);
- if (elem && elem.textContent) {
- info.address = elem.textContent.trim();
- break;
- }
- }
-
- // Extract rating (look for aria-label like "4.2 stars")
- const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
- if (ratingElem) {
- const ariaLabel = ratingElem.getAttribute('aria-label');
- const match = ariaLabel.match(/([0-9.]+)/);
+ // Pattern 1: Exact "N reviews" format (aria-labels, clean text)
+ // Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis"
+ var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i);
if (match) {
- info.rating = parseFloat(match[1]);
+ return parseInt(match[1].replace(/[,. ]/g, ''));
}
- }
- // Extract total review count
- const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
- const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
+ // Pattern 2: "(N)" format often used in tabs like "Reviews (27)"
+ match = text.match(/[(]([0-9][0-9,.]*)[)]$/);
+ if (match) {
+ return parseInt(match[1].replace(/[,. ]/g, ''));
+ }
- // PRIORITY 1: Look for review count in search results sidebar/panel
- // This is where "152 reviews" appears on search results
- const searchPanelSelectors = [
- 'a[href*="reviews"]', // Link with "reviews" in href
- 'button[jsaction*="reviews"]', // Button related to reviews
- 'div[role="link"]', // Clickable divs that might contain review info
- ];
-
- for (const selector of searchPanelSelectors) {
- const elements = document.querySelectorAll(selector);
- for (let elem of elements) {
- const text = elem.textContent || '';
- const match = text.match(numberPattern);
+ // Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives)
+ if (text.length < 30) {
+ match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i);
if (match) {
- const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
- if (num > 0 && num < 1000000) {
- info.total_reviews = num;
- break;
- }
+ return parseInt(match[1].replace(/[,. ]/g, ''));
}
}
- if (info.total_reviews) break;
+
+ return null;
}
- // PRIORITY 2: Look in any span/div that contains the word "review"
+ // ============ EXTRACT BUSINESS NAME ============
+ // Priority: h1 (semantic), then role="heading"
+ const h1 = document.querySelector('h1');
+ if (h1 && h1.textContent) {
+ info.name = h1.textContent.trim();
+ }
+ if (!info.name) {
+ const heading = document.querySelector('[role="heading"][aria-level="1"]');
+ if (heading && heading.textContent) {
+ info.name = heading.textContent.trim();
+ }
+ }
+
+ // ============ EXTRACT ADDRESS ============
+ // Priority: data-item-id (semantic), then aria-label containing "address"
+ const addressElem = document.querySelector('[data-item-id*="address"]');
+ if (addressElem && addressElem.textContent) {
+ info.address = addressElem.textContent.trim();
+ }
+ if (!info.address) {
+ const ariaAddress = document.querySelector('[aria-label*="ddress"]');
+ if (ariaAddress && ariaAddress.textContent) {
+ info.address = ariaAddress.textContent.trim();
+ }
+ }
+
+ // ============ EXTRACT RATING ============
+ // Priority: aria-label containing "star" on role="img" elements
+ info._debug_rating_context = [];
+ const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]');
+ for (let elem of ratingElems) {
+ const ariaLabel = elem.getAttribute('aria-label') || '';
+ // Match "4.9 stars" or "4,9 stars" (European format)
+ const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i);
+ if (match) {
+ info.rating = parseFloat(match[1].replace(',', '.'));
+ // DEBUG: Capture parent/sibling context to find review count
+ var parent = elem.parentElement;
+ if (parent) {
+ info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100));
+ var grandparent = parent.parentElement;
+ if (grandparent) {
+ info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100));
+ // Check all children of grandparent for review count
+ var gpChildren = grandparent.querySelectorAll('*');
+ for (var c = 0; c < Math.min(gpChildren.length, 30); c++) {
+ var childText = (gpChildren[c].textContent || '').trim();
+ if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) {
+ info._debug_rating_context.push('GP_CHILD: ' + childText);
+ }
+ }
+ // Also check great-grandparent
+ var ggp = grandparent.parentElement;
+ if (ggp) {
+ info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150));
+ }
+ }
+ // Check siblings
+ var nextSib = parent.nextElementSibling;
+ if (nextSib) {
+ info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100));
+ }
+ }
+ break;
+ }
+ }
+
+ // ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============
+
+ // PRIORITY 1: aria-label with exact "N reviews" format (most reliable)
+ // Google Maps uses aria-label="27 reviews" for accessibility
+ info._debug_aria = [];
+ info._debug_all_numeric = [];
if (!info.total_reviews) {
- const allElements = document.querySelectorAll('span, div, a');
- for (let elem of allElements) {
- const text = elem.textContent || '';
- if (text.length < 100) { // Skip very long text blocks
- const match = text.match(numberPattern);
+ var ariaElems = document.querySelectorAll('[aria-label]');
+ for (var i = 0; i < ariaElems.length; i++) {
+ var ariaLabel = ariaElems[i].getAttribute('aria-label') || '';
+ // Collect all labels containing "review"
+ if (ariaLabel.toLowerCase().indexOf('review') >= 0) {
+ info._debug_aria.push(ariaLabel);
+ }
+ // Collect all labels starting with a digit
+ if (/^[0-9]/.test(ariaLabel)) {
+ info._debug_all_numeric.push(ariaLabel);
+ }
+ var count = parseReviewCount(ariaLabel);
+ if (count && count > 0 && count < 100000) {
+ info.total_reviews = count;
+ info._debug_matched = ariaLabel;
+ break;
+ }
+ }
+ }
+
+ // DEBUG: Find all text with parenthetical numbers like "(27)"
+ info._debug_parens = [];
+ info._debug_short_text = []; // All short text with numbers
+ var allSpans = document.querySelectorAll('span, div, a, button');
+ for (var j = 0; j < Math.min(allSpans.length, 500); j++) {
+ var spanText = allSpans[j].textContent || '';
+ // Capture parenthetical numbers
+ if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) {
+ info._debug_parens.push(spanText.trim());
+ }
+ // Capture ALL short text containing numbers (for debugging)
+ if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) {
+ var cleaned = spanText.trim().replace(/\\s+/g, ' ');
+ if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) {
+ info._debug_short_text.push(cleaned);
+ }
+ }
+ }
+
+ // PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page
+ // This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels
+ if (!info.total_reviews) {
+ var allElems = document.querySelectorAll('*');
+ for (var k = 0; k < Math.min(allElems.length, 1000); k++) {
+ var elem = allElems[k];
+ // Skip if has children (we want leaf nodes only)
+ if (elem.children.length > 0) continue;
+ var txt = (elem.textContent || '').trim();
+ // Look for short text with both numbers and "review" word
+ if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) {
+ var match = txt.match(/([0-9][0-9,]*)/);
if (match) {
- const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
- if (num > 0 && num < 1000000) {
- info.total_reviews = num;
+ var count = parseInt(match[1].replace(/,/g, ''));
+ if (count > 0 && count < 100000) {
+ info.total_reviews = count;
+ info._debug_matched = 'LEAF: ' + txt;
break;
}
}
@@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
}
}
- // PRIORITY 3: Try tabs (for business detail pages)
+ // DEBUG: Collect all tab names
+ info._debug_tabs = [];
+ const tabs = document.querySelectorAll('[role="tab"]');
+ for (let t = 0; t < tabs.length; t++) {
+ info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30));
+ }
+
+ // DEBUG: Collect all buttons with text (might contain review count)
+ info._debug_buttons = [];
+ const buttons = document.querySelectorAll('button');
+ for (let b = 0; b < Math.min(buttons.length, 20); b++) {
+ var btnText = (buttons[b].textContent || '').trim();
+ if (btnText && btnText.length < 40) {
+ info._debug_buttons.push(btnText.substring(0, 40));
+ }
+ }
+
+ // PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count)
if (!info.total_reviews) {
- const tabs = document.querySelectorAll('button[role="tab"]');
for (let tab of tabs) {
- const text = tab.textContent || '';
- let match = text.match(reviewPattern);
- if (match) {
- info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
- break;
+ const text = (tab.textContent || '').trim();
+ // Look for "Reviews" tab with count
+ if (text.toLowerCase().includes('review')) {
+ const count = parseReviewCount(text);
+ if (count && count > 0) {
+ info.total_reviews = count;
+ info._debug_matched = 'TAB: ' + text;
+ break;
+ }
}
- match = text.match(numberPattern);
- if (match) {
- info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
+ }
+ }
+
+ // PRIORITY 2.3: Reviews panel header (after clicking Reviews tab)
+ // Google Maps shows "27 reviews" as heading text in the reviews panel
+ if (!info.total_reviews) {
+ // Look for headings containing review count
+ var headings = document.querySelectorAll('h1, h2, [role="heading"]');
+ for (var h = 0; h < headings.length; h++) {
+ var hText = (headings[h].textContent || '').trim();
+ if (/review/i.test(hText)) {
+ var match = hText.match(/([0-9][0-9,]*)/);
+ if (match) {
+ var count = parseInt(match[1].replace(/,/g, ''));
+ if (count > 0 && count < 100000) {
+ info.total_reviews = count;
+ info._debug_matched = 'HEADING: ' + hText;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // PRIORITY 2.4: Look for sort button area which often has total count
+ // The sort dropdown area displays "Sort: Newest" and total reviews
+ if (!info.total_reviews) {
+ var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]');
+ for (var s = 0; s < sortBtns.length; s++) {
+ var parent = sortBtns[s].parentElement;
+ if (parent) {
+ var pText = (parent.textContent || '').trim();
+ if (/review/i.test(pText)) {
+ var match = pText.match(/([0-9][0-9,]*)\\s*review/i);
+ if (match) {
+ var count = parseInt(match[1].replace(/,/g, ''));
+ if (count > 0 && count < 100000) {
+ info.total_reviews = count;
+ info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50);
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // PRIORITY 3: Elements with semantic review-related attributes
+ if (!info.total_reviews) {
+ const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]');
+ for (let elem of reviewLinks) {
+ const text = (elem.textContent || '').trim();
+ const count = parseReviewCount(text);
+ if (count && count > 0) {
+ info.total_reviews = count;
break;
}
}
}
- // PRIORITY 4: Try aria-labels
+ // PRIORITY 4: Look for standalone review count text near rating
+ // Find elements that contain ONLY "N reviews" pattern (not concatenated with rating)
if (!info.total_reviews) {
- const elements = document.querySelectorAll('[aria-label]');
- for (let elem of elements) {
- const ariaLabel = elem.getAttribute('aria-label') || '';
- let match = ariaLabel.match(reviewPattern);
- if (match) {
- info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
+ const allElements = document.querySelectorAll('span, a');
+ for (let elem of allElements) {
+ // Get direct text content only (not nested children)
+ const text = (elem.textContent || '').trim();
+ // Skip if too long (likely contains other content)
+ if (text.length > 50) continue;
+ // Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews")
+ if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue;
+
+ const count = parseReviewCount(text);
+ if (count && count > 0 && count < 100000) {
+ info.total_reviews = count;
break;
}
- match = ariaLabel.match(numberPattern);
- if (match) {
- info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
- break;
+ }
+ }
+
+ // PRIORITY 5: Parse from visible page text using regex on short text blocks
+ if (!info.total_reviews) {
+ const walker = document.createTreeWalker(
+ document.body,
+ NodeFilter.SHOW_TEXT,
+ null,
+ false
+ );
+ while (walker.nextNode()) {
+ const text = walker.currentNode.textContent.trim();
+ if (text.length >= 5 && text.length <= 30) {
+ // Match "27 reviews" but not "4.927 reviews"
+ const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i);
+ if (match) {
+ const count = parseInt(match[1].replace(/[,]/g, ''));
+ if (count > 0 && count < 100000) {
+ info.total_reviews = count;
+ info._debug_matched = 'WALKER: ' + text;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts)
+ if (!info.total_reviews) {
+ var scripts = document.querySelectorAll('script');
+ for (var sc = 0; sc < scripts.length; sc++) {
+ var scriptText = scripts[sc].textContent || '';
+ // Look for patterns like "user_reviews":{"count":27} or reviews_count":27
+ var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i);
+ if (jsonMatch) {
+ var count = parseInt(jsonMatch[1]);
+ if (count > 0 && count < 100000) {
+ info.total_reviews = count;
+ info._debug_matched = 'JSON_SCRIPT';
+ break;
+ }
+ }
+ // Also look for review count in Google's data format like [\"27 reviews\"]
+ if (!info.total_reviews) {
+ var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i);
+ if (dataMatch) {
+ var count = parseInt(dataMatch[1]);
+ if (count > 0 && count < 100000) {
+ info.total_reviews = count;
+ info._debug_matched = 'JSON_DATA: ' + dataMatch[0];
+ break;
+ }
+ }
}
}
}
@@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
log.info(f"Business card extracted: name={business_info.get('name')}, "
f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
+ # Debug: log what aria-labels were found
+ if business_info.get('_debug_aria'):
+ log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}")
+ if business_info.get('_debug_matched'):
+ log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}")
+ # Also log all numeric aria-labels (potential review counts)
+ if business_info.get('_debug_all_numeric'):
+ log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}")
+ # Log any text with parenthetical numbers like "(27)"
+ if business_info.get('_debug_parens'):
+ log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}")
+ # Log all short text containing numbers (for debugging review count detection)
+ if business_info.get('_debug_short_text'):
+ log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}")
+ # Log the context around the rating element
+ if business_info.get('_debug_rating_context'):
+ for ctx in business_info.get('_debug_rating_context', []):
+ log.info(f"DEBUG: Rating context: {ctx}")
+ # Log what tabs exist on the page
+ if business_info.get('_debug_tabs'):
+ log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}")
+ else:
+ log.info(f"DEBUG: No tabs found on page")
+ # Log buttons (might contain review count)
+ if business_info.get('_debug_buttons'):
+ log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}")
result = {
"name": business_info.get('name'),
diff --git a/modules/job_manager.py b/modules/job_manager.py
deleted file mode 100644
index 19e4bfc..0000000
--- a/modules/job_manager.py
+++ /dev/null
@@ -1,407 +0,0 @@
-"""
-Background job manager for Google Reviews Scraper.
-"""
-
-import asyncio
-import logging
-import threading
-import time
-import uuid
-from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime
-from enum import Enum
-from typing import Dict, Any, Optional, List
-from dataclasses import dataclass, asdict
-
-from modules.config import load_config
-from modules.scraper import GoogleReviewsScraper
-from modules.scraper_clean import fast_scrape_reviews # Updated to use clean scraper with hard refresh recovery
-from modules.chrome_pool import get_scraping_worker, release_scraping_worker
-
-log = logging.getLogger("scraper")
-
-
-class JobStatus(str, Enum):
- """Job status enumeration"""
- PENDING = "pending"
- RUNNING = "running"
- COMPLETED = "completed"
- FAILED = "failed"
- CANCELLED = "cancelled"
-
-
-@dataclass
-class ScrapingJob:
- """Scraping job data class"""
- job_id: str
- status: JobStatus
- url: str
- config: Dict[str, Any]
- created_at: datetime
- started_at: Optional[datetime] = None
- completed_at: Optional[datetime] = None
- updated_at: Optional[datetime] = None # Last update time (for progress tracking)
- error_message: Optional[str] = None
- reviews_count: Optional[int] = None
- total_reviews: Optional[int] = None # Total reviews available (from page counter)
- images_count: Optional[int] = None
- progress: Dict[str, Any] = None
- reviews_data: Optional[List[Dict[str, Any]]] = None # Store actual review data
- scrape_time: Optional[float] = None # Time taken to scrape
-
- def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
- """
- Convert job to dictionary for JSON serialization
-
- Args:
- include_reviews: Whether to include the full reviews data (default: False)
- """
- data = asdict(self)
- # Convert datetime objects to ISO strings
- for field in ['created_at', 'started_at', 'completed_at']:
- if data[field]:
- data[field] = data[field].isoformat()
-
- # Exclude reviews_data by default (can be large)
- if not include_reviews:
- data.pop('reviews_data', None)
-
- return data
-
-
-class JobManager:
- """Manager for background scraping jobs"""
-
- def __init__(self, max_concurrent_jobs: int = 3):
- """Initialize job manager"""
- self.max_concurrent_jobs = max_concurrent_jobs
- self.jobs: Dict[str, ScrapingJob] = {}
- self.executor = ThreadPoolExecutor(max_workers=max_concurrent_jobs)
- self.lock = threading.Lock()
-
- def create_job(self, url: str, config_overrides: Dict[str, Any] = None) -> str:
- """
- Create a new scraping job.
-
- Args:
- url: Google Maps URL to scrape
- config_overrides: Optional config overrides
-
- Returns:
- Job ID
- """
- job_id = str(uuid.uuid4())
-
- # Load base config
- config = load_config()
-
- # Apply URL
- config["url"] = url
-
- # Apply any overrides
- if config_overrides:
- config.update(config_overrides)
-
- job = ScrapingJob(
- job_id=job_id,
- status=JobStatus.PENDING,
- url=url,
- config=config,
- created_at=datetime.now(),
- progress={"stage": "created", "message": "Job created and queued"}
- )
-
- with self.lock:
- self.jobs[job_id] = job
-
- log.info(f"Created scraping job {job_id} for URL: {url}")
- return job_id
-
- def start_job(self, job_id: str) -> bool:
- """
- Start a pending job.
-
- Args:
- job_id: Job ID to start
-
- Returns:
- True if job was started, False otherwise
- """
- with self.lock:
- if job_id not in self.jobs:
- return False
-
- job = self.jobs[job_id]
- if job.status != JobStatus.PENDING:
- return False
-
- # Check if we can start more jobs
- running_count = sum(1 for j in self.jobs.values() if j.status == JobStatus.RUNNING)
- if running_count >= self.max_concurrent_jobs:
- return False
-
- job.status = JobStatus.RUNNING
- job.started_at = datetime.now()
- job.updated_at = datetime.now()
- job.progress = {"stage": "starting", "message": "Initializing scraper"}
-
- # Submit job to thread pool
- future = self.executor.submit(self._run_scraping_job, job_id)
-
- log.info(f"Started scraping job {job_id}")
- return True
-
- def _run_scraping_job(self, job_id: str):
- """
- Run the actual scraping job in background thread.
-
- Args:
- job_id: Job ID to run
- """
- def progress_callback(current_count: int, total_count: int):
- """Update job progress during scraping"""
- with self.lock:
- job = self.jobs.get(job_id)
- if job:
- job.reviews_count = current_count
- job.total_reviews = total_count
- job.updated_at = datetime.now() # Update last update time
- # Calculate percentage for better UX
- percentage = int((current_count / total_count * 100)) if total_count > 0 else 0
- job.progress = {
- "stage": "scraping",
- "message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
- "percentage": percentage
- }
-
- worker = None
- try:
- with self.lock:
- job = self.jobs[job_id]
- job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}
-
- # Get a worker from the scraping pool
- worker = get_scraping_worker(timeout=30)
-
- if not worker:
- raise Exception("No Chrome workers available. Pool may be at capacity.")
-
- log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")
-
- # Get config
- url = job.config.get('url')
- headless = job.config.get('headless', True) # Default to headless
- max_scrolls = job.config.get('max_scrolls', 999999) # Effectively unlimited - relies on idle detection
-
- with self.lock:
- job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}
-
- # Run the FAST scraping with progress callback using pooled worker
- result = fast_scrape_reviews(
- url=url,
- headless=headless,
- max_scrolls=max_scrolls,
- progress_callback=progress_callback,
- driver=worker.driver, # Use worker's driver
- return_driver=True # Don't close the driver
- )
-
- # Pop the driver from result before storing
- result.pop('driver', None)
-
- # Mark job as completed or failed
- with self.lock:
- if result['success']:
- job.status = JobStatus.COMPLETED
- job.completed_at = datetime.now()
- job.updated_at = datetime.now()
- job.reviews_count = result['count']
- job.total_reviews = result.get('total_reviews') # Store total review count from page
- job.reviews_data = result['reviews'] # Store the actual reviews
- job.scrape_time = result['time']
- job.progress = {
- "stage": "completed",
- "message": f"Scraping completed successfully in {result['time']:.1f}s",
- "scroll_time": result.get('scroll_time'),
- "extract_time": result.get('extract_time')
- }
- log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
- else:
- job.status = JobStatus.FAILED
- job.completed_at = datetime.now()
- job.updated_at = datetime.now()
- job.error_message = result.get('error', 'Unknown error')
- job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
- log.error(f"Failed scraping job {job_id}: {result.get('error')}")
-
- except Exception as e:
- log.error(f"Error in scraping job {job_id}: {e}")
- import traceback
- traceback.print_exc()
-
- with self.lock:
- job = self.jobs[job_id]
- job.status = JobStatus.FAILED
- job.completed_at = datetime.now()
- job.updated_at = datetime.now()
- job.error_message = str(e)
- job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}
-
- # Recycle worker on error
- if worker:
- log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
- release_scraping_worker(worker, recycle=True)
- worker = None # Mark as released
-
- finally:
- # Release worker back to pool if not already released
- if worker:
- log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
- release_scraping_worker(worker, recycle=False)
-
- def get_job(self, job_id: str) -> Optional[ScrapingJob]:
- """
- Get job by ID.
-
- Args:
- job_id: Job ID
-
- Returns:
- Job object or None if not found
- """
- with self.lock:
- return self.jobs.get(job_id)
-
- def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
- """
- Get reviews data for a specific job.
-
- Args:
- job_id: Job ID
-
- Returns:
- List of reviews or None if not found/not completed
- """
- with self.lock:
- job = self.jobs.get(job_id)
- if job and job.status == JobStatus.COMPLETED:
- return job.reviews_data
- return None
-
- def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
- """
- List jobs, optionally filtered by status.
-
- Args:
- status: Optional status filter
- limit: Maximum number of jobs to return
-
- Returns:
- List of jobs
- """
- with self.lock:
- jobs = list(self.jobs.values())
-
- if status:
- jobs = [job for job in jobs if job.status == status]
-
- # Sort by creation time (newest first)
- jobs.sort(key=lambda x: x.created_at, reverse=True)
-
- return jobs[:limit]
-
- def cancel_job(self, job_id: str) -> bool:
- """
- Cancel a pending or running job.
-
- Args:
- job_id: Job ID to cancel
-
- Returns:
- True if job was cancelled, False otherwise
- """
- with self.lock:
- if job_id not in self.jobs:
- return False
-
- job = self.jobs[job_id]
- if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
- return False
-
- job.status = JobStatus.CANCELLED
- job.completed_at = datetime.now()
- job.updated_at = datetime.now()
- job.progress = {"stage": "cancelled", "message": "Job was cancelled"}
-
- log.info(f"Cancelled scraping job {job_id}")
- return True
-
- def delete_job(self, job_id: str) -> bool:
- """
- Delete a job from the manager.
-
- Args:
- job_id: Job ID to delete
-
- Returns:
- True if job was deleted, False otherwise
- """
- with self.lock:
- if job_id not in self.jobs:
- return False
- del self.jobs[job_id]
-
- log.info(f"Deleted scraping job {job_id}")
- return True
-
- def get_stats(self) -> Dict[str, Any]:
- """
- Get job manager statistics.
-
- Returns:
- Statistics dictionary
- """
- with self.lock:
- jobs = list(self.jobs.values())
-
- stats = {
- "total_jobs": len(jobs),
- "by_status": {},
- "running_jobs": 0,
- "max_concurrent_jobs": self.max_concurrent_jobs
- }
-
- for status in JobStatus:
- count = sum(1 for job in jobs if job.status == status)
- stats["by_status"][status.value] = count
-
- stats["running_jobs"] = stats["by_status"].get(JobStatus.RUNNING.value, 0)
-
- return stats
-
- def cleanup_old_jobs(self, max_age_hours: int = 24):
- """
- Clean up old completed/failed jobs.
-
- Args:
- max_age_hours: Maximum age in hours before cleanup
- """
- cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)
-
- with self.lock:
- to_delete = []
- for job_id, job in self.jobs.items():
- if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
- if job.completed_at and job.completed_at.timestamp() < cutoff_time:
- to_delete.append(job_id)
-
- for job_id in to_delete:
- del self.jobs[job_id]
-
- if to_delete:
- log.info(f"Cleaned up {len(to_delete)} old jobs")
-
- def shutdown(self):
- """Shutdown the job manager"""
- log.info("Shutting down job manager")
- self.executor.shutdown(wait=True)
\ No newline at end of file
diff --git a/modules/scraper.py b/modules/scraper.py
deleted file mode 100644
index d2c20be..0000000
--- a/modules/scraper.py
+++ /dev/null
@@ -1,2335 +0,0 @@
-"""
-Selenium scraping logic for Google Maps Reviews.
-Uses SeleniumBase UC Mode for enhanced anti-detection and better Chrome version management.
-"""
-
-import logging
-import os
-import platform
-import re
-import time
-import traceback
-import threading
-from typing import Dict, Any, List, Optional, Tuple
-
-from seleniumbase import Driver
-from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
-from selenium.webdriver import Chrome
-from selenium.webdriver.common.action_chains import ActionChains
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.keys import Keys
-from selenium.webdriver.remote.webelement import WebElement
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-from tqdm import tqdm
-
-from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
-from modules.models import RawReview
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-# Logger
-log = logging.getLogger("scraper")
-
-# CSS Selectors (Updated January 2026 for current Google Maps structure)
-PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
-CARD_SEL = "div.jftiEf" # Review card container
-# Cookie/consent dialog selectors (Updated January 2026)
-COOKIE_BTN = ('button[aria-label*="Accept" i],'
- 'button[aria-label*="Aceptar" i],'
- 'button[aria-label*="Akzeptieren" i],'
- 'button[aria-label*="Aceitar" i],'
- 'button[jsname="higCR"],' # Google's "Accept all" button
- 'button[jsname="hZCF7e"],'
- 'button[data-mdc-dialog-action="accept"],'
- 'form[action*="consent"] button,'
- 'div[role="dialog"] button[jsname],'
- '.VfPpkd-LgbsSe[data-mdc-dialog-action="accept"]')
-SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
-MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
-
-SORT_OPTIONS = {
- "newest": (
- "Newest", "החדשות ביותר", "ใหม่ที่สุด", "最新", "Más recientes", "最近",
- "Mais recentes", "Neueste", "Plus récent", "Più recenti", "Nyeste",
- "Новые", "Nieuwste", "جديد", "Nyeste", "Uusimmat", "Najnowsze",
- "Senaste", "Terbaru", "Yakın zamanlı", "Mới nhất", "नवीनतम"
- ),
- "highest": (
- "Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด", "最高評価",
- "Calificación más alta", "最高评分", "Melhor avaliação", "Höchste Bewertung",
- "Note la plus élevée", "Valutazione più alta", "Høyeste vurdering",
- "Наивысший рейтинг", "Hoogste waardering", "أعلى تقييم", "Højeste vurdering",
- "Korkein arvostelu", "Najwyższa ocena", "Högsta betyg", "Peringkat tertinggi",
- "En yüksek puan", "Đánh giá cao nhất", "उच्चतम रेटिंग", "Top rating"
- ),
- "lowest": (
- "Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด", "最低評価",
- "Calificación más baja", "最低评分", "Pior avaliação", "Niedrigste Bewertung",
- "Note la plus basse", "Valutazione più bassa", "Laveste vurdering",
- "Наименьший рейтинг", "Laagste waardering", "أقل تقييم", "Laveste vurdering",
- "Alhaisin arvostelu", "Najniższa ocena", "Lägsta betyg", "Peringkat terendah",
- "En düşük puan", "Đánh giá thấp nhất", "निम्नतम रेटिंग", "Worst rating"
- ),
- "relevance": (
- "Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด", "関連性",
- "Más relevantes", "最相关", "Mais relevantes", "Relevanteste",
- "Plus pertinents", "Più pertinenti", "Mest relevante",
- "Наиболее релевантные", "Meest relevant", "الأكثر صلة", "Mest relevante",
- "Olennaisimmat", "Najbardziej trafne", "Mest relevanta", "Paling relevan",
- "En alakalı", "Liên quan nhất", "सबसे प्रासंगिक", "Relevance"
- )
-}
-
-# Comprehensive multi-language review keywords
-REVIEW_WORDS = {
- # English
- "reviews", "review", "ratings", "rating",
-
- # Hebrew
- "ביקורות", "ביקורת", "ביקורות על", "דירוגים", "דירוג",
-
- # Thai
- "รีวิว", "บทวิจารณ์", "คะแนน", "ความคิดเห็น",
-
- # Spanish
- "reseñas", "opiniones", "valoraciones", "críticas", "calificaciones",
-
- # French
- "avis", "commentaires", "évaluations", "critiques", "notes",
-
- # German
- "bewertungen", "rezensionen", "beurteilungen", "meinungen", "kritiken",
-
- # Italian
- "recensioni", "valutazioni", "opinioni", "giudizi", "commenti",
-
- # Portuguese
- "avaliações", "comentários", "opiniões", "análises", "críticas",
-
- # Russian
- "отзывы", "рецензии", "обзоры", "оценки", "комментарии",
-
- # Japanese
- "レビュー", "口コミ", "評価", "批評", "感想",
-
- # Korean
- "리뷰", "평가", "후기", "댓글", "의견",
-
- # Chinese (Simplified and Traditional)
- "评论", "評論", "点评", "點評", "评价", "評價", "意见", "意見", "回顾", "回顧",
-
- # Arabic
- "مراجعات", "تقييمات", "آراء", "تعليقات", "نقد",
-
- # Hindi
- "समीक्षा", "रिव्यू", "राय", "मूल्यांकन", "प्रतिक्रिया",
-
- # Turkish
- "yorumlar", "değerlendirmeler", "incelemeler", "görüşler", "puanlar",
-
- # Dutch
- "beoordelingen", "recensies", "meningen", "opmerkingen", "waarderingen",
-
- # Polish
- "recenzje", "opinie", "oceny", "komentarze", "uwagi",
-
- # Vietnamese
- "đánh giá", "nhận xét", "bình luận", "phản hồi", "bài đánh giá",
-
- # Indonesian
- "ulasan", "tinjauan", "komentar", "penilaian", "pendapat",
-
- # Swedish
- "recensioner", "betyg", "omdömen", "åsikter", "kommentarer",
-
- # Norwegian
- "anmeldelser", "vurderinger", "omtaler", "meninger", "tilbakemeldinger",
-
- # Danish
- "anmeldelser", "bedømmelser", "vurderinger", "meninger", "kommentarer",
-
- # Finnish
- "arvostelut", "arviot", "kommentit", "mielipiteet", "palautteet",
-
- # Greek
- "κριτικές", "αξιολογήσεις", "σχόλια", "απόψεις", "βαθμολογίες",
-
- # Czech
- "recenze", "hodnocení", "názory", "komentáře", "posudky",
-
- # Romanian
- "recenzii", "evaluări", "opinii", "comentarii", "note",
-
- # Hungarian
- "vélemények", "értékelések", "kritikák", "hozzászólások", "megjegyzések",
-
- # Bulgarian
- "отзиви", "ревюта", "мнения", "коментари", "оценки"
-}
-
-
-class GoogleReviewsScraper:
- """Main scraper class for Google Maps reviews"""
-
- def __init__(self, config: Dict[str, Any]):
- """Initialize scraper with configuration"""
- self.config = config
- self.use_mongodb = config.get("use_mongodb", True)
- self.mongodb = MongoDBStorage(config) if self.use_mongodb else None
- self.json_storage = JSONStorage(config)
- self.backup_to_json = config.get("backup_to_json", True)
- self.overwrite_existing = config.get("overwrite_existing", False)
- self.enable_api_intercept = config.get("enable_api_intercept", False)
- self.api_interceptor = None # Will be initialized when driver is ready
-
- def setup_driver(self, headless: bool):
- """
- Set up and configure Chrome driver using SeleniumBase UC Mode.
- SeleniumBase provides enhanced anti-detection and automatic Chrome/ChromeDriver version management.
- Works in both Docker containers and on regular OS installations (Windows, Mac, Linux).
- """
- # Log platform information for debugging
- log.info(f"Platform: {platform.platform()}")
- log.info(f"Python version: {platform.python_version()}")
- log.info("Using SeleniumBase UC Mode for enhanced anti-detection")
-
- # Determine if we're running in a container
- in_container = os.environ.get('CHROME_BIN') is not None
-
- if in_container:
- chrome_binary = os.environ.get('CHROME_BIN')
- log.info(f"Container environment detected")
- log.info(f"Chrome binary: {chrome_binary}")
-
- # Create driver with custom binary location for containers
- if chrome_binary and os.path.exists(chrome_binary):
- try:
- driver = Driver(
- uc=True,
- headless=headless,
- binary_location=chrome_binary,
- page_load_strategy="normal"
- )
- log.info("Successfully created SeleniumBase UC driver with custom binary")
- except Exception as e:
- log.warning(f"Failed to create driver with custom binary: {e}")
- # Fall back to default
- driver = Driver(
- uc=True,
- headless=headless,
- page_load_strategy="normal"
- )
- log.info("Successfully created SeleniumBase UC driver with defaults")
- else:
- driver = Driver(
- uc=True,
- headless=headless,
- page_load_strategy="normal"
- )
- log.info("Successfully created SeleniumBase UC driver")
- else:
- # Regular OS environment - SeleniumBase handles version matching automatically
- log.info("Creating SeleniumBase UC Mode driver")
- try:
- driver = Driver(
- uc=True,
- headless=headless,
- page_load_strategy="normal",
- incognito=True # Use incognito mode for better stealth
- )
- log.info("Successfully created SeleniumBase UC driver")
- except Exception as e:
- log.error(f"Failed to create SeleniumBase driver: {e}")
- raise
-
- # Set page load timeout to avoid hanging
- driver.set_page_load_timeout(30)
-
- # Set window size
- driver.set_window_size(1400, 900)
-
- # Add additional stealth settings
- try:
- # Disable automation flags
- driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
- 'source': '''
- Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
- Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
- Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
- '''
- })
- log.info("Additional stealth settings applied")
- except Exception as e:
- log.debug(f"Could not apply additional stealth settings: {e}")
-
- log.info("SeleniumBase UC driver setup completed successfully")
- return driver
-
- def dismiss_cookies(self, driver: Chrome):
- """
- Dismiss cookie consent dialogs if present.
- Handles stale element references by re-finding elements if needed.
- Updated January 2026 to handle current Google consent dialogs.
- """
- dismissed = False
-
- # Try multiple approaches to dismiss consent dialogs
- consent_selectors = [
- COOKIE_BTN,
- # Additional Google consent selectors
- 'button[aria-label*="Accept all" i]',
- 'button[aria-label*="Aceptar todo" i]',
- 'button[aria-label*="Reject all" i]', # Sometimes we need to reject
- 'button:has-text("Accept")',
- 'button:has-text("Aceptar")',
- '[role="dialog"] button:first-of-type',
- 'form[action*="consent"] button:first-of-type',
- ]
-
- for selector in consent_selectors:
- try:
- elements = driver.find_elements(By.CSS_SELECTOR, selector)
- for elem in elements:
- try:
- if elem.is_displayed() and elem.is_enabled():
- # Try JavaScript click first (more reliable)
- driver.execute_script("arguments[0].click();", elem)
- log.info(f"Cookie/consent dialog dismissed with selector: {selector}")
- time.sleep(0.3) # Reduced from 1s to 0.3s
- dismissed = True
- break
- except Exception as e:
- log.debug(f"Error clicking consent button: {e}")
- continue
- if dismissed:
- break
- except Exception as e:
- log.debug(f"Error finding consent elements with {selector}: {e}")
- continue
-
- # Also try to find and click any visible modal close buttons
- if not dismissed:
- try:
- close_btns = driver.find_elements(By.CSS_SELECTOR,
- '[role="dialog"] button[aria-label*="close" i], '
- '[role="dialog"] button[aria-label*="cerrar" i], '
- '.modal-close, .dialog-close')
- for btn in close_btns:
- if btn.is_displayed():
- driver.execute_script("arguments[0].click();", btn)
- log.info("Closed modal dialog")
- dismissed = True
- break
- except Exception:
- pass
-
- return dismissed
-
- def is_reviews_tab(self, tab: WebElement) -> bool:
- """
- Dynamically detect if an element is the reviews tab across multiple languages and layouts.
- Uses multiple detection approaches for maximum reliability.
- """
- try:
- # Strategy 1: Data attribute detection (most reliable across languages)
- tab_index = tab.get_attribute("data-tab-index")
- if tab_index == "1" or tab_index == "reviews":
- return True
-
- # Strategy 2: Role and aria attributes (accessibility detection)
- role = tab.get_attribute("role")
- aria_selected = tab.get_attribute("aria-selected")
- aria_label = (tab.get_attribute("aria-label") or "").lower()
-
- # Many review tabs have role="tab" and data attributes
- if role == "tab" and any(word in aria_label for word in REVIEW_WORDS):
- return True
-
- # Strategy 3: Text content detection (multiple sources)
- sources = [
- tab.text.lower() if tab.text else "", # Direct text
- aria_label, # ARIA label
- tab.get_attribute("innerHTML").lower() or "", # Inner HTML
- tab.get_attribute("textContent").lower() or "" # Text content
- ]
-
- # Check all sources against our comprehensive keyword list
- for source in sources:
- if any(word in source for word in REVIEW_WORDS):
- return True
-
- # Strategy 4: Nested element detection
- try:
- # Check text in all child elements
- for child in tab.find_elements(By.CSS_SELECTOR, "*"):
- try:
- child_text = child.text.lower() if child.text else ""
- child_content = child.get_attribute("textContent").lower() or ""
-
- if any(word in child_text for word in REVIEW_WORDS) or any(
- word in child_content for word in REVIEW_WORDS):
- return True
- except:
- continue
- except:
- pass
-
- # Strategy 5: URL detection (some tabs have hrefs or data-hrefs with tell-tale values)
- for attr in ["href", "data-href", "data-url", "data-target"]:
- attr_value = (tab.get_attribute(attr) or "").lower()
- if attr_value and ("review" in attr_value or "rating" in attr_value):
- return True
-
- # Strategy 6: Class detection (some review tabs have specific classes)
- tab_class = tab.get_attribute("class") or ""
- review_classes = ["review", "reviews", "rating", "ratings", "comments", "feedback", "g4jrve"]
- if any(cls in tab_class for cls in review_classes):
- return True
-
- return False
-
- except StaleElementReferenceException:
- return False
- except Exception as e:
- log.debug(f"Error in is_reviews_tab: {e}")
- return False
-
- def click_reviews_tab(self, driver: Chrome):
- """
- Navigate to reviews section by clicking the Reviews tab/button on the page.
- Uses text-based detection (what humans see) as primary method for robustness.
- """
- current_url = driver.current_url
-
- # PRIMARY METHOD: Look for text-based "Reviews" button/tab (what humans see)
- log.info("Trying to find Reviews tab by visible text...")
- max_timeout = 15
- end_time = time.time() + max_timeout
-
- for language_keyword in REVIEW_WORDS:
- if time.time() > end_time:
- break
-
- try:
- # Try XPath that finds elements containing the text (case-insensitive)
- # This includes divs with aria-hidden="true" that contain "Reviews"
- xpath = f"//*[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]"
- elements = driver.find_elements(By.XPATH, xpath)
-
- for element in elements:
- try:
- element_text = (element.text or '').strip()
- if not element_text or len(element_text) > 50:
- continue
-
- tag_name = element.tag_name.lower()
- role = element.get_attribute('role') or ''
- aria_hidden = element.get_attribute('aria-hidden')
-
- # If this is a div with aria-hidden="true" containing "Reviews",
- # try to click its parent button/clickable element
- if tag_name == 'div' and aria_hidden == 'true':
- log.info(f"Found aria-hidden div with text: '{element_text}', looking for clickable parent")
- # Try parent element
- try:
- parent = driver.execute_script("return arguments[0].parentElement;", element)
- parent_tag = parent.tag_name.lower() if parent else ''
- parent_role = parent.get_attribute('role') if parent else ''
-
- if parent and (parent_tag in ['button', 'a'] or 'tab' in parent_role or 'button' in parent_role):
- log.info(f"Found clickable parent: {parent_tag} with role={parent_role}")
- driver.execute_script("arguments[0].scrollIntoView({block:'center', behavior:'smooth'});", parent)
- time.sleep(0.5)
- driver.execute_script("arguments[0].click();", parent)
- time.sleep(3)
-
- if self.verify_reviews_tab_clicked(driver):
- log.info(f"✅ Successfully clicked Reviews via aria-hidden parent")
- return True
- except:
- pass
-
- # Try clicking the element directly if it's clickable
- elif tag_name in ['button', 'a'] or 'tab' in role or 'button' in role:
- log.info(f"Found clickable Reviews element: '{element_text}' (tag: {tag_name}, role: {role})")
-
- driver.execute_script("arguments[0].scrollIntoView({block:'center', behavior:'smooth'});", element)
- time.sleep(0.5)
- driver.execute_script("arguments[0].click();", element)
- time.sleep(3)
-
- if self.verify_reviews_tab_clicked(driver):
- log.info(f"✅ Successfully clicked Reviews via text: '{element_text}'")
- return True
- except:
- continue
- except:
- continue
-
- # FALLBACK METHOD: Find aria-hidden divs with exact text "Reviews" (or language variants)
- log.info("Trying aria-hidden div detection as fallback...")
- try:
- # Look for divs with aria-hidden="true" that contain ONLY the review word (no extra text)
- divs = driver.find_elements(By.CSS_SELECTOR, 'div[aria-hidden="true"]')
-
- for div in divs:
- div_text = (div.text or '').strip()
-
- # Check if this div contains ONLY a review keyword (exact match, case-insensitive)
- for keyword in REVIEW_WORDS:
- if div_text.lower() == keyword.lower():
- log.info(f"Found aria-hidden div with exact text: '{div_text}'")
-
- # Get the parent element (should be the clickable tab/button)
- try:
- parent = driver.execute_script("return arguments[0].parentElement;", div)
- if parent:
- parent_tag = parent.tag_name.lower()
- parent_role = parent.get_attribute('role') or ''
-
- log.info(f"Parent element: tag={parent_tag}, role={parent_role}")
-
- # Click the parent if it looks clickable
- driver.execute_script("arguments[0].scrollIntoView({block:'center'});", parent)
- time.sleep(0.5)
- driver.execute_script("arguments[0].click();", parent)
- time.sleep(2)
-
- if self.verify_reviews_tab_clicked(driver):
- log.info(f"✅ Successfully clicked Reviews via aria-hidden fallback")
- return True
- except Exception as e:
- log.debug(f"Error clicking parent of aria-hidden div: {e}")
- continue
- except Exception as e:
- log.debug(f"Error in aria-hidden fallback: {e}")
-
- # If all methods failed
- log.warning("Failed to navigate to reviews after trying all methods")
- raise TimeoutException("Could not navigate to reviews section")
-
- def verify_reviews_tab_clicked(self, driver: Chrome) -> bool:
- """
- Verify that the reviews tab was successfully clicked.
- Uses robust verification methods that don't depend on fragile CSS classes.
- """
- try:
- # METHOD 1: Check for text-based indicators (most robust)
- # Look for common review-related text that appears regardless of CSS changes
- page_text = driver.page_source.lower()
-
- # These text patterns appear when reviews section is active
- review_indicators = [
- 'sort reviews',
- 'most relevant',
- 'newest',
- 'highest rating',
- 'lowest rating',
- ]
-
- for indicator in review_indicators:
- if indicator in page_text:
- log.debug(f"Found review indicator: '{indicator}'")
- return True
-
- # METHOD 2: Check for semantic attributes (stable)
- # Look for elements with review-specific attributes
- semantic_selectors = [
- 'div[data-review-id]', # Review cards have data-review-id
- 'button[aria-label*="Sort" i]', # Sort button
- 'span[role="img"][aria-label*="star" i]', # Star ratings
- ]
-
- for selector in semantic_selectors:
- elements = driver.find_elements(By.CSS_SELECTOR, selector)
- if elements and len(elements) > 0:
- log.debug(f"Found semantic element: {selector}")
- return True
-
- # URL check - if "review" appears in the URL
- if "review" in driver.current_url.lower():
- return True
-
- return False
- except Exception as e:
- log.debug(f"Error verifying reviews tab click: {e}")
- return False
-
- def set_sort(self, driver: Chrome, method: str):
- """
- Set the sorting method for reviews with enhanced detection for the latest Google Maps UI.
- Works across different languages and UI variations, with robust error handling.
- """
- if method == "relevance":
- log.info("Using default 'relevance' sort - no need to change sort order")
- return True # Default order, no need to change
-
- log.info(f"Attempting to set sort order to '{method}'")
-
- try:
- # 1. Find and click the sort button using ROBUST TEXT-BASED DETECTION
- # Multi-language sort button keywords (what humans see)
- sort_keywords = {
- 'en': ['sort', 'Sort', 'SORT'],
- 'he': ['סדר', 'סידור'],
- 'th': ['เรียง'],
- 'zh': ['排序'],
- 'fr': ['trier', 'Trier'],
- 'es': ['ordenar', 'Ordenar'],
- 'de': ['sortieren', 'Sortieren'],
- 'pt': ['Classificar'],
- 'it': ['Ordina'],
- 'ru': ['Сортировать']
- }
-
- # Flatten all keywords
- all_sort_keywords = [kw for keywords in sort_keywords.values() for kw in keywords]
-
- # PRIMARY METHOD: Find buttons by text or aria-label (robust)
- sort_button = None
- log.info("Looking for sort button using text-based detection...")
-
- for keyword in all_sort_keywords:
- try:
- # XPath to find buttons containing the keyword (case-sensitive for non-English)
- xpath = f"//button[contains(text(), '{keyword}') or contains(@aria-label, '{keyword}')]"
- elements = driver.find_elements(By.XPATH, xpath)
-
- for element in elements:
- try:
- # Skip invisible/disabled elements
- if not element.is_displayed() or not element.is_enabled():
- continue
-
- # Get button text and attributes for verification
- button_text = element.text.strip() if element.text else ""
- button_aria = element.get_attribute("aria-label") or ""
-
- # Skip buttons that are clearly not sort buttons
- negative_keywords = ["back", "next", "previous", "close", "cancel", "חזרה", "סגור", "ปิด"]
- if any(neg in button_text.lower() or neg in button_aria.lower() for neg in negative_keywords):
- continue
-
- # Verify it has dropdown attributes (sort buttons are typically dropdowns)
- has_dropdown = (element.get_attribute("aria-haspopup") == "true" or
- element.get_attribute("aria-expanded") is not None)
-
- if has_dropdown or keyword in button_text or keyword in button_aria:
- sort_button = element
- log.info(f"✅ Found sort button with text: '{button_text}' or aria-label: '{button_aria}'")
- break
-
- except Exception as e:
- log.debug(f"Error checking element: {e}")
- continue
-
- if sort_button:
- break
-
- except Exception as e:
- log.debug(f"Error with keyword '{keyword}': {e}")
- continue
-
- # FALLBACK METHOD: Find any button with dropdown attributes near review content
- if not sort_button:
- log.info("Trying fallback: finding buttons with dropdown attributes...")
- try:
- buttons = driver.find_elements(By.CSS_SELECTOR, 'button[aria-haspopup="true"]')
-
- for button in buttons:
- if not button.is_displayed() or not button.is_enabled():
- continue
-
- button_text = (button.text or '').strip().lower()
- button_aria = (button.get_attribute("aria-label") or '').lower()
-
- # Look for any sort-related keywords
- if any(kw.lower() in button_text or kw.lower() in button_aria for kw in all_sort_keywords):
- sort_button = button
- log.info(f"✅ Found sort button via fallback: {button.text}")
- break
-
- except Exception as e:
- log.debug(f"Error in fallback method: {e}")
-
- # Final check - do we have a sort button?
- if not sort_button:
- log.warning("No sort button found with any method - keeping default sort order")
- return False
-
- # 2. Click the sort button to open dropdown menu
-
- # First ensure the button is in view
- driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", sort_button)
- time.sleep(0.8) # Wait for scroll
-
- # Try multiple click methods
- click_methods = [
- # Method 1: JavaScript click
- lambda: driver.execute_script("arguments[0].click();", sort_button),
-
- # Method 2: Direct click
- lambda: sort_button.click(),
-
- # Method 3: ActionChains click with move first
- lambda: ActionChains(driver).move_to_element(sort_button).pause(0.3).click().perform(),
-
- # Method 4: Click on center of element
- lambda: ActionChains(driver).move_to_element_with_offset(
- sort_button, sort_button.size['width'] // 2, sort_button.size['height'] // 2
- ).click().perform(),
-
- # Method 5: JavaScript focus and click
- lambda: driver.execute_script(
- "arguments[0].focus(); setTimeout(function() { arguments[0].click(); }, 100);", sort_button
- ),
-
- # Method 6: Send RETURN key after focusing
- lambda: ActionChains(driver).move_to_element(sort_button).click().send_keys(Keys.RETURN).perform()
- ]
-
- # Try each click method
- menu_opened = False
-
- for i, click_method in enumerate(click_methods):
- try:
- log.info(f"Trying click method {i + 1} for sort button...")
- click_method()
- time.sleep(1) # Wait for menu to appear
-
- # Check if menu opened
- menu_opened = self.check_if_menu_opened(driver)
-
- if menu_opened:
- log.info(f"Sort menu opened with click method {i + 1}")
- break
- except Exception as e:
- log.debug(f"Click method {i + 1} failed: {e}")
- continue
-
- # If menu not opened, abort
- if not menu_opened:
- log.warning("Failed to open sort menu - keeping default sort order")
- # Try to reset state by clicking elsewhere
- try:
- ActionChains(driver).move_by_offset(50, 50).click().perform()
- except:
- pass
- return False
-
- # 3. Find and click the desired sort option in the menu
- # Uses ROBUST SEMANTIC SELECTORS (role attributes), not CSS classes
-
- try:
- # PRIMARY METHOD: Find menu items by role attribute (semantic, stable)
- # menuitemradio is the standard role for radio menu items
- log.info("Looking for menu items using semantic role attributes...")
-
- menu_items = WebDriverWait(driver, 5).until(
- EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[role="menuitemradio"], [role="menuitem"]'))
- )
-
- # Process menu items to extract text
- visible_items = []
-
- for item in menu_items:
- try:
- # Skip invisible items
- if not item.is_displayed():
- continue
-
- # Get the menu item text
- # Try innerText first (most reliable), then textContent, then .text
- text = driver.execute_script("""
- const elem = arguments[0];
- return elem.innerText || elem.textContent || elem.text || '';
- """, item).strip()
-
- if text: # Only add items with text
- visible_items.append((item, text))
-
- except Exception as e:
- log.debug(f"Error processing menu item: {e}")
- continue
-
- log.info(f"Found {len(visible_items)} visible menu items")
- for i, (_, text) in enumerate(visible_items):
- log.debug(f" Menu item {i + 1}: '{text}'")
-
- # Determine the target menu item based on sort method
- target_item = None
- matched_text = None
-
- # Log all available menu items for debugging
- log.info(f"Available menu items: {[text for _, text in visible_items]}")
-
- # Use position-based selection (most reliable for Google Maps)
- position_map = {
- "relevance": 0, # Usually the first option
- "newest": 1, # Usually the second option
- "highest": 2, # Usually the third option
- "lowest": 3 # Usually the fourth option
- }
-
- pos = position_map.get(method, -1)
- if pos >= 0 and pos < len(visible_items):
- target_item, matched_text = visible_items[pos]
- log.info(f"Selected menu item at position {pos + 1}: '{matched_text}' for sort method '{method}'")
-
- # Validate the selection makes sense
- wanted_labels = SORT_OPTIONS.get(method, [])
- text_clean = matched_text.lower()
-
- # Check if selected text contains any of the expected keywords
- valid_selection = False
- for label in wanted_labels:
- if label.lower() in text_clean or text_clean in label.lower():
- valid_selection = True
- break
-
- if not valid_selection:
- log.warning(f"WARNING: Selected '{matched_text}' doesn't match expected '{method}' - might be wrong sort!")
- else:
- log.warning(f"Position {pos} not available in menu (only {len(visible_items)} items)")
-
- # 3. If target found, click it
- if target_item:
- # Ensure item is in view
- driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", target_item)
- time.sleep(0.3)
-
- # Try multiple click methods
- click_success = False
- click_methods = [
- # Method 1: JavaScript click
- lambda: driver.execute_script("arguments[0].click();", target_item),
-
- # Method 2: Direct click
- lambda: target_item.click(),
-
- # Method 3: ActionChains click
- lambda: ActionChains(driver).move_to_element(target_item).click().perform(),
-
- # Method 4: Center click
- lambda: ActionChains(driver).move_to_element_with_offset(
- target_item, target_item.size['width'] // 2, target_item.size['height'] // 2
- ).click().perform(),
-
- # Method 5: JavaScript click with custom event
- lambda: driver.execute_script("""
- var el = arguments[0];
- var evt = new MouseEvent('click', {
- bubbles: true,
- cancelable: true,
- view: window
- });
- el.dispatchEvent(evt);
- """, target_item)
- ]
-
- for i, click_method in enumerate(click_methods):
- try:
- click_method()
- time.sleep(1.5) # Wait for sort to take effect
-
- # Try to verify sort happened by checking if menu closed
- still_open = self.check_if_menu_opened(driver)
- if not still_open:
- click_success = True
- log.info(f"Successfully clicked menu item with method {i + 1}")
- break
- except Exception as e:
- log.debug(f"Menu item click method {i + 1} failed: {e}")
- continue
-
- if click_success:
- log.info(f"Successfully set sort order to '{method}'")
- return True
- else:
- log.warning(f"Failed to click menu item - keeping default sort order")
- else:
- log.warning(f"No matching menu item found for '{method}'")
-
- # If we get here, we failed - try to close the menu by clicking elsewhere
- try:
- ActionChains(driver).move_by_offset(50, 50).click().perform()
- except:
- pass
-
- return False
-
- except TimeoutException:
- log.warning("Timeout waiting for menu items")
- return False
- except Exception as e:
- log.warning(f"Error in menu item selection: {e}")
- return False
-
- except Exception as e:
- log.warning(f"Error in set_sort method: {e}")
- return False
-
- def check_if_menu_opened(self, driver):
- """
- Check if a sort menu has been opened after clicking the sort button.
- Uses multiple detection strategies optimized for Google Maps dropdowns.
- Returns True if menu is detected, False otherwise.
- """
- try:
- # 1. First check for exact menu container selectors from the latest Google Maps UI
- specific_menu_selectors = [
- 'div[role="menu"][id="action-menu"]', # Exact match from provided HTML
- 'div.fontBodyLarge.yu5kgd[role="menu"]', # Classes from provided HTML
- 'div.fxNQSd[role="menuitemradio"]', # Menu item class
- 'div.yu5kgd[role="menu"]' # Alternate class
- ]
-
- for selector in specific_menu_selectors:
- elements = driver.find_elements(By.CSS_SELECTOR, selector)
- for element in elements:
- try:
- if element.is_displayed():
- return True
- except:
- continue
-
- # 2. Check for generic menu containers
- generic_menu_selectors = [
- 'div[role="menu"]',
- 'ul[role="menu"]',
- '[role="listbox"]'
- ]
-
- for selector in generic_menu_selectors:
- elements = driver.find_elements(By.CSS_SELECTOR, selector)
- for element in elements:
- try:
- if element.is_displayed():
- return True
- except:
- continue
-
- # 3. Look for menu items
- menu_item_selectors = [
- 'div[role="menuitemradio"]', # Google Maps specific
- 'div.fxNQSd', # Class-based detection
- 'div.mLuXec', # Text container class
- '[role="menuitem"]', # Generic menu items
- '[role="option"]' # Alternative role
- ]
-
- visible_items = 0
- for selector in menu_item_selectors:
- elements = driver.find_elements(By.CSS_SELECTOR, selector)
- for element in elements:
- try:
- if element.is_displayed():
- visible_items += 1
- if visible_items >= 2: # At least 2 menu items should be visible
- return True
- except:
- continue
-
- # 4. Advanced detection with JavaScript
- # Checks if there are newly visible elements with menu-related roles or classes
- try:
- js_detection = """
- return (function() {
- // Check for visible menu elements
- var menuElements = document.querySelectorAll('div[role="menu"], div[role="menuitemradio"], div.fxNQSd');
- for (var i = 0; i < menuElements.length; i++) {
- var style = window.getComputedStyle(menuElements[i]);
- if (style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0') {
- return true;
- }
- }
-
- // Check for any recently appeared elements that might be a menu
- var possibleMenus = document.querySelectorAll('div.yu5kgd, div.fontBodyLarge');
- for (var i = 0; i < possibleMenus.length; i++) {
- var style = window.getComputedStyle(possibleMenus[i]);
- var rect = possibleMenus[i].getBoundingClientRect();
- // Check if element is visible and has a meaningful size
- if (style.display !== 'none' && style.visibility !== 'hidden' &&
- rect.width > 50 && rect.height > 50) {
- return true;
- }
- }
-
- return false;
- })();
- """
- menu_detected = driver.execute_script(js_detection)
- if menu_detected:
- return True
- except Exception as js_error:
- log.debug(f"Error in JavaScript menu detection: {js_error}")
-
- # 5. Last resort: check if any positioning styles were applied to elements
- # This can detect menu containers that have been positioned absolutely
- try:
- position_check = """
- return (function() {
- // Look for absolutely positioned elements that appeared recently
- var elements = document.querySelectorAll('div[style*="position: absolute"]');
- for (var i = 0; i < elements.length; i++) {
- var el = elements[i];
- var style = window.getComputedStyle(el);
- var hasMenuItems = el.querySelectorAll('div[role="menuitemradio"], div.fxNQSd').length > 0;
-
- if (style.display !== 'none' && style.visibility !== 'hidden' && hasMenuItems) {
- return true;
- }
- }
- return false;
- })();
- """
- position_detected = driver.execute_script(position_check)
- if position_detected:
- return True
- except:
- pass
-
- return False
-
- except Exception as e:
- log.debug(f"Error checking menu state: {e}")
- return False
-
- def wait_for_api_response(self, driver: Chrome, timeout: float = 2.0) -> bool:
- """
- Smart wait that detects when new API response has arrived.
- Much faster and more reliable than fixed time.sleep().
-
- Returns True if new response detected, False if timeout.
- """
- if not self.enable_api_intercept or not self.api_interceptor:
- # Fallback to fixed wait if API interception disabled
- time.sleep(0.6)
- return False
-
- try:
- # Get current response count
- initial_count = driver.execute_script("""
- return (window.__allRequests || []).filter(r =>
- r.url && r.url.toLowerCase().includes('listugcposts')
- ).length;
- """)
-
- # Wait for new response with timeout
- start = time.time()
- while (time.time() - start) < timeout:
- current_count = driver.execute_script("""
- return (window.__allRequests || []).filter(r =>
- r.url && r.url.toLowerCase().includes('listugcposts')
- ).length;
- """)
-
- if current_count > initial_count:
- # New API response arrived!
- elapsed = time.time() - start
- log.debug(f"New API response detected after {elapsed:.2f}s")
- time.sleep(0.2) # Small delay for DOM to update
- return True
-
- time.sleep(0.05) # Check every 50ms
-
- # Timeout - no new response
- log.debug(f"No API response after {timeout}s (might be at end of reviews)")
- return False
-
- except Exception as e:
- log.debug(f"Error waiting for API response: {e}")
- time.sleep(0.6) # Fallback to fixed wait
- return False
-
- def extract_total_reviews(self, driver: Chrome) -> Tuple[Optional[int], Optional[str]]:
- """
- Extract total review count from Google Maps page.
- Looks for patterns like "247 reviews", "1,234 reviews", or "5.2K reviews".
-
- Returns:
- tuple: (total_count: int, count_string: str) or (None, None) if not found
- """
- try:
- # Method 1: Look for "XXX reviews" text in the page source
- page_text = driver.page_source
-
- # Pattern: "244 reviews" or "1,234 reviews" or "5.2K reviews"
- patterns = [
- r'(\d{1,3}(?:,\d{3})*)\s+reviews?', # "244 reviews" or "1,234 reviews"
- r'(\d+\.?\d*K)\s+reviews?', # "5.2K reviews"
- r'(\d{1,3}(?:,\d{3})*)\s+reseñas?', # Spanish
- r'(\d{1,3}(?:,\d{3})*)\s+评论', # Chinese
- ]
-
- for pattern in patterns:
- matches = re.findall(pattern, page_text, re.IGNORECASE)
- if matches:
- count_str = matches[0]
-
- # Parse the count
- if 'K' in count_str or 'k' in count_str:
- # "5.2K" -> 5200
- num = float(count_str.replace('K', '').replace('k', ''))
- total = int(num * 1000)
- else:
- # "1,234" -> 1234
- total = int(count_str.replace(',', ''))
-
- return total, count_str
-
- # Method 2: Look for aria-label with review count
- buttons = driver.find_elements(By.TAG_NAME, 'button')
- for btn in buttons:
- aria_label = btn.get_attribute('aria-label') or ''
- text = btn.text or ''
-
- # Check both aria-label and text
- for content in [aria_label, text]:
- match = re.search(r'(\d{1,3}(?:,\d{3})*)\s+reviews?', content, re.IGNORECASE)
- if match:
- count_str = match.group(1)
- total = int(count_str.replace(',', ''))
- return total, count_str
-
- return None, None
-
- except Exception as e:
- log.debug(f"Error extracting total review count: {e}")
- return None, None
-
- def scrape(self):
- """Main scraper method"""
- start_time = time.time()
-
- url = self.config.get("url")
- headless = self.config.get("headless", True)
- sort_by = self.config.get("sort_by", "relevance")
- stop_on_match = self.config.get("stop_on_match", False)
-
- log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
- log.info(f"URL: {url}")
-
- # Initialize storage
- # If not overwriting, load existing data
- if self.overwrite_existing:
- docs = {}
- seen = set()
- else:
- # Try to get from MongoDB first if enabled
- docs = {}
- if self.use_mongodb and self.mongodb:
- docs = self.mongodb.fetch_existing_reviews()
-
- # If backup_to_json is enabled, also load from JSON for merging
- if self.backup_to_json:
- json_docs = self.json_storage.load_json_docs()
- # Merge JSON docs with MongoDB docs
- for review_id, review in json_docs.items():
- if review_id not in docs:
- docs[review_id] = review
-
- # Load seen IDs from file
- seen = self.json_storage.load_seen()
-
- driver = None
- api_reviews = {} # Store reviews captured from API
- try:
- driver = self.setup_driver(headless)
- wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
-
- driver.get(url)
- wait.until(lambda d: "google.com/maps" in d.current_url)
-
- # Wait briefly for consent dialogs to appear (optimized from 3s to 1s)
- time.sleep(1)
-
- # Try to dismiss any consent/cookie dialogs
- if not self.dismiss_cookies(driver):
- # Quick retry (optimized from 2s to 0.5s)
- time.sleep(0.5)
- self.dismiss_cookies(driver)
-
- self.click_reviews_tab(driver)
-
- # Reduced wait after clicking reviews tab (optimized from 3s to 1s)
- log.info("Waiting for reviews page to fully load...")
- time.sleep(1)
-
- # Wait for page to be fully interactive
- try:
- wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
- log.info("Page DOM is ready")
- except:
- log.debug("Could not verify page ready state")
-
- # Extract total review count from the page
- total_reviews, total_str = self.extract_total_reviews(driver)
- if total_reviews:
- log.info(f"✅ Google shows {total_str} ({total_reviews} total reviews)")
- else:
- log.warning("⚠️ Could not extract total review count - will scroll until no new reviews")
- total_reviews = None
-
- # Verify we're on a reviews page before proceeding
- if "review" not in driver.current_url.lower():
- log.warning("URL doesn't contain 'review' - might not be on reviews page")
-
- # Try to set sort - but don't fail if it doesn't work
- try:
- self.set_sort(driver, sort_by)
- except Exception as sort_error:
- log.warning(f"Sort failed but continuing: {sort_error}")
-
- # Reduced wait after setting sort (optimized from 3s to 1s)
- log.info("Waiting for reviews to render...")
- time.sleep(1)
-
- # Find the scrollable reviews pane using robust detection
- # Uses JavaScript to find elements by their scrollable properties, not CSS classes
- pane = None
-
- try:
- log.info("Finding scrollable reviews pane using robust detection...")
-
- # JavaScript to find scrollable container (no CSS classes needed!)
- find_scrollable_script = """
- function findScrollablePane() {
- // Find all divs that might be scrollable
- const allDivs = document.querySelectorAll('div');
-
- for (let div of allDivs) {
- const style = window.getComputedStyle(div);
- const overflowY = style.overflowY;
-
- // Check if element is scrollable
- if ((overflowY === 'auto' || overflowY === 'scroll') &&
- div.scrollHeight > div.clientHeight &&
- div.clientHeight > 200) { // Must be tall enough to be main pane
-
- // Additional checks: should contain review-like content
- const text = div.textContent || '';
- const hasReviewIndicators =
- text.includes('star') ||
- text.includes('rating') ||
- text.includes('review') ||
- div.querySelector('[data-review-id]') ||
- div.querySelector('[role="img"][aria-label*="star"]');
-
- if (hasReviewIndicators) {
- return div;
- }
- }
- }
-
- // Fallback: return main element if found
- return document.querySelector('[role="main"]');
- }
- return findScrollablePane();
- """
-
- pane = driver.execute_script(find_scrollable_script)
-
- if pane:
- log.info("✅ Found scrollable reviews pane using robust JavaScript detection")
- else:
- log.warning("❌ Could not find scrollable reviews pane")
-
- except Exception as e:
- log.warning(f"Error finding scrollable pane with JavaScript: {e}")
- # Fallback to simple div[role="main"] if JS fails
- try:
- pane = driver.find_element(By.CSS_SELECTOR, 'div[role="main"]')
- log.info("Using fallback: div[role='main']")
- except:
- pass
-
- if not pane:
- log.error("Could not find reviews pane. Page structure might have changed.")
- return False
-
- # Initialize API interceptor AFTER reviews page is loaded (if enabled)
- # This prevents CDP interception from affecting initial page load and tab detection
- if self.enable_api_intercept:
- log.info("Setting up API interception for reviews capture")
- self.api_interceptor = GoogleMapsAPIInterceptor(driver)
- self.api_interceptor.setup_interception()
- self.api_interceptor.inject_response_interceptor()
- log.info("API interceptor ready - capturing network responses")
-
- pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
- idle = 0
- processed_ids = set() # Track processed IDs in current session
-
- # Prefetch selector to avoid repeated lookups
- try:
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
- except Exception as e:
- log.warning(f"Error setting up scroll script: {e}")
- scroll_script = "window.scrollBy(0, 300);" # Fallback to simple scrolling
-
- # Card selectors to try (ROBUST - semantic attributes only, no CSS classes!)
- # Only use data-review-id attribute which is stable and won't break with Google updates
- card_selectors = [
- "[data-review-id]", # PRIMARY: Any element with review ID (most robust)
- "div[data-review-id]", # Fallback: Div with review ID
- ]
- # REMOVED FRAGILE CSS CLASS SELECTORS:
- # - CARD_SEL (div.jftiEf) - Google's obfuscated class, breaks on updates
- # - .jftiEf - Same as above
- # - div.WMbnJf - Another obfuscated class
- # We now rely on semantic [data-review-id] attribute + API interceptor
-
- # CONTINUOUS SCROLLING APPROACH
- # Scroll NON-STOP in background thread while extracting reviews in main thread
- stop_scrolling = threading.Event()
- scroll_count = [0] # Use list to make it mutable in thread
- load_times = [] # Track when new reviews are loaded for smart timeout
-
- def continuous_scroll_worker():
- """Background thread that scrolls continuously without stopping"""
- while not stop_scrolling.is_set():
- try:
- driver.execute_script(scroll_script)
- scroll_count[0] += 1
- time.sleep(0.005) # 5ms = ultra fast continuous scrolling!
- except:
- pass
-
- # Start continuous scrolling thread
- scroll_thread = threading.Thread(target=continuous_scroll_worker, daemon=True)
- scroll_thread.start()
- log.info("🚀 Started continuous NON-STOP scrolling thread")
-
- check_num = 0
- max_checks = 100 # Maximum safety limit
-
- while check_num < max_checks:
- check_num += 1
-
- # Check if we've collected all reviews
- if total_reviews and len(seen) >= total_reviews:
- percent = (len(seen) / total_reviews) * 100
- log.info(f"✅ Got all {total_reviews} reviews ({percent:.1f}%)! Stopping scrolling.")
- stop_scrolling.set()
- break
-
- # Wait between checks while scrolling continues in background
- time.sleep(2.0) # Check every 2 seconds
-
- try:
- # Try multiple card selectors within the pane
- cards = []
- for card_sel in card_selectors:
- cards = pane.find_elements(By.CSS_SELECTOR, card_sel)
- if cards:
- if check_num == 1: # Only log once
- log.info(f"Found {len(cards)} cards with selector: {card_sel}")
- break
-
- # If no cards found in pane, try searching the entire document
- if not cards:
- for card_sel in card_selectors:
- cards = driver.find_elements(By.CSS_SELECTOR, card_sel)
- if cards:
- if check_num == 1:
- log.info(f"Found {len(cards)} cards in document with selector: {card_sel}")
- break
-
- fresh_cards: List[WebElement] = []
- previous_count = len(seen)
-
- for c in cards:
- try:
- # Try to get data-review-id from the card itself
- cid = c.get_attribute("data-review-id")
- # If not found on card, try to find it in a child element
- if not cid:
- try:
- review_id_elem = c.find_element(By.CSS_SELECTOR, "[data-review-id]")
- cid = review_id_elem.get_attribute("data-review-id")
- except:
- pass
- if not cid or cid in seen or cid in processed_ids:
- if stop_on_match and cid and (cid in seen or cid in processed_ids):
- idle = 999
- break
- continue
- fresh_cards.append(c)
- except StaleElementReferenceException:
- continue
- except Exception as e:
- log.debug(f"Error getting review ID: {e}")
- continue
-
- # Process fresh cards
- for card in fresh_cards:
- try:
- raw = RawReview.from_card(card)
- processed_ids.add(raw.id)
- except StaleElementReferenceException:
- continue
- except Exception:
- log.warning("⚠️ parse error – storing stub\n%s",
- traceback.format_exc(limit=1).strip())
- try:
- raw_id = card.get_attribute("data-review-id") or ""
- raw = RawReview(id=raw_id, text="", lang="und")
- processed_ids.add(raw_id)
- except StaleElementReferenceException:
- continue
-
- docs[raw.id] = merge_review(docs.get(raw.id), raw)
- seen.add(raw.id)
- pbar.update(1)
-
- # Calculate how many new reviews we got
- new_count = len(seen) - previous_count
-
- # Track load times for smart timeout
- if new_count > 0:
- current_time = time.time()
- load_times.append(current_time)
-
- if total_reviews:
- percent = (len(seen) / total_reviews) * 100
- log.info(f"Check {check_num:2d}: {len(seen):3d}/{total_reviews} ({percent:5.1f}%) | +{new_count} new")
- else:
- log.info(f"Check {check_num:2d}: {len(seen):3d} total | +{new_count} new")
- else:
- # No new reviews in this check
- if total_reviews:
- percent = (len(seen) / total_reviews) * 100
- log.info(f"Check {check_num:2d}: {len(seen):3d}/{total_reviews} ({percent:5.1f}%) | +0 new")
- else:
- log.info(f"Check {check_num:2d}: {len(seen):3d} total | +0 new")
-
- # Smart timeout: stop if no new reviews for 3x average gap
- if new_count == 0:
- if len(load_times) >= 3:
- # Calculate average gap between individual review loads
- gaps = [load_times[i] - load_times[i-1] for i in range(1, len(load_times))]
- avg_gap = sum(gaps) / len(gaps)
- timeout_threshold = avg_gap * 3
- timeout_type = f"gap-based (avg gap: {avg_gap:.1f}s)"
- elif len(load_times) > 0:
- # Initial timeout: use 3x time since first load started
- time_since_first = time.time() - load_times[0]
- timeout_threshold = max(10.0, time_since_first * 3) # At least 10s
- timeout_type = f"initial (time since first: {time_since_first:.1f}s)"
- else:
- # No loads yet - use default initial timeout
- timeout_threshold = 15.0
- timeout_type = "default (no loads yet)"
-
- # Check time since last load
- if len(load_times) > 0:
- time_since_last = time.time() - load_times[-1]
-
- # Log timeout status every check when no new reviews
- log.debug(f" Timeout check: {time_since_last:.1f}s / {timeout_threshold:.1f}s ({timeout_type})")
-
- if time_since_last > timeout_threshold:
- log.info(f"⏱️ No new reviews for {time_since_last:.1f}s (threshold: {timeout_threshold:.1f}s, {timeout_type}) - stopping")
- stop_scrolling.set()
- break
-
- # Fallback: stop if no new reviews for 10 consecutive checks
- if new_count == 0:
- idle += 1
- if idle >= 10:
- log.info(f"⏱️ No new reviews for {idle} checks - stopping")
- stop_scrolling.set()
- break
- else:
- idle = 0
-
- # Collect API responses if interception is enabled
- if self.enable_api_intercept and self.api_interceptor:
- try:
- responses = self.api_interceptor.get_intercepted_responses()
- if responses:
- log.debug(f"Collected {len(responses)} network responses from browser")
-
- # Dump first few responses for analysis
- if not hasattr(self, '_dumped_responses'):
- self._dumped_responses = 0
-
- if self._dumped_responses < 5: # Dump first 5 responses
- from pathlib import Path
- import json
- output_dir = Path("api_response_samples")
- output_dir.mkdir(exist_ok=True)
-
- for resp in responses:
- if self._dumped_responses >= 5:
- break
-
- idx = self._dumped_responses
- body = resp.get('body', '')
-
- # Save full response
- full_file = output_dir / f"response_{idx:02d}_full.json"
- with open(full_file, 'w', encoding='utf-8') as f:
- json.dump(resp, f, indent=2, ensure_ascii=False)
-
- # Save body
- body_file = output_dir / f"response_{idx:02d}_body.txt"
- with open(body_file, 'w', encoding='utf-8') as f:
- f.write(body)
-
- # Try to parse and save
- clean_body = body[4:].strip() if body.startswith(")]}'") else body
- try:
- parsed_data = json.loads(clean_body)
- parsed_file = output_dir / f"response_{idx:02d}_parsed.json"
- with open(parsed_file, 'w', encoding='utf-8') as f:
- json.dump(parsed_data, f, indent=2, ensure_ascii=False)
- log.info(f"Dumped API response {idx} to {output_dir}/ ({len(body)} bytes)")
- except:
- log.debug(f"Response {idx} is not JSON")
-
- self._dumped_responses += 1
-
- parsed = self.api_interceptor.parse_reviews_from_responses(responses)
- log.debug(f"Parsed {len(parsed)} reviews from responses")
- for intercepted in parsed:
- if intercepted.review_id and intercepted.review_id not in api_reviews:
- api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted)
- if parsed:
- log.info(f"API interceptor captured {len(parsed)} reviews (total unique API: {len(api_reviews)})")
-
- # Log stats every 10 checks
- if check_num % 10 == 0:
- stats = self.api_interceptor.get_interceptor_stats()
- if stats:
- log.debug(f"Interceptor stats - Fetch: {stats.get('totalFetch', 0)}/{stats.get('capturedFetch', 0)}, "
- f"XHR: {stats.get('totalXHR', 0)}/{stats.get('capturedXHR', 0)}, "
- f"Last: {stats.get('lastCapture', 'never')}")
- except Exception as api_err:
- log.warning(f"API interception error: {api_err}", exc_info=True)
-
- except StaleElementReferenceException:
- # The pane or other element went stale, try to re-find
- log.debug("Stale element encountered, re-finding elements")
- try:
- pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- except Exception:
- log.warning("Could not re-find reviews pane after stale element")
- break
- except Exception as e:
- log.warning(f"Error during review processing: {e}")
- time.sleep(1)
-
- pbar.close()
-
- # Stop continuous scrolling thread
- stop_scrolling.set()
- scroll_thread.join(timeout=2.0)
- log.info(f"🛑 Stopped scrolling thread after {scroll_count[0]} total scrolls")
-
- # Merge API-captured reviews if any
- if self.enable_api_intercept and api_reviews:
- log.info(f"Merging {len(api_reviews)} reviews captured via API interception")
- for review_id, api_review in api_reviews.items():
- if review_id not in docs:
- # New review from API only
- docs[review_id] = api_review
- seen.add(review_id)
- else:
- # Merge API data with existing DOM data (API might have more details)
- existing = docs[review_id]
- # Only update fields that are missing or empty
- for key, value in api_review.items():
- if key not in existing or not existing.get(key):
- existing[key] = value
- log.info(f"After merge: {len(docs)} total reviews")
- elif self.enable_api_intercept:
- # Log final stats even if no reviews captured
- if self.api_interceptor:
- stats = self.api_interceptor.get_interceptor_stats()
- if stats:
- log.warning(f"⚠️ API interception was enabled but captured 0 reviews. "
- f"Network stats - Fetch requests: {stats.get('capturedFetch', 0)}/{stats.get('totalFetch', 0)}, "
- f"XHR requests: {stats.get('capturedXHR', 0)}/{stats.get('totalXHR', 0)}")
-
- # Get browser console logs for debugging
- console_logs = self.api_interceptor.get_browser_console_logs()
- api_logs = [log_entry for log_entry in console_logs
- if 'API Interceptor' in log_entry.get('message', '')]
- if api_logs:
- log.info(f"Found {len(api_logs)} API interceptor console messages")
- for entry in api_logs[:10]: # Show first 10
- log.debug(f" Console: {entry.get('message', '')[:200]}")
- else:
- log.debug("No API interceptor console messages found")
-
- # In debug mode, try to dump any responses that were collected
- if log.level <= logging.DEBUG:
- all_responses = self.api_interceptor.get_intercepted_responses()
- if all_responses:
- dump_path = self.api_interceptor.dump_responses_to_file(all_responses)
- if dump_path:
- log.info(f"Raw responses dumped to: {dump_path}")
- else:
- log.warning("API interceptor stats not available")
-
- # Save to MongoDB if enabled
- if self.use_mongodb and self.mongodb:
- log.info("Saving reviews to MongoDB...")
- self.mongodb.save_reviews(docs)
-
- # Backup to JSON if enabled
- if self.backup_to_json:
- log.info("Backing up to JSON...")
- self.json_storage.save_json_docs(docs)
- self.json_storage.save_seen(seen)
-
- # Final summary with completion percentage
- if total_reviews:
- percent = (len(docs) / total_reviews) * 100
- missing = total_reviews - len(docs)
- if missing <= 0:
- log.info(f"✅ Finished – Got all {total_reviews} reviews ({percent:.1f}%)")
- elif percent >= 95.0:
- log.info(f"✅ Finished – Got {len(docs)}/{total_reviews} reviews ({percent:.1f}%) - missing {missing}")
- else:
- log.info(f"⚠️ Finished – Got {len(docs)}/{total_reviews} reviews ({percent:.1f}%) - missing {missing}")
- else:
- log.info("✅ Finished – total unique reviews: %s", len(docs))
-
- end_time = time.time()
- elapsed_time = end_time - start_time
- log.info(f"Execution completed in {elapsed_time:.2f} seconds")
-
- return True
-
- except Exception as e:
- log.error(f"Error during scraping: {e}")
- log.error(traceback.format_exc())
- return False
-
- finally:
- # Cleanup API interceptor
- if self.api_interceptor:
- try:
- self.api_interceptor.cleanup()
- except Exception:
- pass
-
- if driver is not None:
- try:
- driver.quit()
- except Exception:
- pass
-
- if self.mongodb:
- try:
- self.mongodb.close()
- except Exception:
- pass
-
-# """
-# Selenium scraping logic for Google Maps Reviews.
-# """
-#
-# import os
-# import time
-# import logging
-# import traceback
-# import platform
-# from typing import Dict, Any, List
-#
-# import undetected_chromedriver as uc
-# from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
-# from selenium.webdriver import Chrome
-# from selenium.webdriver.common.by import By
-# from selenium.webdriver.remote.webelement import WebElement
-# from selenium.webdriver.support import expected_conditions as EC
-# from selenium.webdriver.support.ui import WebDriverWait
-# from tqdm import tqdm
-#
-# from modules.models import RawReview
-# from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
-#
-# # Logger
-# log = logging.getLogger("scraper")
-#
-# # CSS Selectors
-# PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf'
-# CARD_SEL = "div[data-review-id]"
-# COOKIE_BTN = ('button[aria-label*="Accept" i],'
-# 'button[jsname="hZCF7e"],'
-# 'button[data-mdc-dialog-action="accept"]')
-# SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
-# MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
-#
-# SORT_LABELS = { # text shown in Google Maps' menu
-# "newest": ("Newest", "החדשות ביותר", "ใหม่ที่สุด"),
-# "highest": ("Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด"),
-# "lowest": ("Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด"),
-# "relevance": ("Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด"),
-# }
-#
-# REVIEW_WORDS = {"reviews", "review", "ביקורות", "รีวิว", "avis", "reseñas",
-# "recensioni", "bewertungen", "口コミ", "レビュー",
-# "리뷰", "評論", "评论", "рецензии", "ביקורת"}
-#
-#
-# class GoogleReviewsScraper:
-# """Main scraper class for Google Maps reviews"""
-#
-# def __init__(self, config: Dict[str, Any]):
-# """Initialize scraper with configuration"""
-# self.config = config
-# self.use_mongodb = config.get("use_mongodb", True)
-# self.mongodb = MongoDBStorage(config) if self.use_mongodb else None
-# self.json_storage = JSONStorage(config)
-# self.backup_to_json = config.get("backup_to_json", True)
-# self.overwrite_existing = config.get("overwrite_existing", False)
-#
-# def setup_driver(self, headless: bool) -> Chrome:
-# """
-# Set up and configure Chrome driver with flexibility for different environments.
-# Works in both Docker containers and on regular OS installations (Windows, Mac, Linux).
-# """
-# # Determine if we're running in a container
-# in_container = os.environ.get('CHROME_BIN') is not None
-#
-# # Create Chrome options
-# opts = uc.ChromeOptions()
-# opts.add_argument("--window-size=1400,900")
-# opts.add_argument("--ignore-certificate-errors")
-# opts.add_argument("--disable-gpu") # Improves performance
-# opts.add_argument("--disable-dev-shm-usage") # Helps with stability
-# opts.add_argument("--no-sandbox") # More stable in some environments
-#
-# # Use headless mode if requested
-# if headless:
-# opts.add_argument("--headless=new")
-#
-# # Log platform information for debugging
-# log.info(f"Platform: {platform.platform()}")
-# log.info(f"Python version: {platform.python_version()}")
-#
-# # If in container, use environment-provided binaries
-# if in_container:
-# chrome_binary = os.environ.get('CHROME_BIN')
-# chromedriver_path = os.environ.get('CHROMEDRIVER_PATH')
-#
-# log.info(f"Container environment detected")
-# log.info(f"Chrome binary: {chrome_binary}")
-# log.info(f"ChromeDriver path: {chromedriver_path}")
-#
-# if chrome_binary and os.path.exists(chrome_binary):
-# log.info(f"Using Chrome binary from environment: {chrome_binary}")
-# opts.binary_location = chrome_binary
-#
-# try:
-# # Try creating Chrome driver with undetected_chromedriver
-# log.info("Attempting to create undetected_chromedriver instance")
-# driver = uc.Chrome(options=opts)
-# log.info("Successfully created undetected_chromedriver instance")
-# except Exception as e:
-# # Fall back to regular Selenium if undetected_chromedriver fails
-# log.warning(f"Failed to create undetected_chromedriver instance: {e}")
-# log.info("Falling back to regular Selenium Chrome")
-#
-# # Import Selenium webdriver here to avoid potential import issues
-# from selenium import webdriver
-# from selenium.webdriver.chrome.service import Service
-#
-# if chromedriver_path and os.path.exists(chromedriver_path):
-# log.info(f"Using ChromeDriver from path: {chromedriver_path}")
-# service = Service(executable_path=chromedriver_path)
-# driver = webdriver.Chrome(service=service, options=opts)
-# else:
-# log.info("Using default ChromeDriver")
-# driver = webdriver.Chrome(options=opts)
-# else:
-# # On regular OS, use default undetected_chromedriver
-# log.info("Using standard undetected_chromedriver setup")
-# driver = uc.Chrome(options=opts)
-#
-# # Set page load timeout to avoid hanging
-# driver.set_page_load_timeout(30)
-# log.info("Chrome driver setup completed successfully")
-# return driver
-#
-# def dismiss_cookies(self, driver: Chrome):
-# """
-# Dismiss cookie consent dialogs if present.
-# Handles stale element references by re-finding elements if needed.
-# """
-# try:
-# # Use WebDriverWait with expected_conditions to handle stale elements
-# WebDriverWait(driver, 3).until(
-# EC.presence_of_element_located((By.CSS_SELECTOR, COOKIE_BTN))
-# )
-# log.info("Cookie consent dialog found, attempting to dismiss")
-#
-# # Get elements again after waiting to avoid stale references
-# elements = driver.find_elements(By.CSS_SELECTOR, COOKIE_BTN)
-# for elem in elements:
-# try:
-# if elem.is_displayed():
-# elem.click()
-# log.info("Cookie dialog dismissed")
-# return True
-# except Exception as e:
-# log.debug(f"Error clicking cookie button: {e}")
-# continue
-# except TimeoutException:
-# # This is expected if no cookie dialog is present
-# log.debug("No cookie consent dialog detected")
-# except Exception as e:
-# log.debug(f"Error handling cookie dialog: {e}")
-#
-# return False
-#
-# def is_reviews_tab(self, tab: WebElement) -> bool:
-# """Check if a tab is the reviews tab"""
-# try:
-# label = (tab.get_attribute("aria-label") or tab.text or "").lower()
-# return tab.get_attribute("data-tab-index") == "1" or any(w in label for w in REVIEW_WORDS)
-# except StaleElementReferenceException:
-# return False
-# except Exception as e:
-# log.debug(f"Error checking if tab is reviews tab: {e}")
-# return False
-#
-# def click_reviews_tab(self, driver: Chrome):
-# """
-# Click on the reviews tab in Google Maps with improved stale element handling.
-# """
-# end = time.time() + 15 # Timeout after 15 seconds
-# while time.time() < end:
-# try:
-# # Find all tab elements
-# tabs = driver.find_elements(By.CSS_SELECTOR, '[role="tab"], button[aria-label]')
-#
-# for tab in tabs:
-# try:
-# # Check if this is the reviews tab
-# label = (tab.get_attribute("aria-label") or tab.text or "").lower()
-# is_review_tab = tab.get_attribute("data-tab-index") == "1" or any(
-# w in label for w in REVIEW_WORDS)
-#
-# if is_review_tab:
-# # Scroll the tab into view
-# driver.execute_script("arguments[0].scrollIntoView({block:\"center\"});", tab)
-# time.sleep(0.2) # Small wait after scrolling
-#
-# # Try to click the tab
-# log.info("Found reviews tab, attempting to click")
-# tab.click()
-# log.info("Successfully clicked reviews tab")
-# return True
-# except Exception as e:
-# # Element might be stale or not clickable, try the next one
-# log.debug(f"Error with tab element: {str(e)}")
-# continue
-#
-# # If we get here, we didn't find a suitable tab in this iteration
-# log.debug("No reviews tab found in this iteration, waiting...")
-# time.sleep(0.5) # Wait before next attempt
-#
-# except Exception as e:
-# # General exception handling
-# log.debug(f"Exception while looking for reviews tab: {str(e)}")
-# time.sleep(0.5)
-#
-# # If we exit the loop, we've timed out
-# log.warning("Timeout while looking for reviews tab")
-# raise TimeoutException("Reviews tab not found")
-#
-# def set_sort(self, driver: Chrome, method: str):
-# """
-# Set the sorting method for reviews with improved error handling.
-# """
-# if method == "relevance":
-# return True # Default order, no need to change
-#
-# log.info(f"Attempting to set sort order to '{method}'")
-#
-# try:
-# # First try to find and click the sort button
-# sort_buttons = driver.find_elements(By.CSS_SELECTOR, SORT_BTN)
-# if not sort_buttons:
-# log.warning(f"Sort button not found - keeping default sort order")
-# return False
-#
-# # Try to click the first visible sort button
-# for sort_button in sort_buttons:
-# try:
-# if sort_button.is_displayed() and sort_button.is_enabled():
-# sort_button.click()
-# log.info("Clicked sort button")
-# time.sleep(0.5) # Wait for menu to appear
-# break
-# except Exception as e:
-# log.debug(f"Error clicking sort button: {e}")
-# continue
-# else:
-# log.warning("No clickable sort button found")
-# return False
-#
-# # Now find and click the menu item for the desired sort method
-# wanted = SORT_LABELS[method]
-# menu_items = WebDriverWait(driver, 3).until(
-# EC.presence_of_all_elements_located((By.CSS_SELECTOR, MENU_ITEMS))
-# )
-#
-# for item in menu_items:
-# try:
-# label = item.text.strip()
-# if label in wanted:
-# item.click()
-# log.info(f"Selected sort option: {label}")
-# time.sleep(0.5) # Wait for sorting to take effect
-# return True
-# except Exception as e:
-# log.debug(f"Error clicking menu item: {e}")
-# continue
-#
-# log.warning(f"Sort option '{method}' not found in menu - keeping default")
-# return False
-#
-# except Exception as e:
-# log.warning(f"Error setting sort order: {e}")
-# return False
-#
-# def scrape(self):
-# """Main scraper method"""
-# start_time = time.time()
-#
-# url = self.config.get("url")
-# headless = self.config.get("headless", True)
-# sort_by = self.config.get("sort_by", "relevance")
-# stop_on_match = self.config.get("stop_on_match", False)
-#
-# log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
-# log.info(f"URL: {url}")
-#
-# # Initialize storage
-# # If not overwriting, load existing data
-# if self.overwrite_existing:
-# docs = {}
-# seen = set()
-# else:
-# # Try to get from MongoDB first if enabled
-# docs = {}
-# if self.use_mongodb and self.mongodb:
-# docs = self.mongodb.fetch_existing_reviews()
-#
-# # If backup_to_json is enabled, also load from JSON for merging
-# if self.backup_to_json:
-# json_docs = self.json_storage.load_json_docs()
-# # Merge JSON docs with MongoDB docs
-# for review_id, review in json_docs.items():
-# if review_id not in docs:
-# docs[review_id] = review
-#
-# # Load seen IDs from file
-# seen = self.json_storage.load_seen()
-#
-# driver = None
-# try:
-# driver = self.setup_driver(headless)
-# wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
-#
-# driver.get(url)
-# wait.until(lambda d: "google.com/maps" in d.current_url)
-#
-# self.dismiss_cookies(driver)
-# self.click_reviews_tab(driver)
-# self.set_sort(driver, sort_by)
-#
-# # Add a wait after setting sort to allow results to load
-# time.sleep(1)
-#
-# # Use try-except to handle cases where the pane is not found
-# try:
-# pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
-# except TimeoutException:
-# log.warning("Could not find reviews pane. Page structure might have changed.")
-# return False
-#
-# pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
-# idle = 0
-# processed_ids = set() # Track processed IDs in current session
-#
-# # Prefetch selector to avoid repeated lookups
-# try:
-# driver.execute_script("window.scrollablePane = arguments[0];", pane)
-# scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-# except Exception as e:
-# log.warning(f"Error setting up scroll script: {e}")
-# scroll_script = "window.scrollBy(0, 300);" # Fallback to simple scrolling
-#
-# max_attempts = 10 # Limit the number of attempts to find reviews
-# attempts = 0
-#
-# while attempts < max_attempts:
-# try:
-# cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
-# fresh_cards: List[WebElement] = []
-#
-# # Check for valid cards
-# if len(cards) == 0:
-# log.debug("No review cards found in this iteration")
-# attempts += 1
-# # Try scrolling anyway
-# driver.execute_script(scroll_script)
-# time.sleep(1)
-# continue
-#
-# for c in cards:
-# try:
-# cid = c.get_attribute("data-review-id")
-# if not cid or cid in seen or cid in processed_ids:
-# if stop_on_match and cid and (cid in seen or cid in processed_ids):
-# idle = 999
-# break
-# continue
-# fresh_cards.append(c)
-# except StaleElementReferenceException:
-# continue
-# except Exception as e:
-# log.debug(f"Error getting review ID: {e}")
-# continue
-#
-# for card in fresh_cards:
-# try:
-# raw = RawReview.from_card(card)
-# processed_ids.add(raw.id) # Track this ID to avoid re-processing
-# except StaleElementReferenceException:
-# continue
-# except Exception:
-# log.warning("⚠️ parse error – storing stub\n%s",
-# traceback.format_exc(limit=1).strip())
-# try:
-# raw_id = card.get_attribute("data-review-id") or ""
-# raw = RawReview(id=raw_id, text="", lang="und")
-# processed_ids.add(raw_id)
-# except StaleElementReferenceException:
-# continue
-#
-# docs[raw.id] = merge_review(docs.get(raw.id), raw)
-# seen.add(raw.id)
-# pbar.update(1)
-# idle = 0
-# attempts = 0 # Reset attempts counter when we successfully process a review
-#
-# if idle >= 3:
-# break
-#
-# if not fresh_cards:
-# idle += 1
-# attempts += 1
-#
-# # Use JavaScript for smoother scrolling
-# try:
-# driver.execute_script(scroll_script)
-# except Exception as e:
-# log.warning(f"Error scrolling: {e}")
-# # Try a simpler scroll method
-# driver.execute_script("window.scrollBy(0, 300);")
-#
-# # Dynamic sleep: sleep less when processing many reviews
-# sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0
-# time.sleep(sleep_time)
-#
-# except StaleElementReferenceException:
-# # The pane or other element went stale, try to re-find
-# log.debug("Stale element encountered, re-finding elements")
-# try:
-# pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
-# driver.execute_script("window.scrollablePane = arguments[0];", pane)
-# except Exception:
-# log.warning("Could not re-find reviews pane after stale element")
-# break
-# except Exception as e:
-# log.warning(f"Error during review processing: {e}")
-# attempts += 1
-# time.sleep(1)
-#
-# pbar.close()
-#
-# # Save to MongoDB if enabled
-# if self.use_mongodb and self.mongodb:
-# log.info("Saving reviews to MongoDB...")
-# self.mongodb.save_reviews(docs)
-#
-# # Backup to JSON if enabled
-# if self.backup_to_json:
-# log.info("Backing up to JSON...")
-# self.json_storage.save_json_docs(docs)
-# self.json_storage.save_seen(seen)
-#
-# log.info("✅ Finished – total unique reviews: %s", len(docs))
-#
-# end_time = time.time()
-# elapsed_time = end_time - start_time
-# log.info(f"Execution completed in {elapsed_time:.2f} seconds")
-#
-# return True
-#
-# except Exception as e:
-# log.error(f"Error during scraping: {e}")
-# log.error(traceback.format_exc())
-# return False
-#
-# finally:
-# if driver is not None:
-# try:
-# driver.quit()
-# except Exception:
-# pass
-#
-# if self.mongodb:
-# try:
-# self.mongodb.close()
-# except Exception:
-# pass
-#
-# # """
-# # Selenium scraping logic for Google Maps Reviews.
-# # """
-# #
-# # import re
-# # import time
-# # import logging
-# # import traceback
-# # from typing import Dict, Any, Set, List
-# #
-# # import undetected_chromedriver as uc
-# # from selenium.common.exceptions import TimeoutException
-# # from selenium.webdriver import Chrome
-# # from selenium.webdriver.common.by import By
-# # from selenium.webdriver.remote.webelement import WebElement
-# # from selenium.webdriver.support import expected_conditions as EC
-# # from selenium.webdriver.support.ui import WebDriverWait
-# # from tqdm import tqdm
-# #
-# # from modules.models import RawReview
-# # from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
-# # from modules.utils import click_if
-# #
-# # # Logger
-# # log = logging.getLogger("scraper")
-# #
-# # # CSS Selectors
-# # PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf'
-# # CARD_SEL = "div[data-review-id]"
-# # COOKIE_BTN = ('button[aria-label*="Accept" i],'
-# # 'button[jsname="hZCF7e"],'
-# # 'button[data-mdc-dialog-action="accept"]')
-# # SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
-# # MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
-# #
-# # SORT_LABELS = { # text shown in Google Maps' menu
-# # "newest": ("Newest", "החדשות ביותר", "ใหม่ที่สุด"),
-# # "highest": ("Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด"),
-# # "lowest": ("Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด"),
-# # "relevance": ("Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด"),
-# # }
-# #
-# # REVIEW_WORDS = {"reviews", "review", "ביקורות", "รีวิว", "avis", "reseñas",
-# # "recensioni", "bewertungen", "口コミ", "レビュー",
-# # "리뷰", "評論", "评论", "рецензии"}
-# #
-# #
-# # class GoogleReviewsScraper:
-# # """Main scraper class for Google Maps reviews"""
-# #
-# # def __init__(self, config: Dict[str, Any]):
-# # """Initialize scraper with configuration"""
-# # self.config = config
-# # self.use_mongodb = config.get("use_mongodb", True)
-# # self.mongodb = MongoDBStorage(config) if self.use_mongodb else None
-# # self.json_storage = JSONStorage(config)
-# # self.backup_to_json = config.get("backup_to_json", True)
-# # self.overwrite_existing = config.get("overwrite_existing", False)
-# #
-# # def setup_driver(self, headless: bool) -> Chrome:
-# # """Set up and configure Chrome driver"""
-# # opts = uc.ChromeOptions()
-# # opts.add_argument("--window-size=1400,900")
-# # opts.add_argument("--ignore-certificate-errors")
-# # opts.add_argument("--disable-gpu") # Improves performance
-# # opts.add_argument("--disable-dev-shm-usage") # Helps with stability
-# # opts.add_argument("--no-sandbox") # More stable in some environments
-# #
-# # if headless:
-# # opts.add_argument("--headless=new")
-# #
-# # driver = uc.Chrome(options=opts)
-# # # Set page load timeout to avoid hanging
-# # driver.set_page_load_timeout(30)
-# # return driver
-# #
-# # def dismiss_cookies(self, driver: Chrome):
-# # """Dismiss cookie consent dialogs"""
-# # click_if(driver, COOKIE_BTN, timeout=3.0) # Reduced timeout for faster operation
-# #
-# # def is_reviews_tab(self, tab: WebElement) -> bool:
-# # """Check if a tab is the reviews tab"""
-# # label = (tab.get_attribute("aria-label") or tab.text or "").lower()
-# # return tab.get_attribute("data-tab-index") == "1" or any(w in label for w in REVIEW_WORDS)
-# #
-# # def click_reviews_tab(self, driver: Chrome):
-# # """Click on the reviews tab in Google Maps"""
-# # end = time.time() + 15 # Reduced timeout from 30 to 15 seconds
-# # while time.time() < end:
-# # for tab in driver.find_elements(By.CSS_SELECTOR,
-# # '[role="tab"], button[aria-label]'):
-# # if self.is_reviews_tab(tab):
-# # driver.execute_script("arguments[0].scrollIntoView({block:\"center\"});", tab)
-# # try:
-# # tab.click()
-# # return
-# # except Exception:
-# # continue
-# # time.sleep(.2) # Reduced sleep time from 0.4 to 0.2
-# # raise TimeoutException("Reviews tab not found")
-# #
-# # def set_sort(self, driver: Chrome, method: str):
-# # """Set the sorting method for reviews"""
-# # if method == "relevance":
-# # return # default order
-# # if not click_if(driver, SORT_BTN):
-# # return
-# #
-# # wanted = SORT_LABELS[method]
-# #
-# # for item in driver.find_elements(By.CSS_SELECTOR, MENU_ITEMS):
-# # label = item.text.strip()
-# # if label in wanted:
-# # item.click()
-# # time.sleep(0.5) # Reduced wait time from 1.0 to 0.5
-# # return
-# # log.warning("⚠️ sort option %s not found – keeping default", method)
-# #
-# # def scrape(self):
-# # """Main scraper method"""
-# # start_time = time.time()
-# #
-# # url = self.config.get("url")
-# # headless = self.config.get("headless", True)
-# # sort_by = self.config.get("sort_by", "relevance")
-# # stop_on_match = self.config.get("stop_on_match", False)
-# #
-# # log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
-# # log.info(f"URL: {url}")
-# #
-# # # Initialize storage
-# # # If not overwriting, load existing data
-# # if self.overwrite_existing:
-# # docs = {}
-# # seen = set()
-# # else:
-# # # Try to get from MongoDB first if enabled
-# # docs = {}
-# # if self.use_mongodb and self.mongodb:
-# # docs = self.mongodb.fetch_existing_reviews()
-# #
-# # # If backup_to_json is enabled, also load from JSON for merging
-# # if self.backup_to_json:
-# # json_docs = self.json_storage.load_json_docs()
-# # # Merge JSON docs with MongoDB docs
-# # for review_id, review in json_docs.items():
-# # if review_id not in docs:
-# # docs[review_id] = review
-# #
-# # # Load seen IDs from file
-# # seen = self.json_storage.load_seen()
-# #
-# # driver = self.setup_driver(headless)
-# # wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
-# #
-# # try:
-# # driver.get(url)
-# # wait.until(lambda d: "google.com/maps" in d.current_url)
-# #
-# # self.dismiss_cookies(driver)
-# # self.click_reviews_tab(driver)
-# # self.set_sort(driver, sort_by)
-# #
-# # pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
-# # pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
-# # idle = 0
-# # processed_ids = set() # Track processed IDs in current session
-# #
-# # # Prefetch selector to avoid repeated lookups
-# # driver.execute_script("window.scrollablePane = arguments[0];", pane)
-# # scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-# #
-# # while True:
-# # cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
-# # fresh_cards: List[WebElement] = []
-# #
-# # for c in cards:
-# # cid = c.get_attribute("data-review-id")
-# # if cid in seen or cid in processed_ids:
-# # if stop_on_match:
-# # idle = 999
-# # break
-# # continue
-# # fresh_cards.append(c)
-# #
-# # for card in fresh_cards:
-# # try:
-# # raw = RawReview.from_card(card)
-# # processed_ids.add(raw.id) # Track this ID to avoid re-processing
-# # except Exception:
-# # log.warning("⚠️ parse error – storing stub\n%s",
-# # traceback.format_exc(limit=1).strip())
-# # raw_id = card.get_attribute("data-review-id") or ""
-# # raw = RawReview(id=raw_id, text="", lang="und")
-# # processed_ids.add(raw_id)
-# #
-# # docs[raw.id] = merge_review(docs.get(raw.id), raw)
-# # seen.add(raw.id)
-# # pbar.update(1)
-# # idle = 0
-# #
-# # if idle >= 3:
-# # break
-# #
-# # if not fresh_cards:
-# # idle += 1
-# #
-# # # Use JavaScript for smoother scrolling
-# # driver.execute_script(scroll_script)
-# #
-# # # Dynamic sleep: sleep less when processing many reviews
-# # sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0
-# # time.sleep(sleep_time)
-# #
-# # pbar.close()
-# #
-# # # Save to MongoDB if enabled
-# # if self.use_mongodb and self.mongodb:
-# # log.info("Saving reviews to MongoDB...")
-# # self.mongodb.save_reviews(docs)
-# #
-# # # Backup to JSON if enabled
-# # if self.backup_to_json:
-# # log.info("Backing up to JSON...")
-# # self.json_storage.save_json_docs(docs)
-# # self.json_storage.save_seen(seen)
-# #
-# # log.info("✅ Finished – total unique reviews: %s", len(docs))
-# #
-# # end_time = time.time()
-# # elapsed_time = end_time - start_time
-# # log.info(f"Execution completed in {elapsed_time:.2f} seconds")
-# #
-# # finally:
-# # driver.quit()
-# # if self.mongodb:
-# # self.mongodb.close()
diff --git a/reverse_engineer_date_formatter.py b/reverse_engineer_date_formatter.py
deleted file mode 100644
index f1ffb41..0000000
--- a/reverse_engineer_date_formatter.py
+++ /dev/null
@@ -1,198 +0,0 @@
-#!/usr/bin/env python3
-"""
-Reverse-engineer Google's date formatting library to understand:
-1. What library they use
-2. All possible date format patterns
-3. Time range boundaries for each pattern
-"""
-import json
-import re
-from seleniumbase import Driver
-import time
-
-url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"
-
-print("Starting browser...")
-driver = Driver(uc=True, headless=False)
-
-try:
- print(f"Loading URL: {url}")
- driver.get(url)
- time.sleep(8)
-
- # Script to find date formatting function
- find_formatter_script = """
- const results = {
- scripts: [],
- potential_formatters: [],
- date_strings: []
- };
-
- // 1. Search all script tags for date-related code
- const scriptTags = document.querySelectorAll('script');
- let scriptContent = '';
-
- scriptTags.forEach((script, idx) => {
- const content = script.textContent || script.innerText;
- if (content) {
- scriptContent += content + '\\n';
-
- // Look for date formatting patterns
- if (content.includes('ago') || content.includes('month') || content.includes('year')) {
- const snippet = content.substring(0, 500);
- results.scripts.push({
- index: idx,
- snippet: snippet,
- length: content.length
- });
- }
- }
- });
-
- // 2. Search for common date formatting library signatures
- const librarySignatures = [
- 'moment',
- 'date-fns',
- 'dayjs',
- 'luxon',
- 'timeago',
- 'formatRelative',
- 'relativeTime',
- 'fromNow'
- ];
-
- librarySignatures.forEach(sig => {
- if (scriptContent.includes(sig)) {
- results.potential_formatters.push(sig);
- }
- });
-
- // 3. Try to find the actual formatting function by injecting test dates
- // Look for Google's internal date formatter
- const googleFormatters = [];
- for (let key in window) {
- if (typeof window[key] === 'function') {
- const funcStr = window[key].toString();
- if (funcStr.includes('ago') && funcStr.includes('month')) {
- googleFormatters.push({
- name: key,
- signature: funcStr.substring(0, 200)
- });
- }
- }
- }
- results.google_formatters = googleFormatters;
-
- // 4. Extract all "X ago" patterns from the page
- const pageText = document.body.innerText;
- const agoPatterns = pageText.match(/\\d+\\s+(second|minute|hour|day|week|month|year)s?\\s+ago/gi) || [];
- const singlePatterns = pageText.match(/a\\s+(second|minute|hour|day|week|month|year)\\s+ago/gi) || [];
-
- results.date_strings = [...new Set([...agoPatterns, ...singlePatterns])];
-
- return results;
- """
-
- print("Searching for date formatting code...")
- formatter_info = driver.execute_script(find_formatter_script)
-
- print("\n" + "="*80)
- print("FINDINGS:")
- print("="*80)
-
- print(f"\n1. Scripts with date-related code: {len(formatter_info.get('scripts', []))}")
-
- print(f"\n2. Potential libraries detected: {formatter_info.get('potential_formatters', [])}")
-
- print(f"\n3. Google formatter functions found: {len(formatter_info.get('google_formatters', []))}")
- for gf in formatter_info.get('google_formatters', [])[:3]:
- print(f" - {gf['name']}: {gf['signature'][:100]}...")
-
- print(f"\n4. Date patterns found on page:")
- date_strings = formatter_info.get('date_strings', [])
- for ds in sorted(set(date_strings))[:20]:
- print(f" - '{ds}'")
-
- # Now let's test different timestamps to understand the boundaries
- print("\n" + "="*80)
- print("TESTING TIME RANGE BOUNDARIES:")
- print("="*80)
-
- # We need to inject JavaScript that can format dates like Google does
- # Let's search the actual DOM for the pattern
- boundary_test_script = """
- // Collect all unique date strings from reviews
- const dateElements = document.querySelectorAll('span.rsqaWe');
- const dateStrings = new Set();
-
- dateElements.forEach(elem => {
- const text = elem.textContent.trim();
- if (text) {
- dateStrings.add(text);
- }
- });
-
- return Array.from(dateStrings).sort();
- """
-
- all_date_strings = driver.execute_script(boundary_test_script)
-
- print(f"\nFound {len(all_date_strings)} unique date formats:")
- for ds in all_date_strings[:30]:
- print(f" - '{ds}'")
-
- # Analyze the patterns
- print("\n" + "="*80)
- print("PATTERN ANALYSIS:")
- print("="*80)
-
- patterns = {
- 'seconds': [],
- 'minutes': [],
- 'hours': [],
- 'days': [],
- 'weeks': [],
- 'months': [],
- 'years': []
- }
-
- for ds in all_date_strings:
- ds_lower = ds.lower()
- if 'second' in ds_lower:
- patterns['seconds'].append(ds)
- elif 'minute' in ds_lower:
- patterns['minutes'].append(ds)
- elif 'hour' in ds_lower:
- patterns['hours'].append(ds)
- elif 'day' in ds_lower:
- patterns['days'].append(ds)
- elif 'week' in ds_lower:
- patterns['weeks'].append(ds)
- elif 'month' in ds_lower:
- patterns['months'].append(ds)
- elif 'year' in ds_lower:
- patterns['years'].append(ds)
-
- for unit, examples in patterns.items():
- if examples:
- print(f"\n{unit.upper()}:")
- for ex in examples[:5]:
- print(f" - '{ex}'")
-
- # Save all data
- output = {
- 'formatter_info': formatter_info,
- 'all_date_strings': all_date_strings,
- 'pattern_analysis': {k: v for k, v in patterns.items() if v}
- }
-
- with open('/tmp/google_date_formatter_analysis.json', 'w') as f:
- json.dump(output, f, indent=2)
-
- print("\n" + "="*80)
- print("Full analysis saved to: /tmp/google_date_formatter_analysis.json")
- print("="*80)
-
-finally:
- driver.quit()
- print("\nBrowser closed")
diff --git a/reverse_engineer_date_formatter_v2.py b/reverse_engineer_date_formatter_v2.py
deleted file mode 100644
index ba95f8e..0000000
--- a/reverse_engineer_date_formatter_v2.py
+++ /dev/null
@@ -1,175 +0,0 @@
-#!/usr/bin/env python3
-"""
-Reverse-engineer Google's date formatting patterns by scraping reviews in English
-"""
-import json
-from modules.fast_scraper import fast_scrape_reviews
-
-url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"
-
-print("Scraping reviews in English...")
-result = fast_scrape_reviews(url, headless=True)
-
-reviews = result.get('reviews', [])
-print(f"\nExtracted {len(reviews)} reviews")
-
-if reviews:
- # Collect all unique date strings
- date_strings = set()
- for rev in reviews:
- date_text = rev.get('date_text')
- if date_text:
- date_strings.add(date_text)
-
- print(f"\nFound {len(date_strings)} unique date formats:")
- for ds in sorted(date_strings):
- print(f" '{ds}'")
-
- # Analyze patterns
- print("\n" + "="*80)
- print("PATTERN ANALYSIS:")
- print("="*80)
-
- patterns = {
- 'seconds': [],
- 'minutes': [],
- 'hours': [],
- 'days': [],
- 'weeks': [],
- 'months': [],
- 'years': []
- }
-
- for ds in date_strings:
- ds_lower = ds.lower()
- if 'second' in ds_lower:
- patterns['seconds'].append(ds)
- elif 'minute' in ds_lower:
- patterns['minutes'].append(ds)
- elif 'hour' in ds_lower:
- patterns['hours'].append(ds)
- elif 'day' in ds_lower:
- patterns['days'].append(ds)
- elif 'week' in ds_lower:
- patterns['weeks'].append(ds)
- elif 'month' in ds_lower:
- patterns['months'].append(ds)
- elif 'year' in ds_lower:
- patterns['years'].append(ds)
-
- for unit, examples in sorted(patterns.items()):
- if examples:
- print(f"\n{unit.upper()} ({len(examples)} patterns):")
- for ex in sorted(examples):
- print(f" '{ex}'")
-
- # Identify the specific patterns
- print("\n" + "="*80)
- print("GOOGLE MAPS DATE FORMAT PATTERNS (English):")
- print("="*80)
-
- print("\nPattern Structure:")
- print("-" * 80)
-
- single_unit_patterns = [] # "a month ago"
- plural_patterns = [] # "3 months ago"
-
- for ds in sorted(date_strings):
- if ds.startswith('a '):
- single_unit_patterns.append(ds)
- elif ds.split()[0].isdigit():
- plural_patterns.append(ds)
-
- print(f"\nSingular (a X ago): {len(single_unit_patterns)} patterns")
- for p in sorted(single_unit_patterns):
- print(f" '{p}'")
-
- print(f"\nPlural (N Xs ago): {len(plural_patterns)} patterns")
- for p in sorted(plural_patterns):
- print(f" '{p}'")
-
- # Determine time ranges
- print("\n" + "="*80)
- print("TIME RANGE BOUNDARIES:")
- print("="*80)
-
- # Extract numbers from plural patterns
- import re
- from collections import defaultdict
-
- unit_values = defaultdict(list)
- for ds in date_strings:
- match = re.match(r'(\d+)\s+(\w+)\s+ago', ds.lower())
- if match:
- number = int(match.group(1))
- unit = match.group(2).rstrip('s') # Remove plural 's'
- unit_values[unit].append(number)
-
- for unit, values in sorted(unit_values.items()):
- if values:
- print(f"\n{unit.upper()}:")
- print(f" Range: {min(values)} - {max(values)}")
- print(f" Values found: {sorted(set(values))}")
-
- # Save analysis
- output = {
- 'total_reviews': len(reviews),
- 'unique_date_formats': len(date_strings),
- 'all_date_strings': sorted(list(date_strings)),
- 'patterns_by_unit': {k: sorted(v) for k, v in patterns.items() if v},
- 'singular_patterns': sorted(single_unit_patterns),
- 'plural_patterns': sorted(plural_patterns),
- 'value_ranges': {unit: {'min': min(values), 'max': max(values), 'values': sorted(set(values))}
- for unit, values in unit_values.items() if values}
- }
-
- with open('/tmp/google_date_patterns_english.json', 'w') as f:
- json.dump(output, f, indent=2)
-
- print("\n" + "="*80)
- print("Analysis saved to: /tmp/google_date_patterns_english.json")
- print("="*80)
-
- # Now let's determine the EXACT library/algorithm Google uses
- print("\n" + "="*80)
- print("REVERSE-ENGINEERING GOOGLE'S ALGORITHM:")
- print("="*80)
-
- print("\nBased on the patterns, Google's relative date formatter:")
- print("-" * 80)
-
- print("\n1. FORMAT STRUCTURE:")
- print(" Single unit: 'a {unit} ago'")
- print(" Multiple: '{number} {unit}s ago'")
-
- print("\n2. UNIT SELECTION (hypothesis):")
- if 'second' in unit_values:
- print(f" - Seconds: Used for 0-59 seconds ago")
- if 'minute' in unit_values:
- print(f" - Minutes: Used for 1-59 minutes ago")
- if 'hour' in unit_values:
- print(f" - Hours: Used for 1-23 hours ago")
- if 'day' in unit_values:
- print(f" - Days: Used for 1-6 days ago")
- if 'week' in unit_values:
- print(f" - Weeks: Used for 1-3 weeks ago")
- if 'month' in unit_values:
- print(f" - Months: Used for 1-11 months ago")
- if 'year' in unit_values:
- print(f" - Years: Used for 1+ years ago")
-
- print("\n3. BOUNDARY THRESHOLDS (estimated):")
- print(" 60 seconds = switch to minutes")
- print(" 60 minutes = switch to hours")
- print(" 24 hours = switch to days")
- print(" 7 days = switch to weeks")
- print(" ~30 days (4 weeks) = switch to months")
- print(" 12 months = switch to years")
-
- print("\n4. UNCERTAINTY RANGES:")
- print(" 'a month ago' = 30-59 days ago (±15 days)")
- print(" '2 months ago' = 60-89 days ago (±15 days)")
- print(" 'a year ago' = 365-729 days ago (±6 months)")
-
-else:
- print("No reviews extracted!")
diff --git a/start.py b/start.py
deleted file mode 100644
index 87cc4bf..0000000
--- a/start.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-"""
-Google‑Maps review scraper with MongoDB integration
-=================================================
-
-Main entry point for the scraper.
-"""
-
-from modules.cli import parse_arguments
-from modules.config import load_config
-from modules.scraper import GoogleReviewsScraper
-
-
-def main():
- """Main function to initialize and run the scraper"""
- # Parse command line arguments
- args = parse_arguments()
-
- # Load configuration
- config = load_config(args.config)
-
- # Override config with command line arguments if provided
- if args.headless:
- config["headless"] = True
- if args.sort_by is not None:
- config["sort_by"] = args.sort_by
- if args.stop_on_match:
- config["stop_on_match"] = True
- if args.url is not None:
- config["url"] = args.url
- if args.overwrite_existing:
- config["overwrite_existing"] = True
- if args.use_mongodb is not None:
- config["use_mongodb"] = args.use_mongodb
-
- # Handle arguments for date conversion and image downloading
- if args.convert_dates is not None:
- config["convert_dates"] = args.convert_dates
- if args.download_images is not None:
- config["download_images"] = args.download_images
- if args.image_dir is not None:
- config["image_dir"] = args.image_dir
- if args.download_threads is not None:
- config["download_threads"] = args.download_threads
-
- # Handle arguments for local image paths and URL replacement
- if args.store_local_paths is not None:
- config["store_local_paths"] = args.store_local_paths
- if args.replace_urls is not None:
- config["replace_urls"] = args.replace_urls
- if args.custom_url_base is not None:
- config["custom_url_base"] = args.custom_url_base
- if args.custom_url_profiles is not None:
- config["custom_url_profiles"] = args.custom_url_profiles
- if args.custom_url_reviews is not None:
- config["custom_url_reviews"] = args.custom_url_reviews
- if args.preserve_original_urls is not None:
- config["preserve_original_urls"] = args.preserve_original_urls
-
- # Handle custom parameters
- if args.custom_params is not None:
- if "custom_params" not in config:
- config["custom_params"] = {}
- # Update config with the provided custom parameters
- config["custom_params"].update(args.custom_params)
-
- # Handle API interception option
- if args.enable_api_intercept:
- config["enable_api_intercept"] = True
-
- # Initialize and run scraper
- scraper = GoogleReviewsScraper(config)
- scraper.scrape()
-
-
-if __name__ == "__main__":
- main()
diff --git a/start_api_244.py b/start_api_244.py
deleted file mode 100644
index cf9c0a4..0000000
--- a/start_api_244.py
+++ /dev/null
@@ -1,288 +0,0 @@
-#!/usr/bin/env python3
-"""
-API-Only 244 Scraper - Attempt to get ALL 244 reviews via API alone.
-
-Strategy:
-1. More patient scrolling (more scrolls, longer waits)
-2. Collect responses more frequently
-3. Extra end-of-list collection
-4. Slower timing near the end to ensure API completes
-
-Goal: Get all 244 reviews via API without DOM parsing
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def api_244_scrape():
- """Get all 244 reviews purely via API with aggressive collection."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("API-244 SCRAPER - Getting ALL 244 reviews via API...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Step 1: Navigate
- driver.get(url)
- time.sleep(1.5)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4)
- break
- except:
- continue
-
- # Wait for page stability
- time.sleep(1.0)
-
- # Find pane
- pane = None
- try:
- wait = WebDriverWait(driver, 3)
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # Setup API interceptor
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(1.0) # Longer wait to ensure interceptor is ready
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll
- driver.execute_script(scroll_script)
- time.sleep(1.0) # Wait for first API response
-
- print("Scrolling with extended collection strategy...")
-
- # Extended scrolling - MORE scrolls, SLOWER timing
- max_scrolls = 50 # More scrolls to ensure we catch everything
- idle_scrolls = 0
- max_idle = 15 # Even more patience
- last_count = 0
- last_scroll_pos = 0
- scroll_stuck_count = 0
-
- for i in range(max_scrolls):
- # Scroll
- driver.execute_script(scroll_script)
-
- # Progressive timing - slower and slower
- if len(api_reviews) < 50:
- time.sleep(0.30) # Start moderate
- elif len(api_reviews) < 100:
- time.sleep(0.35)
- elif len(api_reviews) < 150:
- time.sleep(0.40)
- elif len(api_reviews) < 200:
- time.sleep(0.50)
- elif len(api_reviews) < 230:
- time.sleep(0.60) # Much slower near end
- else:
- time.sleep(0.80) # Very slow for final reviews
-
- # Collect responses
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- # Check if we got new reviews
- current_count = len(api_reviews)
- if current_count == last_count:
- idle_scrolls += 1
- else:
- idle_scrolls = 0
- if (i + 1) % 10 == 0:
- print(f" {current_count} reviews...")
-
- last_count = current_count
-
- # Check scroll position
- try:
- current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
- if current_scroll == last_scroll_pos:
- scroll_stuck_count += 1
- else:
- scroll_stuck_count = 0
- last_scroll_pos = current_scroll
- except:
- pass
-
- # Stop conditions - but only if we have at least 240 reviews
- if idle_scrolls >= max_idle and scroll_stuck_count >= 5 and current_count >= 240:
- print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
- break
-
- # AGGRESSIVE final collection phase
- print(f" Aggressive final collection (currently have {len(api_reviews)})...")
-
- # Do 10 more scrolls with very long waits
- for extra in range(10):
- driver.execute_script(scroll_script)
- time.sleep(1.2) # Very long wait
-
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- new_count = 0
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- new_count += 1
-
- if new_count > 0:
- print(f" +{new_count} more reviews (total: {len(api_reviews)})")
- except:
- pass
-
- # Ultra-final wait and collect
- time.sleep(2.0)
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n{'='*50}")
- print(f"✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
-
- if elapsed > 0:
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
-
- print(f"{'='*50}")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL 244 reviews via API!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews - may need DOM parsing")
- else:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
-
- print()
-
- # Save
- with open('google_reviews_api_244.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_api_244.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = api_244_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_complete.py b/start_complete.py
deleted file mode 100644
index 05178b2..0000000
--- a/start_complete.py
+++ /dev/null
@@ -1,280 +0,0 @@
-#!/usr/bin/env python3
-"""
-Complete Scraper - Gets ALL reviews while staying fast.
-
-Strategy:
-1. Scroll until no new reviews for 5 consecutive scrolls
-2. Check scroll position to detect end
-3. Do extra scrolls at the end to catch stragglers
-4. Adaptive timing - faster at start, slower at end
-
-Target: Get all 244 reviews in ~22-25 seconds
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def complete_scrape():
- """Get ALL reviews with intelligent scrolling."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("COMPLETE SCRAPER - Getting ALL reviews...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Step 1: Navigate
- driver.get(url)
- time.sleep(1.5)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4)
- break
- except:
- continue
-
- # Wait for page stability
- time.sleep(1.0)
-
- # Find pane
- pane = None
- try:
- wait = WebDriverWait(driver, 3)
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # Wait for initial reviews to load
- time.sleep(1.5)
-
- # Setup API interceptor
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(1.0) # Important: wait for interceptor to be ready
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll to get first API response
- driver.execute_script(scroll_script)
- time.sleep(1.0) # Wait for first API response
-
- print("Scrolling with intelligent stopping...")
-
- # Intelligent scrolling
- max_scrolls = 60 # Higher limit to ensure we get everything
- idle_scrolls = 0 # Count scrolls with no new reviews
- max_idle = 12 # More patience - stop after 12 scrolls with no new reviews
- last_count = 0
- last_scroll_pos = 0
- scroll_stuck_count = 0
-
- for i in range(max_scrolls):
- # Scroll
- driver.execute_script(scroll_script)
-
- # Adaptive timing - faster at start, slower near end
- if len(api_reviews) < 100:
- time.sleep(0.27) # Fast at beginning
- elif len(api_reviews) < 200:
- time.sleep(0.30) # Medium in middle
- elif len(api_reviews) < 235:
- time.sleep(0.40) # Slower near end
- else:
- time.sleep(0.50) # Very slow at the very end to catch stragglers
-
- # Collect responses
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- # Check if we got new reviews
- current_count = len(api_reviews)
- if current_count == last_count:
- idle_scrolls += 1
- else:
- idle_scrolls = 0
- if (i + 1) % 10 == 0:
- print(f" {current_count} reviews...")
-
- last_count = current_count
-
- # Check scroll position to detect if stuck at bottom
- try:
- current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
- if current_scroll == last_scroll_pos:
- scroll_stuck_count += 1
- else:
- scroll_stuck_count = 0
- last_scroll_pos = current_scroll
- except:
- pass
-
- # Stop conditions
- if idle_scrolls >= max_idle and scroll_stuck_count >= 3:
- print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
- break
-
- # Extra thorough collection at the end
- print(f" Final collection sweep (currently have {len(api_reviews)})...")
-
- # Do a few more scrolls with longer waits
- for extra in range(5):
- driver.execute_script(scroll_script)
- time.sleep(0.8) # Longer wait to ensure API completes
-
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- new_count = 0
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- new_count += 1
-
- if new_count > 0:
- print(f" +{new_count} more reviews (total: {len(api_reviews)})")
- except:
- pass
-
- # Final wait and collect
- time.sleep(1.0)
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)} (target: 244)")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL reviews!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
-
- print()
-
- # Save
- with open('google_reviews_complete.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_complete.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = complete_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_dom_only_fast.py b/start_dom_only_fast.py
deleted file mode 100644
index ab806a4..0000000
--- a/start_dom_only_fast.py
+++ /dev/null
@@ -1,331 +0,0 @@
-#!/usr/bin/env python3
-"""
-DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.
-
-Strategy:
-1. Scroll to load all reviews
-2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
-3. Should be faster and simpler than API + DOM hybrid
-
-Target: ~20-25 seconds for all 244 reviews with simpler code
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def extract_all_reviews_js(driver):
- """Extract ALL reviews using JavaScript - single fast operation."""
-
- extract_script = """
- const reviews = [];
- const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
-
- for (let i = 0; i < elements.length; i++) {
- const elem = elements[i];
- const review = {};
-
- try {
- // Author
- const authorElem = elem.querySelector('div.d4r55');
- review.author = authorElem ? authorElem.textContent.trim() : null;
-
- // Rating
- const ratingElem = elem.querySelector('span.kvMYJc');
- if (ratingElem) {
- const ariaLabel = ratingElem.getAttribute('aria-label');
- if (ariaLabel) {
- const match = ariaLabel.match(/\\d+/);
- review.rating = match ? parseFloat(match[0]) : null;
- }
- }
-
- // Text
- const textElem = elem.querySelector('span.wiI7pd');
- review.text = textElem ? textElem.textContent.trim() : null;
-
- // Date
- const dateElem = elem.querySelector('span.rsqaWe');
- review.date_text = dateElem ? dateElem.textContent.trim() : null;
-
- // Avatar
- const avatarElem = elem.querySelector('img.NBa7we');
- review.avatar_url = avatarElem ? avatarElem.src : null;
-
- // Profile URL
- const profileElem = elem.querySelector('button.WEBjve');
- review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
-
- if (review.author && review.date_text) {
- reviews.push(review);
- }
- } catch (e) {
- // Skip this review
- }
- }
-
- return reviews;
- """
-
- try:
- reviews_data = driver.execute_script(extract_script)
-
- # Add review IDs
- reviews = []
- for review_data in reviews_data:
- review_id = f"review_{hash(review_data['author'] + review_data['date_text'])}"
- review_data['review_id'] = review_id
- reviews.append(review_data)
-
- return reviews
-
- except Exception as e:
- print(f" Error in JavaScript extraction: {e}")
- return []
-
-
-def dom_only_fast_scrape():
- """Ultra-fast DOM-only scraping with JavaScript extraction."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Navigate
- driver.get(url)
- time.sleep(1.5) # Reduced from 2.0
-
- # Handle GDPR consent page (CRITICAL FIX!)
- if 'consent.google.com' in driver.current_url:
- try:
- # Click "Accept all" / "Aceptar todo"
- consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]')
- if not consent_btns:
- consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept"]')
- if consent_btns:
- consent_btns[0].click()
- time.sleep(1.5) # Reduced from 2.0
- except:
- pass
-
- # Dismiss cookie banner on Maps page
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.3) # Reduced from 0.4
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.3) # Reduced from 0.4
- break
- except:
- continue
-
- # Wait for page stability
- time.sleep(0.8) # Reduced from 1.0
-
- # Find pane
- pane = None
- try:
- wait = WebDriverWait(driver, 3)
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # CRITICAL: Wait for initial reviews to load
- time.sleep(1.2) # Reduced from 1.5
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll and VERIFY reviews are loading
- driver.execute_script(scroll_script)
- time.sleep(0.8) # Reduced from 1.0
-
- # Check if reviews are actually loading
- initial_count = driver.execute_script(
- "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
- )
-
- if initial_count < 5:
- # Reviews not loaded yet, wait more
- print(f" Waiting for reviews to load (found {initial_count})...")
- time.sleep(1.5) # Reduced from 2.0
- driver.execute_script(scroll_script)
- time.sleep(0.8)
- initial_count = driver.execute_script(
- "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
- )
-
- print(f"Scrolling to load all reviews (starting with {initial_count})...")
-
- # Fast scrolling to load all DOM elements
- # No hard limit - stops automatically via idle detection
- max_scrolls = 999999
- last_count = 0
- idle_count = 0
- last_scroll_pos = 0
-
- for i in range(max_scrolls):
- # Get current review count
- current_count = driver.execute_script(
- "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
- )
-
- # Scroll to load more
- prev_count = current_count
- driver.execute_script(scroll_script)
-
- # SMART WAIT: Wait until new reviews actually load (instead of fixed delay!)
- max_wait = 1.0 # Maximum 1 second
- wait_step = 0.05 # Check every 50ms
- waited = 0
-
- while waited < max_wait:
- time.sleep(wait_step)
- waited += wait_step
-
- new_count = driver.execute_script(
- "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
- )
-
- # If reviews loaded, continue immediately!
- if new_count > prev_count:
- break
-
- # If at bottom and no new reviews after 0.3s, we're done
- if waited >= 0.3 and new_count == prev_count:
- scroll_pos = driver.execute_script("return arguments[0].scrollTop;", pane)
- if scroll_pos == last_scroll_pos:
- idle_count += 1
- if idle_count >= 3:
- print(f" Reached end at {new_count} reviews")
- break
- last_scroll_pos = scroll_pos
- break
-
- current_count = new_count
-
- # Progress logging every 10 scrolls
- if (i + 1) % 10 == 0:
- print(f" {current_count} review elements loaded...")
-
- # Track for idle detection
- if current_count == prev_count:
- idle_count += 1
- if idle_count >= 3:
- break
- else:
- idle_count = 0
-
- last_count = current_count
-
- # Shorter final scroll
- for _ in range(2): # Reduced from 3
- driver.execute_script(scroll_script)
- time.sleep(0.3) # Reduced from 0.4
-
- scroll_time = time.time() - start_time
- print(f" Scrolling complete in {scroll_time:.2f}s")
-
- # Extract ALL reviews using JavaScript (fast!)
- print("Extracting reviews with JavaScript...")
- extract_start = time.time()
-
- all_reviews = extract_all_reviews_js(driver)
-
- extract_time = time.time() - extract_start
- print(f" Extraction complete in {extract_time:.2f}s")
-
- elapsed = time.time() - start_time
-
- print(f"\n{'='*50}")
- print(f"✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
- print(f"Time: {elapsed:.2f}s")
- print(f" - Scrolling: {scroll_time:.2f}s")
- print(f" - Extraction: {extract_time:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
- print(f"{'='*50}")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL 244 reviews!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
-
- print()
-
- # Save
- with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_dom_only_fast.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = dom_only_fast_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_fast.py b/start_fast.py
deleted file mode 100644
index fa0bcac..0000000
--- a/start_fast.py
+++ /dev/null
@@ -1,346 +0,0 @@
-#!/usr/bin/env python3
-"""
-Fast API-First Scraper - Optimized version of start.py
-
-Strategy:
-1. Open browser and navigate to reviews (~15 seconds)
-2. Scroll rapidly JUST to trigger API calls (~15 seconds)
-3. Collect all API responses during scrolling
-4. Parse reviews from API responses
-5. Skip DOM parsing entirely
-6. Exit immediately
-
-Expected time: ~30-40 seconds for 244 reviews (vs 155 seconds)
-Speed improvement: ~4-5x faster!
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from pathlib import Path
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-def load_config():
- """Load configuration from config.yaml"""
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def fast_scrape():
- """Fast API-first scraping."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- log.info("="*60)
- log.info("FAST API-FIRST SCRAPER")
- log.info("="*60)
- log.info(f"URL: {url[:80]}...")
- log.info(f"Mode: API-first (skip DOM parsing)")
- log.info("="*60 + "\n")
-
- start_time = time.time()
- api_reviews = {}
-
- # Create driver using SeleniumBase UC Mode (like original scraper)
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Step 1: Navigate to reviews
- log.info("Step 1: Opening Google Maps...")
- driver.get(url)
- time.sleep(2)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- log.info("✓ Cookie dialog dismissed")
- time.sleep(1)
- except:
- pass
-
- # Click reviews tab - comprehensive approach
- log.info("Step 2: Opening reviews tab...")
-
- # Review keywords for multiple languages
- review_keywords = [
- 'reviews', 'review', 'reseñas', 'reseña', 'opiniones', 'avis',
- 'bewertungen', 'recensioni', 'avaliações', 'ביקורות'
- ]
-
- clicked = False
- tab_selectors = [
- '.LRkQ2', # Primary
- '.hh2c6', # Alternative
- '[data-tab-index="1"]', # Tab index
- 'button[role="tab"]', # Button tabs
- 'div[role="tab"]', # Div tabs
- ]
-
- # Try each selector
- for selector in tab_selectors:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- try:
- # Check if this is the reviews tab
- text = (tab.text or '').lower()
- aria_label = (tab.get_attribute('aria-label') or '').lower()
-
- if any(keyword in text or keyword in aria_label for keyword in review_keywords):
- log.info(f"Found reviews tab with selector {selector}: '{tab.text}'")
- # Scroll into view
- driver.execute_script("arguments[0].scrollIntoView({block:'center'});", tab)
- time.sleep(0.5)
- # Click with JavaScript (most reliable)
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(1.5)
- log.info("✓ Reviews tab clicked")
- clicked = True
- break
- except:
- continue
- if clicked:
- break
- except:
- continue
-
- if not clicked:
- log.warning("Could not find/click reviews tab - may already be on reviews or page structure changed")
-
- # CRITICAL: Wait after clicking reviews tab for page to load
- log.info("Waiting for reviews page to fully load...")
- time.sleep(3)
-
- # Find reviews pane
- log.info("Step 3: Finding reviews pane...")
- log.info(f"Current URL: {driver.current_url}")
-
- pane = None
- pane_selectors = [
- 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Primary
- 'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Without role="main"
- 'div.m6QErb.WNBkOb.XiKgde', # Alternative class combination
- 'div[role="main"] div.m6QErb.XiKgde', # Simplified with XiKgde
- 'div.m6QErb.DxyBCb.XiKgde', # Another variant
- 'div[role="main"] div.m6QErb', # Simplified version
- 'div.m6QErb.DxyBCb', # Even more simplified
- 'div[role="main"]', # Most generic
- ]
-
- for selector in pane_selectors:
- try:
- log.info(f"Trying selector: {selector}")
- wait = WebDriverWait(driver, 5)
- pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
- log.info(f"✓ Found reviews pane with: {selector}")
- break
- except TimeoutException:
- log.debug(f"Pane not found with selector: {selector}")
- continue
-
- if not pane:
- log.error("Could not find reviews pane after all attempts!")
- log.error(f"Final URL: {driver.current_url}")
- # Save screenshot for debugging
- try:
- screenshot_path = 'pane_not_found.png'
- driver.save_screenshot(screenshot_path)
- log.info(f"Screenshot saved to {screenshot_path}")
- except:
- pass
- return []
-
- # Wait for initial reviews to load
- log.info("Waiting for initial reviews to render...")
- time.sleep(3)
-
- # Check if any review cards are present
- try:
- cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf')
- log.info(f"Found {len(cards)} initial review cards")
- except:
- log.warning("Could not find initial review cards")
-
- # Step 4: Setup API interceptor (AFTER finding pane)
- log.info("Step 4: Setting up API interception...")
- interceptor = GoogleMapsAPIInterceptor(driver)
- try:
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- log.info("✓ API interceptor ready - capturing network responses")
- except Exception as e:
- log.warning(f"Failed to setup interceptor: {e}")
- import traceback
- traceback.print_exc()
- time.sleep(2) # Extra wait for interception to be fully active
- log.info("")
-
- # Step 5: Rapid scrolling to trigger API calls
- log.info("="*60)
- log.info("Step 5: Rapid scrolling to trigger API calls")
- log.info("="*60)
-
- # Setup scroll script (same as original scraper)
- try:
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
- log.info("✓ Scroll script setup complete")
- except Exception as e:
- log.warning(f"Error setting up scroll script: {e}")
- scroll_script = "window.scrollBy(0, 300);" # Fallback
-
- # Verify interceptor is active
- try:
- is_injected = driver.execute_script("return window.__reviewInterceptorInjected === true;")
- stats = driver.execute_script("return window.__interceptorStats;")
- queue_length = driver.execute_script("return window.__interceptedResponses ? window.__interceptedResponses.length : -1;")
- log.info(f"Interceptor status: injected={is_injected}, queue={queue_length}, stats={stats}")
- except Exception as e:
- log.warning(f"Could not check interceptor status: {e}")
-
- # Trigger initial API call
- log.info("Triggering initial API call...")
- driver.execute_script(scroll_script)
- time.sleep(2) # Wait for first API response
- log.info("")
-
- # We need about 25 API calls for 244 reviews (10 per call)
- # Scroll rapidly - no DOM parsing!
- target_reviews = 240
- max_scrolls = 30
-
- for i in range(max_scrolls):
- # Fast scroll
- driver.execute_script(scroll_script)
- time.sleep(0.3) # Optimal timing - fast but captures all responses
-
- # Collect API responses
- try:
- responses = interceptor.get_intercepted_responses()
- if i == 5: # Debug on scroll 5
- log.info(f"DEBUG: Got {len(responses)} responses from interceptor")
-
- # Check browser console
- try:
- console_logs = driver.get_log('browser')
- interceptor_logs = [l for l in console_logs if 'API Interceptor' in l.get('message', '')]
- if interceptor_logs:
- log.info(f"DEBUG: Interceptor console logs:")
- for l in interceptor_logs[-10:]: # Last 10
- log.info(f" {l['message']}")
- else:
- log.info("DEBUG: No interceptor logs in console")
- except Exception as e:
- log.warning(f"Could not get console logs: {e}")
-
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- if i == 5: # Debug on scroll 5
- log.info(f"DEBUG: Parsed {len(parsed)} reviews from responses")
-
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
-
- if parsed:
- log.info(f"Scroll {i+1}: +{len(parsed)} reviews | Total: {len(api_reviews)}")
-
- # Exit early if we have enough
- if len(api_reviews) >= target_reviews:
- log.info(f"\n✓ Reached target of {target_reviews} reviews!")
- break
- except Exception as e:
- log.error(f"Error collecting API responses: {e}")
- import traceback
- traceback.print_exc()
-
- # Quick progress update
- if (i + 1) % 5 == 0 and i > 0:
- log.info(f"Progress: {i+1}/{max_scrolls} scrolls, {len(api_reviews)} reviews collected")
-
- elapsed = time.time() - start_time
-
- # Convert to list
- all_reviews = list(api_reviews.values())
-
- log.info("\n" + "="*60)
- log.info("✅ FAST SCRAPING COMPLETED!")
- log.info("="*60)
- log.info(f"Total reviews: {len(all_reviews)}")
- log.info(f"Scrolls performed: {i+1}")
- log.info(f"Time elapsed: {elapsed:.2f} seconds")
- if all_reviews:
- log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
- log.info("="*60 + "\n")
-
- # Save results
- output_file = 'google_reviews_fast.json'
- with open(output_file, 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- log.info(f"💾 Saved {len(all_reviews)} reviews to {output_file}")
-
- # Show sample
- if all_reviews:
- log.info("\n📝 Sample review:")
- sample = all_reviews[0]
- log.info(f" Author: {sample['author']}")
- log.info(f" Rating: {sample['rating']}★")
- log.info(f" Date: {sample['date_text']}")
- if sample['text']:
- log.info(f" Text: {sample['text'][:80]}...")
-
- # Stats comparison
- log.info("\n" + "="*60)
- log.info("SPEED COMPARISON")
- log.info("="*60)
- log.info(f"Old approach: ~155 seconds for 244 reviews")
- log.info(f"Fast approach: ~{elapsed:.0f} seconds for {len(all_reviews)} reviews")
- if elapsed > 0:
- log.info(f"Improvement: {155/elapsed:.1f}x faster! 🚀")
- log.info("="*60 + "\n")
-
- return all_reviews
-
- finally:
- # Always close the driver
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = fast_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- log.info("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- log.error(f"Fatal error: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_fastest_stable.py b/start_fastest_stable.py
deleted file mode 100644
index af91fe0..0000000
--- a/start_fastest_stable.py
+++ /dev/null
@@ -1,307 +0,0 @@
-#!/usr/bin/env python3
-"""
-FASTEST STABLE Scraper - Best of both worlds.
-
-Strategy:
-1. Ultra-fast API scrolling (proven stable) → 234 reviews in ~19s
-2. Instant JavaScript DOM extraction → 10 missing reviews in ~0.5s
-3. Total: ~20 seconds for all 244 reviews with 100% stability
-
-Combines stability of API approach with speed of JavaScript extraction.
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def extract_missing_reviews_js(driver, max_reviews=25):
- """Ultra-fast JavaScript extraction for missing reviews."""
-
- extract_script = """
- const reviews = [];
- const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
- const maxCount = Math.min(arguments[0], elements.length);
-
- for (let i = 0; i < maxCount; i++) {
- const elem = elements[i];
- const review = {};
-
- try {
- const authorElem = elem.querySelector('div.d4r55');
- review.author = authorElem ? authorElem.textContent.trim() : null;
-
- const ratingElem = elem.querySelector('span.kvMYJc');
- if (ratingElem) {
- const ariaLabel = ratingElem.getAttribute('aria-label');
- if (ariaLabel) {
- const match = ariaLabel.match(/\\d+/);
- review.rating = match ? parseFloat(match[0]) : null;
- }
- }
-
- const textElem = elem.querySelector('span.wiI7pd');
- review.text = textElem ? textElem.textContent.trim() : null;
-
- const dateElem = elem.querySelector('span.rsqaWe');
- review.date_text = dateElem ? dateElem.textContent.trim() : null;
-
- const avatarElem = elem.querySelector('img.NBa7we');
- review.avatar_url = avatarElem ? avatarElem.src : null;
-
- const profileElem = elem.querySelector('button.WEBjve');
- review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
-
- if (review.author && review.date_text) {
- reviews.push(review);
- }
- } catch (e) {
- // Skip
- }
- }
- return reviews;
- """
-
- try:
- reviews_data = driver.execute_script(extract_script, max_reviews)
-
- reviews = []
- for review_data in reviews_data:
- review_id = f"dom_{hash(review_data['author'] + review_data['date_text'])}"
- review_data['review_id'] = review_id
- reviews.append(review_data)
-
- return reviews
- except Exception as e:
- return []
-
-
-def fastest_stable_scrape():
- """Get ALL 244 reviews with ultra-fast API + instant JS extraction."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("FASTEST STABLE SCRAPER - Ultra-fast API + instant JS...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Navigate
- driver.get(url)
- time.sleep(1.5)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4)
- break
- except:
- continue
-
- # Wait for stability
- time.sleep(1.0)
-
- # Find pane
- pane = None
- try:
- wait = WebDriverWait(driver, 3)
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # Wait for initial reviews to load (critical for stability)
- time.sleep(1.5)
-
- # Setup API interceptor
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(1.0) # Important: wait for interceptor to be ready
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll to get first API response
- driver.execute_script(scroll_script)
- time.sleep(1.0) # Wait for first API response
-
- print("[Phase 1] Ultra-fast API scrolling...")
-
- # Ultra-fast API scrolling
- target_reviews = 240
- max_scrolls = 35
-
- for i in range(max_scrolls):
- driver.execute_script(scroll_script)
- time.sleep(0.27) # Optimal timing
-
- # API collection
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
-
- if (i + 1) % 10 == 0:
- print(f" {len(api_reviews)} reviews...")
-
- if len(api_reviews) >= target_reviews:
- break
- except:
- pass
-
- # Final API collection
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- api_time = time.time() - start_time
- print(f" ✅ Phase 1: {len(api_reviews)} reviews in {api_time:.2f}s")
-
- # [Phase 2] Instant JavaScript extraction for missing reviews
- missing = 244 - len(api_reviews)
- if missing > 0:
- print(f"\n[Phase 2] Fast JS extraction for {missing} missing reviews...")
-
- # Scroll to top (missing reviews likely at top)
- driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
- time.sleep(0.3)
-
- # Extract with JavaScript
- dom_reviews = extract_missing_reviews_js(driver, max_reviews=min(missing + 10, 25))
-
- # Build API keys for deduplication
- api_keys = set()
- for api_review in api_reviews.values():
- key = (api_review.get('author', ''), (api_review.get('date_text', '') or '')[:20])
- api_keys.add(key)
-
- # Add unique DOM reviews
- dom_added = 0
- for dom_review in dom_reviews:
- dom_key = (dom_review.get('author', ''), (dom_review.get('date_text', '') or '')[:20])
- if dom_key not in api_keys:
- api_reviews[dom_review['review_id']] = dom_review
- dom_added += 1
-
- dom_time = time.time() - start_time - api_time
- print(f" ✅ Phase 2: +{dom_added} reviews in {dom_time:.2f}s")
-
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n{'='*50}")
- print(f"✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
- print(f"{'='*50}")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL 244 reviews!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
-
- print()
-
- # Save
- with open('google_reviews_fastest_stable.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_fastest_stable.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = fastest_stable_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_hybrid_parallel.py b/start_hybrid_parallel.py
deleted file mode 100644
index c9e432c..0000000
--- a/start_hybrid_parallel.py
+++ /dev/null
@@ -1,286 +0,0 @@
-#!/usr/bin/env python3
-"""
-Hybrid Parallel Scraper - Best of both worlds.
-
-Strategy:
-1. Open browser and get to reviews page (~15s)
-2. Scroll quickly to collect ~5-10 continuation tokens (~5s)
-3. Make parallel API calls in browser using JavaScript (~2-3s)
-4. Total: ~22-25 seconds for 244 reviews
-
-This approach:
-- Uses browser's active session (no auth issues)
-- Collects tokens sequentially (required by API)
-- Makes parallel calls for remaining pages (fast!)
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def hybrid_parallel_scrape():
- """Hybrid approach: Sequential token collection + Parallel fetch."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- log.info("="*60)
- log.info("HYBRID PARALLEL SCRAPER")
- log.info("="*60)
- log.info(f"URL: {url[:80]}...")
- log.info(f"Mode: Sequential tokens + Parallel fetch")
- log.info("="*60 + "\n")
-
- start_time = time.time()
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # PHASE 1: Setup (~15s)
- log.info("Phase 1: Browser setup...")
- driver.get(url)
- time.sleep(2)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(1)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas']
- for selector in ['.LRkQ2', '.hh2c6', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(2)
- break
- except:
- continue
-
- time.sleep(3)
-
- # Find pane
- pane = None
- for selector in ['div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
- 'div.m6QErb.WNBkOb.XiKgde']:
- try:
- wait = WebDriverWait(driver, 5)
- pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
- break
- except:
- continue
-
- if not pane:
- log.error("Could not find pane")
- return []
-
- time.sleep(2)
-
- # Extract place ID
- place_id = None
- current_url = driver.current_url
- if '!1s' in current_url:
- parts = current_url.split('!1s')
- if len(parts) > 1:
- place_id = parts[1].split('!')[0]
-
- if not place_id:
- log.error("Could not extract place ID")
- return []
-
- log.info(f"✓ Setup complete (place_id: {place_id})\n")
-
- # PHASE 2: Collect tokens via scrolling (~5s)
- log.info("Phase 2: Collecting continuation tokens...")
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(1)
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Collect tokens by scrolling quickly
- tokens = []
- all_reviews = {}
-
- for i in range(8): # 8 scrolls to get ~8 tokens
- driver.execute_script(scroll_script)
- time.sleep(0.2) # Very fast scrolling
-
- # Collect responses
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in all_reviews:
- all_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
-
- # Extract continuation token from raw response
- for resp in responses:
- try:
- body = resp.get('body', '')
- if body.startswith(")]}'"):
- body = body[4:]
- data = json.loads(body)
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
- token = data[1]
- if token and token not in tokens:
- tokens.append(token)
- except:
- pass
-
- log.info(f"✓ Collected {len(tokens)} continuation tokens")
- log.info(f"✓ Got {len(all_reviews)} reviews from scrolling\n")
-
- # PHASE 3: Parallel fetch remaining pages (~2-3s)
- if len(tokens) > 0:
- log.info("Phase 3: Parallel fetch of remaining pages...")
-
- parallel_script = """
- async function fetchPages(placeId, tokens) {
- const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
- const results = [];
-
- const promises = tokens.map((token, idx) => {
- const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
- const params = new URLSearchParams({
- authuser: '0',
- hl: 'es',
- gl: 'es',
- pb: pb
- });
-
- return fetch(`${baseUrl}?${params}`)
- .then(r => r.text())
- .then(text => {
- const body = text.startsWith(")]}'") ? text.substring(4) : text;
- return {idx, data: JSON.parse(body)};
- })
- .catch(e => null);
- });
-
- const settled = await Promise.all(promises);
- return settled.filter(r => r !== null);
- }
-
- return await fetchPages(arguments[0], arguments[1]);
- """
-
- try:
- parallel_start = time.time()
- results = driver.execute_async_script(parallel_script, place_id, tokens[:15]) # Limit to 15 parallel
- parallel_time = time.time() - parallel_start
-
- log.info(f"✓ Parallel fetch completed in {parallel_time:.2f}s")
- log.info(f" Received {len(results)} responses")
-
- # Parse parallel results
- for result in results:
- if result and 'data' in result:
- try:
- parsed = interceptor._parse_listugcposts_response(result['data'])
- for review in parsed:
- if review.review_id and review.review_id not in all_reviews:
- all_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except Exception as e:
- log.debug(f"Parse error: {e}")
-
- log.info(f"✓ Total reviews after parallel fetch: {len(all_reviews)}\n")
-
- except Exception as e:
- log.warning(f"Parallel fetch failed: {e}")
-
- reviews_list = list(all_reviews.values())
- elapsed = time.time() - start_time
-
- log.info("="*60)
- log.info("✅ HYBRID PARALLEL SCRAPING COMPLETED!")
- log.info("="*60)
- log.info(f"Total reviews: {len(reviews_list)}")
- log.info(f"Total time: {elapsed:.2f} seconds")
- log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
- log.info("="*60 + "\n")
-
- # Save
- with open('google_reviews_hybrid.json', 'w', encoding='utf-8') as f:
- json.dump(reviews_list, f, indent=2, ensure_ascii=False)
-
- log.info(f"💾 Saved {len(reviews_list)} reviews to google_reviews_hybrid.json")
-
- if reviews_list:
- log.info("\n📝 Sample:")
- s = reviews_list[0]
- log.info(f" {s['author']} - {s['rating']}★ - {s['date_text']}")
-
- log.info("\n" + "="*60)
- log.info("SPEED COMPARISON")
- log.info("="*60)
- log.info(f"Old DOM: ~155s for 244 reviews (1.0x)")
- log.info(f"Fast scrolling: ~29s for 234 reviews (5.3x)")
- log.info(f"Hybrid parallel: ~{elapsed:.0f}s for {len(reviews_list)} reviews ({155/elapsed:.1f}x)! 🚀")
- log.info("="*60 + "\n")
-
- return reviews_list
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = hybrid_parallel_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- log.info("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- log.error(f"Fatal error: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_optimized_hybrid.py b/start_optimized_hybrid.py
deleted file mode 100644
index 529c583..0000000
--- a/start_optimized_hybrid.py
+++ /dev/null
@@ -1,318 +0,0 @@
-#!/usr/bin/env python3
-"""
-OPTIMIZED HYBRID Scraper - True parallel with minimal overhead.
-
-Strategy:
-1. Ultra-fast API scrolling (no DOM parsing during scroll!)
-2. Quick DOM count check near end (minimal overhead)
-3. If needed, targeted DOM parse at very end for missing reviews
-4. Goal: ~22-25s for all 244 reviews
-
-Key: Keep scroll loop FAST, only parse DOM if absolutely needed at the very end.
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def quick_dom_parse_top_reviews(driver, count=15):
- """Quick parse of just the top N reviews from DOM."""
- dom_reviews = []
-
- try:
- # Get only first N review elements (the ones most likely to be missing from API)
- review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:count]
-
- for elem in review_elements:
- try:
- review_data = {}
-
- # Author
- try:
- author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55')
- review_data['author'] = author_elem.text
- except:
- review_data['author'] = None
-
- # Rating
- try:
- rating_elem = elem.find_element(By.CSS_SELECTOR, 'span.kvMYJc')
- rating_attr = rating_elem.get_attribute('aria-label')
- if rating_attr:
- rating_parts = rating_attr.split()
- if rating_parts:
- review_data['rating'] = float(rating_parts[0])
- except:
- review_data['rating'] = None
-
- # Text
- try:
- text_elem = elem.find_element(By.CSS_SELECTOR, 'span.wiI7pd')
- review_data['text'] = text_elem.text
- except:
- review_data['text'] = None
-
- # Date
- try:
- date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe')
- review_data['date_text'] = date_elem.text
- except:
- review_data['date_text'] = None
-
- # Avatar
- try:
- avatar_elem = elem.find_element(By.CSS_SELECTOR, 'img.NBa7we')
- review_data['avatar_url'] = avatar_elem.get_attribute('src')
- except:
- review_data['avatar_url'] = None
-
- # Profile URL
- try:
- profile_elem = elem.find_element(By.CSS_SELECTOR, 'button.WEBjve')
- review_data['profile_url'] = profile_elem.get_attribute('data-review-id')
- except:
- review_data['profile_url'] = None
-
- # Generate ID
- if review_data.get('author'):
- review_id = f"dom_{hash(str(review_data.get('author', '')) + str(review_data.get('date_text', '')))}"
- review_data['review_id'] = review_id
- dom_reviews.append(review_data)
-
- except:
- continue
-
- except Exception as e:
- pass
-
- return dom_reviews
-
-
-def optimized_hybrid_scrape():
- """Ultra-fast API scrolling + minimal targeted DOM parse."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("OPTIMIZED HYBRID SCRAPER - Ultra-fast API + minimal DOM...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Navigate
- driver.get(url)
- time.sleep(1.5)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4)
- break
- except:
- continue
-
- # Brief wait for reviews page (balance speed vs stability)
- time.sleep(1.0) # Reduced from 3s but needed for stability
-
- # Find pane - use most common selector directly
- pane = None
- try:
- wait = WebDriverWait(driver, 3) # Reduced from 5s
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # Setup API interceptor immediately
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(0.3) # Minimal wait for interceptor
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll
- driver.execute_script(scroll_script)
- time.sleep(0.3) # Minimal initial trigger wait
-
- print("Ultra-fast API scrolling...")
-
- # FAST API-only scrolling (NO DOM parsing overhead!)
- max_scrolls = 35
- for i in range(max_scrolls):
- driver.execute_script(scroll_script)
- time.sleep(0.27)
-
- # API collection only
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- if (i + 1) % 10 == 0:
- print(f" {len(api_reviews)} reviews...")
-
- # Final API collection
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- api_time = time.time() - start_time
- print(f" ✅ API complete: {len(api_reviews)} reviews in {api_time:.2f}s")
-
- # Targeted DOM parse ONLY if we're missing reviews
- missing = 244 - len(api_reviews)
- if missing > 0:
- print(f"\nQuick DOM parse for {missing} missing reviews...")
-
- # Scroll to top
- driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
- time.sleep(0.5)
-
- # Quick parse of top reviews (most likely to be missing)
- dom_reviews = quick_dom_parse_top_reviews(driver, count=min(missing + 5, 20))
-
- # Build API keys
- api_keys = set()
- for api_review in api_reviews.values():
- key = (
- api_review.get('author', ''),
- (api_review.get('date_text', '') or '')[:20]
- )
- api_keys.add(key)
-
- # Add unique DOM reviews
- dom_added = 0
- for dom_review in dom_reviews:
- dom_key = (
- dom_review.get('author', ''),
- (dom_review.get('date_text', '') or '')[:20]
- )
- if dom_key not in api_keys and dom_review.get('review_id'):
- api_reviews[dom_review['review_id']] = dom_review
- dom_added += 1
-
- dom_time = time.time() - start_time - api_time
- print(f" ✅ DOM complete: +{dom_added} reviews in {dom_time:.2f}s")
-
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n{'='*50}")
- print(f"✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
- print(f"{'='*50}")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL 244 reviews!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
-
- print()
-
- # Save
- with open('google_reviews_optimized_hybrid.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_optimized_hybrid.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = optimized_hybrid_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_parallel.py b/start_parallel.py
deleted file mode 100644
index 6d9b6df..0000000
--- a/start_parallel.py
+++ /dev/null
@@ -1,360 +0,0 @@
-#!/usr/bin/env python3
-"""
-Parallel API Scraper - Capture session, then parallel API calls.
-
-Strategy:
-1. Open browser and navigate to reviews (~15 seconds)
-2. Capture cookies and place ID from active session (~2 seconds)
-3. Make parallel API calls using requests (~5-10 seconds)
-4. Close browser immediately
-
-Expected time: ~20-30 seconds for 244 reviews (vs 155 seconds)
-Speed improvement: ~5-7x faster!
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import requests
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-def load_config():
- """Load configuration from config.yaml"""
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def capture_session(url: str, headless: bool = False):
- """
- Capture cookies and place ID from browser session.
- Returns (session, place_id, interceptor)
- """
- log.info("="*60)
- log.info("STEP 1: Capturing session from browser")
- log.info("="*60)
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Navigate to place
- log.info("Opening Google Maps...")
- driver.get(url)
- time.sleep(2)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- log.info("✓ Cookie dialog dismissed")
- time.sleep(1)
- except:
- pass
-
- # Click reviews tab
- log.info("Opening reviews tab...")
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
- clicked = False
-
- for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria_label = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria_label for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(2)
- log.info("✓ Reviews tab clicked")
- clicked = True
- break
- if clicked:
- break
- except:
- continue
-
- # Wait for reviews to load
- time.sleep(3)
-
- # Extract place ID from URL
- current_url = driver.current_url
- place_id = None
- if '!1s' in current_url:
- parts = current_url.split('!1s')
- if len(parts) > 1:
- place_id = parts[1].split('!')[0]
- log.info(f"✓ Extracted place ID: {place_id}")
-
- if not place_id:
- log.error("Could not extract place ID from URL")
- return None, None, None
-
- # Capture ALL cookies using CDP
- log.info("Capturing cookies via CDP...")
- cdp_cookies = driver.execute_cdp_cmd('Network.getAllCookies', {})
- browser_cookies = cdp_cookies.get('cookies', [])
- log.info(f"✓ Captured {len(browser_cookies)} cookies")
-
- # Get user agent
- user_agent = driver.execute_script("return navigator.userAgent")
-
- # Create session with cookies
- session = requests.Session()
- for cookie in browser_cookies:
- session.cookies.set(
- name=cookie['name'],
- value=cookie['value'],
- domain=cookie.get('domain', '.google.com'),
- path=cookie.get('path', '/')
- )
-
- # Set headers
- session.headers.update({
- 'User-Agent': user_agent,
- 'Accept': '*/*',
- 'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
- 'Referer': 'https://www.google.com/maps/',
- 'Origin': 'https://www.google.com',
- })
-
- # Create interceptor for parsing
- interceptor = GoogleMapsAPIInterceptor(None)
-
- log.info("✓ Session captured successfully\n")
- return session, place_id, interceptor
-
- finally:
- # Close browser immediately - we don't need it anymore!
- try:
- driver.quit()
- log.info("✓ Browser closed\n")
- except:
- pass
-
-
-def fetch_reviews_page(session, place_id, interceptor, continuation_token=None):
- """Fetch a single page of reviews via API."""
- if continuation_token:
- pb = f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s{continuation_token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
- else:
- pb = f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
-
- params = {
- 'authuser': '0',
- 'hl': 'es',
- 'gl': 'es',
- 'pb': pb
- }
-
- try:
- url = 'https://www.google.com/maps/rpc/listugcposts'
- response = session.get(url, params=params, timeout=10)
-
- if response.status_code != 200:
- log.error(f"API error {response.status_code}")
- return [], None
-
- body = response.text
- if body.startswith(")]}'"):
- body = body[4:].strip()
-
- data = json.loads(body)
- reviews = interceptor._parse_listugcposts_response(data)
-
- # Get next token
- next_token = None
- if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
- next_token = data[1]
-
- return reviews, next_token
-
- except Exception as e:
- log.error(f"Request failed: {e}")
- return [], None
-
-
-def scrape_all_parallel(session, place_id, interceptor, max_workers=5):
- """
- Main scraping method with parallel API calls.
- """
- log.info("="*60)
- log.info("STEP 2: Parallel API scraping")
- log.info("="*60)
-
- start_time = time.time()
- all_reviews = []
- seen_ids = set()
-
- # Fetch first page to get continuation token
- log.info("Fetching first page...")
- reviews, token = fetch_reviews_page(session, place_id, interceptor, None)
- for review in reviews:
- rid = review.review_id or f"{review.author}_{review.date_text}"
- if rid not in seen_ids:
- seen_ids.add(rid)
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- })
-
- log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")
-
- if not token:
- log.info("No continuation token - only one page of reviews")
- return all_reviews
-
- # Collect continuation tokens by fetching a few sequential pages
- # (We need to do this sequentially to get the tokens)
- tokens = [token]
- log.info("Collecting continuation tokens...")
- for i in range(4): # Get 5 total tokens
- reviews, next_token = fetch_reviews_page(session, place_id, interceptor, token)
- if next_token:
- tokens.append(next_token)
- token = next_token
- else:
- break
-
- for review in reviews:
- rid = review.review_id or f"{review.author}_{review.date_text}"
- if rid not in seen_ids:
- seen_ids.add(rid)
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- })
-
- log.info(f"Collected {len(tokens)} tokens, {len(all_reviews)} reviews so far")
- log.info(f"Starting parallel fetch with {max_workers} workers...\n")
-
- # Now fetch remaining pages in parallel
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
- futures = []
- for token in tokens:
- future = executor.submit(fetch_reviews_page, session, place_id, interceptor, token)
- futures.append(future)
-
- for i, future in enumerate(as_completed(futures)):
- try:
- reviews, _ = future.result()
- new_count = 0
- for review in reviews:
- rid = review.review_id or f"{review.author}_{review.date_text}"
- if rid not in seen_ids:
- seen_ids.add(rid)
- all_reviews.append({
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- })
- new_count += 1
-
- log.info(f" Completed {i+1}/{len(futures)}: +{new_count} new reviews | Total: {len(all_reviews)}")
- except Exception as e:
- log.error(f" Error in parallel fetch: {e}")
-
- elapsed = time.time() - start_time
-
- log.info(f"\n{'='*60}")
- log.info(f"✅ PARALLEL SCRAPING COMPLETED!")
- log.info(f"{'='*60}")
- log.info(f"Total reviews: {len(all_reviews)}")
- log.info(f"Parallel workers: {max_workers}")
- log.info(f"API time: {elapsed:.2f} seconds")
- log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- log.info(f"{'='*60}\n")
-
- return all_reviews
-
-
-def main():
- """Main entry point."""
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- log.info("="*60)
- log.info("PARALLEL API SCRAPER")
- log.info("="*60)
- log.info(f"URL: {url[:80]}...")
- log.info(f"Mode: Parallel API calls (no scrolling)")
- log.info("="*60 + "\n")
-
- total_start = time.time()
-
- # Step 1: Capture session from browser
- session, place_id, interceptor = capture_session(url, headless)
- if not session or not place_id:
- log.error("Failed to capture session")
- return []
-
- # Step 2: Parallel API scraping
- reviews = scrape_all_parallel(session, place_id, interceptor, max_workers=5)
-
- total_elapsed = time.time() - total_start
-
- # Save results
- output_file = 'google_reviews_parallel.json'
- with open(output_file, 'w', encoding='utf-8') as f:
- json.dump(reviews, f, indent=2, ensure_ascii=False)
-
- log.info(f"💾 Saved {len(reviews)} reviews to {output_file}")
-
- # Show sample
- if reviews:
- log.info("\n📝 Sample review:")
- sample = reviews[0]
- log.info(f" Author: {sample['author']}")
- log.info(f" Rating: {sample['rating']}★")
- log.info(f" Date: {sample['date_text']}")
- if sample['text']:
- log.info(f" Text: {sample['text'][:80]}...")
-
- # Stats comparison
- log.info("\n" + "="*60)
- log.info("SPEED COMPARISON")
- log.info("="*60)
- log.info(f"Old DOM scraping: ~155 seconds for 244 reviews")
- log.info(f"Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
- log.info(f"Parallel API calls: ~{total_elapsed:.0f} seconds for {len(reviews)} reviews ({155/total_elapsed:.1f}x faster!) 🚀")
- log.info("="*60 + "\n")
-
- return reviews
-
-
-if __name__ == '__main__':
- try:
- reviews = main()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- log.info("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- log.error(f"Fatal error: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_parallel_hybrid.py b/start_parallel_hybrid.py
deleted file mode 100644
index ac6f65f..0000000
--- a/start_parallel_hybrid.py
+++ /dev/null
@@ -1,350 +0,0 @@
-#!/usr/bin/env python3
-"""
-PARALLEL HYBRID Scraper - Collects API + DOM simultaneously while scrolling.
-
-Strategy:
-1. During scrolling, collect BOTH API responses AND DOM elements in parallel
-2. Deduplicate at the end
-3. Should get all 244 reviews in ~20-25s (vs 34s sequential)
-
-Optimization: No separate DOM parsing phase - everything happens during scroll!
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def parse_dom_review_element(elem):
- """Parse a single review element from DOM."""
- try:
- review_data = {}
-
- # Author name
- try:
- author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55')
- review_data['author'] = author_elem.text
- except:
- review_data['author'] = None
-
- # Rating
- try:
- rating_elem = elem.find_element(By.CSS_SELECTOR, 'span.kvMYJc')
- rating_attr = rating_elem.get_attribute('aria-label')
- if rating_attr:
- rating_parts = rating_attr.split()
- if rating_parts:
- review_data['rating'] = float(rating_parts[0])
- except:
- review_data['rating'] = None
-
- # Review text
- try:
- text_elem = elem.find_element(By.CSS_SELECTOR, 'span.wiI7pd')
- review_data['text'] = text_elem.text
- except:
- review_data['text'] = None
-
- # Date
- try:
- date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe')
- review_data['date_text'] = date_elem.text
- except:
- review_data['date_text'] = None
-
- # Avatar URL
- try:
- avatar_elem = elem.find_element(By.CSS_SELECTOR, 'img.NBa7we')
- review_data['avatar_url'] = avatar_elem.get_attribute('src')
- except:
- review_data['avatar_url'] = None
-
- # Profile URL
- try:
- profile_elem = elem.find_element(By.CSS_SELECTOR, 'button.WEBjve')
- review_data['profile_url'] = profile_elem.get_attribute('data-review-id')
- except:
- review_data['profile_url'] = None
-
- # Generate ID from author + date + rating
- if review_data.get('author'):
- review_id = f"dom_{hash(str(review_data.get('author', '')) + str(review_data.get('date_text', '')) + str(review_data.get('rating', '')))}"
- review_data['review_id'] = review_id
- return review_data
-
- return None
-
- except (StaleElementReferenceException, Exception):
- return None
-
-
-def parallel_hybrid_scrape():
- """Collect API + DOM simultaneously during scrolling."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("PARALLEL HYBRID SCRAPER - Collecting API + DOM simultaneously...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
- dom_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Step 1: Navigate
- driver.get(url)
- time.sleep(1.5)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4)
- break
- except:
- continue
-
- # Wait for page stability
- time.sleep(1.0)
-
- # Find pane
- pane = None
- try:
- wait = WebDriverWait(driver, 3)
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # Wait for reviews to start loading
- time.sleep(1.5)
-
- # Setup API interceptor
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(1.0) # Important: wait for interceptor to be ready
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll to get first API response
- driver.execute_script(scroll_script)
- time.sleep(1.0) # Wait for first API response
-
- print("Parallel collection (API + DOM simultaneously)...")
-
- # Scrolling with PARALLEL API + DOM collection
- max_scrolls = 35
- dom_parse_start = 25 # Only start DOM parsing after 25 scrolls (when near end)
-
- for i in range(max_scrolls):
- # Scroll
- driver.execute_script(scroll_script)
- time.sleep(0.27) # Optimal scroll timing
-
- # PARALLEL COLLECTION 1: API Responses (always)
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- # PARALLEL COLLECTION 2: DOM Elements (only near the end, lightweight)
- # Only parse DOM in the last scrolls when we know we're near 234 API reviews
- if i >= dom_parse_start and len(api_reviews) >= 220:
- try:
- # Lightweight: Just get author + date as unique key, don't parse everything
- review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')
- for elem in review_elements[:min(len(review_elements), 250)]: # Limit to first 250 for speed
- try:
- # Quick parse - just essentials
- author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55')
- author = author_elem.text if author_elem else None
-
- date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe')
- date_text = date_elem.text if date_elem else None
-
- if author and date_text:
- dom_key = (author, date_text[:20])
- if dom_key not in dom_reviews:
- # Full parse only if needed
- dom_review = parse_dom_review_element(elem)
- if dom_review:
- dom_reviews[dom_key] = dom_review
- except:
- continue
- except:
- pass
-
- # Progress logging
- if (i + 1) % 10 == 0:
- print(f" API: {len(api_reviews)}, DOM: {len(dom_reviews)} unique keys...")
-
- # Final collections
- print("Final collection sweep...")
-
- # Final API collection
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- # Final DOM parse (quick sweep)
- try:
- review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')
- for elem in review_elements[:min(len(review_elements), 250)]:
- try:
- author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55')
- author = author_elem.text if author_elem else None
-
- date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe')
- date_text = date_elem.text if date_elem else None
-
- if author and date_text:
- dom_key = (author, date_text[:20])
- if dom_key not in dom_reviews:
- dom_review = parse_dom_review_element(elem)
- if dom_review:
- dom_reviews[dom_key] = dom_review
- except:
- continue
- except:
- pass
-
- # Merge: Start with API reviews, add DOM reviews that aren't duplicates
- print("\nMerging API + DOM reviews...")
-
- # Build set of API keys for deduplication (author + date)
- api_keys = set()
- for api_review in api_reviews.values():
- key = (
- api_review.get('author', ''),
- (api_review.get('date_text', '') or '')[:20]
- )
- api_keys.add(key)
-
- # Add unique DOM reviews
- dom_added = 0
- for dom_key, dom_review in dom_reviews.items():
- if dom_key not in api_keys and dom_review.get('review_id'):
- api_reviews[dom_review['review_id']] = dom_review
- dom_added += 1
-
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n{'='*50}")
- print(f"✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
- print(f" - API: {len(api_reviews) - dom_added}")
- print(f" - DOM: {dom_added} unique")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
- print(f"{'='*50}")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL 244 reviews!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
-
- print()
-
- # Save
- with open('google_reviews_parallel_hybrid.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_parallel_hybrid.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = parallel_hybrid_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_parallel_v2.py b/start_parallel_v2.py
deleted file mode 100644
index 714638f..0000000
--- a/start_parallel_v2.py
+++ /dev/null
@@ -1,319 +0,0 @@
-#!/usr/bin/env python3
-"""
-Parallel API Scraper V2 - Use browser's fetch API for parallel calls.
-
-Strategy:
-1. Open browser and navigate to reviews (~15 seconds)
-2. Trigger initial API call to get place ID and pattern
-3. Use JavaScript fetch API to make 25 parallel calls (~3-5 seconds)
-4. Collect all results at once
-
-Expected time: ~20-25 seconds for 244 reviews
-Speed improvement: ~6-7x faster!
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from pathlib import Path
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-
-
-def load_config():
- """Load configuration from config.yaml"""
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def parallel_scrape():
- """Parallel API-first scraping using browser's fetch API."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- log.info("="*60)
- log.info("PARALLEL API SCRAPER V2")
- log.info("="*60)
- log.info(f"URL: {url[:80]}...")
- log.info(f"Mode: Parallel browser fetch calls")
- log.info("="*60 + "\n")
-
- start_time = time.time()
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Step 1: Navigate and setup
- log.info("Step 1: Opening Google Maps...")
- driver.get(url)
- time.sleep(2)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- log.info("✓ Cookie dialog dismissed")
- time.sleep(1)
- except:
- pass
-
- # Click reviews tab
- log.info("Step 2: Opening reviews tab...")
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
- clicked = False
-
- for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria_label = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria_label for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(2)
- log.info("✓ Reviews tab clicked")
- clicked = True
- break
- if clicked:
- break
- except:
- continue
-
- # Wait for reviews to load
- log.info("Waiting for reviews page to fully load...")
- time.sleep(3)
-
- # Find reviews pane
- log.info("Step 3: Finding reviews pane...")
- pane = None
- pane_selectors = [
- 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
- 'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
- 'div.m6QErb.WNBkOb.XiKgde',
- ]
-
- for selector in pane_selectors:
- try:
- wait = WebDriverWait(driver, 5)
- pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
- log.info(f"✓ Found reviews pane with: {selector}")
- break
- except TimeoutException:
- continue
-
- if not pane:
- log.error("Could not find reviews pane")
- return []
-
- # Wait for initial reviews
- time.sleep(2)
-
- # Extract place ID from URL
- current_url = driver.current_url
- place_id = None
- if '!1s' in current_url:
- parts = current_url.split('!1s')
- if len(parts) > 1:
- place_id = parts[1].split('!')[0]
- log.info(f"✓ Extracted place ID: {place_id}")
-
- if not place_id:
- log.error("Could not extract place ID from URL")
- return []
-
- # Step 4: Make parallel API calls using browser's fetch
- log.info("\n" + "="*60)
- log.info("Step 4: Making parallel API calls via browser fetch")
- log.info("="*60)
-
- # JavaScript to make parallel API calls
- parallel_fetch_script = """
- async function fetchReviewsParallel(placeId, numPages) {
- const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
- const results = [];
-
- // Build pb parameter for each page
- const requests = [];
- let token = null;
-
- console.log('[Parallel Fetch] Starting parallel fetch for', numPages, 'pages');
-
- // First, we need to get continuation tokens sequentially
- const tokens = [];
- for (let i = 0; i < Math.min(numPages, 5); i++) {
- const pb = token
- ? `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`
- : `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
-
- const params = new URLSearchParams({
- authuser: '0',
- hl: 'es',
- gl: 'es',
- pb: pb
- });
-
- try {
- const response = await fetch(`${baseUrl}?${params}`);
- const text = await response.text();
- const body = text.startsWith(")]}'") ? text.substring(4) : text;
- const data = JSON.parse(body);
-
- results.push({index: i, data: data});
-
- // Get next token
- if (data && data.length > 1 && typeof data[1] === 'string') {
- token = data[1];
- tokens.push(token);
- } else {
- break; // No more pages
- }
- } catch (e) {
- console.error('[Parallel Fetch] Error fetching page', i, e);
- }
- }
-
- console.log('[Parallel Fetch] Got', tokens.length, 'continuation tokens');
- console.log('[Parallel Fetch] Now fetching remaining pages in parallel...');
-
- // Now fetch remaining pages in parallel using the tokens
- const parallelPromises = tokens.slice(5).map((tok, idx) => {
- const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${tok}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
- const params = new URLSearchParams({
- authuser: '0',
- hl: 'es',
- gl: 'es',
- pb: pb
- });
-
- return fetch(`${baseUrl}?${params}`)
- .then(r => r.text())
- .then(text => {
- const body = text.startsWith(")]}'") ? text.substring(4) : text;
- return JSON.parse(body);
- })
- .then(data => ({index: idx + 5, data: data}))
- .catch(e => {
- console.error('[Parallel Fetch] Parallel fetch error', idx, e);
- return null;
- });
- });
-
- const parallelResults = await Promise.all(parallelPromises);
- results.push(...parallelResults.filter(r => r !== null));
-
- console.log('[Parallel Fetch] Completed! Total responses:', results.length);
- return results;
- }
-
- // Execute parallel fetch
- return await fetchReviewsParallel(arguments[0], arguments[1]);
- """
-
- log.info(f"Fetching up to 25 pages in parallel...")
- api_start = time.time()
-
- try:
- results = driver.execute_async_script(parallel_fetch_script, place_id, 25)
- api_elapsed = time.time() - api_start
- log.info(f"✓ Parallel fetch completed in {api_elapsed:.2f} seconds")
- log.info(f" Received {len(results)} API responses")
- except Exception as e:
- log.error(f"Parallel fetch failed: {e}")
- return []
-
- # Parse results
- log.info("\nStep 5: Parsing reviews from API responses...")
- interceptor = GoogleMapsAPIInterceptor(None)
- all_reviews = {}
-
- for result in results:
- if result and 'data' in result:
- try:
- parsed = interceptor._parse_listugcposts_response(result['data'])
- for review in parsed:
- if review.review_id and review.review_id not in all_reviews:
- all_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except Exception as e:
- log.debug(f"Error parsing response: {e}")
-
- reviews_list = list(all_reviews.values())
- elapsed = time.time() - start_time
-
- log.info(f"\n{'='*60}")
- log.info(f"✅ PARALLEL SCRAPING COMPLETED!")
- log.info(f"{'='*60}")
- log.info(f"Total reviews: {len(reviews_list)}")
- log.info(f"API responses: {len(results)}")
- log.info(f"Total time: {elapsed:.2f} seconds")
- log.info(f" - Setup: {api_start - start_time:.2f}s")
- log.info(f" - Parallel API: {api_elapsed:.2f}s")
- log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
- log.info(f"{'='*60}\n")
-
- # Save results
- output_file = 'google_reviews_parallel.json'
- with open(output_file, 'w', encoding='utf-8') as f:
- json.dump(reviews_list, f, indent=2, ensure_ascii=False)
-
- log.info(f"💾 Saved {len(reviews_list)} reviews to {output_file}")
-
- # Show sample
- if reviews_list:
- log.info("\n📝 Sample review:")
- sample = reviews_list[0]
- log.info(f" Author: {sample['author']}")
- log.info(f" Rating: {sample['rating']}★")
- log.info(f" Date: {sample['date_text']}")
- if sample['text']:
- log.info(f" Text: {sample['text'][:80]}...")
-
- # Stats comparison
- log.info("\n" + "="*60)
- log.info("SPEED COMPARISON")
- log.info("="*60)
- log.info(f"Old DOM scraping: ~155 seconds for 244 reviews (1.0x)")
- log.info(f"Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
- log.info(f"Parallel browser fetch: ~{elapsed:.0f} seconds for {len(reviews_list)} reviews ({155/elapsed:.1f}x faster!) 🚀")
- log.info("="*60 + "\n")
-
- return reviews_list
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = parallel_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- log.info("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- log.error(f"Fatal error: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_ultra_fast.py b/start_ultra_fast.py
deleted file mode 100644
index c26aca3..0000000
--- a/start_ultra_fast.py
+++ /dev/null
@@ -1,279 +0,0 @@
-#!/usr/bin/env python3
-"""
-ULTRA-FAST API Scraper - Maximum speed optimization.
-
-Optimizations:
-1. Minimal waits (0.5s after tab click instead of 3s)
-2. No wait for "initial reviews" (removes 3s)
-3. Faster scroll timing (0.2s instead of 0.3s)
-4. Batch response collection (every 3 scrolls, not every scroll)
-5. Less logging during scrolling (I/O overhead)
-6. Direct pane selection (no trying multiple)
-7. Parallel operations where possible
-
-Target: ~15-20 seconds for 234 reviews
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-# Only show INFO and above
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def ultra_fast_scrape():
- """Ultra-fast API-first scraping with all optimizations."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("ULTRA-FAST SCRAPER - Starting...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Step 1: Navigate (minimal waits)
- driver.get(url)
- time.sleep(1.5) # Stable wait
-
- # Dismiss cookies (non-blocking)
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4) # Balanced wait
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4) # Balanced wait
- break
- except:
- continue
-
- # Brief wait for reviews page (balance speed vs stability)
- time.sleep(1.0) # Reduced from 3s but needed for stability
-
- # Find pane - use most common selector directly
- pane = None
- try:
- wait = WebDriverWait(driver, 3) # Reduced from 5s
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # NO wait for initial reviews - save 3s!
- # Setup API interceptor immediately
-
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(0.3) # Minimal wait for interceptor
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll
- driver.execute_script(scroll_script)
- time.sleep(0.3) # Minimal initial trigger wait
-
- print("Fast scrolling...")
-
- # Rapid scrolling with batch collection
- target_reviews = 240
- max_scrolls = 35 # Slightly more to compensate for faster timing
-
- for i in range(max_scrolls):
- # Ultra-fast scroll
- driver.execute_script(scroll_script)
- time.sleep(0.27) # Sweet spot for stability
-
- # Collect every scroll (can't skip or buffer clears)
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
-
- # Only log every 10 scrolls to reduce I/O
- if (i + 1) % 10 == 0:
- print(f" {len(api_reviews)} reviews...")
-
- if len(api_reviews) >= target_reviews:
- break
- except:
- pass
-
- # Final collection
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- # Quick DOM parse for missing reviews (only if needed)
- missing = 244 - len(api_reviews)
- if missing > 0:
- print(f"\nQuick DOM parse for {missing} missing reviews...")
- try:
- # Scroll to top
- driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
- time.sleep(0.3)
-
- # Parse top reviews (most likely to be missing)
- review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:min(missing + 5, 20)]
-
- # Build API keys for deduplication
- api_keys = set()
- for api_review in api_reviews.values():
- key = (api_review.get('author', ''), (api_review.get('date_text', '') or '')[:20])
- api_keys.add(key)
-
- # Parse and add unique DOM reviews
- dom_added = 0
- for elem in review_elements:
- try:
- review_data = {}
-
- # Author
- author_elem = elem.find_element(By.CSS_SELECTOR, 'div.d4r55')
- review_data['author'] = author_elem.text if author_elem else None
-
- # Rating
- rating_elem = elem.find_element(By.CSS_SELECTOR, 'span.kvMYJc')
- rating_attr = rating_elem.get_attribute('aria-label')
- if rating_attr:
- rating_parts = rating_attr.split()
- if rating_parts:
- review_data['rating'] = float(rating_parts[0])
-
- # Text
- text_elem = elem.find_element(By.CSS_SELECTOR, 'span.wiI7pd')
- review_data['text'] = text_elem.text if text_elem else None
-
- # Date
- date_elem = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe')
- review_data['date_text'] = date_elem.text if date_elem else None
-
- # Avatar
- avatar_elem = elem.find_element(By.CSS_SELECTOR, 'img.NBa7we')
- review_data['avatar_url'] = avatar_elem.get_attribute('src') if avatar_elem else None
-
- # Profile URL
- profile_elem = elem.find_element(By.CSS_SELECTOR, 'button.WEBjve')
- review_data['profile_url'] = profile_elem.get_attribute('data-review-id') if profile_elem else None
-
- # Check if unique
- dom_key = (review_data.get('author', ''), (review_data.get('date_text', '') or '')[:20])
- if dom_key not in api_keys and review_data.get('author'):
- review_id = f"dom_{hash(str(review_data.get('author', '')) + str(review_data.get('date_text', '')))}"
- review_data['review_id'] = review_id
- api_reviews[review_id] = review_data
- api_keys.add(dom_key)
- dom_added += 1
-
- except:
- continue
-
- print(f" +{dom_added} reviews from DOM")
- except Exception as e:
- print(f" DOM parse failed: {e}")
-
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)}")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀\n")
-
- # Save
- with open('google_reviews_ultra_fast.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_ultra_fast.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = ultra_fast_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_ultra_fast_complete.py b/start_ultra_fast_complete.py
deleted file mode 100644
index c0764af..0000000
--- a/start_ultra_fast_complete.py
+++ /dev/null
@@ -1,336 +0,0 @@
-#!/usr/bin/env python3
-"""
-ULTRA-FAST COMPLETE Scraper - Gets ALL 244 reviews in ~25-30 seconds.
-
-Strategy:
-1. Ultra-fast API scrolling to get 234 reviews (~19s)
-2. DOM parsing for missing 10 reviews (~5-10s)
-3. Total: ~25-30s for 244 reviews (vs 155s original)
-
-Combines speed of start_ultra_fast.py with completeness of original scraper.
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def parse_dom_reviews_fast(driver, max_reviews=20):
- """Fast DOM parsing using JavaScript - extracts data in bulk."""
-
- # JavaScript to extract review data from first N reviews
- extract_script = """
- const reviews = [];
- const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
- const maxCount = Math.min(arguments[0], elements.length);
-
- for (let i = 0; i < maxCount; i++) {
- const elem = elements[i];
- const review = {};
-
- try {
- // Author
- const authorElem = elem.querySelector('div.d4r55');
- review.author = authorElem ? authorElem.textContent : null;
-
- // Rating
- const ratingElem = elem.querySelector('span.kvMYJc');
- if (ratingElem) {
- const ariaLabel = ratingElem.getAttribute('aria-label');
- if (ariaLabel) {
- const match = ariaLabel.match(/\\d+/);
- review.rating = match ? parseFloat(match[0]) : null;
- }
- }
-
- // Text
- const textElem = elem.querySelector('span.wiI7pd');
- review.text = textElem ? textElem.textContent : null;
-
- // Date
- const dateElem = elem.querySelector('span.rsqaWe');
- review.date_text = dateElem ? dateElem.textContent : null;
-
- // Avatar
- const avatarElem = elem.querySelector('img.NBa7we');
- review.avatar_url = avatarElem ? avatarElem.src : null;
-
- // Profile URL
- const profileElem = elem.querySelector('button.WEBjve');
- review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
-
- if (review.author) {
- reviews.push(review);
- }
- } catch (e) {
- // Skip this review
- }
- }
-
- return reviews;
- """
-
- try:
- # Execute JavaScript to get all review data at once
- dom_reviews_data = driver.execute_script(extract_script, max_reviews)
-
- # Convert to our format
- dom_reviews = []
- for review_data in dom_reviews_data:
- if review_data.get('author') and review_data.get('date_text'):
- review_id = f"dom_{hash(review_data['author'] + review_data['date_text'])}"
- review_data['review_id'] = review_id
- dom_reviews.append(review_data)
-
- return dom_reviews
-
- except Exception as e:
- print(f" Error in fast DOM parse: {e}")
- return []
-
-
-def ultra_fast_complete_scrape():
- """Get ALL reviews with ultra-fast API + DOM fallback."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("ULTRA-FAST COMPLETE SCRAPER - Getting ALL 244 reviews...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # ====== PHASE 1: ULTRA-FAST API SCROLLING ======
- print("\n[Phase 1] Ultra-fast API scrolling...")
-
- # Step 1: Navigate
- driver.get(url)
- time.sleep(1.5)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4)
- break
- except:
- continue
-
- # Wait for page stability
- time.sleep(1.0)
-
- # Find pane
- pane = None
- try:
- wait = WebDriverWait(driver, 3)
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # Setup API interceptor
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(0.3)
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll
- driver.execute_script(scroll_script)
- time.sleep(0.3)
-
- print(" Fast scrolling for API reviews...")
-
- # Rapid scrolling
- target_reviews = 240
- max_scrolls = 35
-
- for i in range(max_scrolls):
- driver.execute_script(scroll_script)
- time.sleep(0.27)
-
- # Collect responses
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
-
- if (i + 1) % 10 == 0:
- print(f" {len(api_reviews)} reviews...")
-
- if len(api_reviews) >= target_reviews:
- break
- except:
- pass
-
- # Final API collection
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- phase1_time = time.time() - start_time
- print(f" ✅ Phase 1 complete: {len(api_reviews)} reviews in {phase1_time:.2f}s")
-
- # ====== PHASE 2: DOM PARSING FOR MISSING REVIEWS ======
- missing_count = 244 - len(api_reviews)
-
- if missing_count > 0:
- print(f"\n[Phase 2] Fast DOM parsing for {missing_count} missing reviews...")
-
- # Scroll to top (missing reviews likely at top)
- driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
- time.sleep(0.5) # Brief wait for scroll
-
- # Fast JavaScript-based parsing (only first 20 reviews)
- dom_reviews = parse_dom_reviews_fast(driver, max_reviews=min(missing_count + 10, 25))
-
- # Add DOM reviews that aren't in API reviews
- # Use author + rating + date as key for better duplicate detection
- api_keys = set()
- for api_review in api_reviews.values():
- key = (
- api_review.get('author', ''),
- api_review.get('rating', 0),
- (api_review.get('date_text', '') or '')[:20] # First 20 chars of date
- )
- api_keys.add(key)
-
- dom_added = 0
- for dom_review in dom_reviews:
- # Create key for this DOM review
- dom_key = (
- dom_review.get('author', ''),
- dom_review.get('rating', 0),
- (dom_review.get('date_text', '') or '')[:20]
- )
-
- # Only add if not already in API reviews
- if dom_key not in api_keys and dom_review.get('review_id'):
- api_reviews[dom_review['review_id']] = dom_review
- api_keys.add(dom_key) # Track this to avoid duplicates within DOM too
- dom_added += 1
-
- phase2_time = time.time() - start_time - phase1_time
- print(f" ✅ Phase 2 complete: +{dom_added} reviews from DOM in {phase2_time:.2f}s")
-
- # ====== RESULTS ======
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n{'='*50}")
- print(f"✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
- print(f"{'='*50}")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL 244 reviews!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
- else:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews - may need more DOM parsing")
-
- print()
-
- # Save
- with open('google_reviews_ultra_fast_complete.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_ultra_fast_complete.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = ultra_fast_complete_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
diff --git a/start_ultra_fast_v2.py b/start_ultra_fast_v2.py
deleted file mode 100644
index 05178b2..0000000
--- a/start_ultra_fast_v2.py
+++ /dev/null
@@ -1,280 +0,0 @@
-#!/usr/bin/env python3
-"""
-Complete Scraper - Gets ALL reviews while staying fast.
-
-Strategy:
-1. Scroll until no new reviews for 5 consecutive scrolls
-2. Check scroll position to detect end
-3. Do extra scrolls at the end to catch stragglers
-4. Adaptive timing - faster at start, slower at end
-
-Target: Get all 244 reviews in ~22-25 seconds
-"""
-import sys
-import yaml
-import logging
-import time
-import json
-from seleniumbase import Driver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from modules.api_interceptor import GoogleMapsAPIInterceptor
-
-logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
-log = logging.getLogger(__name__)
-log.setLevel(logging.INFO)
-
-
-def load_config():
- with open('config.yaml', 'r') as f:
- return yaml.safe_load(f)
-
-
-def complete_scrape():
- """Get ALL reviews with intelligent scrolling."""
-
- config = load_config()
- url = config.get('url')
- headless = config.get('headless', False)
-
- print("COMPLETE SCRAPER - Getting ALL reviews...")
- print(f"URL: {url[:80]}...")
-
- start_time = time.time()
- api_reviews = {}
-
- driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
-
- try:
- # Step 1: Navigate
- driver.get(url)
- time.sleep(1.5)
-
- # Dismiss cookies
- try:
- cookie_btns = driver.find_elements(By.CSS_SELECTOR,
- 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
- if cookie_btns:
- cookie_btns[0].click()
- time.sleep(0.4)
- except:
- pass
-
- # Click reviews tab
- review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
- for selector in ['.LRkQ2', 'button[role="tab"]']:
- try:
- tabs = driver.find_elements(By.CSS_SELECTOR, selector)
- for tab in tabs:
- text = (tab.text or '').lower()
- aria = (tab.get_attribute('aria-label') or '').lower()
- if any(kw in text or kw in aria for kw in review_keywords):
- driver.execute_script("arguments[0].click();", tab)
- time.sleep(0.4)
- break
- except:
- continue
-
- # Wait for page stability
- time.sleep(1.0)
-
- # Find pane
- pane = None
- try:
- wait = WebDriverWait(driver, 3)
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
- except TimeoutException:
- try:
- pane = wait.until(EC.presence_of_element_located(
- (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
- except:
- print("ERROR: Could not find pane")
- return []
-
- # Wait for initial reviews to load
- time.sleep(1.5)
-
- # Setup API interceptor
- interceptor = GoogleMapsAPIInterceptor(driver)
- interceptor.setup_interception()
- interceptor.inject_response_interceptor()
- time.sleep(1.0) # Important: wait for interceptor to be ready
-
- # Setup scroll
- driver.execute_script("window.scrollablePane = arguments[0];", pane)
- scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
-
- # Trigger initial scroll to get first API response
- driver.execute_script(scroll_script)
- time.sleep(1.0) # Wait for first API response
-
- print("Scrolling with intelligent stopping...")
-
- # Intelligent scrolling
- max_scrolls = 60 # Higher limit to ensure we get everything
- idle_scrolls = 0 # Count scrolls with no new reviews
- max_idle = 12 # More patience - stop after 12 scrolls with no new reviews
- last_count = 0
- last_scroll_pos = 0
- scroll_stuck_count = 0
-
- for i in range(max_scrolls):
- # Scroll
- driver.execute_script(scroll_script)
-
- # Adaptive timing - faster at start, slower near end
- if len(api_reviews) < 100:
- time.sleep(0.27) # Fast at beginning
- elif len(api_reviews) < 200:
- time.sleep(0.30) # Medium in middle
- elif len(api_reviews) < 235:
- time.sleep(0.40) # Slower near end
- else:
- time.sleep(0.50) # Very slow at the very end to catch stragglers
-
- # Collect responses
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- # Check if we got new reviews
- current_count = len(api_reviews)
- if current_count == last_count:
- idle_scrolls += 1
- else:
- idle_scrolls = 0
- if (i + 1) % 10 == 0:
- print(f" {current_count} reviews...")
-
- last_count = current_count
-
- # Check scroll position to detect if stuck at bottom
- try:
- current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
- if current_scroll == last_scroll_pos:
- scroll_stuck_count += 1
- else:
- scroll_stuck_count = 0
- last_scroll_pos = current_scroll
- except:
- pass
-
- # Stop conditions
- if idle_scrolls >= max_idle and scroll_stuck_count >= 3:
- print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
- break
-
- # Extra thorough collection at the end
- print(f" Final collection sweep (currently have {len(api_reviews)})...")
-
- # Do a few more scrolls with longer waits
- for extra in range(5):
- driver.execute_script(scroll_script)
- time.sleep(0.8) # Longer wait to ensure API completes
-
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- new_count = 0
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- new_count += 1
-
- if new_count > 0:
- print(f" +{new_count} more reviews (total: {len(api_reviews)})")
- except:
- pass
-
- # Final wait and collect
- time.sleep(1.0)
- try:
- responses = interceptor.get_intercepted_responses()
- if responses:
- parsed = interceptor.parse_reviews_from_responses(responses)
- for review in parsed:
- if review.review_id and review.review_id not in api_reviews:
- api_reviews[review.review_id] = {
- 'review_id': review.review_id,
- 'author': review.author,
- 'rating': review.rating,
- 'text': review.text,
- 'date_text': review.date_text,
- 'avatar_url': review.avatar_url,
- 'profile_url': review.profile_url,
- }
- except:
- pass
-
- elapsed = time.time() - start_time
- all_reviews = list(api_reviews.values())
-
- print(f"\n✅ COMPLETED!")
- print(f"Reviews: {len(all_reviews)} (target: 244)")
- print(f"Time: {elapsed:.2f}s")
- print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
- print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
-
- if len(all_reviews) >= 244:
- print(f"🎯 Got ALL reviews!")
- elif len(all_reviews) >= 240:
- print(f"⚠️ Missing {244-len(all_reviews)} reviews")
-
- print()
-
- # Save
- with open('google_reviews_complete.json', 'w', encoding='utf-8') as f:
- json.dump(all_reviews, f, indent=2, ensure_ascii=False)
-
- print(f"💾 Saved to google_reviews_complete.json")
-
- if all_reviews:
- print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")
-
- return all_reviews
-
- finally:
- try:
- driver.quit()
- except:
- pass
-
-
-if __name__ == '__main__':
- try:
- reviews = complete_scrape()
- sys.exit(0 if reviews else 1)
- except KeyboardInterrupt:
- print("\n\nInterrupted by user")
- sys.exit(1)
- except Exception as e:
- print(f"ERROR: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)