Remove old scraper files - consolidate to scraper_clean
Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
383
api_server.py
383
api_server.py
@@ -1,383 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
FastAPI server for Google Reviews Scraper.
|
||||
Provides REST API endpoints to trigger and manage scraping jobs.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel, HttpUrl, Field
|
||||
|
||||
from modules.job_manager import JobManager, JobStatus, ScrapingJob
|
||||
from modules.chrome_pool import start_worker_pools, stop_worker_pools, get_pool_stats, get_validation_worker, release_validation_worker
|
||||
from modules.fast_scraper import check_reviews_available, get_business_card_info
|
||||
|
||||
# Logging setup: INFO-level records carrying timestamp, logger name and level
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
log = logging.getLogger("api_server")

# Process-wide JobManager; stays None until the lifespan handler creates it
job_manager: Optional[JobManager] = None
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown.

    Startup starts the Chrome worker pools, creates the global ``JobManager``
    and launches the periodic cleanup task. Shutdown cancels that task, shuts
    the ``JobManager`` down and stops the worker pools; it runs even when the
    application exits with an error.
    """
    global job_manager

    # Startup
    log.info("Starting Google Reviews Scraper API Server")

    # Start Chrome worker pools
    log.info("Initializing Chrome worker pools...")
    start_worker_pools(
        validation_size=1,  # 1 pre-warmed worker for validation
        scraping_size=2,    # 2 pre-warmed workers for scraping
        headless=True
    )

    job_manager = JobManager(max_concurrent_jobs=3)

    # Keep a strong reference to the cleanup task: the event loop only holds
    # weak references, so an unreferenced task may be garbage-collected
    # mid-execution, and without a handle it cannot be cancelled on shutdown.
    cleanup_task = asyncio.create_task(cleanup_jobs_periodically())

    try:
        yield
    finally:
        # Shutdown
        log.info("Shutting down Google Reviews Scraper API Server")

        cleanup_task.cancel()
        try:
            await cleanup_task
        except asyncio.CancelledError:
            pass
        except Exception:
            # The task had already died on its own; record why.
            log.exception("Periodic cleanup task had failed before shutdown")

        if job_manager:
            job_manager.shutdown()

        # Stop Chrome worker pools
        log.info("Stopping Chrome worker pools...")
        stop_worker_pools()
|
||||
|
||||
|
||||
# Application object; startup and shutdown work is delegated to `lifespan`
app = FastAPI(
    lifespan=lifespan,
    title="Google Reviews Scraper API",
    version="1.0.0",
    description="REST API for triggering and managing Google Maps review scraping jobs",
)

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS docs - confirm and restrict before exposing.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],  # Configure appropriately for production
    allow_headers=["*"],
    allow_methods=["*"],
)
|
||||
|
||||
|
||||
# Pydantic models for API
|
||||
class ScrapeRequest(BaseModel):
    """Request model for starting a scrape job"""

    # The only required field
    url: HttpUrl = Field(..., description="Google Maps URL to scrape")

    # Every option below defaults to None, i.e. "not supplied by the caller"
    headless: Optional[bool] = Field(
        default=None,
        description="Run Chrome in headless mode (default: True)",
    )
    max_scrolls: Optional[int] = Field(
        default=None,
        description="Maximum scrolls (default: unlimited - stops via idle detection)",
    )
    sort_by: Optional[str] = Field(
        default=None,
        description="Sort order: newest, highest, lowest, relevance",
    )
    stop_on_match: Optional[bool] = Field(
        default=None,
        description="Stop when first already-seen review is encountered",
    )
    overwrite_existing: Optional[bool] = Field(
        default=None,
        description="Overwrite existing reviews instead of appending",
    )
    download_images: Optional[bool] = Field(
        default=None,
        description="Download images from reviews",
    )
    use_s3: Optional[bool] = Field(
        default=None,
        description="Upload images to S3",
    )
    custom_params: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Custom parameters to add to each document",
    )
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
    """Response model for job information"""

    # Identity and lifecycle
    job_id: str
    status: JobStatus
    url: str
    created_at: str
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    # Last update time for progress tracking
    updated_at: Optional[str] = None
    error_message: Optional[str] = None

    # Results and progress
    reviews_count: Optional[int] = None
    # Total reviews available for this place
    total_reviews: Optional[int] = None
    images_count: Optional[int] = None
    progress: Optional[Dict[str, Any]] = None
    # Time taken to scrape in seconds
    scrape_time: Optional[float] = None
|
||||
|
||||
|
||||
class JobStatsResponse(BaseModel):
    """Response model for job statistics"""

    # Populated from job_manager.get_stats() by the /stats endpoint
    total_jobs: int
    by_status: Dict[str, int]  # presumably status name -> job count; confirm against JobManager
    running_jobs: int
    max_concurrent_jobs: int
|
||||
|
||||
|
||||
class ReviewsResponse(BaseModel):
    """Response model for reviews data"""

    job_id: str
    # One free-form dict per review
    reviews: List[Dict[str, Any]]
    # Set to len(reviews) by the endpoint that builds this response
    count: int
|
||||
|
||||
|
||||
# Background task for periodic cleanup
|
||||
async def cleanup_jobs_periodically(interval_seconds: float = 3600, max_age_hours: int = 24):
    """Periodically clean up old jobs.

    Loops until cancelled: sleeps ``interval_seconds`` (default one hour) and
    then asks the global job manager to drop jobs older than ``max_age_hours``
    (default 24). A failing cleanup pass is logged and retried on the next
    cycle instead of silently ending the background task.

    Args:
        interval_seconds: Pause between cleanup passes, in seconds.
        max_age_hours: Age threshold forwarded to ``cleanup_old_jobs``.
    """
    while True:
        await asyncio.sleep(interval_seconds)

        if not job_manager:
            # Nothing to clean before the manager has been created.
            continue

        try:
            job_manager.cleanup_old_jobs(max_age_hours=max_age_hours)
        except Exception:
            # Keep the loop alive; CancelledError is not an Exception and
            # still propagates so shutdown can stop this task.
            log.exception("Periodic job cleanup failed; retrying next cycle")
|
||||
|
||||
|
||||
# API Endpoints
|
||||
|
||||
@app.get("/", summary="API Health Check")
async def root():
    """Health check endpoint"""
    # Static payload; it does not probe the job manager or the Chrome pools
    payload = {}
    payload["message"] = "Google Reviews Scraper API is running"
    payload["status"] = "healthy"
    payload["version"] = "1.0.0"
    return payload
|
||||
|
||||
|
||||
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
async def start_scrape(request: ScrapeRequest, background_tasks: BackgroundTasks):
    """
    Start a new scraping job in the background.

    Returns the job ID that can be used to check status.
    """
    # `background_tasks` is unused here but stays in the signature.
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    # Config overrides: every explicitly supplied option except the URL itself
    config_overrides = {
        field: value
        for field, value in request.dict().items()
        if field != "url" and value is not None
    }

    # Convert URL to string
    url = str(request.url)

    try:
        job_id = job_manager.create_job(url, config_overrides)
        # Start job immediately if possible; a falsy result is reported as "queued"
        started = job_manager.start_job(job_id)
    except Exception as e:
        # Log with traceback and keep the cause chained for debugging
        log.exception(f"Error creating scraping job: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}") from e

    log.info(f"Created scraping job {job_id} for URL: {url}")

    return {
        "job_id": job_id,
        "status": "started" if started else "queued",
        "message": f"Scraping job {'started' if started else 'queued'} successfully"
    }
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")
async def get_job(job_id: str):
    """Get detailed information about a specific job"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    found = job_manager.get_job(job_id)
    if found:
        # Serialise the job through its own to_dict() into the response model
        return JobResponse(**found.to_dict())

    raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: str):
    """
    Get the actual reviews data for a completed job.

    Returns 404 if the job is not found or its reviews are not available,
    and 400 if the job has not completed yet.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    reviews = job_manager.get_job_reviews(job_id)
    if reviews is not None:
        # An empty list is a valid result; only None means "no data"
        return ReviewsResponse(
            job_id=job_id,
            reviews=reviews,
            count=len(reviews)
        )

    # No reviews: work out why so the caller gets a precise error
    job = job_manager.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.status != JobStatus.COMPLETED:
        raise HTTPException(
            status_code=400,
            detail=f"Job not completed yet (current status: {job.status})"
        )

    raise HTTPException(status_code=404, detail="Reviews data not available")
|
||||
|
||||
|
||||
@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
async def list_jobs(
    status: Optional[JobStatus] = Query(None, description="Filter by job status"),
    limit: int = Query(100, description="Maximum number of jobs to return", ge=1, le=1000)
):
    """List all jobs, optionally filtered by status"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    # Convert each job returned by the manager into its response model
    responses = []
    for entry in job_manager.list_jobs(status=status, limit=limit):
        responses.append(JobResponse(**entry.to_dict()))
    return responses
|
||||
|
||||
|
||||
@app.post("/jobs/{job_id}/start", summary="Start Pending Job")
async def start_job(job_id: str):
    """Start a pending job manually"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    if job_manager.start_job(job_id):
        return {"message": "Job started successfully"}

    # Not started: tell unknown job, wrong state and capacity limit apart
    target = job_manager.get_job(job_id)
    if not target:
        raise HTTPException(status_code=404, detail="Job not found")

    if target.status != JobStatus.PENDING:
        raise HTTPException(status_code=400, detail=f"Job is not pending (current status: {target.status})")

    # The job exists and is pending, so the refusal is reported as a capacity limit
    raise HTTPException(status_code=429, detail="Maximum concurrent jobs reached")
|
||||
|
||||
|
||||
@app.post("/jobs/{job_id}/cancel", summary="Cancel Job")
async def cancel_job(job_id: str):
    """Cancel a pending or running job"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    if job_manager.cancel_job(job_id):
        return {"message": "Job cancelled successfully"}

    # Cancellation refused: 404 for an unknown job, otherwise 400
    if not job_manager.get_job(job_id):
        raise HTTPException(status_code=404, detail="Job not found")
    raise HTTPException(status_code=400, detail="Job cannot be cancelled (already completed, failed, or cancelled)")
|
||||
|
||||
|
||||
@app.delete("/jobs/{job_id}", summary="Delete Job")
async def delete_job(job_id: str):
    """Delete a job from the system"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    # A falsy result from the manager is reported as "job not found"
    if job_manager.delete_job(job_id):
        return {"message": "Job deleted successfully"}

    raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
|
||||
@app.get("/stats", response_model=JobStatsResponse, summary="Get Job Statistics")
async def get_stats():
    """Get job manager statistics"""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    # The manager's stats dict is unpacked straight into the response model
    return JobStatsResponse(**job_manager.get_stats())
|
||||
|
||||
|
||||
@app.post("/check-reviews", summary="Check if Business Has Reviews")
async def check_reviews(request: Dict[str, str]):
    """
    Lightweight validation endpoint to check if a business has reviews.
    Uses the Chrome validation pool for fast response.

    Returns business name, rating, address, and review count.
    """
    url = request.get("url")
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    log.info(f"Validating business at: {url}")

    # Get a worker from validation pool
    # NOTE(review): this and the scrape call below look synchronous and may
    # block the event loop for up to the timeout - confirm against chrome_pool.
    worker = get_validation_worker(timeout=10)

    if not worker:
        raise HTTPException(
            status_code=503,
            detail="No validation workers available. Please try again in a few seconds."
        )

    # Set when the lookup fails, so the worker is handed back exactly once
    failed = False
    try:
        # Use the worker's driver to get business card info (faster than check_reviews_available)
        result = get_business_card_info(
            url=url,
            headless=True,
            driver=worker.driver,
            return_driver=True  # Don't close the driver
        )

        # Pop the driver from result before returning
        result.pop('driver', None)

        log.info(f"Validation result: name={result.get('name')}, rating={result.get('rating')}, reviews={result.get('total_reviews')}")
        return result

    except Exception as e:
        log.error(f"Error during validation: {e}")
        failed = True
        raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}") from e

    finally:
        if failed:
            # Recycle worker if there was an error
            release_validation_worker(worker, recycle=True)
        elif worker.driver:
            # Healthy worker goes back to the pool as-is
            release_validation_worker(worker, recycle=False)
|
||||
|
||||
|
||||
@app.get("/pool-stats", summary="Get Chrome Pool Statistics")
async def pool_stats():
    """Get statistics about Chrome worker pools"""
    # Pass through whatever the pool module reports
    return get_pool_stats()
|
||||
|
||||
|
||||
@app.post("/cleanup", summary="Manual Job Cleanup")
async def cleanup_jobs(max_age_hours: int = Query(24, description="Maximum age in hours", ge=1)):
    """Manually trigger cleanup of old completed/failed jobs"""
    if job_manager:
        # Same call the hourly background task makes, with a caller-chosen age
        job_manager.cleanup_old_jobs(max_age_hours=max_age_hours)
        return {"message": f"Cleaned up jobs older than {max_age_hours} hours"}

    raise HTTPException(status_code=500, detail="Job manager not initialized")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn when run directly
    import uvicorn

    log.info("Starting FastAPI server...")

    # Listens on all interfaces, port 8000, with auto-reload enabled
    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
        "log_level": "info",
    }
    uvicorn.run("api_server:app", **server_options)
|
||||
@@ -6,6 +6,7 @@ Production Google Reviews Scraper API Server with Phase 1 features:
|
||||
- Smart health checks with canary testing
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
@@ -15,12 +16,12 @@ from uuid import UUID
|
||||
from fastapi import FastAPI, HTTPException, Query, Header
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel, HttpUrl, Field
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
from modules.database import DatabaseManager, JobStatus
|
||||
from modules.webhooks import WebhookDispatcher, WebhookManager
|
||||
from modules.health_checks import HealthCheckSystem
|
||||
from modules.scraper_clean import fast_scrape_reviews # Clean scraper with hard refresh recovery
|
||||
from modules.scraper_clean import fast_scrape_reviews, LogCapture # Clean scraper with hard refresh recovery
|
||||
from modules.fast_scraper import check_reviews_available, get_business_card_info # Helper functions
|
||||
from modules.chrome_pool import (
|
||||
start_worker_pools,
|
||||
@@ -48,6 +49,11 @@ health_system: Optional[HealthCheckSystem] = None
|
||||
MAX_CONCURRENT_JOBS = int(os.getenv('MAX_CONCURRENT_JOBS', '5'))
|
||||
job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
|
||||
|
||||
# SSE: Store for broadcasting job updates to connected clients
|
||||
# Format: {job_id: [asyncio.Queue, ...]} for job-specific streams
|
||||
# Format: {"all": [asyncio.Queue, ...]} for all-jobs stream
|
||||
job_update_queues: Dict[str, List[asyncio.Queue]] = {"all": []}
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
@@ -82,11 +88,12 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
# Start Chrome worker pools (1 for validation, 2 for scraping)
|
||||
# These pre-warm Chrome instances for instant availability
|
||||
# headless=False because Docker uses Xvfb virtual display for better compatibility
|
||||
await asyncio.to_thread(
|
||||
start_worker_pools,
|
||||
validation_size=1,
|
||||
scraping_size=2,
|
||||
headless=True
|
||||
headless=False
|
||||
)
|
||||
log.info("Chrome worker pools started (1 validation + 2 scraping)")
|
||||
|
||||
@@ -148,6 +155,9 @@ class JobResponse(BaseModel):
|
||||
scrape_time: Optional[float] = None
|
||||
error_message: Optional[str] = None
|
||||
webhook_url: Optional[str] = None
|
||||
# Business metadata
|
||||
business_name: Optional[str] = None
|
||||
business_address: Optional[str] = None
|
||||
|
||||
|
||||
class ReviewsResponse(BaseModel):
|
||||
@@ -239,12 +249,296 @@ async def get_job(job_id: UUID):
|
||||
started_at=job['started_at'].isoformat() if job['started_at'] else None,
|
||||
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
|
||||
reviews_count=job['reviews_count'],
|
||||
total_reviews=job.get('total_reviews'),
|
||||
scrape_time=job['scrape_time'],
|
||||
error_message=job['error_message'],
|
||||
webhook_url=job.get('webhook_url')
|
||||
)
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}/logs", summary="Get Job Logs")
async def get_job_logs(job_id: UUID):
    """
    Get the scraper logs for a job.

    Returns logs from both successful and failed jobs.
    Useful for debugging scraping issues.
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    job = await db.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # Get scrape_logs from job
    scrape_logs = job.get('scrape_logs')

    # Parse if string (asyncpg might return JSONB as string)
    if isinstance(scrape_logs, str):
        try:
            scrape_logs = json.loads(scrape_logs)
        except ValueError:
            # json.JSONDecodeError is a ValueError; malformed logs count as "no logs"
            log.warning(f"Could not decode scrape_logs for job {job_id}")
            scrape_logs = None

    logs = scrape_logs or []

    return {
        "job_id": str(job_id),
        "status": job['status'],
        "error_message": job.get('error_message'),
        "logs": logs,
        "log_count": len(logs)
    }
|
||||
|
||||
|
||||
# ==================== SSE Streaming Endpoints ====================
|
||||
|
||||
async def broadcast_job_update(job_id: str, event_type: str, data: dict):
    """Broadcast an update to all subscribers of a job stream and the all-jobs stream.

    Builds one Server-Sent Events frame (``event:`` line, JSON ``data:`` line,
    blank line) and puts it on every queue registered under ``job_id`` and
    then on every queue registered under ``"all"``.

    Args:
        job_id: Key of the job-specific subscriber list in ``job_update_queues``.
        event_type: SSE event name.
        data: JSON-serialisable payload.
    """
    message = f"event: {event_type}\ndata: {json.dumps(data)}\n\n"

    # Snapshot both lists: subscribers remove themselves on disconnect, and a
    # list must not change while it is being iterated.
    job_subscribers = list(job_update_queues.get(job_id, ()))
    all_subscribers = list(job_update_queues.get("all", ()))

    for queue in job_subscribers + all_subscribers:
        try:
            await queue.put(message)
        except Exception:
            # One broken subscriber must not stop the others, but the failure
            # is logged instead of being swallowed.
            log.warning(f"Failed to deliver '{event_type}' update for job {job_id}", exc_info=True)
|
||||
|
||||
|
||||
def _parse_scrape_logs(raw):
    """Return scrape logs as a list: decode JSON text, use [] when missing or malformed."""
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:
            # json.JSONDecodeError is a ValueError
            return []
    return raw or []


@app.get("/jobs/{job_id}/stream", summary="Stream Job Updates (SSE)")
async def stream_job_updates(job_id: UUID):
    """
    Server-Sent Events stream for real-time job updates.

    Streams:
    - init: Snapshot of the job (status, counts, logs) sent on connect
    - update: Review count or log changes detected while the job runs
    - complete: Job finished (completed/failed/cancelled); the stream then closes
    - Events broadcast for this job (e.g. job_started, job_progress,
      job_completed, job_failed) are forwarded as-is

    Connect with EventSource in the browser:
    ```javascript
    const es = new EventSource('/jobs/{job_id}/stream');
    es.addEventListener('update', (e) => console.log('Update:', JSON.parse(e.data)));
    es.addEventListener('complete', (e) => { console.log('Done:', JSON.parse(e.data)); es.close(); });
    ```
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    # Verify job exists
    job = await db.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    job_id_str = str(job_id)
    terminal_states = ['completed', 'failed', 'cancelled']

    # Create queue for this client
    queue: asyncio.Queue = asyncio.Queue()

    def full_snapshot(job_data, logs):
        # Payload shared by the `init` event and the final `complete` event
        return {
            "job_id": job_id_str,
            "status": job_data['status'],
            "reviews_count": job_data.get('reviews_count'),
            "total_reviews": job_data.get('total_reviews'),
            "scrape_time": job_data.get('scrape_time'),
            "error_message": job_data.get('error_message'),
            "logs": logs
        }

    async def event_generator():
        # Register inside the generator so registration always pairs with the
        # `finally` below; registering in the endpoint body leaks the queue
        # when the response is dropped before the generator is started.
        job_update_queues.setdefault(job_id_str, []).append(queue)
        try:
            # Send initial state
            job_data = await db.get_job(job_id)
            # The job may have been deleted since the existence check above
            logs = _parse_scrape_logs(job_data.get('scrape_logs')) if job_data else []

            if job_data:
                yield f"event: init\ndata: {json.dumps(full_snapshot(job_data, logs))}\n\n"

                # If job is already complete, send complete event and close
                if job_data['status'] in terminal_states:
                    yield f"event: complete\ndata: {json.dumps({'status': job_data['status']})}\n\n"
                    return

            # Baselines for change detection; None counts as 0 so the first
            # poll does not emit a spurious update
            last_log_count = len(logs)
            last_reviews_count = (job_data.get('reviews_count') or 0) if job_data else 0

            # Keep connection alive and send updates
            while True:
                try:
                    # Wait for update with timeout (for keepalive)
                    try:
                        message = await asyncio.wait_for(queue.get(), timeout=2.0)
                        yield message
                    except asyncio.TimeoutError:
                        # Send keepalive comment
                        yield ": keepalive\n\n"

                    # Also poll database for updates (backup in case broadcast missed)
                    job_data = await db.get_job(job_id)
                    if not job_data:
                        continue

                    logs = _parse_scrape_logs(job_data.get('scrape_logs'))

                    # Terminal status: send the final snapshot and close
                    if job_data['status'] in terminal_states:
                        yield f"event: complete\ndata: {json.dumps(full_snapshot(job_data, logs))}\n\n"
                        return

                    # Check for new logs or progress
                    current_log_count = len(logs)
                    current_reviews = job_data.get('reviews_count') or 0

                    if current_log_count > last_log_count or current_reviews != last_reviews_count:
                        update = {
                            "job_id": job_id_str,
                            "status": job_data['status'],
                            "reviews_count": current_reviews,
                            "total_reviews": job_data.get('total_reviews'),
                            "logs": logs
                        }
                        yield f"event: update\ndata: {json.dumps(update)}\n\n"
                        last_log_count = current_log_count
                        last_reviews_count = current_reviews

                except Exception as e:
                    log.error(f"Error in SSE stream for job {job_id}: {e}")
                    break

        finally:
            # Unregister subscriber and drop the job's entry once it is empty
            subscribers = job_update_queues.get(job_id_str)
            if subscribers is not None:
                try:
                    subscribers.remove(queue)
                except ValueError:
                    pass  # already removed
                if not subscribers:
                    job_update_queues.pop(job_id_str, None)

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"  # Disable nginx buffering
        }
    )
|
||||
|
||||
|
||||
# NOTE(review): if "/jobs/{job_id}" is registered before this route, requests
# to "/jobs/stream" probably match it first and fail UUID validation - confirm
# the route order and register this path earlier if so.
# NOTE(review): the event names in the docstring below differ from the ones
# broadcast elsewhere in this file (job_started, job_progress, job_completed,
# job_failed) - confirm which names clients should listen for.
@app.get("/jobs/stream", summary="Stream All Jobs Updates (SSE)")
async def stream_all_jobs():
    """
    Server-Sent Events stream for all job updates.

    Streams:
    - job_created: New job was created
    - job_updated: Job status/progress changed
    - job_completed: Job finished

    Connect with EventSource in the browser:
    ```javascript
    const es = new EventSource('/jobs/stream');
    es.addEventListener('job_updated', (e) => console.log('Update:', JSON.parse(e.data)));
    ```
    """
    if not db:
        raise HTTPException(status_code=500, detail="Database not initialized")

    # Create queue for this client
    queue: asyncio.Queue = asyncio.Queue()

    async def event_generator():
        # Register inside the generator so registration always pairs with the
        # `finally` below; registering in the endpoint body leaks the queue
        # when the response is dropped before the generator is started.
        job_update_queues["all"].append(queue)
        try:
            # Send initial jobs list (up to 100 jobs)
            jobs = await db.list_jobs(limit=100)
            jobs_data = [
                {
                    "job_id": str(j['job_id']),
                    "status": j['status'],
                    "url": j['url'],
                    "created_at": j['created_at'].isoformat(),
                    "completed_at": j['completed_at'].isoformat() if j.get('completed_at') else None,
                    "reviews_count": j.get('reviews_count'),
                    "scrape_time": j.get('scrape_time'),
                    "error_message": j.get('error_message')
                }
                for j in jobs
            ]
            yield f"event: init\ndata: {json.dumps({'jobs': jobs_data})}\n\n"

            # Keep connection alive and send updates
            while True:
                try:
                    # Wait for update with timeout (for keepalive)
                    try:
                        message = await asyncio.wait_for(queue.get(), timeout=5.0)
                        yield message
                    except asyncio.TimeoutError:
                        # Send keepalive comment
                        yield ": keepalive\n\n"

                except Exception as e:
                    log.error(f"Error in all-jobs SSE stream: {e}")
                    break

        finally:
            # Unregister subscriber
            try:
                job_update_queues["all"].remove(queue)
            except ValueError:
                pass  # already removed

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"
        }
    )
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
|
||||
async def get_job_reviews(job_id: UUID):
|
||||
"""
|
||||
@@ -298,19 +592,34 @@ async def list_jobs(
|
||||
|
||||
jobs = await db.list_jobs(status=job_status, limit=limit, offset=offset)
|
||||
|
||||
return [
|
||||
JobResponse(
|
||||
result = []
|
||||
for job in jobs:
|
||||
# Extract business info from metadata if available
|
||||
metadata = job.get('metadata')
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except:
|
||||
metadata = None
|
||||
|
||||
business_name = metadata.get('business_name') if metadata else None
|
||||
business_address = metadata.get('business_address') if metadata else None
|
||||
|
||||
result.append(JobResponse(
|
||||
job_id=str(job['job_id']),
|
||||
status=job['status'],
|
||||
url=job['url'],
|
||||
created_at=job['created_at'].isoformat(),
|
||||
completed_at=job['completed_at'].isoformat() if job.get('completed_at') else None,
|
||||
reviews_count=job.get('reviews_count'),
|
||||
total_reviews=job.get('total_reviews'),
|
||||
scrape_time=job.get('scrape_time'),
|
||||
error_message=job.get('error_message')
|
||||
)
|
||||
for job in jobs
|
||||
]
|
||||
error_message=job.get('error_message'),
|
||||
business_name=business_name,
|
||||
business_address=business_address
|
||||
))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@app.delete("/jobs/{job_id}", summary="Delete Job")
|
||||
@@ -370,11 +679,11 @@ async def check_reviews(request: ScrapeRequest):
|
||||
|
||||
# SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
|
||||
# Let the actual scraper determine if reviews exist
|
||||
has_business = result.get('name') and result.get('rating')
|
||||
has_business = bool(result.get('name') and result.get('rating'))
|
||||
|
||||
return {
|
||||
"has_reviews": has_business, # Assume true if business exists
|
||||
"total_reviews": result['total_reviews'] or 0, # Show 0 if unknown
|
||||
"has_reviews": has_business, # Boolean: true if business exists
|
||||
"total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown
|
||||
"name": result.get('name'),
|
||||
"address": result.get('address'),
|
||||
"rating": result.get('rating'),
|
||||
@@ -488,6 +797,8 @@ async def run_scraping_job(job_id: UUID):
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
"""
|
||||
job_id_str = str(job_id)
|
||||
|
||||
async with job_semaphore: # Limit concurrent Chrome instances
|
||||
try:
|
||||
# Update status to running
|
||||
@@ -498,44 +809,79 @@ async def run_scraping_job(job_id: UUID):
|
||||
job = await db.get_job(job_id)
|
||||
url = job['url']
|
||||
|
||||
# Broadcast job started via SSE
|
||||
await broadcast_job_update(job_id_str, "job_started", {
|
||||
"job_id": job_id_str,
|
||||
"status": "running",
|
||||
"url": url
|
||||
})
|
||||
|
||||
# Get the event loop for progress updates from worker thread
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
# Progress callback to update job status with current/total counts
|
||||
# Create log capture instance that we can access for real-time logs
|
||||
log_capture = LogCapture()
|
||||
|
||||
# Progress callback to update job status with current/total counts AND logs
|
||||
def progress_callback(current_count: int, total_count: int):
|
||||
"""Update job progress from worker thread"""
|
||||
"""Update job progress and logs from worker thread"""
|
||||
async def update():
|
||||
# Get current logs from the shared log_capture
|
||||
current_logs = log_capture.get_logs()
|
||||
await db.update_job_status(
|
||||
job_id,
|
||||
JobStatus.RUNNING,
|
||||
reviews_count=current_count,
|
||||
total_reviews=total_count
|
||||
total_reviews=total_count,
|
||||
scrape_logs=current_logs
|
||||
)
|
||||
|
||||
# Broadcast progress via SSE
|
||||
await broadcast_job_update(job_id_str, "job_progress", {
|
||||
"job_id": job_id_str,
|
||||
"status": "running",
|
||||
"reviews_count": current_count,
|
||||
"total_reviews": total_count,
|
||||
"logs": current_logs
|
||||
})
|
||||
|
||||
# Schedule the coroutine on the event loop
|
||||
asyncio.run_coroutine_threadsafe(update(), loop)
|
||||
|
||||
# Run scraping with progress callback
|
||||
# Run scraping with progress callback and shared log capture
|
||||
# headless=False because Docker uses Xvfb virtual display
|
||||
result = await asyncio.to_thread(
|
||||
fast_scrape_reviews,
|
||||
url=url,
|
||||
headless=True,
|
||||
progress_callback=progress_callback
|
||||
headless=False,
|
||||
progress_callback=progress_callback,
|
||||
log_capture=log_capture
|
||||
)
|
||||
|
||||
if result['success']:
|
||||
# Save results to database
|
||||
# Save results to database (including scraper logs)
|
||||
await db.save_job_result(
|
||||
job_id=job_id,
|
||||
reviews=result['reviews'],
|
||||
scrape_time=result['time'],
|
||||
total_reviews=result.get('total_reviews')
|
||||
total_reviews=result.get('total_reviews'),
|
||||
scrape_logs=result.get('logs')
|
||||
)
|
||||
|
||||
log.info(
|
||||
f"Completed job {job_id}: {result['count']} reviews in {result['time']:.1f}s"
|
||||
)
|
||||
|
||||
# Broadcast job completed via SSE
|
||||
await broadcast_job_update(job_id_str, "job_completed", {
|
||||
"job_id": job_id_str,
|
||||
"status": "completed",
|
||||
"reviews_count": result['count'],
|
||||
"total_reviews": result.get('total_reviews'),
|
||||
"scrape_time": result['time'],
|
||||
"logs": result.get('logs', [])
|
||||
})
|
||||
|
||||
# Send webhook if configured
|
||||
if job.get('webhook_url'):
|
||||
webhook_manager = WebhookManager()
|
||||
@@ -553,15 +899,24 @@ async def run_scraping_job(job_id: UUID):
|
||||
)
|
||||
|
||||
else:
|
||||
# Job failed
|
||||
# Job failed - save logs for debugging
|
||||
await db.update_job_status(
|
||||
job_id,
|
||||
JobStatus.FAILED,
|
||||
error_message=result.get('error', 'Unknown error')
|
||||
error_message=result.get('error', 'Unknown error'),
|
||||
scrape_logs=result.get('logs')
|
||||
)
|
||||
|
||||
log.error(f"Failed job {job_id}: {result.get('error')}")
|
||||
|
||||
# Broadcast job failed via SSE
|
||||
await broadcast_job_update(job_id_str, "job_failed", {
|
||||
"job_id": job_id_str,
|
||||
"status": "failed",
|
||||
"error_message": result.get('error'),
|
||||
"logs": result.get('logs', [])
|
||||
})
|
||||
|
||||
# Send failure webhook if configured
|
||||
if job.get('webhook_url'):
|
||||
webhook_manager = WebhookManager()
|
||||
@@ -585,6 +940,14 @@ async def run_scraping_job(job_id: UUID):
|
||||
error_message=str(e)
|
||||
)
|
||||
|
||||
# Broadcast job failed via SSE
|
||||
await broadcast_job_update(job_id_str, "job_failed", {
|
||||
"job_id": job_id_str,
|
||||
"status": "failed",
|
||||
"error_message": str(e),
|
||||
"logs": []
|
||||
})
|
||||
|
||||
# Send failure webhook
|
||||
job = await db.get_job(job_id)
|
||||
if job and job.get('webhook_url'):
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Brute force approach: Try every possible div class combination and see which gives us reviews.
|
||||
"""
|
||||
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
|
||||
# Google Maps search used as the probe page for selector discovery.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent: best effort. The dialog is not always shown, so a failure
    # here is reported but must not abort the run.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"Consent dialog not handled: {exc}")

    # Click reviews tab (matched on visible text or aria-label)
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        tab_text = (tab.text or '').lower()
        tab_label = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in tab_text or 'review' in tab_label:
            # JS click avoids "element not interactable" on overlapped tabs.
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load reviews. Best effort: if the pane is missing, the
    # candidate search below reports 'Pane not found' on its own.
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(10):
            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
            time.sleep(0.3)
    except Exception as exc:
        print(f"Could not scroll reviews pane: {exc}")

    print("\n" + "="*80)
    print("BRUTE FORCE SELECTOR SEARCH")
    print("="*80)

    # Get ALL unique class combinations from divs inside the reviews pane
    candidates = driver.execute_script("""
        // Find the reviews pane
        const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde');
        if (!pane) return {error: 'Pane not found'};

        // Get all divs inside the pane
        const allDivs = Array.from(pane.querySelectorAll('div'));

        // For each div, check if it looks like a review
        const candidates = [];

        for (let div of allDivs) {
            // Skip if no classes
            if (div.classList.length === 0) continue;

            // Check for review indicators
            const hasRating = !!div.querySelector('[aria-label*="star" i]');
            const hasText = div.textContent.length > 50 && div.textContent.length < 1000;  // Individual review size
            const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img');

            // Calculate score
            let score = 0;
            if (hasRating) score += 3;
            if (hasText) score += 2;
            if (hasAuthor) score += 1;

            // A rating (3) plus either text (2) or an author marker (1) is required
            if (score >= 4) {
                candidates.push({
                    classes: div.className,
                    // CSS.escape keeps class names with special characters
                    // from producing a selector that querySelectorAll rejects.
                    selector: 'div.' + Array.from(div.classList).map(c => CSS.escape(c)).join('.'),
                    score: score,
                    text_length: div.textContent.length,
                    sample_text: div.textContent.substring(0, 100)
                });
            }
        }

        // Count how many elements match each selector (first occurrence wins)
        const selectorCounts = {};
        for (let candidate of candidates) {
            if (!selectorCounts[candidate.selector]) {
                selectorCounts[candidate.selector] = {
                    count: pane.querySelectorAll(candidate.selector).length,
                    score: candidate.score,
                    text_length: candidate.text_length,
                    sample: candidate.sample_text
                };
            }
        }

        // Sort by count (we want selectors that match many reviews)
        const sorted = Object.entries(selectorCounts)
            .sort((a, b) => b[1].count - a[1].count)
            .slice(0, 10);

        return {
            top_selectors: sorted.map(([selector, info]) => ({
                selector: selector,
                count: info.count,
                score: info.score,
                text_length: info.text_length,
                sample: info.sample
            }))
        };
    """)

    if 'error' in candidates:
        print(f"ERROR: {candidates['error']}")
    else:
        print("\nTop 10 candidate selectors (sorted by count):\n")
        for i, candidate in enumerate(candidates['top_selectors'], 1):
            print(f"{i}. {candidate['selector']}")
            print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}")
            print(f" Sample: {candidate['sample'][:80]}...")
            print()

        # Test the top selector
        if candidates['top_selectors']:
            top_selector = candidates['top_selectors'][0]['selector']
            print(f"\n{'='*80}")
            print(f"TESTING TOP SELECTOR: {top_selector}")
            print(f"{'='*80}")

            # The selector is passed as a script argument instead of being
            # spliced into the JavaScript source: quotes or backslashes in it
            # would otherwise corrupt the script.
            test_result = driver.execute_script("""
                const elements = document.querySelectorAll(arguments[0]);
                const reviews = [];

                for (let i = 0; i < Math.min(3, elements.length); i++) {
                    const elem = elements[i];
                    const review = {
                        has_author: !!elem.querySelector('button, img'),
                        has_rating: !!elem.querySelector('[aria-label*="star" i]'),
                        has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
                        text_length: elem.textContent.length,
                        text_sample: elem.textContent.substring(0, 150)
                    };
                    reviews.push(review);
                }

                return reviews;
            """, top_selector)

            print(f"\nFirst 3 elements using {top_selector}:")
            for i, rev in enumerate(test_result, 1):
                print(f"\n Element {i}:")
                for key, value in rev.items():
                    print(f" {key}: {value}")

    # Keep the window open so the page can be inspected by hand.
    print(f"\n{'='*80}")
    print("Browser staying open for 60 seconds...")
    print(f"{'='*80}")
    time.sleep(60)

finally:
    # Always release the browser, even when a step above raised.
    driver.quit()
|
||||
@@ -1,106 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Check the actual page structure - maybe reviews are already visible without clicking a tab!
|
||||
"""
|
||||
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
|
||||
# Google Maps search used as the probe page (no hl parameter, so the UI
# language is whatever Google picks for the session).
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    print(f"Initial URL: {url}")
    time.sleep(5)

    # GDPR consent: best effort. The dialog is not always shown, so a failure
    # here is reported but must not abort the run.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"Consent dialog not handled: {exc}")

    # Check final URL
    final_url = driver.current_url
    print(f"Final URL after redirect: {final_url}")

    # Wait a bit more for dynamic content
    time.sleep(3)

    # Check page structure
    print("\n" + "="*80)
    print("PAGE STRUCTURE ANALYSIS")
    print("="*80)

    page_info = driver.execute_script("""
        return {
            tabs_found: document.querySelectorAll('button[role="tab"]').length,
            reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length,
            reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length,
            divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length,
            review_containers: document.querySelectorAll('div.fontBodyMedium').length,
            page_text_sample: document.body.innerText.substring(0, 500),
            has_review_text: document.body.innerText.toLowerCase().includes('review'),
            has_atsiliepimai_text: document.body.innerText.toLowerCase().includes('atsiliepimai')
        };
    """)

    print(f"\nTabs with role='tab': {page_info['tabs_found']}")
    print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}")
    print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}")
    print(f"Elements with star ratings: {page_info['divs_with_ratings']}")
    print(f"div.fontBodyMedium: {page_info['review_containers']}")
    print(f"Contains 'review': {page_info['has_review_text']}")
    print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}")

    print("\nPage text sample (first 500 chars):")
    print(page_info['page_text_sample'])

    # Try to find ANY element with rating
    print("\n" + "="*80)
    print("SEARCHING FOR RATING ELEMENTS")
    print("="*80)

    rating_search = driver.execute_script("""
        // Only elements that carry an aria-label can match, so the rest of
        // the DOM is skipped up front.
        const elements = Array.from(document.querySelectorAll('[aria-label]'));
        const withRatings = [];

        // className is an SVGAnimatedString (no substring method) on SVG
        // elements, so the class attribute is read as plain text instead.
        const classText = (el) => (el.getAttribute('class') || '').substring(0, 100);

        for (let elem of elements) {
            const ariaLabel = elem.getAttribute('aria-label') || '';
            const lowered = ariaLabel.toLowerCase();
            if (lowered.includes('star') || lowered.includes('žvaigžd')) {
                withRatings.push({
                    tag: elem.tagName,
                    ariaLabel: ariaLabel.substring(0, 100),
                    classes: classText(elem),
                    parentTag: elem.parentElement ? elem.parentElement.tagName : null,
                    parentClasses: elem.parentElement ? classText(elem.parentElement) : null
                });
            }
        }

        return withRatings.slice(0, 10);  // First 10
    """)

    # The count below is capped at 10 by the script; only the first 5 are printed.
    print(f"\nFound {len(rating_search)} elements with 'star' in aria-label:")
    for i, elem in enumerate(rating_search[:5], 1):
        print(f"\n Element {i}:")
        print(f" Tag: {elem['tag']}")
        print(f" Aria-label: {elem['ariaLabel']}")
        print(f" Classes: {elem['classes']}")
        print(f" Parent tag: {elem['parentTag']}")
        print(f" Parent classes: {elem['parentClasses']}")

    # Keep the window open so the page can be inspected by hand.
    print(f"\n{'='*80}")
    print("Browser open for manual inspection...")
    print("LOOK AT THE PAGE - Are reviews visible? What's their structure?")
    print(f"{'='*80}")
    time.sleep(180)  # 3 minutes

finally:
    # Always release the browser, even when a step above raised.
    driver.quit()
|
||||
@@ -1,355 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cookie-based API scraper - Capture fresh cookies on each run, then fast API scraping.
|
||||
|
||||
Flow:
|
||||
1. Start browser (15 seconds)
|
||||
2. Capture cookies from active browser session (5 seconds)
|
||||
3. Close browser
|
||||
4. Use cookies for rapid API pagination (5-10 seconds)
|
||||
|
||||
Total time: ~25-35 seconds for 244 reviews (vs 155 seconds with scrolling)
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Optional, Tuple
|
||||
import requests
|
||||
from seleniumbase import SB
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CookieBasedScraper:
    """Capture cookies from a fresh browser session, then page reviews over HTTP.

    ``capture_cookies`` drives a browser once to collect cookies, the user
    agent and the place ID; ``fetch_reviews_page`` and ``scrape_all`` then
    reuse them through a ``requests`` session.
    """

    def __init__(self, url: str, headless: bool = False):
        """Store the target URL and create an empty HTTP session.

        Args:
            url: Google Maps place URL to open in the browser.
            headless: Passed through to the browser launcher.
        """
        self.url = url
        self.headless = headless
        self.session = requests.Session()
        self.place_id = None
        # The interceptor is only used for its response parser, hence no driver.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    def _collect_browser_cookies(self, sb) -> list:
        """Return cookie dicts from the live browser, via CDP or document.cookie."""
        log.info("Capturing cookies via CDP...")
        try:
            # Use Chrome DevTools Protocol to get ALL cookies from all domains
            cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
            browser_cookies = cdp_cookies.get('cookies', [])
            log.info(f"✓ Captured {len(browser_cookies)} cookies via CDP")

            # Identify cookies by name AND scope: the same cookie name can
            # exist on several domains, and those are distinct cookies.
            seen = {(c['name'], c.get('domain'), c.get('path')) for c in browser_cookies}

            # Also try getting cookies for specific Google domains
            for domain in ['.google.com', 'www.google.com', '.google.es', 'maps.google.com']:
                try:
                    # A leading dot is cookie-domain notation, not a valid URL host.
                    domain_cookies = sb.driver.execute_cdp_cmd(
                        'Network.getCookies', {'urls': [f"https://{domain.lstrip('.')}"]}
                    )
                    extra_cookies = domain_cookies.get('cookies', [])
                    if extra_cookies:
                        log.info(f" Found {len(extra_cookies)} cookies for {domain}")
                        # Add any new cookies we don't have yet
                        for cookie in extra_cookies:
                            key = (cookie['name'], cookie.get('domain'), cookie.get('path'))
                            if key not in seen:
                                seen.add(key)
                                browser_cookies.append(cookie)
                except Exception as e:
                    # Per-domain lookup is optional; keep what we already have.
                    log.debug(f"Cookie lookup for {domain} failed: {e}")

            log.info(f"✓ Total cookies after checking all domains: {len(browser_cookies)}")
        except Exception as e:
            log.warning(f"CDP cookie capture failed: {e}")
            # Fallback to JavaScript (won't get httpOnly cookies)
            cookie_string = sb.execute_script("return document.cookie") or ''
            browser_cookies = []
            for cookie in cookie_string.split('; '):
                if '=' in cookie:
                    name, value = cookie.split('=', 1)
                    browser_cookies.append({
                        'name': name,
                        'value': value,
                        'domain': '.google.com',
                        'path': '/'
                    })
            log.info(f"✓ Fallback: Captured {len(browser_cookies)} cookies via JS")
        return browser_cookies

    def _report_auth_cookies(self) -> None:
        """Log which Google auth cookies the session holds; warn if none."""
        important_cookies = ['SID', 'HSID', 'SSID', 'APISID', 'SAPISID', '__Secure-1PSID', '__Secure-3PSID']
        found_cookies = [name for name in important_cookies if name in self.session.cookies]

        log.info(f"✓ Found auth cookies: {', '.join(found_cookies) if found_cookies else 'NONE - this is the problem!'}")

        if found_cookies:
            return

        log.warning("\n" + "="*60)
        log.warning("⚠️ NO AUTHENTICATION COOKIES FOUND!")
        log.warning("="*60)
        log.warning("Google Maps API requires you to be logged into Google.")
        log.warning("")
        log.warning("To fix this:")
        log.warning("1. Log into your Google account in Chrome")
        log.warning("2. Visit google.com/maps while logged in")
        log.warning("3. Then run this scraper again")
        log.warning("")
        log.warning("Alternatively, use the hybrid scraper (start.py) which")
        log.warning("handles authentication automatically and already achieves")
        log.warning("95%+ API coverage with 100% parse rate!")
        log.warning("="*60 + "\n")

        # Continue anyway to show the error
        log.info("Continuing anyway to demonstrate the API error...")

    def capture_cookies(self) -> bool:
        """Open the page in a browser and copy its cookies into ``self.session``.

        Also sets ``self.place_id`` from the browser URL and installs request
        headers (including the browser's user agent) on the session.

        Returns:
            True when cookies and a place ID were captured, False when the
            place ID could not be extracted or any step raised.
        """
        log.info("="*60)
        log.info("STEP 1: Capturing cookies from browser session")
        log.info("="*60)

        sb_context = None
        try:
            # Create driver - need to enter the context manually
            log.info("Starting browser...")
            pending_context = SB(uc=True, headless=self.headless)
            sb = pending_context.__enter__()
            # Register for cleanup only after __enter__ succeeded: __exit__
            # must not be called on a context that was never entered.
            sb_context = pending_context

            log.info("Opening Google Maps...")
            sb.open(self.url)
            time.sleep(2)

            # Dismiss cookie consent (best effort - the dialog may be absent)
            try:
                sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
                log.info("✓ Cookie dialog dismissed")
            except Exception as e:
                log.debug(f"No cookie dialog dismissed: {e}")

            # Click reviews tab
            try:
                sb.click('.LRkQ2', timeout=5)
                log.info("✓ Opened reviews tab")
                time.sleep(3)  # Wait for reviews to load
            except Exception as e:
                log.warning(f"Could not click reviews tab: {e}")

            # Extract place ID from current URL: the text between the first
            # '!1s' marker and the next '!'.
            current_url = sb.get_current_url()
            _, marker, remainder = current_url.partition('!1s')
            if marker:
                self.place_id = remainder.split('!')[0]
                log.info(f"✓ Extracted place ID: {self.place_id}")

            if not self.place_id:
                log.error("Could not extract place ID")
                return False

            # CRITICAL: Scroll once to trigger an API call!
            # This causes Google to set the necessary session cookies
            log.info("Triggering API call by scrolling...")
            sb.execute_script("window.scrollBy(0, 500)")
            time.sleep(2)  # Wait for API call to complete
            log.info("✓ API call triggered - session cookies should now be set")

            # CAPTURE COOKIES using CDP (gets httpOnly cookies too!)
            browser_cookies = self._collect_browser_cookies(sb)

            # CAPTURE USER AGENT while driver is active
            user_agent = sb.execute_script("return navigator.userAgent")
            log.info("✓ Captured user agent")

            # Process cookies into session
            for cookie in browser_cookies:
                self.session.cookies.set(
                    name=cookie['name'],
                    value=cookie['value'],
                    domain=cookie.get('domain', '.google.com'),
                    path=cookie.get('path', '/')
                )

            # Set headers
            self.session.headers.update({
                'User-Agent': user_agent,
                'Accept': '*/*',
                'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
                'Referer': 'https://www.google.com/maps/',
                'Origin': 'https://www.google.com',
                'X-Requested-With': 'XMLHttpRequest',
            })

            # Print ALL cookie names for debugging
            all_cookie_names = [c['name'] for c in browser_cookies]
            log.info(f"Cookie names: {', '.join(all_cookie_names)}")

            # Print important cookies for debugging
            self._report_auth_cookies()

            log.info("\n✅ Cookie capture successful!")
            log.info(f" Total cookies: {len(browser_cookies)}")
            log.info(f" Place ID: {self.place_id}")
            log.info(" Session ready: Yes\n")

            return True

        except Exception as e:
            # log.exception records the traceback alongside the message.
            log.exception(f"Cookie capture failed: {e}")
            return False

        finally:
            # IMPORTANT: Close browser properly
            if sb_context:
                try:
                    log.info("Closing browser...")
                    sb_context.__exit__(None, None, None)  # Properly exit context
                    log.info("✓ Browser closed\n")
                except Exception as e:
                    log.debug(f"Error closing browser: {e}")

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List["InterceptedReview"], Optional[str]]:
        """Fetch one page of reviews via the API using the captured cookies.

        Args:
            continuation_token: Token from the previous page, or None for
                the first page.

        Returns:
            ``(reviews, next_token)``. On a non-200 status or any error the
            result is ``([], None)``.
        """
        # Build pb parameter: the two request shapes differ only by the
        # optional '!2s<token>' segment.
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        pb = (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
            "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )

        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': pb
        }

        try:
            url = 'https://www.google.com/maps/rpc/listugcposts'
            response = self.session.get(url, params=params, timeout=10)

            if response.status_code != 200:
                log.error(f"API error {response.status_code}")
                log.error(f"Response: {response.text[:500]}")
                log.debug(f"Request URL: {response.url}")
                log.debug(f"Request headers: {dict(self.session.headers)}")
                return [], None

            # Parse response: drop the ")]}'" prefix when present, then decode JSON.
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Get next token: a string in slot 1 of the top-level list.
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            return reviews, next_token

        except Exception as e:
            log.error(f"API request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 100) -> List[dict]:
        """Capture cookies, then page through the API collecting unique reviews.

        Args:
            max_pages: Upper bound on the number of API pages requested.

        Returns:
            A list of review dicts; empty when cookie capture fails or the
            first page yields nothing.
        """
        # Step 1: Capture cookies from browser
        if not self.capture_cookies():
            log.error("Failed to capture cookies - aborting")
            return []

        # Step 2: Scrape via API
        log.info("="*60)
        log.info("STEP 2: Fast API scraping (no browser needed)")
        log.info("="*60)

        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        token = None
        page = 0

        while page < max_pages:
            page += 1

            log.info(f"Fetching page {page}...")
            reviews, token = self.fetch_reviews_page(token)

            if not reviews:
                if page == 1:
                    log.error("No reviews on first page - cookies may have expired or be invalid")
                else:
                    log.info("No more reviews found")
                break

            # Deduplicate (author + date is the fallback key when no ID exists)
            for review in reviews:
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid not in seen_ids:
                    seen_ids.add(rid)
                    all_reviews.append({
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    })

            log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                log.info("No continuation token - all reviews fetched")
                break

            # Small delay between requests
            time.sleep(0.2)

        elapsed = time.time() - start_time
        # A run that fails immediately can measure 0 seconds on coarse clocks;
        # the summary must not raise ZeroDivisionError in that case.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info("\n" + "="*60)
        log.info("✅ SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"API calls: {page}")
        log.info(f"API scraping time: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info("="*60 + "\n")

        return all_reviews
|
||||
|
||||
|
||||
def main():
    """Scrape one sample listing and write its reviews to a JSON file."""
    target = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

    collected = CookieBasedScraper(target, headless=False).scrape_all(max_pages=50)

    # Nothing to save or show: report and stop.
    if not collected:
        log.error("No reviews scraped!")
        return

    # Persist everything that was scraped.
    destination = 'cookie_based_reviews.json'
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(collected, handle, indent=2, ensure_ascii=False)

    log.info(f"💾 Saved {len(collected)} reviews to {destination}")

    # Print the first review as a quick sanity check.
    log.info("\nSample review:")
    first = collected[0]
    log.info(f" Author: {first['author']}")
    log.info(f" Rating: {first['rating']}★")
    log.info(f" Date: {first['date_text']}")
    if first['text']:
        log.info(f" Text: {first['text'][:80]}...")


if __name__ == '__main__':
    main()
|
||||
@@ -1,249 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Direct API scraper - fetch Google Maps reviews via API without browser scrolling.
|
||||
This is 10-25x faster than traditional browser-based scraping.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import urllib.parse
|
||||
from typing import List, Optional, Tuple
|
||||
import requests
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DirectAPIScraper:
    """Fetch Google Maps reviews directly via API without browser automation."""

    def __init__(self, place_id: str, language: str = 'en', region: str = 'us'):
        """
        Initialize the direct API scraper.

        Args:
            place_id: Google Maps place ID (e.g., '0x46dd947294b213bf:0x864c7a232527adb4')
            language: Language code (e.g., 'en', 'es', 'de')
            region: Region/country code (e.g., 'us', 'es', 'de')
        """
        self.place_id = place_id
        self.language = language
        self.region = region
        self.base_url = 'https://www.google.com/maps/rpc/listugcposts'

        # Initialize parser (reuse the working parser from api_interceptor);
        # no driver is needed because only the parsing method is used.
        self.interceptor = GoogleMapsAPIInterceptor(None)

        # Session for maintaining cookies
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': f'{language},{language}-{region.upper()};q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'X-Requested-With': 'XMLHttpRequest',
        })

    def _build_pb_param(self, continuation_token: Optional[str] = None) -> str:
        """
        Build the Protocol Buffer (pb) parameter for the API request.

        Args:
            continuation_token: Pagination token from previous response

        Returns:
            pb parameter string (NOT URL-encoded - that's done by requests)
        """
        # The first-page and follow-up requests differ only by the optional
        # '!2s<token>' segment, so one template serves both.
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
            "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )

    def _establish_session(self):
        """Visit Google Maps page to establish session cookies.

        Failures are logged as warnings and otherwise ignored, so scraping
        proceeds with whatever cookies the session already has.
        """
        try:
            # Visit the main maps page to get cookies
            maps_url = f"https://www.google.com/maps/place/?q=place_id:{self.place_id}"
            log.debug("Establishing session by visiting Google Maps...")
            response = self.session.get(maps_url, timeout=10)
            response.raise_for_status()
            log.debug(f"Session established (cookies: {len(self.session.cookies)})")
        except Exception as e:
            log.warning(f"Failed to establish session: {e}")

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List["InterceptedReview"], Optional[str]]:
        """
        Fetch a single page of reviews from the API.

        Args:
            continuation_token: Pagination token from previous response

        Returns:
            Tuple of (reviews list, next continuation token or None).
            Any request, decoding or parsing error yields ([], None).
        """
        # Build request parameters
        params = {
            'authuser': '0',
            'hl': self.language,
            'gl': self.region,
            'pb': self._build_pb_param(continuation_token)
        }

        try:
            log.info(f"Fetching reviews page (token: {'initial' if not continuation_token else 'paginated'})...")

            response = self.session.get(self.base_url, params=params, timeout=10)

            # Log response for debugging
            log.debug(f"Response status: {response.status_code}")
            if response.status_code != 200:
                log.error(f"Response body: {response.text[:500]}")

            response.raise_for_status()

            # Google returns responses with )]}' prefix - strip it
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            log.debug(f"Response size: {len(body)} bytes")

            # Parse JSON response
            data = json.loads(body)

            # Extract reviews using our working parser
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Extract next continuation token: a string in slot 1 of the top-level list
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
                log.debug(f"Found continuation token: {next_token[:50]}...")

            log.info(f"✓ Extracted {len(reviews)} reviews from this page")

            return reviews, next_token

        except requests.exceptions.RequestException as e:
            log.error(f"API request failed: {e}")
            return [], None
        except json.JSONDecodeError as e:
            log.error(f"Failed to parse API response: {e}")
            return [], None
        except Exception as e:
            log.error(f"Unexpected error: {e}")
            return [], None

    def fetch_all_reviews(self, max_pages: int = 100, delay: float = 0.5) -> List[dict]:
        """
        Fetch all reviews by paginating through the API.

        Args:
            max_pages: Maximum number of pages to fetch (safety limit)
            delay: Delay between requests in seconds

        Returns:
            List of review dictionaries
        """
        all_reviews = []
        seen_ids = set()
        continuation_token = None
        page = 0

        start_time = time.time()
        log.info(f"Starting direct API scraping for place: {self.place_id}")

        # Establish session first
        self._establish_session()

        while page < max_pages:
            page += 1

            # Fetch page
            reviews, continuation_token = self.fetch_reviews_page(continuation_token)

            if not reviews:
                log.info("No more reviews found - stopping")
                break

            # Deduplicate and add reviews (author + date is the fallback key
            # when a review carries no ID)
            for review in reviews:
                review_id = review.review_id or f"{review.author}_{review.date_text}"
                if review_id not in seen_ids:
                    seen_ids.add(review_id)

                    # Convert to dict
                    all_reviews.append({
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    })

            log.info(f"Page {page}: {len(all_reviews)} total unique reviews")

            # Check if we have a continuation token
            if not continuation_token:
                log.info("No continuation token - all reviews fetched")
                break

            # Rate limiting
            if delay > 0 and page < max_pages:
                time.sleep(delay)

        elapsed = time.time() - start_time
        # max_pages=0 or an instantly failing request can measure 0 seconds on
        # coarse clocks; the summary must not raise ZeroDivisionError then.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info(f"\n{'='*60}")
        log.info("✅ Direct API scraping completed!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"Pages fetched: {page}")
        log.info(f"Time elapsed: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info(f"{'='*60}\n")

        return all_reviews
|
||||
|
||||
|
||||
def main():
    """Run the direct API scraper against a sample place and save the result.

    Fetches up to 50 pages of reviews for a hard-coded place ID, writes them
    to ``direct_api_reviews.json`` and logs the first review as a preview.
    """
    # Place ID of "Soho Club", taken from the test URL.
    api_scraper = DirectAPIScraper(
        place_id='0x46dd947294b213bf:0x864c7a232527adb4',
        language='es',
        region='es',
    )
    collected = api_scraper.fetch_all_reviews(max_pages=50, delay=0.5)

    # Persist everything that was fetched, even an empty list.
    destination = 'direct_api_reviews.json'
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(collected, handle, indent=2, ensure_ascii=False)
    log.info(f"Saved {len(collected)} reviews to {destination}")

    # Nothing to preview when no review was returned.
    if not collected:
        return

    log.info("\nSample review:")
    first = collected[0]
    log.info(f" Author: {first['author']}")
    log.info(f" Rating: {first['rating']}★")
    log.info(f" Date: {first['date_text']}")
    if first['text']:
        text_line = f" Text: {first['text'][:100]}..."
    else:
        text_line = " Text: (no text)"
    log.info(text_line)


if __name__ == '__main__':
    main()
|
||||
@@ -1,61 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick script to dump API responses for debugging
|
||||
"""
|
||||
import json
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
from seleniumbase import SB
|
||||
|
||||
# Place page used for the debugging session.
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

with SB(uc=True, headless=False) as sb:
    # The interceptor object is created up front; its JavaScript hook is only
    # injected once the page has been opened (see below).
    interceptor = GoogleMapsAPIInterceptor(sb.driver)

    sb.open(url)
    sb.sleep(2)

    # Inject the response hook as early as possible after navigation.
    interceptor.inject_response_interceptor()
    sb.sleep(2)

    # Open the reviews tab, trying the Spanish label first and then English.
    # Only ordinary errors are swallowed so Ctrl+C still stops the script.
    for tab_label in ("Reseñas", "Reviews"):
        try:
            sb.click(f'.LRkQ2:contains("{tab_label}")', timeout=5)
            break
        except Exception:
            continue
    else:
        print("Could not click the reviews tab, continuing anyway...")

    print("Waiting for reviews to load...")
    sb.sleep(5)

    # Scroll to trigger more requests.
    print("Scrolling to load more...")
    for i in range(5):
        sb.execute_script("window.scrollBy(0, 800)")
        sb.sleep(2)
        print(f" Scroll {i+1}/5...")

    print("\nCollecting responses...")
    responses = interceptor.get_intercepted_responses()

    print(f"\nCaptured {len(responses)} responses")

    # Dump every response twice: the full record as JSON, and the bare body
    # as text for easier viewing.
    for i, resp in enumerate(responses):
        # A response without a body is written as an empty file instead of
        # crashing on None.
        body = resp.get('body') or ''

        filename = f"api_response_{i}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(resp, f, indent=2, ensure_ascii=False)
        # Report the encoded size so the "bytes" label is accurate.
        print(f"Saved: {filename} ({len(body.encode('utf-8'))} bytes)")

        body_file = f"api_response_{i}_body.txt"
        with open(body_file, 'w', encoding='utf-8') as f:
            f.write(body)
        print(f"Saved body: {body_file}")

    print("\nDone! Check api_response_*.json files")
|
||||
@@ -1,107 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dump raw API responses for analysis.
|
||||
This will help us understand Google's exact response format.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from seleniumbase import SB
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

# Place page used for the capture session.
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

# All dumps land in this directory (created on demand).
output_dir = Path("api_response_samples")
output_dir.mkdir(exist_ok=True)

print("[INFO] Starting browser...")
with SB(uc=True, headless=False) as sb:
    print("[INFO] Navigating to Google Maps...")
    sb.open(url)
    sb.sleep(3)

    # Inject the interceptor before any review request is triggered.
    print("[INFO] Injecting API interceptor...")
    interceptor = GoogleMapsAPIInterceptor(sb.driver)
    interceptor.inject_response_interceptor()
    sb.sleep(2)

    # Click the reviews tab; failure is tolerated, but only for ordinary
    # errors so that Ctrl+C still aborts the script.
    print("[INFO] Looking for reviews tab...")
    try:
        sb.click('.LRkQ2', timeout=5)
        print("[INFO] ✓ Clicked reviews tab")
    except Exception:
        print("[WARN] Could not click reviews tab, trying to continue...")

    sb.sleep(5)

    # Scroll multiple times to trigger API calls.
    print("[INFO] Scrolling to trigger API calls...")
    for i in range(10):
        sb.execute_script("window.scrollBy(0, 800)")
        sb.sleep(1.5)

        # Progress report every third scroll.
        if (i + 1) % 3 == 0:
            responses = interceptor.get_intercepted_responses()
            if responses:
                print(f"[INFO] Captured {len(responses)} responses so far...")

    # Final collection.
    print("\n[INFO] Collecting all captured responses...")
    all_responses = interceptor.get_intercepted_responses()

    if not all_responses:
        print("[ERROR] No responses captured!")
        # SystemExit is raised directly; the `exit` helper is only
        # guaranteed to exist when the `site` module is loaded.
        raise SystemExit(1)

    print(f"[SUCCESS] Captured {len(all_responses)} API responses!\n")

    # Dump each response as: full record, raw body, and (if valid) parsed JSON.
    for i, resp in enumerate(all_responses):
        # Missing or None fields fall back to safe defaults.
        url_str = resp.get('url') or 'unknown'
        body = resp.get('body') or ''
        # Encoded size, so the "bytes" label in the report is accurate.
        size = len(body.encode('utf-8'))

        full_file = output_dir / f"response_{i:02d}_full.json"
        with open(full_file, 'w', encoding='utf-8') as f:
            json.dump(resp, f, indent=2, ensure_ascii=False)

        body_file = output_dir / f"response_{i:02d}_body.txt"
        with open(body_file, 'w', encoding='utf-8') as f:
            f.write(body)

        # Strip the ")]}'" prefix some responses carry before the JSON payload.
        if body.startswith(")]}'"):
            clean_body = body[4:].strip()
        else:
            clean_body = body

        json_file = output_dir / f"response_{i:02d}_parsed.json"
        try:
            parsed = json.loads(clean_body)
        except ValueError:
            # Only a parse failure means "not JSON"; file-system errors while
            # writing the parsed copy are no longer mislabelled.
            print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes) [Not JSON]")
            print(f" Full: {full_file}")
            print(f" Body: {body_file}")
        else:
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)
            print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes)")
            print(f" Full: {full_file}")
            print(f" Body: {body_file}")
            print(f" Parsed: {json_file}")
        print()

    print(f"\n[SUCCESS] Dumped {len(all_responses)} responses to: {output_dir}/")
    print("\nNext steps:")
    print(" 1. Open response_00_parsed.json to study the structure")
    print(" 2. Look for arrays containing review data")
    print(" 3. Identify patterns for: review ID, author, rating, text, date")
    print(" 4. Update the parser patterns in modules/api_interceptor.py")

print("\n[DONE]")
|
||||
@@ -1,249 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fast API scraper - Minimal browser usage, maximum API speed.
|
||||
|
||||
Strategy:
|
||||
1. Start browser and navigate to reviews page
|
||||
2. Capture cookies and user-agent from browser
|
||||
3. Let one API call happen naturally (to warm up the session)
|
||||
4. Close browser
|
||||
5. Use requests library with captured session to make fast API calls
|
||||
6. Paginate through all reviews without any scrolling
|
||||
|
||||
Expected: 10-25x faster than traditional scrolling approach.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Optional, Tuple
|
||||
import requests
|
||||
from seleniumbase import SB
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FastAPIScraper:
    """Minimal browser, maximum speed.

    A browser is opened once to collect cookies, the user agent and the place
    ID. Review pages are then fetched with plain ``requests`` calls against
    the ``listugcposts`` endpoint and parsed by ``GoogleMapsAPIInterceptor``.
    """

    _API_URL = 'https://www.google.com/maps/rpc/listugcposts'
    # Fixed part of the ``pb`` parameter that follows the page size and the
    # optional continuation token (opaque to this code).
    _PB_TAIL = (
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )
    # Used when the browser's own user agent cannot be read.
    _FALLBACK_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'

    def __init__(self, url: str):
        """Store the place URL and prepare an empty HTTP session.

        Args:
            url: Google Maps place URL to scrape.
        """
        self.url = url
        self.session = requests.Session()
        self.place_id = None
        # Only the parsing half of the interceptor is used, so no driver.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    @staticmethod
    def _extract_place_id(url: str) -> Optional[str]:
        """Return the value following the first ``!1s`` marker in *url*, or None."""
        _, marker, tail = url.partition('!1s')
        if not marker:
            return None
        return tail.split('!')[0] or None

    @classmethod
    def _build_pb(cls, place_id, continuation_token: Optional[str] = None) -> str:
        """Build the ``pb`` query value for a first or a follow-up page."""
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}{cls._PB_TAIL}"

    @staticmethod
    def _reviews_per_second(count: int, elapsed: float) -> float:
        """Return count/elapsed, or 0.0 when no measurable time has passed."""
        return count / elapsed if elapsed > 0 else 0.0

    def bootstrap_session(self) -> bool:
        """Quickly establish a session using a browser, then close it.

        Copies the browser's cookies and user agent into ``self.session`` and
        extracts the place ID from the browser's current URL.

        Returns:
            True when the session is ready for API calls. False when the
            browser step failed or no place ID is known, because every API
            request needs one.
        """
        log.info("Bootstrapping session with minimal browser usage...")

        try:
            with SB(uc=True, headless=False) as sb:
                log.info("Opening Google Maps...")
                sb.open(self.url)
                sb.sleep(2)

                # The cookie dialog is optional; a missing button is fine.
                try:
                    sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
                except Exception as e:
                    log.debug(f"Cookie dialog not dismissed: {e}")

                # Open the reviews tab (best effort).
                try:
                    sb.click('.LRkQ2', timeout=5)
                    log.info("✓ Opened reviews tab")
                    sb.sleep(2)
                except Exception:
                    log.warning("Could not click reviews tab")

                # Wait a bit to ensure the page is loaded.
                sb.sleep(1)

                # Keep a previously set place ID if the URL does not carry one.
                found_place_id = self._extract_place_id(sb.get_current_url())
                if found_place_id:
                    self.place_id = found_place_id
                    log.info(f"✓ Extracted place ID: {self.place_id}")

                if not self.place_id:
                    log.error("Could not extract place ID from the page URL")
                    return False

                # Read cookies while the browser is still active.
                try:
                    browser_cookies = sb.driver.get_cookies()
                    log.debug(f"Got {len(browser_cookies)} cookies")
                except Exception as e:
                    log.warning(f"Could not get cookies: {e}")
                    browser_cookies = []

                # Read the user agent while the browser is still active.
                try:
                    user_agent = sb.execute_script("return navigator.userAgent")
                    log.debug(f"User agent: {user_agent[:50]}...")
                except Exception as e:
                    log.warning(f"Could not get user agent: {e}")
                    user_agent = self._FALLBACK_USER_AGENT

                # Copy cookies one by one so a single bad cookie is skipped.
                for cookie in browser_cookies:
                    try:
                        self.session.cookies.set(
                            name=cookie['name'],
                            value=cookie['value'],
                            domain=cookie.get('domain', '.google.com'),
                            path=cookie.get('path', '/')
                        )
                    except Exception as e:
                        log.debug(f"Could not set cookie {cookie.get('name')}: {e}")

                self.session.headers.update({
                    'User-Agent': user_agent,
                    'Accept': '*/*',
                    'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
                    'Referer': 'https://www.google.com/maps/',
                    'Origin': 'https://www.google.com',
                    'X-Requested-With': 'XMLHttpRequest',
                })

                log.info("✅ Session bootstrapped!")
                log.info(f" Cookies: {len(browser_cookies)}")
                log.info(f" Place ID: {self.place_id}")

                # Let the browser stay open for a moment before it is closed.
                sb.sleep(1)

            return True

        except Exception as e:
            # log.exception records the traceback through the logger.
            log.exception(f"Bootstrap failed: {e}")
            return False

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
        """Fetch a page of reviews via API.

        Args:
            continuation_token: Token returned by the previous page, or None
                for the first page.

        Returns:
            ``(reviews, next_token)``. On a non-200 status or any error the
            result is ``([], None)``; ``next_token`` is None when the response
            carries no string at index 1.
        """
        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': self._build_pb(self.place_id, continuation_token)
        }

        try:
            response = self.session.get(self._API_URL, params=params, timeout=10)

            if response.status_code != 200:
                log.error(f"API error {response.status_code}")
                log.error(f"Response: {response.text[:300]}")
                return [], None

            # Strip the ")]}'" prefix that precedes the JSON payload.
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            # The continuation token, when present, is the string at index 1.
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            return reviews, next_token

        except Exception as e:
            log.error(f"Request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 100) -> List[dict]:
        """Main scraping method.

        Bootstraps the session, then pages through the API until a page is
        empty, no continuation token is returned, or ``max_pages`` is reached.

        Args:
            max_pages: Upper bound on the number of pages requested.

        Returns:
            List of unique review dicts; empty when bootstrapping fails.
        """
        if not self.bootstrap_session():
            return []

        log.info("\n" + "="*60)
        log.info("STARTING FAST API SCRAPING")
        log.info("="*60 + "\n")

        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        token = None
        page = 0

        while page < max_pages:
            page += 1

            log.info(f"Fetching page {page}...")
            reviews, token = self.fetch_reviews_page(token)

            if not reviews:
                log.info("No more reviews")
                break

            # De-duplicate; reviews without an ID fall back to author + date.
            for review in reviews:
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid in seen_ids:
                    continue
                seen_ids.add(rid)
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                })

            log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                break

            time.sleep(0.2)  # Small delay between pages

        elapsed = time.time() - start_time
        # Guarded: elapsed can be 0.0 when no page was requested.
        speed = self._reviews_per_second(len(all_reviews), elapsed)

        log.info("\n" + "="*60)
        log.info("✅ FAST API SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Reviews: {len(all_reviews)}")
        log.info(f"Pages: {page}")
        log.info(f"Time: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/sec")
        log.info("="*60 + "\n")

        return all_reviews
|
||||
|
||||
|
||||
def main():
    """Scrape a sample place with FastAPIScraper and save the reviews as JSON."""
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

    collected = FastAPIScraper(place_url).scrape_all(max_pages=50)

    # The output file is written even when nothing was collected.
    with open('fast_api_reviews.json', 'w', encoding='utf-8') as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)

    log.info("Saved to fast_api_reviews.json")


if __name__ == '__main__':
    main()
|
||||
@@ -1,156 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find the ACTUAL selector for reviews by looking for elements with review structure.
|
||||
"""
|
||||
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
|
||||
# Search URL of the place whose review markup is inspected.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent: click the "accept all" button if one is shown. Only
    # ordinary errors are ignored so that Ctrl+C still stops the script.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"Consent dialog not handled: {exc}")

    # Click the reviews tab (matched by visible text or aria-label).
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll the reviews pane a few times so that reviews get loaded.
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(3):
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(1)
    except Exception as exc:
        print(f"Could not scroll the reviews pane: {exc}")

    # Use JavaScript to find ALL elements that look like reviews.
    print("\n" + "="*80)
    print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:")
    print("="*80)

    review_info = driver.execute_script("""
        // Find all elements that have BOTH a rating AND substantial text
        const allDivs = Array.from(document.querySelectorAll('div'));

        const reviews = [];

        for (let div of allDivs) {
            // Must have a rating (star aria-label)
            const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]');
            if (!ratingElem) continue;

            // Must have decent text content (>50 chars to avoid buttons)
            if (div.textContent.length < 50) continue;

            // Get the classes and attributes
            const info = {
                classes: div.className,
                has_author: !!div.querySelector('button, [aria-label*="photo" i]'),
                has_avatar: !!div.querySelector('img'),
                has_date: !!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
                text_length: div.textContent.length,
                sample_text: div.textContent.substring(0, 150),
                tag_name: div.tagName,
                jslog: div.getAttribute('jslog'),
                data_review_id: div.getAttribute('data-review-id'),
                jsaction: div.getAttribute('jsaction')
            };

            reviews.push(info);
        }

        return {
            total_found: reviews.length,
            first_5: reviews.slice(0, 5)
        };
    """)

    print(f"\nFound {review_info['total_found']} elements with review structure")
    print("\nFirst 5 review-like elements:")
    for i, rev in enumerate(review_info['first_5'], 1):
        print(f"\n Review {i}:")
        print(f" Classes: {rev['classes']}")
        print(f" Has author: {rev['has_author']}")
        print(f" Has avatar: {rev['has_avatar']}")
        print(f" Has date: {rev['has_date']}")
        print(f" Text length: {rev['text_length']}")
        print(f" jslog: {rev['jslog']}")
        print(f" data-review-id: {rev['data_review_id']}")
        print(f" Sample: {rev['sample_text'][:80]}...")

    # Try to find a common class among review elements.
    if review_info['total_found'] > 0:
        print("\n" + "="*80)
        print("FINDING COMMON SELECTOR:")
        print("="*80)

        common_selector = driver.execute_script("""
            // Find common classes among review elements
            const reviews = [];
            const allDivs = Array.from(document.querySelectorAll('div'));

            for (let div of allDivs) {
                const ratingElem = div.querySelector('[aria-label*="star" i]');
                if (ratingElem && div.textContent.length > 50) {
                    reviews.push(div);
                }
            }

            if (reviews.length === 0) return null;

            // Get classes from first review
            const firstClasses = reviews[0].className.split(' ').filter(c => c.length > 0);

            // Find classes that appear in ALL reviews
            const commonClasses = firstClasses.filter(cls => {
                return reviews.every(rev => rev.classList.contains(cls));
            });

            return {
                total_reviews: reviews.length,
                common_classes: commonClasses,
                suggested_selector: commonClasses.length > 0 ? 'div.' + commonClasses.join('.') : null,
                first_review_classes: reviews[0].className
            };
        """)

        if common_selector:
            print(f"Total review elements: {common_selector['total_reviews']}")
            print(f"Common classes: {common_selector['common_classes']}")
            print(f"Suggested selector: {common_selector['suggested_selector']}")
            print(f"First review full classes: {common_selector['first_review_classes']}")

            # Test the suggested selector. It is passed as a script argument
            # rather than spliced into the JavaScript source, so quotes or
            # backslashes in a class name cannot break the script.
            suggested = common_selector['suggested_selector']
            if suggested:
                test_count = driver.execute_script(
                    "return document.querySelectorAll(arguments[0]).length;",
                    suggested,
                )
                print(f"\nTesting suggested selector: Found {test_count} elements")

    print("\n" + "="*80)
    print("Browser staying open for manual inspection (60s)...")
    print("="*80)
    time.sleep(60)

finally:
    # Always close the browser, even when a step above failed.
    driver.quit()
|
||||
@@ -1,305 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Header Capture Scraper - Capture COMPLETE request from browser (headers + cookies).
|
||||
|
||||
This captures the exact request the browser makes, including ALL headers and cookies,
|
||||
then replays it for fast API scraping.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Optional, Tuple
|
||||
import requests
|
||||
from seleniumbase import SB
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HeaderCaptureScraper:
    """Capture complete request, then replay for fast scraping.

    A browser session is used to observe one ``listugcposts`` request and to
    collect all cookies; the remaining pages are fetched with ``requests``.
    """

    _API_URL = 'https://www.google.com/maps/rpc/listugcposts'
    # Fixed part of the ``pb`` parameter that follows the page size and the
    # optional continuation token (opaque to this code).
    _PB_TAIL = (
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )
    # JavaScript hook that records the URL of the latest listugcposts call.
    # XMLHttpRequest is hooked through its prototype so the constructor, its
    # static constants and `instanceof` checks keep working for the page.
    _CAPTURE_SCRIPT = """
        window.__capturedRequest = null;

        const originalFetch = window.fetch;
        window.fetch = function(...args) {
            const target = args[0];
            // fetch() accepts a string, a URL or a Request object.
            const url = (target && typeof target === 'object' && 'url' in target)
                ? target.url
                : String(target);
            if (url.includes('listugcposts')) {
                console.log('[CAPTURE] Intercepted request to:', url);
                window.__capturedRequest = {
                    url: url,
                    method: 'GET'
                };
            }
            return originalFetch.apply(this, args);
        };

        const originalOpen = XMLHttpRequest.prototype.open;
        XMLHttpRequest.prototype.open = function(method, url, ...rest) {
            const urlText = String(url);
            if (urlText.includes('listugcposts')) {
                console.log('[CAPTURE] Intercepted XHR:', urlText);
                window.__capturedRequest = {
                    url: urlText,
                    method: method
                };
            }
            return originalOpen.call(this, method, url, ...rest);
        };

        console.log('[CAPTURE] Request interceptor ready');
    """

    def __init__(self, url: str, headless: bool = False):
        """Store settings and prepare an empty HTTP session.

        Args:
            url: Google Maps place URL to scrape.
            headless: Whether the capture browser runs headless.
        """
        self.url = url
        self.headless = headless
        self.captured_request = None
        self.place_id = None
        self.session = requests.Session()
        # Only the parsing half of the interceptor is used, so no driver.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    @staticmethod
    def _place_id_from_request_url(request_url: str) -> Optional[str]:
        """Return the value after ``!1s`` in the URL's ``pb`` parameter, or None.

        The query string is decoded first, so a percent-encoded ``pb``
        (``%211s...``) is recognised as well.
        """
        import urllib.parse

        query = urllib.parse.urlparse(request_url).query
        pb = urllib.parse.parse_qs(query).get('pb', [''])[0]
        _, marker, tail = pb.partition('!1s')
        if not marker:
            return None
        return tail.split('!')[0] or None

    @classmethod
    def _build_pb(cls, place_id, continuation_token: Optional[str] = None) -> str:
        """Build the ``pb`` query value for a first or a follow-up page."""
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}{cls._PB_TAIL}"

    @staticmethod
    def _reviews_per_second(count: int, elapsed: float) -> float:
        """Return count/elapsed, or 0.0 when no measurable time has passed."""
        return count / elapsed if elapsed > 0 else 0.0

    def capture_request(self) -> bool:
        """Capture a complete API request (URL, headers, cookies) from browser.

        Returns:
            True when a request was captured, a place ID is known and the
            session carries the browser's cookies and headers; False otherwise.
        """
        log.info("="*60)
        log.info("Capturing request from browser...")
        log.info("="*60)

        sb_context = None

        try:
            log.info("Starting browser...")
            # The context manager is entered by hand; it is closed in the
            # `finally` clause below without exception details.
            sb_context = SB(uc=True, headless=self.headless)
            sb = sb_context.__enter__()

            sb.open(self.url)
            time.sleep(2)

            # The cookie dialog is optional; a missing button is fine.
            try:
                sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
            except Exception as e:
                log.debug(f"Cookie dialog not dismissed: {e}")

            # Open the reviews tab (best effort).
            try:
                sb.click('.LRkQ2', timeout=5)
                log.info("✓ Opened reviews")
                time.sleep(2)
            except Exception as e:
                log.debug(f"Could not open reviews tab: {e}")

            # Enable CDP network monitoring.
            sb.driver.execute_cdp_cmd('Network.enable', {})
            log.info("✓ Network monitoring enabled")

            # First scroll, before the hook is installed.
            log.info("Scrolling to trigger API request...")
            sb.execute_script("window.scrollBy(0, 800)")
            time.sleep(3)

            sb.execute_script(self._CAPTURE_SCRIPT)
            log.info("✓ Request interceptor injected")

            # Scroll again until the hook has seen a listugcposts request.
            log.info("Scrolling to capture request...")
            for _ in range(3):
                sb.execute_script("window.scrollBy(0, 600)")
                time.sleep(2)

                captured = sb.execute_script("return window.__capturedRequest")
                if captured:
                    log.info("✓ Captured request URL!")
                    self.captured_request = captured
                    break

            if not self.captured_request:
                log.error("Failed to capture request")
                return False

            # Keep a previously set place ID if the URL does not carry one.
            found_place_id = self._place_id_from_request_url(self.captured_request['url'])
            if found_place_id:
                self.place_id = found_place_id

            if not self.place_id:
                # Every API request embeds the place ID, so stop here.
                log.error("Could not extract place ID from captured request")
                return False

            # Capture ALL cookies via CDP and copy them into the session.
            cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
            all_cookies = cdp_cookies.get('cookies', [])
            for cookie in all_cookies:
                self.session.cookies.set(
                    name=cookie['name'],
                    value=cookie['value'],
                    domain=cookie.get('domain', '.google.com'),
                    path=cookie.get('path', '/')
                )

            user_agent = sb.execute_script("return navigator.userAgent")

            # Set headers to match the browser.
            self.session.headers.update({
                'User-Agent': user_agent,
                'Accept': '*/*',
                'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
                'Referer': 'https://www.google.com/maps/',
                'Origin': 'https://www.google.com',
                'X-Requested-With': 'XMLHttpRequest',
            })

            log.info("\n✅ Request captured successfully!")
            log.info(f" Place ID: {self.place_id}")
            log.info(f" Cookies: {len(all_cookies)}")
            log.info(f" Cookie names: {', '.join([c['name'] for c in all_cookies[:10]])}")

            return True

        except Exception as e:
            # log.exception records the traceback through the logger.
            log.exception(f"Capture failed: {e}")
            return False

        finally:
            if sb_context is not None:
                try:
                    log.info("Closing browser...")
                    sb_context.__exit__(None, None, None)
                    log.info("✓ Browser closed\n")
                except Exception as e:
                    # Closing is best effort, but the reason is recorded.
                    log.debug(f"Browser did not close cleanly: {e}")

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
        """Fetch reviews using captured session.

        Args:
            continuation_token: Token returned by the previous page, or None
                for the first page.

        Returns:
            ``(reviews, next_token)``. On a non-200 status or any error the
            result is ``([], None)``; ``next_token`` is None when the response
            carries no string at index 1.
        """
        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': self._build_pb(self.place_id, continuation_token)
        }

        try:
            response = self.session.get(self._API_URL, params=params, timeout=10)

            if response.status_code != 200:
                log.error(f"API error {response.status_code}: {response.text[:200]}")
                return [], None

            # Strip the ")]}'" prefix that precedes the JSON payload.
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            # The continuation token, when present, is the string at index 1.
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            return reviews, next_token

        except Exception as e:
            log.error(f"Request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 50) -> List[dict]:
        """Main scraping method.

        Captures a session, then pages through the API until a page is empty,
        no continuation token is returned, or ``max_pages`` is reached.

        Args:
            max_pages: Upper bound on the number of pages requested.

        Returns:
            List of unique review dicts; empty when the capture step fails.
        """
        if not self.capture_request():
            return []

        log.info("="*60)
        log.info("Fast API scraping...")
        log.info("="*60)

        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        token = None
        page = 0

        while page < max_pages:
            page += 1
            log.info(f"Page {page}...")

            reviews, token = self.fetch_reviews_page(token)

            if not reviews:
                break

            # De-duplicate; reviews without an ID fall back to author + date.
            for review in reviews:
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid in seen_ids:
                    continue
                seen_ids.add(rid)
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                })

            log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                break

            time.sleep(0.2)

        elapsed = time.time() - start_time
        # Guarded: elapsed can be 0.0 when no page was requested.
        speed = self._reviews_per_second(len(all_reviews), elapsed)

        log.info(f"\n{'='*60}")
        log.info("✅ COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Reviews: {len(all_reviews)}")
        log.info(f"Time: {elapsed:.2f}s")
        log.info(f"Speed: {speed:.1f} reviews/sec")
        log.info(f"{'='*60}\n")

        return all_reviews
|
||||
|
||||
|
||||
def main():
    """Scrape a sample place with HeaderCaptureScraper and save any reviews found."""
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

    collected = HeaderCaptureScraper(place_url, headless=False).scrape_all()

    # Nothing is written when the scrape produced no reviews.
    if not collected:
        return

    with open('header_capture_reviews.json', 'w', encoding='utf-8') as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)
    log.info("Saved to header_capture_reviews.json")


if __name__ == '__main__':
    main()
|
||||
@@ -1,352 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Hybrid API scraper - Capture session from browser, then use direct API calls.
|
||||
This combines the best of both worlds:
|
||||
1. Browser establishes authentic session with Google
|
||||
2. We capture ALL headers from real XHR requests
|
||||
3. Replay those headers in direct API calls
|
||||
4. No scrolling needed - just fast API pagination
|
||||
|
||||
Expected speed: 10-25x faster than traditional browser scrolling.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Optional, Tuple, Dict
|
||||
import requests
|
||||
from seleniumbase import SB
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HybridAPIScraper:
    """
    Capture session from browser, then scrape via direct API calls.

    Workflow:
      1. Open the place page in a real browser and open the reviews tab.
      2. Hook ``XMLHttpRequest`` in the page and scroll until one
         ``listugcposts`` request (URL + request headers) is recorded.
      3. Copy the browser cookies into a ``requests`` session.
      4. Page through the ``listugcposts`` endpoint directly with the
         captured headers and cookies.
    """

    # Endpoint queried directly once the session has been captured.
    _API_URL = 'https://www.google.com/maps/rpc/listugcposts'

    # In-page hook that records listugcposts XHR requests.
    # NOTE: XMLHttpRequest.setRequestHeader() may only be called *after*
    # open(), so the record pushed in open() keeps a live reference to the
    # `headers` object instead of a copy; by the time Python reads
    # window.__capturedRequests the headers have been filled in.
    _CAPTURE_SCRIPT = """
        window.__capturedRequests = [];

        // Capture fetch
        const originalFetch = window.fetch;
        window.fetch = function(...args) {
            const url = args[0].toString();
            if (url.includes('listugcposts')) {
                console.log('[CAPTURE] Fetch to:', url);
                // Can't easily get headers from fetch without cloning
            }
            return originalFetch.apply(this, args);
        };

        // Capture XHR (more reliable for headers)
        const originalXHR = window.XMLHttpRequest;
        window.XMLHttpRequest = function() {
            const xhr = new originalXHR();
            const originalOpen = xhr.open;
            const originalSetRequestHeader = xhr.setRequestHeader;
            const headers = {};

            xhr.setRequestHeader = function(name, value) {
                headers[name.toLowerCase()] = value;
                return originalSetRequestHeader.apply(this, arguments);
            };

            xhr.open = function(method, url, ...rest) {
                const urlText = String(url);
                if (urlText.includes('listugcposts')) {
                    console.log('[CAPTURE] XHR to:', urlText);
                    window.__capturedRequests.push({
                        url: urlText,
                        method: method,
                        headers: headers
                    });
                }
                return originalOpen.apply(this, [method, url, ...rest]);
            };

            return xhr;
        };

        console.log('[CAPTURE] Request capture initialized');
    """

    def __init__(self, url: str, headless: bool = False):
        """
        Initialize the hybrid scraper.

        Args:
            url: Google Maps place URL
            headless: Run browser in headless mode
        """
        self.url = url
        self.headless = headless
        self.captured_headers = None
        self.place_id = None
        self.session = requests.Session()

        # Only used as a response parser here, hence no driver is passed.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    def _wait_for_api_request(self, sb) -> dict:
        """Scroll the page until a listugcposts request is recorded; return it or {}."""
        log.info("Scrolling to trigger API request...")
        for _ in range(5):
            sb.execute_script("window.scrollBy(0, 800)")
            sb.sleep(1.5)

            captured = sb.execute_script("return window.__capturedRequests || []")
            if captured:
                log.info(f"✓ Captured {len(captured)} API request(s)!")
                return captured[0]
        return {}

    def _extract_place_id(self, request_url: str) -> Optional[str]:
        """Return the place ID from ``self.url`` or from the request's ``pb`` parameter."""
        if 'place_id:' in self.url:
            return self.url.split('place_id:')[1].split('&')[0].split('/')[0]

        # parse_qs percent-decodes the value, so this also works when the
        # browser sent the "!" separators as %21.
        import urllib.parse
        query = urllib.parse.urlparse(request_url).query
        pb = urllib.parse.parse_qs(query).get('pb', [''])[0]
        if '!1s' in pb:
            return pb.split('!1s')[1].split('!')[0]
        return None

    def capture_session_from_browser(self) -> bool:
        """
        Start a browser session, capture headers from actual API requests.

        Returns:
            True if session captured successfully, False if no API request
            was observed, the place ID could not be determined, or the
            browser raised an error.
        """
        log.info("Starting browser to capture session headers...")

        try:
            with SB(uc=True, headless=self.headless) as sb:
                # Navigate to the place
                log.info(f"Navigating to: {self.url[:80]}...")
                sb.open(self.url)
                sb.sleep(3)

                # Dismiss cookie consent (best effort: the dialog is not always shown)
                try:
                    sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=5)
                    log.info("Cookie dialog dismissed")
                except Exception:
                    log.debug("No cookie dialog dismissed")

                # Click reviews tab
                log.info("Opening reviews...")
                try:
                    sb.click('.LRkQ2', timeout=5)
                    sb.sleep(3)
                except Exception:
                    log.warning("Could not click reviews tab")

                # Enable Chrome DevTools Protocol for network monitoring
                log.info("Enabling network interception...")
                sb.driver.execute_cdp_cmd('Network.enable', {})

                # Install the in-page hook, then provoke a paginated request
                sb.execute_script(self._CAPTURE_SCRIPT)
                captured_request = self._wait_for_api_request(sb)

                if not captured_request:
                    log.error("Failed to capture API request")
                    return False

                self.place_id = self._extract_place_id(captured_request['url'])
                if not self.place_id:
                    # Without a place ID every later API call would be malformed.
                    log.error("Could not determine place ID from URL or captured request")
                    return False

                # Store captured headers
                self.captured_headers = captured_request['headers']

                # Also get cookies from browser
                cookies = sb.driver.get_cookies()
                for cookie in cookies:
                    self.session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))

                log.info(f"\n{'='*60}")
                log.info("✅ Session captured successfully!")
                log.info(f"{'='*60}")
                log.info(f"Place ID: {self.place_id}")
                log.info(f"Headers captured: {len(self.captured_headers)}")
                log.info(f"Cookies captured: {len(cookies)}")
                log.info(f"{'='*60}\n")

                # Print sample headers for debugging (truncated to avoid dumping secrets)
                log.debug("Sample headers:")
                for key in ['cookie', 'x-goog-api-key', 'authorization', 'user-agent']:
                    if key in self.captured_headers:
                        value = self.captured_headers[key]
                        preview = value[:50] + '...' if len(value) > 50 else value
                        log.debug(f" {key}: {preview}")

                return True

        except Exception as e:
            # Top-level boundary: report the failure (with traceback) and signal it.
            log.error(f"Failed to capture session: {e}", exc_info=True)
            return False

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> "Tuple[List[InterceptedReview], Optional[str]]":
        """
        Fetch reviews page using captured session.

        Args:
            continuation_token: Pagination token

        Returns:
            Tuple of (reviews, next_token); ``([], None)`` on any HTTP,
            network or parsing error.
        """
        # Build pb parameter; the only difference between the first and a
        # follow-up page is the "!2s<token>" segment.
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        pb = (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )

        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': pb
        }

        try:
            log.info(f"Fetching page (token: {'initial' if not continuation_token else 'paginated'})...")

            # Make request with captured headers
            response = self.session.get(self._API_URL, params=params, headers=self.captured_headers, timeout=10)

            log.debug(f"Response status: {response.status_code}")

            if response.status_code != 200:
                log.error(f"API error {response.status_code}: {response.text[:500]}")
                return [], None

            # Parse response; strip the ")]}'" prefix when present
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            data = json.loads(body)

            # Extract reviews
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Get next token
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            log.info(f"✓ Extracted {len(reviews)} reviews")

            return reviews, next_token

        except Exception as e:
            log.error(f"API request failed: {e}")
            return [], None

    def scrape_all_reviews(self, max_pages: int = 100, delay: float = 0.3) -> List[dict]:
        """
        Scrape all reviews using hybrid approach.

        Args:
            max_pages: Maximum pages to fetch
            delay: Delay between API calls

        Returns:
            List of review dictionaries (deduplicated by review ID, falling
            back to author + date text when the ID is empty)
        """
        # Step 1: Capture session from browser
        if not self.capture_session_from_browser():
            log.error("Failed to capture session - aborting")
            return []

        # Step 2: Fetch all reviews via API
        log.info("\nStarting API-based scraping (no browser needed!)...")
        start_time = time.time()

        all_reviews = []
        seen_ids = set()
        continuation_token = None
        page = 0

        while page < max_pages:
            page += 1

            reviews, continuation_token = self.fetch_reviews_page(continuation_token)

            if not reviews:
                log.info("No more reviews found")
                break

            # Deduplicate
            for review in reviews:
                review_id = review.review_id or f"{review.author}_{review.date_text}"
                if review_id in seen_ids:
                    continue
                seen_ids.add(review_id)
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                })

            log.info(f"Page {page}: {len(all_reviews)} total unique reviews")

            if not continuation_token:
                log.info("No continuation token - finished")
                break

            if delay > 0:
                time.sleep(delay)

        elapsed = time.time() - start_time
        # A coarse clock can report 0 elapsed seconds for a very fast run.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info(f"\n{'='*60}")
        log.info("✅ API SCRAPING COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"API calls: {page}")
        log.info(f"Time (API only): {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info(f"{'='*60}\n")

        return all_reviews
|
||||
|
||||
|
||||
def main():
    """Run the hybrid scraper against a sample place and save the reviews as JSON."""
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
    output_file = 'hybrid_api_reviews.json'

    reviews = HybridAPIScraper(place_url, headless=False).scrape_all_reviews(max_pages=50, delay=0.3)

    # Persist whatever was collected (an empty list is still written out).
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(reviews, handle, indent=2, ensure_ascii=False)

    log.info(f"Saved {len(reviews)} reviews to {output_file}")

    if not reviews:
        return

    # Show the first review as a quick sanity check of the parsed fields.
    first = reviews[0]
    log.info("\nSample review:")
    log.info(f" Author: {first['author']}")
    log.info(f" Rating: {first['rating']}★")
    if first['text']:
        log.info(f" Text: {first['text'][:80]}...")
    else:
        log.info(" Text: (none)")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,157 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Check what's actually inside the reviews pane after scrolling.
|
||||
"""
|
||||
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
|
||||
# Debug script: open a Google Maps search result, switch to the reviews tab,
# scroll the reviews pane and print statistics about the DOM inside it so the
# right selector for a single review can be identified.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog (best effort; not shown in every region).
    # `except Exception` rather than a bare except so Ctrl-C still aborts the run.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception:
        pass

    # Click reviews tab
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    review_tab_found = False
    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'")
        if 'review' in text or 'review' in aria:
            print(" -> Clicking this tab!")
            # JS click avoids "element not interactable" on overlapped tabs
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(6)  # Wait longer
            review_tab_found = True
            break

    if not review_tab_found:
        print("WARNING: Reviews tab not found!")

    # Find and scroll the pane: try the specific selector first, then the generic one
    print("\nLooking for scrollable pane...")
    pane = None
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        print("Found pane: div.m6QErb.WNBkOb.XiKgde")
    except Exception:
        print("Pane not found with standard selector!")
        try:
            pane = driver.find_element('css selector', 'div.m6QErb')
            print("Found pane: div.m6QErb")
        except Exception:
            print("No pane found at all!")

    if pane:
        print("\nScrolling pane to load reviews...")
        for i in range(15):
            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
            time.sleep(0.4)
            if (i + 1) % 5 == 0:
                print(f" Scrolled {i+1} times...")

    # Now check what's in the pane (the script reports its own error when no pane exists)
    print("\n" + "="*80)
    print("ANALYZING PANE CONTENT")
    print("="*80)

    content_info = driver.execute_script("""
        const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb');
        if (!pane) return {error: 'No pane found'};

        // Get all child divs (direct and nested)
        const allDivs = Array.from(pane.querySelectorAll('div'));

        // Get all unique class names used
        const classNames = new Set();
        allDivs.forEach(div => {
            if (div.className) {
                div.className.split(' ').forEach(cls => {
                    if (cls.trim()) classNames.add(cls.trim());
                });
            }
        });

        // Find divs with ratings
        const divsWithRatings = allDivs.filter(div => {
            return !!div.querySelector('[aria-label*="star" i]');
        });

        // Find divs with author photos
        const divsWithPhotos = allDivs.filter(div => {
            return !!div.querySelector('img[src*="photo"], img[src*="avatar"]');
        });

        // Find divs with date patterns
        const divsWithDates = allDivs.filter(div => {
            return !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i);
        });

        // Find divs with ALL three
        const reviewLikeDivs = allDivs.filter(div => {
            const hasRating = !!div.querySelector('[aria-label*="star" i]');
            const hasPhoto = !!div.querySelector('img');
            const hasDate = !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|ago)/i);
            const textLen = div.textContent.length;
            return hasRating && hasPhoto && hasDate && textLen > 50 && textLen < 2000;
        });

        return {
            total_divs: allDivs.length,
            unique_classes: Array.from(classNames).sort(),
            divs_with_ratings: divsWithRatings.length,
            divs_with_photos: divsWithPhotos.length,
            divs_with_dates: divsWithDates.length,
            review_like_divs: reviewLikeDivs.length,
            review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({
                classes: d.className,
                text_length: d.textContent.length,
                sample: d.textContent.substring(0, 100)
            }))
        };
    """)

    if 'error' in content_info:
        print(f"ERROR: {content_info['error']}")
    else:
        print(f"\nTotal divs in pane: {content_info['total_divs']}")
        print(f"Divs with ratings: {content_info['divs_with_ratings']}")
        print(f"Divs with photos: {content_info['divs_with_photos']}")
        print(f"Divs with dates: {content_info['divs_with_dates']}")
        print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}")

        print("\nFirst 20 unique classes found in pane:")
        for cls in content_info['unique_classes'][:20]:
            print(f" {cls}")

        if content_info['review_like_divs'] > 0:
            print("\nFirst 5 review-like divs:")
            for i, div_info in enumerate(content_info['review_like_classes'], 1):
                print(f"\n Div {i}:")
                print(f" Classes: {div_info['classes']}")
                print(f" Text length: {div_info['text_length']}")
                print(f" Sample: {div_info['sample'][:80]}...")

    print(f"\n{'='*80}")
    print("Browser staying open for manual inspection (120 seconds)...")
    print("Look at the DevTools to see the actual review elements!")
    print(f"{'='*80}")
    time.sleep(120)

finally:
    # Always close the browser, even on error or Ctrl-C
    driver.quit()
|
||||
@@ -1,70 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Open the page and keep it open for manual inspection.
|
||||
INSTRUCTIONS:
|
||||
1. Open DevTools (F12)
|
||||
2. Click on an individual review
|
||||
3. Look at the div that contains ONE review (not the whole list)
|
||||
4. Note the class names on that div
|
||||
"""
|
||||
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
|
||||
# Debug script: open the reviews panel of a Google Maps search result and keep
# the browser open so a single review's wrapper element can be inspected by hand.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog (best effort; not shown in every region).
    # `except Exception` rather than a bare except so Ctrl-C still aborts the run.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception:
        pass

    # Click reviews tab
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load a few reviews (skipped silently when the pane is missing)
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(5):
            driver.execute_script("arguments[0].scrollBy(0, 300);", pane)
            time.sleep(0.5)
    except Exception:
        pass

    print("\n" + "="*80)
    print("MANUAL INSPECTION TIME!")
    print("="*80)
    print("\n1. The browser is now showing the reviews page")
    print("2. Open DevTools (F12 or right-click > Inspect)")
    print("3. Click the 'Select element' tool (top-left of DevTools)")
    print("4. Hover over an INDIVIDUAL review (not the whole panel)")
    print("5. Click on it to select it in the inspector")
    print("6. Look at the <div> that wraps ONE SINGLE review")
    print("7. Note the 'class' attribute value")
    print("\n8. The class might look like: class=\"MyWpvb fontBodyMedium\" or similar")
    print("\n9. Write down the full class name(s) - we'll use this as the selector!")
    print("\n" + "="*80)
    print("Browser will stay open for 5 minutes...")
    print("="*80)

    time.sleep(300)  # 5 minutes

finally:
    # Always close the browser, even on error or Ctrl-C
    driver.quit()
|
||||
@@ -1,923 +0,0 @@
|
||||
"""
|
||||
API Interceptor for Google Maps Reviews.
|
||||
Uses Chrome DevTools Protocol (CDP) to intercept network requests and capture
|
||||
Google's internal API responses for faster, more reliable data extraction.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
log = logging.getLogger("api_interceptor")
|
||||
|
||||
|
||||
@dataclass
class InterceptedReview:
    """One review parsed from an intercepted API response.

    Every field has a default, so an instance can be created empty and
    filled in as the individual values are located in the response.
    """

    # Identity and author
    review_id: str = ""
    author: str = ""

    # Review content
    rating: float = 0.0
    text: str = ""
    date_text: str = ""
    timestamp: int = 0
    likes: int = 0
    photos: List[str] = field(default_factory=list)  # fresh list per instance

    # Author profile links
    profile_url: str = ""
    avatar_url: str = ""

    # Owner reply, when present
    owner_response: str = ""
    owner_response_date: str = ""

    # Language code of the review text
    lang: str = ""
|
||||
|
||||
|
||||
class GoogleMapsAPIInterceptor:
|
||||
"""
|
||||
Intercepts Google Maps internal API calls to capture review data directly.
|
||||
|
||||
Google Maps uses several internal endpoints for reviews:
|
||||
- /maps/preview/review/listentitiesreviews - Main reviews endpoint
|
||||
- /maps/rpc/placereview - Alternative review endpoint
|
||||
- /maps/preview/reviewsdata - Review data endpoint
|
||||
|
||||
The responses are often in a custom protobuf-like JSON format that needs parsing.
|
||||
"""
|
||||
|
||||
# Patterns for review-related API endpoints
|
||||
REVIEW_API_PATTERNS = [
|
||||
r'maps/preview/review',
|
||||
r'maps/rpc/placereview',
|
||||
r'maps/preview/reviewsdata',
|
||||
r'maps/preview/place',
|
||||
r'maps/api/place',
|
||||
r'/locationhistory/preview',
|
||||
r'batchexecute.*review',
|
||||
]
|
||||
|
||||
def __init__(self, driver):
    """Bind the interceptor to a Selenium driver and reset all capture state."""
    self.driver = driver

    # Interception state
    self._listening = False
    self._lock = threading.Lock()
    self._response_callback: Optional[Callable] = None

    # Captured data
    self.request_map: Dict[str, Dict] = {}  # Map request IDs to URLs
    self.captured_responses: List[Dict[str, Any]] = []
    self.captured_reviews: List[InterceptedReview] = []
|
||||
|
||||
def setup_interception(self):
    """Enable network monitoring via CDP.

    Only ``Network.enable`` is issued. The previous implementation also
    called ``Network.setRequestInterception``, which pauses every matching
    request until it is explicitly continued in response to a
    ``Network.requestIntercepted`` event; no such handler is installed
    here, so matching review/batchexecute requests would stall.

    Returns:
        True when monitoring was enabled; otherwise the result of the
        ``_setup_performance_logging()`` fallback.
    """
    try:
        # Enable network domain
        self.driver.execute_cdp_cmd('Network.enable', {})

        self._listening = True
        log.info("API interception enabled via CDP")
        return True

    except Exception as e:
        log.warning(f"Could not enable CDP interception: {e}")
        # Try alternative approach
        return self._setup_performance_logging()
|
||||
|
||||
def _setup_performance_logging(self):
    """Fallback: enable the CDP Network domain with explicit buffer limits.

    Returns:
        True when the command succeeded (and marks the interceptor as
        listening), False when it raised.
    """
    buffer_limits = {
        'maxTotalBufferSize': 10000000,
        'maxResourceBufferSize': 5000000
    }
    try:
        self.driver.execute_cdp_cmd('Network.enable', buffer_limits)
    except Exception as e:
        log.error(f"Failed to setup performance logging: {e}")
        return False

    self._listening = True
    log.info("API interception enabled via performance logging")
    return True
|
||||
|
||||
def capture_network_responses(self, duration: float = 5.0):
    """
    Poll the driver's performance log for ``duration`` seconds and collect
    the bodies of responses whose URL matches the review API patterns.

    Intended to be called while the page is scrolling/loading more reviews.

    Returns:
        The list of ``{'url', 'body', 'timestamp'}`` dicts captured during
        this call (also appended to ``self.captured_responses``); an empty
        list when interception has not been set up.
    """
    if not self._listening:
        log.warning("Interception not set up, call setup_interception() first")
        return []

    collected = []
    started = time.time()

    while time.time() - started < duration:
        try:
            # Performance log entries wrap the CDP network events
            entries = self.driver.get_log('performance')

            for entry in entries:
                try:
                    event = json.loads(entry['message']).get('message', {})
                    event_name = event.get('method', '')
                    payload = event.get('params', {})

                    if event_name == 'Network.responseReceived':
                        # Remember matching responses by request ID
                        resp = payload.get('response', {})
                        resp_url = resp.get('url', '')
                        if self._is_review_api(resp_url):
                            self.request_map[payload.get('requestId')] = {
                                'url': resp_url,
                                'status': resp.get('status'),
                                'headers': resp.get('headers', {})
                            }

                    elif event_name == 'Network.loadingFinished':
                        # The body is only fetched once loading has finished
                        finished_id = payload.get('requestId')
                        if finished_id in self.request_map:
                            content = self._get_response_body(finished_id)
                            if content:
                                collected.append({
                                    'url': self.request_map[finished_id]['url'],
                                    'body': content,
                                    'timestamp': time.time()
                                })

                except Exception as parse_error:
                    log.debug(f"Error parsing log entry: {parse_error}")
                    continue

        except Exception as e:
            # Performance logs might not be available
            log.debug(f"Could not get performance logs: {e}")
            break

        time.sleep(0.1)

    with self._lock:
        self.captured_responses.extend(collected)

    return collected
|
||||
|
||||
def get_response_bodies_cdp(self):
    """Drain and return the responses queued in the page by the JS interceptor.

    Reads ``window.__interceptedResponses`` (populated by the injected
    interceptor) and resets it to an empty array.

    Returns:
        List of response dicts; empty when nothing was queued or the
        script could not be executed.
    """
    responses = []

    # The script must `return` the IIFE's value: Selenium runs the text as
    # a function body, so without it the result is discarded while the
    # queue is still cleared.
    intercept_script = """
        return (function() {
            if (window.__interceptedResponses) {
                var responses = window.__interceptedResponses;
                window.__interceptedResponses = [];
                return responses;
            }
            return [];
        })();
    """

    try:
        captured = self.driver.execute_script(intercept_script)
        if captured:
            responses.extend(captured)

    except Exception as e:
        log.debug(f"CDP response capture error: {e}")

    return responses
|
||||
|
||||
def inject_response_interceptor(self):
    """
    Inject JavaScript that wraps ``fetch`` and ``XMLHttpRequest`` in the page
    and queues matching response bodies in ``window.__interceptedResponses``.

    The script is idempotent per page: a second injection is skipped via
    ``window.__reviewInterceptorInjected``. Request URLs are normalised to
    strings before any string method is used, so a ``URL`` object passed to
    ``open()`` or a ``Request`` passed to ``fetch()`` cannot make the
    wrapper throw and break the page's own request.

    Returns:
        True when the script was executed, False when execution raised.
    """
    intercept_script = """
    (function() {
        // Skip if already injected
        if (window.__reviewInterceptorInjected) {
            console.log('[API Interceptor] Already injected, skipping');
            return;
        }
        window.__reviewInterceptorInjected = true;
        window.__interceptedResponses = [];
        window.__interceptorStats = {
            totalFetch: 0,
            totalXHR: 0,
            capturedFetch: 0,
            capturedXHR: 0,
            lastCapture: null
        };

        console.log('[API Interceptor] Initializing...');

        // Store original fetch
        const originalFetch = window.fetch;

        // Override fetch
        window.fetch = async function(...args) {
            window.__interceptorStats.totalFetch++;
            // fetch() accepts a string, a URL or a Request object
            const target = args[0];
            const url = (typeof Request !== 'undefined' && target instanceof Request)
                ? target.url
                : String(target);

            // Log ALL fetch requests for debugging
            console.debug('[API Interceptor] FETCH:', url.substring(0, 150));

            const response = await originalFetch.apply(this, args);

            // Check if this is a review-related API call
            if (url.includes('review') || url.includes('batchexecute') ||
                url.includes('place') || url.includes('maps') ||
                url.includes('listugcposts') || url.includes('getreviews')) {
                try {
                    const clone = response.clone();
                    const text = await clone.text();

                    console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);

                    window.__interceptedResponses.push({
                        url: url,
                        body: text,
                        timestamp: Date.now(),
                        type: 'fetch',
                        size: text.length
                    });

                    window.__interceptorStats.capturedFetch++;
                    window.__interceptorStats.lastCapture = new Date().toISOString();

                    // Keep only last 100 responses to avoid memory issues
                    if (window.__interceptedResponses.length > 100) {
                        window.__interceptedResponses = window.__interceptedResponses.slice(-50);
                    }
                } catch (e) {
                    console.error('[API Interceptor] Response capture error:', e);
                }
            }

            return response;
        };

        // Store original XMLHttpRequest
        const originalXHR = window.XMLHttpRequest;

        // Create intercepting XHR
        window.XMLHttpRequest = function() {
            const xhr = new originalXHR();
            const originalOpen = xhr.open;
            let requestUrl = '';

            xhr.open = function(method, url, ...rest) {
                // open() may be given a URL object; keep a string copy
                requestUrl = String(url);
                window.__interceptorStats.totalXHR++;
                console.debug('[API Interceptor] XHR:', method, requestUrl.substring(0, 150));
                return originalOpen.apply(this, [method, url, ...rest]);
            };

            xhr.addEventListener('load', function() {
                if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
                    requestUrl.includes('place') || requestUrl.includes('maps') ||
                    requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
                    try {
                        console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);

                        window.__interceptedResponses.push({
                            url: requestUrl,
                            body: xhr.responseText,
                            timestamp: Date.now(),
                            type: 'xhr',
                            status: xhr.status,
                            size: xhr.responseText.length
                        });

                        window.__interceptorStats.capturedXHR++;
                        window.__interceptorStats.lastCapture = new Date().toISOString();

                        if (window.__interceptedResponses.length > 100) {
                            window.__interceptedResponses = window.__interceptedResponses.slice(-50);
                        }
                    } catch (e) {
                        console.error('[API Interceptor] XHR capture error:', e);
                    }
                }
            });

            return xhr;
        };

        // Copy static properties
        for (let prop of Object.getOwnPropertyNames(originalXHR)) {
            try {
                window.XMLHttpRequest[prop] = originalXHR[prop];
            } catch (e) {}
        }

        console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');

        // Log stats every 10 seconds
        setInterval(() => {
            if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
                console.log('[API Interceptor] Stats:',
                    'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
                    'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
                    'Queue:', window.__interceptedResponses.length);
            }
        }, 10000);

        return true;
    })();
    """

    try:
        self.driver.execute_script(intercept_script)
        log.info("JavaScript response interceptor injected with enhanced debugging")

        # Get initial stats
        stats = self.get_interceptor_stats()
        log.debug(f"Interceptor stats: {stats}")

        return True
    except Exception as e:
        log.warning(f"Failed to inject interceptor: {e}")
        return False
|
||||
|
||||
def get_intercepted_responses(self):
    """Drain the browser-side response queue and return its contents.

    Returns:
        The queued response dicts, or an empty list when nothing was
        queued or the script could not be executed.
    """
    try:
        # Copy the queue, reset it in the page, and hand the copy back
        drain_script = """
            if (window.__interceptedResponses) {
                var responses = window.__interceptedResponses.slice();
                window.__interceptedResponses = [];
                return responses;
            }
            return [];
        """
        drained = self.driver.execute_script(drain_script)

        if not drained:
            log.debug("No intercepted responses available")
        else:
            log.debug(f"Retrieved {len(drained)} intercepted responses from browser")
            # Log first 3 for debugging
            for item in drained[:3]:
                log.debug(f" - {item.get('type', '?').upper()}: {item.get('url', '')[:100]} ({item.get('size', 0)} bytes)")

        return drained or []
    except Exception as e:
        log.debug(f"Error getting intercepted responses: {e}")
        return []
|
||||
|
||||
def get_interceptor_stats(self):
    """Return ``window.__interceptorStats`` from the page.

    Returns:
        The stats object as returned by the driver, or None when the
        interceptor is not present or the script could not be executed.
    """
    stats_script = """
        if (window.__interceptorStats) {
            return window.__interceptorStats;
        }
        return null;
    """
    try:
        return self.driver.execute_script(stats_script)
    except Exception as e:
        log.debug(f"Error getting interceptor stats: {e}")
        return None
|
||||
|
||||
def get_browser_console_logs(self):
    """
    Fetch the browser console log entries (debugging aid).

    Returns:
        The result of ``driver.get_log('browser')``, or ``[]`` if the
        driver call raises.
    """
    try:
        console_entries = self.driver.get_log('browser')
    except Exception as e:
        log.debug(f"Could not get browser console logs: {e}")
        return []
    return console_entries
|
||||
|
||||
def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
    """
    Dump captured responses to files for debugging.

    Creates one JSON file per response, named
    ``{timestamp}_{type}_{index}.json``, holding a ``metadata`` section
    (url, type, timestamp, size, status) and the response ``body``.

    Args:
        responses: Captured response dicts.
        output_dir: Directory to write into. It is created if missing,
            including any missing parent directories.

    Returns:
        The output directory as a string, or ``None`` if anything failed.
    """
    try:
        output_path = Path(output_dir)
        # parents=True so a nested debug directory (e.g. "out/debug/api")
        # does not fail just because an intermediate folder is missing.
        output_path.mkdir(parents=True, exist_ok=True)

        for i, response in enumerate(responses):
            timestamp = response.get('timestamp', int(time.time() * 1000))
            url = response.get('url', 'unknown')
            req_type = response.get('type', 'unknown')
            body = response.get('body', '')

            # Fall back to the body length only when no size was recorded;
            # a null body counts as zero bytes instead of raising on len(None)
            # and aborting the whole dump.
            if 'size' in response:
                size = response['size']
            else:
                size = len(body or '')

            # Create filename from timestamp and type
            filepath = output_path / f"{timestamp}_{req_type}_{i}.json"

            # Write response with metadata
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump({
                    'metadata': {
                        'url': url,
                        'type': req_type,
                        'timestamp': timestamp,
                        'size': size,
                        'status': response.get('status')
                    },
                    'body': body
                }, f, indent=2, ensure_ascii=False)

        log.info(f"Dumped {len(responses)} responses to {output_path}")
        return str(output_path)

    except Exception as e:
        log.error(f"Error dumping responses to file: {e}")
        return None
|
||||
|
||||
def _is_review_api(self, url: str) -> bool:
|
||||
"""Check if URL matches review API patterns"""
|
||||
url_lower = url.lower()
|
||||
return any(re.search(pattern, url_lower) for pattern in self.REVIEW_API_PATTERNS)
|
||||
|
||||
def _get_response_body(self, request_id: str) -> Optional[str]:
|
||||
"""Get response body for a request ID using CDP"""
|
||||
try:
|
||||
result = self.driver.execute_cdp_cmd('Network.getResponseBody', {
|
||||
'requestId': request_id
|
||||
})
|
||||
|
||||
body = result.get('body', '')
|
||||
if result.get('base64Encoded'):
|
||||
body = base64.b64decode(body).decode('utf-8', errors='ignore')
|
||||
|
||||
return body
|
||||
except Exception as e:
|
||||
log.debug(f"Could not get response body for {request_id}: {e}")
|
||||
return None
|
||||
|
||||
def parse_reviews_from_responses(self, responses: List[Dict]) -> List[InterceptedReview]:
    """
    Turn captured API responses into a de-duplicated list of reviews.

    Each response body is handed to ``_parse_response_body``; empty bodies
    and bodies starting with ``<!DOCTYPE`` are skipped, and a response that
    raises while parsing is logged and ignored.

    Returns:
        Reviews in first-seen order, one per distinct ``review_id``.
        Reviews whose ``review_id`` is falsy are not returned.
    """
    collected = []
    for captured in responses:
        try:
            payload = captured.get('body', '')
            source_url = captured.get('url', '')

            # Skip non-JSON responses
            if not payload or payload.startswith('<!DOCTYPE'):
                continue

            collected.extend(self._parse_response_body(payload, source_url))
        except Exception as e:
            log.debug(f"Error parsing response: {e}")
            continue

    # Deduplicate by review ID, keeping the first occurrence of each ID.
    by_id = {}
    for candidate in collected:
        key = candidate.review_id
        if key and key not in by_id:
            by_id[key] = candidate
    return list(by_id.values())
|
||||
|
||||
def _parse_response_body(self, body: str, url: str) -> List[InterceptedReview]:
|
||||
"""Parse a single response body for review data"""
|
||||
reviews = []
|
||||
|
||||
# Skip empty or HTML responses
|
||||
if not body or body.startswith('<!DOCTYPE') or body.startswith('<html'):
|
||||
return reviews
|
||||
|
||||
# Handle batch execute format (starts with )]}' prefix)
|
||||
if body.startswith(")]}'"):
|
||||
body = body[4:].strip()
|
||||
|
||||
try:
|
||||
data = json.loads(body)
|
||||
except json.JSONDecodeError:
|
||||
# Try to extract JSON from the response
|
||||
json_match = re.search(r'\[.*\]', body, re.DOTALL)
|
||||
if json_match:
|
||||
try:
|
||||
data = json.loads(json_match.group())
|
||||
except:
|
||||
log.debug(f"Failed to parse JSON from response")
|
||||
return reviews
|
||||
else:
|
||||
log.debug(f"No JSON found in response")
|
||||
return reviews
|
||||
|
||||
# Special handling for listugcposts endpoint
|
||||
if 'listugcposts' in url.lower():
|
||||
reviews.extend(self._parse_listugcposts_response(data))
|
||||
else:
|
||||
# Generic recursive extraction
|
||||
reviews.extend(self._extract_reviews_recursive(data))
|
||||
|
||||
return reviews
|
||||
|
||||
def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
    """
    Parse a Google Maps ``listugcposts`` API response.

    Layout handled here:
        data[2]       = array of review groups
        data[2][i]    = one group (one review)
        data[2][i][0] = review data array passed to
                        ``_parse_google_review_array``

    Returns:
        The reviews that could be parsed. Malformed groups are skipped and
        any error is logged at debug level rather than raised.
    """
    parsed = []

    try:
        if not (isinstance(data, list) and len(data) >= 3):
            log.debug("Response doesn't match expected structure (not a list or too short)")
            return parsed

        # data[2] contains the review groups
        groups = data[2]
        if not isinstance(groups, list):
            log.debug("data[2] is not a list")
            return parsed

        log.debug(f"Found {len(groups)} reviews in data[2]")

        # Each group is one review; its first element carries the payload.
        for position, entry in enumerate(groups):
            if not isinstance(entry, list) or not entry:
                continue

            payload = entry[0]
            if not isinstance(payload, list):
                continue

            try:
                item = self._parse_google_review_array(payload)
                if item:
                    parsed.append(item)
                    log.debug(f"Parsed review {position}: {item.author} - {item.rating}★")
            except Exception as e:
                log.debug(f"Error parsing review at group[{position}]: {e}")

    except Exception as e:
        log.debug(f"Error in _parse_listugcposts_response: {e}")

    return parsed
|
||||
|
||||
def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
    """
    Parse one review from Google's positional array format.

    Positions read by this method:
        review_data[0]             = review ID (string)
        review_data[1][4][5][0]    = author name
        review_data[1][4][5][1]    = profile picture URL (if present)
        review_data[1][6]          = date text
        review_data[2][0][0]       = rating (1-5)
        review_data[2][15][0][0]   = review text (original)

    Returns:
        The populated review when it has a rating above zero and an author
        or text; otherwise ``None`` (also on any parsing error).
    """
    absent = object()

    def dig(*path):
        """Follow ``path`` through nested lists; return ``absent`` on a miss."""
        node = review_data
        for level, index in enumerate(path):
            # Intermediate nodes must be lists; the root is only length-checked.
            if level and not isinstance(node, list):
                return absent
            if len(node) <= index:
                return absent
            node = node[index]
        return node

    parsed = InterceptedReview()

    try:
        review_id = dig(0)
        if isinstance(review_id, str):
            parsed.review_id = review_id

        author_info = dig(1, 4, 5)
        if isinstance(author_info, list):
            if len(author_info) > 0 and isinstance(author_info[0], str):
                parsed.author = author_info[0]
            if len(author_info) > 1 and isinstance(author_info[1], str):
                parsed.avatar_url = author_info[1]

        date_text = dig(1, 6)
        if isinstance(date_text, str):
            parsed.date_text = date_text

        rating_val = dig(2, 0, 0)
        if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
            parsed.rating = float(rating_val)

        text = dig(2, 15, 0, 0)
        if isinstance(text, str):
            parsed.text = text

        # Only return if we have minimum required data
        if parsed.rating > 0 and (parsed.author or parsed.text):
            return parsed

    except Exception as e:
        log.debug(f"Error parsing Google review array: {e}")

    return None
|
||||
|
||||
def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
    """
    Heuristically parse a review out of a nested array.

    Fields are recognised by shape rather than position:
        * review ID: first string among the first five items longer than
          30 characters that is not a URL
        * rating: first number in 1..5, at top level or one level down
        * text: first top-level non-URL string longer than 50 characters
          that is not the review ID
        * author: first 3..100 character non-URL string, at top level or
          one level down, that is not the text (nor, at top level, the ID)
        * date: first top-level string matching a date-like pattern

    Returns:
        The review when it has a rating above zero and an ID or author;
        otherwise ``None`` (also on any error).
    """
    def is_rating(value):
        return isinstance(value, (int, float)) and 1 <= value <= 5

    def is_name_sized(value):
        return isinstance(value, str) and 3 <= len(value) <= 100

    def rating_candidates():
        for element in arr:
            if is_rating(element):
                yield element
            elif isinstance(element, list):
                for nested in element:
                    if is_rating(nested):
                        yield nested
                        break

    def author_candidates(current):
        for element in arr:
            if is_name_sized(element):
                if (element != current.review_id
                        and element != current.text
                        and not element.startswith('http')):
                    yield element
            elif isinstance(element, list):
                for nested in element:
                    if (is_name_sized(nested)
                            and nested != current.text
                            and not nested.startswith('http')):
                        yield nested
                        break

    parsed = InterceptedReview()

    try:
        # Review ID: usually a long string in the first few elements.
        for head in arr[:5]:
            if isinstance(head, str) and len(head) > 30 and not head.startswith('http'):
                parsed.review_id = head
                break

        score = next(rating_candidates(), None)
        if score is not None:
            parsed.rating = float(score)

        # Review text: long string, not a URL, not the ID itself.
        for element in arr:
            if isinstance(element, str) and len(element) > 50 and not element.startswith('http'):
                if not parsed.review_id or element != parsed.review_id:
                    parsed.text = element
                    break

        name = next(author_candidates(parsed), None)
        if name is not None:
            parsed.author = name

        # Dates: strings that look like dates.
        date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
        stamp = next(
            (
                element for element in arr
                if isinstance(element, str)
                and any(re.search(pattern, element, re.IGNORECASE) for pattern in date_patterns)
            ),
            None,
        )
        if stamp is not None:
            parsed.date_text = stamp

        # Only return if we have meaningful data
        if (parsed.review_id or parsed.author) and parsed.rating > 0:
            return parsed

    except Exception as e:
        log.debug(f"Error in _parse_review_array_v2: {e}")

    return None
|
||||
|
||||
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
    """
    Walk nested dicts/lists and collect everything that parses as a review.

    Every dict is offered to ``_try_parse_review_dict`` and every list to
    ``_try_parse_review_array``; their children are then searched as well.

    Args:
        data: Decoded JSON value (or an ``InterceptedReview``, which is
            returned as-is).
        depth: Current nesting level; the search stops below level 20.

    Returns:
        Reviews found at this node followed by those found beneath it.
    """
    # Depth cap guards against runaway recursion on pathological input.
    if depth > 20:
        return []

    if isinstance(data, InterceptedReview):
        return [data]

    if isinstance(data, dict):
        candidate = self._try_parse_review_dict(data)
        children = data.values()
    elif isinstance(data, list):
        candidate = self._try_parse_review_array(data)
        children = data
    else:
        return []

    found = [candidate] if candidate else []
    for child in children:
        if not isinstance(child, InterceptedReview):
            found.extend(self._extract_reviews_recursive(child, depth + 1))
    return found
|
||||
|
||||
def _try_parse_review_dict(self, data: Dict) -> Optional[InterceptedReview]:
    """
    Try to parse a dictionary as a review object.

    The dict is considered only if it has at least one of the keys
    ``reviewId``, ``review_id``, ``author``, ``rating``, ``text`` or
    ``comment``. Several alternative key names are tried for each field.

    Returns:
        The review when it has an ID, or both an author and text;
        otherwise ``None`` (also when any field conversion fails).
    """
    # Common keys in review objects
    review_keys = {'reviewId', 'review_id', 'author', 'rating', 'text', 'comment'}

    if not any(k in data for k in review_keys):
        return None

    try:
        review = InterceptedReview()

        # ``author`` may be a plain name or a nested profile object.
        raw_author = data.get('author')
        author_data = raw_author if isinstance(raw_author, dict) else {}

        # Try various key names for each field
        review.review_id = data.get('reviewId') or data.get('review_id') or data.get('id', '')
        if author_data:
            # Never store the profile dict itself as the author name.
            # NOTE(review): 'name'/'displayName' are assumed key names for
            # the nested profile object - confirm against real payloads.
            review.author = (author_data.get('name') or author_data.get('displayName')
                             or data.get('authorName') or data.get('name', ''))
        else:
            review.author = raw_author or data.get('authorName') or data.get('name', '')
        review.rating = float(data.get('rating') or data.get('starRating') or 0)
        review.text = data.get('text') or data.get('comment') or data.get('reviewText', '')
        review.date_text = data.get('publishTime') or data.get('relativePublishTime') or data.get('date', '')
        review.likes = int(data.get('thumbsUpCount') or data.get('likes') or 0)

        # Photos: entries may be {'url': ...} objects or bare URL strings.
        # Calling .get on a string used to raise and discard the review.
        photos = data.get('photos') or data.get('reviewPhotos') or []
        if photos:
            review.photos = [
                (p.get('url') or p) if isinstance(p, dict) else p
                for p in photos if p
            ]

        # Profile
        review.profile_url = author_data.get('profileUrl') or data.get('profileUrl', '')
        review.avatar_url = author_data.get('profilePhotoUrl') or data.get('avatar', '')

        # Owner response
        owner_resp = data.get('ownerResponse') or data.get('ownerReply') or {}
        if isinstance(owner_resp, dict):
            review.owner_response = owner_resp.get('text', '')
            review.owner_response_date = owner_resp.get('publishTime', '')

        # Only return if we have meaningful data
        if review.review_id or (review.author and review.text):
            return review

    except Exception as e:
        log.debug(f"Error parsing review dict: {e}")

    return None
|
||||
|
||||
def _try_parse_review_array(self, data: List) -> Optional[InterceptedReview]:
    """
    Try to parse a nested array as a review (Google's protobuf-like format).

    Google often uses positional arrays like: [id, author, [rating], text, ...]

    Heuristics applied:
        * ``data[0]`` is the review ID when it is a string over 20 chars
        * rating: first top-level number in 1..5, or first sub-list whose
          first element is such a number
        * text: first string over 30 chars, at top level or one level down,
          that is neither a URL nor the review ID
        * author: first 2..100 char string inside a sub-list that is not
          the text
        * URLs: top-level googleusercontent/ggpht URLs (first becomes the
          avatar, the rest photos); sub-lists go through
          ``_extract_urls_from_array``

    Returns:
        The review when it has a rating above zero and an ID or text;
        otherwise ``None`` (also for arrays shorter than three items or on
        any error).
    """
    if not data or len(data) < 3:
        return None

    try:
        review = InterceptedReview()

        # Check if first element looks like a review ID
        if isinstance(data[0], str) and len(data[0]) > 20:
            review.review_id = data[0]

        # Search for rating (usually a small number 1-5)
        for item in data:
            if isinstance(item, (int, float)) and 1 <= item <= 5:
                review.rating = float(item)
                break
            elif isinstance(item, list) and len(item) >= 1:
                if isinstance(item[0], (int, float)) and 1 <= item[0] <= 5:
                    review.rating = float(item[0])
                    break

        def looks_like_text(value):
            # The ID (data[0]) and image/profile URLs are long strings too,
            # so they must not be mistaken for the review body.
            return (isinstance(value, str)
                    and len(value) > 30
                    and not value.startswith('http')
                    and value != review.review_id)

        # Search for text (long string); stop at the first match so later
        # sub-lists cannot overwrite it.
        for item in data:
            if looks_like_text(item):
                review.text = item
                break
            if isinstance(item, list):
                nested_text = next((sub for sub in item if looks_like_text(sub)), None)
                if nested_text is not None:
                    review.text = nested_text
                    break

        # Search for author name (shorter string)
        for item in data:
            if isinstance(item, list) and len(item) >= 1:
                for subitem in item:
                    if isinstance(subitem, str) and 2 <= len(subitem) <= 100 and subitem != review.text:
                        review.author = subitem
                        break
                if review.author:
                    break

        # Search for URLs (photos, profile)
        for item in data:
            if isinstance(item, str) and item.startswith('http'):
                if 'googleusercontent' in item or 'ggpht' in item:
                    if not review.avatar_url:
                        review.avatar_url = item
                    else:
                        review.photos.append(item)
            elif isinstance(item, list):
                self._extract_urls_from_array(item, review)

        # Only return if we have meaningful data
        if review.rating > 0 and (review.review_id or review.text):
            return review

    except Exception as e:
        log.debug(f"Error parsing review array: {e}")

    return None
|
||||
|
||||
def _extract_urls_from_array(self, arr: List, review: InterceptedReview, depth: int = 0):
|
||||
"""Extract URLs from nested arrays"""
|
||||
if depth > 5:
|
||||
return
|
||||
|
||||
for item in arr:
|
||||
if isinstance(item, str) and item.startswith('http'):
|
||||
if 'googleusercontent' in item or 'ggpht' in item or 'lh3' in item:
|
||||
if 'w72-h72' in item or 'p-rp-mo' in item: # Profile pic pattern
|
||||
review.avatar_url = item
|
||||
else:
|
||||
review.photos.append(item)
|
||||
elif isinstance(item, list):
|
||||
self._extract_urls_from_array(item, depth + 1, review)
|
||||
|
||||
def convert_to_raw_review_format(self, intercepted: InterceptedReview) -> Dict[str, Any]:
    """
    Map an intercepted review onto the dict layout used for storage.

    Text and owner response are nested under an ``'en'`` key and become
    empty dicts when absent; ``'_source'`` is always ``'api_intercept'``.
    """
    description = {}
    if intercepted.text:
        description = {'en': intercepted.text}

    owner_responses = {}
    if intercepted.owner_response:
        owner_responses = {'en': {'text': intercepted.owner_response}}

    raw = {}
    raw['review_id'] = intercepted.review_id
    raw['author'] = intercepted.author
    raw['rating'] = intercepted.rating
    raw['description'] = description
    raw['likes'] = intercepted.likes
    raw['user_images'] = intercepted.photos
    raw['author_profile_url'] = intercepted.profile_url
    raw['profile_picture'] = intercepted.avatar_url
    raw['owner_responses'] = owner_responses
    raw['review_date'] = intercepted.date_text
    raw['_source'] = 'api_intercept'
    return raw
|
||||
|
||||
def cleanup(self):
    """
    Clean up interception resources.

    Disables the CDP Network domain on a best-effort basis (the browser
    may already be gone), then clears the captured responses, captured
    reviews and request map and marks the interceptor as not listening.
    """
    try:
        self.driver.execute_cdp_cmd('Network.disable', {})
    except Exception as e:
        # Best effort only, but do not swallow KeyboardInterrupt/SystemExit
        # the way a bare ``except`` would.
        log.debug(f"Could not disable network interception: {e}")

    self.captured_responses.clear()
    self.captured_reviews.clear()
    self.request_map.clear()
    self._listening = False
|
||||
@@ -35,16 +35,45 @@ class ChromeWorker:
|
||||
|
||||
# SeleniumBase Driver automatically includes UC mode anti-detection
|
||||
# Initialize with longer timeouts for large scraping jobs
|
||||
# Chrome arguments for Docker stability
|
||||
chrome_args = [
|
||||
"--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker)
|
||||
"--disable-gpu", # Disable GPU acceleration
|
||||
"--no-sandbox", # Required for Docker
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-extensions",
|
||||
"--disable-background-networking",
|
||||
"--disable-default-apps",
|
||||
"--disable-sync",
|
||||
"--metrics-recording-only",
|
||||
"--mute-audio",
|
||||
"--no-first-run",
|
||||
"--safebrowsing-disable-auto-update",
|
||||
]
|
||||
|
||||
self.driver = Driver(
|
||||
uc=True,
|
||||
headless=self.headless,
|
||||
page_load_strategy="normal"
|
||||
page_load_strategy="normal",
|
||||
chromium_arg=",".join(chrome_args)
|
||||
)
|
||||
|
||||
# Set generous timeouts for large scraping jobs
|
||||
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
|
||||
self.driver.set_script_timeout(60) # 1 minute for complex extraction
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
|
||||
# This prevents location-based variations in search results
|
||||
try:
|
||||
self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
|
||||
except Exception as e:
|
||||
log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")
|
||||
|
||||
self.driver.maximize_window()
|
||||
self.created_at = time.time()
|
||||
self.last_used = time.time()
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
"""
|
||||
Command line interface handling for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from modules.config import DEFAULT_CONFIG_PATH
|
||||
|
||||
|
||||
def _str_to_bool(value):
    """Convert a command-line token such as 'true'/'no'/'1' to a bool."""
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1", "on"):
        return True
    if token in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_arguments():
    """
    Parse command line arguments.

    Boolean-valued options (``--use-mongodb``, ``--convert-dates``, ...)
    take an explicit value such as ``true`` or ``false`` and default to
    ``None`` when not given. They are parsed with ``_str_to_bool`` rather
    than ``type=bool``, because ``bool("False")`` is ``True`` and every
    non-empty value used to enable the option.

    Returns:
        The parsed namespace. ``config`` is a ``Path`` when given on the
        command line, else ``DEFAULT_CONFIG_PATH``; ``custom_params`` is
        the decoded JSON object, or ``None`` if it could not be decoded.
    """
    ap = argparse.ArgumentParser(description="Google‑Maps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first already‑seen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    ap.add_argument("--use-mongodb", type=_str_to_bool, default=None,
                    help="whether to use MongoDB for storage")

    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=_str_to_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=_str_to_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")

    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=_str_to_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=_str_to_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=_str_to_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")

    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")

    # API interception option
    ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
                    help="enable API response interception for faster data capture (experimental)")

    args = ap.parse_args()

    # Handle config path
    if args.config is not None:
        args.config = Path(args.config)
    else:
        args.config = DEFAULT_CONFIG_PATH

    # Process custom params if provided
    if args.custom_params:
        try:
            args.custom_params = json.loads(args.custom_params)
        except json.JSONDecodeError:
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            args.custom_params = None

    return args
|
||||
@@ -77,11 +77,17 @@ class DatabaseManager:
|
||||
|
||||
error_message TEXT,
|
||||
metadata JSONB,
|
||||
scrape_logs JSONB,
|
||||
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
||||
);
|
||||
""")
|
||||
|
||||
# Add scrape_logs column if it doesn't exist (for existing databases)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||
@@ -182,10 +188,12 @@ class DatabaseManager:
|
||||
started_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
reviews_data,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
metadata,
|
||||
scrape_logs
|
||||
FROM jobs
|
||||
WHERE job_id = $1
|
||||
""", job_id)
|
||||
@@ -246,8 +254,13 @@ class DatabaseManager:
|
||||
kwargs['completed_at'] = datetime.now()
|
||||
|
||||
for key, value in kwargs.items():
|
||||
set_clauses.append(f"{key} = ${param_idx}")
|
||||
params.append(value)
|
||||
# Handle JSONB fields specially
|
||||
if key == 'scrape_logs' and value is not None:
|
||||
set_clauses.append(f"{key} = ${param_idx}::jsonb")
|
||||
params.append(json.dumps(value) if not isinstance(value, str) else value)
|
||||
else:
|
||||
set_clauses.append(f"{key} = ${param_idx}")
|
||||
params.append(value)
|
||||
param_idx += 1
|
||||
|
||||
query = f"""
|
||||
@@ -264,7 +277,8 @@ class DatabaseManager:
|
||||
job_id: UUID,
|
||||
reviews: List[Dict[str, Any]],
|
||||
scrape_time: float,
|
||||
total_reviews: Optional[int] = None
|
||||
total_reviews: Optional[int] = None,
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
||||
):
|
||||
"""
|
||||
Save scraping results to database.
|
||||
@@ -274,6 +288,7 @@ class DatabaseManager:
|
||||
reviews: List of review dictionaries
|
||||
scrape_time: Time taken to scrape in seconds
|
||||
total_reviews: Total reviews available (from page counter)
|
||||
scrape_logs: List of log entries from the scraper
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
@@ -284,9 +299,11 @@ class DatabaseManager:
|
||||
reviews_count = $2,
|
||||
total_reviews = $3,
|
||||
reviews_data = $4::jsonb,
|
||||
scrape_time = $5
|
||||
scrape_time = $5,
|
||||
scrape_logs = $6::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
|
||||
json.dumps(scrape_logs) if scrape_logs else None)
|
||||
|
||||
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
@@ -317,8 +334,10 @@ class DatabaseManager:
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message
|
||||
error_message,
|
||||
metadata
|
||||
FROM jobs
|
||||
WHERE status = $1
|
||||
ORDER BY created_at DESC
|
||||
@@ -333,8 +352,10 @@ class DatabaseManager:
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message
|
||||
error_message,
|
||||
metadata
|
||||
FROM jobs
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
|
||||
@@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
else:
|
||||
log.info(f"[PROFILE] Using pooled driver (0.00s)")
|
||||
|
||||
# Force English locale for consistent parsing
|
||||
# Force English locale AND US region for consistent parsing/results
|
||||
# This helps avoid geolocation-based variations in Google Maps results
|
||||
if 'hl=' in url:
|
||||
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
|
||||
else:
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
|
||||
# Add US region parameter if not present
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) using CDP
|
||||
# This ensures Google Maps shows US results regardless of server location
|
||||
try:
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log.info("Set geolocation to US (Boston, MA)")
|
||||
except Exception as e:
|
||||
log.warning(f"Could not set geolocation: {e}")
|
||||
|
||||
log.info(f"Loading Google Maps page...")
|
||||
t0 = timing_module.time()
|
||||
driver.get(url)
|
||||
@@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
||||
for btn in form_btns:
|
||||
btn_text = (btn.text or '').lower()
|
||||
if 'aceptar todo' in btn_text or 'accept all' in btn_text:
|
||||
if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text:
|
||||
log.info(f"Clicking GDPR consent: {btn.text}")
|
||||
btn.click()
|
||||
time.sleep(1) # Reduced from 2s
|
||||
time.sleep(1)
|
||||
break
|
||||
else:
|
||||
if len(form_btns) >= 2:
|
||||
log.info("Using fallback: clicking second form button")
|
||||
form_btns[1].click()
|
||||
time.sleep(1) # Reduced from 2s
|
||||
time.sleep(1)
|
||||
except Exception as e:
|
||||
log.warning(f"GDPR consent handling failed: {e}")
|
||||
|
||||
# After GDPR consent, reload the original URL to ensure proper page state
|
||||
log.info(f"Reloading original URL after GDPR consent...")
|
||||
driver.get(url)
|
||||
time.sleep(1)
|
||||
log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
|
||||
else:
|
||||
log.info(f"[PROFILE] No GDPR consent page (0.00s)")
|
||||
@@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
try:
|
||||
log.info("Waiting for Google Maps content to load...")
|
||||
wait = WebDriverWait(driver, 10)
|
||||
# Wait for basic page structure (h1 or heading)
|
||||
wait.until(
|
||||
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
|
||||
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]')
|
||||
)
|
||||
log.info("Google Maps content loaded successfully")
|
||||
log.info("Basic page structure loaded")
|
||||
|
||||
# Wait for page to settle - search URLs redirect to place URLs
|
||||
# which triggers additional content loading
|
||||
time.sleep(2)
|
||||
|
||||
# Wait specifically for review count element (aria-label ending with "reviews")
|
||||
# This is the most reliable indicator that the business detail is loaded
|
||||
try:
|
||||
WebDriverWait(driver, 5).until(
|
||||
lambda d: d.execute_script("""
|
||||
var elems = document.querySelectorAll('[aria-label]');
|
||||
for (var i = 0; i < elems.length; i++) {
|
||||
var label = elems[i].getAttribute('aria-label') || '';
|
||||
if (/^[0-9]+ reviews?$/.test(label)) return true;
|
||||
}
|
||||
return false;
|
||||
""")
|
||||
)
|
||||
log.info("Review count element loaded")
|
||||
except:
|
||||
# Fallback: Try clicking Reviews tab or rating stars to expose the review count
|
||||
log.info("Review count wait timeout, trying to click Reviews/rating...")
|
||||
try:
|
||||
# Try 1: Click Reviews tab (if exists)
|
||||
clicked = driver.execute_script("""
|
||||
var tabs = document.querySelectorAll('[role="tab"]');
|
||||
for (var i = 0; i < tabs.length; i++) {
|
||||
var txt = (tabs[i].textContent || '').toLowerCase();
|
||||
if (txt.includes('review')) {
|
||||
tabs[i].click();
|
||||
return 'tab';
|
||||
}
|
||||
}
|
||||
// Try 2: Click the rating stars element (often links to reviews)
|
||||
var stars = document.querySelector('[role="img"][aria-label*="star"]');
|
||||
if (stars) {
|
||||
var parent = stars.parentElement;
|
||||
if (parent && parent.tagName.toLowerCase() === 'button') {
|
||||
parent.click();
|
||||
return 'stars_button';
|
||||
}
|
||||
stars.click();
|
||||
return 'stars';
|
||||
}
|
||||
// Try 3: Click "Write a review" or any review-related button
|
||||
var btns = document.querySelectorAll('button[aria-label*="review" i]');
|
||||
for (var b = 0; b < btns.length; b++) {
|
||||
var label = btns[b].getAttribute('aria-label') || '';
|
||||
if (!/write/i.test(label) && /review/i.test(label)) {
|
||||
btns[b].click();
|
||||
return 'review_btn: ' + label;
|
||||
}
|
||||
}
|
||||
return 'none';
|
||||
""")
|
||||
log.info(f"Clicked: {clicked}")
|
||||
time.sleep(2) # Wait for reviews panel to load
|
||||
except Exception as e:
|
||||
log.warning(f"Click attempt failed: {e}")
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"Timeout waiting for Maps content: {e}")
|
||||
time.sleep(0.5) # Minimal fallback wait
|
||||
time.sleep(2) # Fallback wait
|
||||
log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")
|
||||
log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...")
|
||||
log.info(f"DEBUG: Page title: {driver.title}")
|
||||
|
||||
# Extract business card information using JavaScript
|
||||
t0 = timing_module.time()
|
||||
@@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
total_reviews: null
|
||||
};
|
||||
|
||||
// Extract business name
|
||||
const nameSelectors = [
|
||||
'h1.DUwDvf',
|
||||
'[role="main"] h1',
|
||||
'h1.fontHeadlineLarge'
|
||||
];
|
||||
// ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============
|
||||
|
||||
for (const selector of nameSelectors) {
|
||||
const elem = document.querySelector(selector);
|
||||
if (elem && elem.textContent) {
|
||||
info.name = elem.textContent.trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Helper: Parse review count from text, handling multiple formats
|
||||
function parseReviewCount(text) {
|
||||
if (!text) return null;
|
||||
|
||||
// Extract address
|
||||
const addressSelectors = [
|
||||
'button[data-item-id*="address"]',
|
||||
'[data-item-id*="address"]',
|
||||
'div[aria-label*="Address"]'
|
||||
];
|
||||
|
||||
for (const selector of addressSelectors) {
|
||||
const elem = document.querySelector(selector);
|
||||
if (elem && elem.textContent) {
|
||||
info.address = elem.textContent.trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract rating (look for aria-label like "4.2 stars")
|
||||
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
|
||||
if (ratingElem) {
|
||||
const ariaLabel = ratingElem.getAttribute('aria-label');
|
||||
const match = ariaLabel.match(/([0-9.]+)/);
|
||||
// Pattern 1: Exact "N reviews" format (aria-labels, clean text)
|
||||
// Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis"
|
||||
var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i);
|
||||
if (match) {
|
||||
info.rating = parseFloat(match[1]);
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
}
|
||||
|
||||
// Extract total review count
|
||||
const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
|
||||
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
|
||||
// Pattern 2: "(N)" format often used in tabs like "Reviews (27)"
|
||||
match = text.match(/[(]([0-9][0-9,.]*)[)]$/);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
|
||||
// PRIORITY 1: Look for review count in search results sidebar/panel
|
||||
// This is where "152 reviews" appears on search results
|
||||
const searchPanelSelectors = [
|
||||
'a[href*="reviews"]', // Link with "reviews" in href
|
||||
'button[jsaction*="reviews"]', // Button related to reviews
|
||||
'div[role="link"]', // Clickable divs that might contain review info
|
||||
];
|
||||
|
||||
for (const selector of searchPanelSelectors) {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
for (let elem of elements) {
|
||||
const text = elem.textContent || '';
|
||||
const match = text.match(numberPattern);
|
||||
// Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives)
|
||||
if (text.length < 30) {
|
||||
match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i);
|
||||
if (match) {
|
||||
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
if (num > 0 && num < 1000000) {
|
||||
info.total_reviews = num;
|
||||
break;
|
||||
}
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
}
|
||||
if (info.total_reviews) break;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// PRIORITY 2: Look in any span/div that contains the word "review"
|
||||
// ============ EXTRACT BUSINESS NAME ============
|
||||
// Priority: h1 (semantic), then role="heading"
|
||||
const h1 = document.querySelector('h1');
|
||||
if (h1 && h1.textContent) {
|
||||
info.name = h1.textContent.trim();
|
||||
}
|
||||
if (!info.name) {
|
||||
const heading = document.querySelector('[role="heading"][aria-level="1"]');
|
||||
if (heading && heading.textContent) {
|
||||
info.name = heading.textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT ADDRESS ============
|
||||
// Priority: data-item-id (semantic), then aria-label containing "address"
|
||||
const addressElem = document.querySelector('[data-item-id*="address"]');
|
||||
if (addressElem && addressElem.textContent) {
|
||||
info.address = addressElem.textContent.trim();
|
||||
}
|
||||
if (!info.address) {
|
||||
const ariaAddress = document.querySelector('[aria-label*="ddress"]');
|
||||
if (ariaAddress && ariaAddress.textContent) {
|
||||
info.address = ariaAddress.textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT RATING ============
|
||||
// Priority: aria-label containing "star" on role="img" elements
|
||||
info._debug_rating_context = [];
|
||||
const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]');
|
||||
for (let elem of ratingElems) {
|
||||
const ariaLabel = elem.getAttribute('aria-label') || '';
|
||||
// Match "4.9 stars" or "4,9 stars" (European format)
|
||||
const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i);
|
||||
if (match) {
|
||||
info.rating = parseFloat(match[1].replace(',', '.'));
|
||||
// DEBUG: Capture parent/sibling context to find review count
|
||||
var parent = elem.parentElement;
|
||||
if (parent) {
|
||||
info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100));
|
||||
var grandparent = parent.parentElement;
|
||||
if (grandparent) {
|
||||
info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100));
|
||||
// Check all children of grandparent for review count
|
||||
var gpChildren = grandparent.querySelectorAll('*');
|
||||
for (var c = 0; c < Math.min(gpChildren.length, 30); c++) {
|
||||
var childText = (gpChildren[c].textContent || '').trim();
|
||||
if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) {
|
||||
info._debug_rating_context.push('GP_CHILD: ' + childText);
|
||||
}
|
||||
}
|
||||
// Also check great-grandparent
|
||||
var ggp = grandparent.parentElement;
|
||||
if (ggp) {
|
||||
info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150));
|
||||
}
|
||||
}
|
||||
// Check siblings
|
||||
var nextSib = parent.nextElementSibling;
|
||||
if (nextSib) {
|
||||
info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100));
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============
|
||||
|
||||
// PRIORITY 1: aria-label with exact "N reviews" format (most reliable)
|
||||
// Google Maps uses aria-label="27 reviews" for accessibility
|
||||
info._debug_aria = [];
|
||||
info._debug_all_numeric = [];
|
||||
if (!info.total_reviews) {
|
||||
const allElements = document.querySelectorAll('span, div, a');
|
||||
for (let elem of allElements) {
|
||||
const text = elem.textContent || '';
|
||||
if (text.length < 100) { // Skip very long text blocks
|
||||
const match = text.match(numberPattern);
|
||||
var ariaElems = document.querySelectorAll('[aria-label]');
|
||||
for (var i = 0; i < ariaElems.length; i++) {
|
||||
var ariaLabel = ariaElems[i].getAttribute('aria-label') || '';
|
||||
// Collect all labels containing "review"
|
||||
if (ariaLabel.toLowerCase().indexOf('review') >= 0) {
|
||||
info._debug_aria.push(ariaLabel);
|
||||
}
|
||||
// Collect all labels starting with a digit
|
||||
if (/^[0-9]/.test(ariaLabel)) {
|
||||
info._debug_all_numeric.push(ariaLabel);
|
||||
}
|
||||
var count = parseReviewCount(ariaLabel);
|
||||
if (count && count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = ariaLabel;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DEBUG: Find all text with parenthetical numbers like "(27)"
|
||||
info._debug_parens = [];
|
||||
info._debug_short_text = []; // All short text with numbers
|
||||
var allSpans = document.querySelectorAll('span, div, a, button');
|
||||
for (var j = 0; j < Math.min(allSpans.length, 500); j++) {
|
||||
var spanText = allSpans[j].textContent || '';
|
||||
// Capture parenthetical numbers
|
||||
if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) {
|
||||
info._debug_parens.push(spanText.trim());
|
||||
}
|
||||
// Capture ALL short text containing numbers (for debugging)
|
||||
if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) {
|
||||
var cleaned = spanText.trim().replace(/\\s+/g, ' ');
|
||||
if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) {
|
||||
info._debug_short_text.push(cleaned);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page
|
||||
// This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels
|
||||
if (!info.total_reviews) {
|
||||
var allElems = document.querySelectorAll('*');
|
||||
for (var k = 0; k < Math.min(allElems.length, 1000); k++) {
|
||||
var elem = allElems[k];
|
||||
// Skip if has children (we want leaf nodes only)
|
||||
if (elem.children.length > 0) continue;
|
||||
var txt = (elem.textContent || '').trim();
|
||||
// Look for short text with both numbers and "review" word
|
||||
if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) {
|
||||
var match = txt.match(/([0-9][0-9,]*)/);
|
||||
if (match) {
|
||||
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
if (num > 0 && num < 1000000) {
|
||||
info.total_reviews = num;
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'LEAF: ' + txt;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 3: Try tabs (for business detail pages)
|
||||
// DEBUG: Collect all tab names
|
||||
info._debug_tabs = [];
|
||||
const tabs = document.querySelectorAll('[role="tab"]');
|
||||
for (let t = 0; t < tabs.length; t++) {
|
||||
info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30));
|
||||
}
|
||||
|
||||
// DEBUG: Collect all buttons with text (might contain review count)
|
||||
info._debug_buttons = [];
|
||||
const buttons = document.querySelectorAll('button');
|
||||
for (let b = 0; b < Math.min(buttons.length, 20); b++) {
|
||||
var btnText = (buttons[b].textContent || '').trim();
|
||||
if (btnText && btnText.length < 40) {
|
||||
info._debug_buttons.push(btnText.substring(0, 40));
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count)
|
||||
if (!info.total_reviews) {
|
||||
const tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (let tab of tabs) {
|
||||
const text = tab.textContent || '';
|
||||
let match = text.match(reviewPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
break;
|
||||
const text = (tab.textContent || '').trim();
|
||||
// Look for "Reviews" tab with count
|
||||
if (text.toLowerCase().includes('review')) {
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'TAB: ' + text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
match = text.match(numberPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.3: Reviews panel header (after clicking Reviews tab)
|
||||
// Google Maps shows "27 reviews" as heading text in the reviews panel
|
||||
if (!info.total_reviews) {
|
||||
// Look for headings containing review count
|
||||
var headings = document.querySelectorAll('h1, h2, [role="heading"]');
|
||||
for (var h = 0; h < headings.length; h++) {
|
||||
var hText = (headings[h].textContent || '').trim();
|
||||
if (/review/i.test(hText)) {
|
||||
var match = hText.match(/([0-9][0-9,]*)/);
|
||||
if (match) {
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'HEADING: ' + hText;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.4: Look for sort button area which often has total count
|
||||
// The sort dropdown area displays "Sort: Newest" and total reviews
|
||||
if (!info.total_reviews) {
|
||||
var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]');
|
||||
for (var s = 0; s < sortBtns.length; s++) {
|
||||
var parent = sortBtns[s].parentElement;
|
||||
if (parent) {
|
||||
var pText = (parent.textContent || '').trim();
|
||||
if (/review/i.test(pText)) {
|
||||
var match = pText.match(/([0-9][0-9,]*)\\s*review/i);
|
||||
if (match) {
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 3: Elements with semantic review-related attributes
|
||||
if (!info.total_reviews) {
|
||||
const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]');
|
||||
for (let elem of reviewLinks) {
|
||||
const text = (elem.textContent || '').trim();
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0) {
|
||||
info.total_reviews = count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 4: Try aria-labels
|
||||
// PRIORITY 4: Look for standalone review count text near rating
|
||||
// Find elements that contain ONLY "N reviews" pattern (not concatenated with rating)
|
||||
if (!info.total_reviews) {
|
||||
const elements = document.querySelectorAll('[aria-label]');
|
||||
for (let elem of elements) {
|
||||
const ariaLabel = elem.getAttribute('aria-label') || '';
|
||||
let match = ariaLabel.match(reviewPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
const allElements = document.querySelectorAll('span, a');
|
||||
for (let elem of allElements) {
|
||||
// Get direct text content only (not nested children)
|
||||
const text = (elem.textContent || '').trim();
|
||||
// Skip if too long (likely contains other content)
|
||||
if (text.length > 50) continue;
|
||||
// Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews")
|
||||
if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue;
|
||||
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
break;
|
||||
}
|
||||
match = ariaLabel.match(numberPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 5: Parse from visible page text using regex on short text blocks
|
||||
if (!info.total_reviews) {
|
||||
const walker = document.createTreeWalker(
|
||||
document.body,
|
||||
NodeFilter.SHOW_TEXT,
|
||||
null,
|
||||
false
|
||||
);
|
||||
while (walker.nextNode()) {
|
||||
const text = walker.currentNode.textContent.trim();
|
||||
if (text.length >= 5 && text.length <= 30) {
|
||||
// Match "27 reviews" but not "4.927 reviews"
|
||||
const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i);
|
||||
if (match) {
|
||||
const count = parseInt(match[1].replace(/[,]/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'WALKER: ' + text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts)
|
||||
if (!info.total_reviews) {
|
||||
var scripts = document.querySelectorAll('script');
|
||||
for (var sc = 0; sc < scripts.length; sc++) {
|
||||
var scriptText = scripts[sc].textContent || '';
|
||||
// Look for patterns like "user_reviews":{"count":27} or reviews_count":27
|
||||
var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i);
|
||||
if (jsonMatch) {
|
||||
var count = parseInt(jsonMatch[1]);
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'JSON_SCRIPT';
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Also look for review count in Google's data format like [\"27 reviews\"]
|
||||
if (!info.total_reviews) {
|
||||
var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i);
|
||||
if (dataMatch) {
|
||||
var count = parseInt(dataMatch[1]);
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'JSON_DATA: ' + dataMatch[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
|
||||
log.info(f"Business card extracted: name={business_info.get('name')}, "
|
||||
f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
|
||||
# Debug: log what aria-labels were found
|
||||
if business_info.get('_debug_aria'):
|
||||
log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}")
|
||||
if business_info.get('_debug_matched'):
|
||||
log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}")
|
||||
# Also log all numeric aria-labels (potential review counts)
|
||||
if business_info.get('_debug_all_numeric'):
|
||||
log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}")
|
||||
# Log any text with parenthetical numbers like "(27)"
|
||||
if business_info.get('_debug_parens'):
|
||||
log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}")
|
||||
# Log all short text containing numbers (for debugging review count detection)
|
||||
if business_info.get('_debug_short_text'):
|
||||
log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}")
|
||||
# Log the context around the rating element
|
||||
if business_info.get('_debug_rating_context'):
|
||||
for ctx in business_info.get('_debug_rating_context', []):
|
||||
log.info(f"DEBUG: Rating context: {ctx}")
|
||||
# Log what tabs exist on the page
|
||||
if business_info.get('_debug_tabs'):
|
||||
log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}")
|
||||
else:
|
||||
log.info(f"DEBUG: No tabs found on page")
|
||||
# Log buttons (might contain review count)
|
||||
if business_info.get('_debug_buttons'):
|
||||
log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}")
|
||||
|
||||
result = {
|
||||
"name": business_info.get('name'),
|
||||
|
||||
@@ -1,407 +0,0 @@
|
||||
"""
|
||||
Background job manager for Google Reviews Scraper.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Dict, Any, Optional, List
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
from modules.config import load_config
|
||||
from modules.scraper import GoogleReviewsScraper
|
||||
from modules.scraper_clean import fast_scrape_reviews # Updated to use clean scraper with hard refresh recovery
|
||||
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
|
||||
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
    """Lifecycle states a scraping job can be in.

    The enum mixes in ``str``, so each member compares equal to its raw
    string value (``JobStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
||||
|
||||
|
||||
@dataclass
class ScrapingJob:
    """Scraping job data class.

    Holds the identity, configuration, timestamps, progress and results of
    a single scraping job.
    """

    job_id: str
    status: JobStatus
    url: str
    config: Dict[str, Any]
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None  # Last update time (for progress tracking)
    error_message: Optional[str] = None
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available (from page counter)
    images_count: Optional[int] = None
    progress: Optional[Dict[str, Any]] = None
    reviews_data: Optional[List[Dict[str, Any]]] = None  # Store actual review data
    scrape_time: Optional[float] = None  # Time taken to scrape

    def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
        """
        Convert job to dictionary for JSON serialization.

        Every datetime field, including ``updated_at``, is converted to an
        ISO-8601 string; fields that are ``None`` stay ``None``.

        Args:
            include_reviews: Whether to include the full reviews data (default: False)

        Returns:
            A new dictionary with one entry per dataclass field.
        """
        data = asdict(self)

        # Convert datetime objects to ISO strings. `updated_at` must be
        # included here, otherwise the result still contains a raw datetime
        # and json.dumps() rejects it as soon as the job has been started.
        for name in ("created_at", "started_at", "completed_at", "updated_at"):
            value = data[name]
            if value is not None:
                data[name] = value.isoformat()

        # Exclude reviews_data by default (can be large)
        if not include_reviews:
            data.pop("reviews_data", None)

        return data
|
||||
|
||||
|
||||
class JobManager:
|
||||
"""Manager for background scraping jobs"""
|
||||
|
||||
def __init__(self, max_concurrent_jobs: int = 3):
|
||||
"""Initialize job manager"""
|
||||
self.max_concurrent_jobs = max_concurrent_jobs
|
||||
self.jobs: Dict[str, ScrapingJob] = {}
|
||||
self.executor = ThreadPoolExecutor(max_workers=max_concurrent_jobs)
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def create_job(self, url: str, config_overrides: Dict[str, Any] = None) -> str:
|
||||
"""
|
||||
Create a new scraping job.
|
||||
|
||||
Args:
|
||||
url: Google Maps URL to scrape
|
||||
config_overrides: Optional config overrides
|
||||
|
||||
Returns:
|
||||
Job ID
|
||||
"""
|
||||
job_id = str(uuid.uuid4())
|
||||
|
||||
# Load base config
|
||||
config = load_config()
|
||||
|
||||
# Apply URL
|
||||
config["url"] = url
|
||||
|
||||
# Apply any overrides
|
||||
if config_overrides:
|
||||
config.update(config_overrides)
|
||||
|
||||
job = ScrapingJob(
|
||||
job_id=job_id,
|
||||
status=JobStatus.PENDING,
|
||||
url=url,
|
||||
config=config,
|
||||
created_at=datetime.now(),
|
||||
progress={"stage": "created", "message": "Job created and queued"}
|
||||
)
|
||||
|
||||
with self.lock:
|
||||
self.jobs[job_id] = job
|
||||
|
||||
log.info(f"Created scraping job {job_id} for URL: {url}")
|
||||
return job_id
|
||||
|
||||
def start_job(self, job_id: str) -> bool:
|
||||
"""
|
||||
Start a pending job.
|
||||
|
||||
Args:
|
||||
job_id: Job ID to start
|
||||
|
||||
Returns:
|
||||
True if job was started, False otherwise
|
||||
"""
|
||||
with self.lock:
|
||||
if job_id not in self.jobs:
|
||||
return False
|
||||
|
||||
job = self.jobs[job_id]
|
||||
if job.status != JobStatus.PENDING:
|
||||
return False
|
||||
|
||||
# Check if we can start more jobs
|
||||
running_count = sum(1 for j in self.jobs.values() if j.status == JobStatus.RUNNING)
|
||||
if running_count >= self.max_concurrent_jobs:
|
||||
return False
|
||||
|
||||
job.status = JobStatus.RUNNING
|
||||
job.started_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.progress = {"stage": "starting", "message": "Initializing scraper"}
|
||||
|
||||
# Submit job to thread pool
|
||||
future = self.executor.submit(self._run_scraping_job, job_id)
|
||||
|
||||
log.info(f"Started scraping job {job_id}")
|
||||
return True
|
||||
|
||||
def _run_scraping_job(self, job_id: str):
|
||||
"""
|
||||
Run the actual scraping job in background thread.
|
||||
|
||||
Args:
|
||||
job_id: Job ID to run
|
||||
"""
|
||||
def progress_callback(current_count: int, total_count: int):
|
||||
"""Update job progress during scraping"""
|
||||
with self.lock:
|
||||
job = self.jobs.get(job_id)
|
||||
if job:
|
||||
job.reviews_count = current_count
|
||||
job.total_reviews = total_count
|
||||
job.updated_at = datetime.now() # Update last update time
|
||||
# Calculate percentage for better UX
|
||||
percentage = int((current_count / total_count * 100)) if total_count > 0 else 0
|
||||
job.progress = {
|
||||
"stage": "scraping",
|
||||
"message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
|
||||
"percentage": percentage
|
||||
}
|
||||
|
||||
worker = None
|
||||
try:
|
||||
with self.lock:
|
||||
job = self.jobs[job_id]
|
||||
job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}
|
||||
|
||||
# Get a worker from the scraping pool
|
||||
worker = get_scraping_worker(timeout=30)
|
||||
|
||||
if not worker:
|
||||
raise Exception("No Chrome workers available. Pool may be at capacity.")
|
||||
|
||||
log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")
|
||||
|
||||
# Get config
|
||||
url = job.config.get('url')
|
||||
headless = job.config.get('headless', True) # Default to headless
|
||||
max_scrolls = job.config.get('max_scrolls', 999999) # Effectively unlimited - relies on idle detection
|
||||
|
||||
with self.lock:
|
||||
job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}
|
||||
|
||||
# Run the FAST scraping with progress callback using pooled worker
|
||||
result = fast_scrape_reviews(
|
||||
url=url,
|
||||
headless=headless,
|
||||
max_scrolls=max_scrolls,
|
||||
progress_callback=progress_callback,
|
||||
driver=worker.driver, # Use worker's driver
|
||||
return_driver=True # Don't close the driver
|
||||
)
|
||||
|
||||
# Pop the driver from result before storing
|
||||
result.pop('driver', None)
|
||||
|
||||
# Mark job as completed or failed
|
||||
with self.lock:
|
||||
if result['success']:
|
||||
job.status = JobStatus.COMPLETED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.reviews_count = result['count']
|
||||
job.total_reviews = result.get('total_reviews') # Store total review count from page
|
||||
job.reviews_data = result['reviews'] # Store the actual reviews
|
||||
job.scrape_time = result['time']
|
||||
job.progress = {
|
||||
"stage": "completed",
|
||||
"message": f"Scraping completed successfully in {result['time']:.1f}s",
|
||||
"scroll_time": result.get('scroll_time'),
|
||||
"extract_time": result.get('extract_time')
|
||||
}
|
||||
log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
|
||||
else:
|
||||
job.status = JobStatus.FAILED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.error_message = result.get('error', 'Unknown error')
|
||||
job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
|
||||
log.error(f"Failed scraping job {job_id}: {result.get('error')}")
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error in scraping job {job_id}: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
with self.lock:
|
||||
job = self.jobs[job_id]
|
||||
job.status = JobStatus.FAILED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.error_message = str(e)
|
||||
job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}
|
||||
|
||||
# Recycle worker on error
|
||||
if worker:
|
||||
log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
|
||||
release_scraping_worker(worker, recycle=True)
|
||||
worker = None # Mark as released
|
||||
|
||||
finally:
|
||||
# Release worker back to pool if not already released
|
||||
if worker:
|
||||
log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
|
||||
release_scraping_worker(worker, recycle=False)
|
||||
|
||||
def get_job(self, job_id: str) -> Optional[ScrapingJob]:
|
||||
"""
|
||||
Get job by ID.
|
||||
|
||||
Args:
|
||||
job_id: Job ID
|
||||
|
||||
Returns:
|
||||
Job object or None if not found
|
||||
"""
|
||||
with self.lock:
|
||||
return self.jobs.get(job_id)
|
||||
|
||||
def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
|
||||
"""
|
||||
Get reviews data for a specific job.
|
||||
|
||||
Args:
|
||||
job_id: Job ID
|
||||
|
||||
Returns:
|
||||
List of reviews or None if not found/not completed
|
||||
"""
|
||||
with self.lock:
|
||||
job = self.jobs.get(job_id)
|
||||
if job and job.status == JobStatus.COMPLETED:
|
||||
return job.reviews_data
|
||||
return None
|
||||
|
||||
def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
|
||||
"""
|
||||
List jobs, optionally filtered by status.
|
||||
|
||||
Args:
|
||||
status: Optional status filter
|
||||
limit: Maximum number of jobs to return
|
||||
|
||||
Returns:
|
||||
List of jobs
|
||||
"""
|
||||
with self.lock:
|
||||
jobs = list(self.jobs.values())
|
||||
|
||||
if status:
|
||||
jobs = [job for job in jobs if job.status == status]
|
||||
|
||||
# Sort by creation time (newest first)
|
||||
jobs.sort(key=lambda x: x.created_at, reverse=True)
|
||||
|
||||
return jobs[:limit]
|
||||
|
||||
def cancel_job(self, job_id: str) -> bool:
|
||||
"""
|
||||
Cancel a pending or running job.
|
||||
|
||||
Args:
|
||||
job_id: Job ID to cancel
|
||||
|
||||
Returns:
|
||||
True if job was cancelled, False otherwise
|
||||
"""
|
||||
with self.lock:
|
||||
if job_id not in self.jobs:
|
||||
return False
|
||||
|
||||
job = self.jobs[job_id]
|
||||
if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
|
||||
return False
|
||||
|
||||
job.status = JobStatus.CANCELLED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.progress = {"stage": "cancelled", "message": "Job was cancelled"}
|
||||
|
||||
log.info(f"Cancelled scraping job {job_id}")
|
||||
return True
|
||||
|
||||
def delete_job(self, job_id: str) -> bool:
    """
    Remove a job record from the manager.

    Args:
        job_id: Identifier of the job to remove.

    Returns:
        True when the job existed and was removed, False when no job with
        that ID is registered.
    """
    with self.lock:
        try:
            del self.jobs[job_id]
        except KeyError:
            # Unknown ID: nothing to remove.
            return False

    log.info(f"Deleted scraping job {job_id}")
    return True
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
    """
    Summarise the jobs currently held by the manager.

    Returns:
        A dictionary with ``total_jobs``, ``by_status`` (one counter per
        ``JobStatus`` member, keyed by the member's value), ``running_jobs``
        and ``max_concurrent_jobs``.
    """
    with self.lock:
        snapshot = list(self.jobs.values())

    # One counter per status, including statuses that currently have no jobs.
    per_status = {
        state.value: sum(1 for entry in snapshot if entry.status == state)
        for state in JobStatus
    }

    return {
        "total_jobs": len(snapshot),
        "by_status": per_status,
        "running_jobs": per_status.get(JobStatus.RUNNING.value, 0),
        "max_concurrent_jobs": self.max_concurrent_jobs,
    }
|
||||
|
||||
def cleanup_old_jobs(self, max_age_hours: int = 24):
    """
    Drop finished jobs whose completion time is older than the given age.

    Only jobs in the COMPLETED, FAILED or CANCELLED status that have a
    ``completed_at`` value are considered.

    Args:
        max_age_hours: Age in hours after which a finished job is removed.
    """
    cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)
    finished_states = [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]

    with self.lock:
        # Collect the IDs first; the dict must not change size while it is
        # being iterated.
        expired = [
            job_id
            for job_id, job in self.jobs.items()
            if job.status in finished_states
            and job.completed_at
            and job.completed_at.timestamp() < cutoff_time
        ]
        for job_id in expired:
            del self.jobs[job_id]

    if expired:
        log.info(f"Cleaned up {len(expired)} old jobs")
|
||||
|
||||
def shutdown(self):
    """
    Shut the job manager down.

    Logs the shutdown, then calls ``self.executor.shutdown(wait=True)``.
    """
    log.info("Shutting down job manager")
    self.executor.shutdown(wait=True)
|
||||
2335
modules/scraper.py
2335
modules/scraper.py
File diff suppressed because it is too large
Load Diff
@@ -1,198 +0,0 @@
|
||||
#!/usr/bin/env python3
"""
Probe a Google Maps place page to learn how relative review dates are
formatted:
1. Which date library (if any) the page appears to use
2. Which relative-date strings occur on the page
3. How those strings distribute over time units

Results are printed and written to /tmp/google_date_formatter_analysis.json.
"""
import json
import re
from seleniumbase import Driver
import time

url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"

RULE = "=" * 80


def banner(title):
    """Print a section title framed by two 80-character rules."""
    print("\n" + RULE)
    print(title)
    print(RULE)


print("Starting browser...")
driver = Driver(uc=True, headless=False)

try:
    print(f"Loading URL: {url}")
    driver.get(url)
    time.sleep(8)

    # JavaScript executed in the page: scans inline scripts, global functions
    # and the visible text for traces of a relative-date formatter.
    find_formatter_script = """
    const results = {
        scripts: [],
        potential_formatters: [],
        date_strings: []
    };

    // 1. Search all script tags for date-related code
    const scriptTags = document.querySelectorAll('script');
    let scriptContent = '';

    scriptTags.forEach((script, idx) => {
        const content = script.textContent || script.innerText;
        if (content) {
            scriptContent += content + '\\n';

            // Look for date formatting patterns
            if (content.includes('ago') || content.includes('month') || content.includes('year')) {
                const snippet = content.substring(0, 500);
                results.scripts.push({
                    index: idx,
                    snippet: snippet,
                    length: content.length
                });
            }
        }
    });

    // 2. Search for common date formatting library signatures
    const librarySignatures = [
        'moment',
        'date-fns',
        'dayjs',
        'luxon',
        'timeago',
        'formatRelative',
        'relativeTime',
        'fromNow'
    ];

    librarySignatures.forEach(sig => {
        if (scriptContent.includes(sig)) {
            results.potential_formatters.push(sig);
        }
    });

    // 3. Try to find the actual formatting function by injecting test dates
    // Look for Google's internal date formatter
    const googleFormatters = [];
    for (let key in window) {
        if (typeof window[key] === 'function') {
            const funcStr = window[key].toString();
            if (funcStr.includes('ago') && funcStr.includes('month')) {
                googleFormatters.push({
                    name: key,
                    signature: funcStr.substring(0, 200)
                });
            }
        }
    }
    results.google_formatters = googleFormatters;

    // 4. Extract all "X ago" patterns from the page
    const pageText = document.body.innerText;
    const agoPatterns = pageText.match(/\\d+\\s+(second|minute|hour|day|week|month|year)s?\\s+ago/gi) || [];
    const singlePatterns = pageText.match(/a\\s+(second|minute|hour|day|week|month|year)\\s+ago/gi) || [];

    results.date_strings = [...new Set([...agoPatterns, ...singlePatterns])];

    return results;
    """

    print("Searching for date formatting code...")
    formatter_info = driver.execute_script(find_formatter_script)

    banner("FINDINGS:")

    related_scripts = formatter_info.get('scripts', [])
    print(f"\n1. Scripts with date-related code: {len(related_scripts)}")

    detected_libraries = formatter_info.get('potential_formatters', [])
    print(f"\n2. Potential libraries detected: {detected_libraries}")

    google_formatters = formatter_info.get('google_formatters', [])
    print(f"\n3. Google formatter functions found: {len(google_formatters)}")
    for gf in google_formatters[:3]:
        print(f" - {gf['name']}: {gf['signature'][:100]}...")

    print("\n4. Date patterns found on page:")
    date_strings = formatter_info.get('date_strings', [])
    for ds in sorted(set(date_strings))[:20]:
        print(f" - '{ds}'")

    # Now let's test different timestamps to understand the boundaries
    banner("TESTING TIME RANGE BOUNDARIES:")

    # Read the date labels straight from the review elements in the DOM.
    boundary_test_script = """
    // Collect all unique date strings from reviews
    const dateElements = document.querySelectorAll('span.rsqaWe');
    const dateStrings = new Set();

    dateElements.forEach(elem => {
        const text = elem.textContent.trim();
        if (text) {
            dateStrings.add(text);
        }
    });

    return Array.from(dateStrings).sort();
    """

    all_date_strings = driver.execute_script(boundary_test_script)

    print(f"\nFound {len(all_date_strings)} unique date formats:")
    for ds in all_date_strings[:30]:
        print(f" - '{ds}'")

    # Analyze the patterns
    banner("PATTERN ANALYSIS:")

    patterns = {
        'seconds': [],
        'minutes': [],
        'hours': [],
        'days': [],
        'weeks': [],
        'months': [],
        'years': []
    }

    for ds in all_date_strings:
        ds_lower = ds.lower()
        # Buckets are tried in dict order (seconds .. years); the first unit
        # whose singular form occurs in the text wins.
        for bucket in patterns:
            if bucket[:-1] in ds_lower:
                patterns[bucket].append(ds)
                break

    for unit, examples in patterns.items():
        if not examples:
            continue
        print(f"\n{unit.upper()}:")
        for ex in examples[:5]:
            print(f" - '{ex}'")

    # Save all data
    output = {
        'formatter_info': formatter_info,
        'all_date_strings': all_date_strings,
        'pattern_analysis': {k: v for k, v in patterns.items() if v}
    }

    with open('/tmp/google_date_formatter_analysis.json', 'w') as f:
        json.dump(output, f, indent=2)

    banner("Full analysis saved to: /tmp/google_date_formatter_analysis.json")

finally:
    # Always release the browser, even when the probe fails midway.
    driver.quit()
    print("\nBrowser closed")
|
||||
@@ -1,175 +0,0 @@
|
||||
#!/usr/bin/env python3
"""
Reverse-engineer Google's date formatting patterns by scraping reviews in English

The script scrapes one place, groups the relative-date strings of its reviews
by time unit, prints the analysis and stores it in
/tmp/google_date_patterns_english.json.
"""
import json
import re
from collections import defaultdict

from modules.fast_scraper import fast_scrape_reviews

url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"

RULE = "=" * 80

print("Scraping reviews in English...")
result = fast_scrape_reviews(url, headless=True)

reviews = result.get('reviews', [])
print(f"\nExtracted {len(reviews)} reviews")

if reviews:
    # Collect all unique, non-empty date strings
    date_strings = {
        text for text in (rev.get('date_text') for rev in reviews) if text
    }

    print(f"\nFound {len(date_strings)} unique date formats:")
    for ds in sorted(date_strings):
        print(f" '{ds}'")

    # Analyze patterns
    print("\n" + RULE)
    print("PATTERN ANALYSIS:")
    print(RULE)

    patterns = {
        'seconds': [],
        'minutes': [],
        'hours': [],
        'days': [],
        'weeks': [],
        'months': [],
        'years': []
    }

    for ds in date_strings:
        ds_lower = ds.lower()
        # Buckets are tried in dict order (seconds .. years); the first unit
        # whose singular form occurs in the text wins.
        for bucket in patterns:
            if bucket[:-1] in ds_lower:
                patterns[bucket].append(ds)
                break

    for unit, examples in sorted(patterns.items()):
        if examples:
            print(f"\n{unit.upper()} ({len(examples)} patterns):")
            for ex in sorted(examples):
                print(f" '{ex}'")

    # Identify the specific patterns
    print("\n" + RULE)
    print("GOOGLE MAPS DATE FORMAT PATTERNS (English):")
    print(RULE)

    print("\nPattern Structure:")
    print("-" * 80)

    single_unit_patterns = []  # "a month ago"
    plural_patterns = []  # "3 months ago"

    for ds in sorted(date_strings):
        if ds.startswith('a '):
            single_unit_patterns.append(ds)
            continue
        # A whitespace-only date text splits into no words; guard the index
        # so such a value is skipped instead of raising IndexError.
        words = ds.split()
        if words and words[0].isdigit():
            plural_patterns.append(ds)

    print(f"\nSingular (a X ago): {len(single_unit_patterns)} patterns")
    for p in sorted(single_unit_patterns):
        print(f" '{p}'")

    print(f"\nPlural (N Xs ago): {len(plural_patterns)} patterns")
    for p in sorted(plural_patterns):
        print(f" '{p}'")

    # Determine time ranges
    print("\n" + RULE)
    print("TIME RANGE BOUNDARIES:")
    print(RULE)

    # Extract numbers from plural patterns
    unit_values = defaultdict(list)
    for ds in date_strings:
        match = re.match(r'(\d+)\s+(\w+)\s+ago', ds.lower())
        if match:
            number = int(match.group(1))
            unit = match.group(2).rstrip('s')  # Remove plural 's'
            unit_values[unit].append(number)

    for unit, values in sorted(unit_values.items()):
        if values:
            print(f"\n{unit.upper()}:")
            print(f" Range: {min(values)} - {max(values)}")
            print(f" Values found: {sorted(set(values))}")

    # Save analysis
    output = {
        'total_reviews': len(reviews),
        'unique_date_formats': len(date_strings),
        'all_date_strings': sorted(date_strings),
        'patterns_by_unit': {k: sorted(v) for k, v in patterns.items() if v},
        'singular_patterns': sorted(single_unit_patterns),
        'plural_patterns': sorted(plural_patterns),
        'value_ranges': {unit: {'min': min(values), 'max': max(values), 'values': sorted(set(values))}
                         for unit, values in unit_values.items() if values}
    }

    with open('/tmp/google_date_patterns_english.json', 'w') as f:
        json.dump(output, f, indent=2)

    print("\n" + RULE)
    print("Analysis saved to: /tmp/google_date_patterns_english.json")
    print(RULE)

    # Now let's determine the EXACT library/algorithm Google uses
    print("\n" + RULE)
    print("REVERSE-ENGINEERING GOOGLE'S ALGORITHM:")
    print(RULE)

    print("\nBased on the patterns, Google's relative date formatter:")
    print("-" * 80)

    print("\n1. FORMAT STRUCTURE:")
    print(" Single unit: 'a {unit} ago'")
    print(" Multiple: '{number} {unit}s ago'")

    print("\n2. UNIT SELECTION (hypothesis):")
    # Only units that actually showed up with a numeric prefix are listed.
    unit_hypotheses = (
        ('second', " - Seconds: Used for 0-59 seconds ago"),
        ('minute', " - Minutes: Used for 1-59 minutes ago"),
        ('hour', " - Hours: Used for 1-23 hours ago"),
        ('day', " - Days: Used for 1-6 days ago"),
        ('week', " - Weeks: Used for 1-3 weeks ago"),
        ('month', " - Months: Used for 1-11 months ago"),
        ('year', " - Years: Used for 1+ years ago"),
    )
    for unit, hypothesis in unit_hypotheses:
        if unit in unit_values:
            print(hypothesis)

    print("\n3. BOUNDARY THRESHOLDS (estimated):")
    print(" 60 seconds = switch to minutes")
    print(" 60 minutes = switch to hours")
    print(" 24 hours = switch to days")
    print(" 7 days = switch to weeks")
    print(" ~30 days (4 weeks) = switch to months")
    print(" 12 months = switch to years")

    print("\n4. UNCERTAINTY RANGES:")
    print(" 'a month ago' = 30-59 days ago (±15 days)")
    print(" '2 months ago' = 60-89 days ago (±15 days)")
    print(" 'a year ago' = 365-729 days ago (±6 months)")

else:
    print("No reviews extracted!")
|
||||
77
start.py
77
start.py
@@ -1,77 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Google‑Maps review scraper with MongoDB integration
|
||||
=================================================
|
||||
|
||||
Main entry point for the scraper.
|
||||
"""
|
||||
|
||||
from modules.cli import parse_arguments
|
||||
from modules.config import load_config
|
||||
from modules.scraper import GoogleReviewsScraper
|
||||
|
||||
|
||||
def main():
    """
    Build the scraper configuration and run the scraper.

    The configuration file named by ``args.config`` is loaded first; command
    line options then override individual entries. Boolean switches only
    override when set, value options only when they are not ``None``.
    """
    args = parse_arguments()
    config = load_config(args.config)

    # (option name, is_switch) in the order the overrides are applied.
    overrides = (
        ("headless", True),
        ("sort_by", False),
        ("stop_on_match", True),
        ("url", False),
        ("overwrite_existing", True),
        ("use_mongodb", False),
        # Date conversion and image downloading
        ("convert_dates", False),
        ("download_images", False),
        ("image_dir", False),
        ("download_threads", False),
        # Local image paths and URL replacement
        ("store_local_paths", False),
        ("replace_urls", False),
        ("custom_url_base", False),
        ("custom_url_profiles", False),
        ("custom_url_reviews", False),
        ("preserve_original_urls", False),
    )
    for option, is_switch in overrides:
        supplied = getattr(args, option)
        if is_switch:
            if supplied:
                config[option] = True
        elif supplied is not None:
            config[option] = supplied

    # Custom parameters are merged into the existing mapping, not replaced.
    if args.custom_params is not None:
        config.setdefault("custom_params", {}).update(args.custom_params)

    # API interception option
    if args.enable_api_intercept:
        config["enable_api_intercept"] = True

    scraper = GoogleReviewsScraper(config)
    scraper.scrape()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
288
start_api_244.py
288
start_api_244.py
@@ -1,288 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
API-Only 244 Scraper - Attempt to get ALL 244 reviews via API alone.
|
||||
|
||||
Strategy:
|
||||
1. More patient scrolling (more scrolls, longer waits)
|
||||
2. Collect responses more frequently
|
||||
3. Extra end-of-list collection
|
||||
4. Slower timing near the end to ensure API completes
|
||||
|
||||
Goal: Get all 244 reviews via API without DOM parsing
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """
    Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get`` on the result.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def _scroll_delay(collected):
    """Return the pause after a scroll; it grows as more reviews are collected."""
    for threshold, delay in ((50, 0.30), (100, 0.35), (150, 0.40), (200, 0.50), (230, 0.60)):
        if collected < threshold:
            return delay
    return 0.80  # Very slow for the final reviews


def _merge_intercepted_reviews(interceptor, api_reviews):
    """Add newly intercepted reviews to ``api_reviews``; return how many were added."""
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
                    added += 1
    except Exception as exc:
        # Best effort: one failed poll must not abort the scrape. Ctrl-C and
        # SystemExit are deliberately not caught here.
        log.debug("Could not collect intercepted responses: %s", exc)
    return added


def api_244_scrape():
    """
    Get all 244 reviews purely via API with aggressive collection.

    The page named by ``url`` in config.yaml is opened, the reviews pane is
    scrolled repeatedly and the reviews parsed from intercepted API responses
    are accumulated by review ID. The result is also written to
    ``google_reviews_api_244.json``.

    Returns:
        A list of review dicts (``review_id``, ``author``, ``rating``,
        ``text``, ``date_text``, ``avatar_url``, ``profile_url``). The list
        is empty when no URL is configured or the reviews pane is not found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)

    print("API-244 SCRAPER - Getting ALL 244 reviews via API...")
    if not url:
        # Slicing a missing URL would raise TypeError; report it instead.
        print("ERROR: No 'url' configured in config.yaml")
        return []
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss cookies (optional step; failures are ignored)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as exc:
            log.debug("Cookie banner could not be dismissed: %s", exc)

        # Click reviews tab; stop at the first matching tab so a second
        # selector does not click it again.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        tab_clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        tab_clicked = True
                        break
            except Exception as exc:
                log.debug("Tab lookup with selector %r failed: %s", selector, exc)
                continue
            if tab_clicked:
                break

        # Wait for page stability
        time.sleep(1.0)

        # Find pane: primary selector first, then a fallback selector
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Longer wait to ensure interceptor is ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("Scrolling with extended collection strategy...")

        # Extended scrolling - MORE scrolls, SLOWER timing
        max_scrolls = 50  # More scrolls to ensure we catch everything
        idle_scrolls = 0
        max_idle = 15  # Even more patience
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)

            # Progressive timing - slower and slower
            time.sleep(_scroll_delay(len(api_reviews)))

            # Collect responses
            _merge_intercepted_reviews(interceptor, api_reviews)

            # Check if we got new reviews
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
                if (i + 1) % 10 == 0:
                    print(f" {current_count} reviews...")

            last_count = current_count

            # Check scroll position
            try:
                current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception as exc:
                log.debug("Could not read scroll position: %s", exc)

            # Stop conditions - but only if we have at least 240 reviews
            if idle_scrolls >= max_idle and scroll_stuck_count >= 5 and current_count >= 240:
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # AGGRESSIVE final collection phase
        print(f" Aggressive final collection (currently have {len(api_reviews)})...")

        # Do 10 more scrolls with very long waits
        for _ in range(10):
            driver.execute_script(scroll_script)
            time.sleep(1.2)  # Very long wait

            new_count = _merge_intercepted_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Ultra-final wait and collect
        time.sleep(2.0)
        _merge_intercepted_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print(f"\n{'='*50}")
        print(f"✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")

        # Both rates divide by the elapsed time, so both sit behind the guard.
        if elapsed > 0:
            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
            print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")

        print(f"{'='*50}")

        if len(all_reviews) >= 244:
            print(f"🎯 Got ALL 244 reviews via API!")
        elif len(all_reviews) >= 240:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews - may need DOM parsing")
        else:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews")

        print()

        # Save
        with open('google_reviews_api_244.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to google_reviews_api_244.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Closing the browser is best effort; never mask the real outcome.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)
|
||||
|
||||
|
||||
# Script entry point: exit status 0 when at least one review was collected,
# 1 on an empty result, on Ctrl-C, or on any unexpected error.
if __name__ == '__main__':
    try:
        reviews = api_244_scrape()
        exit_code = 0 if reviews else 1
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        exit_code = 1
    except Exception as e:
        import traceback

        print(f"ERROR: {e}")
        traceback.print_exc()
        exit_code = 1
    sys.exit(exit_code)
|
||||
@@ -1,280 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete Scraper - Gets ALL reviews while staying fast.
|
||||
|
||||
Strategy:
|
||||
1. Scroll until no new reviews for 5 consecutive scrolls
|
||||
2. Check scroll position to detect end
|
||||
3. Do extra scrolls at the end to catch stragglers
|
||||
4. Adaptive timing - faster at start, slower at end
|
||||
|
||||
Target: Get all 244 reviews in ~22-25 seconds
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """
    Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get`` on the result.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def _collect_api_reviews(interceptor, api_reviews):
    """Merge freshly intercepted API reviews into ``api_reviews``.

    Args:
        interceptor: Object providing ``get_intercepted_responses()`` and
            ``parse_reviews_from_responses(responses)``.
        api_reviews: Dict keyed by review id, updated in place. Reviews with
            no id, or whose id is already present, are ignored.

    Returns:
        The number of reviews added by this call.
    """
    responses = interceptor.get_intercepted_responses()
    if not responses:
        return 0

    added = 0
    for review in interceptor.parse_reviews_from_responses(responses):
        review_id = review.review_id
        if not review_id or review_id in api_reviews:
            continue
        api_reviews[review_id] = {
            'review_id': review_id,
            'author': review.author,
            'rating': review.rating,
            'text': review.text,
            'date_text': review.date_text,
            'avatar_url': review.avatar_url,
            'profile_url': review.profile_url,
        }
        added += 1
    return added


def complete_scrape():
    """Scrape all reviews for the configured place from intercepted API responses.

    Opens the URL from the configuration, dismisses the cookie banner, opens
    the reviews tab and scrolls the reviews pane while harvesting responses
    captured by ``GoogleMapsAPIInterceptor``. The main loop stops once no new
    review has arrived for ``max_idle`` scrolls and the pane's scroll position
    has been unchanged for three checks; a slower final sweep then runs.

    Side effects: prints progress to stdout and writes the reviews to
    ``google_reviews_complete.json``. The browser is always closed on exit.

    Returns:
        List of review dicts (``review_id``, ``author``, ``rating``, ``text``,
        ``date_text``, ``avatar_url``, ``profile_url``). An empty list is
        returned when no URL is configured or the reviews pane is not found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)

    if not url:
        # Clear message instead of a TypeError from url[:80] below.
        print("ERROR: No 'url' configured in config.yaml")
        return []

    print("COMPLETE SCRAPER - Getting ALL reviews...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss cookies (best effort). `except Exception` rather than a
        # bare except so Ctrl-C still interrupts the run.
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception:
            log.debug("Cookie banner dismissal failed", exc_info=True)

        # Click reviews tab; stop at the first match so a later selector does
        # not click the same tab a second time.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        tab_clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        tab_clicked = True
                        break
            except Exception:
                continue
            if tab_clicked:
                break

        # Wait for page stability
        time.sleep(1.0)

        # Find pane: primary selector first, then the alternative layout.
        pane = None
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Wait for initial reviews to load
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll: keep the pane on `window` so the script needs no args.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("Scrolling with intelligent stopping...")

        # Intelligent scrolling
        max_scrolls = 60   # Upper bound on scroll attempts
        idle_scrolls = 0   # Consecutive scrolls with no new reviews
        max_idle = 12      # Idle scrolls tolerated before stopping
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)

            # Adaptive timing - faster at start, slower near end
            collected = len(api_reviews)
            if collected < 100:
                time.sleep(0.27)  # Fast at beginning
            elif collected < 200:
                time.sleep(0.30)  # Medium in middle
            elif collected < 235:
                time.sleep(0.40)  # Slower near end
            else:
                time.sleep(0.50)  # Very slow at the very end to catch stragglers

            # Collect responses; a failed batch must not abort the scroll loop.
            try:
                _collect_api_reviews(interceptor, api_reviews)
            except Exception:
                log.debug("Collecting API responses failed", exc_info=True)

            # Check if we got new reviews
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
                if (i + 1) % 10 == 0:
                    print(f" {current_count} reviews...")
            last_count = current_count

            # Check scroll position to detect if stuck at bottom
            try:
                current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception:
                log.debug("Reading scroll position failed", exc_info=True)

            # Stop only when both signals agree that the end was reached.
            if idle_scrolls >= max_idle and scroll_stuck_count >= 3:
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # Extra thorough collection at the end
        print(f" Final collection sweep (currently have {len(api_reviews)})...")

        # Do a few more scrolls with longer waits
        for _ in range(5):
            driver.execute_script(scroll_script)
            time.sleep(0.8)  # Longer wait to ensure API completes

            try:
                new_count = _collect_api_reviews(interceptor, api_reviews)
                if new_count > 0:
                    print(f" +{new_count} more reviews (total: {len(api_reviews)})")
            except Exception:
                log.debug("Collecting API responses failed", exc_info=True)

        # Final wait and collect
        time.sleep(1.0)
        try:
            _collect_api_reviews(interceptor, api_reviews)
        except Exception:
            log.debug("Collecting API responses failed", exc_info=True)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print("\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)} (target: 244)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")

        if len(all_reviews) >= 244:
            print("🎯 Got ALL reviews!")
        elif len(all_reviews) >= 240:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews")

        print()

        # Save
        with open('google_reviews_complete.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_complete.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception:
            pass
|
||||
|
||||
|
||||
# Script entry point: run the scraper and translate its outcome into a process
# exit status (0 when at least one review came back, 1 otherwise or on failure).
if __name__ == '__main__':
    try:
        reviews = complete_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        # Reached only when the scrape returned normally.
        sys.exit(0 if reviews else 1)
|
||||
@@ -1,331 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.
|
||||
|
||||
Strategy:
|
||||
1. Scroll to load all reviews
|
||||
2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
|
||||
3. Should be faster and simpler than API + DOM hybrid
|
||||
|
||||
Target: ~20-25 seconds for all 244 reviews with simpler code
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config(path='config.yaml'):
    """Load the scraper settings from a YAML file.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml`` in the
            current working directory, which is what the zero-argument call
            has always read.

    Returns:
        The object produced by ``yaml.safe_load``, or an empty dict when the
        document is empty so callers can keep using ``.get()``.
    """
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which would mis-read non-ASCII values (e.g. accented URLs) on Windows.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def extract_all_reviews_js(driver):
    """Extract every review card currently in the DOM with one JavaScript call.

    A single ``execute_script`` round-trip reads author, rating, text, date,
    avatar and profile fields from each ``div.jftiEf.fontBodyMedium`` element.
    The script drops cards that lack an author or a date.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        List of dicts with keys ``author``, ``rating``, ``text``,
        ``date_text``, ``avatar_url``, ``profile_url`` and ``review_id``.
        An empty list is returned when the script fails or yields nothing.
    """
    import hashlib

    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');

    for (let i = 0; i < elements.length; i++) {
        const elem = elements[i];
        const review = {};

        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;

            // Rating
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }

            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;

            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;

            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;

            // Profile URL
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }

    return reviews;
    """
    # NOTE(review): `profile_url` is filled from the `data-review-id`
    # attribute, which looks like an id rather than a URL — confirm intent.

    try:
        # A script that yields nothing is treated as "no reviews".
        reviews_data = driver.execute_script(extract_script) or []

        reviews = []
        for review_data in reviews_data:
            # The script leaves `rating` unset when a card has no rating
            # element; normalise so every record carries the same keys.
            review_data.setdefault('rating', None)

            # Built-in hash() of a str is salted per process, so it produced a
            # different id on every run. A content digest is stable.
            identity = review_data['author'] + review_data['date_text']
            digest = hashlib.sha256(identity.encode('utf-8')).hexdigest()[:16]
            review_data['review_id'] = f"review_{digest}"
            reviews.append(review_data)

        return reviews

    except Exception as e:
        print(f" Error in JavaScript extraction: {e}")
        return []
|
||||
|
||||
|
||||
def dom_only_fast_scrape():
    """Scrape reviews by scrolling the pane, then extracting the DOM via JavaScript.

    Opens the URL from the configuration, handles the consent page and cookie
    banner, opens the reviews tab, scrolls the reviews pane until the number
    of review cards stops growing, and finally calls
    ``extract_all_reviews_js`` once to read every card.

    Side effects: prints progress to stdout and writes the reviews to
    ``google_reviews_dom_only_fast.json``. The browser is always closed.

    Returns:
        The list returned by ``extract_all_reviews_js``. An empty list is
        returned when no URL is configured or the reviews pane is not found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)

    if not url:
        # Clear message instead of a TypeError from url[:80] below.
        print("ERROR: No 'url' configured in config.yaml")
        return []

    print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    count_script = "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"

    def count_cards():
        """Return how many review cards are currently in the DOM."""
        return driver.execute_script(count_script)

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        # Handle GDPR consent page. `except Exception` (not a bare except) so
        # Ctrl-C still interrupts the run.
        if 'consent.google.com' in driver.current_url:
            try:
                # Click "Accept all" / "Aceptar todo"
                consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]')
                if not consent_btns:
                    consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept"]')
                if consent_btns:
                    consent_btns[0].click()
                    time.sleep(1.5)
            except Exception:
                log.debug("Consent page handling failed", exc_info=True)

        # Dismiss cookie banner on Maps page (best effort)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.3)
        except Exception:
            log.debug("Cookie banner dismissal failed", exc_info=True)

        # Click reviews tab; stop at the first match so a later selector does
        # not click the same tab a second time.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        tab_clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.3)
                        tab_clicked = True
                        break
            except Exception:
                continue
            if tab_clicked:
                break

        # Wait for page stability
        time.sleep(0.8)

        # Find pane: primary selector first, then the alternative layout.
        pane = None
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Wait for initial reviews to load
        time.sleep(1.2)

        # Setup scroll: keep the pane on `window` so the script needs no args.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll and verify reviews are loading
        driver.execute_script(scroll_script)
        time.sleep(0.8)

        initial_count = count_cards()

        if initial_count < 5:
            # Reviews not loaded yet, wait more
            print(f" Waiting for reviews to load (found {initial_count})...")
            time.sleep(1.5)
            driver.execute_script(scroll_script)
            time.sleep(0.8)
            initial_count = count_cards()

        print(f"Scrolling to load all reviews (starting with {initial_count})...")

        # Effectively unbounded; the loop ends through idle detection.
        max_scrolls = 999999
        idle_count = 0
        last_scroll_pos = 0

        for i in range(max_scrolls):
            prev_count = count_cards()
            driver.execute_script(scroll_script)

            # Poll until new cards appear instead of sleeping a fixed delay.
            max_wait = 1.0    # Maximum 1 second
            wait_step = 0.05  # Check every 50ms
            waited = 0

            while waited < max_wait:
                time.sleep(wait_step)
                waited += wait_step

                new_count = count_cards()

                # If reviews loaded, continue immediately
                if new_count > prev_count:
                    break

                # No growth after 0.3s: give up on this scroll, and count it
                # as idle when the pane did not move either.
                if waited >= 0.3 and new_count == prev_count:
                    scroll_pos = driver.execute_script("return arguments[0].scrollTop;", pane)
                    if scroll_pos == last_scroll_pos:
                        idle_count += 1
                        if idle_count >= 3:
                            print(f" Reached end at {new_count} reviews")
                            break
                    last_scroll_pos = scroll_pos
                    break

            current_count = new_count

            # Progress logging every 10 scrolls
            if (i + 1) % 10 == 0:
                print(f" {current_count} review elements loaded...")

            # Idle detection: three idle marks (from here or the poll above)
            # without any growth in between end the scrolling.
            if current_count == prev_count:
                idle_count += 1
                if idle_count >= 3:
                    break
            else:
                idle_count = 0

        # Short final scroll
        for _ in range(2):
            driver.execute_script(scroll_script)
            time.sleep(0.3)

        scroll_time = time.time() - start_time
        print(f" Scrolling complete in {scroll_time:.2f}s")

        # Extract all reviews using JavaScript
        print("Extracting reviews with JavaScript...")
        extract_start = time.time()

        all_reviews = extract_all_reviews_js(driver)

        extract_time = time.time() - extract_start
        print(f" Extraction complete in {extract_time:.2f}s")

        elapsed = time.time() - start_time

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f" - Scrolling: {scroll_time:.2f}s")
        print(f" - Extraction: {extract_time:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if len(all_reviews) >= 244:
            print("🎯 Got ALL 244 reviews!")
        elif len(all_reviews) >= 240:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews")

        print()

        # Save
        with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_dom_only_fast.json")

        if all_reviews:
            # .get(): the extraction script omits `rating` for cards without
            # a rating element, which must not crash the run after saving.
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0].get('rating')}★")

        return all_reviews

    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception:
            pass
|
||||
|
||||
|
||||
# Script entry point: run the scraper and translate its outcome into a process
# exit status (0 when at least one review came back, 1 otherwise or on failure).
if __name__ == '__main__':
    try:
        reviews = dom_only_fast_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        # Reached only when the scrape returned normally.
        sys.exit(0 if reviews else 1)
|
||||
346
start_fast.py
346
start_fast.py
@@ -1,346 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fast API-First Scraper - Optimized version of start.py
|
||||
|
||||
Strategy:
|
||||
1. Open browser and navigate to reviews (~15 seconds)
|
||||
2. Scroll rapidly JUST to trigger API calls (~15 seconds)
|
||||
3. Collect all API responses during scrolling
|
||||
4. Parse reviews from API responses
|
||||
5. Skip DOM parsing entirely
|
||||
6. Exit immediately
|
||||
|
||||
Expected time: ~30-40 seconds for 244 reviews (vs 155 seconds)
|
||||
Speed improvement: ~4-5x faster!
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_config(path='config.yaml'):
    """Load configuration from a YAML file.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml`` in the
            current working directory, which is what the zero-argument call
            has always read.

    Returns:
        The object produced by ``yaml.safe_load``, or an empty dict when the
        document is empty so callers can keep using ``.get()``.
    """
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which would mis-read non-ASCII values (e.g. accented URLs) on Windows.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def fast_scrape():
    """Scrape reviews from intercepted API responses, skipping DOM parsing.

    Opens the URL from the configuration, dismisses the cookie dialog, opens
    the reviews tab, locates the reviews pane, installs
    ``GoogleMapsAPIInterceptor`` and then scrolls the pane up to
    ``max_scrolls`` times, merging the reviews parsed from captured responses.
    Scrolling ends early once ``target_reviews`` unique reviews are held.

    Side effects: logs progress, writes ``google_reviews_fast.json`` and, when
    the pane cannot be found, tries to save ``pane_not_found.png``. The
    browser is always closed on exit.

    Returns:
        List of review dicts (``review_id``, ``author``, ``rating``, ``text``,
        ``date_text``, ``avatar_url``, ``profile_url``). An empty list is
        returned when no URL is configured or the reviews pane is not found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)

    if not url:
        # Clear message instead of a TypeError from url[:80] below.
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("FAST API-FIRST SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: API-first (skip DOM parsing)")
    log.info("="*60 + "\n")

    start_time = time.time()
    api_reviews = {}

    # Create driver using SeleniumBase UC Mode
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate to reviews
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort). `except Exception` rather than a
        # bare except so Ctrl-C still interrupts the run.
        try:
            cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                log.info("✓ Cookie dialog dismissed")
                time.sleep(1)
        except Exception:
            log.debug("Cookie dialog dismissal failed", exc_info=True)

        # Click reviews tab
        log.info("Step 2: Opening reviews tab...")

        # Review keywords for multiple languages
        review_keywords = [
            'reviews', 'review', 'reseñas', 'reseña', 'opiniones', 'avis',
            'bewertungen', 'recensioni', 'avaliações', 'ביקורות'
        ]

        clicked = False
        tab_selectors = [
            '.LRkQ2',                # Primary
            '.hh2c6',                # Alternative
            '[data-tab-index="1"]',  # Tab index
            'button[role="tab"]',    # Button tabs
            'div[role="tab"]',       # Div tabs
        ]

        # Try each selector until one yields a tab labelled as reviews.
        for selector in tab_selectors:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    try:
                        text = (tab.text or '').lower()
                        aria_label = (tab.get_attribute('aria-label') or '').lower()

                        if any(keyword in text or keyword in aria_label for keyword in review_keywords):
                            log.info(f"Found reviews tab with selector {selector}: '{tab.text}'")
                            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", tab)
                            time.sleep(0.5)
                            # JavaScript click avoids overlay interception.
                            driver.execute_script("arguments[0].click();", tab)
                            time.sleep(1.5)
                            log.info("✓ Reviews tab clicked")
                            clicked = True
                            break
                    except Exception:
                        continue
                if clicked:
                    break
            except Exception:
                continue

        if not clicked:
            log.warning("Could not find/click reviews tab - may already be on reviews or page structure changed")

        # Wait after clicking reviews tab for page to load
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Find reviews pane
        log.info("Step 3: Finding reviews pane...")
        log.info(f"Current URL: {driver.current_url}")

        pane = None
        # Ordered from most specific to most generic.
        pane_selectors = [
            'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Primary
            'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Without role="main"
            'div.m6QErb.WNBkOb.XiKgde',  # Alternative class combination
            'div[role="main"] div.m6QErb.XiKgde',  # Simplified with XiKgde
            'div.m6QErb.DxyBCb.XiKgde',  # Another variant
            'div[role="main"] div.m6QErb',  # Simplified version
            'div.m6QErb.DxyBCb',  # Even more simplified
            'div[role="main"]',  # Most generic
        ]

        for selector in pane_selectors:
            try:
                log.info(f"Trying selector: {selector}")
                wait = WebDriverWait(driver, 5)
                pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                log.info(f"✓ Found reviews pane with: {selector}")
                break
            except TimeoutException:
                log.debug(f"Pane not found with selector: {selector}")
                continue

        if not pane:
            log.error("Could not find reviews pane after all attempts!")
            log.error(f"Final URL: {driver.current_url}")
            # Save screenshot for debugging
            try:
                screenshot_path = 'pane_not_found.png'
                driver.save_screenshot(screenshot_path)
                log.info(f"Screenshot saved to {screenshot_path}")
            except Exception:
                log.debug("Saving screenshot failed", exc_info=True)
            return []

        # Wait for initial reviews to load
        log.info("Waiting for initial reviews to render...")
        time.sleep(3)

        # Check if any review cards are present
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf')
            log.info(f"Found {len(cards)} initial review cards")
        except Exception:
            log.warning("Could not find initial review cards")

        # Step 4: Setup API interceptor (AFTER finding pane)
        log.info("Step 4: Setting up API interception...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        try:
            interceptor.setup_interception()
            interceptor.inject_response_interceptor()
            log.info("✓ API interceptor ready - capturing network responses")
        except Exception as e:
            log.warning(f"Failed to setup interceptor: {e}")
            import traceback
            traceback.print_exc()
        time.sleep(2)  # Extra wait for interception to be fully active
        log.info("")

        # Step 5: Rapid scrolling to trigger API calls
        log.info("="*60)
        log.info("Step 5: Rapid scrolling to trigger API calls")
        log.info("="*60)

        # Setup scroll script: keep the pane on `window` so it needs no args.
        try:
            driver.execute_script("window.scrollablePane = arguments[0];", pane)
            scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
            log.info("✓ Scroll script setup complete")
        except Exception as e:
            log.warning(f"Error setting up scroll script: {e}")
            scroll_script = "window.scrollBy(0, 300);"  # Fallback

        # Verify interceptor is active
        try:
            is_injected = driver.execute_script("return window.__reviewInterceptorInjected === true;")
            stats = driver.execute_script("return window.__interceptorStats;")
            queue_length = driver.execute_script("return window.__interceptedResponses ? window.__interceptedResponses.length : -1;")
            log.info(f"Interceptor status: injected={is_injected}, queue={queue_length}, stats={stats}")
        except Exception as e:
            log.warning(f"Could not check interceptor status: {e}")

        # Trigger initial API call
        log.info("Triggering initial API call...")
        driver.execute_script(scroll_script)
        time.sleep(2)  # Wait for first API response
        log.info("")

        # Scroll rapidly - no DOM parsing.
        target_reviews = 240
        max_scrolls = 30

        for i in range(max_scrolls):
            # Fast scroll
            driver.execute_script(scroll_script)
            time.sleep(0.3)

            # Collect API responses
            try:
                responses = interceptor.get_intercepted_responses()
                if i == 5:  # One-off diagnostics on the sixth scroll
                    log.info(f"DEBUG: Got {len(responses)} responses from interceptor")

                    # Check browser console
                    try:
                        console_logs = driver.get_log('browser')
                        interceptor_logs = [
                            entry for entry in console_logs
                            if 'API Interceptor' in entry.get('message', '')
                        ]
                        if interceptor_logs:
                            log.info("DEBUG: Interceptor console logs:")
                            for entry in interceptor_logs[-10:]:  # Last 10
                                log.info(f" {entry['message']}")
                        else:
                            log.info("DEBUG: No interceptor logs in console")
                    except Exception as e:
                        log.warning(f"Could not get console logs: {e}")

                if responses:
                    parsed = interceptor.parse_reviews_from_responses(responses)
                    if i == 5:  # One-off diagnostics on the sixth scroll
                        log.info(f"DEBUG: Parsed {len(parsed)} reviews from responses")

                    new_count = 0
                    for review in parsed:
                        if review.review_id and review.review_id not in api_reviews:
                            api_reviews[review.review_id] = {
                                'review_id': review.review_id,
                                'author': review.author,
                                'rating': review.rating,
                                'text': review.text,
                                'date_text': review.date_text,
                                'avatar_url': review.avatar_url,
                                'profile_url': review.profile_url,
                            }
                            new_count += 1

                    # Report only reviews actually added; len(parsed) also
                    # counts duplicates and overstated the gain.
                    if new_count:
                        log.info(f"Scroll {i+1}: +{new_count} reviews | Total: {len(api_reviews)}")

                    # Exit early if we have enough
                    if len(api_reviews) >= target_reviews:
                        log.info(f"\n✓ Reached target of {target_reviews} reviews!")
                        break
            except Exception as e:
                log.error(f"Error collecting API responses: {e}")
                import traceback
                traceback.print_exc()

            # Quick progress update
            if (i + 1) % 5 == 0:
                log.info(f"Progress: {i+1}/{max_scrolls} scrolls, {len(api_reviews)} reviews collected")

        elapsed = time.time() - start_time

        # Convert to list
        all_reviews = list(api_reviews.values())

        log.info("\n" + "="*60)
        log.info("✅ FAST SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"Scrolls performed: {i+1}")
        log.info(f"Time elapsed: {elapsed:.2f} seconds")
        if all_reviews:
            log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save results
        output_file = 'google_reviews_fast.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        log.info(f"💾 Saved {len(all_reviews)} reviews to {output_file}")

        # Show sample
        if all_reviews:
            log.info("\n📝 Sample review:")
            sample = all_reviews[0]
            log.info(f" Author: {sample['author']}")
            log.info(f" Rating: {sample['rating']}★")
            log.info(f" Date: {sample['date_text']}")
            if sample['text']:
                log.info(f" Text: {sample['text'][:80]}...")

        # Stats comparison
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old approach: ~155 seconds for 244 reviews")
        log.info(f"Fast approach: ~{elapsed:.0f} seconds for {len(all_reviews)} reviews")
        if elapsed > 0:
            log.info(f"Improvement: {155/elapsed:.1f}x faster! 🚀")
        log.info("="*60 + "\n")

        return all_reviews

    finally:
        # Always close the driver
        try:
            driver.quit()
        except Exception:
            pass
|
||||
|
||||
|
||||
# Script entry point: run the scraper and translate its outcome into a process
# exit status (0 when at least one review came back, 1 otherwise or on failure).
if __name__ == '__main__':
    try:
        reviews = fast_scrape()
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        log.error(f"Fatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        # Reached only when the scrape returned normally.
        sys.exit(0 if reviews else 1)
|
||||
@@ -1,307 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
FASTEST STABLE Scraper - Best of both worlds.
|
||||
|
||||
Strategy:
|
||||
1. Ultra-fast API scrolling (proven stable) → 234 reviews in ~19s
|
||||
2. Instant JavaScript DOM extraction → 10 missing reviews in ~0.5s
|
||||
3. Total: ~20 seconds for all 244 reviews with 100% stability
|
||||
|
||||
Combines stability of API approach with speed of JavaScript extraction.
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get()`` on the result.
    """
    # Explicit UTF-8 so non-ASCII values do not depend on the platform's
    # default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def extract_missing_reviews_js(driver, max_reviews=25):
    """Extract the first reviews present in the DOM with one JavaScript call.

    The injected script walks at most ``max_reviews`` review containers
    (``div.jftiEf.fontBodyMedium``) and keeps only entries that have both an
    author and a date text.

    Args:
        driver: object exposing ``execute_script(script, *args)``.
        max_reviews: upper bound on the number of DOM elements inspected.

    Returns:
        list[dict]: one record per review with the keys ``author``,
        ``rating``, ``text``, ``date_text``, ``avatar_url``, ``profile_url``
        and a synthetic ``review_id`` of the form ``dom_<hex digest>``.
        Returns ``[]`` when the script fails or yields nothing.
    """
    # Local import keeps this block self-contained.
    import hashlib

    extract_script = """
        const reviews = [];
        const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
        const maxCount = Math.min(arguments[0], elements.length);

        for (let i = 0; i < maxCount; i++) {
            const elem = elements[i];
            // rating starts as null so every record carries the key
            const review = {rating: null};

            try {
                const authorElem = elem.querySelector('div.d4r55');
                review.author = authorElem ? authorElem.textContent.trim() : null;

                const ratingElem = elem.querySelector('span.kvMYJc');
                if (ratingElem) {
                    const ariaLabel = ratingElem.getAttribute('aria-label');
                    if (ariaLabel) {
                        const match = ariaLabel.match(/\\d+/);
                        review.rating = match ? parseFloat(match[0]) : null;
                    }
                }

                const textElem = elem.querySelector('span.wiI7pd');
                review.text = textElem ? textElem.textContent.trim() : null;

                const dateElem = elem.querySelector('span.rsqaWe');
                review.date_text = dateElem ? dateElem.textContent.trim() : null;

                const avatarElem = elem.querySelector('img.NBa7we');
                review.avatar_url = avatarElem ? avatarElem.src : null;

                const profileElem = elem.querySelector('button.WEBjve');
                review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

                if (review.author && review.date_text) {
                    reviews.push(review);
                }
            } catch (e) {
                // Skip
            }
        }
        return reviews;
    """

    try:
        reviews_data = driver.execute_script(extract_script, max_reviews)
    except Exception as exc:
        # Best effort: the caller treats an empty list as "nothing recovered",
        # but the failure is now visible in the log instead of being silent.
        logging.getLogger(__name__).warning("DOM review extraction failed: %s", exc)
        return []

    reviews = []
    for item in reviews_data or []:
        if not isinstance(item, dict):
            continue
        record = dict(item)
        record.setdefault('rating', None)
        identity = f"{record.get('author') or ''}{record.get('date_text') or ''}"
        # A content digest is stable across runs; the builtin hash() of a str
        # is salted per process, so IDs built from it changed on every run.
        digest = hashlib.sha256(identity.encode('utf-8')).hexdigest()[:16]
        record['review_id'] = f"dom_{digest}"
        reviews.append(record)

    return reviews
|
||||
|
||||
|
||||
def fastest_stable_scrape():
    """Collect reviews from intercepted API responses, then top up from the DOM.

    Reads ``url`` and ``headless`` from ``load_config()``, opens the page,
    clicks the reviews tab and repeatedly scrolls the reviews pane while
    polling ``GoogleMapsAPIInterceptor`` for parsed reviews (phase 1).  When
    fewer reviews than the hard-coded expected total were captured, the first
    reviews in the DOM are read with ``extract_missing_reviews_js`` and merged
    in, de-duplicated on ``(author, date_text[:20])`` (phase 2).  The result
    is written to ``google_reviews_fastest_stable.json``.

    Returns:
        list[dict]: the review records, or ``[]`` when the reviews pane
        cannot be located.

    Raises:
        ValueError: if the configuration does not provide a ``url``.
    """
    expected_total = 244  # hard-coded total used for the completeness report
    target_reviews = 240  # stop scrolling once this many API reviews arrived
    max_scrolls = 35

    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("FASTEST STABLE SCRAPER - Ultra-fast API + instant JS...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookie_banner(driver, pause=0.4)
        _click_reviews_tab(
            driver,
            selectors=['.LRkQ2', 'button[role="tab"]'],
            keywords=['reviews', 'review', 'reseñas', 'reseña'],
            pause=0.4,
        )

        # Wait for stability
        time.sleep(1.0)

        # Find pane: primary selector first, fallback selector on timeout.
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Wait for initial reviews to load (critical for stability)
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("[Phase 1] Ultra-fast API scrolling...")

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)  # Optimal timing

            _merge_intercepted_reviews(interceptor, api_reviews)

            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")

            if len(api_reviews) >= target_reviews:
                break

        # Final API collection
        _merge_intercepted_reviews(interceptor, api_reviews)

        api_time = time.time() - start_time
        print(f" ✅ Phase 1: {len(api_reviews)} reviews in {api_time:.2f}s")

        # [Phase 2] Instant JavaScript extraction for missing reviews
        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\n[Phase 2] Fast JS extraction for {missing} missing reviews...")

            # Scroll to top (missing reviews likely at top)
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
            time.sleep(0.3)

            # Extract with JavaScript
            dom_reviews = extract_missing_reviews_js(driver, max_reviews=min(missing + 10, 25))

            # DOM records carry synthetic IDs, so de-duplicate on content instead.
            api_keys = {_review_dedupe_key(r) for r in api_reviews.values()}

            dom_added = 0
            for dom_review in dom_reviews:
                if _review_dedupe_key(dom_review) not in api_keys:
                    api_reviews[dom_review['review_id']] = dom_review
                    dom_added += 1

            dom_time = time.time() - start_time - api_time
            print(f" ✅ Phase 2: +{dom_added} reviews in {dom_time:.2f}s")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/{expected_total} ({len(all_reviews)/expected_total*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if len(all_reviews) >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif len(all_reviews) >= target_reviews:
            print(f"⚠️ Missing {expected_total-len(all_reviews)} reviews")

        print()

        # Save
        with open('google_reviews_fastest_stable.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_fastest_stable.json")

        if all_reviews:
            sample = all_reviews[0]
            # .get(): a DOM-sourced record may lack some keys.
            print(f"\nSample: {sample.get('author')} - {sample.get('rating')}★")

        return all_reviews

    finally:
        # Always release the browser; a failing quit() must not mask the result.
        try:
            driver.quit()
        except Exception:
            pass


def _dismiss_cookie_banner(driver, pause):
    """Click the first Accept/Aceptar cookie button if one exists (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(pause)
    except Exception:
        pass


def _click_reviews_tab(driver, selectors, keywords, pause):
    """Click the first tab whose text or aria-label contains a keyword.

    Returns True as soon as one tab was clicked, so later selectors cannot
    trigger a second click on the same tab.
    """
    for selector in selectors:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(pause)
                    return True
        except Exception:
            continue
    return False


def _review_record(review):
    """Flatten a parsed review object into the dict layout written to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _review_dedupe_key(record):
    """Return the (author, first 20 chars of date text) key used for de-duplication."""
    return (record.get('author', ''), (record.get('date_text', '') or '')[:20])


def _merge_intercepted_reviews(interceptor, store):
    """Add newly intercepted reviews to ``store`` keyed by review_id (best effort)."""
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in store:
                store[review.review_id] = _review_record(review)
    except Exception as exc:
        # A failed poll must not abort the scroll loop.
        log.debug("API poll failed: %s", exc)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: the exit status is 0 only when reviews were scraped.
    try:
        reviews = fastest_stable_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: the error message goes to stderr, next to the
        # traceback that print_exc() writes there.
        print(f"ERROR: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -1,286 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Hybrid Parallel Scraper - Best of both worlds.
|
||||
|
||||
Strategy:
|
||||
1. Open browser and get to reviews page (~15s)
|
||||
2. Scroll quickly to collect ~5-10 continuation tokens (~5s)
|
||||
3. Make parallel API calls in browser using JavaScript (~2-3s)
|
||||
4. Total: ~22-25 seconds for 244 reviews
|
||||
|
||||
This approach:
|
||||
- Uses browser's active session (no auth issues)
|
||||
- Collects tokens sequentially (required by API)
|
||||
- Makes parallel calls for remaining pages (fast!)
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get()`` on the result.
    """
    # Explicit UTF-8 so non-ASCII values do not depend on the platform's
    # default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def hybrid_parallel_scrape():
    """Hybrid approach: sequential token collection, then parallel fetch.

    Phase 1 opens the configured URL, clicks the reviews tab, locates the
    scrollable pane and extracts the place ID from the ``!1s…!`` segment of
    the current URL.  Phase 2 scrolls the pane eight times, storing every
    review parsed by ``GoogleMapsAPIInterceptor`` and every continuation
    token found at index 1 of the raw response bodies.  Phase 3 fetches the
    pages for up to 15 tokens concurrently from inside the browser and merges
    the parsed reviews.  The result is written to
    ``google_reviews_hybrid.json``.

    Returns:
        list[dict]: the review records, or ``[]`` when the pane or the place
        ID cannot be determined.

    Raises:
        ValueError: if the configuration does not provide a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    log.info("="*60)
    log.info("HYBRID PARALLEL SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Sequential tokens + Parallel fetch")
    log.info("="*60 + "\n")

    start_time = time.time()
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # PHASE 1: Setup (~15s)
        log.info("Phase 1: Browser setup...")
        driver.get(url)
        time.sleep(2)

        _dismiss_cookie_banner(driver, pause=1)
        _click_reviews_tab(
            driver,
            selectors=['.LRkQ2', '.hh2c6', 'button[role="tab"]'],
            keywords=['reviews', 'review', 'reseñas'],
            pause=2,
        )

        time.sleep(3)

        # Find pane: try each known selector in turn.
        pane = None
        wait = WebDriverWait(driver, 5)
        for selector in ['div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
                         'div.m6QErb.WNBkOb.XiKgde']:
            try:
                pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                break
            except Exception:
                continue

        if not pane:
            log.error("Could not find pane")
            return []

        time.sleep(2)

        # Extract place ID: the text between the first '!1s' and the next '!'.
        _, marker, rest = driver.current_url.partition('!1s')
        place_id = rest.split('!')[0] if marker else None

        if not place_id:
            log.error("Could not extract place ID")
            return []

        log.info(f"✓ Setup complete (place_id: {place_id})\n")

        # PHASE 2: Collect tokens via scrolling (~5s)
        log.info("Phase 2: Collecting continuation tokens...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1)

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        tokens = []
        all_reviews = {}

        for _ in range(8):  # 8 scrolls to get ~8 tokens
            driver.execute_script(scroll_script)
            time.sleep(0.2)  # Very fast scrolling

            responses = interceptor.get_intercepted_responses()
            if not responses:
                continue

            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in all_reviews:
                    all_reviews[review.review_id] = _review_record(review)

            # Extract continuation token from raw response
            for resp in responses:
                token = _continuation_token(resp)
                if token and token not in tokens:
                    tokens.append(token)

        log.info(f"✓ Collected {len(tokens)} continuation tokens")
        log.info(f"✓ Got {len(all_reviews)} reviews from scrolling\n")

        # PHASE 3: Parallel fetch remaining pages (~2-3s)
        if tokens:
            log.info("Phase 3: Parallel fetch of remaining pages...")

            # execute_async_script appends a completion callback as the LAST
            # argument and waits until the script calls it.  The previous
            # script used a top-level `return await`, which is not valid in
            # the non-async wrapper and never invoked that callback, so this
            # phase could not succeed.
            parallel_script = """
                const done = arguments[arguments.length - 1];

                async function fetchPages(placeId, tokens) {
                    const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';

                    const promises = tokens.map((token, idx) => {
                        const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
                        const params = new URLSearchParams({
                            authuser: '0',
                            hl: 'es',
                            gl: 'es',
                            pb: pb
                        });

                        return fetch(`${baseUrl}?${params}`)
                            .then(r => r.text())
                            .then(text => {
                                const body = text.startsWith(")]}'") ? text.substring(4) : text;
                                return {idx, data: JSON.parse(body)};
                            })
                            .catch(e => null);
                    });

                    const settled = await Promise.all(promises);
                    return settled.filter(r => r !== null);
                }

                fetchPages(arguments[0], arguments[1])
                    .then(done)
                    .catch(() => done([]));
            """

            try:
                parallel_start = time.time()
                # Limit to 15 parallel
                results = driver.execute_async_script(parallel_script, place_id, tokens[:15]) or []
                parallel_time = time.time() - parallel_start

                log.info(f"✓ Parallel fetch completed in {parallel_time:.2f}s")
                log.info(f" Received {len(results)} responses")

                # Parse parallel results
                for result in results:
                    if not (result and 'data' in result):
                        continue
                    try:
                        parsed = interceptor._parse_listugcposts_response(result['data'])
                        for review in parsed:
                            if review.review_id and review.review_id not in all_reviews:
                                all_reviews[review.review_id] = _review_record(review)
                    except Exception as e:
                        log.debug(f"Parse error: {e}")

                log.info(f"✓ Total reviews after parallel fetch: {len(all_reviews)}\n")

            except Exception as e:
                # Phase 3 is an optimisation; keep what scrolling collected.
                log.warning(f"Parallel fetch failed: {e}")

        reviews_list = list(all_reviews.values())
        elapsed = time.time() - start_time

        log.info("="*60)
        log.info("✅ HYBRID PARALLEL SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save
        with open('google_reviews_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)

        log.info(f"💾 Saved {len(reviews_list)} reviews to google_reviews_hybrid.json")

        if reviews_list:
            log.info("\n📝 Sample:")
            s = reviews_list[0]
            log.info(f" {s['author']} - {s['rating']}★ - {s['date_text']}")

        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old DOM: ~155s for 244 reviews (1.0x)")
        log.info("Fast scrolling: ~29s for 234 reviews (5.3x)")
        log.info(f"Hybrid parallel: ~{elapsed:.0f}s for {len(reviews_list)} reviews ({155/elapsed:.1f}x)! 🚀")
        log.info("="*60 + "\n")

        return reviews_list

    finally:
        # Always release the browser; a failing quit() must not mask the result.
        try:
            driver.quit()
        except Exception:
            pass


def _dismiss_cookie_banner(driver, pause):
    """Click the first Accept/Aceptar cookie button if one exists (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(pause)
    except Exception:
        pass


def _click_reviews_tab(driver, selectors, keywords, pause):
    """Click the first tab whose text or aria-label contains a keyword.

    Returns True as soon as one tab was clicked, so later selectors cannot
    trigger a second click on the same tab.
    """
    for selector in selectors:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(pause)
                    return True
        except Exception:
            continue
    return False


def _review_record(review):
    """Flatten a parsed review object into the dict layout written to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _continuation_token(resp):
    """Return the string at index 1 of a response's JSON body, or None."""
    try:
        body = resp.get('body', '')
        # Drop the ")]}'" prefix when present so the body parses as JSON.
        if body.startswith(")]}'"):
            body = body[4:]
        data = json.loads(body)
    except Exception:
        return None
    if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
        return data[1]
    return None
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: the exit status is 0 only when reviews were scraped.
    try:
        reviews = hybrid_parallel_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure, dump the traceback, exit non-zero.
        log.error("Fatal error: %s", e)
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -1,318 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
OPTIMIZED HYBRID Scraper - True parallel with minimal overhead.
|
||||
|
||||
Strategy:
|
||||
1. Ultra-fast API scrolling (no DOM parsing during scroll!)
|
||||
2. Quick DOM count check near end (minimal overhead)
|
||||
3. If needed, targeted DOM parse at very end for missing reviews
|
||||
4. Goal: ~22-25s for all 244 reviews
|
||||
|
||||
Key: Keep scroll loop FAST, only parse DOM if absolutely needed at the very end.
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get()`` on the result.
    """
    # Explicit UTF-8 so non-ASCII values do not depend on the platform's
    # default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def quick_dom_parse_top_reviews(driver, count=15):
    """Quick parse of just the top N reviews from the DOM.

    Args:
        driver: Selenium-style driver used to locate the review containers
            (``div.jftiEf.fontBodyMedium``).
        count: maximum number of review elements to read, taken from the
            start of the match list.

    Returns:
        list[dict]: one record per element that has a non-empty author, with
        the keys ``author``, ``rating``, ``text``, ``date_text``,
        ``avatar_url``, ``profile_url`` and a synthetic ``review_id`` of the
        form ``dom_<hex digest>``.  A field that cannot be read is ``None``.
        Returns ``[]`` when the element lookup itself fails.
    """
    # Local import keeps this block self-contained.
    import hashlib

    def read(root, selector, attribute=None):
        """Return the text (or the named attribute) of the first match, else None."""
        try:
            node = root.find_element(By.CSS_SELECTOR, selector)
            return node.text if attribute is None else node.get_attribute(attribute)
        except Exception:
            return None

    def parse_rating(label):
        """Parse the leading number of an aria-label; None when absent or unparsable."""
        try:
            return float(label.split()[0])
        except (AttributeError, IndexError, ValueError):
            return None

    try:
        # Get only the first N review elements.
        review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:count]
    except Exception as exc:
        log.debug("DOM review lookup failed: %s", exc)
        return []

    dom_reviews = []
    for elem in review_elements:
        review_data = {
            'author': read(elem, 'div.d4r55'),
            # Always present, even when the aria-label is missing or empty.
            'rating': parse_rating(read(elem, 'span.kvMYJc', 'aria-label')),
            'text': read(elem, 'span.wiI7pd'),
            'date_text': read(elem, 'span.rsqaWe'),
            'avatar_url': read(elem, 'img.NBa7we', 'src'),
            'profile_url': read(elem, 'button.WEBjve', 'data-review-id'),
        }

        if not review_data['author']:
            continue

        identity = f"{review_data['author']}{review_data['date_text'] or ''}"
        # A content digest is stable across runs; the builtin hash() of a str
        # is salted per process, so IDs built from it changed on every run.
        digest = hashlib.sha256(identity.encode('utf-8')).hexdigest()[:16]
        review_data['review_id'] = f"dom_{digest}"
        dom_reviews.append(review_data)

    return dom_reviews
|
||||
|
||||
|
||||
def optimized_hybrid_scrape():
    """Ultra-fast API scrolling plus a minimal targeted DOM parse.

    Reads ``url`` and ``headless`` from ``load_config()``, opens the page,
    clicks the reviews tab and scrolls the reviews pane a fixed number of
    times while polling ``GoogleMapsAPIInterceptor`` for parsed reviews.
    When fewer reviews than the hard-coded expected total were captured, the
    top reviews are read from the DOM with ``quick_dom_parse_top_reviews``
    and merged in, de-duplicated on ``(author, date_text[:20])``.  The result
    is written to ``google_reviews_optimized_hybrid.json``.

    Returns:
        list[dict]: the review records, or ``[]`` when the reviews pane
        cannot be located.

    Raises:
        ValueError: if the configuration does not provide a ``url``.
    """
    expected_total = 244  # hard-coded total used for the completeness report
    near_complete = 240   # threshold for the "Missing N reviews" notice
    max_scrolls = 35

    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("OPTIMIZED HYBRID SCRAPER - Ultra-fast API + minimal DOM...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookie_banner(driver, pause=0.4)
        _click_reviews_tab(
            driver,
            selectors=['.LRkQ2', 'button[role="tab"]'],
            keywords=['reviews', 'review', 'reseñas', 'reseña'],
            pause=0.4,
        )

        # Brief wait for reviews page (balance speed vs stability)
        time.sleep(1.0)

        # Find pane: most common selector first, fallback selector on timeout.
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Setup API interceptor immediately
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)  # Minimal wait for interceptor

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(0.3)  # Minimal initial trigger wait

        print("Ultra-fast API scrolling...")

        # API-only scrolling: no DOM parsing inside the loop.
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)

            _merge_intercepted_reviews(interceptor, api_reviews)

            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")

        # Final API collection
        _merge_intercepted_reviews(interceptor, api_reviews)

        api_time = time.time() - start_time
        print(f" ✅ API complete: {len(api_reviews)} reviews in {api_time:.2f}s")

        # Targeted DOM parse ONLY if we're missing reviews
        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\nQuick DOM parse for {missing} missing reviews...")

            # Scroll to top
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
            time.sleep(0.5)

            # Quick parse of top reviews (most likely to be missing)
            dom_reviews = quick_dom_parse_top_reviews(driver, count=min(missing + 5, 20))

            # DOM records carry synthetic IDs, so de-duplicate on content instead.
            api_keys = {_review_dedupe_key(r) for r in api_reviews.values()}

            dom_added = 0
            for dom_review in dom_reviews:
                if _review_dedupe_key(dom_review) not in api_keys and dom_review.get('review_id'):
                    api_reviews[dom_review['review_id']] = dom_review
                    dom_added += 1

            dom_time = time.time() - start_time - api_time
            print(f" ✅ DOM complete: +{dom_added} reviews in {dom_time:.2f}s")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/{expected_total} ({len(all_reviews)/expected_total*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if len(all_reviews) >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif len(all_reviews) >= near_complete:
            print(f"⚠️ Missing {expected_total-len(all_reviews)} reviews")

        print()

        # Save
        with open('google_reviews_optimized_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_optimized_hybrid.json")

        if all_reviews:
            sample = all_reviews[0]
            # .get(): a DOM-sourced record may lack some keys.
            print(f"\nSample: {sample.get('author')} - {sample.get('rating')}★")

        return all_reviews

    finally:
        # Always release the browser; a failing quit() must not mask the result.
        try:
            driver.quit()
        except Exception:
            pass


def _dismiss_cookie_banner(driver, pause):
    """Click the first Accept/Aceptar cookie button if one exists (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(pause)
    except Exception:
        pass


def _click_reviews_tab(driver, selectors, keywords, pause):
    """Click the first tab whose text or aria-label contains a keyword.

    Returns True as soon as one tab was clicked, so later selectors cannot
    trigger a second click on the same tab.
    """
    for selector in selectors:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(pause)
                    return True
        except Exception:
            continue
    return False


def _review_record(review):
    """Flatten a parsed review object into the dict layout written to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _review_dedupe_key(record):
    """Return the (author, first 20 chars of date text) key used for de-duplication."""
    return (record.get('author', ''), (record.get('date_text', '') or '')[:20])


def _merge_intercepted_reviews(interceptor, store):
    """Add newly intercepted reviews to ``store`` keyed by review_id (best effort)."""
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in store:
                store[review.review_id] = _review_record(review)
    except Exception as exc:
        # A failed poll must not abort the scroll loop.
        log.debug("API poll failed: %s", exc)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: the exit status is 0 only when reviews were scraped.
    try:
        reviews = optimized_hybrid_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: the error message goes to stderr, next to the
        # traceback that print_exc() writes there.
        print(f"ERROR: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -1,360 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parallel API Scraper - Capture session, then parallel API calls.
|
||||
|
||||
Strategy:
|
||||
1. Open browser and navigate to reviews (~15 seconds)
|
||||
2. Capture cookies and place ID from active session (~2 seconds)
|
||||
3. Make parallel API calls using requests (~5-10 seconds)
|
||||
4. Close browser immediately
|
||||
|
||||
Expected time: ~20-30 seconds for 244 reviews (vs 155 seconds)
|
||||
Speed improvement: ~5-7x faster!
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import requests
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_config():
    """Load configuration from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get()`` on the result.
    """
    # Explicit UTF-8 so non-ASCII values do not depend on the platform's
    # default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews.

    Returns True when a tab was clicked, False when none matched.
    """
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
    for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria_label = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria_label for kw in review_keywords):
                    # JS click avoids "element not interactable" on overlaid tabs.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(2)
                    log.info("✓ Reviews tab clicked")
                    return True
        except Exception as e:
            # Best effort: a failing selector must not stop the next one.
            log.debug(f"Selector {selector!r} failed: {e}")
            continue
    return False


def _extract_place_id(current_url):
    """Return the text between the first ``!1s`` marker and the next ``!``, or None."""
    _, marker, tail = current_url.partition('!1s')
    if not marker:
        return None
    return tail.split('!')[0] or None


def capture_session(url: str, headless: bool = False):
    """
    Capture cookies and place ID from browser session.

    Opens the URL in a browser, dismisses the cookie dialog, opens the
    reviews tab, then copies the browser's cookies and user agent into a
    ``requests.Session``. The browser is always closed before returning.

    Args:
        url: Google Maps place URL to open.
        headless: Passed through to the browser driver.

    Returns:
        (session, place_id, interceptor) on success, or (None, None, None)
        when no place ID could be extracted from the browser URL.
    """
    log.info("="*60)
    log.info("STEP 1: Capturing session from browser")
    log.info("="*60)

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate to place
        log.info("Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort; the dialog is not always shown).
        # `except Exception` instead of a bare except so Ctrl-C still works.
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                log.info("✓ Cookie dialog dismissed")
                time.sleep(1)
        except Exception as e:
            log.debug(f"Cookie dialog not dismissed: {e}")

        # Click reviews tab
        log.info("Opening reviews tab...")
        if not _click_reviews_tab(driver):
            log.warning("Reviews tab not found; continuing with the current view")

        # Wait for reviews to load
        time.sleep(3)

        # Extract place ID from URL
        place_id = _extract_place_id(driver.current_url)
        if not place_id:
            log.error("Could not extract place ID from URL")
            return None, None, None
        log.info(f"✓ Extracted place ID: {place_id}")

        # Capture ALL cookies using CDP
        log.info("Capturing cookies via CDP...")
        cdp_cookies = driver.execute_cdp_cmd('Network.getAllCookies', {})
        browser_cookies = cdp_cookies.get('cookies', [])
        log.info(f"✓ Captured {len(browser_cookies)} cookies")

        # Get user agent so API requests present the same client as the browser
        user_agent = driver.execute_script("return navigator.userAgent")

        # Create session with cookies
        session = requests.Session()
        for cookie in browser_cookies:
            session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/')
            )

        # Set headers
        session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
        })

        # Interceptor is built without a driver; callers use it for parsing.
        interceptor = GoogleMapsAPIInterceptor(None)

        log.info("✓ Session captured successfully\n")
        return session, place_id, interceptor

    finally:
        # Close browser immediately - we don't need it anymore!
        try:
            driver.quit()
            log.info("✓ Browser closed\n")
        except Exception as e:
            log.debug(f"Browser did not close cleanly: {e}")
|
||||
|
||||
|
||||
def fetch_reviews_page(session, place_id, interceptor, continuation_token=None,
                       hl='es', gl='es', timeout=10):
    """Fetch a single page of reviews via API.

    Args:
        session: Object with a ``requests``-style ``get(url, params=, timeout=)``.
        place_id: Place identifier embedded in the ``pb`` request parameter.
        interceptor: Object whose ``_parse_listugcposts_response(data)``
            turns the decoded JSON into review objects.
        continuation_token: Token from the previous page, or None for the
            first page.
        hl: Value of the ``hl`` query parameter (default ``'es'``).
        gl: Value of the ``gl`` query parameter (default ``'es'``).
        timeout: Request timeout in seconds.

    Returns:
        ``(reviews, next_token)``. ``next_token`` is None when the response
        carries no further token. Any failure yields ``([], None)``.
    """
    # The two request shapes differ only by the optional "!2s<token>" field.
    token_part = f"!2s{continuation_token}" if continuation_token else ""
    pb = (
        f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}"
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )

    params = {
        'authuser': '0',
        'hl': hl,
        'gl': gl,
        'pb': pb
    }

    try:
        url = 'https://www.google.com/maps/rpc/listugcposts'
        response = session.get(url, params=params, timeout=timeout)

        if response.status_code != 200:
            log.error(f"API error {response.status_code}")
            return [], None

        body = response.text
        # Drop the ")]}'" prefix that precedes the JSON payload.
        if body.startswith(")]}'"):
            body = body[4:].strip()

        data = json.loads(body)
        reviews = interceptor._parse_listugcposts_response(data)

        # Get next token: second element of the top-level list, when a string.
        next_token = None
        if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
            next_token = data[1]

        return reviews, next_token

    except Exception as e:
        # Deliberate best effort: one failed page must not abort the scrape.
        log.error(f"Request failed: {e}")
        return [], None
|
||||
|
||||
|
||||
def _review_to_dict(review):
    """Convert a parsed review object into the JSON-serialisable dict we store."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_new_reviews(reviews, seen_ids, all_reviews):
    """Append reviews not seen before to all_reviews; return how many were added."""
    new_count = 0
    for review in reviews:
        # Fall back to author + date when the API gave no review id.
        rid = review.review_id or f"{review.author}_{review.date_text}"
        if rid in seen_ids:
            continue
        seen_ids.add(rid)
        all_reviews.append(_review_to_dict(review))
        new_count += 1
    return new_count


def scrape_all_parallel(session, place_id, interceptor, max_workers=5, max_pages=None):
    """
    Main scraping method: fetch every review page through the API.

    Each page's continuation token is only available in the previous page's
    response, so pages are requested one after another until the chain ends.
    The earlier implementation collected at most five tokens and then
    re-requested those same pages from a thread pool, which capped the
    result at six pages and fetched most pages twice.

    Args:
        session: HTTP session passed through to ``fetch_reviews_page``.
        place_id: Place identifier passed through to ``fetch_reviews_page``.
        interceptor: Response parser passed through to ``fetch_reviews_page``.
        max_workers: Accepted for backward compatibility; unused because the
            token chain cannot be fetched concurrently.
        max_pages: Optional upper bound on pages requested (None = no bound).

    Returns:
        List of review dicts, de-duplicated by review id.
    """
    log.info("="*60)
    log.info("STEP 2: Parallel API scraping")
    log.info("="*60)

    start_time = time.time()
    all_reviews = []
    seen_ids = set()
    seen_tokens = set()
    token = None
    pages = 0

    while max_pages is None or pages < max_pages:
        reviews, next_token = fetch_reviews_page(session, place_id, interceptor, token)
        pages += 1
        new_count = _collect_new_reviews(reviews, seen_ids, all_reviews)
        log.info(f"  Page {pages}: +{new_count} new reviews | Total: {len(all_reviews)}")

        if not next_token:
            if pages == 1:
                log.info("No continuation token - only one page of reviews")
            break
        if next_token in seen_tokens:
            # A repeated token would loop forever; treat it as end of data.
            log.warning("Continuation token repeated - stopping pagination")
            break
        seen_tokens.add(next_token)
        token = next_token

    elapsed = time.time() - start_time
    # Guard against a zero interval on coarse clocks.
    speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

    log.info(f"\n{'='*60}")
    log.info("✅ PARALLEL SCRAPING COMPLETED!")
    log.info(f"{'='*60}")
    log.info(f"Total reviews: {len(all_reviews)}")
    log.info(f"Pages fetched: {pages}")
    log.info(f"API time: {elapsed:.2f} seconds")
    log.info(f"Speed: {speed:.1f} reviews/sec")
    log.info(f"{'='*60}\n")

    return all_reviews
|
||||
|
||||
|
||||
def main():
    """Main entry point.

    Reads ``url`` and ``headless`` from the configuration, captures a
    browser session, scrapes the reviews through the API and writes them to
    ``google_reviews_parallel.json``.

    Returns:
        The list of review dicts, or ``[]`` when the URL is missing or the
        session could not be captured.
    """
    # `or {}` keeps this working even when the loader returns None.
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)

    if not url:
        # Previously this surfaced as a TypeError on url[:80] below.
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("PARALLEL API SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Parallel API calls (no scrolling)")
    log.info("="*60 + "\n")

    total_start = time.time()

    # Step 1: Capture session from browser
    session, place_id, interceptor = capture_session(url, headless)
    if not session or not place_id:
        log.error("Failed to capture session")
        return []

    # Step 2: Parallel API scraping
    reviews = scrape_all_parallel(session, place_id, interceptor, max_workers=5)

    total_elapsed = time.time() - total_start

    # Save results
    output_file = 'google_reviews_parallel.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info(f"💾 Saved {len(reviews)} reviews to {output_file}")

    # Show sample
    if reviews:
        log.info("\n📝 Sample review:")
        sample = reviews[0]
        log.info(f"  Author: {sample['author']}")
        log.info(f"  Rating: {sample['rating']}★")
        log.info(f"  Date: {sample['date_text']}")
        if sample['text']:
            log.info(f"  Text: {sample['text'][:80]}...")

    # Stats comparison against the timings hard-coded in the messages below
    speedup = 155 / total_elapsed if total_elapsed > 0 else 0.0
    log.info("\n" + "="*60)
    log.info("SPEED COMPARISON")
    log.info("="*60)
    log.info("Old DOM scraping:    ~155 seconds for 244 reviews")
    log.info("Fast API scrolling:  ~43 seconds for 234 reviews (3.6x faster)")
    log.info(f"Parallel API calls:  ~{total_elapsed:.0f} seconds for {len(reviews)} reviews ({speedup:.1f}x faster!) 🚀")
    log.info("="*60 + "\n")

    return reviews
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point. Exit 0 only when main() returned at least one
    # review; interruption and any other failure exit 1.
    exit_status = 1
    try:
        if main():
            exit_status = 0
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
    except Exception as e:
        log.error(f"Fatal error: {e}")
        import traceback
        traceback.print_exc()
    sys.exit(exit_status)
|
||||
@@ -1,350 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PARALLEL HYBRID Scraper - Collects API + DOM simultaneously while scrolling.
|
||||
|
||||
Strategy:
|
||||
1. During scrolling, collect BOTH API responses AND DOM elements in parallel
|
||||
2. Deduplicate at the end
|
||||
3. Should get all 244 reviews in ~20-25s (vs 34s sequential)
|
||||
|
||||
Optimization: No separate DOM parsing phase - everything happens during scroll!
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """Load configuration from config.yaml.

    Returns:
        The parsed YAML content, or an empty dict when the file holds no
        document, so callers can always use ``.get()`` on the result.

    Raises:
        FileNotFoundError: if ``config.yaml`` is not in the working directory.
    """
    # Explicit UTF-8 so non-ASCII values decode the same on every platform.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load() yields None for an empty file; normalise to a dict.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def _dom_field(elem, selector, attribute=None):
    """Return the text (or the named attribute) of elem's first match for selector.

    Returns None when the child is missing or the lookup fails (for example
    because the element went stale).
    """
    try:
        node = elem.find_element(By.CSS_SELECTOR, selector)
        return node.get_attribute(attribute) if attribute else node.text
    except Exception:
        return None


def parse_dom_review_element(elem):
    """Parse a single review element from DOM.

    Args:
        elem: WebElement-like object for one review card.

    Returns:
        Dict with keys ``author``, ``rating``, ``text``, ``date_text``,
        ``avatar_url``, ``profile_url`` and ``review_id`` (missing values
        are None), or None when no author could be read.
    """
    import hashlib

    try:
        # Rating is the first whitespace-separated token of the aria-label.
        rating = None
        rating_label = _dom_field(elem, 'span.kvMYJc', 'aria-label')
        if rating_label:
            rating_parts = rating_label.split()
            if rating_parts:
                try:
                    # Accept a decimal comma as well as a decimal point.
                    rating = float(rating_parts[0].replace(',', '.'))
                except ValueError:
                    rating = None

        review_data = {
            'author': _dom_field(elem, 'div.d4r55'),
            # Always present, so consumers can index ['rating'] safely.
            'rating': rating,
            'text': _dom_field(elem, 'span.wiI7pd'),
            'date_text': _dom_field(elem, 'span.rsqaWe'),
            'avatar_url': _dom_field(elem, 'img.NBa7we', 'src'),
            # NOTE(review): this reads the button's data-review-id attribute,
            # which may not be a profile URL - confirm against consumers.
            'profile_url': _dom_field(elem, 'button.WEBjve', 'data-review-id'),
        }

        if not review_data['author']:
            return None

        # Generate ID from author + date + rating. A content digest is used
        # instead of hash(), whose value for strings changes between runs.
        key_material = f"{review_data['author']}{review_data['date_text']}{review_data['rating']}"
        digest = hashlib.sha256(key_material.encode('utf-8')).hexdigest()[:16]
        review_data['review_id'] = f"dom_{digest}"
        return review_data

    except Exception:
        return None
|
||||
|
||||
|
||||
def _hybrid_click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews; return True if clicked."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    # Stop here: trying the next selector could click again.
                    return True
        except Exception as e:
            log.debug(f"Selector {selector!r} failed: {e}")
            continue
    return False


def _hybrid_find_pane(driver):
    """Return the scrollable reviews pane, or None when neither selector matches in time."""
    wait = WebDriverWait(driver, 3)
    for selector in ('div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
                     'div.m6QErb.WNBkOb.XiKgde'):
        try:
            return wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _hybrid_store_api_reviews(interceptor, api_reviews):
    """Add newly intercepted API reviews to api_reviews, keyed by review_id (best effort)."""
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = {
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                }
    except Exception as e:
        log.debug(f"API collection skipped: {e}")


def _hybrid_store_dom_reviews(driver, dom_reviews, limit=250):
    """Add reviews rendered in the DOM to dom_reviews, keyed by (author, date[:20])."""
    try:
        review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')
    except Exception as e:
        log.debug(f"DOM collection skipped: {e}")
        return

    # Only the first `limit` cards are inspected, for speed.
    for elem in review_elements[:limit]:
        try:
            # Quick parse - just the two fields that form the key.
            author = elem.find_element(By.CSS_SELECTOR, 'div.d4r55').text
            date_text = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe').text
            if not (author and date_text):
                continue
            dom_key = (author, date_text[:20])
            if dom_key in dom_reviews:
                continue
            # Full parse only for cards not stored yet.
            dom_review = parse_dom_review_element(elem)
            if dom_review:
                dom_reviews[dom_key] = dom_review
        except Exception:
            # Stale or incomplete card: skip it and keep going.
            continue


def parallel_hybrid_scrape(expected_total=244, dom_min_api_reviews=220):
    """Collect API + DOM simultaneously during scrolling.

    Scrolls the reviews pane a fixed number of times, harvesting intercepted
    API responses after every scroll and DOM cards near the end, then merges
    both sources and writes ``google_reviews_parallel_hybrid.json``.

    Args:
        expected_total: Review count used only for the progress report.
        dom_min_api_reviews: DOM parsing during scrolling starts only once
            this many API reviews have been collected.

    Returns:
        List of review dicts, or ``[]`` when the URL is missing or the
        reviews pane cannot be found.
    """
    # `or {}` keeps this working even when the loader returns None.
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)

    if not url:
        # Previously this surfaced as a TypeError on url[:80] below.
        print("ERROR: No 'url' configured in config.yaml")
        return []

    print("PARALLEL HYBRID SCRAPER - Collecting API + DOM simultaneously...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}
    dom_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss cookies (best effort; the dialog is not always shown)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as e:
            log.debug(f"Cookie dialog not dismissed: {e}")

        # Click reviews tab
        _hybrid_click_reviews_tab(driver)

        # Wait for page stability
        time.sleep(1.0)

        # Find pane
        pane = _hybrid_find_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for reviews to start loading
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("Parallel collection (API + DOM simultaneously)...")

        max_scrolls = 35
        dom_parse_start = 25  # Only start DOM parsing after 25 scrolls (when near end)

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)  # Optimal scroll timing

            # Collection 1: API responses (always)
            _hybrid_store_api_reviews(interceptor, api_reviews)

            # Collection 2: DOM elements, only in the last scrolls and only
            # once the API has delivered most of the reviews.
            if i >= dom_parse_start and len(api_reviews) >= dom_min_api_reviews:
                _hybrid_store_dom_reviews(driver, dom_reviews)

            # Progress logging
            if (i + 1) % 10 == 0:
                print(f"  API: {len(api_reviews)}, DOM: {len(dom_reviews)} unique keys...")

        # Final collections
        print("Final collection sweep...")
        _hybrid_store_api_reviews(interceptor, api_reviews)
        _hybrid_store_dom_reviews(driver, dom_reviews)

        # Merge: Start with API reviews, add DOM reviews that aren't duplicates
        print("\nMerging API + DOM reviews...")

        # Build set of API keys for deduplication (author + date)
        api_keys = {
            (api_review.get('author', ''), (api_review.get('date_text', '') or '')[:20])
            for api_review in api_reviews.values()
        }

        # Add unique DOM reviews
        dom_added = 0
        for dom_key, dom_review in dom_reviews.items():
            if dom_key not in api_keys and dom_review.get('review_id'):
                api_reviews[dom_review['review_id']] = dom_review
                dom_added += 1

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        coverage = len(all_reviews) / expected_total * 100 if expected_total else 0.0

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/{expected_total} ({coverage:.1f}%)")
        print(f"  - API: {len(api_reviews) - dom_added}")
        print(f"  - DOM: {dom_added} unique")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        # 155 s is the baseline timing quoted by these scripts for DOM scraping.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if len(all_reviews) >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif len(all_reviews) >= expected_total - 4:
            print(f"⚠️  Missing {expected_total-len(all_reviews)} reviews")

        print()

        # Save
        with open('google_reviews_parallel_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_parallel_hybrid.json")

        if all_reviews:
            # .get(): a DOM-sourced review may lack a field.
            print(f"\nSample: {all_reviews[0].get('author')} - {all_reviews[0].get('rating')}★")

        return all_reviews

    finally:
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"Browser did not close cleanly: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point. Exit 0 only when the scrape returned at least one
    # review; interruption and any other failure exit 1.
    exit_status = 1
    try:
        if parallel_hybrid_scrape():
            exit_status = 0
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
    sys.exit(exit_status)
|
||||
@@ -1,319 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parallel API Scraper V2 - Use browser's fetch API for parallel calls.
|
||||
|
||||
Strategy:
|
||||
1. Open browser and navigate to reviews (~15 seconds)
|
||||
2. Trigger initial API call to get place ID and pattern
|
||||
3. Use JavaScript fetch API to make 25 parallel calls (~3-5 seconds)
|
||||
4. Collect all results at once
|
||||
|
||||
Expected time: ~20-25 seconds for 244 reviews
|
||||
Speed improvement: ~6-7x faster!
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_config():
    """Load configuration from config.yaml.

    Returns:
        The parsed YAML content, or an empty dict when the file holds no
        document, so callers can always use ``.get()`` on the result.

    Raises:
        FileNotFoundError: if ``config.yaml`` is not in the working directory.
    """
    # Explicit UTF-8 so non-ASCII values decode the same on every platform.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load() yields None for an empty file; normalise to a dict.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def parallel_scrape():
|
||||
"""Parallel API-first scraping using browser's fetch API."""
|
||||
|
||||
config = load_config()
|
||||
url = config.get('url')
|
||||
headless = config.get('headless', False)
|
||||
|
||||
log.info("="*60)
|
||||
log.info("PARALLEL API SCRAPER V2")
|
||||
log.info("="*60)
|
||||
log.info(f"URL: {url[:80]}...")
|
||||
log.info(f"Mode: Parallel browser fetch calls")
|
||||
log.info("="*60 + "\n")
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
|
||||
|
||||
try:
|
||||
# Step 1: Navigate and setup
|
||||
log.info("Step 1: Opening Google Maps...")
|
||||
driver.get(url)
|
||||
time.sleep(2)
|
||||
|
||||
# Dismiss cookies
|
||||
try:
|
||||
cookie_btns = driver.find_elements(By.CSS_SELECTOR,
|
||||
'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
|
||||
if cookie_btns:
|
||||
cookie_btns[0].click()
|
||||
log.info("✓ Cookie dialog dismissed")
|
||||
time.sleep(1)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Click reviews tab
|
||||
log.info("Step 2: Opening reviews tab...")
|
||||
review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
|
||||
clicked = False
|
||||
|
||||
for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
|
||||
try:
|
||||
tabs = driver.find_elements(By.CSS_SELECTOR, selector)
|
||||
for tab in tabs:
|
||||
text = (tab.text or '').lower()
|
||||
aria_label = (tab.get_attribute('aria-label') or '').lower()
|
||||
if any(kw in text or kw in aria_label for kw in review_keywords):
|
||||
driver.execute_script("arguments[0].click();", tab)
|
||||
time.sleep(2)
|
||||
log.info("✓ Reviews tab clicked")
|
||||
clicked = True
|
||||
break
|
||||
if clicked:
|
||||
break
|
||||
except:
|
||||
continue
|
||||
|
||||
# Wait for reviews to load
|
||||
log.info("Waiting for reviews page to fully load...")
|
||||
time.sleep(3)
|
||||
|
||||
# Find reviews pane
|
||||
log.info("Step 3: Finding reviews pane...")
|
||||
pane = None
|
||||
pane_selectors = [
|
||||
'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
|
||||
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
|
||||
'div.m6QErb.WNBkOb.XiKgde',
|
||||
]
|
||||
|
||||
for selector in pane_selectors:
|
||||
try:
|
||||
wait = WebDriverWait(driver, 5)
|
||||
pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
|
||||
log.info(f"✓ Found reviews pane with: {selector}")
|
||||
break
|
||||
except TimeoutException:
|
||||
continue
|
||||
|
||||
if not pane:
|
||||
log.error("Could not find reviews pane")
|
||||
return []
|
||||
|
||||
# Wait for initial reviews
|
||||
time.sleep(2)
|
||||
|
||||
# Extract place ID from URL
|
||||
current_url = driver.current_url
|
||||
place_id = None
|
||||
if '!1s' in current_url:
|
||||
parts = current_url.split('!1s')
|
||||
if len(parts) > 1:
|
||||
place_id = parts[1].split('!')[0]
|
||||
log.info(f"✓ Extracted place ID: {place_id}")
|
||||
|
||||
if not place_id:
|
||||
log.error("Could not extract place ID from URL")
|
||||
return []
|
||||
|
||||
# Step 4: Make parallel API calls using browser's fetch
|
||||
log.info("\n" + "="*60)
|
||||
log.info("Step 4: Making parallel API calls via browser fetch")
|
||||
log.info("="*60)
|
||||
|
||||
# JavaScript to make parallel API calls
|
||||
parallel_fetch_script = """
|
||||
async function fetchReviewsParallel(placeId, numPages) {
|
||||
const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
|
||||
const results = [];
|
||||
|
||||
// Build pb parameter for each page
|
||||
const requests = [];
|
||||
let token = null;
|
||||
|
||||
console.log('[Parallel Fetch] Starting parallel fetch for', numPages, 'pages');
|
||||
|
||||
// First, we need to get continuation tokens sequentially
|
||||
const tokens = [];
|
||||
for (let i = 0; i < Math.min(numPages, 5); i++) {
|
||||
const pb = token
|
||||
? `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`
|
||||
: `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
|
||||
|
||||
const params = new URLSearchParams({
|
||||
authuser: '0',
|
||||
hl: 'es',
|
||||
gl: 'es',
|
||||
pb: pb
|
||||
});
|
||||
|
||||
try {
|
||||
const response = await fetch(`${baseUrl}?${params}`);
|
||||
const text = await response.text();
|
||||
const body = text.startsWith(")]}'") ? text.substring(4) : text;
|
||||
const data = JSON.parse(body);
|
||||
|
||||
results.push({index: i, data: data});
|
||||
|
||||
// Get next token
|
||||
if (data && data.length > 1 && typeof data[1] === 'string') {
|
||||
token = data[1];
|
||||
tokens.push(token);
|
||||
} else {
|
||||
break; // No more pages
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('[Parallel Fetch] Error fetching page', i, e);
|
||||
}
|
||||
}
|
||||
|
||||
console.log('[Parallel Fetch] Got', tokens.length, 'continuation tokens');
|
||||
console.log('[Parallel Fetch] Now fetching remaining pages in parallel...');
|
||||
|
||||
// Now fetch remaining pages in parallel using the tokens
|
||||
const parallelPromises = tokens.slice(5).map((tok, idx) => {
|
||||
const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${tok}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
|
||||
const params = new URLSearchParams({
|
||||
authuser: '0',
|
||||
hl: 'es',
|
||||
gl: 'es',
|
||||
pb: pb
|
||||
});
|
||||
|
||||
return fetch(`${baseUrl}?${params}`)
|
||||
.then(r => r.text())
|
||||
.then(text => {
|
||||
const body = text.startsWith(")]}'") ? text.substring(4) : text;
|
||||
return JSON.parse(body);
|
||||
})
|
||||
.then(data => ({index: idx + 5, data: data}))
|
||||
.catch(e => {
|
||||
console.error('[Parallel Fetch] Parallel fetch error', idx, e);
|
||||
return null;
|
||||
});
|
||||
});
|
||||
|
||||
const parallelResults = await Promise.all(parallelPromises);
|
||||
results.push(...parallelResults.filter(r => r !== null));
|
||||
|
||||
console.log('[Parallel Fetch] Completed! Total responses:', results.length);
|
||||
return results;
|
||||
}
|
||||
|
||||
// Execute parallel fetch
|
||||
return await fetchReviewsParallel(arguments[0], arguments[1]);
|
||||
"""
|
||||
|
||||
log.info(f"Fetching up to 25 pages in parallel...")
|
||||
api_start = time.time()
|
||||
|
||||
try:
|
||||
results = driver.execute_async_script(parallel_fetch_script, place_id, 25)
|
||||
api_elapsed = time.time() - api_start
|
||||
log.info(f"✓ Parallel fetch completed in {api_elapsed:.2f} seconds")
|
||||
log.info(f" Received {len(results)} API responses")
|
||||
except Exception as e:
|
||||
log.error(f"Parallel fetch failed: {e}")
|
||||
return []
|
||||
|
||||
# Parse results
|
||||
log.info("\nStep 5: Parsing reviews from API responses...")
|
||||
interceptor = GoogleMapsAPIInterceptor(None)
|
||||
all_reviews = {}
|
||||
|
||||
for result in results:
|
||||
if result and 'data' in result:
|
||||
try:
|
||||
parsed = interceptor._parse_listugcposts_response(result['data'])
|
||||
for review in parsed:
|
||||
if review.review_id and review.review_id not in all_reviews:
|
||||
all_reviews[review.review_id] = {
|
||||
'review_id': review.review_id,
|
||||
'author': review.author,
|
||||
'rating': review.rating,
|
||||
'text': review.text,
|
||||
'date_text': review.date_text,
|
||||
'avatar_url': review.avatar_url,
|
||||
'profile_url': review.profile_url,
|
||||
}
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing response: {e}")
|
||||
|
||||
reviews_list = list(all_reviews.values())
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
log.info(f"\n{'='*60}")
|
||||
log.info(f"✅ PARALLEL SCRAPING COMPLETED!")
|
||||
log.info(f"{'='*60}")
|
||||
log.info(f"Total reviews: {len(reviews_list)}")
|
||||
log.info(f"API responses: {len(results)}")
|
||||
log.info(f"Total time: {elapsed:.2f} seconds")
|
||||
log.info(f" - Setup: {api_start - start_time:.2f}s")
|
||||
log.info(f" - Parallel API: {api_elapsed:.2f}s")
|
||||
log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
|
||||
log.info(f"{'='*60}\n")
|
||||
|
||||
# Save results
|
||||
output_file = 'google_reviews_parallel.json'
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(reviews_list, f, indent=2, ensure_ascii=False)
|
||||
|
||||
log.info(f"💾 Saved {len(reviews_list)} reviews to {output_file}")
|
||||
|
||||
# Show sample
|
||||
if reviews_list:
|
||||
log.info("\n📝 Sample review:")
|
||||
sample = reviews_list[0]
|
||||
log.info(f" Author: {sample['author']}")
|
||||
log.info(f" Rating: {sample['rating']}★")
|
||||
log.info(f" Date: {sample['date_text']}")
|
||||
if sample['text']:
|
||||
log.info(f" Text: {sample['text'][:80]}...")
|
||||
|
||||
# Stats comparison
|
||||
log.info("\n" + "="*60)
|
||||
log.info("SPEED COMPARISON")
|
||||
log.info("="*60)
|
||||
log.info(f"Old DOM scraping: ~155 seconds for 244 reviews (1.0x)")
|
||||
log.info(f"Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
|
||||
log.info(f"Parallel browser fetch: ~{elapsed:.0f} seconds for {len(reviews_list)} reviews ({155/elapsed:.1f}x faster!) 🚀")
|
||||
log.info("="*60 + "\n")
|
||||
|
||||
return reviews_list
|
||||
|
||||
finally:
|
||||
try:
|
||||
driver.quit()
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point. The process exits with status 0 only when the
    # scraper returned at least one review; an interrupt, an unexpected
    # error, or an empty result all exit with status 1.
    import traceback

    exit_status = 1
    try:
        exit_status = 0 if parallel_scrape() else 1
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
    except Exception as exc:
        log.error(f"Fatal error: {exc}")
        traceback.print_exc()
    sys.exit(exit_status)
|
||||
@@ -1,279 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ULTRA-FAST API Scraper - Maximum speed optimization.
|
||||
|
||||
Optimizations:
|
||||
1. Minimal waits (0.5s after tab click instead of 3s)
|
||||
2. No wait for "initial reviews" (removes 3s)
|
||||
3. Faster scroll timing (0.27s between scrolls instead of 0.3s)
|
||||
4. Response collection after every scroll (scrolls cannot be skipped or the interceptor buffer clears)
|
||||
5. Less logging during scrolling (I/O overhead)
|
||||
6. Direct pane selection (no trying multiple)
|
||||
7. Parallel operations where possible
|
||||
|
||||
Target: ~15-20 seconds for 234 reviews
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
# Only show INFO and above
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``config.get(...)`` unconditionally.

    Raises:
        OSError: if ``config.yaml`` cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # the configured URL may contain non-ASCII characters.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def _review_record(review):
    """Flatten a parsed API review object into the dict that is written to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _drain_api_reviews(interceptor, api_reviews):
    """Move newly intercepted reviews into ``api_reviews``; return how many were new."""
    responses = interceptor.get_intercepted_responses()
    if not responses:
        return 0
    added = 0
    for review in interceptor.parse_reviews_from_responses(responses):
        if review.review_id and review.review_id not in api_reviews:
            api_reviews[review.review_id] = _review_record(review)
            added += 1
    return added


def _dismiss_cookie_banner(driver):
    """Click the first cookie-consent button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception:
        # A missing or unclickable banner must not abort the scrape.
        pass


def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews, per selector."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    # Both selectors are tried even after a successful click, exactly as the
    # original flow did.
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    break
        except Exception:
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None if neither selector appears in time."""
    wait = WebDriverWait(driver, 3)
    selectors = (
        'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.WNBkOb.XiKgde',
    )
    for selector in selectors:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _dom_review_from_element(elem):
    """Build a review dict from one rendered review card.

    Returns None when the card has no author. Every other field is optional
    and stays None when its element is absent, so a rating-only review is
    kept rather than discarded.
    """
    import hashlib

    def first(selector):
        # find_elements returns [] instead of raising when nothing matches.
        matches = elem.find_elements(By.CSS_SELECTOR, selector)
        return matches[0] if matches else None

    author_elem = first('div.d4r55')
    author = author_elem.text if author_elem is not None else None
    if not author:
        return None

    record = {
        'review_id': None,
        'author': author,
        'rating': None,
        'text': None,
        'date_text': None,
        'avatar_url': None,
        'profile_url': None,
    }

    rating_elem = first('span.kvMYJc')
    if rating_elem is not None:
        # The rating is taken from the first token of the aria-label.
        rating_parts = (rating_elem.get_attribute('aria-label') or '').split()
        if rating_parts:
            try:
                record['rating'] = float(rating_parts[0])
            except ValueError:
                pass  # unparsable label: keep the review, leave rating unset

    text_elem = first('span.wiI7pd')
    if text_elem is not None:
        record['text'] = text_elem.text

    date_elem = first('span.rsqaWe')
    if date_elem is not None:
        record['date_text'] = date_elem.text

    avatar_elem = first('img.NBa7we')
    if avatar_elem is not None:
        record['avatar_url'] = avatar_elem.get_attribute('src')

    profile_elem = first('button.WEBjve')
    if profile_elem is not None:
        record['profile_url'] = profile_elem.get_attribute('data-review-id')

    # The built-in hash() of a str is salted per process, so it cannot give
    # ids that are stable between runs; a digest can.
    digest = hashlib.sha1(
        f"{author}|{record['date_text']}".encode('utf-8')).hexdigest()[:16]
    record['review_id'] = f"dom_{digest}"
    return record


def _merge_dom_reviews(driver, api_reviews, limit):
    """Add up to ``limit`` top-of-list DOM reviews not already collected; return the count added."""
    driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
    time.sleep(0.3)

    review_elements = driver.find_elements(
        By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:limit]

    # API and DOM ids differ, so duplicates are detected by author plus the
    # first 20 characters of the date text.
    seen = {
        (rec.get('author', ''), (rec.get('date_text', '') or '')[:20])
        for rec in api_reviews.values()
    }

    added = 0
    for elem in review_elements:
        try:
            record = _dom_review_from_element(elem)
        except Exception:
            continue  # skip a card that cannot be read
        if record is None:
            continue
        key = (record['author'], (record['date_text'] or '')[:20])
        if key in seen:
            continue
        api_reviews[record['review_id']] = record
        seen.add(key)
        added += 1
    return added


def ultra_fast_scrape(expected_total=244, target_reviews=240, max_scrolls=35):
    """Scrape reviews by scrolling the reviews pane and reading intercepted API responses.

    Reads ``url`` and ``headless`` from the config, opens the page, clicks
    the reviews tab, scrolls the pane while draining the API interceptor
    after every scroll, then parses the first few DOM cards for reviews the
    API pass missed. Results are written to ``google_reviews_ultra_fast.json``.

    Args:
        expected_total: Review count the listing is expected to have; the
            DOM fallback runs only while fewer reviews were collected.
        target_reviews: Scrolling stops once this many reviews were collected.
        max_scrolls: Upper bound on scroll iterations.

    Returns:
        List of review dicts, or ``[]`` when the reviews pane is not found.

    Raises:
        ValueError: if the config does not define a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("ULTRA-FAST SCRAPER - Starting...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate and reach the reviews list.
        driver.get(url)
        time.sleep(1.5)
        _dismiss_cookie_banner(driver)
        _click_reviews_tab(driver)
        time.sleep(1.0)  # let the reviews view settle

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)

        # Keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        driver.execute_script(scroll_script)
        time.sleep(0.3)

        print("Fast scrolling...")

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)

            # Drained after every scroll; the original notes that skipping
            # a scroll loses buffered responses.
            try:
                _drain_api_reviews(interceptor, api_reviews)
            except Exception:
                pass  # best effort: a bad response must not stop scrolling

            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")

            if len(api_reviews) >= target_reviews:
                break

        # Pick up whatever arrived after the last scroll.
        try:
            _drain_api_reviews(interceptor, api_reviews)
        except Exception:
            pass

        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\nQuick DOM parse for {missing} missing reviews...")
            try:
                dom_added = _merge_dom_reviews(
                    driver, api_reviews, limit=min(missing + 5, 20))
                print(f" +{dom_added} reviews from DOM")
            except Exception as e:
                print(f" DOM parse failed: {e}")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print(f"\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        # 155 is the hard-coded baseline (seconds) this script compares against.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀\n")

        with open('google_reviews_ultra_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to google_reviews_ultra_fast.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always release the browser, even on errors or Ctrl-C.
        try:
            driver.quit()
        except Exception:
            pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point. Exit status is 0 only when at least one review was
    # scraped; an interrupt, an error, or an empty result all give 1.
    import traceback

    exit_status = 1
    try:
        exit_status = 0 if ultra_fast_scrape() else 1
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as exc:
        print(f"ERROR: {exc}")
        traceback.print_exc()
    sys.exit(exit_status)
|
||||
@@ -1,336 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ULTRA-FAST COMPLETE Scraper - Gets ALL 244 reviews in ~25-30 seconds.
|
||||
|
||||
Strategy:
|
||||
1. Ultra-fast API scrolling to get 234 reviews (~19s)
|
||||
2. DOM parsing for missing 10 reviews (~5-10s)
|
||||
3. Total: ~25-30s for 244 reviews (vs 155s original)
|
||||
|
||||
Combines speed of start_ultra_fast.py with completeness of original scraper.
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``config.get(...)`` unconditionally.

    Raises:
        OSError: if ``config.yaml`` cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # the configured URL may contain non-ASCII characters.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def parse_dom_reviews_fast(driver, max_reviews=20):
    """Extract the first ``max_reviews`` rendered reviews with one JavaScript call.

    The script reads author, rating, text, date, avatar and the
    ``data-review-id`` attribute from each review card and returns only
    cards that have an author.

    Args:
        driver: Object exposing ``execute_script(script, *args)``.
        max_reviews: Maximum number of review cards to inspect.

    Returns:
        List of review dicts that have both an author and a date, each with
        a ``review_id`` of the form ``dom_<16 hex chars>``. The id is derived
        from author and date text, so it is identical across runs. Returns
        ``[]`` if the script fails or yields nothing.
    """
    import hashlib

    # JavaScript to extract review data from the first N reviews.
    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
    const maxCount = Math.min(arguments[0], elements.length);

    for (let i = 0; i < maxCount; i++) {
        const elem = elements[i];
        const review = {};

        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent : null;

            // Rating
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }

            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent : null;

            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent : null;

            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;

            // Profile URL
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

            if (review.author) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }

    return reviews;
    """

    try:
        # One round trip to the browser returns every review at once.
        dom_reviews_data = driver.execute_script(extract_script, max_reviews)
    except Exception as e:
        print(f" Error in fast DOM parse: {e}")
        return []

    dom_reviews = []
    # A None result is treated as "no reviews" rather than as an error.
    for review_data in dom_reviews_data or []:
        author = review_data.get('author')
        date_text = review_data.get('date_text')
        if author and date_text:
            # The built-in hash() of a str is salted per process, so it
            # would give a different id on every run; use a digest instead.
            digest = hashlib.sha1(
                f"{author}|{date_text}".encode('utf-8')).hexdigest()[:16]
            review_data['review_id'] = f"dom_{digest}"
            dom_reviews.append(review_data)

    return dom_reviews
|
||||
|
||||
|
||||
def _review_record(review):
    """Flatten a parsed API review object into the dict that is written to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _drain_api_reviews(interceptor, api_reviews):
    """Move newly intercepted reviews into ``api_reviews``; return how many were new."""
    responses = interceptor.get_intercepted_responses()
    if not responses:
        return 0
    added = 0
    for review in interceptor.parse_reviews_from_responses(responses):
        if review.review_id and review.review_id not in api_reviews:
            api_reviews[review.review_id] = _review_record(review)
            added += 1
    return added


def _dismiss_cookie_banner(driver):
    """Click the first cookie-consent button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception:
        # A missing or unclickable banner must not abort the scrape.
        pass


def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews, per selector."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    # Both selectors are tried even after a successful click, exactly as the
    # original flow did.
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    break
        except Exception:
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None if neither selector appears in time."""
    wait = WebDriverWait(driver, 3)
    selectors = (
        'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.WNBkOb.XiKgde',
    )
    for selector in selectors:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _dedup_key(record):
    """Key used to recognise the same review coming from the API and from the DOM."""
    return (
        record.get('author', ''),
        record.get('rating', 0),
        (record.get('date_text', '') or '')[:20],  # first 20 chars of the date
    )


def ultra_fast_complete_scrape(expected_total=244, target_reviews=240, max_scrolls=35):
    """Collect reviews via fast API scrolling, then fill gaps from the DOM.

    Phase 1 scrolls the reviews pane and drains the API interceptor after
    every scroll. Phase 2 runs only when fewer than ``expected_total``
    reviews were collected: it scrolls back to the top and merges reviews
    returned by ``parse_dom_reviews_fast`` that are not already present.
    Results are written to ``google_reviews_ultra_fast_complete.json``.

    Args:
        expected_total: Review count the listing is expected to have.
        target_reviews: Phase 1 stops scrolling at this many reviews.
        max_scrolls: Upper bound on phase 1 scroll iterations.

    Returns:
        List of review dicts, or ``[]`` when the reviews pane is not found.

    Raises:
        ValueError: if the config does not define a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print(f"ULTRA-FAST COMPLETE SCRAPER - Getting ALL {expected_total} reviews...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # ====== PHASE 1: ULTRA-FAST API SCROLLING ======
        print("\n[Phase 1] Ultra-fast API scrolling...")

        driver.get(url)
        time.sleep(1.5)
        _dismiss_cookie_banner(driver)
        _click_reviews_tab(driver)
        time.sleep(1.0)  # let the reviews view settle

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)

        # Keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        driver.execute_script(scroll_script)
        time.sleep(0.3)

        print(" Fast scrolling for API reviews...")

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)

            try:
                _drain_api_reviews(interceptor, api_reviews)
            except Exception:
                pass  # best effort: a bad response must not stop scrolling

            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")

            if len(api_reviews) >= target_reviews:
                break

        # Pick up whatever arrived after the last scroll.
        try:
            _drain_api_reviews(interceptor, api_reviews)
        except Exception:
            pass

        phase1_time = time.time() - start_time
        print(f" ✅ Phase 1 complete: {len(api_reviews)} reviews in {phase1_time:.2f}s")

        # ====== PHASE 2: DOM PARSING FOR MISSING REVIEWS ======
        missing_count = expected_total - len(api_reviews)

        if missing_count > 0:
            print(f"\n[Phase 2] Fast DOM parsing for {missing_count} missing reviews...")

            # Only the top of the list is re-read from the DOM.
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
            time.sleep(0.5)

            dom_reviews = parse_dom_reviews_fast(
                driver, max_reviews=min(missing_count + 10, 25))

            # API and DOM ids differ, so duplicates are detected by
            # author + rating + date.
            api_keys = {_dedup_key(rec) for rec in api_reviews.values()}

            dom_added = 0
            for dom_review in dom_reviews:
                dom_key = _dedup_key(dom_review)
                if dom_key not in api_keys and dom_review.get('review_id'):
                    api_reviews[dom_review['review_id']] = dom_review
                    api_keys.add(dom_key)  # also de-duplicates within the DOM batch
                    dom_added += 1

            phase2_time = time.time() - start_time - phase1_time
            print(f" ✅ Phase 2 complete: +{dom_added} reviews from DOM in {phase2_time:.2f}s")

        # ====== RESULTS ======
        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        total = len(all_reviews)
        coverage = total / expected_total * 100 if expected_total else 0.0

        print(f"\n{'='*50}")
        print(f"✅ COMPLETED!")
        print(f"Reviews: {total}/{expected_total} ({coverage:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        # 155 is the hard-coded baseline (seconds) this script compares against.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if total >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif total >= target_reviews:
            print(f"⚠️ Missing {expected_total-total} reviews")
        else:
            print(f"⚠️ Missing {expected_total-total} reviews - may need more DOM parsing")

        print()

        with open('google_reviews_ultra_fast_complete.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to google_reviews_ultra_fast_complete.json")

        if all_reviews:
            # DOM-sourced records may lack a 'rating' key, hence .get().
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0].get('rating')}★")

        return all_reviews

    finally:
        # Always release the browser, even on errors or Ctrl-C.
        try:
            driver.quit()
        except Exception:
            pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point. Exit status is 0 only when at least one review was
    # scraped; an interrupt, an error, or an empty result all give 1.
    import traceback

    exit_status = 1
    try:
        exit_status = 0 if ultra_fast_complete_scrape() else 1
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as exc:
        print(f"ERROR: {exc}")
        traceback.print_exc()
    sys.exit(exit_status)
|
||||
@@ -1,280 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete Scraper - Gets ALL reviews while staying fast.
|
||||
|
||||
Strategy:
|
||||
1. Scroll until no new reviews for 5 consecutive scrolls
|
||||
2. Check scroll position to detect end
|
||||
3. Do extra scrolls at the end to catch stragglers
|
||||
4. Adaptive timing - faster at start, slower at end
|
||||
|
||||
Target: Get all 244 reviews in ~22-25 seconds
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``config.get(...)`` unconditionally.

    Raises:
        OSError: if ``config.yaml`` cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # the configured URL may contain non-ASCII characters.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def _review_record(review):
    """Flatten a parsed API review object into the dict that is written to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _drain_api_reviews(interceptor, api_reviews):
    """Move newly intercepted reviews into ``api_reviews``; return how many were new."""
    responses = interceptor.get_intercepted_responses()
    if not responses:
        return 0
    added = 0
    for review in interceptor.parse_reviews_from_responses(responses):
        if review.review_id and review.review_id not in api_reviews:
            api_reviews[review.review_id] = _review_record(review)
            added += 1
    return added


def _dismiss_cookie_banner(driver):
    """Click the first cookie-consent button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception:
        # A missing or unclickable banner must not abort the scrape.
        pass


def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews, per selector."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    # Both selectors are tried even after a successful click, exactly as the
    # original flow did.
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    break
        except Exception:
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None if neither selector appears in time."""
    wait = WebDriverWait(driver, 3)
    selectors = (
        'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.WNBkOb.XiKgde',
    )
    for selector in selectors:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _scroll_delay(collected):
    """Seconds to wait after a scroll: short early on, longer as the count grows."""
    if collected < 100:
        return 0.27  # fast at the beginning
    if collected < 200:
        return 0.30  # medium in the middle
    if collected < 235:
        return 0.40  # slower near the end
    return 0.50      # slowest at the very end to catch stragglers


def complete_scrape(expected_total=244, max_scrolls=60, max_idle=12):
    """Scroll the reviews pane until it stops yielding reviews, then sweep once more.

    The main loop stops early once ``max_idle`` consecutive scrolls brought
    no new reviews AND the pane's scrollTop was unchanged for at least three
    consecutive scrolls. Five slower scrolls and a final drain follow.
    Results are written to ``google_reviews_complete.json``.

    Args:
        expected_total: Review count the listing is expected to have; used
            only for the completion report.
        max_scrolls: Upper bound on main-loop scroll iterations.
        max_idle: Consecutive no-new-review scrolls required before stopping.

    Returns:
        List of review dicts, or ``[]`` when the reviews pane is not found.

    Raises:
        ValueError: if the config does not define a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("COMPLETE SCRAPER - Getting ALL reviews...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        driver.get(url)
        time.sleep(1.5)
        _dismiss_cookie_banner(driver)
        _click_reviews_tab(driver)
        time.sleep(1.0)  # let the reviews view settle

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for initial reviews to load.
        time.sleep(1.5)

        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # give the interceptor time to become ready

        # Keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # First scroll triggers the first API response.
        driver.execute_script(scroll_script)
        time.sleep(1.0)

        print("Scrolling with intelligent stopping...")

        idle_scrolls = 0        # consecutive scrolls with no new reviews
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0  # consecutive scrolls with unchanged scrollTop

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(_scroll_delay(len(api_reviews)))

            try:
                _drain_api_reviews(interceptor, api_reviews)
            except Exception:
                pass  # best effort: a bad response must not stop scrolling

            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
                if (i + 1) % 10 == 0:
                    print(f" {current_count} reviews...")
            last_count = current_count

            # An unchanged scrollTop means the pane did not move.
            try:
                current_scroll = driver.execute_script(
                    "return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception:
                pass

            # Both signals are required before giving up.
            if idle_scrolls >= max_idle and scroll_stuck_count >= 3:
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        print(f" Final collection sweep (currently have {len(api_reviews)})...")

        # A few extra scrolls with longer waits.
        for _ in range(5):
            driver.execute_script(scroll_script)
            time.sleep(0.8)
            try:
                new_count = _drain_api_reviews(interceptor, api_reviews)
            except Exception:
                continue
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Final wait and drain.
        time.sleep(1.0)
        try:
            _drain_api_reviews(interceptor, api_reviews)
        except Exception:
            pass

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        total = len(all_reviews)

        print(f"\n✅ COMPLETED!")
        print(f"Reviews: {total} (target: {expected_total})")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        # 155 is the hard-coded baseline (seconds) this script compares against.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")

        if total >= expected_total:
            print(f"🎯 Got ALL reviews!")
        else:
            # Reported for any shortfall, not only when within four of the target.
            print(f"⚠️ Missing {expected_total-total} reviews")

        print()

        with open('google_reviews_complete.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to google_reviews_complete.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always release the browser, even on errors or Ctrl-C.
        try:
            driver.quit()
        except Exception:
            pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point. Exit status is 0 only when at least one review was
    # scraped; an interrupt, an error, or an empty result all give 1.
    import traceback

    exit_status = 1
    try:
        exit_status = 0 if complete_scrape() else 1
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as exc:
        print(f"ERROR: {exc}")
        traceback.print_exc()
    sys.exit(exit_status)
|
||||
Reference in New Issue
Block a user