Remove old scraper files - consolidate to scraper_clean

Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:25:00 +00:00
parent 80e7771c00
commit 8ccf72a489
37 changed files with 859 additions and 11116 deletions

View File

@@ -1,383 +0,0 @@
#!/usr/bin/env python3
"""
FastAPI server for Google Reviews Scraper.
Provides REST API endpoints to trigger and manage scraping jobs.
"""
import logging
import asyncio
from contextlib import asynccontextmanager
from typing import Dict, Any, List, Optional
from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl, Field
from modules.job_manager import JobManager, JobStatus, ScrapingJob
from modules.chrome_pool import start_worker_pools, stop_worker_pools, get_pool_stats, get_validation_worker, release_validation_worker
from modules.fast_scraper import check_reviews_available, get_business_card_info
# Configure logging
# Root logger at INFO; each record shows timestamp, logger name, level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module logger used by every endpoint and by the lifespan handler below.
log = logging.getLogger("api_server")
# Global job manager instance
# Assigned by lifespan() at startup; endpoints answer 500 while it is still None.
job_manager: Optional[JobManager] = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown.

    Startup: starts the Chrome worker pools, creates the global
    ``job_manager`` and launches the hourly job-cleanup task.
    Shutdown: cancels the cleanup task, shuts the job manager down and
    stops the Chrome worker pools.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global job_manager

    # Startup
    log.info("Starting Google Reviews Scraper API Server")

    # Start Chrome worker pools
    log.info("Initializing Chrome worker pools...")
    start_worker_pools(
        validation_size=1,  # 1 pre-warmed worker for validation
        scraping_size=2,    # 2 pre-warmed workers for scraping
        headless=True
    )

    job_manager = JobManager(max_concurrent_jobs=3)

    # Start auto-cleanup task. Keep a strong reference: the event loop only
    # holds weak references to tasks, so an unreferenced task may be
    # garbage-collected before it finishes.
    cleanup_task = asyncio.create_task(cleanup_jobs_periodically())

    try:
        yield
    finally:
        # Shutdown (runs even if an exception is thrown in at the yield)
        log.info("Shutting down Google Reviews Scraper API Server")

        # Stop the background loop first so it cannot touch the job manager
        # while it is being shut down.
        cleanup_task.cancel()
        try:
            await cleanup_task
        except asyncio.CancelledError:
            pass
        except Exception as exc:
            # The task had already died on its own; report it but keep shutting down.
            log.error(f"Cleanup task ended with an error: {exc}")

        if job_manager:
            job_manager.shutdown()

        # Stop Chrome worker pools
        log.info("Stopping Chrome worker pools...")
        stop_worker_pools()
# Initialize FastAPI app
# The lifespan handler above owns startup/shutdown of the pools and job manager.
app = FastAPI(
    title="Google Reviews Scraper API",
    description="REST API for triggering and managing Google Maps review scraping jobs",
    version="1.0.0",
    lifespan=lifespan
)
# Add CORS middleware
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive; restrict allow_origins to known front-ends before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Pydantic models for API
class ScrapeRequest(BaseModel):
    """Request model for starting a scrape job.

    Only ``url`` is required. Every other field defaults to ``None``;
    the /scrape endpoint forwards only the fields that are not ``None``
    as configuration overrides.
    """
    # Required; validated as a URL by pydantic's HttpUrl type.
    url: HttpUrl = Field(..., description="Google Maps URL to scrape")
    headless: Optional[bool] = Field(None, description="Run Chrome in headless mode (default: True)")
    max_scrolls: Optional[int] = Field(None, description="Maximum scrolls (default: unlimited - stops via idle detection)")
    sort_by: Optional[str] = Field(None, description="Sort order: newest, highest, lowest, relevance")
    stop_on_match: Optional[bool] = Field(None, description="Stop when first already-seen review is encountered")
    overwrite_existing: Optional[bool] = Field(None, description="Overwrite existing reviews instead of appending")
    download_images: Optional[bool] = Field(None, description="Download images from reviews")
    use_s3: Optional[bool] = Field(None, description="Upload images to S3")
    # Free-form mapping; not validated beyond being a dict with string keys.
    custom_params: Optional[Dict[str, Any]] = Field(None, description="Custom parameters to add to each document")
class JobResponse(BaseModel):
    """Response model for job information.

    The endpoints in this file build it from ``job.to_dict()``, so the
    field names must match the keys that dict provides.
    """
    job_id: str
    status: JobStatus
    url: str
    # Timestamps are carried as strings; the exact format comes from to_dict().
    created_at: str
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    updated_at: Optional[str] = None  # Last update time for progress tracking
    error_message: Optional[str] = None
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available for this place
    images_count: Optional[int] = None
    progress: Optional[Dict[str, Any]] = None
    scrape_time: Optional[float] = None  # Time taken to scrape in seconds
class JobStatsResponse(BaseModel):
    """Response model for job statistics.

    The /stats endpoint builds it by unpacking ``job_manager.get_stats()``.
    """
    total_jobs: int
    # Job count per status, keyed by status name.
    by_status: Dict[str, int]
    running_jobs: int
    max_concurrent_jobs: int
class ReviewsResponse(BaseModel):
    """Response model for reviews data returned for a single job."""
    job_id: str
    # One dict per review; the schema is whatever the job manager stored.
    reviews: List[Dict[str, Any]]
    # The reviews endpoint sets this to len(reviews).
    count: int
# Background task for periodic cleanup
async def cleanup_jobs_periodically():
    """Periodically clean up old jobs.

    Sleeps for an hour, then asks the job manager to drop jobs older than
    24 hours, forever. A failed cleanup pass is logged and retried on the
    next cycle instead of silently ending the loop. Cancellation still
    propagates, so the task can be stopped at shutdown.
    """
    while True:
        await asyncio.sleep(3600)  # Run every hour
        if not job_manager:
            continue
        try:
            job_manager.cleanup_old_jobs(max_age_hours=24)
        except Exception:
            # Without this, one bad pass would kill the task and cleanup
            # would never run again for the lifetime of the process.
            log.exception("Periodic job cleanup failed; will retry on the next cycle")
# API Endpoints
@app.get("/", summary="API Health Check")
async def root():
    """Report that the API process is up, together with its version."""
    payload = {
        "message": "Google Reviews Scraper API is running",
        "status": "healthy",
        "version": "1.0.0"
    }
    return payload
@app.post("/scrape", response_model=Dict[str, str], summary="Start Scraping Job")
async def start_scrape(request: ScrapeRequest, background_tasks: BackgroundTasks):
    """
    Create a scraping job for the given URL and try to start it right away.

    The response carries the job ID, which can be used to check status,
    and says whether the job was started or queued.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    # Forward only the options the caller actually set; the URL travels separately.
    config_overrides = {
        name: setting
        for name, setting in request.dict().items()
        if name != "url" and setting is not None
    }

    # The job manager receives the URL as a plain string
    target_url = str(request.url)

    try:
        job_id = job_manager.create_job(target_url, config_overrides)
        started = job_manager.start_job(job_id)

        log.info(f"Created scraping job {job_id} for URL: {target_url}")

        outcome = "started" if started else "queued"
        return {
            "job_id": job_id,
            "status": outcome,
            "message": f"Scraping job {outcome} successfully"
        }
    except Exception as e:
        log.error(f"Error creating scraping job: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create scraping job: {str(e)}")
@app.get("/jobs/{job_id}", response_model=JobResponse, summary="Get Job Status")
async def get_job(job_id: str):
    """Return the current state of one job, or 404 if the ID is unknown."""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    found = job_manager.get_job(job_id)
    if found:
        return JobResponse(**found.to_dict())

    raise HTTPException(status_code=404, detail="Job not found")
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: str):
    """
    Return the reviews collected by a completed job.

    Responds 404 when the job does not exist or has no stored reviews,
    and 400 when the job exists but has not completed.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    reviews = job_manager.get_job_reviews(job_id)
    if reviews is not None:
        return ReviewsResponse(
            job_id=job_id,
            reviews=reviews,
            count=len(reviews)
        )

    # No reviews: look at the job itself to explain why.
    job = job_manager.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != JobStatus.COMPLETED:
        raise HTTPException(
            status_code=400,
            detail=f"Job not completed yet (current status: {job.status})"
        )
    raise HTTPException(status_code=404, detail="Reviews data not available")
@app.get("/jobs", response_model=List[JobResponse], summary="List Jobs")
async def list_jobs(
    status: Optional[JobStatus] = Query(None, description="Filter by job status"),
    limit: int = Query(100, description="Maximum number of jobs to return", ge=1, le=1000)
):
    """Return up to ``limit`` jobs, restricted to one status when a filter is given."""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    matching = job_manager.list_jobs(status=status, limit=limit)
    # Lazily turn each job into its dict form, then into a response model.
    payloads = (entry.to_dict() for entry in matching)
    return [JobResponse(**payload) for payload in payloads]
@app.post("/jobs/{job_id}/start", summary="Start Pending Job")
async def start_job(job_id: str):
    """
    Manually start a job that is still pending.

    Responds 404 for an unknown job, 400 when the job is not pending,
    and 429 when the concurrent-job limit has been reached.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    if job_manager.start_job(job_id):
        return {"message": "Job started successfully"}

    # Start was refused: inspect the job to pick the right error.
    job = job_manager.get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != JobStatus.PENDING:
        raise HTTPException(status_code=400, detail=f"Job is not pending (current status: {job.status})")
    raise HTTPException(status_code=429, detail="Maximum concurrent jobs reached")
@app.post("/jobs/{job_id}/cancel", summary="Cancel Job")
async def cancel_job(job_id: str):
    """
    Cancel a job that is pending or running.

    Responds 404 for an unknown job and 400 when the job can no longer
    be cancelled.
    """
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    if job_manager.cancel_job(job_id):
        return {"message": "Job cancelled successfully"}

    # Cancellation was refused: separate "unknown job" from "too late".
    if not job_manager.get_job(job_id):
        raise HTTPException(status_code=404, detail="Job not found")
    raise HTTPException(status_code=400, detail="Job cannot be cancelled (already completed, failed, or cancelled)")
@app.delete("/jobs/{job_id}", summary="Delete Job")
async def delete_job(job_id: str):
    """Remove a job from the system; responds 404 when nothing was deleted."""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    if job_manager.delete_job(job_id):
        return {"message": "Job deleted successfully"}

    raise HTTPException(status_code=404, detail="Job not found")
@app.get("/stats", response_model=JobStatsResponse, summary="Get Job Statistics")
async def get_stats():
    """Return the job manager's statistics."""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    return JobStatsResponse(**job_manager.get_stats())
@app.post("/check-reviews", summary="Check if Business Has Reviews")
async def check_reviews(request: Dict[str, str]):
    """
    Lightweight validation endpoint to check if a business has reviews.

    Borrows a worker from the Chrome validation pool and returns the
    business-card info gathered for the URL (the log line reads its name,
    rating and total_reviews entries).

    Responds 400 when no URL is supplied, 503 when no validation worker
    becomes available within 10 seconds, and 500 when validation fails.
    """
    url = request.get("url")
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    log.info(f"Validating business at: {url}")

    # Get a worker from validation pool
    # NOTE(review): this call and get_business_card_info look synchronous and
    # would block the event loop while they run - confirm and consider a thread.
    worker = get_validation_worker(timeout=10)
    if not worker:
        raise HTTPException(
            status_code=503,
            detail="No validation workers available. Please try again in a few seconds."
        )

    # Decided in the except branch and acted on once in finally, so the worker
    # is handed back exactly one time whatever state its driver is in.
    recycle = False
    try:
        # Use the worker's driver to get business card info (faster than check_reviews_available)
        result = get_business_card_info(
            url=url,
            headless=True,
            driver=worker.driver,
            return_driver=True  # Don't close the driver
        )

        # Pop the driver from result before returning
        result.pop('driver', None)

        log.info(f"Validation result: name={result.get('name')}, rating={result.get('rating')}, reviews={result.get('total_reviews')}")
        return result
    except Exception as e:
        log.error(f"Error during validation: {e}")
        # Recycle worker if there was an error
        recycle = True
        raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}") from e
    finally:
        # Single release point for both the success and the failure path
        release_validation_worker(worker, recycle=recycle)
@app.get("/pool-stats", summary="Get Chrome Pool Statistics")
async def pool_stats():
    """Return the current statistics of the Chrome worker pools."""
    return get_pool_stats()
@app.post("/cleanup", summary="Manual Job Cleanup")
async def cleanup_jobs(max_age_hours: int = Query(24, description="Maximum age in hours", ge=1)):
    """Run a job cleanup pass now, using the given maximum age in hours."""
    if not job_manager:
        raise HTTPException(status_code=500, detail="Job manager not initialized")

    job_manager.cleanup_old_jobs(max_age_hours=max_age_hours)

    confirmation = f"Cleaned up jobs older than {max_age_hours} hours"
    return {"message": confirmation}
# Script entry point: serve this module's app with uvicorn.
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run directly.
    import uvicorn
    log.info("Starting FastAPI server...")
    uvicorn.run(
        "api_server:app",  # import string: the module must be importable as api_server
        host="0.0.0.0",  # listen on all network interfaces
        port=8000,
        reload=True,  # NOTE(review): auto-reload is a development setting - confirm before deploying
        log_level="info"
    )

View File

@@ -6,6 +6,7 @@ Production Google Reviews Scraper API Server with Phase 1 features:
- Smart health checks with canary testing - Smart health checks with canary testing
""" """
import asyncio import asyncio
import json
import logging import logging
import os import os
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
@@ -15,12 +16,12 @@ from uuid import UUID
from fastapi import FastAPI, HTTPException, Query, Header from fastapi import FastAPI, HTTPException, Query, Header
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl, Field from pydantic import BaseModel, HttpUrl, Field
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse, StreamingResponse
from modules.database import DatabaseManager, JobStatus from modules.database import DatabaseManager, JobStatus
from modules.webhooks import WebhookDispatcher, WebhookManager from modules.webhooks import WebhookDispatcher, WebhookManager
from modules.health_checks import HealthCheckSystem from modules.health_checks import HealthCheckSystem
from modules.scraper_clean import fast_scrape_reviews # Clean scraper with hard refresh recovery from modules.scraper_clean import fast_scrape_reviews, LogCapture # Clean scraper with hard refresh recovery
from modules.fast_scraper import check_reviews_available, get_business_card_info # Helper functions from modules.fast_scraper import check_reviews_available, get_business_card_info # Helper functions
from modules.chrome_pool import ( from modules.chrome_pool import (
start_worker_pools, start_worker_pools,
@@ -48,6 +49,11 @@ health_system: Optional[HealthCheckSystem] = None
MAX_CONCURRENT_JOBS = int(os.getenv('MAX_CONCURRENT_JOBS', '5')) MAX_CONCURRENT_JOBS = int(os.getenv('MAX_CONCURRENT_JOBS', '5'))
job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS) job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
# SSE: Store for broadcasting job updates to connected clients
# Format: {job_id: [asyncio.Queue, ...]} for job-specific streams
# Format: {"all": [asyncio.Queue, ...]} for all-jobs stream
job_update_queues: Dict[str, List[asyncio.Queue]] = {"all": []}
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
@@ -82,11 +88,12 @@ async def lifespan(app: FastAPI):
# Start Chrome worker pools (1 for validation, 2 for scraping) # Start Chrome worker pools (1 for validation, 2 for scraping)
# These pre-warm Chrome instances for instant availability # These pre-warm Chrome instances for instant availability
# headless=False because Docker uses Xvfb virtual display for better compatibility
await asyncio.to_thread( await asyncio.to_thread(
start_worker_pools, start_worker_pools,
validation_size=1, validation_size=1,
scraping_size=2, scraping_size=2,
headless=True headless=False
) )
log.info("Chrome worker pools started (1 validation + 2 scraping)") log.info("Chrome worker pools started (1 validation + 2 scraping)")
@@ -148,6 +155,9 @@ class JobResponse(BaseModel):
scrape_time: Optional[float] = None scrape_time: Optional[float] = None
error_message: Optional[str] = None error_message: Optional[str] = None
webhook_url: Optional[str] = None webhook_url: Optional[str] = None
# Business metadata
business_name: Optional[str] = None
business_address: Optional[str] = None
class ReviewsResponse(BaseModel): class ReviewsResponse(BaseModel):
@@ -239,12 +249,296 @@ async def get_job(job_id: UUID):
started_at=job['started_at'].isoformat() if job['started_at'] else None, started_at=job['started_at'].isoformat() if job['started_at'] else None,
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None, completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
reviews_count=job['reviews_count'], reviews_count=job['reviews_count'],
total_reviews=job.get('total_reviews'),
scrape_time=job['scrape_time'], scrape_time=job['scrape_time'],
error_message=job['error_message'], error_message=job['error_message'],
webhook_url=job.get('webhook_url') webhook_url=job.get('webhook_url')
) )
@app.get("/jobs/{job_id}/logs", summary="Get Job Logs")
async def get_job_logs(job_id: UUID):
"""
Get the scraper logs for a job.
Returns logs from both successful and failed jobs.
Useful for debugging scraping issues.
"""
if not db:
raise HTTPException(status_code=500, detail="Database not initialized")
job = await db.get_job(job_id)
if not job:
raise HTTPException(status_code=404, detail="Job not found")
# Get scrape_logs from job
scrape_logs = job.get('scrape_logs')
# Parse if string (asyncpg might return JSONB as string)
if isinstance(scrape_logs, str):
try:
scrape_logs = json.loads(scrape_logs)
except:
scrape_logs = None
return {
"job_id": str(job_id),
"status": job['status'],
"error_message": job.get('error_message'),
"logs": scrape_logs or [],
"log_count": len(scrape_logs) if scrape_logs else 0
}
# ==================== SSE Streaming Endpoints ====================
async def broadcast_job_update(job_id: str, event_type: str, data: dict):
"""Broadcast an update to all subscribers of a job stream and the all-jobs stream."""
message = f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
# Send to job-specific subscribers
if job_id in job_update_queues:
for queue in job_update_queues[job_id]:
try:
await queue.put(message)
except:
pass
# Send to all-jobs subscribers
for queue in job_update_queues.get("all", []):
try:
await queue.put(message)
except:
pass
@app.get("/jobs/{job_id}/stream", summary="Stream Job Updates (SSE)")
async def stream_job_updates(job_id: UUID):
"""
Server-Sent Events stream for real-time job updates.
Streams:
- status: Job status changes
- progress: Review count and progress updates
- logs: New log entries
- complete: Job finished (completed/failed)
Connect with EventSource in the browser:
```javascript
const es = new EventSource('/jobs/{job_id}/stream');
es.onmessage = (e) => console.log(JSON.parse(e.data));
es.addEventListener('logs', (e) => console.log('Logs:', JSON.parse(e.data)));
```
"""
if not db:
raise HTTPException(status_code=500, detail="Database not initialized")
# Verify job exists
job = await db.get_job(job_id)
if not job:
raise HTTPException(status_code=404, detail="Job not found")
job_id_str = str(job_id)
# Create queue for this client
queue: asyncio.Queue = asyncio.Queue()
# Register subscriber
if job_id_str not in job_update_queues:
job_update_queues[job_id_str] = []
job_update_queues[job_id_str].append(queue)
async def event_generator():
try:
# Send initial state
job_data = await db.get_job(job_id)
if job_data:
scrape_logs = job_data.get('scrape_logs')
if isinstance(scrape_logs, str):
try:
scrape_logs = json.loads(scrape_logs)
except:
scrape_logs = []
initial = {
"job_id": job_id_str,
"status": job_data['status'],
"reviews_count": job_data.get('reviews_count'),
"total_reviews": job_data.get('total_reviews'),
"scrape_time": job_data.get('scrape_time'),
"error_message": job_data.get('error_message'),
"logs": scrape_logs or []
}
yield f"event: init\ndata: {json.dumps(initial)}\n\n"
# If job is already complete, send complete event and close
if job_data and job_data['status'] in ['completed', 'failed', 'cancelled']:
yield f"event: complete\ndata: {json.dumps({'status': job_data['status']})}\n\n"
return
# Keep connection alive and send updates
last_log_count = len(scrape_logs) if scrape_logs else 0
last_reviews_count = job_data.get('reviews_count') if job_data else 0
while True:
try:
# Wait for update with timeout (for keepalive)
try:
message = await asyncio.wait_for(queue.get(), timeout=2.0)
yield message
except asyncio.TimeoutError:
# Send keepalive comment
yield ": keepalive\n\n"
# Also poll database for updates (backup in case broadcast missed)
job_data = await db.get_job(job_id)
if job_data:
# Check for status change
if job_data['status'] in ['completed', 'failed', 'cancelled']:
scrape_logs = job_data.get('scrape_logs')
if isinstance(scrape_logs, str):
try:
scrape_logs = json.loads(scrape_logs)
except:
scrape_logs = []
final = {
"job_id": job_id_str,
"status": job_data['status'],
"reviews_count": job_data.get('reviews_count'),
"total_reviews": job_data.get('total_reviews'),
"scrape_time": job_data.get('scrape_time'),
"error_message": job_data.get('error_message'),
"logs": scrape_logs or []
}
yield f"event: complete\ndata: {json.dumps(final)}\n\n"
return
# Check for new logs or progress
scrape_logs = job_data.get('scrape_logs')
if isinstance(scrape_logs, str):
try:
scrape_logs = json.loads(scrape_logs)
except:
scrape_logs = []
current_log_count = len(scrape_logs) if scrape_logs else 0
current_reviews = job_data.get('reviews_count') or 0
if current_log_count > last_log_count or current_reviews != last_reviews_count:
update = {
"job_id": job_id_str,
"status": job_data['status'],
"reviews_count": current_reviews,
"total_reviews": job_data.get('total_reviews'),
"logs": scrape_logs or []
}
yield f"event: update\ndata: {json.dumps(update)}\n\n"
last_log_count = current_log_count
last_reviews_count = current_reviews
except Exception as e:
log.error(f"Error in SSE stream for job {job_id}: {e}")
break
finally:
# Unregister subscriber
if job_id_str in job_update_queues:
try:
job_update_queues[job_id_str].remove(queue)
if not job_update_queues[job_id_str]:
del job_update_queues[job_id_str]
except:
pass
return StreamingResponse(
event_generator(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no" # Disable nginx buffering
}
)
@app.get("/jobs/stream", summary="Stream All Jobs Updates (SSE)")
async def stream_all_jobs():
"""
Server-Sent Events stream for all job updates.
Streams:
- job_created: New job was created
- job_updated: Job status/progress changed
- job_completed: Job finished
Connect with EventSource in the browser:
```javascript
const es = new EventSource('/jobs/stream');
es.addEventListener('job_updated', (e) => console.log('Update:', JSON.parse(e.data)));
```
"""
if not db:
raise HTTPException(status_code=500, detail="Database not initialized")
# Create queue for this client
queue: asyncio.Queue = asyncio.Queue()
# Register subscriber to all-jobs stream
job_update_queues["all"].append(queue)
async def event_generator():
try:
# Send initial jobs list
jobs = await db.list_jobs(limit=100)
jobs_data = [
{
"job_id": str(j['job_id']),
"status": j['status'],
"url": j['url'],
"created_at": j['created_at'].isoformat(),
"completed_at": j['completed_at'].isoformat() if j.get('completed_at') else None,
"reviews_count": j.get('reviews_count'),
"scrape_time": j.get('scrape_time'),
"error_message": j.get('error_message')
}
for j in jobs
]
yield f"event: init\ndata: {json.dumps({'jobs': jobs_data})}\n\n"
# Keep connection alive and send updates
while True:
try:
# Wait for update with timeout (for keepalive)
try:
message = await asyncio.wait_for(queue.get(), timeout=5.0)
yield message
except asyncio.TimeoutError:
# Send keepalive comment
yield ": keepalive\n\n"
except Exception as e:
log.error(f"Error in all-jobs SSE stream: {e}")
break
finally:
# Unregister subscriber
try:
job_update_queues["all"].remove(queue)
except:
pass
return StreamingResponse(
event_generator(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no"
}
)
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews") @app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: UUID): async def get_job_reviews(job_id: UUID):
""" """
@@ -298,19 +592,34 @@ async def list_jobs(
jobs = await db.list_jobs(status=job_status, limit=limit, offset=offset) jobs = await db.list_jobs(status=job_status, limit=limit, offset=offset)
return [ result = []
JobResponse( for job in jobs:
# Extract business info from metadata if available
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
business_name = metadata.get('business_name') if metadata else None
business_address = metadata.get('business_address') if metadata else None
result.append(JobResponse(
job_id=str(job['job_id']), job_id=str(job['job_id']),
status=job['status'], status=job['status'],
url=job['url'], url=job['url'],
created_at=job['created_at'].isoformat(), created_at=job['created_at'].isoformat(),
completed_at=job['completed_at'].isoformat() if job.get('completed_at') else None, completed_at=job['completed_at'].isoformat() if job.get('completed_at') else None,
reviews_count=job.get('reviews_count'), reviews_count=job.get('reviews_count'),
total_reviews=job.get('total_reviews'),
scrape_time=job.get('scrape_time'), scrape_time=job.get('scrape_time'),
error_message=job.get('error_message') error_message=job.get('error_message'),
) business_name=business_name,
for job in jobs business_address=business_address
] ))
return result
@app.delete("/jobs/{job_id}", summary="Delete Job") @app.delete("/jobs/{job_id}", summary="Delete Job")
@@ -370,11 +679,11 @@ async def check_reviews(request: ScrapeRequest):
# SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
# Let the actual scraper determine if reviews exist # Let the actual scraper determine if reviews exist
has_business = result.get('name') and result.get('rating') has_business = bool(result.get('name') and result.get('rating'))
return { return {
"has_reviews": has_business, # Assume true if business exists "has_reviews": has_business, # Boolean: true if business exists
"total_reviews": result['total_reviews'] or 0, # Show 0 if unknown "total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown
"name": result.get('name'), "name": result.get('name'),
"address": result.get('address'), "address": result.get('address'),
"rating": result.get('rating'), "rating": result.get('rating'),
@@ -488,6 +797,8 @@ async def run_scraping_job(job_id: UUID):
Args: Args:
job_id: Job UUID job_id: Job UUID
""" """
job_id_str = str(job_id)
async with job_semaphore: # Limit concurrent Chrome instances async with job_semaphore: # Limit concurrent Chrome instances
try: try:
# Update status to running # Update status to running
@@ -498,44 +809,79 @@ async def run_scraping_job(job_id: UUID):
job = await db.get_job(job_id) job = await db.get_job(job_id)
url = job['url'] url = job['url']
# Broadcast job started via SSE
await broadcast_job_update(job_id_str, "job_started", {
"job_id": job_id_str,
"status": "running",
"url": url
})
# Get the event loop for progress updates from worker thread # Get the event loop for progress updates from worker thread
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
# Progress callback to update job status with current/total counts # Create log capture instance that we can access for real-time logs
log_capture = LogCapture()
# Progress callback to update job status with current/total counts AND logs
def progress_callback(current_count: int, total_count: int): def progress_callback(current_count: int, total_count: int):
"""Update job progress from worker thread""" """Update job progress and logs from worker thread"""
async def update(): async def update():
# Get current logs from the shared log_capture
current_logs = log_capture.get_logs()
await db.update_job_status( await db.update_job_status(
job_id, job_id,
JobStatus.RUNNING, JobStatus.RUNNING,
reviews_count=current_count, reviews_count=current_count,
total_reviews=total_count total_reviews=total_count,
scrape_logs=current_logs
) )
# Broadcast progress via SSE
await broadcast_job_update(job_id_str, "job_progress", {
"job_id": job_id_str,
"status": "running",
"reviews_count": current_count,
"total_reviews": total_count,
"logs": current_logs
})
# Schedule the coroutine on the event loop # Schedule the coroutine on the event loop
asyncio.run_coroutine_threadsafe(update(), loop) asyncio.run_coroutine_threadsafe(update(), loop)
# Run scraping with progress callback # Run scraping with progress callback and shared log capture
# headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread( result = await asyncio.to_thread(
fast_scrape_reviews, fast_scrape_reviews,
url=url, url=url,
headless=True, headless=False,
progress_callback=progress_callback progress_callback=progress_callback,
log_capture=log_capture
) )
if result['success']: if result['success']:
# Save results to database # Save results to database (including scraper logs)
await db.save_job_result( await db.save_job_result(
job_id=job_id, job_id=job_id,
reviews=result['reviews'], reviews=result['reviews'],
scrape_time=result['time'], scrape_time=result['time'],
total_reviews=result.get('total_reviews') total_reviews=result.get('total_reviews'),
scrape_logs=result.get('logs')
) )
log.info( log.info(
f"Completed job {job_id}: {result['count']} reviews in {result['time']:.1f}s" f"Completed job {job_id}: {result['count']} reviews in {result['time']:.1f}s"
) )
# Broadcast job completed via SSE
await broadcast_job_update(job_id_str, "job_completed", {
"job_id": job_id_str,
"status": "completed",
"reviews_count": result['count'],
"total_reviews": result.get('total_reviews'),
"scrape_time": result['time'],
"logs": result.get('logs', [])
})
# Send webhook if configured # Send webhook if configured
if job.get('webhook_url'): if job.get('webhook_url'):
webhook_manager = WebhookManager() webhook_manager = WebhookManager()
@@ -553,15 +899,24 @@ async def run_scraping_job(job_id: UUID):
) )
else: else:
# Job failed # Job failed - save logs for debugging
await db.update_job_status( await db.update_job_status(
job_id, job_id,
JobStatus.FAILED, JobStatus.FAILED,
error_message=result.get('error', 'Unknown error') error_message=result.get('error', 'Unknown error'),
scrape_logs=result.get('logs')
) )
log.error(f"Failed job {job_id}: {result.get('error')}") log.error(f"Failed job {job_id}: {result.get('error')}")
# Broadcast job failed via SSE
await broadcast_job_update(job_id_str, "job_failed", {
"job_id": job_id_str,
"status": "failed",
"error_message": result.get('error'),
"logs": result.get('logs', [])
})
# Send failure webhook if configured # Send failure webhook if configured
if job.get('webhook_url'): if job.get('webhook_url'):
webhook_manager = WebhookManager() webhook_manager = WebhookManager()
@@ -585,6 +940,14 @@ async def run_scraping_job(job_id: UUID):
error_message=str(e) error_message=str(e)
) )
# Broadcast job failed via SSE
await broadcast_job_update(job_id_str, "job_failed", {
"job_id": job_id_str,
"status": "failed",
"error_message": str(e),
"logs": []
})
# Send failure webhook # Send failure webhook
job = await db.get_job(job_id) job = await db.get_job(job_id)
if job and job.get('webhook_url'): if job and job.get('webhook_url'):

View File

@@ -1,166 +0,0 @@
#!/usr/bin/env python3
"""
Brute force approach: Try every possible div class combination and see which gives us reviews.
"""
import time
from seleniumbase import Driver
# Debug script: open a Google Maps search result, open its reviews tab and
# score every div inside the reviews pane to find a CSS selector that matches
# individual reviews.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent: best-effort click on the "accept all" form button.
    # Only ordinary errors are tolerated so Ctrl-C still stops the script.
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"[warn] Could not dismiss consent dialog: {exc}")

    # Click reviews tab (matched on visible text or aria-label)
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll the reviews pane to load more reviews (best-effort)
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(10):
            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
            time.sleep(0.3)
    except Exception as exc:
        print(f"[warn] Could not scroll reviews pane: {exc}")

    print("\n" + "="*80)
    print("BRUTE FORCE SELECTOR SEARCH")
    print("="*80)

    # Get ALL unique class combinations from divs inside the reviews pane
    candidates = driver.execute_script("""
        // Find the reviews pane
        const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde');
        if (!pane) return {error: 'Pane not found'};
        // Get all divs inside the pane
        const allDivs = Array.from(pane.querySelectorAll('div'));
        // For each div, check if it looks like a review
        const candidates = [];
        for (let div of allDivs) {
            // Skip if no classes
            if (!div.className || div.className.length === 0) continue;
            // Check for review indicators
            const hasRating = !!div.querySelector('[aria-label*="star" i]');
            const hasText = div.textContent.length > 50 && div.textContent.length < 1000; // Individual review size
            const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img');
            // Calculate score
            let score = 0;
            if (hasRating) score += 3;
            if (hasText) score += 2;
            if (hasAuthor) score += 1;
            if (score >= 4) { // Must have rating + text at minimum
                candidates.push({
                    classes: div.className,
                    selector: 'div.' + div.className.split(' ').filter(c => c).join('.'),
                    score: score,
                    text_length: div.textContent.length,
                    sample_text: div.textContent.substring(0, 100)
                });
            }
        }
        // Count how many elements match each selector
        const selectorCounts = {};
        for (let candidate of candidates) {
            const count = pane.querySelectorAll(candidate.selector).length;
            if (!selectorCounts[candidate.selector]) {
                selectorCounts[candidate.selector] = {
                    count: count,
                    score: candidate.score,
                    text_length: candidate.text_length,
                    sample: candidate.sample_text
                };
            }
        }
        // Sort by count (we want selectors that match many reviews)
        const sorted = Object.entries(selectorCounts)
            .sort((a, b) => b[1].count - a[1].count)
            .slice(0, 10);
        return {
            top_selectors: sorted.map(([selector, info]) => ({
                selector: selector,
                count: info.count,
                score: info.score,
                text_length: info.text_length,
                sample: info.sample
            }))
        };
    """)

    if 'error' in candidates:
        print(f"ERROR: {candidates['error']}")
    else:
        print("\nTop 10 candidate selectors (sorted by count):\n")
        for i, candidate in enumerate(candidates['top_selectors'], 1):
            print(f"{i}. {candidate['selector']}")
            print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}")
            print(f" Sample: {candidate['sample'][:80]}...")
            print()

        # Test the top selector
        if candidates['top_selectors']:
            top_selector = candidates['top_selectors'][0]['selector']
            print(f"\n{'='*80}")
            print(f"TESTING TOP SELECTOR: {top_selector}")
            print(f"{'='*80}")
            # The selector is passed as a script argument instead of being
            # spliced into the JS source, so quotes or backslashes in a class
            # name cannot break (or alter) the injected script.
            test_result = driver.execute_script("""
                const elements = document.querySelectorAll(arguments[0]);
                const reviews = [];
                for (let i = 0; i < Math.min(3, elements.length); i++) {
                    const elem = elements[i];
                    const review = {
                        has_author: !!elem.querySelector('button, img'),
                        has_rating: !!elem.querySelector('[aria-label*="star" i]'),
                        has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
                        text_length: elem.textContent.length,
                        text_sample: elem.textContent.substring(0, 150)
                    };
                    reviews.push(review);
                }
                return reviews;
            """, top_selector)
            print(f"\nFirst 3 elements using {top_selector}:")
            for i, rev in enumerate(test_result, 1):
                print(f"\n Element {i}:")
                for key, value in rev.items():
                    print(f" {key}: {value}")

    print(f"\n{'='*80}")
    print("Browser staying open for 60 seconds...")
    print(f"{'='*80}")
    time.sleep(60)
finally:
    # Always release the browser, even when a step above fails.
    driver.quit()

View File

@@ -1,106 +0,0 @@
#!/usr/bin/env python3
"""
Check the actual page structure - maybe reviews are already visible without clicking a tab!
"""
import time
from seleniumbase import Driver
# Debug script: load a Google Maps search result and report which review
# related selectors match on the landing page, without clicking any tab.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
driver = Driver(uc=True, headless=False)
try:
    driver.get(url)
    print(f"Initial URL: {url}")
    time.sleep(5)

    # GDPR consent: best-effort click on any form button containing "accept".
    # Only ordinary errors are tolerated so Ctrl-C still stops the script.
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            if 'accept' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"[warn] Could not dismiss consent dialog: {exc}")

    # Check final URL
    final_url = driver.current_url
    print(f"Final URL after redirect: {final_url}")

    # Wait a bit more for dynamic content
    time.sleep(3)

    # Check page structure
    print("\n" + "="*80)
    print("PAGE STRUCTURE ANALYSIS")
    print("="*80)
    page_info = driver.execute_script("""
        return {
            tabs_found: document.querySelectorAll('button[role="tab"]').length,
            reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length,
            reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length,
            divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length,
            review_containers: document.querySelectorAll('div.fontBodyMedium').length,
            page_text_sample: document.body.innerText.substring(0, 500),
            has_review_text: document.body.innerText.toLowerCase().includes('review'),
            has_atsiliepimai_text: document.body.innerText.toLowerCase().includes('atsiliepimai')
        };
    """)
    print(f"\nTabs with role='tab': {page_info['tabs_found']}")
    print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}")
    print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}")
    print(f"Elements with star ratings: {page_info['divs_with_ratings']}")
    print(f"div.fontBodyMedium: {page_info['review_containers']}")
    print(f"Contains 'review': {page_info['has_review_text']}")
    print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}")
    print("\nPage text sample (first 500 chars):")
    print(page_info['page_text_sample'])

    # Try to find ANY element with rating
    print("\n" + "="*80)
    print("SEARCHING FOR RATING ELEMENTS")
    print("="*80)
    rating_search = driver.execute_script("""
        const elements = Array.from(document.querySelectorAll('*'));
        const withRatings = [];
        for (let elem of elements) {
            const ariaLabel = elem.getAttribute('aria-label') || '';
            if (ariaLabel.toLowerCase().includes('star') || ariaLabel.toLowerCase().includes('žvaigžd')) {
                withRatings.push({
                    tag: elem.tagName,
                    ariaLabel: ariaLabel.substring(0, 100),
                    classes: elem.className.substring(0, 100),
                    parentTag: elem.parentElement ? elem.parentElement.tagName : null,
                    parentClasses: elem.parentElement ? elem.parentElement.className.substring(0, 100) : null
                });
            }
        }
        return withRatings.slice(0, 10); // First 10
    """)
    # The script returns at most 10 matches, so the count printed here is
    # capped; say so rather than implying it is the page-wide total.
    print(f"\nFound {len(rating_search)} elements with 'star' in aria-label (capped at 10):")
    for i, elem in enumerate(rating_search[:5], 1):
        print(f"\n Element {i}:")
        print(f" Tag: {elem['tag']}")
        print(f" Aria-label: {elem['ariaLabel']}")
        print(f" Classes: {elem['classes']}")
        print(f" Parent tag: {elem['parentTag']}")
        print(f" Parent classes: {elem['parentClasses']}")

    print(f"\n{'='*80}")
    print("Browser open for manual inspection...")
    print("LOOK AT THE PAGE - Are reviews visible? What's their structure?")
    print(f"{'='*80}")
    time.sleep(180)  # 3 minutes
finally:
    # Always release the browser, even when a step above fails.
    driver.quit()

View File

@@ -1,355 +0,0 @@
#!/usr/bin/env python3
"""
Cookie-based API scraper - Capture fresh cookies on each run, then fast API scraping.
Flow:
1. Start browser (15 seconds)
2. Capture cookies from active browser session (5 seconds)
3. Close browser
4. Use cookies for rapid API pagination (5-10 seconds)
Total time: ~25-35 seconds for 244 reviews (vs 155 seconds with scrolling)
"""
import json
import logging
import time
from typing import List, Optional, Tuple
import requests
from seleniumbase import SB
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
class CookieBasedScraper:
    """Capture cookies from a browser on each run, then scrape via API.

    A SeleniumBase browser is opened once to collect cookies, the user agent
    and the place ID; these are copied into a ``requests.Session`` which is
    then used to page through the ``listugcposts`` endpoint.
    """

    _API_URL = 'https://www.google.com/maps/rpc/listugcposts'

    # The `pb` query parameter is identical for first and follow-up requests
    # except for an optional "!2s<token>" segment between _PB_PAGING and _PB_TAIL.
    _PB_HEAD = "!1m6!1s"
    _PB_PAGING = "!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
    _PB_TAIL = ("!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
                "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1")

    # Extra origins queried for cookies in addition to Network.getAllCookies.
    _COOKIE_DOMAINS = ('.google.com', 'www.google.com', '.google.es', 'maps.google.com')

    # Cookie names reported as "auth cookies" in the diagnostics.
    _AUTH_COOKIE_NAMES = ('SID', 'HSID', 'SSID', 'APISID', 'SAPISID',
                          '__Secure-1PSID', '__Secure-3PSID')

    def __init__(self, url: str, headless: bool = False):
        """Store the target URL and create an empty HTTP session.

        Args:
            url: Google Maps place URL to open in the browser.
            headless: Passed through to ``SB(headless=...)``.
        """
        self.url = url
        self.headless = headless
        self.session = requests.Session()
        self.place_id = None
        # Constructed without a driver; only its response parser is used here.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    @staticmethod
    def _extract_place_id(current_url: str) -> Optional[str]:
        """Return the text between the first '!1s' and the next '!', or None."""
        _, marker, rest = current_url.partition('!1s')
        if not marker:
            return None
        return rest.split('!', 1)[0] or None

    def _build_pb(self, continuation_token: Optional[str] = None) -> str:
        """Build the `pb` parameter, adding the paging token when one is given."""
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return f"{self._PB_HEAD}{self.place_id}{self._PB_PAGING}{token_part}{self._PB_TAIL}"

    def _read_browser_cookies(self, sb) -> list:
        """Collect cookies via CDP, falling back to ``document.cookie``."""
        log.info("Capturing cookies via CDP...")
        try:
            # Use Chrome DevTools Protocol to get ALL cookies from all domains
            cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
            browser_cookies = cdp_cookies.get('cookies', [])
            log.info(f"✓ Captured {len(browser_cookies)} cookies via CDP")

            # Identity of a cookie is (name, domain, path): de-duplicating on
            # the name alone would drop same-named cookies of other domains.
            seen = {(c['name'], c.get('domain'), c.get('path')) for c in browser_cookies}
            for domain in self._COOKIE_DOMAINS:
                try:
                    domain_cookies = sb.driver.execute_cdp_cmd(
                        'Network.getCookies', {'urls': [f'https://{domain}']})
                except Exception as exc:
                    log.debug(f"Cookie lookup for {domain} failed: {exc}")
                    continue
                extra_cookies = domain_cookies.get('cookies', [])
                if not extra_cookies:
                    continue
                log.info(f" Found {len(extra_cookies)} cookies for {domain}")
                for cookie in extra_cookies:
                    key = (cookie['name'], cookie.get('domain'), cookie.get('path'))
                    if key not in seen:
                        seen.add(key)
                        browser_cookies.append(cookie)
            log.info(f"✓ Total cookies after checking all domains: {len(browser_cookies)}")
            return browser_cookies
        except Exception as e:
            log.warning(f"CDP cookie capture failed: {e}")

        # Fallback to JavaScript (won't get httpOnly cookies)
        cookie_string = sb.execute_script("return document.cookie")
        browser_cookies = []
        for cookie in cookie_string.split('; '):
            if '=' in cookie:
                name, value = cookie.split('=', 1)
                browser_cookies.append({
                    'name': name,
                    'value': value,
                    'domain': '.google.com',
                    'path': '/'
                })
        log.info(f"✓ Fallback: Captured {len(browser_cookies)} cookies via JS")
        return browser_cookies

    def _capture_from_browser(self, sb) -> bool:
        """Drive the open browser and copy its session state into ``self.session``."""
        log.info("Opening Google Maps...")
        sb.open(self.url)
        time.sleep(2)

        # Dismiss cookie consent (best-effort)
        try:
            sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
            log.info("✓ Cookie dialog dismissed")
        except Exception:
            pass

        # Click reviews tab
        try:
            sb.click('.LRkQ2', timeout=5)
            log.info("✓ Opened reviews tab")
            time.sleep(3)  # Wait for reviews to load
        except Exception as e:
            log.warning(f"Could not click reviews tab: {e}")

        # Extract place ID from current URL
        extracted = self._extract_place_id(sb.get_current_url())
        if extracted:
            self.place_id = extracted
            log.info(f"✓ Extracted place ID: {self.place_id}")
        if not self.place_id:
            log.error("Could not extract place ID")
            return False

        # Scroll once so the page issues an API call before cookies are read.
        log.info("Triggering API call by scrolling...")
        sb.execute_script("window.scrollBy(0, 500)")
        time.sleep(2)  # Wait for API call to complete
        log.info("✓ API call triggered - session cookies should now be set")

        browser_cookies = self._read_browser_cookies(sb)

        # Capture the user agent while the driver is still active
        user_agent = sb.execute_script("return navigator.userAgent")
        log.info("✓ Captured user agent")

        # Process cookies into session
        for cookie in browser_cookies:
            self.session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/')
            )

        # Set headers
        self.session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
            'X-Requested-With': 'XMLHttpRequest',
        })

        # Print ALL cookie names for debugging
        all_cookie_names = [c['name'] for c in browser_cookies]
        log.info(f"Cookie names: {', '.join(all_cookie_names)}")

        # Print important cookies for debugging
        captured_names = set(all_cookie_names)
        found_cookies = [n for n in self._AUTH_COOKIE_NAMES if n in captured_names]
        log.info(f"✓ Found auth cookies: {', '.join(found_cookies) if found_cookies else 'NONE - this is the problem!'}")

        if not found_cookies:
            log.warning("\n" + "="*60)
            log.warning("⚠️ NO AUTHENTICATION COOKIES FOUND!")
            log.warning("="*60)
            log.warning("Google Maps API requires you to be logged into Google.")
            log.warning("")
            log.warning("To fix this:")
            log.warning("1. Log into your Google account in Chrome")
            log.warning("2. Visit google.com/maps while logged in")
            log.warning("3. Then run this scraper again")
            log.warning("")
            log.warning("Alternatively, use the hybrid scraper (start.py) which")
            log.warning("handles authentication automatically and already achieves")
            log.warning("95%+ API coverage with 100% parse rate!")
            log.warning("="*60 + "\n")
            # Continue anyway to show the error
            log.info("Continuing anyway to demonstrate the API error...")

        log.info("\n✅ Cookie capture successful!")
        log.info(f" Total cookies: {len(browser_cookies)}")
        log.info(f" Place ID: {self.place_id}")
        log.info(" Session ready: Yes\n")
        return True

    def capture_cookies(self) -> bool:
        """Capture cookies from a real browser session.

        Returns:
            True when a place ID was extracted and the session was populated;
            False when the place ID is missing or any step raised.
        """
        log.info("="*60)
        log.info("STEP 1: Capturing cookies from browser session")
        log.info("="*60)
        try:
            log.info("Starting browser...")
            # The context manager closes the browser on every exit path.
            with SB(uc=True, headless=self.headless) as sb:
                captured = self._capture_from_browser(sb)
                log.info("Closing browser...")
            log.info("✓ Browser closed\n")
            return captured
        except Exception as e:
            # log.exception records the traceback along with the message.
            log.exception(f"Cookie capture failed: {e}")
            return False

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> "Tuple[List[InterceptedReview], Optional[str]]":
        """Fetch a page of reviews via API using captured cookies.

        Args:
            continuation_token: Paging token from the previous response, or
                None for the first page.

        Returns:
            ``(reviews, next_token)``; ``([], None)`` on a non-200 response or
            any error.
        """
        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': self._build_pb(continuation_token)
        }
        try:
            response = self.session.get(self._API_URL, params=params, timeout=10)
            if response.status_code != 200:
                log.error(f"API error {response.status_code}")
                log.error(f"Response: {response.text[:500]}")
                log.debug(f"Request URL: {response.url}")
                log.debug(f"Request headers: {dict(self.session.headers)}")
                return [], None

            # Strip the )]}' prefix the endpoint puts in front of the JSON body
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()
            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            # The next-page token, when present, is a string at index 1
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
            return reviews, next_token
        except Exception as e:
            log.error(f"API request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 100) -> List[dict]:
        """Capture cookies, then page through the API collecting unique reviews.

        Args:
            max_pages: Upper bound on the number of API pages requested.

        Returns:
            List of review dicts; empty when cookie capture fails.
        """
        # Step 1: Capture cookies from browser
        if not self.capture_cookies():
            log.error("Failed to capture cookies - aborting")
            return []

        # Step 2: Scrape via API
        log.info("="*60)
        log.info("STEP 2: Fast API scraping (no browser needed)")
        log.info("="*60)
        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        token = None
        page = 0
        while page < max_pages:
            page += 1
            log.info(f"Fetching page {page}...")
            reviews, token = self.fetch_reviews_page(token)
            if not reviews:
                if page == 1:
                    log.error("No reviews on first page - cookies may have expired or be invalid")
                else:
                    log.info("No more reviews found")
                break

            # Deduplicate; reviews without an ID fall back to author + date
            for review in reviews:
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid not in seen_ids:
                    seen_ids.add(rid)
                    all_reviews.append({
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    })
            log.info(f"{len(reviews)} reviews | Total: {len(all_reviews)}")
            if not token:
                log.info("No continuation token - all reviews fetched")
                break
            # Small delay between requests
            time.sleep(0.2)

        elapsed = time.time() - start_time
        # A coarse clock can report 0.0 elapsed; avoid dividing by zero.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0
        log.info("\n" + "="*60)
        log.info("✅ SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"API calls: {page}")
        log.info(f"API scraping time: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info("="*60 + "\n")
        return all_reviews
def main():
    """Scrape the sample place and write its reviews to a JSON file."""
    target_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
    collected = CookieBasedScraper(target_url, headless=False).scrape_all(max_pages=50)

    # Nothing scraped: report and stop before touching the filesystem.
    if not collected:
        log.error("No reviews scraped!")
        return

    # Save results
    destination = 'cookie_based_reviews.json'
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(collected, handle, indent=2, ensure_ascii=False)
    log.info(f"💾 Saved {len(collected)} reviews to {destination}")

    # Show the first review as a sample
    log.info("\nSample review:")
    first = collected[0]
    log.info(f" Author: {first['author']}")
    log.info(f" Rating: {first['rating']}")
    log.info(f" Date: {first['date_text']}")
    body_text = first['text']
    if body_text:
        log.info(f" Text: {body_text[:80]}...")
if __name__ == '__main__':
main()

View File

@@ -1,249 +0,0 @@
#!/usr/bin/env python3
"""
Direct API scraper - fetch Google Maps reviews via API without browser scrolling.
This is 10-25x faster than traditional browser-based scraping.
"""
import json
import logging
import time
import urllib.parse
from typing import List, Optional, Tuple
import requests
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
class DirectAPIScraper:
    """Fetch Google Maps reviews directly via API without browser automation."""

    # The `pb` query parameter is identical for first and follow-up requests
    # except for an optional "!2s<token>" segment between _PB_PAGING and _PB_TAIL.
    _PB_HEAD = "!1m6!1s"
    _PB_PAGING = "!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
    _PB_TAIL = ("!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
                "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1")

    def __init__(self, place_id: str, language: str = 'en', region: str = 'us'):
        """
        Initialize the direct API scraper.

        Args:
            place_id: Google Maps place ID (e.g., '0x46dd947294b213bf:0x864c7a232527adb4')
            language: Language code (e.g., 'en', 'es', 'de')
            region: Region/country code (e.g., 'us', 'es', 'de')
        """
        self.place_id = place_id
        self.language = language
        self.region = region
        self.base_url = 'https://www.google.com/maps/rpc/listugcposts'

        # Initialize parser (reuse the working parser from api_interceptor)
        self.interceptor = GoogleMapsAPIInterceptor(None)

        # Session for maintaining cookies
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': f'{language},{language}-{region.upper()};q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'X-Requested-With': 'XMLHttpRequest',
        })

    def _build_pb_param(self, continuation_token: Optional[str] = None) -> str:
        """
        Build the Protocol Buffer (pb) parameter for the API request.

        Args:
            continuation_token: Pagination token from previous response

        Returns:
            pb parameter string (NOT URL-encoded - that's done by requests)
        """
        # A falsy token (None or "") yields the first-page form of the parameter.
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return f"{self._PB_HEAD}{self.place_id}{self._PB_PAGING}{token_part}{self._PB_TAIL}"

    def _establish_session(self):
        """Visit Google Maps page to establish session cookies.

        Failures are logged as warnings and otherwise ignored.
        """
        try:
            # Visit the main maps page to get cookies
            maps_url = f"https://www.google.com/maps/place/?q=place_id:{self.place_id}"
            log.debug("Establishing session by visiting Google Maps...")
            response = self.session.get(maps_url, timeout=10)
            response.raise_for_status()
            log.debug(f"Session established (cookies: {len(self.session.cookies)})")
        except Exception as e:
            log.warning(f"Failed to establish session: {e}")

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> "Tuple[List[InterceptedReview], Optional[str]]":
        """
        Fetch a single page of reviews from the API.

        Args:
            continuation_token: Pagination token from previous response

        Returns:
            Tuple of (reviews list, next continuation token or None);
            ``([], None)`` when the request or parsing fails.
        """
        # Build request parameters
        params = {
            'authuser': '0',
            'hl': self.language,
            'gl': self.region,
            'pb': self._build_pb_param(continuation_token)
        }
        try:
            log.info(f"Fetching reviews page (token: {'initial' if not continuation_token else 'paginated'})...")
            response = self.session.get(self.base_url, params=params, timeout=10)

            # Log response for debugging
            log.debug(f"Response status: {response.status_code}")
            if response.status_code != 200:
                log.error(f"Response body: {response.text[:500]}")
            response.raise_for_status()

            # Google returns responses with )]}' prefix - strip it
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()
            log.debug(f"Response size: {len(body)} bytes")

            # Parse JSON response
            data = json.loads(body)

            # Extract reviews using our working parser
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Extract next continuation token (a string at index 1, when present)
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
                log.debug(f"Found continuation token: {next_token[:50]}...")
            log.info(f"✓ Extracted {len(reviews)} reviews from this page")
            return reviews, next_token
        except requests.exceptions.RequestException as e:
            log.error(f"API request failed: {e}")
            return [], None
        except json.JSONDecodeError as e:
            log.error(f"Failed to parse API response: {e}")
            return [], None
        except Exception as e:
            log.error(f"Unexpected error: {e}")
            return [], None

    def fetch_all_reviews(self, max_pages: int = 100, delay: float = 0.5) -> List[dict]:
        """
        Fetch all reviews by paginating through the API.

        Args:
            max_pages: Maximum number of pages to fetch (safety limit)
            delay: Delay between requests in seconds

        Returns:
            List of review dictionaries
        """
        all_reviews = []
        seen_ids = set()
        continuation_token = None
        page = 0
        start_time = time.time()
        log.info(f"Starting direct API scraping for place: {self.place_id}")

        # Establish session first
        self._establish_session()

        while page < max_pages:
            page += 1

            # Fetch page
            reviews, continuation_token = self.fetch_reviews_page(continuation_token)
            if not reviews:
                log.info("No more reviews found - stopping")
                break

            # Deduplicate and add reviews; reviews without an ID fall back to
            # author + date as their identity
            for review in reviews:
                review_id = review.review_id or f"{review.author}_{review.date_text}"
                if review_id not in seen_ids:
                    seen_ids.add(review_id)
                    # Convert to dict
                    all_reviews.append({
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    })
            log.info(f"Page {page}: {len(all_reviews)} total unique reviews")

            # Check if we have a continuation token
            if not continuation_token:
                log.info("No continuation token - all reviews fetched")
                break

            # Rate limiting (skipped after the last allowed page)
            if delay > 0 and page < max_pages:
                time.sleep(delay)

        elapsed = time.time() - start_time
        # A coarse clock can report 0.0 elapsed; avoid dividing by zero.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0
        log.info(f"\n{'='*60}")
        log.info("✅ Direct API scraping completed!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"Pages fetched: {page}")
        log.info(f"Time elapsed: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info(f"{'='*60}\n")
        return all_reviews
def main():
    """Run the direct API scraper against the sample place and save the result."""
    # Soho Club place ID from the test URL
    scraper = DirectAPIScraper(
        place_id='0x46dd947294b213bf:0x864c7a232527adb4',
        language='es',
        region='es',
    )

    # Fetch all reviews
    fetched = scraper.fetch_all_reviews(max_pages=50, delay=0.5)

    # Save to JSON (written even when the list is empty)
    destination = 'direct_api_reviews.json'
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(fetched, handle, indent=2, ensure_ascii=False)
    log.info(f"Saved {len(fetched)} reviews to {destination}")

    # Show sample
    if not fetched:
        return
    log.info("\nSample review:")
    first = fetched[0]
    log.info(f" Author: {first['author']}")
    log.info(f" Rating: {first['rating']}")
    log.info(f" Date: {first['date_text']}")
    if first['text']:
        text_line = f" Text: {first['text'][:100]}..."
    else:
        text_line = " Text: (no text)"
    log.info(text_line)
if __name__ == '__main__':
main()

View File

@@ -1,61 +0,0 @@
#!/usr/bin/env python3
"""
Quick script to dump API responses for debugging
"""
import json
from modules.api_interceptor import GoogleMapsAPIInterceptor
from seleniumbase import SB
# Debug script: open a place page, inject the response interceptor, scroll to
# trigger requests and dump every intercepted response to local files.
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
with SB(uc=True, headless=False) as sb:
    # Create the interceptor before loading the page
    interceptor = GoogleMapsAPIInterceptor(sb.driver)
    sb.open(url)
    sb.sleep(2)

    # Inject interceptor early
    interceptor.inject_response_interceptor()
    sb.sleep(2)

    # Click reviews tab: try the Spanish label first, then the English one.
    # Only ordinary errors are tolerated so Ctrl-C still stops the script.
    for tab_label in ("Reseñas", "Reviews"):
        try:
            sb.click(f'.LRkQ2:contains("{tab_label}")', timeout=5)
            break
        except Exception:
            continue

    print("Waiting for reviews to load...")
    sb.sleep(5)

    # Scroll to trigger more requests
    print("Scrolling to load more...")
    for i in range(5):
        sb.execute_script("window.scrollBy(0, 800)")
        sb.sleep(2)
        print(f" Scroll {i+1}/5...")

    print("\nCollecting responses...")
    # Get responses
    responses = interceptor.get_intercepted_responses()
    print(f"\nCaptured {len(responses)} responses")

    # Dump to files
    for i, resp in enumerate(responses):
        # A response may carry no body (missing key or None); treat as empty.
        body = resp.get('body') or ''

        filename = f"api_response_{i}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(resp, f, indent=2, ensure_ascii=False)
        print(f"Saved: {filename} ({len(body)} bytes)")

        # Also save just the body for easier viewing
        body_file = f"api_response_{i}_body.txt"
        with open(body_file, 'w', encoding='utf-8') as f:
            f.write(body)
        print(f"Saved body: {body_file}")

    print("\nDone! Check api_response_*.json files")

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python3
"""
Dump raw API responses for analysis.
This will help us understand Google's exact response format.
"""
import json
import logging
from pathlib import Path
from seleniumbase import SB
from modules.api_interceptor import GoogleMapsAPIInterceptor
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
# Debug script: capture intercepted API responses from a place page and write
# each one to api_response_samples/ as full JSON, raw body and parsed JSON.
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
output_dir = Path("api_response_samples")
output_dir.mkdir(exist_ok=True)
print("[INFO] Starting browser...")
with SB(uc=True, headless=False) as sb:
    print("[INFO] Navigating to Google Maps...")
    sb.open(url)
    sb.sleep(3)

    # Inject interceptor FIRST
    print("[INFO] Injecting API interceptor...")
    interceptor = GoogleMapsAPIInterceptor(sb.driver)
    interceptor.inject_response_interceptor()
    sb.sleep(2)

    # Click reviews tab (best-effort; Ctrl-C is not swallowed)
    print("[INFO] Looking for reviews tab...")
    try:
        sb.click('.LRkQ2', timeout=5)
        print("[INFO] ✓ Clicked reviews tab")
    except Exception:
        print("[WARN] Could not click reviews tab, trying to continue...")
    sb.sleep(5)

    # Scroll multiple times to trigger API calls
    print("[INFO] Scrolling to trigger API calls...")
    for i in range(10):
        sb.execute_script("window.scrollBy(0, 800)")
        sb.sleep(1.5)
        # Check every few scrolls
        if (i + 1) % 3 == 0:
            responses = interceptor.get_intercepted_responses()
            if responses:
                print(f"[INFO] Captured {len(responses)} responses so far...")

    # Final collection
    print("\n[INFO] Collecting all captured responses...")
    all_responses = interceptor.get_intercepted_responses()
    if not all_responses:
        print("[ERROR] No responses captured!")
        # SystemExit is raised directly; the `exit` builtin is only present
        # when the `site` module is loaded.
        raise SystemExit(1)
    print(f"[SUCCESS] Captured {len(all_responses)} API responses!\n")

    # Dump each response
    for i, resp in enumerate(all_responses):
        url_str = resp.get('url', 'unknown')
        body = resp.get('body', '')
        size = len(body)

        # Save full response
        full_file = output_dir / f"response_{i:02d}_full.json"
        with open(full_file, 'w', encoding='utf-8') as f:
            json.dump(resp, f, indent=2, ensure_ascii=False)

        # Save just body for easier viewing
        body_file = output_dir / f"response_{i:02d}_body.txt"
        with open(body_file, 'w', encoding='utf-8') as f:
            f.write(body)

        # Try to parse as JSON (after stripping the )]}' prefix if present)
        if body.startswith(")]}'"):
            clean_body = body[4:].strip()
        else:
            clean_body = body
        json_file = output_dir / f"response_{i:02d}_parsed.json"
        try:
            parsed = json.loads(clean_body)
        except ValueError:
            # Only a parse failure means "Not JSON"; file-write errors below
            # are no longer misreported under this label.
            print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes) [Not JSON]")
            print(f" Full: {full_file}")
            print(f" Body: {body_file}")
        else:
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)
            print(f" [{i}] ✓ {url_str[:60]}... ({size:,} bytes)")
            print(f" Full: {full_file}")
            print(f" Body: {body_file}")
            print(f" Parsed: {json_file}")
        print()

    print(f"\n[SUCCESS] Dumped {len(all_responses)} responses to: {output_dir}/")
    print("\nNext steps:")
    print(" 1. Open response_00_parsed.json to study the structure")
    print(" 2. Look for arrays containing review data")
    print(" 3. Identify patterns for: review ID, author, rating, text, date")
    print(" 4. Update the parser patterns in modules/api_interceptor.py")
    print("\n[DONE]")

View File

@@ -1,249 +0,0 @@
#!/usr/bin/env python3
"""
Fast API scraper - Minimal browser usage, maximum API speed.
Strategy:
1. Start browser and navigate to reviews page
2. Capture cookies and user-agent from browser
3. Let one API call happen naturally (to warm up the session)
4. Close browser
5. Use requests library with captured session to make fast API calls
6. Paginate through all reviews without any scrolling
Expected: 10-25x faster than traditional scrolling approach.
"""
import json
import logging
import time
from typing import List, Optional, Tuple
import requests
from seleniumbase import SB
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
class FastAPIScraper:
"""Minimal browser, maximum speed."""
def __init__(self, url: str):
    """Remember the target URL and prepare an empty HTTP session.

    Args:
        url: Google Maps place URL opened during bootstrap.
    """
    http_session = requests.Session()
    # Constructed without a driver; used in this class for response parsing.
    response_parser = GoogleMapsAPIInterceptor(None)
    self.url, self.place_id = url, None
    self.session, self.interceptor = http_session, response_parser
def bootstrap_session(self) -> bool:
    """
    Quickly establish session using browser, then close it.

    Opens the page, copies the browser's cookies and user agent into
    ``self.session`` and extracts the place ID from the current URL.

    Returns:
        True when the session is populated and a place ID was found;
        False when no place ID could be extracted or any step raised.
    """
    log.info("Bootstrapping session with minimal browser usage...")
    try:
        with SB(uc=True, headless=False) as sb:
            # Navigate
            log.info("Opening Google Maps...")
            sb.open(self.url)
            sb.sleep(2)

            # Dismiss cookies (best-effort)
            try:
                sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
            except Exception:
                pass

            # Click reviews
            try:
                sb.click('.LRkQ2', timeout=5)
                log.info("✓ Opened reviews tab")
                sb.sleep(2)
            except Exception:
                log.warning("Could not click reviews tab")

            # Wait a bit to ensure page is loaded
            sb.sleep(1)

            # Extract place ID from URL: the text between '!1s' and the next '!'
            current_url = sb.get_current_url()
            if '!1s' in current_url:
                parts = current_url.split('!1s')
                if len(parts) > 1:
                    self.place_id = parts[1].split('!')[0]
                    log.info(f"✓ Extracted place ID: {self.place_id}")
            if not self.place_id:
                # Without a place ID every API request would be built with
                # "!1sNone", so report failure instead of a usable session.
                log.error("Could not extract place ID")
                return False

            # Get cookies from browser - do this while browser is still active
            try:
                browser_cookies = sb.driver.get_cookies()
                log.debug(f"Got {len(browser_cookies)} cookies")
            except Exception as e:
                log.warning(f"Could not get cookies: {e}")
                browser_cookies = []

            # Get user agent - do this while browser is still active
            try:
                user_agent = sb.execute_script("return navigator.userAgent")
                log.debug(f"User agent: {user_agent[:50]}...")
            except Exception as e:
                log.warning(f"Could not get user agent: {e}")
                user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'

            # Copy cookies into the requests session; a bad cookie is skipped
            for cookie in browser_cookies:
                try:
                    self.session.cookies.set(
                        name=cookie['name'],
                        value=cookie['value'],
                        domain=cookie.get('domain', '.google.com'),
                        path=cookie.get('path', '/')
                    )
                except Exception as e:
                    log.debug(f"Could not set cookie {cookie.get('name')}: {e}")

            # Set headers
            self.session.headers.update({
                'User-Agent': user_agent,
                'Accept': '*/*',
                'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
                'Referer': 'https://www.google.com/maps/',
                'Origin': 'https://www.google.com',
                'X-Requested-With': 'XMLHttpRequest',
            })
            log.info("✅ Session bootstrapped!")
            log.info(f" Cookies: {len(browser_cookies)}")
            log.info(f" Place ID: {self.place_id}")

            # Let browser stay open for a moment to ensure all operations complete
            sb.sleep(1)
            return True
    except Exception as e:
        # log.exception records the traceback along with the message.
        log.exception(f"Bootstrap failed: {e}")
        return False
def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
    """Fetch one page of reviews from the ``listugcposts`` RPC endpoint.

    Args:
        continuation_token: Pagination token returned with the previous
            page, or ``None`` to request the first page.

    Returns:
        ``(reviews, next_token)``. ``reviews`` is the output of the
        interceptor's ``_parse_listugcposts_response``; ``next_token`` is
        ``data[1]`` when the decoded payload is a list whose second element
        is a string, else ``None``. Every failure (missing place ID,
        non-200 status, request or parse error) is logged and reported as
        ``([], None)``.
    """
    if not self.place_id:
        # Without this guard the template below would embed the literal
        # text "None" and send a request that cannot succeed.
        log.error("Cannot fetch reviews: no place ID available")
        return [], None

    # The paginated request differs from the initial one only by the extra
    # "!2s<token>" field, so build both from a single template.
    token_field = f"!2s{continuation_token}" if continuation_token else ""
    pb = (
        f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_field}"
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )
    params = {
        'authuser': '0',
        'hl': 'es',
        'gl': 'es',
        'pb': pb
    }
    try:
        url = 'https://www.google.com/maps/rpc/listugcposts'
        response = self.session.get(url, params=params, timeout=10)
        if response.status_code != 200:
            log.error(f"API error {response.status_code}")
            log.error(f"Response: {response.text[:300]}")
            return [], None

        # The payload may start with the ")]}'" guard prefix; drop it
        # before decoding the JSON.
        body = response.text
        if body.startswith(")]}'"):
            body = body[4:].strip()
        data = json.loads(body)
        reviews = self.interceptor._parse_listugcposts_response(data)

        next_token = None
        if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
            next_token = data[1]
        return reviews, next_token
    except Exception as e:
        # Best-effort: the caller treats an empty page as "stop paginating".
        log.error(f"Request failed: {e}")
        return [], None
def scrape_all(self, max_pages: int = 100) -> List[dict]:
    """Bootstrap the session, then page through reviews until exhausted.

    Args:
        max_pages: Upper bound on the number of pages requested.

    Returns:
        De-duplicated reviews as plain dicts (``review_id``, ``author``,
        ``rating``, ``text``, ``date_text``, ``avatar_url``). Empty when
        ``bootstrap_session()`` reports failure.
    """
    # Bootstrap
    if not self.bootstrap_session():
        return []

    # Scrape via API
    log.info("\n" + "="*60)
    log.info("STARTING FAST API SCRAPING")
    log.info("="*60 + "\n")
    start_time = time.time()
    all_reviews = []
    seen_ids = set()
    token = None
    page = 0
    while page < max_pages:
        page += 1
        log.info(f"Fetching page {page}...")
        reviews, token = self.fetch_reviews_page(token)
        if not reviews:
            log.info("No more reviews")
            break

        # Dedup: fall back to author + date when the review has no ID.
        for review in reviews:
            rid = review.review_id or f"{review.author}_{review.date_text}"
            if rid not in seen_ids:
                seen_ids.add(rid)
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                })
        log.info(f"{len(reviews)} reviews | Total: {len(all_reviews)}")
        if not token:
            break
        time.sleep(0.2)  # Small delay

    elapsed = time.time() - start_time
    # A zero (or negative, after a wall-clock adjustment) delta would make
    # the rate computation raise or report nonsense; report 0.0 instead.
    speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0
    log.info("\n" + "="*60)
    log.info("✅ FAST API SCRAPING COMPLETED!")
    log.info("="*60)
    log.info(f"Reviews: {len(all_reviews)}")
    log.info(f"Pages: {page}")
    log.info(f"Time: {elapsed:.2f} seconds")
    log.info(f"Speed: {speed:.1f} reviews/sec")
    log.info("="*60 + "\n")
    return all_reviews
def main():
    """Scrape the hard-coded sample place and dump the reviews to JSON."""
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
    collected = FastAPIScraper(place_url).scrape_all(max_pages=50)

    # Written unconditionally, so a failed run still produces an empty list.
    with open('fast_api_reviews.json', 'w', encoding='utf-8') as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)
    log.info("Saved to fast_api_reviews.json")


if __name__ == '__main__':
    main()

View File

@@ -1,156 +0,0 @@
#!/usr/bin/env python3
"""
Find the ACTUAL selector for reviews by looking for elements with review structure.
"""
import time
from seleniumbase import Driver
# Debug script: open a Maps search result, switch to the reviews tab and ask
# the page (via injected JavaScript) which <div> elements look like reviews,
# then derive a CSS selector from the classes they all share.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog is optional, so failures here are ignored.
    # `except Exception` (not a bare except) keeps Ctrl-C working during the
    # sleeps inside these best-effort sections.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception:
        pass

    # Click reviews tab (matched on visible text or aria-label)
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load reviews; skipped silently when the pane is not found.
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(3):
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(1)
    except Exception:
        pass

    # Use JavaScript to find ALL elements that look like reviews
    print("\n" + "="*80)
    print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:")
    print("="*80)
    review_info = driver.execute_script("""
// Find all elements that have BOTH a rating AND substantial text
const allDivs = Array.from(document.querySelectorAll('div'));
const reviews = [];
for (let div of allDivs) {
// Must have a rating (star aria-label)
const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]');
if (!ratingElem) continue;
// Must have decent text content (>50 chars to avoid buttons)
if (div.textContent.length < 50) continue;
// Get the classes and attributes
const info = {
classes: div.className,
has_author: !!div.querySelector('button, [aria-label*="photo" i]'),
has_avatar: !!div.querySelector('img'),
has_date: !!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
text_length: div.textContent.length,
sample_text: div.textContent.substring(0, 150),
tag_name: div.tagName,
jslog: div.getAttribute('jslog'),
data_review_id: div.getAttribute('data-review-id'),
jsaction: div.getAttribute('jsaction')
};
reviews.push(info);
}
return {
total_found: reviews.length,
first_5: reviews.slice(0, 5)
};
""")
    print(f"\nFound {review_info['total_found']} elements with review structure")
    print(f"\nFirst 5 review-like elements:")
    for i, rev in enumerate(review_info['first_5'], 1):
        print(f"\n Review {i}:")
        print(f" Classes: {rev['classes']}")
        print(f" Has author: {rev['has_author']}")
        print(f" Has avatar: {rev['has_avatar']}")
        print(f" Has date: {rev['has_date']}")
        print(f" Text length: {rev['text_length']}")
        print(f" jslog: {rev['jslog']}")
        print(f" data-review-id: {rev['data_review_id']}")
        print(f" Sample: {rev['sample_text'][:80]}...")

    # Try to find a common class among review elements
    if review_info['total_found'] > 0:
        print("\n" + "="*80)
        print("FINDING COMMON SELECTOR:")
        print("="*80)
        common_selector = driver.execute_script("""
// Find common classes among review elements
const reviews = [];
const allDivs = Array.from(document.querySelectorAll('div'));
for (let div of allDivs) {
const ratingElem = div.querySelector('[aria-label*="star" i]');
if (ratingElem && div.textContent.length > 50) {
reviews.push(div);
}
}
if (reviews.length === 0) return null;
// Get classes from first review
const firstClasses = reviews[0].className.split(' ').filter(c => c.length > 0);
// Find classes that appear in ALL reviews
const commonClasses = firstClasses.filter(cls => {
return reviews.every(rev => rev.classList.contains(cls));
});
return {
total_reviews: reviews.length,
common_classes: commonClasses,
suggested_selector: commonClasses.length > 0 ? 'div.' + commonClasses.join('.') : null,
first_review_classes: reviews[0].className
};
""")
        if common_selector:
            print(f"Total review elements: {common_selector['total_reviews']}")
            print(f"Common classes: {common_selector['common_classes']}")
            print(f"Suggested selector: {common_selector['suggested_selector']}")
            print(f"First review full classes: {common_selector['first_review_classes']}")

            # Test the suggested selector. It is handed over as a script
            # argument rather than spliced into the JS source, so quotes in
            # a class name cannot break the script.
            if common_selector['suggested_selector']:
                test_count = driver.execute_script(
                    "return document.querySelectorAll(arguments[0]).length;",
                    common_selector['suggested_selector'],
                )
                print(f"\nTesting suggested selector: Found {test_count} elements")

    print("\n" + "="*80)
    print("Browser staying open for manual inspection (60s)...")
    print("="*80)
    time.sleep(60)
finally:
    driver.quit()

View File

@@ -1,305 +0,0 @@
#!/usr/bin/env python3
"""
Header Capture Scraper - Capture COMPLETE request from browser (headers + cookies).
This captures the exact request the browser makes, including ALL headers and cookies,
then replays it for fast API scraping.
"""
import json
import logging
import time
from typing import List, Optional, Tuple
import requests
from seleniumbase import SB
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
class HeaderCaptureScraper:
    """Capture complete request, then replay for fast scraping.

    A SeleniumBase browser opens the place page and an injected JavaScript
    hook records the URL of a ``listugcposts`` request. The browser's
    cookies and user agent are then copied into a ``requests.Session`` that
    pages through the reviews without the browser.
    """

    def __init__(self, url: str, headless: bool = False):
        """Store configuration and create the HTTP session and parser.

        Args:
            url: Google Maps place URL opened in the browser.
            headless: Forwarded to ``SB(...)`` as its ``headless`` flag.
        """
        self.url = url
        self.headless = headless
        self.captured_request = None
        self.place_id = None
        self.session = requests.Session()
        self.interceptor = GoogleMapsAPIInterceptor(None)

    def capture_request(self) -> bool:
        """Capture an API request URL plus cookies and user agent from a browser.

        Returns:
            ``True`` when a ``listugcposts`` request was seen, a place ID
            could be read from its ``pb`` parameter, and cookies/headers were
            copied into ``self.session``; ``False`` otherwise. Errors are
            logged, never raised.
        """
        log.info("="*60)
        log.info("Capturing request from browser...")
        log.info("="*60)
        sb_context = None
        try:
            log.info("Starting browser...")
            # The context manager is driven by hand so that the `finally`
            # below always exits it with "no exception".
            sb_context = SB(uc=True, headless=self.headless)
            sb = sb_context.__enter__()
            sb.open(self.url)
            time.sleep(2)

            # Dismiss cookies (dialog is optional)
            try:
                sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
            except Exception:
                pass

            # Click reviews (best-effort)
            try:
                sb.click('.LRkQ2', timeout=5)
                log.info("✓ Opened reviews")
                time.sleep(2)
            except Exception:
                pass

            # Enable CDP network monitoring
            sb.driver.execute_cdp_cmd('Network.enable', {})
            log.info("✓ Network monitoring enabled")

            # Scroll to trigger API call
            log.info("Scrolling to trigger API request...")
            sb.execute_script("window.scrollBy(0, 800)")
            time.sleep(3)

            # Inject JS that records the URL of any fetch/XHR request whose
            # URL mentions 'listugcposts'.
            capture_script = """
window.__capturedRequest = null;
const originalFetch = window.fetch;
window.fetch = function(...args) {
const url = args[0].toString();
if (url.includes('listugcposts')) {
console.log('[CAPTURE] Intercepted request to:', url);
window.__capturedRequest = {
url: url,
method: 'GET'
};
}
return originalFetch.apply(this, args);
};
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
const originalOpen = xhr.open;
xhr.open = function(method, url, ...rest) {
if (url.includes('listugcposts')) {
console.log('[CAPTURE] Intercepted XHR:', url);
window.__capturedRequest = {
url: url,
method: method
};
}
return originalOpen.apply(this, [method, url, ...rest]);
};
return xhr;
};
console.log('[CAPTURE] Request interceptor ready');
"""
            sb.execute_script(capture_script)
            log.info("✓ Request interceptor injected")

            # Scroll again to trigger request
            log.info("Scrolling to capture request...")
            for i in range(3):
                sb.execute_script("window.scrollBy(0, 600)")
                time.sleep(2)
                captured = sb.execute_script("return window.__capturedRequest")
                if captured:
                    log.info(f"✓ Captured request URL!")
                    self.captured_request = captured
                    break
            if not self.captured_request:
                log.error("Failed to capture request")
                return False

            # Extract place ID from the "!1s<id>!" field of the pb parameter
            url = self.captured_request['url']
            if '!1s' in url:
                import urllib.parse
                parsed = urllib.parse.urlparse(url)
                params = urllib.parse.parse_qs(parsed.query)
                pb = params.get('pb', [''])[0]
                if '!1s' in pb:
                    self.place_id = pb.split('!1s')[1].split('!')[0]
            if not self.place_id:
                # Reporting success here would make every later request
                # carry the literal text "None" as its place ID.
                log.error("Captured request did not contain a place ID")
                return False

            # Now capture ALL cookies via CDP
            cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
            all_cookies = cdp_cookies.get('cookies', [])

            # Set cookies in session
            for cookie in all_cookies:
                self.session.cookies.set(
                    name=cookie['name'],
                    value=cookie['value'],
                    domain=cookie.get('domain', '.google.com'),
                    path=cookie.get('path', '/')
                )

            # Get user agent
            user_agent = sb.execute_script("return navigator.userAgent")

            # Set headers to match browser
            self.session.headers.update({
                'User-Agent': user_agent,
                'Accept': '*/*',
                'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
                'Referer': 'https://www.google.com/maps/',
                'Origin': 'https://www.google.com',
                'X-Requested-With': 'XMLHttpRequest',
            })
            log.info(f"\n✅ Request captured successfully!")
            log.info(f" Place ID: {self.place_id}")
            log.info(f" Cookies: {len(all_cookies)}")
            log.info(f" Cookie names: {', '.join([c['name'] for c in all_cookies[:10]])}")
            return True
        except Exception as e:
            log.error(f"Capture failed: {e}")
            import traceback
            traceback.print_exc()
            return False
        finally:
            if sb_context:
                try:
                    log.info("Closing browser...")
                    sb_context.__exit__(None, None, None)
                    log.info("✓ Browser closed\n")
                except Exception as e:
                    # Teardown is best-effort; the result is already decided.
                    log.debug(f"Browser teardown failed: {e}")

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
        """Fetch reviews using captured session.

        Args:
            continuation_token: Pagination token from the previous page, or
                ``None`` for the first page.

        Returns:
            ``(reviews, next_token)``; ``([], None)`` on any failure
            (missing place ID, non-200 status, request or parse error).
        """
        if not self.place_id:
            log.error("Cannot fetch reviews: no place ID available")
            return [], None

        # Initial and paginated requests differ only by the "!2s<token>" field.
        token_field = f"!2s{continuation_token}" if continuation_token else ""
        pb = (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_field}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
            "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )
        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': pb
        }
        try:
            url = 'https://www.google.com/maps/rpc/listugcposts'
            response = self.session.get(url, params=params, timeout=10)
            if response.status_code != 200:
                log.error(f"API error {response.status_code}: {response.text[:200]}")
                return [], None

            # Drop the ")]}'" guard prefix, when present, before decoding.
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()
            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
            return reviews, next_token
        except Exception as e:
            log.error(f"Request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 50) -> List[dict]:
        """Main scraping method.

        Args:
            max_pages: Upper bound on the number of pages requested.

        Returns:
            De-duplicated reviews as plain dicts; empty when the capture
            step fails.
        """
        if not self.capture_request():
            return []
        log.info("="*60)
        log.info("Fast API scraping...")
        log.info("="*60)
        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        token = None
        page = 0
        while page < max_pages:
            page += 1
            log.info(f"Page {page}...")
            reviews, token = self.fetch_reviews_page(token)
            if not reviews:
                break
            for review in reviews:
                # Fall back to author + date when the review has no ID.
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid not in seen_ids:
                    seen_ids.add(rid)
                    all_reviews.append({
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                    })
            log.info(f"{len(reviews)} reviews | Total: {len(all_reviews)}")
            if not token:
                break
            time.sleep(0.2)

        elapsed = time.time() - start_time
        # Guard the rate against a zero/negative clock delta.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0
        log.info(f"\n{'='*60}")
        log.info(f"✅ COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Reviews: {len(all_reviews)}")
        log.info(f"Time: {elapsed:.2f}s")
        log.info(f"Speed: {speed:.1f} reviews/sec")
        log.info(f"{'='*60}\n")
        return all_reviews
def main():
    """Run the header-capture scraper on the sample place; save any reviews."""
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
    collected = HeaderCaptureScraper(place_url, headless=False).scrape_all()

    # Nothing is written when the scrape came back empty.
    if not collected:
        return
    with open('header_capture_reviews.json', 'w', encoding='utf-8') as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)
    log.info("Saved to header_capture_reviews.json")


if __name__ == '__main__':
    main()

View File

@@ -1,352 +0,0 @@
#!/usr/bin/env python3
"""
Hybrid API scraper - Capture session from browser, then use direct API calls.
This combines the best of both worlds:
1. Browser establishes authentic session with Google
2. We capture ALL headers from real XHR requests
3. Replay those headers in direct API calls
4. No scrolling needed - just fast API pagination
Expected speed: 10-25x faster than traditional browser scrolling.
"""
import json
import logging
import time
from typing import List, Optional, Tuple, Dict
import requests
from seleniumbase import SB
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
class HybridAPIScraper:
    """
    Capture session from browser, then scrape via direct API calls.

    The browser step records the headers and cookies of a real
    ``listugcposts`` XHR; the API step replays them through a
    ``requests.Session`` to page through reviews.
    """

    def __init__(self, url: str, headless: bool = False):
        """
        Initialize the hybrid scraper.

        Args:
            url: Google Maps place URL
            headless: Run browser in headless mode
        """
        self.url = url
        self.headless = headless
        self.captured_headers = None
        self.place_id = None
        self.session = requests.Session()
        # Initialize parser
        self.interceptor = GoogleMapsAPIInterceptor(None)

    def capture_session_from_browser(self) -> bool:
        """
        Start a browser session, capture headers from actual API requests.

        Returns:
            True if a request was captured and a place ID determined;
            False otherwise. Errors are logged, never raised.
        """
        log.info("Starting browser to capture session headers...")
        try:
            with SB(uc=True, headless=self.headless) as sb:
                # Navigate to the place
                log.info(f"Navigating to: {self.url[:80]}...")
                sb.open(self.url)
                sb.sleep(3)

                # Dismiss cookie consent (dialog is optional)
                try:
                    sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=5)
                    log.info("Cookie dialog dismissed")
                except Exception:
                    pass

                # Click reviews tab
                log.info("Opening reviews...")
                try:
                    sb.click('.LRkQ2', timeout=5)
                    sb.sleep(3)
                except Exception:
                    log.warning("Could not click reviews tab")

                # Enable Chrome DevTools Protocol for network monitoring
                log.info("Enabling network interception...")
                sb.driver.execute_cdp_cmd('Network.enable', {})

                # Inject JS to capture XHR requests with their headers.
                capture_script = """
window.__capturedRequests = [];
// Capture fetch
const originalFetch = window.fetch;
window.fetch = function(...args) {
const url = args[0].toString();
if (url.includes('listugcposts')) {
console.log('[CAPTURE] Fetch to:', url);
// Can't easily get headers from fetch without cloning
}
return originalFetch.apply(this, args);
};
// Capture XHR (more reliable for headers)
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
const originalOpen = xhr.open;
const originalSetRequestHeader = xhr.setRequestHeader;
const headers = {};
xhr.setRequestHeader = function(name, value) {
headers[name.toLowerCase()] = value;
return originalSetRequestHeader.apply(this, arguments);
};
xhr.open = function(method, url, ...rest) {
if (url.includes('listugcposts')) {
console.log('[CAPTURE] XHR to:', url);
window.__capturedRequests.push({
url: url,
method: method,
// Share the live object: setRequestHeader() is only legal after
// open(), so a copy taken here would always be empty.
headers: headers
});
}
return originalOpen.apply(this, [method, url, ...rest]);
};
return xhr;
};
console.log('[CAPTURE] Request capture initialized');
"""
                sb.execute_script(capture_script)

                # Scroll to trigger an API call, polling for a capture
                # after each scroll.
                log.info("Scrolling to trigger API request...")
                captured_requests = []
                for i in range(5):
                    sb.execute_script("window.scrollBy(0, 800)")
                    sb.sleep(1.5)
                    # Check captured requests
                    captured_requests = sb.execute_script("return window.__capturedRequests || []")
                    if captured_requests:
                        log.info(f"✓ Captured {len(captured_requests)} API request(s)!")
                        break
                captured_request = captured_requests[0] if captured_requests else {}
                if not captured_request:
                    log.error("Failed to capture API request")
                    return False

                # Extract place ID: prefer an explicit "place_id:" in the
                # page URL, else the "!1s<id>!" field of the captured pb.
                if 'place_id:' in self.url:
                    self.place_id = self.url.split('place_id:')[1].split('&')[0].split('/')[0]
                elif '!1s' in captured_request['url']:
                    # Extract from pb parameter
                    import urllib.parse
                    parsed = urllib.parse.urlparse(captured_request['url'])
                    params = urllib.parse.parse_qs(parsed.query)
                    pb = params.get('pb', [''])[0]
                    if '!1s' in pb:
                        self.place_id = pb.split('!1s')[1].split('!')[0]
                if not self.place_id:
                    # Reporting success here would make every API request
                    # carry the literal text "None" as its place ID.
                    log.error("Could not determine place ID from captured request")
                    return False

                # Store captured headers
                self.captured_headers = captured_request['headers']

                # Also get cookies from browser
                cookies = sb.driver.get_cookies()
                for cookie in cookies:
                    self.session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
                log.info(f"\n{'='*60}")
                log.info("✅ Session captured successfully!")
                log.info(f"{'='*60}")
                log.info(f"Place ID: {self.place_id}")
                log.info(f"Headers captured: {len(self.captured_headers)}")
                log.info(f"Cookies captured: {len(cookies)}")
                log.info(f"{'='*60}\n")

                # Print sample headers for debugging (values truncated)
                log.debug("Sample headers:")
                for key in ['cookie', 'x-goog-api-key', 'authorization', 'user-agent']:
                    if key in self.captured_headers:
                        value = self.captured_headers[key]
                        preview = value[:50] + '...' if len(value) > 50 else value
                        log.debug(f" {key}: {preview}")
                return True
        except Exception as e:
            log.error(f"Failed to capture session: {e}")
            import traceback
            traceback.print_exc()
            return False

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
        """
        Fetch reviews page using captured session.

        Args:
            continuation_token: Pagination token

        Returns:
            Tuple of (reviews, next_token); ``([], None)`` on any failure
            (missing place ID, non-200 status, request or parse error).
        """
        if not self.place_id:
            log.error("Cannot fetch reviews: no place ID available")
            return [], None

        # Build pb parameter; the paginated form only adds "!2s<token>".
        token_field = f"!2s{continuation_token}" if continuation_token else ""
        pb = (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_field}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
            "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )
        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': pb
        }
        try:
            log.info(f"Fetching page (token: {'initial' if not continuation_token else 'paginated'})...")
            # Make request with captured headers
            url = 'https://www.google.com/maps/rpc/listugcposts'
            response = self.session.get(url, params=params, headers=self.captured_headers, timeout=10)
            log.debug(f"Response status: {response.status_code}")
            if response.status_code != 200:
                log.error(f"API error {response.status_code}: {response.text[:500]}")
                return [], None

            # Parse response (drop the ")]}'" guard prefix when present)
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()
            data = json.loads(body)

            # Extract reviews
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Get next token
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
            log.info(f"✓ Extracted {len(reviews)} reviews")
            return reviews, next_token
        except Exception as e:
            log.error(f"API request failed: {e}")
            return [], None

    def scrape_all_reviews(self, max_pages: int = 100, delay: float = 0.3) -> List[dict]:
        """
        Scrape all reviews using hybrid approach.

        Args:
            max_pages: Maximum pages to fetch
            delay: Delay between API calls

        Returns:
            List of review dictionaries; empty when session capture fails.
        """
        # Step 1: Capture session from browser
        if not self.capture_session_from_browser():
            log.error("Failed to capture session - aborting")
            return []

        # Step 2: Fetch all reviews via API
        log.info("\nStarting API-based scraping (no browser needed!)...")
        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        continuation_token = None
        page = 0
        while page < max_pages:
            page += 1
            reviews, continuation_token = self.fetch_reviews_page(continuation_token)
            if not reviews:
                log.info("No more reviews found")
                break

            # Deduplicate (author + date stands in for a missing ID)
            for review in reviews:
                review_id = review.review_id or f"{review.author}_{review.date_text}"
                if review_id not in seen_ids:
                    seen_ids.add(review_id)
                    all_reviews.append({
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    })
            log.info(f"Page {page}: {len(all_reviews)} total unique reviews")
            if not continuation_token:
                log.info("No continuation token - finished")
                break
            if delay > 0:
                time.sleep(delay)

        elapsed = time.time() - start_time
        # Guard the rate against a zero/negative clock delta.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0
        log.info(f"\n{'='*60}")
        log.info(f"✅ API SCRAPING COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"API calls: {page}")
        log.info(f"Time (API only): {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info(f"{'='*60}\n")
        return all_reviews
def main():
    """Run the hybrid scraper on the sample place, save and preview results."""
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
    collected = HybridAPIScraper(place_url, headless=False).scrape_all_reviews(max_pages=50, delay=0.3)

    # The file is written even when nothing was collected.
    output_file = 'hybrid_api_reviews.json'
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(collected, out, indent=2, ensure_ascii=False)
    log.info(f"Saved {len(collected)} reviews to {output_file}")

    # Preview the first review, if there is one.
    if not collected:
        return
    log.info("\nSample review:")
    first = collected[0]
    log.info(f" Author: {first['author']}")
    log.info(f" Rating: {first['rating']}")
    if first['text']:
        log.info(f" Text: {first['text'][:80]}...")
    else:
        log.info(" Text: (none)")


if __name__ == '__main__':
    main()

View File

@@ -1,157 +0,0 @@
#!/usr/bin/env python3
"""
Check what's actually inside the reviews pane after scrolling.
"""
import time
from seleniumbase import Driver
# Debug script: open the reviews tab, scroll the reviews pane, then report
# (via injected JavaScript) which <div> elements inside it look like reviews.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog is optional, so failures here are ignored.
    # `except Exception` (not a bare except) keeps Ctrl-C working during the
    # sleep inside this best-effort section.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception:
        pass

    # Click reviews tab, printing every tab seen to help diagnose misses.
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    review_tab_found = False
    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'")
        if 'review' in text or 'review' in aria:
            print(f" -> Clicking this tab!")
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(6)  # Wait longer
            review_tab_found = True
            break
    if not review_tab_found:
        print("WARNING: Reviews tab not found!")

    # Find and scroll the pane: try the specific selector first, then the
    # looser one.
    print("\nLooking for scrollable pane...")
    pane = None
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        print(f"Found pane: div.m6QErb.WNBkOb.XiKgde")
    except Exception:
        print("Pane not found with standard selector!")
        try:
            pane = driver.find_element('css selector', 'div.m6QErb')
            print(f"Found pane: div.m6QErb")
        except Exception:
            print("No pane found at all!")
    if pane:
        print("\nScrolling pane to load reviews...")
        for i in range(15):
            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
            time.sleep(0.4)
            if (i + 1) % 5 == 0:
                print(f" Scrolled {i+1} times...")

    # Now check what's in the pane
    print("\n" + "="*80)
    print("ANALYZING PANE CONTENT")
    print("="*80)
    content_info = driver.execute_script("""
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb');
if (!pane) return {error: 'No pane found'};
// Get all child divs (direct and nested)
const allDivs = Array.from(pane.querySelectorAll('div'));
// Get all unique class names used
const classNames = new Set();
allDivs.forEach(div => {
if (div.className) {
div.className.split(' ').forEach(cls => {
if (cls.trim()) classNames.add(cls.trim());
});
}
});
// Find divs with ratings
const divsWithRatings = allDivs.filter(div => {
return !!div.querySelector('[aria-label*="star" i]');
});
// Find divs with author photos
const divsWithPhotos = allDivs.filter(div => {
return !!div.querySelector('img[src*="photo"], img[src*="avatar"]');
});
// Find divs with date patterns
const divsWithDates = allDivs.filter(div => {
return !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i);
});
// Find divs with ALL three
const reviewLikeDivs = allDivs.filter(div => {
const hasRating = !!div.querySelector('[aria-label*="star" i]');
const hasPhoto = !!div.querySelector('img');
const hasDate = !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|ago)/i);
const textLen = div.textContent.length;
return hasRating && hasPhoto && hasDate && textLen > 50 && textLen < 2000;
});
return {
total_divs: allDivs.length,
unique_classes: Array.from(classNames).sort(),
divs_with_ratings: divsWithRatings.length,
divs_with_photos: divsWithPhotos.length,
divs_with_dates: divsWithDates.length,
review_like_divs: reviewLikeDivs.length,
review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({
classes: d.className,
text_length: d.textContent.length,
sample: d.textContent.substring(0, 100)
}))
};
""")
    if 'error' in content_info:
        print(f"ERROR: {content_info['error']}")
    else:
        print(f"\nTotal divs in pane: {content_info['total_divs']}")
        print(f"Divs with ratings: {content_info['divs_with_ratings']}")
        print(f"Divs with photos: {content_info['divs_with_photos']}")
        print(f"Divs with dates: {content_info['divs_with_dates']}")
        print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}")
        print(f"\nFirst 20 unique classes found in pane:")
        for cls in content_info['unique_classes'][:20]:
            print(f" {cls}")
        if content_info['review_like_divs'] > 0:
            print(f"\nFirst 5 review-like divs:")
            for i, div_info in enumerate(content_info['review_like_classes'], 1):
                print(f"\n Div {i}:")
                print(f" Classes: {div_info['classes']}")
                print(f" Text length: {div_info['text_length']}")
                print(f" Sample: {div_info['sample'][:80]}...")

    print(f"\n{'='*80}")
    print("Browser staying open for manual inspection (120 seconds)...")
    print("Look at the DevTools to see the actual review elements!")
    print(f"{'='*80}")
    time.sleep(120)
finally:
    driver.quit()

View File

@@ -1,70 +0,0 @@
#!/usr/bin/env python3
"""
Open the page and keep it open for manual inspection.
INSTRUCTIONS:
1. Open DevTools (F12)
2. Click on an individual review
3. Look at the div that contains ONE review (not the whole list)
4. Note the class names on that div
"""
import time
from seleniumbase import Driver
# Debug script: bring the reviews list on screen, print inspection
# instructions, and keep the browser open for five minutes.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog is optional, so failures here are ignored.
    # `except Exception` (not a bare except) keeps Ctrl-C working during the
    # sleeps inside these best-effort sections.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception:
        pass

    # Click reviews tab (matched on visible text or aria-label)
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load a few reviews; skipped silently if the pane is missing.
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(5):
            driver.execute_script("arguments[0].scrollBy(0, 300);", pane)
            time.sleep(0.5)
    except Exception:
        pass

    print("\n" + "="*80)
    print("MANUAL INSPECTION TIME!")
    print("="*80)
    print("\n1. The browser is now showing the reviews page")
    print("2. Open DevTools (F12 or right-click > Inspect)")
    print("3. Click the 'Select element' tool (top-left of DevTools)")
    print("4. Hover over an INDIVIDUAL review (not the whole panel)")
    print("5. Click on it to select it in the inspector")
    print("6. Look at the <div> that wraps ONE SINGLE review")
    print("7. Note the 'class' attribute value")
    print("\n8. The class might look like: class=\"MyWpvb fontBodyMedium\" or similar")
    print("\n9. Write down the full class name(s) - we'll use this as the selector!")
    print("\n" + "="*80)
    print("Browser will stay open for 5 minutes...")
    print("="*80)
    time.sleep(300)  # 5 minutes
finally:
    driver.quit()

View File

@@ -1,923 +0,0 @@
"""
API Interceptor for Google Maps Reviews.
Uses Chrome DevTools Protocol (CDP) to intercept network requests and capture
Google's internal API responses for faster, more reliable data extraction.
"""
import base64
import json
import logging
import os
import re
import threading
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import parse_qs, urlparse
log = logging.getLogger("api_interceptor")
@dataclass
class InterceptedReview:
    """One review as extracted from an intercepted API response.

    Every field has a default, so an instance can be built incrementally.
    """

    # Identity and authorship
    review_id: str = ""
    author: str = ""

    # Review content
    rating: float = 0.0
    text: str = ""
    date_text: str = ""
    timestamp: int = 0
    likes: int = 0
    photos: List[str] = field(default_factory=list)  # fresh list per instance

    # Reviewer profile
    profile_url: str = ""
    avatar_url: str = ""

    # Owner reply and language tag
    owner_response: str = ""
    owner_response_date: str = ""
    lang: str = ""
class GoogleMapsAPIInterceptor:
"""
Intercepts Google Maps internal API calls to capture review data directly.
Google Maps uses several internal endpoints for reviews:
- /maps/preview/review/listentitiesreviews - Main reviews endpoint
- /maps/rpc/placereview - Alternative review endpoint
- /maps/preview/reviewsdata - Review data endpoint
The responses are often in a custom protobuf-like JSON format that needs parsing.
"""
# Patterns for review-related API endpoints
REVIEW_API_PATTERNS = [
r'maps/preview/review',
r'maps/rpc/placereview',
r'maps/preview/reviewsdata',
r'maps/preview/place',
r'maps/api/place',
r'/locationhistory/preview',
r'batchexecute.*review',
]
def __init__(self, driver):
"""Initialize the interceptor with a Selenium driver"""
self.driver = driver
self.captured_responses: List[Dict[str, Any]] = []
self.captured_reviews: List[InterceptedReview] = []
self.request_map: Dict[str, Dict] = {} # Map request IDs to URLs
self._lock = threading.Lock()
self._listening = False
self._response_callback: Optional[Callable] = None
def setup_interception(self):
    """Enable network event tracking via the Chrome DevTools Protocol.

    Only ``Network.enable`` is issued. The earlier version also called
    ``Network.setRequestInterception``; that command pauses every matching
    request until it is explicitly continued, and nothing in this class
    continues paused requests, so the review XHR/fetch calls it was meant
    to observe would stall instead of completing.

    Returns:
        bool: True when network tracking was enabled; otherwise whatever
        the ``_setup_performance_logging`` fallback returns.
    """
    try:
        # Enable network domain
        self.driver.execute_cdp_cmd('Network.enable', {})
    except Exception as e:
        log.warning(f"Could not enable CDP interception: {e}")
        # Try alternative approach
        return self._setup_performance_logging()

    self._listening = True
    log.info("API interception enabled via CDP")
    return True
def _setup_performance_logging(self):
    """Fallback setup: enable the CDP Network domain with explicit buffer sizes.

    Returns:
        bool: True if the CDP command succeeded (and the interceptor is now
        marked as listening), False if it raised.
    """
    buffer_limits = {
        'maxTotalBufferSize': 10000000,
        'maxResourceBufferSize': 5000000
    }
    try:
        self.driver.execute_cdp_cmd('Network.enable', buffer_limits)
    except Exception as e:
        log.error(f"Failed to setup performance logging: {e}")
        return False
    self._listening = True
    log.info("API interception enabled via performance logging")
    return True
def capture_network_responses(self, duration: float = 5.0):
"""
Capture network responses for a specified duration.
Call this while scrolling/loading more reviews.
"""
if not self._listening:
log.warning("Interception not set up, call setup_interception() first")
return []
captured = []
start_time = time.time()
while time.time() - start_time < duration:
try:
# Get performance logs which contain network events
logs = self.driver.get_log('performance')
for entry in logs:
try:
log_data = json.loads(entry['message'])
message = log_data.get('message', {})
method = message.get('method', '')
params = message.get('params', {})
# Capture response received events
if method == 'Network.responseReceived':
response = params.get('response', {})
url = response.get('url', '')
if self._is_review_api(url):
request_id = params.get('requestId')
self.request_map[request_id] = {
'url': url,
'status': response.get('status'),
'headers': response.get('headers', {})
}
# Capture response body when loading is finished
elif method == 'Network.loadingFinished':
request_id = params.get('requestId')
if request_id in self.request_map:
body = self._get_response_body(request_id)
if body:
captured.append({
'url': self.request_map[request_id]['url'],
'body': body,
'timestamp': time.time()
})
except Exception as parse_error:
log.debug(f"Error parsing log entry: {parse_error}")
continue
except Exception as e:
# Performance logs might not be available
log.debug(f"Could not get performance logs: {e}")
break
time.sleep(0.1)
with self._lock:
self.captured_responses.extend(captured)
return captured
def get_response_bodies_cdp(self):
"""Get response bodies using CDP directly (more reliable method)"""
responses = []
try:
# Use CDP to get all responses
result = self.driver.execute_cdp_cmd('Network.getAllCookies', {})
# Execute JavaScript to intercept fetch/XHR responses
intercept_script = """
(function() {
if (window.__interceptedResponses) {
var responses = window.__interceptedResponses;
window.__interceptedResponses = [];
return responses;
}
return [];
})();
"""
captured = self.driver.execute_script(intercept_script)
if captured:
responses.extend(captured)
except Exception as e:
log.debug(f"CDP response capture error: {e}")
return responses
def inject_response_interceptor(self):
    """
    Inject JavaScript that wraps ``window.fetch`` and ``window.XMLHttpRequest``
    so response bodies are recorded inside the page.

    What the injected script does:
    - Runs once per page: ``window.__reviewInterceptorInjected`` guards
      against a second injection.
    - Records a response when its URL contains any of 'review',
      'batchexecute', 'place', 'maps', 'listugcposts' or 'getreviews'.
    - Pushes ``{url, body, timestamp, type, size}`` (plus ``status`` for
      XHR) onto ``window.__interceptedResponses``; once that array exceeds
      100 entries it is cut down to the newest 50.
    - Keeps counters in ``window.__interceptorStats`` and prints them to the
      browser console every 10 seconds.

    Returns:
        bool: True when the script was executed, False when the driver
        raised while executing it.
    """
    # The script body is wrapped in an IIFE; only the window.__* names it
    # assigns are shared with the page.
    intercept_script = """
(function() {
// Skip if already injected
if (window.__reviewInterceptorInjected) {
console.log('[API Interceptor] Already injected, skipping');
return;
}
window.__reviewInterceptorInjected = true;
window.__interceptedResponses = [];
window.__interceptorStats = {
totalFetch: 0,
totalXHR: 0,
capturedFetch: 0,
capturedXHR: 0,
lastCapture: null
};
console.log('[API Interceptor] Initializing...');
// Store original fetch
const originalFetch = window.fetch;
// Override fetch
window.fetch = async function(...args) {
window.__interceptorStats.totalFetch++;
const url = args[0].toString();
// Log ALL fetch requests for debugging
console.debug('[API Interceptor] FETCH:', url.substring(0, 150));
const response = await originalFetch.apply(this, args);
// Check if this is a review-related API call
if (url.includes('review') || url.includes('batchexecute') ||
url.includes('place') || url.includes('maps') ||
url.includes('listugcposts') || url.includes('getreviews')) {
try {
const clone = response.clone();
const text = await clone.text();
console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);
window.__interceptedResponses.push({
url: url,
body: text,
timestamp: Date.now(),
type: 'fetch',
size: text.length
});
window.__interceptorStats.capturedFetch++;
window.__interceptorStats.lastCapture = new Date().toISOString();
// Keep only last 100 responses to avoid memory issues
if (window.__interceptedResponses.length > 100) {
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
}
} catch (e) {
console.error('[API Interceptor] Response capture error:', e);
}
}
return response;
};
// Store original XMLHttpRequest
const originalXHR = window.XMLHttpRequest;
// Create intercepting XHR
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
const originalOpen = xhr.open;
const originalSend = xhr.send;
let requestUrl = '';
xhr.open = function(method, url, ...rest) {
requestUrl = url;
window.__interceptorStats.totalXHR++;
console.debug('[API Interceptor] XHR:', method, url.substring(0, 150));
return originalOpen.apply(this, [method, url, ...rest]);
};
xhr.addEventListener('load', function() {
if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
requestUrl.includes('place') || requestUrl.includes('maps') ||
requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
try {
console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);
window.__interceptedResponses.push({
url: requestUrl,
body: xhr.responseText,
timestamp: Date.now(),
type: 'xhr',
status: xhr.status,
size: xhr.responseText.length
});
window.__interceptorStats.capturedXHR++;
window.__interceptorStats.lastCapture = new Date().toISOString();
if (window.__interceptedResponses.length > 100) {
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
}
} catch (e) {
console.error('[API Interceptor] XHR capture error:', e);
}
}
});
return xhr;
};
// Copy static properties
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
try {
window.XMLHttpRequest[prop] = originalXHR[prop];
} catch (e) {}
}
console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');
// Log stats every 10 seconds
setInterval(() => {
if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
console.log('[API Interceptor] Stats:',
'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
'Queue:', window.__interceptedResponses.length);
}
}, 10000);
return true;
})();
"""
    try:
        # The value assigned to `result` is not used afterwards.
        result = self.driver.execute_script(intercept_script)
        log.info("JavaScript response interceptor injected with enhanced debugging")
        # Get initial stats
        stats = self.get_interceptor_stats()
        log.debug(f"Interceptor stats: {stats}")
        return True
    except Exception as e:
        log.warning(f"Failed to inject interceptor: {e}")
        return False
def get_intercepted_responses(self):
    """Fetch and clear the responses buffered in the page by the JS interceptor.

    Returns:
        list: Copy of ``window.__interceptedResponses`` taken before the
        buffer was reset; an empty list when nothing was buffered or when
        the driver call fails.
    """
    drain_script = """
if (window.__interceptedResponses) {
var responses = window.__interceptedResponses.slice();
window.__interceptedResponses = [];
return responses;
}
return [];
"""
    try:
        drained = self.driver.execute_script(drain_script)
        if not drained:
            log.debug("No intercepted responses available")
            return []
        log.debug(f"Retrieved {len(drained)} intercepted responses from browser")
        # Only the first three entries are logged to keep debug output short.
        for resp in drained[:3]:
            log.debug(f" - {resp.get('type', '?').upper()}: {resp.get('url', '')[:100]} ({resp.get('size', 0)} bytes)")
        return drained
    except Exception as e:
        log.debug(f"Error getting intercepted responses: {e}")
        return []
def get_interceptor_stats(self):
"""Get statistics from the JavaScript interceptor"""
try:
script = """
if (window.__interceptorStats) {
return window.__interceptorStats;
}
return null;
"""
stats = self.driver.execute_script(script)
return stats
except Exception as e:
log.debug(f"Error getting interceptor stats: {e}")
return None
def get_browser_console_logs(self):
"""Get browser console logs (for debugging)"""
try:
logs = self.driver.get_log('browser')
return logs
except Exception as e:
log.debug(f"Could not get browser console logs: {e}")
return []
def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
    """
    Write each captured response to its own JSON file for offline debugging.

    Files are named ``<timestamp>_<type>_<index>.json`` and contain a
    ``metadata`` object (url, type, timestamp, size, status) plus the raw
    ``body``.

    Args:
        responses: Captured response dicts.
        output_dir: Directory to write into; created if missing (its parent
            must already exist).

    Returns:
        The output directory as a string, or None if anything failed.
    """
    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(exist_ok=True)

        for index, item in enumerate(responses):
            # Fall back to "now" in milliseconds when no timestamp was captured.
            captured_at = item.get('timestamp', int(time.time() * 1000))
            source_url = item.get('url', 'unknown')
            kind = item.get('type', 'unknown')

            destination = target_dir / f"{captured_at}_{kind}_{index}.json"
            with open(destination, 'w', encoding='utf-8') as f:
                record = {
                    'metadata': {
                        'url': source_url,
                        'type': kind,
                        'timestamp': captured_at,
                        'size': item.get('size', len(item.get('body', ''))),
                        'status': item.get('status')
                    },
                    'body': item.get('body', '')
                }
                json.dump(record, f, indent=2, ensure_ascii=False)

        log.info(f"Dumped {len(responses)} responses to {target_dir}")
        return str(target_dir)
    except Exception as e:
        log.error(f"Error dumping responses to file: {e}")
        return None
def _is_review_api(self, url: str) -> bool:
"""Check if URL matches review API patterns"""
url_lower = url.lower()
return any(re.search(pattern, url_lower) for pattern in self.REVIEW_API_PATTERNS)
def _get_response_body(self, request_id: str) -> Optional[str]:
"""Get response body for a request ID using CDP"""
try:
result = self.driver.execute_cdp_cmd('Network.getResponseBody', {
'requestId': request_id
})
body = result.get('body', '')
if result.get('base64Encoded'):
body = base64.b64decode(body).decode('utf-8', errors='ignore')
return body
except Exception as e:
log.debug(f"Could not get response body for {request_id}: {e}")
return None
def parse_reviews_from_responses(self, responses: List[Dict]) -> List[InterceptedReview]:
    """
    Turn captured API responses into a de-duplicated list of reviews.

    Each response body is handed to ``_parse_response_body``; empty bodies
    and bodies starting with ``<!DOCTYPE`` are skipped, and a response that
    raises while being parsed is logged and ignored.

    Args:
        responses: Dicts carrying at least 'body' and 'url'.

    Returns:
        Reviews in first-seen order, one per review ID. Reviews that have no
        review ID are dropped.
    """
    collected = []
    for entry in responses:
        try:
            payload = entry.get('body', '')
            source_url = entry.get('url', '')
            # Only non-empty, non-HTML bodies are worth parsing.
            if payload and not payload.startswith('<!DOCTYPE'):
                collected.extend(self._parse_response_body(payload, source_url))
        except Exception as e:
            log.debug(f"Error parsing response: {e}")

    # Keep the first review seen for every non-empty review ID.
    first_by_id = {}
    for review in collected:
        if review.review_id and review.review_id not in first_by_id:
            first_by_id[review.review_id] = review
    return list(first_by_id.values())
def _parse_response_body(self, body: str, url: str) -> List[InterceptedReview]:
    """Parse a single response body for review data.

    Steps:
    1. Return early for empty bodies and HTML documents.
    2. Strip the ``)]}'`` prefix used by the batch-execute format.
    3. Decode the body as JSON; if that fails, retry on the widest
       ``[...]`` span found in the text.
    4. Dispatch to the listugcposts parser when the URL names that
       endpoint, otherwise to the generic recursive extractor.

    Args:
        body: Raw response text.
        url: URL the response came from (used only to pick the parser).

    Returns:
        Reviews found in the body; an empty list when nothing could be
        decoded.
    """
    # Skip empty or HTML responses
    if not body or body.startswith(('<!DOCTYPE', '<html')):
        return []

    # Handle batch execute format (starts with )]}' prefix)
    if body.startswith(")]}'"):
        body = body[4:].strip()

    try:
        data = json.loads(body)
    except json.JSONDecodeError:
        # Try to extract JSON from the response
        json_match = re.search(r'\[.*\]', body, re.DOTALL)
        if not json_match:
            log.debug("No JSON found in response")
            return []
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            # Catch only decode failures: a bare ``except`` here would also
            # swallow KeyboardInterrupt and SystemExit.
            log.debug("Failed to parse JSON from response")
            return []

    # Special handling for listugcposts endpoint
    if 'listugcposts' in url.lower():
        return list(self._parse_listugcposts_response(data))
    # Generic recursive extraction
    return list(self._extract_reviews_recursive(data))
def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
    """
    Extract reviews from a decoded listugcposts response.

    Layout relied on:
        data[2]       list of review groups, one group per review
        data[2][i][0] the review's own data array, handed to
                      ``_parse_google_review_array``

    Args:
        data: Decoded JSON payload.

    Returns:
        Reviews that could be parsed; groups that do not match the layout or
        fail to parse are skipped.
    """
    parsed = []
    try:
        if not (isinstance(data, list) and len(data) >= 3):
            log.debug("Response doesn't match expected structure (not a list or too short)")
            return parsed

        groups = data[2]
        if not isinstance(groups, list):
            log.debug("data[2] is not a list")
            return parsed

        log.debug(f"Found {len(groups)} reviews in data[2]")
        for position, group in enumerate(groups):
            # A usable group is a non-empty list whose first item is a list.
            if not isinstance(group, list) or not group:
                continue
            payload = group[0]
            if not isinstance(payload, list):
                continue
            try:
                review = self._parse_google_review_array(payload)
                if review:
                    parsed.append(review)
                    log.debug(f"Parsed review {position}: {review.author} - {review.rating}")
            except Exception as e:
                log.debug(f"Error parsing review at group[{position}]: {e}")
    except Exception as e:
        log.debug(f"Error in _parse_listugcposts_response: {e}")
    return parsed
def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
    """
    Build an InterceptedReview from one listugcposts review array.

    Index paths read (each one is optional and type-checked):
        [0]            review ID (string)
        [1][4][5][0]   author name (string)
        [1][4][5][1]   author avatar URL (string)
        [1][6]         date text (string)
        [2][0][0]      rating, accepted only when it is a number in 1-5
        [2][15][0][0]  review text (string)

    Returns:
        The populated review when it has a rating and at least an author or
        a text; None otherwise, including when an unexpected error occurs.
    """
    def descend(node, *path):
        """Walk nested lists along *path*; None as soon as a step is missing."""
        for index in path:
            if not isinstance(node, list) or len(node) <= index:
                return None
            node = node[index]
        return node

    review = InterceptedReview()
    try:
        if len(review_data) > 0 and isinstance(review_data[0], str):
            review.review_id = review_data[0]

        header = review_data[1] if len(review_data) > 1 else None
        content = review_data[2] if len(review_data) > 2 else None

        author_info = descend(header, 4, 5)
        if isinstance(author_info, list):
            name = descend(author_info, 0)
            if isinstance(name, str):
                review.author = name
            avatar = descend(author_info, 1)
            if isinstance(avatar, str):
                review.avatar_url = avatar

        date_text = descend(header, 6)
        if isinstance(date_text, str):
            review.date_text = date_text

        rating_val = descend(content, 0, 0)
        if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
            review.rating = float(rating_val)

        text = descend(content, 15, 0, 0)
        if isinstance(text, str):
            review.text = text

        # Only return if we have minimum required data
        if review.rating > 0 and (review.author or review.text):
            return review
    except Exception as e:
        log.debug(f"Error parsing Google review array: {e}")
    return None
def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
    """
    Heuristically parse a review from a nested array of unknown layout.

    Fields are guessed from value shape rather than position:
    - review ID: first string longer than 30 chars among the first five
      items that does not start with 'http'
    - rating: first number in 1-5, at top level or one list level down
    - text: first top-level string longer than 50 chars that is not a URL
      and differs from the review ID
    - author: first 3-100 char string, at top level or one list level down,
      that is not a URL
    - date: first top-level string matching one of the date patterns

    No call to this method appears elsewhere in this class.

    Returns:
        The review when it has a rating plus a review ID or an author;
        None otherwise, including on any unexpected error.
    """
    review = InterceptedReview()
    try:
        # Extract review ID (usually a long string in first few elements)
        for i, item in enumerate(arr[:5]):
            if isinstance(item, str) and len(item) > 30 and not item.startswith('http'):
                review.review_id = item
                break
        # Extract rating (number between 1-5)
        for item in arr:
            if isinstance(item, (int, float)) and 1 <= item <= 5:
                review.rating = float(item)
                break
            elif isinstance(item, list):
                for subitem in item:
                    if isinstance(subitem, (int, float)) and 1 <= subitem <= 5:
                        review.rating = float(subitem)
                        break
                # Leave the outer loop too once the nested search succeeded.
                if review.rating > 0:
                    break
        # Extract review text (long string, not a URL)
        for item in arr:
            if isinstance(item, str) and len(item) > 50 and not item.startswith('http'):
                if not review.review_id or item != review.review_id:
                    review.text = item
                    break
        # Extract author name (shorter string, not ID or text)
        for item in arr:
            if isinstance(item, str) and 3 <= len(item) <= 100:
                if item != review.review_id and item != review.text and not item.startswith('http'):
                    review.author = item
                    break
            elif isinstance(item, list):
                for subitem in item:
                    if isinstance(subitem, str) and 3 <= len(subitem) <= 100:
                        if subitem != review.text and not subitem.startswith('http'):
                            review.author = subitem
                            break
                if review.author:
                    break
        # Extract dates (strings that look like dates)
        date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
        for item in arr:
            if isinstance(item, str):
                for pattern in date_patterns:
                    if re.search(pattern, item, re.IGNORECASE):
                        review.date_text = item
                        break
                if review.date_text:
                    break
        # Only return if we have meaningful data
        if (review.review_id or review.author) and review.rating > 0:
            return review
    except Exception as e:
        log.debug(f"Error in _parse_review_array_v2: {e}")
    return None
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
    """Walk nested dicts/lists and collect everything that parses as a review.

    Args:
        data: Decoded JSON value (or an already built InterceptedReview).
        depth: Current nesting level; the walk stops beyond level 20.

    Returns:
        Reviews found at this node followed by those found in its children.
    """
    # Prevent runaway recursion on very deep payloads.
    if depth > 20:
        return []

    # An already-parsed review is passed through untouched.
    if isinstance(data, InterceptedReview):
        return [data]

    if isinstance(data, dict):
        candidate = self._try_parse_review_dict(data)
        children = data.values()
    elif isinstance(data, list):
        candidate = self._try_parse_review_array(data)
        children = data
    else:
        return []

    found = [candidate] if candidate else []
    for child in children:
        if isinstance(child, InterceptedReview):
            continue
        found.extend(self._extract_reviews_recursive(child, depth + 1))
    return found
def _try_parse_review_dict(self, data: Dict) -> Optional[InterceptedReview]:
    """Try to parse a dictionary as a review object.

    The dict is considered only if it has at least one of the keys
    reviewId, review_id, author, rating, text or comment. Several
    alternative key names are tried for every field.

    Compared with the earlier version:
    - photo entries may be plain URL strings as well as ``{'url': ...}``
      dicts (a string entry used to raise and discard the whole review);
    - a dict-valued ``author`` is no longer stored as the author name.

    Returns:
        The review when it has a review ID, or both an author and a text;
        None otherwise, including when a value cannot be converted.
    """
    # Common keys in review objects
    review_keys = {'reviewId', 'review_id', 'author', 'rating', 'text', 'comment'}
    if not any(k in data for k in review_keys):
        return None
    try:
        review = InterceptedReview()

        # 'author' may be a plain name or a nested profile object.
        author_field = data.get('author')
        author_data = author_field if isinstance(author_field, dict) else {}
        author_name = author_field if isinstance(author_field, str) else ''

        # Try various key names for each field
        review.review_id = data.get('reviewId') or data.get('review_id') or data.get('id', '')
        # NOTE(review): 'name' inside the author object is a guess - confirm
        # against real payloads.
        review.author = (author_name or data.get('authorName')
                         or author_data.get('name') or data.get('name', ''))
        review.rating = float(data.get('rating') or data.get('starRating') or 0)
        review.text = data.get('text') or data.get('comment') or data.get('reviewText', '')
        review.date_text = data.get('publishTime') or data.get('relativePublishTime') or data.get('date', '')
        review.likes = int(data.get('thumbsUpCount') or data.get('likes') or 0)

        # Photos: accept both {'url': ...} objects and bare URL strings.
        photos = data.get('photos') or data.get('reviewPhotos') or []
        review.photos = [
            (p.get('url') or p) if isinstance(p, dict) else p
            for p in photos if p
        ]

        # Profile
        review.profile_url = author_data.get('profileUrl') or data.get('profileUrl', '')
        review.avatar_url = author_data.get('profilePhotoUrl') or data.get('avatar', '')

        # Owner response
        owner_resp = data.get('ownerResponse') or data.get('ownerReply') or {}
        if isinstance(owner_resp, dict):
            review.owner_response = owner_resp.get('text', '')
            review.owner_response_date = owner_resp.get('publishTime', '')

        # Only return if we have meaningful data
        if review.review_id or (review.author and review.text):
            return review
    except Exception as e:
        log.debug(f"Error parsing review dict: {e}")
    return None
def _try_parse_review_array(self, data: List) -> Optional[InterceptedReview]:
    """
    Try to parse a nested array as a review (Google's protobuf-like format).

    Google often uses positional arrays like: [id, author, [rating], text, ...]
    Fields are guessed from value shape:
    - review ID: ``data[0]`` when it is a string longer than 20 chars
    - rating: first number in 1-5, at top level or as the first element of
      a nested list
    - text: first string longer than 30 chars, at top level or one list
      level down
    - author: first 2-100 char string inside a nested list that differs
      from the text
    - URLs: googleusercontent/ggpht links; the first becomes the avatar,
      later ones are treated as photos

    Returns:
        The review when it has a rating plus a review ID or a text; None
        otherwise (arrays shorter than 3 items are rejected outright).
    """
    if not data or len(data) < 3:
        return None
    try:
        # Look for patterns that indicate this is a review array
        # Pattern 1: [review_id, [author_info], rating_array, text, ...]
        review = InterceptedReview()

        # Check if first element looks like a review ID
        if isinstance(data[0], str) and len(data[0]) > 20:
            review.review_id = data[0]

        # Search for rating (usually a small number 1-5)
        for item in data:
            if isinstance(item, (int, float)) and 1 <= item <= 5:
                review.rating = float(item)
                break
            elif isinstance(item, list) and len(item) >= 1:
                if isinstance(item[0], (int, float)) and 1 <= item[0] <= 5:
                    review.rating = float(item[0])
                    break

        # Search for text (long string)
        for item in data:
            if isinstance(item, str) and len(item) > 30:
                review.text = item
                break
            elif isinstance(item, list):
                for subitem in item:
                    if isinstance(subitem, str) and len(subitem) > 30:
                        review.text = subitem
                        break
                # Stop at the first nested hit as well; without this the
                # outer loop kept going and a later item overwrote the text.
                if review.text:
                    break

        # Search for author name (shorter string)
        for item in data:
            if isinstance(item, list) and len(item) >= 1:
                for subitem in item:
                    if isinstance(subitem, str) and 2 <= len(subitem) <= 100 and subitem != review.text:
                        review.author = subitem
                        break
                if review.author:
                    break

        # Search for URLs (photos, profile)
        for item in data:
            if isinstance(item, str) and item.startswith('http'):
                if 'googleusercontent' in item or 'ggpht' in item:
                    if not review.avatar_url:
                        review.avatar_url = item
                    else:
                        review.photos.append(item)
            elif isinstance(item, list):
                self._extract_urls_from_array(item, review)

        # Only return if we have meaningful data
        if review.review_id and review.rating > 0:
            return review
        if review.text and review.rating > 0:
            return review
    except Exception as e:
        log.debug(f"Error parsing review array: {e}")
    return None
def _extract_urls_from_array(self, arr: List, review: InterceptedReview, depth: int = 0):
"""Extract URLs from nested arrays"""
if depth > 5:
return
for item in arr:
if isinstance(item, str) and item.startswith('http'):
if 'googleusercontent' in item or 'ggpht' in item or 'lh3' in item:
if 'w72-h72' in item or 'p-rp-mo' in item: # Profile pic pattern
review.avatar_url = item
else:
review.photos.append(item)
elif isinstance(item, list):
self._extract_urls_from_array(item, depth + 1, review)
def convert_to_raw_review_format(self, intercepted: InterceptedReview) -> Dict[str, Any]:
"""Convert an InterceptedReview to the format used by RawReview/storage"""
return {
'review_id': intercepted.review_id,
'author': intercepted.author,
'rating': intercepted.rating,
'description': {'en': intercepted.text} if intercepted.text else {},
'likes': intercepted.likes,
'user_images': intercepted.photos,
'author_profile_url': intercepted.profile_url,
'profile_picture': intercepted.avatar_url,
'owner_responses': {
'en': {'text': intercepted.owner_response}
} if intercepted.owner_response else {},
'review_date': intercepted.date_text,
'_source': 'api_intercept'
}
def cleanup(self):
"""Clean up interception resources"""
try:
self.driver.execute_cdp_cmd('Network.disable', {})
except:
pass
self.captured_responses.clear()
self.captured_reviews.clear()
self.request_map.clear()
self._listening = False

View File

@@ -35,16 +35,45 @@ class ChromeWorker:
# SeleniumBase Driver automatically includes UC mode anti-detection # SeleniumBase Driver automatically includes UC mode anti-detection
# Initialize with longer timeouts for large scraping jobs # Initialize with longer timeouts for large scraping jobs
# Chrome arguments for Docker stability
chrome_args = [
"--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker)
"--disable-gpu", # Disable GPU acceleration
"--no-sandbox", # Required for Docker
"--disable-software-rasterizer",
"--disable-extensions",
"--disable-background-networking",
"--disable-default-apps",
"--disable-sync",
"--metrics-recording-only",
"--mute-audio",
"--no-first-run",
"--safebrowsing-disable-auto-update",
]
self.driver = Driver( self.driver = Driver(
uc=True, uc=True,
headless=self.headless, headless=self.headless,
page_load_strategy="normal" page_load_strategy="normal",
chromium_arg=",".join(chrome_args)
) )
# Set generous timeouts for large scraping jobs # Set generous timeouts for large scraping jobs
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
self.driver.set_script_timeout(60) # 1 minute for complex extraction self.driver.set_script_timeout(60) # 1 minute for complex extraction
# Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
# This prevents location-based variations in search results
try:
self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
except Exception as e:
log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")
self.driver.maximize_window() self.driver.maximize_window()
self.created_at = time.time() self.created_at = time.time()
self.last_used = time.time() self.last_used = time.time()

View File

@@ -1,80 +0,0 @@
"""
Command line interface handling for Google Maps Reviews Scraper.
"""
import argparse
import json
from pathlib import Path
from modules.config import DEFAULT_CONFIG_PATH
def _str_to_bool(value):
    """Convert a command-line token such as 'true', 'no' or '1' to a bool."""
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_arguments():
    """Parse command line arguments.

    Boolean-valued options (``--use-mongodb``, ``--convert-dates``,
    ``--download-images``, ``--store-local-paths``, ``--replace-urls``,
    ``--preserve-original-urls``) take an explicit value such as ``true`` or
    ``false``. They used ``type=bool`` before, which turns every non-empty
    string - including "false" - into True. Unrecognised values are now
    rejected with a usage error.

    Returns:
        argparse.Namespace where ``config`` is a path (the given one wrapped
        in ``Path``, or ``DEFAULT_CONFIG_PATH``) and ``custom_params`` is the
        decoded JSON value, or None when it was absent or not valid JSON.
    """
    ap = argparse.ArgumentParser(description="GoogleMaps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first alreadyseen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    ap.add_argument("--use-mongodb", type=_str_to_bool, default=None,
                    help="whether to use MongoDB for storage")
    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=_str_to_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=_str_to_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")
    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=_str_to_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=_str_to_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=_str_to_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")
    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")
    # API interception option
    ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
                    help="enable API response interception for faster data capture (experimental)")
    args = ap.parse_args()

    # Handle config path
    if args.config is not None:
        args.config = Path(args.config)
    else:
        args.config = DEFAULT_CONFIG_PATH

    # Process custom params if provided
    if args.custom_params:
        try:
            args.custom_params = json.loads(args.custom_params)
        except json.JSONDecodeError:
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            args.custom_params = None
    return args

View File

@@ -77,11 +77,17 @@ class DatabaseManager:
error_message TEXT, error_message TEXT,
metadata JSONB, metadata JSONB,
scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')) CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
); );
""") """)
# Add scrape_logs column if it doesn't exist (for existing databases)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""")
# Create indexes # Create indexes
await conn.execute(""" await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -182,10 +188,12 @@ class DatabaseManager:
started_at, started_at,
completed_at, completed_at,
reviews_count, reviews_count,
total_reviews,
reviews_data, reviews_data,
scrape_time, scrape_time,
error_message, error_message,
metadata metadata,
scrape_logs
FROM jobs FROM jobs
WHERE job_id = $1 WHERE job_id = $1
""", job_id) """, job_id)
@@ -246,8 +254,13 @@ class DatabaseManager:
kwargs['completed_at'] = datetime.now() kwargs['completed_at'] = datetime.now()
for key, value in kwargs.items(): for key, value in kwargs.items():
set_clauses.append(f"{key} = ${param_idx}") # Handle JSONB fields specially
params.append(value) if key == 'scrape_logs' and value is not None:
set_clauses.append(f"{key} = ${param_idx}::jsonb")
params.append(json.dumps(value) if not isinstance(value, str) else value)
else:
set_clauses.append(f"{key} = ${param_idx}")
params.append(value)
param_idx += 1 param_idx += 1
query = f""" query = f"""
@@ -264,7 +277,8 @@ class DatabaseManager:
job_id: UUID, job_id: UUID,
reviews: List[Dict[str, Any]], reviews: List[Dict[str, Any]],
scrape_time: float, scrape_time: float,
total_reviews: Optional[int] = None total_reviews: Optional[int] = None,
scrape_logs: Optional[List[Dict[str, Any]]] = None
): ):
""" """
Save scraping results to database. Save scraping results to database.
@@ -274,6 +288,7 @@ class DatabaseManager:
reviews: List of review dictionaries reviews: List of review dictionaries
scrape_time: Time taken to scrape in seconds scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter) total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper
""" """
async with self.pool.acquire() as conn: async with self.pool.acquire() as conn:
await conn.execute(""" await conn.execute("""
@@ -284,9 +299,11 @@ class DatabaseManager:
reviews_count = $2, reviews_count = $2,
total_reviews = $3, total_reviews = $3,
reviews_data = $4::jsonb, reviews_data = $4::jsonb,
scrape_time = $5 scrape_time = $5,
scrape_logs = $6::jsonb
WHERE job_id = $1 WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time) """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
json.dumps(scrape_logs) if scrape_logs else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}") log.info(f"Saved {len(reviews)} reviews for job {job_id}")
@@ -317,8 +334,10 @@ class DatabaseManager:
created_at, created_at,
completed_at, completed_at,
reviews_count, reviews_count,
total_reviews,
scrape_time, scrape_time,
error_message error_message,
metadata
FROM jobs FROM jobs
WHERE status = $1 WHERE status = $1
ORDER BY created_at DESC ORDER BY created_at DESC
@@ -333,8 +352,10 @@ class DatabaseManager:
created_at, created_at,
completed_at, completed_at,
reviews_count, reviews_count,
total_reviews,
scrape_time, scrape_time,
error_message error_message,
metadata
FROM jobs FROM jobs
ORDER BY created_at DESC ORDER BY created_at DESC
LIMIT $1 OFFSET $2 LIMIT $1 OFFSET $2

View File

@@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
else: else:
log.info(f"[PROFILE] Using pooled driver (0.00s)") log.info(f"[PROFILE] Using pooled driver (0.00s)")
# Force English locale for consistent parsing # Force English locale AND US region for consistent parsing/results
# This helps avoid geolocation-based variations in Google Maps results
if 'hl=' in url: if 'hl=' in url:
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
else: else:
separator = '&' if '?' in url else '?' separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en" url = f"{url}{separator}hl=en"
# Add US region parameter if not present
if 'gl=' not in url:
url = f"{url}&gl=us"
# Set Chrome geolocation to US (Boston, MA) using CDP
# This ensures Google Maps shows US results regardless of server location
try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log.info("Set geolocation to US (Boston, MA)")
except Exception as e:
log.warning(f"Could not set geolocation: {e}")
log.info(f"Loading Google Maps page...") log.info(f"Loading Google Maps page...")
t0 = timing_module.time() t0 = timing_module.time()
driver.get(url) driver.get(url)
@@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
for btn in form_btns: for btn in form_btns:
btn_text = (btn.text or '').lower() btn_text = (btn.text or '').lower()
if 'aceptar todo' in btn_text or 'accept all' in btn_text: if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text:
log.info(f"Clicking GDPR consent: {btn.text}") log.info(f"Clicking GDPR consent: {btn.text}")
btn.click() btn.click()
time.sleep(1) # Reduced from 2s time.sleep(1)
break break
else: else:
if len(form_btns) >= 2: if len(form_btns) >= 2:
log.info("Using fallback: clicking second form button") log.info("Using fallback: clicking second form button")
form_btns[1].click() form_btns[1].click()
time.sleep(1) # Reduced from 2s time.sleep(1)
except Exception as e: except Exception as e:
log.warning(f"GDPR consent handling failed: {e}") log.warning(f"GDPR consent handling failed: {e}")
# After GDPR consent, reload the original URL to ensure proper page state
log.info(f"Reloading original URL after GDPR consent...")
driver.get(url)
time.sleep(1)
log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s") log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
else: else:
log.info(f"[PROFILE] No GDPR consent page (0.00s)") log.info(f"[PROFILE] No GDPR consent page (0.00s)")
@@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
try: try:
log.info("Waiting for Google Maps content to load...") log.info("Waiting for Google Maps content to load...")
wait = WebDriverWait(driver, 10) wait = WebDriverWait(driver, 10)
# Wait for basic page structure (h1 or heading)
wait.until( wait.until(
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]') lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]')
) )
log.info("Google Maps content loaded successfully") log.info("Basic page structure loaded")
# Wait for page to settle - search URLs redirect to place URLs
# which triggers additional content loading
time.sleep(2)
# Wait specifically for review count element (aria-label ending with "reviews")
# This is the most reliable indicator that the business detail is loaded
try:
WebDriverWait(driver, 5).until(
lambda d: d.execute_script("""
var elems = document.querySelectorAll('[aria-label]');
for (var i = 0; i < elems.length; i++) {
var label = elems[i].getAttribute('aria-label') || '';
if (/^[0-9]+ reviews?$/.test(label)) return true;
}
return false;
""")
)
log.info("Review count element loaded")
except:
# Fallback: Try clicking Reviews tab or rating stars to expose the review count
log.info("Review count wait timeout, trying to click Reviews/rating...")
try:
# Try 1: Click Reviews tab (if exists)
clicked = driver.execute_script("""
var tabs = document.querySelectorAll('[role="tab"]');
for (var i = 0; i < tabs.length; i++) {
var txt = (tabs[i].textContent || '').toLowerCase();
if (txt.includes('review')) {
tabs[i].click();
return 'tab';
}
}
// Try 2: Click the rating stars element (often links to reviews)
var stars = document.querySelector('[role="img"][aria-label*="star"]');
if (stars) {
var parent = stars.parentElement;
if (parent && parent.tagName.toLowerCase() === 'button') {
parent.click();
return 'stars_button';
}
stars.click();
return 'stars';
}
// Try 3: Click "Write a review" or any review-related button
var btns = document.querySelectorAll('button[aria-label*="review" i]');
for (var b = 0; b < btns.length; b++) {
var label = btns[b].getAttribute('aria-label') || '';
if (!/write/i.test(label) && /review/i.test(label)) {
btns[b].click();
return 'review_btn: ' + label;
}
}
return 'none';
""")
log.info(f"Clicked: {clicked}")
time.sleep(2) # Wait for reviews panel to load
except Exception as e:
log.warning(f"Click attempt failed: {e}")
except Exception as e: except Exception as e:
log.warning(f"Timeout waiting for Maps content: {e}") log.warning(f"Timeout waiting for Maps content: {e}")
time.sleep(0.5) # Minimal fallback wait time.sleep(2) # Fallback wait
log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s") log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")
log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...")
log.info(f"DEBUG: Page title: {driver.title}")
# Extract business card information using JavaScript # Extract business card information using JavaScript
t0 = timing_module.time() t0 = timing_module.time()
@@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
total_reviews: null total_reviews: null
}; };
// Extract business name // ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============
const nameSelectors = [
'h1.DUwDvf',
'[role="main"] h1',
'h1.fontHeadlineLarge'
];
for (const selector of nameSelectors) { // Helper: Parse review count from text, handling multiple formats
const elem = document.querySelector(selector); function parseReviewCount(text) {
if (elem && elem.textContent) { if (!text) return null;
info.name = elem.textContent.trim();
break;
}
}
// Extract address // Pattern 1: Exact "N reviews" format (aria-labels, clean text)
const addressSelectors = [ // Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis"
'button[data-item-id*="address"]', var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i);
'[data-item-id*="address"]',
'div[aria-label*="Address"]'
];
for (const selector of addressSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.address = elem.textContent.trim();
break;
}
}
// Extract rating (look for aria-label like "4.2 stars")
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
if (ratingElem) {
const ariaLabel = ratingElem.getAttribute('aria-label');
const match = ariaLabel.match(/([0-9.]+)/);
if (match) { if (match) {
info.rating = parseFloat(match[1]); return parseInt(match[1].replace(/[,. ]/g, ''));
} }
}
// Extract total review count // Pattern 2: "(N)" format often used in tabs like "Reviews (27)"
const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/; match = text.match(/[(]([0-9][0-9,.]*)[)]$/);
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i; if (match) {
return parseInt(match[1].replace(/[,. ]/g, ''));
}
// PRIORITY 1: Look for review count in search results sidebar/panel // Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives)
// This is where "152 reviews" appears on search results if (text.length < 30) {
const searchPanelSelectors = [ match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i);
'a[href*="reviews"]', // Link with "reviews" in href
'button[jsaction*="reviews"]', // Button related to reviews
'div[role="link"]', // Clickable divs that might contain review info
];
for (const selector of searchPanelSelectors) {
const elements = document.querySelectorAll(selector);
for (let elem of elements) {
const text = elem.textContent || '';
const match = text.match(numberPattern);
if (match) { if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); return parseInt(match[1].replace(/[,. ]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
break;
}
} }
} }
if (info.total_reviews) break;
return null;
} }
// PRIORITY 2: Look in any span/div that contains the word "review" // ============ EXTRACT BUSINESS NAME ============
// Priority: h1 (semantic), then role="heading"
const h1 = document.querySelector('h1');
if (h1 && h1.textContent) {
info.name = h1.textContent.trim();
}
if (!info.name) {
const heading = document.querySelector('[role="heading"][aria-level="1"]');
if (heading && heading.textContent) {
info.name = heading.textContent.trim();
}
}
// ============ EXTRACT ADDRESS ============
// Priority: data-item-id (semantic), then aria-label containing "address"
const addressElem = document.querySelector('[data-item-id*="address"]');
if (addressElem && addressElem.textContent) {
info.address = addressElem.textContent.trim();
}
if (!info.address) {
const ariaAddress = document.querySelector('[aria-label*="ddress"]');
if (ariaAddress && ariaAddress.textContent) {
info.address = ariaAddress.textContent.trim();
}
}
// ============ EXTRACT RATING ============
// Priority: aria-label containing "star" on role="img" elements
info._debug_rating_context = [];
const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]');
for (let elem of ratingElems) {
const ariaLabel = elem.getAttribute('aria-label') || '';
// Match "4.9 stars" or "4,9 stars" (European format)
const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i);
if (match) {
info.rating = parseFloat(match[1].replace(',', '.'));
// DEBUG: Capture parent/sibling context to find review count
var parent = elem.parentElement;
if (parent) {
info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100));
var grandparent = parent.parentElement;
if (grandparent) {
info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100));
// Check all children of grandparent for review count
var gpChildren = grandparent.querySelectorAll('*');
for (var c = 0; c < Math.min(gpChildren.length, 30); c++) {
var childText = (gpChildren[c].textContent || '').trim();
if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) {
info._debug_rating_context.push('GP_CHILD: ' + childText);
}
}
// Also check great-grandparent
var ggp = grandparent.parentElement;
if (ggp) {
info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150));
}
}
// Check siblings
var nextSib = parent.nextElementSibling;
if (nextSib) {
info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100));
}
}
break;
}
}
// ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============
// PRIORITY 1: aria-label with exact "N reviews" format (most reliable)
// Google Maps uses aria-label="27 reviews" for accessibility
info._debug_aria = [];
info._debug_all_numeric = [];
if (!info.total_reviews) { if (!info.total_reviews) {
const allElements = document.querySelectorAll('span, div, a'); var ariaElems = document.querySelectorAll('[aria-label]');
for (let elem of allElements) { for (var i = 0; i < ariaElems.length; i++) {
const text = elem.textContent || ''; var ariaLabel = ariaElems[i].getAttribute('aria-label') || '';
if (text.length < 100) { // Skip very long text blocks // Collect all labels containing "review"
const match = text.match(numberPattern); if (ariaLabel.toLowerCase().indexOf('review') >= 0) {
info._debug_aria.push(ariaLabel);
}
// Collect all labels starting with a digit
if (/^[0-9]/.test(ariaLabel)) {
info._debug_all_numeric.push(ariaLabel);
}
var count = parseReviewCount(ariaLabel);
if (count && count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = ariaLabel;
break;
}
}
}
// DEBUG: Find all text with parenthetical numbers like "(27)"
info._debug_parens = [];
info._debug_short_text = []; // All short text with numbers
var allSpans = document.querySelectorAll('span, div, a, button');
for (var j = 0; j < Math.min(allSpans.length, 500); j++) {
var spanText = allSpans[j].textContent || '';
// Capture parenthetical numbers
if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) {
info._debug_parens.push(spanText.trim());
}
// Capture ALL short text containing numbers (for debugging)
if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) {
var cleaned = spanText.trim().replace(/\\s+/g, ' ');
if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) {
info._debug_short_text.push(cleaned);
}
}
}
// PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page
// This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels
if (!info.total_reviews) {
var allElems = document.querySelectorAll('*');
for (var k = 0; k < Math.min(allElems.length, 1000); k++) {
var elem = allElems[k];
// Skip if has children (we want leaf nodes only)
if (elem.children.length > 0) continue;
var txt = (elem.textContent || '').trim();
// Look for short text with both numbers and "review" word
if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) {
var match = txt.match(/([0-9][0-9,]*)/);
if (match) { if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); var count = parseInt(match[1].replace(/,/g, ''));
if (num > 0 && num < 1000000) { if (count > 0 && count < 100000) {
info.total_reviews = num; info.total_reviews = count;
info._debug_matched = 'LEAF: ' + txt;
break; break;
} }
} }
@@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
} }
} }
// PRIORITY 3: Try tabs (for business detail pages) // DEBUG: Collect all tab names
info._debug_tabs = [];
const tabs = document.querySelectorAll('[role="tab"]');
for (let t = 0; t < tabs.length; t++) {
info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30));
}
// DEBUG: Collect all buttons with text (might contain review count)
info._debug_buttons = [];
const buttons = document.querySelectorAll('button');
for (let b = 0; b < Math.min(buttons.length, 20); b++) {
var btnText = (buttons[b].textContent || '').trim();
if (btnText && btnText.length < 40) {
info._debug_buttons.push(btnText.substring(0, 40));
}
}
// PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count)
if (!info.total_reviews) { if (!info.total_reviews) {
const tabs = document.querySelectorAll('button[role="tab"]');
for (let tab of tabs) { for (let tab of tabs) {
const text = tab.textContent || ''; const text = (tab.textContent || '').trim();
let match = text.match(reviewPattern); // Look for "Reviews" tab with count
if (match) { if (text.toLowerCase().includes('review')) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, '')); const count = parseReviewCount(text);
break; if (count && count > 0) {
info.total_reviews = count;
info._debug_matched = 'TAB: ' + text;
break;
}
} }
match = text.match(numberPattern); }
if (match) { }
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
// PRIORITY 2.3: Reviews panel header (after clicking Reviews tab)
// Google Maps shows "27 reviews" as heading text in the reviews panel
if (!info.total_reviews) {
// Look for headings containing review count
var headings = document.querySelectorAll('h1, h2, [role="heading"]');
for (var h = 0; h < headings.length; h++) {
var hText = (headings[h].textContent || '').trim();
if (/review/i.test(hText)) {
var match = hText.match(/([0-9][0-9,]*)/);
if (match) {
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'HEADING: ' + hText;
break;
}
}
}
}
}
// PRIORITY 2.4: Look for sort button area which often has total count
// The sort dropdown area displays "Sort: Newest" and total reviews
if (!info.total_reviews) {
var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]');
for (var s = 0; s < sortBtns.length; s++) {
var parent = sortBtns[s].parentElement;
if (parent) {
var pText = (parent.textContent || '').trim();
if (/review/i.test(pText)) {
var match = pText.match(/([0-9][0-9,]*)\\s*review/i);
if (match) {
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50);
break;
}
}
}
}
}
}
// PRIORITY 3: Elements with semantic review-related attributes
if (!info.total_reviews) {
const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]');
for (let elem of reviewLinks) {
const text = (elem.textContent || '').trim();
const count = parseReviewCount(text);
if (count && count > 0) {
info.total_reviews = count;
break; break;
} }
} }
} }
// PRIORITY 4: Try aria-labels // PRIORITY 4: Look for standalone review count text near rating
// Find elements that contain ONLY "N reviews" pattern (not concatenated with rating)
if (!info.total_reviews) { if (!info.total_reviews) {
const elements = document.querySelectorAll('[aria-label]'); const allElements = document.querySelectorAll('span, a');
for (let elem of elements) { for (let elem of allElements) {
const ariaLabel = elem.getAttribute('aria-label') || ''; // Get direct text content only (not nested children)
let match = ariaLabel.match(reviewPattern); const text = (elem.textContent || '').trim();
if (match) { // Skip if too long (likely contains other content)
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, '')); if (text.length > 50) continue;
// Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews")
if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue;
const count = parseReviewCount(text);
if (count && count > 0 && count < 100000) {
info.total_reviews = count;
break; break;
} }
match = ariaLabel.match(numberPattern); }
if (match) { }
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break; // PRIORITY 5: Parse from visible page text using regex on short text blocks
if (!info.total_reviews) {
const walker = document.createTreeWalker(
document.body,
NodeFilter.SHOW_TEXT,
null,
false
);
while (walker.nextNode()) {
const text = walker.currentNode.textContent.trim();
if (text.length >= 5 && text.length <= 30) {
// Match "27 reviews" but not "4.927 reviews"
const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i);
if (match) {
const count = parseInt(match[1].replace(/[,]/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'WALKER: ' + text;
break;
}
}
}
}
}
// PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts)
if (!info.total_reviews) {
var scripts = document.querySelectorAll('script');
for (var sc = 0; sc < scripts.length; sc++) {
var scriptText = scripts[sc].textContent || '';
// Look for patterns like "user_reviews":{"count":27} or reviews_count":27
var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i);
if (jsonMatch) {
var count = parseInt(jsonMatch[1]);
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'JSON_SCRIPT';
break;
}
}
// Also look for review count in Google's data format like [\"27 reviews\"]
if (!info.total_reviews) {
var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i);
if (dataMatch) {
var count = parseInt(dataMatch[1]);
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'JSON_DATA: ' + dataMatch[0];
break;
}
}
} }
} }
} }
@@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***") log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
log.info(f"Business card extracted: name={business_info.get('name')}, " log.info(f"Business card extracted: name={business_info.get('name')}, "
f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}") f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
# Debug: log what aria-labels were found
if business_info.get('_debug_aria'):
log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}")
if business_info.get('_debug_matched'):
log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}")
# Also log all numeric aria-labels (potential review counts)
if business_info.get('_debug_all_numeric'):
log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}")
# Log any text with parenthetical numbers like "(27)"
if business_info.get('_debug_parens'):
log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}")
# Log all short text containing numbers (for debugging review count detection)
if business_info.get('_debug_short_text'):
log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}")
# Log the context around the rating element
if business_info.get('_debug_rating_context'):
for ctx in business_info.get('_debug_rating_context', []):
log.info(f"DEBUG: Rating context: {ctx}")
# Log what tabs exist on the page
if business_info.get('_debug_tabs'):
log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}")
else:
log.info(f"DEBUG: No tabs found on page")
# Log buttons (might contain review count)
if business_info.get('_debug_buttons'):
log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}")
result = { result = {
"name": business_info.get('name'), "name": business_info.get('name'),

View File

@@ -1,407 +0,0 @@
"""
Background job manager for Google Reviews Scraper.
"""
import asyncio
import logging
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from enum import Enum
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, asdict
from modules.config import load_config
from modules.scraper import GoogleReviewsScraper
from modules.scraper_clean import fast_scrape_reviews # Updated to use clean scraper with hard refresh recovery
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
log = logging.getLogger("scraper")
class JobStatus(str, Enum):
    """Lifecycle states of a scraping job.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (e.g. ``JobStatus.PENDING == "pending"``).
    """

    # Created but not yet handed to the executor.
    PENDING = "pending"
    # Currently being processed by a worker thread.
    RUNNING = "running"
    # Terminal states.
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
@dataclass
class ScrapingJob:
    """In-memory record describing one scraping job and its outcome."""

    job_id: str
    status: JobStatus
    url: str
    config: Dict[str, Any]
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None  # Last update time (for progress tracking)
    error_message: Optional[str] = None
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available (from page counter)
    images_count: Optional[int] = None
    progress: Optional[Dict[str, Any]] = None
    reviews_data: Optional[List[Dict[str, Any]]] = None  # Store actual review data
    scrape_time: Optional[float] = None  # Time taken to scrape

    def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
        """
        Convert job to dictionary for JSON serialization

        Every timestamp field that is set (``created_at``, ``started_at``,
        ``completed_at`` and ``updated_at``) is rendered as an ISO-8601
        string; unset timestamps stay ``None``.

        Args:
            include_reviews: Whether to include the full reviews data (default: False)

        Returns:
            A plain dictionary copy of the job's fields.
        """
        data = asdict(self)

        # Convert datetime objects to ISO strings. ``updated_at`` must be
        # included as well, otherwise the result still holds a raw datetime
        # that ``json.dumps`` cannot encode.
        for name in ('created_at', 'started_at', 'completed_at', 'updated_at'):
            if data[name] is not None:
                data[name] = data[name].isoformat()

        # Exclude reviews_data by default (can be large)
        if not include_reviews:
            data.pop('reviews_data', None)

        return data
class JobManager:
"""Manager for background scraping jobs"""
def __init__(self, max_concurrent_jobs: int = 3):
"""Initialize job manager"""
self.max_concurrent_jobs = max_concurrent_jobs
self.jobs: Dict[str, ScrapingJob] = {}
self.executor = ThreadPoolExecutor(max_workers=max_concurrent_jobs)
self.lock = threading.Lock()
def create_job(self, url: str, config_overrides: Dict[str, Any] = None) -> str:
    """
    Register a new pending scraping job and return its identifier.

    The job's config is whatever ``load_config()`` returns, with the
    ``"url"`` key set to *url* and then *config_overrides* merged on top.
    The job is only stored; nothing is started here.

    Args:
        url: Google Maps URL to scrape
        config_overrides: Optional mapping merged over the base config

    Returns:
        The generated job ID (a UUID4 rendered as a string)
    """
    job_id = str(uuid.uuid4())

    # Base config, then the URL, then caller overrides (applied last).
    job_config = load_config()
    job_config["url"] = url
    job_config.update(config_overrides or {})

    pending_job = ScrapingJob(
        job_id=job_id,
        status=JobStatus.PENDING,
        url=url,
        config=job_config,
        created_at=datetime.now(),
        progress={"stage": "created", "message": "Job created and queued"},
    )

    with self.lock:
        self.jobs[job_id] = pending_job

    log.info(f"Created scraping job {job_id} for URL: {url}")
    return job_id
def start_job(self, job_id: str) -> bool:
    """
    Move a pending job to RUNNING and hand it to the thread pool.

    Args:
        job_id: Job ID to start

    Returns:
        True if the job was submitted; False when the ID is unknown, the
        job is not pending, or the concurrency limit is already reached.
    """
    with self.lock:
        job = self.jobs.get(job_id)
        if job is None:
            return False
        if job.status != JobStatus.PENDING:
            return False

        # Refuse to start when the concurrency budget is used up.
        active = [j for j in self.jobs.values() if j.status == JobStatus.RUNNING]
        if len(active) >= self.max_concurrent_jobs:
            return False

        job.status = JobStatus.RUNNING
        job.started_at = datetime.now()
        job.updated_at = datetime.now()
        job.progress = {"stage": "starting", "message": "Initializing scraper"}

    # Run the scrape on a pool thread; the returned future is not needed.
    self.executor.submit(self._run_scraping_job, job_id)
    log.info(f"Started scraping job {job_id}")
    return True
def _run_scraping_job(self, job_id: str):
    """
    Run the actual scraping job in background thread.

    Acquires a Chrome worker from the scraping pool, runs
    ``fast_scrape_reviews`` with that worker's driver, and records the
    outcome on the job. The worker is always handed back to the pool:
    recycled when an exception occurred, released normally otherwise.

    A job that was cancelled (or deleted) while it was queued or running
    keeps that state; its result is discarded rather than overwriting the
    CANCELLED status.

    Args:
        job_id: Job ID to run
    """
    def progress_callback(current_count: int, total_count: int):
        """Update job progress during scraping"""
        with self.lock:
            job = self.jobs.get(job_id)
            # Do not resurrect the progress of a cancelled/deleted job.
            if job is None or job.status == JobStatus.CANCELLED:
                return
            job.reviews_count = current_count
            job.total_reviews = total_count
            job.updated_at = datetime.now()  # Update last update time
            # Calculate percentage for better UX. The total may be unknown
            # (None) or zero, in which case the percentage is reported as 0.
            if total_count and total_count > 0:
                percentage = int(current_count / total_count * 100)
            else:
                percentage = 0
            job.progress = {
                "stage": "scraping",
                "message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
                "percentage": percentage
            }

    worker = None
    try:
        with self.lock:
            job = self.jobs[job_id]
            if job.status == JobStatus.CANCELLED:
                # Cancelled between start_job() and this thread picking it up.
                log.info(f"Job {job_id}: cancelled before start, skipping")
                return
            job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}

        # Get a worker from the scraping pool (outside the lock: may block)
        worker = get_scraping_worker(timeout=30)
        if not worker:
            raise Exception("No Chrome workers available. Pool may be at capacity.")
        log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")

        # Get config
        url = job.config.get('url')
        headless = job.config.get('headless', True)  # Default to headless
        max_scrolls = job.config.get('max_scrolls', 999999)  # Effectively unlimited - relies on idle detection

        with self.lock:
            if job.status != JobStatus.CANCELLED:
                job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}

        # Run the FAST scraping with progress callback using pooled worker.
        # Must run outside the lock: progress_callback acquires it.
        result = fast_scrape_reviews(
            url=url,
            headless=headless,
            max_scrolls=max_scrolls,
            progress_callback=progress_callback,
            driver=worker.driver,  # Use worker's driver
            return_driver=True  # Don't close the driver
        )

        # Pop the driver from result before storing
        result.pop('driver', None)

        # Mark job as completed or failed (unless it was cancelled meanwhile)
        with self.lock:
            if job.status == JobStatus.CANCELLED:
                log.info(f"Job {job_id}: cancelled while running, discarding result")
            elif result['success']:
                job.status = JobStatus.COMPLETED
                job.completed_at = datetime.now()
                job.updated_at = datetime.now()
                job.reviews_count = result['count']
                job.total_reviews = result.get('total_reviews')  # Store total review count from page
                job.reviews_data = result['reviews']  # Store the actual reviews
                job.scrape_time = result['time']
                job.progress = {
                    "stage": "completed",
                    "message": f"Scraping completed successfully in {result['time']:.1f}s",
                    "scroll_time": result.get('scroll_time'),
                    "extract_time": result.get('extract_time')
                }
                log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
            else:
                job.status = JobStatus.FAILED
                job.completed_at = datetime.now()
                job.updated_at = datetime.now()
                job.error_message = result.get('error', 'Unknown error')
                job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
                log.error(f"Failed scraping job {job_id}: {result.get('error')}")
    except Exception as e:
        # log.exception records the traceback through the logger instead of
        # printing it to stderr.
        log.exception(f"Error in scraping job {job_id}: {e}")
        with self.lock:
            # The job may have been deleted while running; .get avoids a
            # KeyError escaping from this handler.
            job = self.jobs.get(job_id)
            if job is not None and job.status != JobStatus.CANCELLED:
                job.status = JobStatus.FAILED
                job.completed_at = datetime.now()
                job.updated_at = datetime.now()
                job.error_message = str(e)
                job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}

        # Recycle worker on error
        if worker:
            log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
            release_scraping_worker(worker, recycle=True)
            worker = None  # Mark as released
    finally:
        # Release worker back to pool if not already released
        if worker:
            log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
            release_scraping_worker(worker, recycle=False)
def get_job(self, job_id: str) -> Optional[ScrapingJob]:
    """
    Look up a job in the registry.

    Args:
        job_id: Job ID

    Returns:
        The stored job object, or None when the ID is unknown
    """
    with self.lock:
        found = self.jobs.get(job_id)
    return found
def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
    """
    Return the stored reviews of a completed job.

    Args:
        job_id: Job ID

    Returns:
        The job's ``reviews_data``, or None when the job is unknown or its
        status is not COMPLETED
    """
    with self.lock:
        job = self.jobs.get(job_id)
        # Only completed jobs expose their reviews.
        if not job or job.status != JobStatus.COMPLETED:
            return None
        return job.reviews_data
def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
    """
    Return jobs ordered newest-first, optionally restricted to one status.

    Args:
        status: When given (and truthy), only jobs with this status are kept
        limit: Maximum number of jobs in the returned list

    Returns:
        At most `limit` jobs sorted by created_at, newest first
    """
    with self.lock:
        selected = [
            job for job in self.jobs.values()
            if not status or job.status == status
        ]
        # sorted() is stable, so jobs created at the same time keep their relative order.
        newest_first = sorted(selected, key=lambda job: job.created_at, reverse=True)
        return newest_first[:limit]
def cancel_job(self, job_id: str) -> bool:
    """
    Mark a job as cancelled unless it has already reached a final state.

    Only the job record is updated here (status, timestamps, progress).

    Args:
        job_id: Identifier of the job to cancel

    Returns:
        True when the job was switched to CANCELLED; False when the job is
        unknown or already COMPLETED, FAILED or CANCELLED
    """
    with self.lock:
        if job_id not in self.jobs:
            return False

        job = self.jobs[job_id]
        already_finished = job.status in (
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
        )
        if already_finished:
            return False

        job.status = JobStatus.CANCELLED
        job.completed_at = datetime.now()
        job.updated_at = datetime.now()
        job.progress = {"stage": "cancelled", "message": "Job was cancelled"}
        log.info(f"Cancelled scraping job {job_id}")
        return True
def delete_job(self, job_id: str) -> bool:
    """
    Remove a job record from the manager.

    Args:
        job_id: Identifier of the job to remove

    Returns:
        True when a job with that identifier existed and was removed,
        False otherwise
    """
    with self.lock:
        try:
            del self.jobs[job_id]
        except KeyError:
            # Nothing stored under this identifier.
            return False
        log.info(f"Deleted scraping job {job_id}")
        return True
def get_stats(self) -> Dict[str, Any]:
    """
    Summarise the jobs currently held by the manager.

    Returns:
        Dictionary with the total job count, a per-status count keyed by
        each JobStatus value, the number of RUNNING jobs and the configured
        max_concurrent_jobs
    """
    with self.lock:
        snapshot = list(self.jobs.values())
        # Every JobStatus member gets an entry, including those with zero jobs.
        per_status = {
            state.value: len([job for job in snapshot if job.status == state])
            for state in JobStatus
        }
        return {
            "total_jobs": len(snapshot),
            "by_status": per_status,
            "running_jobs": per_status.get(JobStatus.RUNNING.value, 0),
            "max_concurrent_jobs": self.max_concurrent_jobs,
        }
def cleanup_old_jobs(self, max_age_hours: int = 24):
    """
    Drop finished jobs whose completion time is older than the given age.

    Only jobs in COMPLETED, FAILED or CANCELLED state that have a
    completed_at timestamp are considered.

    Args:
        max_age_hours: Age in hours after which a finished job is removed
    """
    cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)
    with self.lock:
        # Collect the identifiers first; the dict cannot shrink while being iterated.
        expired = [
            job_id
            for job_id, job in self.jobs.items()
            if job.status in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED)
            and job.completed_at
            and job.completed_at.timestamp() < cutoff_time
        ]
        for job_id in expired:
            del self.jobs[job_id]
        if expired:
            log.info(f"Cleaned up {len(expired)} old jobs")
def shutdown(self):
    """Log the shutdown and stop the executor, waiting for it to finish."""
    log.info("Shutting down job manager")
    # wait=True makes this call block until the executor has shut down.
    self.executor.shutdown(wait=True)

File diff suppressed because it is too large Load Diff

View File

@@ -1,198 +0,0 @@
#!/usr/bin/env python3
"""
Reverse-engineer Google's date formatting library to understand:
1. What library they use
2. All possible date format patterns
3. Time range boundaries for each pattern
"""
import json
import re
from seleniumbase import Driver
import time
# Google Maps place page (English UI, hl=en) whose reviews are inspected.
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"
print("Starting browser...")
driver = Driver(uc=True, headless=False)
try:
    print(f"Loading URL: {url}")
    driver.get(url)
    # Fixed pause to let the page finish loading before scripts are injected.
    time.sleep(8)
    # Script to find date formatting function.
    # The JavaScript is kept flush-left so the injected text stays unchanged.
    # Fix: the singular pattern now uses "an?" so that "an hour ago" is
    # matched as well as "a day ago" (previously only "a <unit> ago" matched).
    find_formatter_script = """
const results = {
scripts: [],
potential_formatters: [],
date_strings: []
};
// 1. Search all script tags for date-related code
const scriptTags = document.querySelectorAll('script');
let scriptContent = '';
scriptTags.forEach((script, idx) => {
const content = script.textContent || script.innerText;
if (content) {
scriptContent += content + '\\n';
// Look for date formatting patterns
if (content.includes('ago') || content.includes('month') || content.includes('year')) {
const snippet = content.substring(0, 500);
results.scripts.push({
index: idx,
snippet: snippet,
length: content.length
});
}
}
});
// 2. Search for common date formatting library signatures
const librarySignatures = [
'moment',
'date-fns',
'dayjs',
'luxon',
'timeago',
'formatRelative',
'relativeTime',
'fromNow'
];
librarySignatures.forEach(sig => {
if (scriptContent.includes(sig)) {
results.potential_formatters.push(sig);
}
});
// 3. Try to find the actual formatting function by injecting test dates
// Look for Google's internal date formatter
const googleFormatters = [];
for (let key in window) {
if (typeof window[key] === 'function') {
const funcStr = window[key].toString();
if (funcStr.includes('ago') && funcStr.includes('month')) {
googleFormatters.push({
name: key,
signature: funcStr.substring(0, 200)
});
}
}
}
results.google_formatters = googleFormatters;
// 4. Extract all "X ago" patterns from the page
const pageText = document.body.innerText;
const agoPatterns = pageText.match(/\\d+\\s+(second|minute|hour|day|week|month|year)s?\\s+ago/gi) || [];
const singlePatterns = pageText.match(/\\ban?\\s+(second|minute|hour|day|week|month|year)\\s+ago/gi) || [];
results.date_strings = [...new Set([...agoPatterns, ...singlePatterns])];
return results;
"""
    print("Searching for date formatting code...")
    formatter_info = driver.execute_script(find_formatter_script)
    print("\n" + "="*80)
    print("FINDINGS:")
    print("="*80)
    print(f"\n1. Scripts with date-related code: {len(formatter_info.get('scripts', []))}")
    print(f"\n2. Potential libraries detected: {formatter_info.get('potential_formatters', [])}")
    print(f"\n3. Google formatter functions found: {len(formatter_info.get('google_formatters', []))}")
    # Only the first three candidate functions are shown to keep output short.
    for gf in formatter_info.get('google_formatters', [])[:3]:
        print(f" - {gf['name']}: {gf['signature'][:100]}...")
    print(f"\n4. Date patterns found on page:")
    date_strings = formatter_info.get('date_strings', [])
    for ds in sorted(set(date_strings))[:20]:
        print(f" - '{ds}'")
    # Now let's test different timestamps to understand the boundaries
    print("\n" + "="*80)
    print("TESTING TIME RANGE BOUNDARIES:")
    print("="*80)
    # We need to inject JavaScript that can format dates like Google does
    # Let's search the actual DOM for the pattern
    boundary_test_script = """
// Collect all unique date strings from reviews
const dateElements = document.querySelectorAll('span.rsqaWe');
const dateStrings = new Set();
dateElements.forEach(elem => {
const text = elem.textContent.trim();
if (text) {
dateStrings.add(text);
}
});
return Array.from(dateStrings).sort();
"""
    all_date_strings = driver.execute_script(boundary_test_script)
    print(f"\nFound {len(all_date_strings)} unique date formats:")
    for ds in all_date_strings[:30]:
        print(f" - '{ds}'")
    # Analyze the patterns: bucket every date string by the first time unit
    # it mentions, checked in this fixed order.
    print("\n" + "="*80)
    print("PATTERN ANALYSIS:")
    print("="*80)
    patterns = {
        'seconds': [],
        'minutes': [],
        'hours': [],
        'days': [],
        'weeks': [],
        'months': [],
        'years': []
    }
    for ds in all_date_strings:
        ds_lower = ds.lower()
        if 'second' in ds_lower:
            patterns['seconds'].append(ds)
        elif 'minute' in ds_lower:
            patterns['minutes'].append(ds)
        elif 'hour' in ds_lower:
            patterns['hours'].append(ds)
        elif 'day' in ds_lower:
            patterns['days'].append(ds)
        elif 'week' in ds_lower:
            patterns['weeks'].append(ds)
        elif 'month' in ds_lower:
            patterns['months'].append(ds)
        elif 'year' in ds_lower:
            patterns['years'].append(ds)
    for unit, examples in patterns.items():
        if examples:
            print(f"\n{unit.upper()}:")
            for ex in examples[:5]:
                print(f" - '{ex}'")
    # Save all data (empty unit buckets are left out of the report).
    output = {
        'formatter_info': formatter_info,
        'all_date_strings': all_date_strings,
        'pattern_analysis': {k: v for k, v in patterns.items() if v}
    }
    with open('/tmp/google_date_formatter_analysis.json', 'w') as f:
        json.dump(output, f, indent=2)
    print("\n" + "="*80)
    print("Full analysis saved to: /tmp/google_date_formatter_analysis.json")
    print("="*80)
finally:
    # Always close the browser, even when the analysis fails part-way.
    driver.quit()
    print("\nBrowser closed")

View File

@@ -1,175 +0,0 @@
#!/usr/bin/env python3
"""
Reverse-engineer Google's date formatting patterns by scraping reviews in English
"""
import json
from modules.fast_scraper import fast_scrape_reviews
# Google Maps place page (English UI, hl=en) whose review dates are analysed.
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"
print("Scraping reviews in English...")
result = fast_scrape_reviews(url, headless=True)
reviews = result.get('reviews', [])
print(f"\nExtracted {len(reviews)} reviews")
if reviews:
    # Collect all unique date strings
    date_strings = set()
    for rev in reviews:
        date_text = rev.get('date_text')
        if date_text:
            date_strings.add(date_text)
    print(f"\nFound {len(date_strings)} unique date formats:")
    for ds in sorted(date_strings):
        print(f" '{ds}'")
    # Analyze patterns: bucket each string by the first unit it mentions,
    # checked in this fixed order.
    print("\n" + "="*80)
    print("PATTERN ANALYSIS:")
    print("="*80)
    patterns = {
        'seconds': [],
        'minutes': [],
        'hours': [],
        'days': [],
        'weeks': [],
        'months': [],
        'years': []
    }
    for ds in date_strings:
        ds_lower = ds.lower()
        if 'second' in ds_lower:
            patterns['seconds'].append(ds)
        elif 'minute' in ds_lower:
            patterns['minutes'].append(ds)
        elif 'hour' in ds_lower:
            patterns['hours'].append(ds)
        elif 'day' in ds_lower:
            patterns['days'].append(ds)
        elif 'week' in ds_lower:
            patterns['weeks'].append(ds)
        elif 'month' in ds_lower:
            patterns['months'].append(ds)
        elif 'year' in ds_lower:
            patterns['years'].append(ds)
    for unit, examples in sorted(patterns.items()):
        if examples:
            print(f"\n{unit.upper()} ({len(examples)} patterns):")
            for ex in sorted(examples):
                print(f" '{ex}'")
    # Identify the specific patterns
    print("\n" + "="*80)
    print("GOOGLE MAPS DATE FORMAT PATTERNS (English):")
    print("="*80)
    print("\nPattern Structure:")
    print("-" * 80)
    single_unit_patterns = []  # "a month ago" / "an hour ago"
    plural_patterns = []  # "3 months ago"
    for ds in sorted(date_strings):
        words = ds.split()
        # Fix: "an hour ago" is singular too; matching only 'a ' dropped it
        # from both lists.
        if ds.startswith(('a ', 'an ')):
            single_unit_patterns.append(ds)
        # Fix: guard against whitespace-only strings, where split() is empty
        # and indexing [0] would raise IndexError.
        elif words and words[0].isdigit():
            plural_patterns.append(ds)
    print(f"\nSingular (a X ago): {len(single_unit_patterns)} patterns")
    for p in sorted(single_unit_patterns):
        print(f" '{p}'")
    print(f"\nPlural (N Xs ago): {len(plural_patterns)} patterns")
    for p in sorted(plural_patterns):
        print(f" '{p}'")
    # Determine time ranges
    print("\n" + "="*80)
    print("TIME RANGE BOUNDARIES:")
    print("="*80)
    # Extract numbers from plural patterns
    import re
    from collections import defaultdict
    unit_values = defaultdict(list)
    for ds in date_strings:
        match = re.match(r'(\d+)\s+(\w+)\s+ago', ds.lower())
        if match:
            number = int(match.group(1))
            unit = match.group(2).rstrip('s')  # Remove plural 's'
            unit_values[unit].append(number)
    for unit, values in sorted(unit_values.items()):
        if values:
            print(f"\n{unit.upper()}:")
            print(f" Range: {min(values)} - {max(values)}")
            print(f" Values found: {sorted(set(values))}")
    # Save analysis (units without any observed values are left out).
    output = {
        'total_reviews': len(reviews),
        'unique_date_formats': len(date_strings),
        'all_date_strings': sorted(list(date_strings)),
        'patterns_by_unit': {k: sorted(v) for k, v in patterns.items() if v},
        'singular_patterns': sorted(single_unit_patterns),
        'plural_patterns': sorted(plural_patterns),
        'value_ranges': {unit: {'min': min(values), 'max': max(values), 'values': sorted(set(values))}
                         for unit, values in unit_values.items() if values}
    }
    with open('/tmp/google_date_patterns_english.json', 'w') as f:
        json.dump(output, f, indent=2)
    print("\n" + "="*80)
    print("Analysis saved to: /tmp/google_date_patterns_english.json")
    print("="*80)
    # Now let's determine the EXACT library/algorithm Google uses.
    # Everything printed below is a fixed hypothesis text; only the unit
    # lines depend on which units were actually observed above.
    print("\n" + "="*80)
    print("REVERSE-ENGINEERING GOOGLE'S ALGORITHM:")
    print("="*80)
    print("\nBased on the patterns, Google's relative date formatter:")
    print("-" * 80)
    print("\n1. FORMAT STRUCTURE:")
    print(" Single unit: 'a {unit} ago'")
    print(" Multiple: '{number} {unit}s ago'")
    print("\n2. UNIT SELECTION (hypothesis):")
    if 'second' in unit_values:
        print(f" - Seconds: Used for 0-59 seconds ago")
    if 'minute' in unit_values:
        print(f" - Minutes: Used for 1-59 minutes ago")
    if 'hour' in unit_values:
        print(f" - Hours: Used for 1-23 hours ago")
    if 'day' in unit_values:
        print(f" - Days: Used for 1-6 days ago")
    if 'week' in unit_values:
        print(f" - Weeks: Used for 1-3 weeks ago")
    if 'month' in unit_values:
        print(f" - Months: Used for 1-11 months ago")
    if 'year' in unit_values:
        print(f" - Years: Used for 1+ years ago")
    print("\n3. BOUNDARY THRESHOLDS (estimated):")
    print(" 60 seconds = switch to minutes")
    print(" 60 minutes = switch to hours")
    print(" 24 hours = switch to days")
    print(" 7 days = switch to weeks")
    print(" ~30 days (4 weeks) = switch to months")
    print(" 12 months = switch to years")
    print("\n4. UNCERTAINTY RANGES:")
    print(" 'a month ago' = 30-59 days ago (±15 days)")
    print(" '2 months ago' = 60-89 days ago (±15 days)")
    print(" 'a year ago' = 365-729 days ago (±6 months)")
else:
    print("No reviews extracted!")

View File

@@ -1,77 +0,0 @@
#!/usr/bin/env python3
"""
GoogleMaps review scraper with MongoDB integration
=================================================
Main entry point for the scraper.
"""
from modules.cli import parse_arguments
from modules.config import load_config
from modules.scraper import GoogleReviewsScraper
def main():
    """Build the effective configuration from file and CLI, then run the scraper.

    Values from the configuration file are overridden by command line
    arguments: boolean switches override only when set, valued options
    override whenever they are not None, and custom parameters are merged
    into any existing ``custom_params`` mapping.
    """
    args = parse_arguments()
    config = load_config(args.config)

    # (option name, is_switch) in the order the overrides are applied.
    # Switches force the setting to True when given; valued options copy
    # the argument whenever it is not None.
    overrides = (
        ("headless", True),
        ("sort_by", False),
        ("stop_on_match", True),
        ("url", False),
        ("overwrite_existing", True),
        ("use_mongodb", False),
        # Date conversion and image downloading
        ("convert_dates", False),
        ("download_images", False),
        ("image_dir", False),
        ("download_threads", False),
        # Local image paths and URL replacement
        ("store_local_paths", False),
        ("replace_urls", False),
        ("custom_url_base", False),
        ("custom_url_profiles", False),
        ("custom_url_reviews", False),
        ("preserve_original_urls", False),
    )
    for option, is_switch in overrides:
        value = getattr(args, option)
        if is_switch:
            if value:
                config[option] = True
        elif value is not None:
            config[option] = value

    # Custom parameters are merged into the configured ones, not replaced.
    if args.custom_params is not None:
        config.setdefault("custom_params", {}).update(args.custom_params)

    # API interception switch
    if args.enable_api_intercept:
        config["enable_api_intercept"] = True

    scraper = GoogleReviewsScraper(config)
    scraper.scrape()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -1,288 +0,0 @@
#!/usr/bin/env python3
"""
API-Only 244 Scraper - Attempt to get ALL 244 reviews via API alone.
Strategy:
1. More patient scrolling (more scrolls, longer waits)
2. Collect responses more frequently
3. Extra end-of-list collection
4. Slower timing near the end to ensure API completes
Goal: Get all 244 reviews via API without DOM parsing
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load scraper settings from ``config.yaml`` in the current working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get``.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load yields None for an empty document.
        return yaml.safe_load(f) or {}
def _collect_new_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into ``api_reviews``; return how many were added.

    Reviews are keyed by ``review_id``; reviews without an ID or already
    present are skipped. Collection is best-effort: a failure is logged and
    whatever was merged before it is kept.
    """
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
                    added += 1
    except Exception as exc:
        log.warning("Failed to collect intercepted reviews: %s", exc)
    return added


def api_244_scrape():
    """Get all 244 reviews purely via API with aggressive collection.

    Opens the URL from ``config.yaml``, switches to the reviews tab, scrolls
    the reviews pane while collecting intercepted API responses, prints a
    summary and writes the result to ``google_reviews_api_244.json``.

    Returns:
        List of review dicts (one per unique review ID). An empty list when
        no URL is configured or the reviews pane cannot be found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)
    print("API-244 SCRAPER - Getting ALL 244 reviews via API...")
    if not url:
        # Without a URL the slice below and driver.get() would both fail.
        print("ERROR: No 'url' configured in config.yaml")
        return []
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss cookies (best-effort; the banner is not always shown)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as exc:
            log.debug("Cookie banner not dismissed: %s", exc)

        # Click reviews tab. Stop at the first selector that yields a click
        # so the tab is not clicked a second time by the fallback selector.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        tab_clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            if tab_clicked:
                break
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        tab_clicked = True
                        break
            except Exception as exc:
                log.debug("Reviews tab lookup failed for %r: %s", selector, exc)

        # Wait for page stability
        time.sleep(1.0)

        # Find pane: try the primary selector, then the fallback one.
        pane = None
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Longer wait to ensure interceptor is ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response
        print("Scrolling with extended collection strategy...")

        # Extended scrolling - MORE scrolls, SLOWER timing
        max_scrolls = 50  # More scrolls to ensure we catch everything
        idle_scrolls = 0
        max_idle = 15  # Even more patience
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)

            # Progressive timing - slower and slower as more reviews are held
            collected = len(api_reviews)
            if collected < 50:
                delay = 0.30  # Start moderate
            elif collected < 100:
                delay = 0.35
            elif collected < 150:
                delay = 0.40
            elif collected < 200:
                delay = 0.50
            elif collected < 230:
                delay = 0.60  # Much slower near end
            else:
                delay = 0.80  # Very slow for final reviews
            time.sleep(delay)

            _collect_new_reviews(interceptor, api_reviews)

            # Check if we got new reviews
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
            if (i + 1) % 10 == 0:
                print(f" {current_count} reviews...")
            last_count = current_count

            # Check scroll position
            try:
                current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception as exc:
                log.debug("Could not read scroll position: %s", exc)

            # Stop conditions - but only if we have at least 240 reviews
            if idle_scrolls >= max_idle and scroll_stuck_count >= 5 and current_count >= 240:
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # AGGRESSIVE final collection phase
        print(f" Aggressive final collection (currently have {len(api_reviews)})...")
        # Do 10 more scrolls with very long waits
        for _ in range(10):
            driver.execute_script(scroll_script)
            time.sleep(1.2)  # Very long wait
            new_count = _collect_new_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Ultra-final wait and collect
        time.sleep(2.0)
        _collect_new_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        # Both rates divide by the elapsed time, so guard them together.
        if elapsed > 0:
            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
            print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if len(all_reviews) >= 244:
            print("🎯 Got ALL 244 reviews via API!")
        elif len(all_reviews) >= 240:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews - may need DOM parsing")
        else:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews")
        print()

        # Save
        with open('google_reviews_api_244.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print("💾 Saved to google_reviews_api_244.json")
        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}")
        return all_reviews
    finally:
        # Always close the browser; a failure to quit must not mask the result.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)
# Script entry point: exit code 0 when at least one review was scraped,
# 1 when none were, on Ctrl+C, or on any unexpected error.
if __name__ == '__main__':
    try:
        reviews = api_244_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        # Imported here because the traceback is only needed on failure.
        import traceback
        traceback.print_exc()
        sys.exit(1)

View File

@@ -1,280 +0,0 @@
#!/usr/bin/env python3
"""
Complete Scraper - Gets ALL reviews while staying fast.
Strategy:
1. Scroll until no new reviews for 5 consecutive scrolls
2. Check scroll position to detect end
3. Do extra scrolls at the end to catch stragglers
4. Adaptive timing - faster at start, slower at end
Target: Get all 244 reviews in ~22-25 seconds
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load scraper settings from ``config.yaml`` in the current working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get``.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load yields None for an empty document.
        return yaml.safe_load(f) or {}
def _collect_new_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into ``api_reviews``; return how many were added.

    Reviews are keyed by ``review_id``; reviews without an ID or already
    present are skipped. Collection is best-effort: a failure is logged and
    whatever was merged before it is kept.
    """
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
                    added += 1
    except Exception as exc:
        log.warning("Failed to collect intercepted reviews: %s", exc)
    return added


def complete_scrape():
    """Get ALL reviews with intelligent scrolling.

    Opens the URL from ``config.yaml``, switches to the reviews tab, scrolls
    the reviews pane until no new reviews arrive, prints a summary and
    writes the result to ``google_reviews_complete.json``.

    Returns:
        List of review dicts (one per unique review ID). An empty list when
        no URL is configured or the reviews pane cannot be found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)
    print("COMPLETE SCRAPER - Getting ALL reviews...")
    if not url:
        # Without a URL the slice below and driver.get() would both fail.
        print("ERROR: No 'url' configured in config.yaml")
        return []
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss cookies (best-effort; the banner is not always shown)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as exc:
            log.debug("Cookie banner not dismissed: %s", exc)

        # Click reviews tab. Stop at the first selector that yields a click
        # so the tab is not clicked a second time by the fallback selector.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        tab_clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            if tab_clicked:
                break
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        tab_clicked = True
                        break
            except Exception as exc:
                log.debug("Reviews tab lookup failed for %r: %s", selector, exc)

        # Wait for page stability
        time.sleep(1.0)

        # Find pane: try the primary selector, then the fallback one.
        pane = None
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Wait for initial reviews to load
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response
        print("Scrolling with intelligent stopping...")

        # Intelligent scrolling
        max_scrolls = 60  # Higher limit to ensure we get everything
        idle_scrolls = 0  # Count scrolls with no new reviews
        max_idle = 12  # More patience - stop after 12 scrolls with no new reviews
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)

            # Adaptive timing - faster at start, slower near end
            collected = len(api_reviews)
            if collected < 100:
                delay = 0.27  # Fast at beginning
            elif collected < 200:
                delay = 0.30  # Medium in middle
            elif collected < 235:
                delay = 0.40  # Slower near end
            else:
                delay = 0.50  # Very slow at the very end to catch stragglers
            time.sleep(delay)

            _collect_new_reviews(interceptor, api_reviews)

            # Check if we got new reviews
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
            if (i + 1) % 10 == 0:
                print(f" {current_count} reviews...")
            last_count = current_count

            # Check scroll position to detect if stuck at bottom
            try:
                current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception as exc:
                log.debug("Could not read scroll position: %s", exc)

            # Stop conditions
            if idle_scrolls >= max_idle and scroll_stuck_count >= 3:
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # Extra thorough collection at the end
        print(f" Final collection sweep (currently have {len(api_reviews)})...")
        # Do a few more scrolls with longer waits
        for _ in range(5):
            driver.execute_script(scroll_script)
            time.sleep(0.8)  # Longer wait to ensure API completes
            new_count = _collect_new_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Final wait and collect
        time.sleep(1.0)
        _collect_new_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        print("\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)} (target: 244)")
        print(f"Time: {elapsed:.2f}s")
        # Both rates divide by the elapsed time, so guard them together.
        if elapsed > 0:
            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
            print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        if len(all_reviews) >= 244:
            print("🎯 Got ALL reviews!")
        elif len(all_reviews) >= 240:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews")
        print()

        # Save
        with open('google_reviews_complete.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print("💾 Saved to google_reviews_complete.json")
        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}")
        return all_reviews
    finally:
        # Always close the browser; a failure to quit must not mask the result.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)
# Script entry point: exit code 0 when at least one review was scraped,
# 1 when none were, on Ctrl+C, or on any unexpected error.
if __name__ == '__main__':
    try:
        reviews = complete_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        # Imported here because the traceback is only needed on failure.
        import traceback
        traceback.print_exc()
        sys.exit(1)

View File

@@ -1,331 +0,0 @@
#!/usr/bin/env python3
"""
DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.
Strategy:
1. Scroll to load all reviews
2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
3. Should be faster and simpler than API + DOM hybrid
Target: ~20-25 seconds for all 244 reviews with simpler code
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load scraper settings from ``config.yaml`` in the current working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty,
        so callers can always use ``.get``.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load yields None for an empty document.
        return yaml.safe_load(f) or {}
def extract_all_reviews_js(driver):
    """Extract ALL reviews using JavaScript - single fast operation.

    Runs one script in the page that reads every review element and returns
    plain dicts, then adds a ``review_id`` to each of them.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        List of review dicts with ``author``, ``rating`` (None when it could
        not be read), ``text``, ``date_text``, ``avatar_url``, ``profile_url``
        and ``review_id``. An empty list when extraction fails.
    """
    # Local import keeps this fix self-contained within the function.
    import hashlib

    # The JavaScript is kept flush-left so the injected text stays unchanged.
    # NOTE(review): profile_url is filled from the button's data-review-id
    # attribute, which looks like a review identifier rather than a profile
    # link - confirm against the page markup.
    extract_script = """
const reviews = [];
const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
for (let i = 0; i < elements.length; i++) {
const elem = elements[i];
const review = {};
try {
// Author
const authorElem = elem.querySelector('div.d4r55');
review.author = authorElem ? authorElem.textContent.trim() : null;
// Rating
const ratingElem = elem.querySelector('span.kvMYJc');
if (ratingElem) {
const ariaLabel = ratingElem.getAttribute('aria-label');
if (ariaLabel) {
const match = ariaLabel.match(/\\d+/);
review.rating = match ? parseFloat(match[0]) : null;
}
}
// Text
const textElem = elem.querySelector('span.wiI7pd');
review.text = textElem ? textElem.textContent.trim() : null;
// Date
const dateElem = elem.querySelector('span.rsqaWe');
review.date_text = dateElem ? dateElem.textContent.trim() : null;
// Avatar
const avatarElem = elem.querySelector('img.NBa7we');
review.avatar_url = avatarElem ? avatarElem.src : null;
// Profile URL
const profileElem = elem.querySelector('button.WEBjve');
review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
if (review.author && review.date_text) {
reviews.push(review);
}
} catch (e) {
// Skip this review
}
}
return reviews;
"""
    try:
        # A script that returns nothing is treated as "no reviews".
        reviews_data = driver.execute_script(extract_script) or []
        reviews = []
        for review_data in reviews_data:
            # The script leaves 'rating' unset when no rating element or
            # label exists; make the key always present for consumers.
            review_data.setdefault('rating', None)
            # Fix: built-in hash() of a str is randomized per process, so
            # IDs changed between runs. A digest keeps them stable.
            key = f"{review_data['author']}|{review_data['date_text']}"
            digest = hashlib.sha256(key.encode('utf-8')).hexdigest()[:16]
            review_data['review_id'] = f"review_{digest}"
            reviews.append(review_data)
        return reviews
    except Exception as e:
        print(f" Error in JavaScript extraction: {e}")
        return []
def dom_only_fast_scrape():
    """Scrape reviews purely from the DOM and save them to a JSON file.

    Reads ``url`` and ``headless`` from ``config.yaml``, opens the page in a
    SeleniumBase UC-mode browser, dismisses consent/cookie dialogs, opens the
    reviews tab, scrolls the reviews pane until no new review cards appear,
    extracts all cards through ``extract_all_reviews_js`` and writes them to
    ``google_reviews_dom_only_fast.json``.

    Returns:
        list[dict]: The extracted reviews. An empty list is returned when no
        URL is configured or the reviews pane cannot be found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        print("ERROR: No 'url' configured in config.yaml")
        return []
    print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    count_script = "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)  # Reduced from 2.0

        # Handle the GDPR consent page. Best effort: failures are ignored, but
        # only Exception is caught so Ctrl-C still interrupts the run.
        if 'consent.google.com' in driver.current_url:
            try:
                # Click "Accept all" / "Aceptar todo"
                consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]')
                if not consent_btns:
                    consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept"]')
                if consent_btns:
                    consent_btns[0].click()
                    time.sleep(1.5)  # Reduced from 2.0
            except Exception as exc:
                log.debug("Consent handling failed: %s", exc)

        # Dismiss cookie banner on Maps page (best effort)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.3)  # Reduced from 0.4
        except Exception as exc:
            log.debug("Cookie banner handling failed: %s", exc)

        # Click the reviews tab once; stop trying selectors after a click so
        # the tab is not clicked again by the next selector.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.3)  # Reduced from 0.4
                        clicked = True
                        break
            except Exception:
                continue
            if clicked:
                break

        # Wait for page stability
        time.sleep(0.8)  # Reduced from 1.0

        # Find the scrollable reviews pane: primary selector first, then a fallback.
        pane = None
        try:
            wait = WebDriverWait(driver, 3)
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # CRITICAL: Wait for initial reviews to load
        time.sleep(1.2)  # Reduced from 1.5

        # Setup scroll: keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll and VERIFY reviews are loading
        driver.execute_script(scroll_script)
        time.sleep(0.8)  # Reduced from 1.0
        initial_count = driver.execute_script(count_script)
        if initial_count < 5:
            # Reviews not loaded yet, wait more
            print(f" Waiting for reviews to load (found {initial_count})...")
            time.sleep(1.5)  # Reduced from 2.0
            driver.execute_script(scroll_script)
            time.sleep(0.8)
            initial_count = driver.execute_script(count_script)
        print(f"Scrolling to load all reviews (starting with {initial_count})...")

        # Fast scrolling to load all DOM elements.
        # No practical limit - the loop stops via idle detection.
        max_scrolls = 999999
        idle_count = 0
        last_scroll_pos = 0
        for i in range(max_scrolls):
            # Get current review count
            current_count = driver.execute_script(count_script)
            # Scroll to load more
            prev_count = current_count
            driver.execute_script(scroll_script)

            # SMART WAIT: poll until new reviews appear instead of a fixed delay.
            max_wait = 1.0   # Maximum 1 second
            wait_step = 0.05  # Check every 50ms
            waited = 0
            while waited < max_wait:
                time.sleep(wait_step)
                waited += wait_step
                new_count = driver.execute_script(count_script)
                # If reviews loaded, continue immediately!
                if new_count > prev_count:
                    break
                # No new reviews after 0.3s: check whether the pane still moved.
                if waited >= 0.3 and new_count == prev_count:
                    scroll_pos = driver.execute_script("return arguments[0].scrollTop;", pane)
                    if scroll_pos == last_scroll_pos:
                        idle_count += 1
                        if idle_count >= 3:
                            print(f" Reached end at {new_count} reviews")
                            break
                    last_scroll_pos = scroll_pos
                    break
            current_count = new_count

            # Progress logging every 10 scrolls
            if (i + 1) % 10 == 0:
                print(f" {current_count} review elements loaded...")

            # Track for idle detection: stop after repeated scrolls without growth.
            if current_count == prev_count:
                idle_count += 1
                if idle_count >= 3:
                    break
            else:
                idle_count = 0

        # Shorter final scroll
        for _ in range(2):  # Reduced from 3
            driver.execute_script(scroll_script)
            time.sleep(0.3)  # Reduced from 0.4
        scroll_time = time.time() - start_time
        print(f" Scrolling complete in {scroll_time:.2f}s")

        # Extract ALL reviews using JavaScript (fast!)
        print("Extracting reviews with JavaScript...")
        extract_start = time.time()
        all_reviews = extract_all_reviews_js(driver)
        extract_time = time.time() - extract_start
        print(f" Extraction complete in {extract_time:.2f}s")
        elapsed = time.time() - start_time

        # NOTE(review): 244 and 155 are hard-coded; presumably the expected
        # review count and the old scraper's runtime in seconds for the
        # benchmark listing - confirm before reusing with another URL.
        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f" - Scrolling: {scroll_time:.2f}s")
        print(f" - Extraction: {extract_time:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if len(all_reviews) >= 244:
            print("🎯 Got ALL 244 reviews!")
        elif len(all_reviews) >= 240:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews")
        print()

        # Save
        with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print("💾 Saved to google_reviews_dom_only_fast.json")
        if all_reviews:
            sample = all_reviews[0]
            # .get(): a card without a rating element may carry no 'rating' key.
            print(f"\nSample: {sample['author']} - {sample.get('rating')}")
        return all_reviews
    finally:
        # Always close the browser, even on errors or early returns.
        try:
            driver.quit()
        except Exception:
            pass
if __name__ == '__main__':
    # Exit status: 0 when at least one review was scraped, 1 otherwise
    # (including user interruption and unexpected errors).
    try:
        scraped = dom_only_fast_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if scraped else 1)

View File

@@ -1,346 +0,0 @@
#!/usr/bin/env python3
"""
Fast API-First Scraper - Optimized version of start.py
Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Scroll rapidly JUST to trigger API calls (~15 seconds)
3. Collect all API responses during scrolling
4. Parse reviews from API responses
5. Skip DOM parsing entirely
6. Exit immediately
Expected time: ~30-40 seconds for 244 reviews (vs 155 seconds)
Speed improvement: ~4-5x faster!
"""
import sys
import yaml
import logging
import time
import json
from pathlib import Path
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Root logger at INFO with a compact "[LEVEL] message" format; `log` is this
# module's logger.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
def load_config():
    """Load configuration from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty
        (``yaml.safe_load`` returns ``None`` for an empty document, which
        would otherwise break the callers' ``config.get(...)`` lookups).
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, matching how the scripts write their JSON output.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def fast_scrape():
    """Scrape reviews from intercepted API responses and save them to JSON.

    Reads ``url`` and ``headless`` from ``config.yaml``, opens the page in a
    SeleniumBase UC-mode browser, opens the reviews tab, installs a
    ``GoogleMapsAPIInterceptor`` and scrolls the reviews pane to trigger API
    calls. Reviews are parsed from the intercepted responses only (no DOM
    parsing), de-duplicated by ``review_id`` and written to
    ``google_reviews_fast.json``.

    Returns:
        list[dict]: The collected reviews. An empty list is returned when no
        URL is configured or the reviews pane cannot be found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        log.error("No 'url' configured in config.yaml")
        return []
    log.info("="*60)
    log.info("FAST API-FIRST SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: API-first (skip DOM parsing)")
    log.info("="*60 + "\n")
    start_time = time.time()
    api_reviews = {}

    def _store(parsed):
        """Add parsed reviews not seen before to ``api_reviews``."""
        for review in parsed:
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = {
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                }

    # Create driver using SeleniumBase UC Mode (like original scraper)
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate to reviews
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort; only Exception is caught so Ctrl-C
        # still interrupts the run).
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                log.info("✓ Cookie dialog dismissed")
                time.sleep(1)
        except Exception as exc:
            log.debug("Cookie dialog handling failed: %s", exc)

        # Click reviews tab - comprehensive approach
        log.info("Step 2: Opening reviews tab...")
        # Review keywords for multiple languages
        review_keywords = [
            'reviews', 'review', 'reseñas', 'reseña', 'opiniones', 'avis',
            'bewertungen', 'recensioni', 'avaliações', 'ביקורות'
        ]
        clicked = False
        tab_selectors = [
            '.LRkQ2',                # Primary
            '.hh2c6',                # Alternative
            '[data-tab-index="1"]',  # Tab index
            'button[role="tab"]',    # Button tabs
            'div[role="tab"]',       # Div tabs
        ]
        # Try each selector until one tab has been clicked
        for selector in tab_selectors:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    try:
                        # Check if this is the reviews tab
                        text = (tab.text or '').lower()
                        aria_label = (tab.get_attribute('aria-label') or '').lower()
                        if any(keyword in text or keyword in aria_label for keyword in review_keywords):
                            log.info(f"Found reviews tab with selector {selector}: '{tab.text}'")
                            # Scroll into view
                            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", tab)
                            time.sleep(0.5)
                            # Click with JavaScript (most reliable)
                            driver.execute_script("arguments[0].click();", tab)
                            time.sleep(1.5)
                            log.info("✓ Reviews tab clicked")
                            clicked = True
                            break
                    except Exception:
                        continue
                if clicked:
                    break
            except Exception:
                continue
        if not clicked:
            log.warning("Could not find/click reviews tab - may already be on reviews or page structure changed")

        # CRITICAL: Wait after clicking reviews tab for page to load
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Find reviews pane, from the most specific selector to the most generic
        log.info("Step 3: Finding reviews pane...")
        log.info(f"Current URL: {driver.current_url}")
        pane = None
        pane_selectors = [
            'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Primary
            'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Without role="main"
            'div.m6QErb.WNBkOb.XiKgde',  # Alternative class combination
            'div[role="main"] div.m6QErb.XiKgde',  # Simplified with XiKgde
            'div.m6QErb.DxyBCb.XiKgde',  # Another variant
            'div[role="main"] div.m6QErb',  # Simplified version
            'div.m6QErb.DxyBCb',  # Even more simplified
            'div[role="main"]',  # Most generic
        ]
        for selector in pane_selectors:
            try:
                log.info(f"Trying selector: {selector}")
                wait = WebDriverWait(driver, 5)
                pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                log.info(f"✓ Found reviews pane with: {selector}")
                break
            except TimeoutException:
                log.debug(f"Pane not found with selector: {selector}")
                continue
        if not pane:
            log.error("Could not find reviews pane after all attempts!")
            log.error(f"Final URL: {driver.current_url}")
            # Save screenshot for debugging
            try:
                screenshot_path = 'pane_not_found.png'
                driver.save_screenshot(screenshot_path)
                log.info(f"Screenshot saved to {screenshot_path}")
            except Exception as exc:
                log.debug("Could not save screenshot: %s", exc)
            return []

        # Wait for initial reviews to load
        log.info("Waiting for initial reviews to render...")
        time.sleep(3)
        # Check if any review cards are present (informational only)
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf')
            log.info(f"Found {len(cards)} initial review cards")
        except Exception:
            log.warning("Could not find initial review cards")

        # Step 4: Setup API interceptor (AFTER finding pane)
        log.info("Step 4: Setting up API interception...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        try:
            interceptor.setup_interception()
            interceptor.inject_response_interceptor()
            log.info("✓ API interceptor ready - capturing network responses")
        except Exception as e:
            # exc_info attaches the traceback to the log record.
            log.warning(f"Failed to setup interceptor: {e}", exc_info=True)
        time.sleep(2)  # Extra wait for interception to be fully active
        log.info("")

        # Step 5: Rapid scrolling to trigger API calls
        log.info("="*60)
        log.info("Step 5: Rapid scrolling to trigger API calls")
        log.info("="*60)
        # Setup scroll script (same as original scraper)
        try:
            driver.execute_script("window.scrollablePane = arguments[0];", pane)
            scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
            log.info("✓ Scroll script setup complete")
        except Exception as e:
            log.warning(f"Error setting up scroll script: {e}")
            scroll_script = "window.scrollBy(0, 300);"  # Fallback

        # Verify interceptor is active (diagnostic output only)
        try:
            is_injected = driver.execute_script("return window.__reviewInterceptorInjected === true;")
            stats = driver.execute_script("return window.__interceptorStats;")
            queue_length = driver.execute_script(
                "return window.__interceptedResponses ? window.__interceptedResponses.length : -1;")
            log.info(f"Interceptor status: injected={is_injected}, queue={queue_length}, stats={stats}")
        except Exception as e:
            log.warning(f"Could not check interceptor status: {e}")

        # Trigger initial API call
        log.info("Triggering initial API call...")
        driver.execute_script(scroll_script)
        time.sleep(2)  # Wait for first API response
        log.info("")

        # We need about 25 API calls for 244 reviews (10 per call)
        # Scroll rapidly - no DOM parsing!
        target_reviews = 240
        max_scrolls = 30
        for i in range(max_scrolls):
            # Fast scroll
            driver.execute_script(scroll_script)
            time.sleep(0.3)  # Optimal timing - fast but captures all responses
            # Collect API responses
            try:
                responses = interceptor.get_intercepted_responses()
                if i == 5:  # Debug on scroll 5
                    log.info(f"DEBUG: Got {len(responses)} responses from interceptor")
                    # Check browser console
                    try:
                        console_logs = driver.get_log('browser')
                        interceptor_logs = [
                            entry for entry in console_logs
                            if 'API Interceptor' in entry.get('message', '')
                        ]
                        if interceptor_logs:
                            log.info("DEBUG: Interceptor console logs:")
                            for entry in interceptor_logs[-10:]:  # Last 10
                                log.info(f" {entry['message']}")
                        else:
                            log.info("DEBUG: No interceptor logs in console")
                    except Exception as e:
                        log.warning(f"Could not get console logs: {e}")
                if responses:
                    parsed = interceptor.parse_reviews_from_responses(responses)
                    if i == 5:  # Debug on scroll 5
                        log.info(f"DEBUG: Parsed {len(parsed)} reviews from responses")
                    _store(parsed)
                    if parsed:
                        log.info(f"Scroll {i+1}: +{len(parsed)} reviews | Total: {len(api_reviews)}")
                # Exit early if we have enough
                if len(api_reviews) >= target_reviews:
                    log.info(f"\n✓ Reached target of {target_reviews} reviews!")
                    break
            except Exception as e:
                log.error(f"Error collecting API responses: {e}", exc_info=True)
            # Quick progress update
            if (i + 1) % 5 == 0:
                log.info(f"Progress: {i+1}/{max_scrolls} scrolls, {len(api_reviews)} reviews collected")

        # Final drain: pick up responses that arrived after the last in-loop
        # collection. Duplicates are filtered by review_id in _store().
        if len(api_reviews) < target_reviews:
            try:
                responses = interceptor.get_intercepted_responses()
                if responses:
                    _store(interceptor.parse_reviews_from_responses(responses))
            except Exception as e:
                log.warning(f"Final API collection failed: {e}")

        elapsed = time.time() - start_time
        # Convert to list
        all_reviews = list(api_reviews.values())
        log.info("\n" + "="*60)
        log.info("✅ FAST SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"Scrolls performed: {i+1}")
        log.info(f"Time elapsed: {elapsed:.2f} seconds")
        if all_reviews:
            log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save results
        output_file = 'google_reviews_fast.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        log.info(f"💾 Saved {len(all_reviews)} reviews to {output_file}")

        # Show sample
        if all_reviews:
            log.info("\n📝 Sample review:")
            sample = all_reviews[0]
            log.info(f" Author: {sample['author']}")
            log.info(f" Rating: {sample['rating']}")
            log.info(f" Date: {sample['date_text']}")
            if sample['text']:
                log.info(f" Text: {sample['text'][:80]}...")

        # Stats comparison against the hard-coded baseline of the old approach
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old approach: ~155 seconds for 244 reviews")
        log.info(f"Fast approach: ~{elapsed:.0f} seconds for {len(all_reviews)} reviews")
        if elapsed > 0:
            log.info(f"Improvement: {155/elapsed:.1f}x faster! 🚀")
        log.info("="*60 + "\n")
        return all_reviews
    finally:
        # Always close the driver
        try:
            driver.quit()
        except Exception:
            pass
if __name__ == '__main__':
    # Exit status: 0 when at least one review was scraped, 1 otherwise
    # (including user interruption and unexpected errors).
    try:
        scraped = fast_scrape()
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        log.error(f"Fatal error: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if scraped else 1)

View File

@@ -1,307 +0,0 @@
#!/usr/bin/env python3
"""
FASTEST STABLE Scraper - Best of both worlds.
Strategy:
1. Ultra-fast API scrolling (proven stable) → 234 reviews in ~19s
2. Instant JavaScript DOM extraction → 10 missing reviews in ~0.5s
3. Total: ~20 seconds for all 244 reviews with 100% stability
Combines stability of API approach with speed of JavaScript extraction.
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# The root logger is configured at WARNING, so records below WARNING from other
# loggers are dropped; this module's own logger is then raised to INFO.
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty
        (``yaml.safe_load`` returns ``None`` for an empty document, which
        would otherwise break the callers' ``config.get(...)`` lookups).
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, matching how the scripts write their JSON output.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def extract_missing_reviews_js(driver, max_reviews=25):
    """Extract the first ``max_reviews`` review cards from the DOM via JavaScript.

    Args:
        driver: Object exposing ``execute_script(script, *args)``; the script
            returns a list of dicts with the keys ``author``, ``rating``,
            ``text``, ``date_text``, ``avatar_url`` and ``profile_url``.
        max_reviews: Upper bound on the number of cards inspected, counted
            from the first card in document order.

    Returns:
        list[dict]: The extracted reviews, each with an added ``review_id``
        prefixed with ``dom_``. An empty list is returned when script
        execution fails or yields nothing.
    """
    # Local import keeps this block self-contained.
    import hashlib

    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
    const maxCount = Math.min(arguments[0], elements.length);
    for (let i = 0; i < maxCount; i++) {
        const elem = elements[i];
        const review = {};
        try {
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;
            // Always present in the result, null when it cannot be read.
            review.rating = null;
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;
            // NOTE(review): this stores the button's data-review-id attribute
            // under profile_url - confirm that is the intended field.
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip
        }
    }
    return reviews;
    """
    try:
        reviews_data = driver.execute_script(extract_script, max_reviews)
    except Exception as e:
        # Report the failure instead of swallowing it silently.
        print(f" Error in JavaScript extraction: {e}")
        return []

    reviews = []
    for review_data in reviews_data or []:
        # Guarantee the key exists so callers can index review['rating'].
        review_data.setdefault('rating', None)
        # Built-in hash() of a str is salted per process, so it cannot be used
        # for IDs that must be stable across runs; derive them from a digest.
        key = review_data['author'] + review_data['date_text']
        digest = hashlib.sha256(key.encode('utf-8')).hexdigest()[:16]
        review_data['review_id'] = f"dom_{digest}"
        reviews.append(review_data)
    return reviews
def _merge_api_reviews(interceptor, api_reviews):
    """Parse pending intercepted responses and add unseen reviews to *api_reviews*."""
    responses = interceptor.get_intercepted_responses()
    if not responses:
        return
    for review in interceptor.parse_reviews_from_responses(responses):
        if review.review_id and review.review_id not in api_reviews:
            api_reviews[review.review_id] = {
                'review_id': review.review_id,
                'author': review.author,
                'rating': review.rating,
                'text': review.text,
                'date_text': review.date_text,
                'avatar_url': review.avatar_url,
                'profile_url': review.profile_url,
            }


def fastest_stable_scrape():
    """Collect reviews from intercepted API responses, then fill gaps from the DOM.

    Phase 1 scrolls the reviews pane and gathers reviews from the responses
    captured by ``GoogleMapsAPIInterceptor``. Phase 2 runs when fewer than the
    hard-coded 244 reviews were collected: it extracts the top review cards
    from the DOM via ``extract_missing_reviews_js`` and adds those whose
    (author, first 20 characters of date text) pair is not already present.
    The result is written to ``google_reviews_fastest_stable.json``.

    Returns:
        list[dict]: The collected reviews. An empty list is returned when no
        URL is configured or the reviews pane cannot be found.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        print("ERROR: No 'url' configured in config.yaml")
        return []
    print("FASTEST STABLE SCRAPER - Ultra-fast API + instant JS...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss cookies (best effort; only Exception is caught so Ctrl-C
        # still interrupts the run).
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as exc:
            log.debug("Cookie dialog handling failed: %s", exc)

        # Click the reviews tab once; stop trying selectors after a click so
        # the tab is not clicked again by the next selector.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        clicked = True
                        break
            except Exception:
                continue
            if clicked:
                break

        # Wait for stability
        time.sleep(1.0)

        # Find the scrollable reviews pane: primary selector first, then a fallback.
        pane = None
        try:
            wait = WebDriverWait(driver, 3)
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Wait for initial reviews to load (critical for stability)
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll: keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("[Phase 1] Ultra-fast API scrolling...")
        target_reviews = 240
        max_scrolls = 35
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)  # Optimal timing
            # API collection (best effort per scroll)
            try:
                _merge_api_reviews(interceptor, api_reviews)
                if (i + 1) % 10 == 0:
                    print(f" {len(api_reviews)} reviews...")
                if len(api_reviews) >= target_reviews:
                    break
            except Exception as exc:
                log.debug("API collection failed on scroll %d: %s", i + 1, exc)

        # Final API collection: pick up responses that arrived after the last scroll.
        try:
            _merge_api_reviews(interceptor, api_reviews)
        except Exception as exc:
            log.debug("Final API collection failed: %s", exc)
        api_time = time.time() - start_time
        print(f" ✅ Phase 1: {len(api_reviews)} reviews in {api_time:.2f}s")

        # [Phase 2] Instant JavaScript extraction for missing reviews
        # NOTE(review): 244 is hard-coded; presumably the expected review count
        # of the benchmark listing - confirm before reusing with another URL.
        missing = 244 - len(api_reviews)
        if missing > 0:
            print(f"\n[Phase 2] Fast JS extraction for {missing} missing reviews...")
            # Scroll to top (missing reviews likely at top)
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
            time.sleep(0.3)
            # Extract with JavaScript
            dom_reviews = extract_missing_reviews_js(driver, max_reviews=min(missing + 10, 25))
            # Build API keys for deduplication
            api_keys = {
                (api_review.get('author', ''), (api_review.get('date_text', '') or '')[:20])
                for api_review in api_reviews.values()
            }
            # Add unique DOM reviews
            dom_added = 0
            for dom_review in dom_reviews:
                dom_key = (dom_review.get('author', ''), (dom_review.get('date_text', '') or '')[:20])
                if dom_key not in api_keys:
                    api_reviews[dom_review['review_id']] = dom_review
                    dom_added += 1
            dom_time = time.time() - start_time - api_time
            print(f" ✅ Phase 2: +{dom_added} reviews in {dom_time:.2f}s")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        # 155 is the hard-coded baseline runtime (seconds) used for the speedup figure.
        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/244*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if len(all_reviews) >= 244:
            print("🎯 Got ALL 244 reviews!")
        elif len(all_reviews) >= 240:
            print(f"⚠️ Missing {244-len(all_reviews)} reviews")
        print()

        # Save
        with open('google_reviews_fastest_stable.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print("💾 Saved to google_reviews_fastest_stable.json")
        if all_reviews:
            sample = all_reviews[0]
            # .get(): a DOM-extracted review may carry no 'rating' key.
            print(f"\nSample: {sample['author']} - {sample.get('rating')}")
        return all_reviews
    finally:
        # Always close the browser, even on errors or early returns.
        try:
            driver.quit()
        except Exception:
            pass
if __name__ == '__main__':
    # Exit status: 0 when at least one review was scraped, 1 otherwise
    # (including user interruption and unexpected errors).
    try:
        scraped = fastest_stable_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if scraped else 1)

View File

@@ -1,286 +0,0 @@
#!/usr/bin/env python3
"""
Hybrid Parallel Scraper - Best of both worlds.
Strategy:
1. Open browser and get to reviews page (~15s)
2. Scroll quickly to collect ~5-10 continuation tokens (~5s)
3. Make parallel API calls in browser using JavaScript (~2-3s)
4. Total: ~22-25 seconds for 244 reviews
This approach:
- Uses browser's active session (no auth issues)
- Collects tokens sequentially (required by API)
- Makes parallel calls for remaining pages (fast!)
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Root logger at INFO with a compact "[LEVEL] message" format; `log` is this
# module's logger.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty
        (``yaml.safe_load`` returns ``None`` for an empty document, which
        would otherwise break the callers' ``config.get(...)`` lookups).
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, matching how the scripts write their JSON output.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def _review_to_dict(review):
    """Flatten a parsed review object into the plain dict that is saved to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def hybrid_parallel_scrape():
    """Hybrid approach: sequential token collection, then a parallel fetch.

    Phase 1 opens the page and locates the reviews pane and the place ID
    (taken from the ``!1s...`` segment of the current URL). Phase 2 scrolls a
    few times, collecting reviews and continuation tokens from intercepted
    API responses. Phase 3 fetches the pages for the collected tokens in
    parallel from inside the browser. All reviews are de-duplicated by
    ``review_id`` and written to ``google_reviews_hybrid.json``.

    Returns:
        list[dict]: The collected reviews. An empty list is returned when no
        URL is configured, the reviews pane cannot be found, or the place ID
        cannot be extracted.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        log.error("No 'url' configured in config.yaml")
        return []
    log.info("="*60)
    log.info("HYBRID PARALLEL SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Sequential tokens + Parallel fetch")
    log.info("="*60 + "\n")
    start_time = time.time()
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # PHASE 1: Setup (~15s)
        log.info("Phase 1: Browser setup...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort; only Exception is caught so Ctrl-C
        # still interrupts the run).
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(1)
        except Exception as exc:
            log.debug("Cookie dialog handling failed: %s", exc)

        # Click the reviews tab once; stop trying selectors after a click so
        # the tab is not clicked (and waited on) again by the next selector.
        review_keywords = ['reviews', 'review', 'reseñas']
        clicked = False
        for selector in ['.LRkQ2', '.hh2c6', 'button[role="tab"]']:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(2)
                        clicked = True
                        break
            except Exception:
                continue
            if clicked:
                break
        time.sleep(3)

        # Find pane
        pane = None
        for selector in ['div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
                         'div.m6QErb.WNBkOb.XiKgde']:
            try:
                wait = WebDriverWait(driver, 5)
                pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                break
            except Exception:
                continue
        if not pane:
            log.error("Could not find pane")
            return []
        time.sleep(2)

        # Extract place ID: the text between '!1s' and the next '!' in the URL.
        place_id = None
        current_url = driver.current_url
        if '!1s' in current_url:
            parts = current_url.split('!1s')
            if len(parts) > 1:
                place_id = parts[1].split('!')[0]
        if not place_id:
            log.error("Could not extract place ID")
            return []
        log.info(f"✓ Setup complete (place_id: {place_id})\n")

        # PHASE 2: Collect tokens via scrolling (~5s)
        log.info("Phase 2: Collecting continuation tokens...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1)

        # Setup scroll: keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Collect tokens by scrolling quickly
        tokens = []
        all_reviews = {}
        for i in range(8):  # 8 scrolls to get ~8 tokens
            driver.execute_script(scroll_script)
            time.sleep(0.2)  # Very fast scrolling
            # Collect responses
            responses = interceptor.get_intercepted_responses()
            if responses:
                parsed = interceptor.parse_reviews_from_responses(responses)
                for review in parsed:
                    if review.review_id and review.review_id not in all_reviews:
                        all_reviews[review.review_id] = _review_to_dict(review)
                # Extract continuation token from raw response: strip the
                # ")]}'" prefix, then take element 1 of the JSON array when
                # it is a string.
                for resp in responses:
                    try:
                        body = resp.get('body', '')
                        if body.startswith(")]}'"):
                            body = body[4:]
                        data = json.loads(body)
                        if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                            token = data[1]
                            if token and token not in tokens:
                                tokens.append(token)
                    except Exception:
                        # Responses that are not parseable JSON carry no token.
                        pass
        log.info(f"✓ Collected {len(tokens)} continuation tokens")
        log.info(f"✓ Got {len(all_reviews)} reviews from scrolling\n")

        # PHASE 3: Parallel fetch remaining pages (~2-3s)
        if tokens:
            log.info("Phase 3: Parallel fetch of remaining pages...")
            # execute_async_script appends a completion callback as the LAST
            # argument and waits until it is invoked. The script therefore
            # reports its result through that callback; a top-level
            # `return await ...` is not valid in the non-async wrapper function
            # and would never signal completion.
            parallel_script = """
            async function fetchPages(placeId, tokens) {
                const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
                const results = [];
                const promises = tokens.map((token, idx) => {
                    const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
                    const params = new URLSearchParams({
                        authuser: '0',
                        hl: 'es',
                        gl: 'es',
                        pb: pb
                    });
                    return fetch(`${baseUrl}?${params}`)
                        .then(r => r.text())
                        .then(text => {
                            const body = text.startsWith(")]}'") ? text.substring(4) : text;
                            return {idx, data: JSON.parse(body)};
                        })
                        .catch(e => null);
                });
                const settled = await Promise.all(promises);
                return settled.filter(r => r !== null);
            }
            const done = arguments[arguments.length - 1];
            fetchPages(arguments[0], arguments[1]).then(done, () => done([]));
            """
            try:
                parallel_start = time.time()
                # Limit to 15 parallel requests
                results = driver.execute_async_script(parallel_script, place_id, tokens[:15]) or []
                parallel_time = time.time() - parallel_start
                log.info(f"✓ Parallel fetch completed in {parallel_time:.2f}s")
                log.info(f" Received {len(results)} responses")
                # Parse parallel results
                for result in results:
                    if result and 'data' in result:
                        try:
                            parsed = interceptor._parse_listugcposts_response(result['data'])
                            for review in parsed:
                                if review.review_id and review.review_id not in all_reviews:
                                    all_reviews[review.review_id] = _review_to_dict(review)
                        except Exception as e:
                            log.debug(f"Parse error: {e}")
                log.info(f"✓ Total reviews after parallel fetch: {len(all_reviews)}\n")
            except Exception as e:
                log.warning(f"Parallel fetch failed: {e}")

        reviews_list = list(all_reviews.values())
        elapsed = time.time() - start_time
        log.info("="*60)
        log.info("✅ HYBRID PARALLEL SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save
        with open('google_reviews_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)
        log.info(f"💾 Saved {len(reviews_list)} reviews to google_reviews_hybrid.json")
        if reviews_list:
            log.info("\n📝 Sample:")
            s = reviews_list[0]
            log.info(f" {s['author']} - {s['rating']}★ - {s['date_text']}")

        # Comparison against hard-coded figures of the earlier approaches
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old DOM: ~155s for 244 reviews (1.0x)")
        log.info("Fast scrolling: ~29s for 234 reviews (5.3x)")
        log.info(f"Hybrid parallel: ~{elapsed:.0f}s for {len(reviews_list)} reviews ({155/elapsed:.1f}x)! 🚀")
        log.info("="*60 + "\n")
        return reviews_list
    finally:
        # Always close the browser, even on errors or early returns.
        try:
            driver.quit()
        except Exception:
            pass
if __name__ == '__main__':
    # Exit status: 0 when at least one review was scraped, 1 otherwise
    # (including user interruption and unexpected errors).
    try:
        scraped = hybrid_parallel_scrape()
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        log.error(f"Fatal error: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if scraped else 1)

View File

@@ -1,318 +0,0 @@
#!/usr/bin/env python3
"""
OPTIMIZED HYBRID Scraper - True parallel with minimal overhead.
Strategy:
1. Ultra-fast API scrolling (no DOM parsing during scroll!)
2. Quick DOM count check near end (minimal overhead)
3. If needed, targeted DOM parse at very end for missing reviews
4. Goal: ~22-25s for all 244 reviews
Key: Keep scroll loop FAST, only parse DOM if absolutely needed at the very end.
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Root logging is configured at WARNING with a "[LEVEL] message" format, while
# this module's own logger is raised to INFO
# (presumably to keep third-party library output quiet — confirm).
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``.get`` unconditionally.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII values are read identically on every OS.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def quick_dom_parse_top_reviews(driver, count=15):
    """Parse the first ``count`` review cards currently present in the DOM.

    Args:
        driver: Browser driver; only ``find_elements`` is called on it.
        count: Maximum number of leading review elements to inspect.

    Returns:
        list[dict]: One dict per card that has an author, always carrying the
        keys ``author``, ``rating``, ``text``, ``date_text``, ``avatar_url``,
        ``profile_url`` and ``review_id``. A field that cannot be read is
        ``None``. Lookup failures never raise; they only yield fewer results.
    """
    import hashlib  # local import: only needed for the synthetic review ids

    def read(card, selector, attribute=None):
        """Return the text (or ``attribute``) of the first match, else ``None``."""
        try:
            node = card.find_element(By.CSS_SELECTOR, selector)
            return node.get_attribute(attribute) if attribute else node.text
        except Exception:
            return None

    def parse_rating(label):
        """Return the leading number of an aria-label, else ``None``."""
        try:
            return float(label.split()[0])
        except (AttributeError, IndexError, ValueError):
            # Missing label, empty label, or a non-numeric first word.
            return None

    dom_reviews = []
    try:
        # Only the first N cards are inspected (the ones most likely to be
        # missing from the API capture).
        cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:count]
    except Exception as exc:
        log.debug(f"DOM review lookup failed: {exc}")
        return dom_reviews

    for card in cards:
        author = read(card, 'div.d4r55')
        if not author:
            # Without an author there is nothing to build an id from.
            continue
        date_text = read(card, 'span.rsqaWe')
        # hashlib instead of the built-in hash(): str hashes are salted per
        # process, which made the ids written to disk differ between runs.
        digest = hashlib.sha1(f"{author}|{date_text}".encode('utf-8')).hexdigest()[:16]
        dom_reviews.append({
            'author': author,
            'rating': parse_rating(read(card, 'span.kvMYJc', 'aria-label')),
            'text': read(card, 'span.wiI7pd'),
            'date_text': date_text,
            'avatar_url': read(card, 'img.NBa7we', 'src'),
            # NOTE(review): this stores the button's "data-review-id"
            # attribute under the key "profile_url" — confirm that is intended.
            'profile_url': read(card, 'button.WEBjve', 'data-review-id'),
            'review_id': f"dom_{digest}",
        })
    return dom_reviews
def optimized_hybrid_scrape(expected_total=244):
    """Scrape reviews by fast API-intercepted scrolling plus a small DOM top-up.

    The place URL and the ``headless`` flag come from ``load_config()``. The
    reviews pane is scrolled a fixed number of times while reviews are
    collected from intercepted API responses. If fewer than
    ``expected_total`` were captured, a few cards at the top of the DOM are
    parsed and those not already present (matched on author + date) are
    added. The result is written to ``google_reviews_optimized_hybrid.json``.

    Args:
        expected_total: Number of reviews the place is expected to have. It
            drives the DOM top-up and the summary; the default of 244 is the
            value that used to be hard-coded.

    Returns:
        list[dict]: The collected reviews, or ``[]`` when no URL is configured
        or the reviews pane cannot be located.
    """
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Previously this crashed with a TypeError on url[:80].
        print("ERROR: 'url' is missing from config.yaml")
        return []

    print("OPTIMIZED HYBRID SCRAPER - Ultra-fast API + minimal DOM...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}

    def collect_api_reviews(interceptor):
        """Merge newly intercepted API reviews into ``api_reviews`` (best effort)."""
        try:
            responses = interceptor.get_intercepted_responses()
            if not responses:
                return
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
        except Exception as exc:
            # A failed poll must not abort the scroll loop.
            log.debug(f"API collection failed: {exc}")

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss the cookie dialog if one is shown
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as exc:
            log.debug(f"Cookie dialog not dismissed: {exc}")

        # Click the reviews tab; stop at the first match so a second selector
        # cannot click the tab again.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        clicked = True
                        break
            except Exception:
                continue
            if clicked:
                break

        # Brief wait for the reviews page (balance speed vs stability)
        time.sleep(1.0)

        # Find the scrollable pane: most common selector first, then a fallback
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Set up the API interceptor immediately
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)  # Minimal wait for interceptor

        # Set up scrolling
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger the initial scroll
        driver.execute_script(scroll_script)
        time.sleep(0.3)

        print("Ultra-fast API scrolling...")
        # API-only collection while scrolling (no DOM parsing in the loop)
        max_scrolls = 35
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)
            collect_api_reviews(interceptor)
            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")

        # Final API collection
        collect_api_reviews(interceptor)
        api_time = time.time() - start_time
        print(f" ✅ API complete: {len(api_reviews)} reviews in {api_time:.2f}s")

        # Targeted DOM parse ONLY if reviews are missing
        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\nQuick DOM parse for {missing} missing reviews...")
            # Scroll to top
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
            time.sleep(0.5)

            # Quick parse of the top reviews (most likely to be missing)
            dom_reviews = quick_dom_parse_top_reviews(driver, count=min(missing + 5, 20))

            # DOM entries carry synthetic ids, so duplicates are detected on
            # (author, first 20 characters of the date text) instead.
            api_keys = {
                (r.get('author', ''), (r.get('date_text', '') or '')[:20])
                for r in api_reviews.values()
            }
            dom_added = 0
            for dom_review in dom_reviews:
                dom_key = (
                    dom_review.get('author', ''),
                    (dom_review.get('date_text', '') or '')[:20],
                )
                if dom_key not in api_keys and dom_review.get('review_id'):
                    api_reviews[dom_review['review_id']] = dom_review
                    dom_added += 1
            dom_time = time.time() - start_time - api_time
            print(f" ✅ DOM complete: +{dom_added} reviews in {dom_time:.2f}s")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        total = len(all_reviews)

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        if expected_total > 0:
            print(f"Reviews: {total}/{expected_total} ({total/expected_total*100:.1f}%)")
        else:
            print(f"Reviews: {total}")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        # 155 is the baseline duration in seconds this script compares against.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if total >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif total >= expected_total - 4:
            print(f"⚠️ Missing {expected_total-total} reviews")
        print()

        # Save
        with open('google_reviews_optimized_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print("💾 Saved to google_reviews_optimized_hybrid.json")

        if all_reviews:
            # .get(): a DOM-sourced entry may lack a key an API entry has.
            sample = all_reviews[0]
            print(f"\nSample: {sample.get('author')} - {sample.get('rating')}")
        return all_reviews
    finally:
        try:
            driver.quit()
        except Exception:
            pass
if __name__ == '__main__':
    # Script entry point: the exit status is 0 only when the scrape returned
    # at least one review; interruption and any other failure exit with 1.
    try:
        reviews = optimized_hybrid_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        import traceback

        print(f"ERROR: {exc}")
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if reviews else 1)

View File

@@ -1,360 +0,0 @@
#!/usr/bin/env python3
"""
Parallel API Scraper - Capture session, then parallel API calls.
Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Capture cookies and place ID from active session (~2 seconds)
3. Make parallel API calls using requests (~5-10 seconds)
4. Close browser immediately
Expected time: ~20-30 seconds for 244 reviews (vs 155 seconds)
Speed improvement: ~5-7x faster!
"""
import sys
import yaml
import logging
import time
import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Configure logging at INFO with a "[LEVEL] message" line format.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)
def load_config():
    """Load configuration from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``.get`` unconditionally.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    # Explicit UTF-8: the default text encoding depends on the platform
    # locale, which can mis-decode non-ASCII values.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def capture_session(url: str, headless: bool = False):
    """Capture cookies and the place ID from a short-lived browser session.

    Opens ``url``, dismisses the cookie dialog, clicks the reviews tab, reads
    the place ID from the resulting URL (the text between ``!1s`` and the
    next ``!``) and copies every browser cookie plus the user agent into a
    ``requests.Session``. The browser is always closed before returning.

    Args:
        url: Google Maps place URL to open.
        headless: Passed through to the browser driver.

    Returns:
        ``(session, place_id, interceptor)`` on success, where ``interceptor``
        is created without a driver and is only meant for parsing responses.
        ``(None, None, None)`` when no place ID can be extracted.
    """
    log.info("="*60)
    log.info("STEP 1: Capturing session from browser")
    log.info("="*60)
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Navigate to place
        log.info("Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort). `except Exception` rather than a bare
        # `except:` so Ctrl-C still reaches the script's interrupt handler.
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                log.info("✓ Cookie dialog dismissed")
                time.sleep(1)
        except Exception as exc:
            log.debug(f"Cookie dialog not dismissed: {exc}")

        # Click reviews tab: first tab whose text or aria-label mentions reviews
        log.info("Opening reviews tab...")
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
        clicked = False
        for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria_label = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria_label for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(2)
                        log.info("✓ Reviews tab clicked")
                        clicked = True
                        break
            except Exception:
                continue
            if clicked:
                break

        # Wait for reviews to load
        time.sleep(3)

        # Extract place ID from URL
        current_url = driver.current_url
        place_id = None
        if '!1s' in current_url:
            place_id = current_url.split('!1s', 1)[1].split('!', 1)[0]
            log.info(f"✓ Extracted place ID: {place_id}")
        if not place_id:
            log.error("Could not extract place ID from URL")
            return None, None, None

        # Capture ALL cookies using CDP
        log.info("Capturing cookies via CDP...")
        cdp_cookies = driver.execute_cdp_cmd('Network.getAllCookies', {})
        browser_cookies = cdp_cookies.get('cookies', [])
        log.info(f"✓ Captured {len(browser_cookies)} cookies")

        # Get user agent so the HTTP session presents the same one
        user_agent = driver.execute_script("return navigator.userAgent")

        # Create session with cookies
        session = requests.Session()
        for cookie in browser_cookies:
            session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/'),
            )

        # Set headers
        session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
        })

        # Create interceptor for parsing only (no driver attached)
        interceptor = GoogleMapsAPIInterceptor(None)
        log.info("✓ Session captured successfully\n")
        return session, place_id, interceptor
    finally:
        # Close browser immediately - it is not needed once the session exists
        try:
            driver.quit()
            log.info("✓ Browser closed\n")
        except Exception:
            pass
def fetch_reviews_page(session, place_id, interceptor, continuation_token=None, *, hl='es', gl='es'):
    """Fetch a single page of reviews from the ``listugcposts`` RPC endpoint.

    Args:
        session: Session object; only ``get(url, params=..., timeout=...)``
            is called on it.
        place_id: Place identifier embedded in the ``pb`` request parameter.
        interceptor: Object providing ``_parse_listugcposts_response(data)``.
        continuation_token: Token returned with the previous page, or
            ``None`` to request the first page.
        hl: Value sent as the ``hl`` query parameter. Defaults to ``'es'``,
            the value that used to be hard-coded.
        gl: Value sent as the ``gl`` query parameter. Defaults to ``'es'``.

    Returns:
        ``(reviews, next_token)``. ``next_token`` is ``None`` when the
        response carries no string at index 1. Any HTTP or parsing failure is
        logged and reported as ``([], None)``.
    """
    # The first-page and follow-up requests differ only by the optional
    # "!2s<token>" field, so the template is built once.
    token_field = f"!2s{continuation_token}" if continuation_token else ""
    pb = (
        f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_field}"
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )
    params = {
        'authuser': '0',
        'hl': hl,
        'gl': gl,
        'pb': pb,
    }
    try:
        url = 'https://www.google.com/maps/rpc/listugcposts'
        response = session.get(url, params=params, timeout=10)
        if response.status_code != 200:
            log.error(f"API error {response.status_code}")
            return [], None

        body = response.text
        # The payload may be prefixed with ")]}'" which must be removed
        # before it parses as JSON.
        if body.startswith(")]}'"):
            body = body[4:].strip()
        data = json.loads(body)
        reviews = interceptor._parse_listugcposts_response(data)

        # Get next token
        next_token = None
        if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
            next_token = data[1]
        return reviews, next_token
    except Exception as e:
        # Best-effort boundary: callers treat an empty page as "no data".
        log.error(f"Request failed: {e}")
        return [], None
def scrape_all_parallel(session, place_id, interceptor, max_workers=5):
    """Collect reviews through the API: sequential token discovery, then a thread pool.

    The first page is fetched to obtain a continuation token. Up to four
    further pages are then fetched one after another, because each token is
    only known once the previous page has arrived. Tokens whose page has not
    been fetched successfully are finally requested through a thread pool.
    Reviews are de-duplicated on ``review_id``, falling back to
    ``"<author>_<date_text>"`` when the id is empty.

    Args:
        session: Session passed through to ``fetch_reviews_page``.
        place_id: Place identifier passed through to ``fetch_reviews_page``.
        interceptor: Parser object passed through to ``fetch_reviews_page``.
        max_workers: Size of the thread pool used for the final phase.

    Returns:
        list[dict]: Unique reviews in the order they were first seen.
    """
    log.info("="*60)
    log.info("STEP 2: Parallel API scraping")
    log.info("="*60)
    start_time = time.time()
    all_reviews = []
    seen_ids = set()

    def add_new(reviews):
        """Append unseen reviews to ``all_reviews``; return how many were new."""
        added = 0
        for review in reviews:
            rid = review.review_id or f"{review.author}_{review.date_text}"
            if rid in seen_ids:
                continue
            seen_ids.add(rid)
            all_reviews.append({
                'review_id': review.review_id,
                'author': review.author,
                'rating': review.rating,
                'text': review.text,
                'date_text': review.date_text,
                'avatar_url': review.avatar_url,
                'profile_url': review.profile_url,
            })
            added += 1
        return added

    # Fetch first page to get continuation token
    log.info("Fetching first page...")
    reviews, token = fetch_reviews_page(session, place_id, interceptor, None)
    add_new(reviews)
    log.info(f"{len(reviews)} reviews | Total: {len(all_reviews)}")
    if not token:
        log.info("No continuation token - only one page of reviews")
        return all_reviews

    # Collect continuation tokens by fetching a few sequential pages
    # (this has to be sequential: each page yields the next token).
    tokens = [token]
    fetched = set()  # tokens whose page already came back with data
    log.info("Collecting continuation tokens...")
    for _ in range(4):  # Get 5 total tokens
        reviews, next_token = fetch_reviews_page(session, place_id, interceptor, token)
        if reviews or next_token:
            fetched.add(token)
        # Ingest before the break below so the last page is not dropped.
        add_new(reviews)
        if not next_token:
            break
        tokens.append(next_token)
        token = next_token
    log.info(f"Collected {len(tokens)} tokens, {len(all_reviews)} reviews so far")

    # Only request what is still missing: the newest token, plus any token
    # whose sequential fetch returned nothing (which amounts to one retry).
    pending = [tok for tok in tokens if tok not in fetched]
    log.info(f"Starting parallel fetch with {max_workers} workers...\n")

    # NOTE(review): one session object is shared by all worker threads —
    # confirm that is safe for the session type in use.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(fetch_reviews_page, session, place_id, interceptor, tok)
            for tok in pending
        ]
        for i, future in enumerate(as_completed(futures)):
            try:
                reviews, _ = future.result()
                new_count = add_new(reviews)
                log.info(f" Completed {i+1}/{len(futures)}: +{new_count} new reviews | Total: {len(all_reviews)}")
            except Exception as e:
                log.error(f" Error in parallel fetch: {e}")

    elapsed = time.time() - start_time
    # Guard against a zero-length interval on coarse clocks.
    rate = len(all_reviews) / elapsed if elapsed > 0 else 0.0
    log.info(f"\n{'='*60}")
    log.info("✅ PARALLEL SCRAPING COMPLETED!")
    log.info(f"{'='*60}")
    log.info(f"Total reviews: {len(all_reviews)}")
    log.info(f"Parallel workers: {max_workers}")
    log.info(f"API time: {elapsed:.2f} seconds")
    log.info(f"Speed: {rate:.1f} reviews/sec")
    log.info(f"{'='*60}\n")
    return all_reviews
def main():
    """Run the two-step scrape and save the result.

    Step 1 captures a browser session with ``capture_session``; step 2
    collects reviews with ``scrape_all_parallel``. The reviews are written
    to ``google_reviews_parallel.json``.

    Returns:
        list[dict]: The scraped reviews, or ``[]`` when no URL is configured
        or the session could not be captured.
    """
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Previously this crashed with a TypeError on url[:80].
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("PARALLEL API SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Parallel API calls (no scrolling)")
    log.info("="*60 + "\n")
    total_start = time.time()

    # Step 1: Capture session from browser
    session, place_id, interceptor = capture_session(url, headless)
    if not session or not place_id:
        log.error("Failed to capture session")
        return []

    # Step 2: Parallel API scraping
    reviews = scrape_all_parallel(session, place_id, interceptor, max_workers=5)
    total_elapsed = time.time() - total_start

    # Save results
    output_file = 'google_reviews_parallel.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)
    log.info(f"💾 Saved {len(reviews)} reviews to {output_file}")

    # Show sample
    if reviews:
        log.info("\n📝 Sample review:")
        sample = reviews[0]
        log.info(f" Author: {sample['author']}")
        log.info(f" Rating: {sample['rating']}")
        log.info(f" Date: {sample['date_text']}")
        if sample['text']:
            log.info(f" Text: {sample['text'][:80]}...")

    # Stats comparison; 155 is the baseline duration in seconds quoted below.
    log.info("\n" + "="*60)
    log.info("SPEED COMPARISON")
    log.info("="*60)
    log.info("Old DOM scraping: ~155 seconds for 244 reviews")
    log.info("Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
    log.info(f"Parallel API calls: ~{total_elapsed:.0f} seconds for {len(reviews)} reviews ({155/total_elapsed:.1f}x faster!) 🚀")
    log.info("="*60 + "\n")
    return reviews
if __name__ == '__main__':
    # Script entry point: the exit status is 0 only when main() returned at
    # least one review; interruption and any other failure exit with 1.
    try:
        reviews = main()
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        import traceback

        log.error(f"Fatal error: {exc}")
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if reviews else 1)

View File

@@ -1,350 +0,0 @@
#!/usr/bin/env python3
"""
PARALLEL HYBRID Scraper - Collects API + DOM simultaneously while scrolling.
Strategy:
1. During scrolling, collect BOTH API responses AND DOM elements in parallel
2. Deduplicate at the end
3. Should get all 244 reviews in ~20-25s (vs 34s sequential)
Optimization: No separate DOM parsing phase - everything happens during scroll!
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Root logging is configured at WARNING with a "[LEVEL] message" format, while
# this module's own logger is raised to INFO
# (presumably to keep third-party library output quiet — confirm).
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``.get`` unconditionally.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    # Read as UTF-8 explicitly; the implicit default follows the platform
    # locale and can mis-decode non-ASCII values.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def parse_dom_review_element(elem):
    """Parse a single review element from the DOM.

    Args:
        elem: Review card element; only ``find_element`` is called on it.

    Returns:
        dict | None: A dict with the keys ``author``, ``rating``, ``text``,
        ``date_text``, ``avatar_url``, ``profile_url`` and ``review_id``.
        Each field that cannot be read is ``None``. Returns ``None`` when the
        card has no author or parsing fails altogether (for example because
        the element went stale).
    """
    import hashlib  # local import: only needed for the synthetic review id

    def read(selector, attribute=None):
        """Return the text (or ``attribute``) of the first match, else ``None``."""
        try:
            node = elem.find_element(By.CSS_SELECTOR, selector)
            return node.get_attribute(attribute) if attribute else node.text
        except Exception:
            return None

    try:
        author = read('div.d4r55')
        if not author:
            return None

        # Rating: first whitespace-separated word of the aria-label as a
        # float. The key is now always present; it used to be left out
        # entirely when the label was empty.
        rating = None
        rating_label = read('span.kvMYJc', 'aria-label')
        if rating_label:
            try:
                rating = float(rating_label.split()[0])
            except (IndexError, ValueError):
                rating = None

        date_text = read('span.rsqaWe')
        # Derive the id from author + date + rating with hashlib; the
        # built-in hash() is salted per process, so the old ids changed on
        # every run.
        digest = hashlib.sha1(
            f"{author}|{date_text}|{rating}".encode('utf-8')
        ).hexdigest()[:16]
        return {
            'author': author,
            'rating': rating,
            'text': read('span.wiI7pd'),
            'date_text': date_text,
            'avatar_url': read('img.NBa7we', 'src'),
            # NOTE(review): this stores the button's "data-review-id"
            # attribute under the key "profile_url" — confirm that is intended.
            'profile_url': read('button.WEBjve', 'data-review-id'),
            'review_id': f"dom_{digest}",
        }
    except Exception:
        # Covers StaleElementReferenceException too: skip cards that cannot
        # be read instead of aborting the caller's sweep.
        return None
def parallel_hybrid_scrape(expected_total=244, dom_parse_min_api=220):
    """Collect reviews from intercepted API responses and the DOM while scrolling.

    The place URL and the ``headless`` flag come from ``load_config()``. API
    responses are polled on every scroll. DOM cards are only parsed during
    the last scrolls, once enough API reviews have arrived, and once more
    after scrolling ends. DOM reviews are merged in when no API review has
    the same (author, first 20 characters of date text) key. The result is
    written to ``google_reviews_parallel_hybrid.json``.

    Args:
        expected_total: Number of reviews the place is expected to have;
            used only for the summary. Defaults to the previously hard-coded
            244.
        dom_parse_min_api: Minimum number of API reviews required before DOM
            parsing starts inside the scroll loop. Defaults to the previously
            hard-coded 220.

    Returns:
        list[dict]: The merged reviews, or ``[]`` when no URL is configured
        or the reviews pane cannot be located.
    """
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Previously this crashed with a TypeError on url[:80].
        print("ERROR: 'url' is missing from config.yaml")
        return []

    print("PARALLEL HYBRID SCRAPER - Collecting API + DOM simultaneously...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    dom_reviews = {}

    def collect_api(interceptor):
        """Merge newly intercepted API reviews into ``api_reviews`` (best effort)."""
        try:
            responses = interceptor.get_intercepted_responses()
            if not responses:
                return
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
        except Exception as exc:
            log.debug(f"API collection failed: {exc}")

    def collect_dom(driver):
        """Add not-yet-seen DOM cards (first 250 at most) to ``dom_reviews``."""
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:250]
        except Exception as exc:
            log.debug(f"DOM collection failed: {exc}")
            return
        for elem in cards:
            try:
                # Cheap key first; the full parse only runs for unseen cards.
                author = elem.find_element(By.CSS_SELECTOR, 'div.d4r55').text
                date_text = elem.find_element(By.CSS_SELECTOR, 'span.rsqaWe').text
                if not (author and date_text):
                    continue
                dom_key = (author, date_text[:20])
                if dom_key in dom_reviews:
                    continue
                dom_review = parse_dom_review_element(elem)
                if dom_review:
                    dom_reviews[dom_key] = dom_review
            except Exception:
                continue

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss the cookie dialog if one is shown
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception as exc:
            log.debug(f"Cookie dialog not dismissed: {exc}")

        # Click the reviews tab; stop at the first match so a second selector
        # cannot click the tab again.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        clicked = False
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        clicked = True
                        break
            except Exception:
                continue
            if clicked:
                break

        # Wait for page stability
        time.sleep(1.0)

        # Find the scrollable pane: primary selector first, then a fallback
        wait = WebDriverWait(driver, 3)
        try:
            pane = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
        except TimeoutException:
            try:
                pane = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
            except Exception:
                print("ERROR: Could not find pane")
                return []

        # Wait for reviews to start loading
        time.sleep(1.5)

        # Set up the API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Set up scrolling
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger the initial scroll to get the first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)

        print("Parallel collection (API + DOM simultaneously)...")
        max_scrolls = 35
        dom_parse_start = 25  # DOM parsing only starts after this many scrolls
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)

            # Collection 1: API responses (always)
            collect_api(interceptor)

            # Collection 2: DOM elements, only near the end and only once the
            # API capture is close to complete, to keep the loop fast.
            if i >= dom_parse_start and len(api_reviews) >= dom_parse_min_api:
                collect_dom(driver)

            # Progress logging
            if (i + 1) % 10 == 0:
                print(f" API: {len(api_reviews)}, DOM: {len(dom_reviews)} unique keys...")

        # Final collections
        print("Final collection sweep...")
        collect_api(interceptor)
        collect_dom(driver)

        # Merge: start with API reviews, add DOM reviews that are not duplicates
        print("\nMerging API + DOM reviews...")
        api_keys = {
            (r.get('author', ''), (r.get('date_text', '') or '')[:20])
            for r in api_reviews.values()
        }
        dom_added = 0
        for dom_key, dom_review in dom_reviews.items():
            if dom_key not in api_keys and dom_review.get('review_id'):
                api_reviews[dom_review['review_id']] = dom_review
                dom_added += 1

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        total = len(all_reviews)

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        if expected_total > 0:
            print(f"Reviews: {total}/{expected_total} ({total/expected_total*100:.1f}%)")
        else:
            print(f"Reviews: {total}")
        print(f" - API: {len(api_reviews) - dom_added}")
        print(f" - DOM: {dom_added} unique")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        # 155 is the baseline duration in seconds this script compares against.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if total >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif total >= expected_total - 4:
            print(f"⚠️ Missing {expected_total-total} reviews")
        print()

        # Save
        with open('google_reviews_parallel_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print("💾 Saved to google_reviews_parallel_hybrid.json")

        if all_reviews:
            # .get(): a DOM-sourced entry may lack a key an API entry has.
            sample = all_reviews[0]
            print(f"\nSample: {sample.get('author')} - {sample.get('rating')}")
        return all_reviews
    finally:
        try:
            driver.quit()
        except Exception:
            pass
if __name__ == '__main__':
    # Script entry point: the exit status is 0 only when the scrape returned
    # at least one review; interruption and any other failure exit with 1.
    try:
        reviews = parallel_hybrid_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        import traceback

        print(f"ERROR: {exc}")
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if reviews else 1)

View File

@@ -1,319 +0,0 @@
#!/usr/bin/env python3
"""
Parallel API Scraper V2 - Use browser's fetch API for parallel calls.
Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Trigger initial API call to get place ID and pattern
3. Use JavaScript fetch API to make 25 parallel calls (~3-5 seconds)
4. Collect all results at once
Expected time: ~20-25 seconds for 244 reviews
Speed improvement: ~6-7x faster!
"""
import sys
import yaml
import logging
import time
import json
from pathlib import Path
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Configure logging at INFO with a "[LEVEL] message" line format.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)
def load_config():
    """Load configuration from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``.get`` unconditionally.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def parallel_scrape():
"""Parallel API-first scraping using browser's fetch API."""
config = load_config()
url = config.get('url')
headless = config.get('headless', False)
log.info("="*60)
log.info("PARALLEL API SCRAPER V2")
log.info("="*60)
log.info(f"URL: {url[:80]}...")
log.info(f"Mode: Parallel browser fetch calls")
log.info("="*60 + "\n")
start_time = time.time()
driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
try:
# Step 1: Navigate and setup
log.info("Step 1: Opening Google Maps...")
driver.get(url)
time.sleep(2)
# Dismiss cookies
try:
cookie_btns = driver.find_elements(By.CSS_SELECTOR,
'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
if cookie_btns:
cookie_btns[0].click()
log.info("✓ Cookie dialog dismissed")
time.sleep(1)
except:
pass
# Click reviews tab
log.info("Step 2: Opening reviews tab...")
review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
clicked = False
for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
try:
tabs = driver.find_elements(By.CSS_SELECTOR, selector)
for tab in tabs:
text = (tab.text or '').lower()
aria_label = (tab.get_attribute('aria-label') or '').lower()
if any(kw in text or kw in aria_label for kw in review_keywords):
driver.execute_script("arguments[0].click();", tab)
time.sleep(2)
log.info("✓ Reviews tab clicked")
clicked = True
break
if clicked:
break
except:
continue
# Wait for reviews to load
log.info("Waiting for reviews page to fully load...")
time.sleep(3)
# Find reviews pane
log.info("Step 3: Finding reviews pane...")
pane = None
pane_selectors = [
'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
'div.m6QErb.WNBkOb.XiKgde',
]
for selector in pane_selectors:
try:
wait = WebDriverWait(driver, 5)
pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
log.info(f"✓ Found reviews pane with: {selector}")
break
except TimeoutException:
continue
if not pane:
log.error("Could not find reviews pane")
return []
# Wait for initial reviews
time.sleep(2)
# Extract place ID from URL
current_url = driver.current_url
place_id = None
if '!1s' in current_url:
parts = current_url.split('!1s')
if len(parts) > 1:
place_id = parts[1].split('!')[0]
log.info(f"✓ Extracted place ID: {place_id}")
if not place_id:
log.error("Could not extract place ID from URL")
return []
# Step 4: Make parallel API calls using browser's fetch
log.info("\n" + "="*60)
log.info("Step 4: Making parallel API calls via browser fetch")
log.info("="*60)
# JavaScript to make parallel API calls
parallel_fetch_script = """
async function fetchReviewsParallel(placeId, numPages) {
const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
const results = [];
// Build pb parameter for each page
const requests = [];
let token = null;
console.log('[Parallel Fetch] Starting parallel fetch for', numPages, 'pages');
// First, we need to get continuation tokens sequentially
const tokens = [];
for (let i = 0; i < Math.min(numPages, 5); i++) {
const pb = token
? `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`
: `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
const params = new URLSearchParams({
authuser: '0',
hl: 'es',
gl: 'es',
pb: pb
});
try {
const response = await fetch(`${baseUrl}?${params}`);
const text = await response.text();
const body = text.startsWith(")]}'") ? text.substring(4) : text;
const data = JSON.parse(body);
results.push({index: i, data: data});
// Get next token
if (data && data.length > 1 && typeof data[1] === 'string') {
token = data[1];
tokens.push(token);
} else {
break; // No more pages
}
} catch (e) {
console.error('[Parallel Fetch] Error fetching page', i, e);
}
}
console.log('[Parallel Fetch] Got', tokens.length, 'continuation tokens');
console.log('[Parallel Fetch] Now fetching remaining pages in parallel...');
// Now fetch remaining pages in parallel using the tokens
const parallelPromises = tokens.slice(5).map((tok, idx) => {
const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${tok}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
const params = new URLSearchParams({
authuser: '0',
hl: 'es',
gl: 'es',
pb: pb
});
return fetch(`${baseUrl}?${params}`)
.then(r => r.text())
.then(text => {
const body = text.startsWith(")]}'") ? text.substring(4) : text;
return JSON.parse(body);
})
.then(data => ({index: idx + 5, data: data}))
.catch(e => {
console.error('[Parallel Fetch] Parallel fetch error', idx, e);
return null;
});
});
const parallelResults = await Promise.all(parallelPromises);
results.push(...parallelResults.filter(r => r !== null));
console.log('[Parallel Fetch] Completed! Total responses:', results.length);
return results;
}
// Execute parallel fetch
return await fetchReviewsParallel(arguments[0], arguments[1]);
"""
log.info(f"Fetching up to 25 pages in parallel...")
api_start = time.time()
try:
results = driver.execute_async_script(parallel_fetch_script, place_id, 25)
api_elapsed = time.time() - api_start
log.info(f"✓ Parallel fetch completed in {api_elapsed:.2f} seconds")
log.info(f" Received {len(results)} API responses")
except Exception as e:
log.error(f"Parallel fetch failed: {e}")
return []
# Parse results
log.info("\nStep 5: Parsing reviews from API responses...")
interceptor = GoogleMapsAPIInterceptor(None)
all_reviews = {}
for result in results:
if result and 'data' in result:
try:
parsed = interceptor._parse_listugcposts_response(result['data'])
for review in parsed:
if review.review_id and review.review_id not in all_reviews:
all_reviews[review.review_id] = {
'review_id': review.review_id,
'author': review.author,
'rating': review.rating,
'text': review.text,
'date_text': review.date_text,
'avatar_url': review.avatar_url,
'profile_url': review.profile_url,
}
except Exception as e:
log.debug(f"Error parsing response: {e}")
reviews_list = list(all_reviews.values())
elapsed = time.time() - start_time
log.info(f"\n{'='*60}")
log.info(f"✅ PARALLEL SCRAPING COMPLETED!")
log.info(f"{'='*60}")
log.info(f"Total reviews: {len(reviews_list)}")
log.info(f"API responses: {len(results)}")
log.info(f"Total time: {elapsed:.2f} seconds")
log.info(f" - Setup: {api_start - start_time:.2f}s")
log.info(f" - Parallel API: {api_elapsed:.2f}s")
log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
log.info(f"{'='*60}\n")
# Save results
output_file = 'google_reviews_parallel.json'
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(reviews_list, f, indent=2, ensure_ascii=False)
log.info(f"💾 Saved {len(reviews_list)} reviews to {output_file}")
# Show sample
if reviews_list:
log.info("\n📝 Sample review:")
sample = reviews_list[0]
log.info(f" Author: {sample['author']}")
log.info(f" Rating: {sample['rating']}")
log.info(f" Date: {sample['date_text']}")
if sample['text']:
log.info(f" Text: {sample['text'][:80]}...")
# Stats comparison
log.info("\n" + "="*60)
log.info("SPEED COMPARISON")
log.info("="*60)
log.info(f"Old DOM scraping: ~155 seconds for 244 reviews (1.0x)")
log.info(f"Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
log.info(f"Parallel browser fetch: ~{elapsed:.0f} seconds for {len(reviews_list)} reviews ({155/elapsed:.1f}x faster!) 🚀")
log.info("="*60 + "\n")
return reviews_list
finally:
try:
driver.quit()
except:
pass
if __name__ == '__main__':
    # Script entry point. Exit status is 0 only when the scrape returned at
    # least one review; interruption and any other failure exit with 1.
    try:
        reviews = parallel_scrape()
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        log.error(f"Fatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if reviews else 1)

View File

@@ -1,279 +0,0 @@
#!/usr/bin/env python3
"""
ULTRA-FAST API Scraper - Maximum speed optimization.
Optimizations:
1. Minimal waits (0.5s after tab click instead of 3s)
2. No wait for "initial reviews" (removes 3s)
3. Faster scroll timing (0.27s between scrolls instead of 0.3s)
4. Response collection after every scroll (collection cannot be skipped or batched, or the intercept buffer clears)
5. Less logging during scrolling (I/O overhead)
6. Pane selection tries the most common selector first, with a single fallback
7. Parallel operations where possible
Target: ~15-20 seconds for 234 reviews
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# The root logger is configured at WARNING, so loggers without their own level
# only emit warnings and above; this module's logger is then set to INFO so
# its own informational messages are still emitted.
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.WARNING,
)
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The object parsed from the YAML document (callers treat it as a
        mapping and call ``.get`` on it). An empty document yields an empty
        dict instead of ``None`` so those ``.get`` calls do not fail.

    Errors raised by ``open`` or ``yaml.safe_load`` propagate to the caller.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which would garble non-ASCII characters on some systems.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    # yaml.safe_load returns None for an empty document.
    return {} if config is None else config
def _review_to_dict(review):
    """Copy the fields of a parsed interceptor review into a plain dict."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_api_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into *api_reviews*; return how many were added."""
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = _review_to_dict(review)
                    added += 1
    except Exception as exc:
        # Best effort: a failed collection must not abort the scroll loop, but
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        log.warning("Could not collect intercepted responses: %s", exc)
    return added


def _dismiss_cookie_banner(driver):
    """Click the first 'Accept'/'Aceptar' consent button if present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)  # Balanced wait
    except Exception as exc:
        log.debug("Cookie banner not dismissed: %s", exc)


def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews; return True if clicked."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)  # Balanced wait
                    # Stop at the first click; do not click again via the
                    # next selector.
                    return True
        except Exception as exc:
            log.debug("Selector %s failed while looking for the reviews tab: %s",
                      selector, exc)
    return False


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None when neither selector matches in time."""
    wait = WebDriverWait(driver, 3)  # Reduced from 5s
    selectors = (
        'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.WNBkOb.XiKgde',
    )
    for selector in selectors:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _first_match(element, css_selector):
    """Return the first descendant matching *css_selector*, or None if there is none."""
    matches = element.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0] if matches else None


def _stable_dom_review_id(author, date_text):
    """Build a run-independent id for a DOM-parsed review from its author and date text."""
    # The builtin hash() of a str is salted per process, so it cannot be used
    # for ids that end up in the saved JSON.
    import hashlib
    digest = hashlib.sha256(f"{author}|{date_text or ''}".encode('utf-8')).hexdigest()
    return f"dom_{digest[:16]}"


def _parse_dom_review(elem):
    """Extract the review fields from one review card element; absent parts become None."""
    author_elem = _first_match(elem, 'div.d4r55')
    rating_elem = _first_match(elem, 'span.kvMYJc')
    text_elem = _first_match(elem, 'span.wiI7pd')
    date_elem = _first_match(elem, 'span.rsqaWe')
    avatar_elem = _first_match(elem, 'img.NBa7we')
    profile_elem = _first_match(elem, 'button.WEBjve')

    rating = None
    rating_attr = rating_elem.get_attribute('aria-label') if rating_elem is not None else None
    if rating_attr:
        rating_parts = rating_attr.split()
        if rating_parts:
            try:
                rating = float(rating_parts[0])
            except ValueError:
                rating = None

    return {
        'author': author_elem.text if author_elem is not None else None,
        'rating': rating,
        'text': text_elem.text if text_elem is not None else None,
        'date_text': date_elem.text if date_elem is not None else None,
        'avatar_url': avatar_elem.get_attribute('src') if avatar_elem is not None else None,
        # NOTE(review): this stores the button's data-review-id attribute under
        # 'profile_url', as the original code did - confirm that is intended.
        'profile_url': (profile_elem.get_attribute('data-review-id')
                        if profile_elem is not None else None),
    }


def _add_missing_dom_reviews(driver, api_reviews, missing):
    """Parse the top review cards from the DOM and add those not already known; return the count added."""
    driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
    time.sleep(0.3)
    # Parse top reviews (most likely to be missing)
    review_elements = driver.find_elements(
        By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:min(missing + 5, 20)]
    # (author, first 20 chars of date) identifies a review for deduplication.
    api_keys = {
        (known.get('author', ''), (known.get('date_text', '') or '')[:20])
        for known in api_reviews.values()
    }
    dom_added = 0
    for elem in review_elements:
        try:
            review_data = _parse_dom_review(elem)
        except Exception as exc:
            log.debug("Skipping unparsable DOM review: %s", exc)
            continue
        author = review_data.get('author')
        if not author:
            continue
        dom_key = (author, (review_data.get('date_text') or '')[:20])
        if dom_key in api_keys:
            continue
        review_id = _stable_dom_review_id(author, review_data.get('date_text'))
        review_data['review_id'] = review_id
        api_reviews[review_id] = review_data
        api_keys.add(dom_key)
        dom_added += 1
    return dom_added


def ultra_fast_scrape(target_reviews=240, max_scrolls=35, expected_total=244):
    """Scrape reviews by scrolling the reviews pane and reading intercepted API responses.

    Opens the URL from ``config.yaml``, dismisses the cookie banner, opens the
    reviews tab, installs the API interceptor and scrolls the pane, collecting
    reviews after every scroll. If fewer than *expected_total* reviews were
    collected, the top of the list is additionally parsed from the DOM.
    The result is written to ``google_reviews_ultra_fast.json``.

    Args:
        target_reviews: Stop scrolling once this many reviews were collected.
        max_scrolls: Upper bound on scroll iterations.
        expected_total: Number of reviews the place is expected to have; a
            shortfall triggers the DOM fallback.

    Returns:
        list[dict]: The collected reviews; an empty list when the reviews pane
        could not be found.

    Raises:
        ValueError: If the configuration does not define a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)
    print("ULTRA-FAST SCRAPER - Starting...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate (minimal waits)
        driver.get(url)
        time.sleep(1.5)  # Stable wait
        _dismiss_cookie_banner(driver)
        _click_reviews_tab(driver)
        # Brief wait for reviews page (balance speed vs stability)
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Setup API interceptor immediately (no wait for initial reviews)
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)  # Minimal wait for interceptor

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(0.3)
        print("Fast scrolling...")

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)  # Sweet spot for stability
            # Collect every scroll (can't skip or buffer clears)
            _collect_api_reviews(interceptor, api_reviews)
            # Only log every 10 scrolls to reduce I/O
            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")
            if len(api_reviews) >= target_reviews:
                break
        # Final collection
        _collect_api_reviews(interceptor, api_reviews)

        # Quick DOM parse for missing reviews (only if needed)
        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\nQuick DOM parse for {missing} missing reviews...")
            try:
                dom_added = _add_missing_dom_reviews(driver, api_reviews, missing)
                print(f" +{dom_added} reviews from DOM")
            except Exception as e:
                print(f" DOM parse failed: {e}")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        print(f"\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        # NOTE(review): 155 looks like the baseline runtime in seconds of the
        # older scraper - confirm.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀\n")
        # Save
        with open('google_reviews_ultra_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to google_reviews_ultra_fast.json")
        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}")
        return all_reviews
    finally:
        try:
            driver.quit()
        except Exception:
            # The browser may already be gone; nothing useful to do here.
            pass
if __name__ == '__main__':
    # Script entry point. Exit status is 0 only when the scrape returned at
    # least one review; interruption and any other failure exit with 1.
    try:
        reviews = ultra_fast_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if reviews else 1)

View File

@@ -1,336 +0,0 @@
#!/usr/bin/env python3
"""
ULTRA-FAST COMPLETE Scraper - Gets ALL 244 reviews in ~25-30 seconds.
Strategy:
1. Ultra-fast API scrolling to get 234 reviews (~19s)
2. DOM parsing for missing 10 reviews (~5-10s)
3. Total: ~25-30s for 244 reviews (vs 155s original)
Combines speed of start_ultra_fast.py with completeness of original scraper.
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Root logging is configured at WARNING; this module's own logger is set to
# INFO so its informational messages are still emitted.
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.WARNING,
)
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The object parsed from the YAML document (callers treat it as a
        mapping and call ``.get`` on it). An empty document yields an empty
        dict instead of ``None`` so those ``.get`` calls do not fail.

    Errors raised by ``open`` or ``yaml.safe_load`` propagate to the caller.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which would garble non-ASCII characters on some systems.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    # yaml.safe_load returns None for an empty document.
    return {} if config is None else config
def parse_dom_reviews_fast(driver, max_reviews=20):
    """Extract the first review cards from the page with a single JavaScript call.

    Args:
        driver: Object exposing ``execute_script(script, *args)``; the script
            is run with *max_reviews* as its only argument.
        max_reviews: Maximum number of review cards the script inspects.

    Returns:
        list[dict]: One dict per review that has both an author and a date
        text. Each dict carries the fields produced by the script plus a
        ``review_id`` of the form ``dom_<16 hex chars>`` derived from author
        and date text. Returns an empty list when the script fails or returns
        nothing.
    """
    # Local import: a digest is used instead of the builtin hash(), whose
    # value for strings is salted per process and therefore differs per run.
    import hashlib

    # JavaScript to extract review data from first N reviews
    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
    const maxCount = Math.min(arguments[0], elements.length);
    for (let i = 0; i < maxCount; i++) {
        const elem = elements[i];
        const review = {};
        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent : null;
            // Rating
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }
            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent : null;
            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent : null;
            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;
            // Profile URL
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
            if (review.author) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }
    return reviews;
    """
    try:
        # Execute JavaScript to get all review data at once
        dom_reviews_data = driver.execute_script(extract_script, max_reviews)
        dom_reviews = []
        # A script that yields nothing is treated as "no reviews".
        for review_data in dom_reviews_data or []:
            author = review_data.get('author')
            date_text = review_data.get('date_text')
            if author and date_text:
                digest = hashlib.sha256(
                    f"{author}|{date_text}".encode('utf-8')).hexdigest()
                review_data['review_id'] = f"dom_{digest[:16]}"
                dom_reviews.append(review_data)
        return dom_reviews
    except Exception as e:
        print(f" Error in fast DOM parse: {e}")
        return []
def _review_to_dict(review):
    """Copy the fields of a parsed interceptor review into a plain dict."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_api_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into *api_reviews*; return how many were added."""
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = _review_to_dict(review)
                    added += 1
    except Exception as exc:
        # Best effort: a failed collection must not abort the scroll loop, but
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        log.warning("Could not collect intercepted responses: %s", exc)
    return added


def _dismiss_cookie_banner(driver):
    """Click the first 'Accept'/'Aceptar' consent button if present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception as exc:
        log.debug("Cookie banner not dismissed: %s", exc)


def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews; return True if clicked."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    # Stop at the first click; do not click again via the
                    # next selector.
                    return True
        except Exception as exc:
            log.debug("Selector %s failed while looking for the reviews tab: %s",
                      selector, exc)
    return False


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None when neither selector matches in time."""
    wait = WebDriverWait(driver, 3)
    selectors = (
        'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.WNBkOb.XiKgde',
    )
    for selector in selectors:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _dedup_key(review):
    """Return the (author, rating, first 20 chars of date) key used to match API and DOM reviews."""
    return (
        review.get('author', ''),
        review.get('rating', 0),
        (review.get('date_text', '') or '')[:20],
    )


def ultra_fast_complete_scrape(target_reviews=240, max_scrolls=35, expected_total=244):
    """Collect reviews via intercepted API responses, then fill gaps from the DOM.

    Phase 1 opens the URL from ``config.yaml``, opens the reviews tab and
    scrolls the reviews pane, collecting intercepted reviews after each scroll.
    Phase 2 runs only when fewer than *expected_total* reviews were collected:
    it scrolls back to the top and adds reviews returned by
    ``parse_dom_reviews_fast`` that are not already known. The result is
    written to ``google_reviews_ultra_fast_complete.json``.

    Args:
        target_reviews: Stop scrolling once this many reviews were collected.
        max_scrolls: Upper bound on scroll iterations.
        expected_total: Number of reviews the place is expected to have.

    Returns:
        list[dict]: The collected reviews; an empty list when the reviews pane
        could not be found.

    Raises:
        ValueError: If *expected_total* is not positive or the configuration
            does not define a ``url``.
    """
    if expected_total < 1:
        raise ValueError("expected_total must be a positive number")
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)
    print(f"ULTRA-FAST COMPLETE SCRAPER - Getting ALL {expected_total} reviews...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # ====== PHASE 1: ULTRA-FAST API SCROLLING ======
        print("\n[Phase 1] Ultra-fast API scrolling...")
        driver.get(url)
        time.sleep(1.5)
        _dismiss_cookie_banner(driver)
        _click_reviews_tab(driver)
        # Wait for page stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(0.3)
        print(" Fast scrolling for API reviews...")

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)
            _collect_api_reviews(interceptor, api_reviews)
            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")
            if len(api_reviews) >= target_reviews:
                break
        # Final API collection
        _collect_api_reviews(interceptor, api_reviews)

        phase1_time = time.time() - start_time
        print(f" ✅ Phase 1 complete: {len(api_reviews)} reviews in {phase1_time:.2f}s")

        # ====== PHASE 2: DOM PARSING FOR MISSING REVIEWS ======
        missing_count = expected_total - len(api_reviews)
        if missing_count > 0:
            print(f"\n[Phase 2] Fast DOM parsing for {missing_count} missing reviews...")
            # Scroll to top (missing reviews likely at top)
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
            time.sleep(0.5)  # Brief wait for scroll
            dom_reviews = parse_dom_reviews_fast(
                driver, max_reviews=min(missing_count + 10, 25))
            # Author + rating + date identify a review across both sources.
            api_keys = {_dedup_key(known) for known in api_reviews.values()}
            dom_added = 0
            for dom_review in dom_reviews:
                dom_key = _dedup_key(dom_review)
                if dom_key not in api_keys and dom_review.get('review_id'):
                    api_reviews[dom_review['review_id']] = dom_review
                    # Track the key to avoid duplicates within the DOM batch too.
                    api_keys.add(dom_key)
                    dom_added += 1
            phase2_time = time.time() - start_time - phase1_time
            print(f" ✅ Phase 2 complete: +{dom_added} reviews from DOM in {phase2_time:.2f}s")

        # ====== RESULTS ======
        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        total = len(all_reviews)
        print(f"\n{'='*50}")
        print(f"✅ COMPLETED!")
        print(f"Reviews: {total}/{expected_total} ({total/expected_total*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        # 155 is the runtime in seconds the module docstring quotes for the
        # original scraper.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if total >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif total >= target_reviews:
            print(f"⚠️ Missing {expected_total-total} reviews")
        else:
            print(f"⚠️ Missing {expected_total-total} reviews - may need more DOM parsing")
        print()
        # Save
        with open('google_reviews_ultra_fast_complete.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to google_reviews_ultra_fast_complete.json")
        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}")
        return all_reviews
    finally:
        try:
            driver.quit()
        except Exception:
            # The browser may already be gone; nothing useful to do here.
            pass
if __name__ == '__main__':
    # Script entry point. Exit status is 0 only when the scrape returned at
    # least one review; interruption and any other failure exit with 1.
    try:
        reviews = ultra_fast_complete_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if reviews else 1)

View File

@@ -1,280 +0,0 @@
#!/usr/bin/env python3
"""
Complete Scraper - Gets ALL reviews while staying fast.
Strategy:
1. Scroll until no new reviews arrive for 12 consecutive scrolls and the scroll position has stopped changing
2. Check scroll position to detect end
3. Do extra scrolls at the end to catch stragglers
4. Adaptive timing - faster at start, slower at end
Target: Get all 244 reviews in ~22-25 seconds
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Root logging is configured at WARNING; this module's own logger is set to
# INFO so its informational messages are still emitted.
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.WARNING,
)
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The object parsed from the YAML document (callers treat it as a
        mapping and call ``.get`` on it). An empty document yields an empty
        dict instead of ``None`` so those ``.get`` calls do not fail.

    Errors raised by ``open`` or ``yaml.safe_load`` propagate to the caller.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which would garble non-ASCII characters on some systems.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    # yaml.safe_load returns None for an empty document.
    return {} if config is None else config
def _review_to_dict(review):
    """Copy the fields of a parsed interceptor review into a plain dict."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_api_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into *api_reviews*; return how many were added."""
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = _review_to_dict(review)
                    added += 1
    except Exception as exc:
        # Best effort: a failed collection must not abort the scroll loop, but
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        log.warning("Could not collect intercepted responses: %s", exc)
    return added


def _dismiss_cookie_banner(driver):
    """Click the first 'Accept'/'Aceptar' consent button if present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception as exc:
        log.debug("Cookie banner not dismissed: %s", exc)


def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews; return True if clicked."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    # Stop at the first click; do not click again via the
                    # next selector.
                    return True
        except Exception as exc:
            log.debug("Selector %s failed while looking for the reviews tab: %s",
                      selector, exc)
    return False


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None when neither selector matches in time."""
    wait = WebDriverWait(driver, 3)
    selectors = (
        'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.WNBkOb.XiKgde',
    )
    for selector in selectors:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _scroll_delay(collected):
    """Return the pause after a scroll: short while few reviews are collected, longer near the end."""
    if collected < 100:
        return 0.27  # Fast at beginning
    if collected < 200:
        return 0.30  # Medium in middle
    if collected < 235:
        return 0.40  # Slower near end
    return 0.50  # Very slow at the very end to catch stragglers


def complete_scrape(max_scrolls=60, max_idle=12, expected_total=244):
    """Scroll the reviews pane until it is exhausted and collect all intercepted reviews.

    Opens the URL from ``config.yaml``, opens the reviews tab, installs the API
    interceptor and scrolls with an adaptive delay. Scrolling stops early once
    no new reviews arrived for *max_idle* consecutive scrolls and the scroll
    position was unchanged for at least three consecutive checks. Five slower
    sweep scrolls and one final collection follow. The result is written to
    ``google_reviews_complete.json``.

    Args:
        max_scrolls: Upper bound on scroll iterations of the main loop.
        max_idle: Consecutive scrolls without new reviews required to stop.
        expected_total: Number of reviews the place is expected to have; only
            used for the summary output.

    Returns:
        list[dict]: The collected reviews; an empty list when the reviews pane
        could not be found.

    Raises:
        ValueError: If the configuration does not define a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)
    print("COMPLETE SCRAPER - Getting ALL reviews...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)
        _dismiss_cookie_banner(driver)
        _click_reviews_tab(driver)
        # Wait for page stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []
        # Wait for initial reviews to load
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response
        print("Scrolling with intelligent stopping...")

        idle_scrolls = 0  # Consecutive scrolls with no new reviews
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0  # Consecutive checks with an unchanged scrollTop
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(_scroll_delay(len(api_reviews)))
            _collect_api_reviews(interceptor, api_reviews)

            # Check if we got new reviews
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
            if (i + 1) % 10 == 0:
                print(f" {current_count} reviews...")
            last_count = current_count

            # Check scroll position to detect if stuck at bottom
            try:
                current_scroll = driver.execute_script(
                    "return arguments[0].scrollTop;", pane)
            except Exception as exc:
                log.debug("Could not read scroll position: %s", exc)
            else:
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll

            # Stop only when both signals agree that the end was reached.
            if idle_scrolls >= max_idle and scroll_stuck_count >= 3:
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # Extra thorough collection at the end
        print(f" Final collection sweep (currently have {len(api_reviews)})...")
        for _ in range(5):
            driver.execute_script(scroll_script)
            time.sleep(0.8)  # Longer wait to ensure API completes
            new_count = _collect_api_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Final wait and collect
        time.sleep(1.0)
        _collect_api_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        print(f"\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)} (target: {expected_total})")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        # NOTE(review): 155 looks like the baseline runtime in seconds of the
        # older scraper - confirm.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        if len(all_reviews) >= expected_total:
            print(f"🎯 Got ALL reviews!")
        else:
            print(f"⚠️ Missing {expected_total-len(all_reviews)} reviews")
        print()
        # Save
        with open('google_reviews_complete.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to google_reviews_complete.json")
        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}")
        return all_reviews
    finally:
        try:
            driver.quit()
        except Exception:
            # The browser may already be gone; nothing useful to do here.
            pass
if __name__ == '__main__':
    # Script entry point. Exit status is 0 only when the scrape returned at
    # least one review; interruption and any other failure exit with 1.
    try:
        reviews = complete_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if reviews else 1)