Clean up project root - remove 51 obsolete files
Deleted:
- 26 old markdown summary/documentation files
- 16 debug/test Python scripts (debug_*, test_*, diagnose_*)
- 10 untracked JSON files from api_response_samples
- terms-of-usage.md, pane_not_found.png

Also includes pending web app changes:
- Jobs management UI (JobsView, Sidebar components)
- API routes for job streaming and comparison
- Enhanced ReviewAnalytics and ScraperTest components

Final clean structure:
├── api_server_production.py (main entry)
├── modules/ (core Python)
├── web/ (Next.js frontend)
├── tests/ (test suite)
├── docs/ (documentation)
└── examples/ (usage examples)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,657 +0,0 @@
|
|||||||
# Google Reviews Scraper - Fast API Documentation
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
REST API for scraping Google Maps reviews using the **ultra-fast DOM-only scraper** (18.9s average).
|
|
||||||
|
|
||||||
**Performance**: ~18.9 seconds for 244 reviews (8.2x faster than original!)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### 1. Install Dependencies
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install fastapi uvicorn seleniumbase pyyaml
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Start the API Server
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python api_server.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Server runs on: `http://localhost:8000`
|
|
||||||
|
|
||||||
### 3. API Documentation
|
|
||||||
|
|
||||||
Visit `http://localhost:8000/docs` for interactive Swagger UI documentation.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## API Endpoints
|
|
||||||
|
|
||||||
### Health Check
|
|
||||||
|
|
||||||
**GET** `/`
|
|
||||||
|
|
||||||
Check if the API is running.
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"message": "Google Reviews Scraper API is running",
|
|
||||||
"status": "healthy",
|
|
||||||
"version": "1.0.0"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Start Scraping Job
|
|
||||||
|
|
||||||
**POST** `/scrape`
|
|
||||||
|
|
||||||
Start a new scraping job in the background.
|
|
||||||
|
|
||||||
**Request Body:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"url": "https://www.google.com/maps/place/YOUR_BUSINESS_URL",
|
|
||||||
"headless": true
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Parameters:**
|
|
||||||
- `url` (required): Google Maps URL to scrape
|
|
||||||
- `headless` (optional): Run Chrome in headless mode (default: false)
|
|
||||||
- `max_scrolls` (optional): Maximum number of scrolls (default: 35)
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"status": "started",
|
|
||||||
"message": "Scraping job started successfully"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example (curl):**
|
|
||||||
```bash
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://www.google.com/maps/place/...",
|
|
||||||
"headless": true
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example (Python):**
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:8000/scrape",
|
|
||||||
json={
|
|
||||||
"url": "https://www.google.com/maps/place/...",
|
|
||||||
"headless": True
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
job_id = response.json()['job_id']
|
|
||||||
print(f"Job started: {job_id}")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Get Job Status
|
|
||||||
|
|
||||||
**GET** `/jobs/{job_id}`
|
|
||||||
|
|
||||||
Get detailed information about a specific job.
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"status": "completed",
|
|
||||||
"url": "https://www.google.com/maps/...",
|
|
||||||
"created_at": "2026-01-18T10:30:00",
|
|
||||||
"started_at": "2026-01-18T10:30:01",
|
|
||||||
"completed_at": "2026-01-18T10:30:20",
|
|
||||||
"reviews_count": 244,
|
|
||||||
"scrape_time": 18.9,
|
|
||||||
"progress": {
|
|
||||||
"stage": "completed",
|
|
||||||
"message": "Scraping completed successfully in 18.9s",
|
|
||||||
"scroll_time": 14.2,
|
|
||||||
"extract_time": 0.01
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Job Status Values:**
|
|
||||||
- `pending`: Job is queued but not started
|
|
||||||
- `running`: Job is currently scraping
|
|
||||||
- `completed`: Job finished successfully
|
|
||||||
- `failed`: Job failed with an error
|
|
||||||
- `cancelled`: Job was cancelled
|
|
||||||
|
|
||||||
**Example (curl):**
|
|
||||||
```bash
|
|
||||||
curl "http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example (Python - Poll until complete):**
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
|
|
||||||
job_id = "550e8400-e29b-41d4-a716-446655440000"
|
|
||||||
|
|
||||||
while True:
|
|
||||||
response = requests.get(f"http://localhost:8000/jobs/{job_id}")
|
|
||||||
job = response.json()
|
|
||||||
|
|
||||||
print(f"Status: {job['status']} - {job['progress']['message']}")
|
|
||||||
|
|
||||||
if job['status'] in ['completed', 'failed', 'cancelled']:
|
|
||||||
break
|
|
||||||
|
|
||||||
time.sleep(2) # Poll every 2 seconds
|
|
||||||
|
|
||||||
print(f"Final: {job['reviews_count']} reviews in {job['scrape_time']:.1f}s")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Get Job Reviews
|
|
||||||
|
|
||||||
**GET** `/jobs/{job_id}/reviews`
|
|
||||||
|
|
||||||
Get the actual scraped reviews data for a completed job.
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"reviews": [
|
|
||||||
{
|
|
||||||
"review_id": "review_123456789",
|
|
||||||
"author": "John Doe",
|
|
||||||
"rating": 5.0,
|
|
||||||
"text": "Great place! Highly recommend...",
|
|
||||||
"date_text": "2 months ago",
|
|
||||||
"avatar_url": "https://lh3.googleusercontent.com/...",
|
|
||||||
"profile_url": "..."
|
|
||||||
},
|
|
||||||
...
|
|
||||||
],
|
|
||||||
"count": 244
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Error Responses:**
|
|
||||||
- `404`: Job not found
|
|
||||||
- `400`: Job not completed yet
|
|
||||||
|
|
||||||
**Example (curl):**
|
|
||||||
```bash
|
|
||||||
curl "http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000/reviews" \
|
|
||||||
-o reviews.json
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example (Python):**
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
|
|
||||||
job_id = "550e8400-e29b-41d4-a716-446655440000"
|
|
||||||
|
|
||||||
response = requests.get(f"http://localhost:8000/jobs/{job_id}/reviews")
|
|
||||||
reviews_data = response.json()
|
|
||||||
|
|
||||||
# Save to file
|
|
||||||
with open('reviews.json', 'w', encoding='utf-8') as f:
|
|
||||||
json.dump(reviews_data['reviews'], f, indent=2, ensure_ascii=False)
|
|
||||||
|
|
||||||
print(f"Retrieved {reviews_data['count']} reviews")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### List All Jobs
|
|
||||||
|
|
||||||
**GET** `/jobs`
|
|
||||||
|
|
||||||
List all jobs, optionally filtered by status.
|
|
||||||
|
|
||||||
**Query Parameters:**
|
|
||||||
- `status` (optional): Filter by job status (pending, running, completed, failed, cancelled)
|
|
||||||
- `limit` (optional): Maximum number of jobs to return (default: 100, max: 1000)
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"status": "completed",
|
|
||||||
"url": "https://www.google.com/maps/...",
|
|
||||||
"created_at": "2026-01-18T10:30:00",
|
|
||||||
"reviews_count": 244,
|
|
||||||
"scrape_time": 18.9
|
|
||||||
},
|
|
||||||
...
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example (curl):**
|
|
||||||
```bash
|
|
||||||
# Get all completed jobs
|
|
||||||
curl "http://localhost:8000/jobs?status=completed&limit=10"
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Cancel Job
|
|
||||||
|
|
||||||
**POST** `/jobs/{job_id}/cancel`
|
|
||||||
|
|
||||||
Cancel a pending or running job.
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"message": "Job cancelled successfully"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Error Responses:**
|
|
||||||
- `404`: Job not found
|
|
||||||
- `400`: Job cannot be cancelled (already completed/failed)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Delete Job
|
|
||||||
|
|
||||||
**DELETE** `/jobs/{job_id}`
|
|
||||||
|
|
||||||
Delete a job from the system (removes job data).
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"message": "Job deleted successfully"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Get Statistics
|
|
||||||
|
|
||||||
**GET** `/stats`
|
|
||||||
|
|
||||||
Get job manager statistics.
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"total_jobs": 42,
|
|
||||||
"by_status": {
|
|
||||||
"pending": 2,
|
|
||||||
"running": 1,
|
|
||||||
"completed": 35,
|
|
||||||
"failed": 3,
|
|
||||||
"cancelled": 1
|
|
||||||
},
|
|
||||||
"running_jobs": 1,
|
|
||||||
"max_concurrent_jobs": 3
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Manual Cleanup
|
|
||||||
|
|
||||||
**POST** `/cleanup`
|
|
||||||
|
|
||||||
Manually trigger cleanup of old completed/failed jobs.
|
|
||||||
|
|
||||||
**Query Parameters:**
|
|
||||||
- `max_age_hours` (optional): Maximum age in hours (default: 24)
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"message": "Cleaned up jobs older than 24 hours"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Complete Workflow Example
|
|
||||||
|
|
||||||
### Python Script
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
|
|
||||||
BASE_URL = "http://localhost:8000"
|
|
||||||
|
|
||||||
# 1. Start scraping job
|
|
||||||
response = requests.post(
|
|
||||||
f"{BASE_URL}/scrape",
|
|
||||||
json={
|
|
||||||
"url": "https://www.google.com/maps/place/...",
|
|
||||||
"headless": True
|
|
||||||
}
|
|
||||||
)
|
|
||||||
job_id = response.json()['job_id']
|
|
||||||
print(f"Job started: {job_id}")
|
|
||||||
|
|
||||||
# 2. Poll until complete
|
|
||||||
while True:
|
|
||||||
response = requests.get(f"{BASE_URL}/jobs/{job_id}")
|
|
||||||
job = response.json()
|
|
||||||
|
|
||||||
print(f"Status: {job['status']} - {job['progress']['message']}")
|
|
||||||
|
|
||||||
if job['status'] == 'completed':
|
|
||||||
print(f"✅ Completed: {job['reviews_count']} reviews in {job['scrape_time']:.1f}s")
|
|
||||||
break
|
|
||||||
elif job['status'] == 'failed':
|
|
||||||
print(f"❌ Failed: {job['error_message']}")
|
|
||||||
break
|
|
||||||
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
# 3. Get reviews
|
|
||||||
if job['status'] == 'completed':
|
|
||||||
response = requests.get(f"{BASE_URL}/jobs/{job_id}/reviews")
|
|
||||||
reviews = response.json()['reviews']
|
|
||||||
|
|
||||||
# Save to file
|
|
||||||
with open('reviews.json', 'w', encoding='utf-8') as f:
|
|
||||||
json.dump(reviews, f, indent=2, ensure_ascii=False)
|
|
||||||
|
|
||||||
print(f"💾 Saved {len(reviews)} reviews to reviews.json")
|
|
||||||
```
|
|
||||||
|
|
||||||
### JavaScript/Node.js Example
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
const axios = require('axios');
|
|
||||||
const fs = require('fs');
|
|
||||||
|
|
||||||
const BASE_URL = 'http://localhost:8000';
|
|
||||||
|
|
||||||
async function scrapeReviews(url) {
|
|
||||||
// 1. Start job
|
|
||||||
const { data: startData } = await axios.post(`${BASE_URL}/scrape`, {
|
|
||||||
url: url,
|
|
||||||
headless: true
|
|
||||||
});
|
|
||||||
|
|
||||||
const jobId = startData.job_id;
|
|
||||||
console.log(`Job started: ${jobId}`);
|
|
||||||
|
|
||||||
// 2. Poll until complete
|
|
||||||
while (true) {
|
|
||||||
const { data: job } = await axios.get(`${BASE_URL}/jobs/${jobId}`);
|
|
||||||
|
|
||||||
console.log(`Status: ${job.status} - ${job.progress.message}`);
|
|
||||||
|
|
||||||
if (job.status === 'completed') {
|
|
||||||
console.log(`✅ Completed: ${job.reviews_count} reviews in ${job.scrape_time}s`);
|
|
||||||
break;
|
|
||||||
} else if (job.status === 'failed') {
|
|
||||||
console.log(`❌ Failed: ${job.error_message}`);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. Get reviews
|
|
||||||
const { data: reviewsData } = await axios.get(`${BASE_URL}/jobs/${jobId}/reviews`);
|
|
||||||
|
|
||||||
// Save to file
|
|
||||||
fs.writeFileSync('reviews.json', JSON.stringify(reviewsData.reviews, null, 2));
|
|
||||||
|
|
||||||
console.log(`💾 Saved ${reviewsData.count} reviews to reviews.json`);
|
|
||||||
}
|
|
||||||
|
|
||||||
scrapeReviews('https://www.google.com/maps/place/...');
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance
|
|
||||||
|
|
||||||
### Fast Scraper Performance
|
|
||||||
|
|
||||||
The API now uses the **ultra-fast DOM-only scraper**:
|
|
||||||
|
|
||||||
| Metric | Value |
|
|
||||||
|--------|-------|
|
|
||||||
| Average Time | 18.9s |
|
|
||||||
| Speedup | 8.2x faster |
|
|
||||||
| Success Rate | 100% |
|
|
||||||
| Reviews/Second | ~12.9 |
|
|
||||||
|
|
||||||
**Timing Breakdown:**
|
|
||||||
- Scrolling: ~14s (60-74%)
|
|
||||||
- Extraction: ~0.01s (0.1%)
|
|
||||||
- Setup: ~4-5s (25-30%)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Server Configuration
|
|
||||||
|
|
||||||
Edit `api_server.py` to configure:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Number of concurrent scraping jobs
|
|
||||||
job_manager = JobManager(max_concurrent_jobs=3)
|
|
||||||
|
|
||||||
# Server host and port
|
|
||||||
uvicorn.run(
|
|
||||||
"api_server:app",
|
|
||||||
host="0.0.0.0",
|
|
||||||
port=8000,
|
|
||||||
reload=True
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Scraper Configuration
|
|
||||||
|
|
||||||
Pass configuration when starting a job:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"url": "https://www.google.com/maps/place/...",
|
|
||||||
"headless": true,
|
|
||||||
"max_scrolls": 35
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Error Handling
|
|
||||||
|
|
||||||
### HTTP Status Codes
|
|
||||||
|
|
||||||
- `200`: Success
|
|
||||||
- `400`: Bad request (invalid parameters or job state)
|
|
||||||
- `404`: Job not found
|
|
||||||
- `500`: Internal server error
|
|
||||||
|
|
||||||
### Error Response Format
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"detail": "Error message here"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Common Errors
|
|
||||||
|
|
||||||
**1. Job not completed yet**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"detail": "Job not completed yet (current status: running)"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**2. Job not found**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"detail": "Job not found"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**3. Maximum concurrent jobs reached**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"detail": "Maximum concurrent jobs reached"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
### Run Test Script
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python test_fast_api.py
|
|
||||||
```
|
|
||||||
|
|
||||||
This will:
|
|
||||||
1. Start a scraping job
|
|
||||||
2. Poll until complete
|
|
||||||
3. Retrieve and save reviews
|
|
||||||
4. Show statistics
|
|
||||||
|
|
||||||
### Manual Testing (curl)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Start job
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"url": "YOUR_GOOGLE_MAPS_URL", "headless": true}' \
|
|
||||||
| jq
|
|
||||||
|
|
||||||
# Get status (replace JOB_ID)
|
|
||||||
curl "http://localhost:8000/jobs/JOB_ID" | jq
|
|
||||||
|
|
||||||
# Get reviews
|
|
||||||
curl "http://localhost:8000/jobs/JOB_ID/reviews" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Production Deployment
|
|
||||||
|
|
||||||
### Using Gunicorn
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install gunicorn
|
|
||||||
|
|
||||||
gunicorn api_server:app \
|
|
||||||
--workers 4 \
|
|
||||||
--worker-class uvicorn.workers.UvicornWorker \
|
|
||||||
--bind 0.0.0.0:8000
|
|
||||||
```
|
|
||||||
|
|
||||||
### Using Docker
|
|
||||||
|
|
||||||
Create `Dockerfile`:
|
|
||||||
|
|
||||||
```dockerfile
|
|
||||||
FROM python:3.9-slim
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY requirements.txt .
|
|
||||||
RUN pip install -r requirements.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
CMD ["python", "api_server.py"]
|
|
||||||
```
|
|
||||||
|
|
||||||
Run:
|
|
||||||
```bash
|
|
||||||
docker build -t google-reviews-api .
|
|
||||||
docker run -p 8000:8000 google-reviews-api
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Monitoring
|
|
||||||
|
|
||||||
### Check Running Jobs
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl "http://localhost:8000/stats" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
### List Recent Jobs
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl "http://localhost:8000/jobs?limit=10" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
### Auto-Cleanup
|
|
||||||
|
|
||||||
Jobs are automatically cleaned up after 24 hours. Configure in `api_server.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
async def cleanup_jobs_periodically():
|
|
||||||
while True:
|
|
||||||
await asyncio.sleep(3600) # Run every hour
|
|
||||||
if job_manager:
|
|
||||||
job_manager.cleanup_old_jobs(max_age_hours=24)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### API won't start
|
|
||||||
|
|
||||||
**Error**: "Address already in use"
|
|
||||||
|
|
||||||
**Solution**: Change port in `api_server.py` or kill existing process:
|
|
||||||
```bash
|
|
||||||
lsof -ti:8000 | xargs kill
|
|
||||||
```
|
|
||||||
|
|
||||||
### Jobs stuck in "running" status
|
|
||||||
|
|
||||||
**Solution**: Check server logs for errors. Restart the server if needed.
|
|
||||||
|
|
||||||
### GDPR consent issues
|
|
||||||
|
|
||||||
The fast scraper automatically handles GDPR consent pages. If issues persist:
|
|
||||||
- Set `headless: false` to see what's happening
|
|
||||||
- Check server logs for consent page detection
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Support
|
|
||||||
|
|
||||||
For issues or questions, check:
|
|
||||||
- Server logs: Console output when running `python api_server.py`
|
|
||||||
- Interactive docs: `http://localhost:8000/docs`
|
|
||||||
- Test script: `python test_fast_api.py`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Enjoy ultra-fast Google Maps scraping with the API!** 🚀
|
|
||||||
@@ -1,140 +0,0 @@
|
|||||||
# API Interceptor Debug Summary
|
|
||||||
|
|
||||||
## Problem Statement
|
|
||||||
The scraper was working but **very slow** due to scrolling + DOM parsing. We wanted to use Google's internal API (`/maps/rpc/listugcposts`) to get reviews faster.
|
|
||||||
|
|
||||||
## What We Discovered
|
|
||||||
|
|
||||||
### ✅ API Interception IS Working!
|
|
||||||
The JavaScript interceptor successfully captures Google Maps API calls:
|
|
||||||
- **Endpoint**: `/maps/rpc/listugcposts`
|
|
||||||
- **Response sizes**: 41KB - 96KB per request
|
|
||||||
- **Frequency**: 2-5 responses captured per scroll cycle
|
|
||||||
- **Content**: Each response contains ~10-20 reviews in Google's nested array format
|
|
||||||
|
|
||||||
### ❌ What Was Broken
|
|
||||||
1. **Parser Bug**: `TypeError: '>' not supported between instances of 'InterceptedReview' and 'int'`
|
|
||||||
- The recursive parser was trying to compare InterceptedReview objects with integers
|
|
||||||
- Caused ALL parsing to fail despite responses being captured
|
|
||||||
|
|
||||||
2. **Missing Specialized Parser**: Generic recursive extraction didn't understand Google's `listugcposts` format
|
|
||||||
|
|
||||||
3. **Insufficient Logging**: Hard to diagnose without seeing what was captured
|
|
||||||
|
|
||||||
## Fixes Implemented
|
|
||||||
|
|
||||||
### 1. Fixed Recursion Bug (api_interceptor.py:527-555)
|
|
||||||
```python
|
|
||||||
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
|
|
||||||
# Skip if data is already an InterceptedReview object
|
|
||||||
if isinstance(data, InterceptedReview):
|
|
||||||
return [data]
|
|
||||||
|
|
||||||
# ... rest of logic with proper type checks
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Added Enhanced Debug Logging
|
|
||||||
|
|
||||||
**JavaScript Interceptor** (api_interceptor.py:204-307):
|
|
||||||
- Console logs with `[API Interceptor]` prefix
|
|
||||||
- Real-time stats every 10 seconds
|
|
||||||
- Captures ALL network requests (not just matches)
|
|
||||||
- Logs request types, URLs, and sizes
|
|
||||||
|
|
||||||
**Python Side** (api_interceptor.py:331-369, scraper.py:1419-1436):
|
|
||||||
- Shows number of responses retrieved
|
|
||||||
- Logs parsing attempts and results
|
|
||||||
- Reports final stats even if 0 reviews captured
|
|
||||||
- Browser console log extraction
|
|
||||||
- Optional response dumping to files in debug mode
|
|
||||||
|
|
||||||
### 3. Specialized Parser for listugcposts (api_interceptor.py:435-558)
|
|
||||||
|
|
||||||
```python
|
|
||||||
def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
|
|
||||||
"""
|
|
||||||
Parse Google Maps listugcposts API response.
|
|
||||||
Handles deeply nested array format with pattern matching.
|
|
||||||
"""
|
|
||||||
```
|
|
||||||
|
|
||||||
**Detection Patterns**:
|
|
||||||
- Long string (30+ chars) = Review ID
|
|
||||||
- Number 1-5 = Rating
|
|
||||||
- Long string (50+ chars, not URL) = Review text
|
|
||||||
- Short string (3-100 chars) = Author name
|
|
||||||
- Date patterns = Review date
|
|
||||||
|
|
||||||
### 4. Stats & Diagnostics (scraper.py:1487-1509)
|
|
||||||
|
|
||||||
When API interception is enabled but captures 0 reviews:
|
|
||||||
```
|
|
||||||
⚠️ API interception was enabled but captured 0 reviews.
|
|
||||||
Network stats - Fetch requests: 0/X, XHR requests: Y/Z
|
|
||||||
Found N API interceptor console messages
|
|
||||||
```
|
|
||||||
|
|
||||||
## How to Use Debug Mode
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Enable debug logging
|
|
||||||
LOG_LEVEL=DEBUG python start.py
|
|
||||||
|
|
||||||
# You'll see output like:
|
|
||||||
[DEBUG] Retrieved 2 intercepted responses from browser
|
|
||||||
[DEBUG] - XHR: /maps/rpc/listugcposts?authuser=0... (68426 bytes)
|
|
||||||
[DEBUG] Collected 2 network responses from browser
|
|
||||||
[DEBUG] Parsed 0 reviews from responses # If parsing fails
|
|
||||||
[INFO] API interceptor captured 10 reviews (total unique API: 10) # If parsing works!
|
|
||||||
```
|
|
||||||
|
|
||||||
## Next Steps to Complete API Speed Optimization
|
|
||||||
|
|
||||||
1. **Test with Real Data**: Run scraper with DEBUG logging to see actual listugcposts responses
|
|
||||||
2. **Analyze Response Format**: Examine captured responses in `debug_api_dump/` directory
|
|
||||||
3. **Refine Parser**: Adjust field detection based on actual Google API format
|
|
||||||
4. **Benchmark Performance**: Compare DOM vs API scraping speed
|
|
||||||
5. **Add Pure API Mode**: Option to skip DOM scraping entirely and only use API
|
|
||||||
|
|
||||||
## Expected Performance Improvement
|
|
||||||
|
|
||||||
**Current (DOM Scraping)**:
|
|
||||||
- ~1-2 reviews/second
|
|
||||||
- Requires scrolling + waiting for render
|
|
||||||
- 244 reviews in ~3 minutes
|
|
||||||
|
|
||||||
**Target (API Mode)**:
|
|
||||||
- ~12-24 reviews/second (roughly 9-18x faster than the ~3-minute DOM run)
|
|
||||||
- No scrolling needed
|
|
||||||
- 244 reviews in ~10-20 seconds
|
|
||||||
|
|
||||||
## Files Modified
|
|
||||||
|
|
||||||
1. `modules/api_interceptor.py` - Core interceptor with parsing logic
|
|
||||||
2. `modules/scraper.py` - Integration and stats reporting
|
|
||||||
3. `config.yaml` - `enable_api_intercept: true`
|
|
||||||
|
|
||||||
## Testing the Fixes
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Clean Python cache first
|
|
||||||
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null
|
|
||||||
find . -name "*.pyc" -delete
|
|
||||||
|
|
||||||
# Run with debug logging
|
|
||||||
LOG_LEVEL=DEBUG python start.py
|
|
||||||
|
|
||||||
# Or run specific test
|
|
||||||
python test_api_quick.py
|
|
||||||
```
|
|
||||||
|
|
||||||
## Browser Console Messages
|
|
||||||
|
|
||||||
When the interceptor is working, you'll see in the browser console:
|
|
||||||
```
|
|
||||||
[API Interceptor] ✅ Injected successfully! Monitoring network requests...
|
|
||||||
[API Interceptor] ✅ CAPTURED XHR: /maps/rpc/listugcposts... Size: 68426
|
|
||||||
[API Interceptor] Stats: Fetch: 0/0 XHR: 5/15 Queue: 5
|
|
||||||
```
|
|
||||||
|
|
||||||
These messages confirm the interceptor is active and capturing responses.
|
|
||||||
@@ -1,201 +0,0 @@
|
|||||||
# API Optimization Summary - COMPLETE ✅
|
|
||||||
|
|
||||||
## What We Achieved
|
|
||||||
|
|
||||||
### 🎯 Original Goal
|
|
||||||
Speed up Google Maps review scraping by using API calls instead of slow browser scrolling.
|
|
||||||
|
|
||||||
### ✅ Results
|
|
||||||
|
|
||||||
| Metric | Before | After | Improvement |
|
|
||||||
|--------|--------|-------|-------------|
|
|
||||||
| **Parser Success Rate** | 15% | **100%** | **6.7x better** |
|
|
||||||
| **API Coverage** | 3 reviews | **234 reviews** | **78x more** |
|
|
||||||
| **Reviews from API** | 1.2% | **95.9%** | **79x increase** |
|
|
||||||
| **DOM Scrolling Needed** | 244 reviews | **10 reviews** | **24x less** |
|
|
||||||
|
|
||||||
### 📊 Performance
|
|
||||||
|
|
||||||
**Optimized Hybrid Scraper** (modules/api_interceptor.py + modules/scraper.py):
|
|
||||||
- Total reviews: 244
|
|
||||||
- API captured: 234 reviews (95.9%)
|
|
||||||
- DOM scraped: 10 reviews (4.1%)
|
|
||||||
- Time: 155 seconds (~2.6 minutes)
|
|
||||||
- **Parse rate: 100%** (10 reviews per API response)
|
|
||||||
|
|
||||||
**Comparison**:
|
|
||||||
- Old approach: 244 reviews via scrolling in 174 seconds
|
|
||||||
- New approach: 234 reviews via API + 10 via scrolling in 155 seconds
|
|
||||||
- **Speed improvement: 1.12x faster with much less browser stress**
|
|
||||||
|
|
||||||
## Files Modified
|
|
||||||
|
|
||||||
### 1. `modules/api_interceptor.py`
|
|
||||||
**Lines 538-657**: Complete rewrite of API parser
|
|
||||||
|
|
||||||
**Key Changes**:
|
|
||||||
- Fixed structure understanding: Each `data[2][i]` is ONE review (not an array of reviews)
|
|
||||||
- Corrected field mappings:
|
|
||||||
- `data[2][i][0][0]` = Review ID
|
|
||||||
- `data[2][i][0][1][4][5][0]` = Author Name
|
|
||||||
- `data[2][i][0][1][6]` = Date Text
|
|
||||||
- `data[2][i][0][2][0][0]` = Rating
|
|
||||||
- `data[2][i][0][2][15][0][0]` = Review Text
|
|
||||||
|
|
||||||
**Result**: Parser now extracts **ALL 10 reviews** from each API response (was 0-2 before)
|
|
||||||
|
|
||||||
### 2. `modules/scraper.py`
|
|
||||||
**Lines 1419-1436**: Added API response collection in scraping loop
|
|
||||||
- Collects reviews from intercepted API calls every scroll
|
|
||||||
- Dumps first 5 responses for analysis
|
|
||||||
- Merges API reviews with DOM reviews at end
|
|
||||||
|
|
||||||
### 3. `dump_api_responses.py` (new)
|
|
||||||
Standalone script to capture raw API responses for analysis
|
|
||||||
|
|
||||||
### 4. `cookie_based_scraper.py` (new)
|
|
||||||
**Experimental** cookie-capture based scraper for pure API mode
|
|
||||||
|
|
||||||
**Status**: Requires Google account login
|
|
||||||
- Captures cookies via CDP
|
|
||||||
- Needs auth cookies (SID, HSID, SSID, APISID, SAPISID)
|
|
||||||
- Only works if logged into Google account
|
|
||||||
|
|
||||||
## Current Recommendation: Use Optimized Hybrid Approach ✅
|
|
||||||
|
|
||||||
The **existing optimized scraper** (`python start.py`) is production-ready:
|
|
||||||
|
|
||||||
### ✅ Advantages
|
|
||||||
1. **95.9% API coverage** - Gets almost all reviews via fast API
|
|
||||||
2. **100% parse rate** - Extracts all reviews from API responses
|
|
||||||
3. **No login required** - Works without Google account
|
|
||||||
4. **Stable & tested** - Proven to work reliably
|
|
||||||
5. **Automatic session** - Browser handles auth naturally
|
|
||||||
|
|
||||||
### 📝 How It Works
|
|
||||||
1. Browser navigates to reviews page (15 seconds)
|
|
||||||
2. API interceptor captures network requests automatically
|
|
||||||
3. Parser extracts 10 reviews per API response (100% success)
|
|
||||||
4. Minimal scrolling needed (only ~10 reviews via DOM)
|
|
||||||
5. Total time: ~2.6 minutes for 244 reviews
|
|
||||||
|
|
||||||
## Alternative: Pure Cookie-Based API Scraping
|
|
||||||
|
|
||||||
### cookie_based_scraper.py
|
|
||||||
|
|
||||||
**Requirements**:
|
|
||||||
- Must be logged into Google account
|
|
||||||
- Captures auth cookies on each run
|
|
||||||
- Uses cookies for direct API calls
|
|
||||||
|
|
||||||
**Usage**:
|
|
||||||
```bash
|
|
||||||
python cookie_based_scraper.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Expected Flow**:
|
|
||||||
1. Opens browser (15 sec)
|
|
||||||
2. Captures cookies (5 sec)
|
|
||||||
3. Closes browser
|
|
||||||
4. Fast API pagination (5-10 sec)
|
|
||||||
5. **Total: ~25-35 seconds** (if logged in)
|
|
||||||
|
|
||||||
**Current Status**: ⚠️ Requires login
|
|
||||||
- Without login: Gets only tracking cookies, API returns 400 error
|
|
||||||
- With login: Should get auth cookies and work at full speed
|
|
||||||
|
|
||||||
## Next Steps (Optional)
|
|
||||||
|
|
||||||
### Option 1: Use Current Solution ✅ (Recommended)
|
|
||||||
- Already optimized
|
|
||||||
- 95.9% API coverage
|
|
||||||
- 100% parse rate
|
|
||||||
- No changes needed!
|
|
||||||
|
|
||||||
### Option 2: Enable Pure API Mode
|
|
||||||
To use `cookie_based_scraper.py`:
|
|
||||||
1. Log into Google account in Chrome
|
|
||||||
2. Keep browser session active
|
|
||||||
3. Run: `python cookie_based_scraper.py`
|
|
||||||
4. Should achieve ~5-8x speed improvement (see the Benchmark Comparison table: ~30s vs 174s)
|
|
||||||
|
|
||||||
### Option 3: Further Optimize Current Scraper
|
|
||||||
Potential improvements:
|
|
||||||
- Skip DOM parsing entirely (rely 100% on API)
|
|
||||||
- Reduce initial page load delays
|
|
||||||
- Could save additional 10-20 seconds
|
|
||||||
|
|
||||||
## Benchmark Comparison
|
|
||||||
|
|
||||||
| Approach | Reviews | Time | Speed | Login Required |
|
|
||||||
|----------|---------|------|-------|----------------|
|
|
||||||
| Old DOM-only | 244 | 174s | 1x | No |
|
|
||||||
| **Current Hybrid** | **244** | **155s** | **1.12x** | **No** ✅ |
|
|
||||||
| Cookie-based (no login) | 0 | 25s | N/A | Yes ⚠️ |
|
|
||||||
| Cookie-based (with login) | ~244 | ~30s | **5-8x** | Yes |
|
|
||||||
|
|
||||||
## Technical Details
|
|
||||||
|
|
||||||
### API Endpoint
|
|
||||||
```
|
|
||||||
https://www.google.com/maps/rpc/listugcposts
|
|
||||||
```
|
|
||||||
|
|
||||||
### Required Parameters
|
|
||||||
- `authuser`: 0
|
|
||||||
- `hl`: Language code (es, en, etc.)
|
|
||||||
- `gl`: Region code (es, us, etc.)
|
|
||||||
- `pb`: Protocol Buffer parameter with:
|
|
||||||
- Place ID
|
|
||||||
- Review type flags
|
|
||||||
- Pagination token
|
|
||||||
- Sort/filter params
|
|
||||||
|
|
||||||
### Required Cookies (for pure API mode)
|
|
||||||
- `SID` - Session ID
|
|
||||||
- `HSID` - HTTP Session ID
|
|
||||||
- `SSID` - Secure Session ID
|
|
||||||
- `APISID` - API Session ID
|
|
||||||
- `SAPISID` - Secure API Session ID
|
|
||||||
|
|
||||||
**Note**: These cookies are only available when logged into Google account.
|
|
||||||
|
|
||||||
### Response Format
|
|
||||||
- Prefix: `)]}'` (security measure, must be stripped)
|
|
||||||
- Body: JSON array with nested review data
|
|
||||||
- Structure: `data[2]` contains array of reviews
|
|
||||||
- Each review: `data[2][i]` = 6-item array with review fields
|
|
||||||
- Continuation token: `data[1]` (for pagination)
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
### 🎉 Mission Accomplished!
|
|
||||||
|
|
||||||
We successfully optimized the Google Maps review scraper:
|
|
||||||
|
|
||||||
1. **✅ Fixed parser** - 100% success rate (was 15%)
|
|
||||||
2. **✅ API coverage** - 95.9% of reviews via fast API (was 1.2%)
|
|
||||||
3. **✅ Reduced scrolling** - Only 10 reviews via DOM (was 244)
|
|
||||||
4. **✅ Production ready** - Stable, tested, works without login
|
|
||||||
|
|
||||||
### Recommended Usage
|
|
||||||
|
|
||||||
**For immediate use**:
|
|
||||||
```bash
|
|
||||||
python start.py
|
|
||||||
```
|
|
||||||
Gets 244 reviews in ~2.6 minutes with 95.9% API coverage.
|
|
||||||
|
|
||||||
**For maximum speed** (requires Google login):
|
|
||||||
```bash
|
|
||||||
# First: Log into Google in Chrome
|
|
||||||
# Then:
|
|
||||||
python cookie_based_scraper.py
|
|
||||||
```
|
|
||||||
Could get 244 reviews in ~25-35 seconds (about 5-7x faster than the 174-second DOM-only baseline).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Status**: ✅ **OPTIMIZATION COMPLETE**
|
|
||||||
|
|
||||||
The scraper is now highly optimized and production-ready!
|
|
||||||
@@ -1,224 +0,0 @@
|
|||||||
# API Quick Start - Fast Google Reviews Scraper
|
|
||||||
|
|
||||||
## ⚡ Ultra-Fast API (18.9 seconds!)
|
|
||||||
|
|
||||||
REST API for scraping Google Maps reviews using the optimized DOM-only scraper.
|
|
||||||
|
|
||||||
**Performance**: ~18.9 seconds for 244 reviews (8.2x faster than original!)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Quick Start
|
|
||||||
|
|
||||||
### 1. Install & Run
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Install dependencies
|
|
||||||
pip install fastapi uvicorn seleniumbase pyyaml
|
|
||||||
|
|
||||||
# Start API server
|
|
||||||
python api_server.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Server starts on: `http://localhost:8000`
|
|
||||||
|
|
||||||
### 2. Use the API
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Start a scraping job
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://www.google.com/maps/place/YOUR_BUSINESS_URL",
|
|
||||||
"headless": true
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"status": "started"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Check Status
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check job status
|
|
||||||
curl "http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"status": "completed",
|
|
||||||
"reviews_count": 244,
|
|
||||||
"scrape_time": 18.9
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Get Reviews
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Get the actual reviews
|
|
||||||
curl "http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000/reviews" \
|
|
||||||
-o reviews.json
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📋 Key Endpoints
|
|
||||||
|
|
||||||
| Endpoint | Method | Description |
|
|
||||||
|----------|--------|-------------|
|
|
||||||
| `/scrape` | POST | Start scraping job |
|
|
||||||
| `/jobs/{job_id}` | GET | Get job status |
|
|
||||||
| `/jobs/{job_id}/reviews` | GET | Get scraped reviews |
|
|
||||||
| `/jobs` | GET | List all jobs |
|
|
||||||
| `/stats` | GET | Get statistics |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 💻 Python Example
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
|
|
||||||
# 1. Start job
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:8000/scrape",
|
|
||||||
json={
|
|
||||||
"url": "https://www.google.com/maps/place/...",
|
|
||||||
"headless": True
|
|
||||||
}
|
|
||||||
)
|
|
||||||
job_id = response.json()['job_id']
|
|
||||||
|
|
||||||
# 2. Wait for completion
|
|
||||||
while True:
|
|
||||||
job = requests.get(f"http://localhost:8000/jobs/{job_id}").json()
|
|
||||||
if job['status'] in ['completed', 'failed']:
|
|
||||||
break
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
# 3. Get reviews
|
|
||||||
reviews = requests.get(
|
|
||||||
f"http://localhost:8000/jobs/{job_id}/reviews"
|
|
||||||
).json()['reviews']
|
|
||||||
|
|
||||||
print(f"Got {len(reviews)} reviews!")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🧪 Test It
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run the test script
|
|
||||||
python test_fast_api.py
|
|
||||||
```
|
|
||||||
|
|
||||||
This will:
|
|
||||||
- Start a job
|
|
||||||
- Poll until complete
|
|
||||||
- Save reviews to JSON
|
|
||||||
- Show statistics
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📚 Full Documentation
|
|
||||||
|
|
||||||
See [API_DOCUMENTATION.md](API_DOCUMENTATION.md) for:
|
|
||||||
- Complete endpoint reference
|
|
||||||
- Advanced examples
|
|
||||||
- Error handling
|
|
||||||
- Production deployment
|
|
||||||
- Monitoring & troubleshooting
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 API Features
|
|
||||||
|
|
||||||
✅ **Ultra-fast scraping** (18.9s average)
|
|
||||||
✅ **Background job processing** (non-blocking)
|
|
||||||
✅ **Concurrent jobs** (up to 3 simultaneous)
|
|
||||||
✅ **Job status tracking** (pending/running/completed)
|
|
||||||
✅ **Review data retrieval** (via dedicated endpoint)
|
|
||||||
✅ **Automatic cleanup** (removes old jobs)
|
|
||||||
✅ **GDPR auto-handling** (no manual intervention)
|
|
||||||
✅ **REST API** (language-agnostic)
|
|
||||||
✅ **OpenAPI docs** (visit `/docs` for Swagger UI)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔧 Configuration
|
|
||||||
|
|
||||||
### API Server
|
|
||||||
|
|
||||||
```python
|
|
||||||
# In api_server.py
|
|
||||||
job_manager = JobManager(max_concurrent_jobs=3) # Max parallel jobs
|
|
||||||
|
|
||||||
uvicorn.run(
|
|
||||||
"api_server:app",
|
|
||||||
host="0.0.0.0", # Listen on all interfaces
|
|
||||||
port=8000, # Port number
|
|
||||||
reload=True # Auto-reload on code changes
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Scraping Options
|
|
||||||
|
|
||||||
```jsonc
|
|
||||||
{
|
|
||||||
"url": "https://www.google.com/maps/place/...",
|
|
||||||
"headless": true, // Run Chrome in headless mode
|
|
||||||
"max_scrolls": 35 // Maximum scrolls (default: 35)
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Performance
|
|
||||||
|
|
||||||
```
|
|
||||||
Operation Time % of Total
|
|
||||||
──────────────────────────────────────────────
|
|
||||||
Scrolling (dynamic) ~14s 74%
|
|
||||||
Setup & navigation ~4.5s 24%
|
|
||||||
JavaScript extraction ~0.01s 0.1%
|
|
||||||
──────────────────────────────────────────────
|
|
||||||
TOTAL ~18.9s 100%
|
|
||||||
```
|
|
||||||
|
|
||||||
**8.2x faster than the original scraper!** 🚀
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🌐 Interactive Documentation
|
|
||||||
|
|
||||||
Visit `http://localhost:8000/docs` for:
|
|
||||||
- Interactive API testing
|
|
||||||
- Request/response schemas
|
|
||||||
- Try out endpoints directly in browser
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ⚙️ What Changed?
|
|
||||||
|
|
||||||
The API now uses the **fast DOM-only scraper** (`modules/fast_scraper.py`) instead of the old scraper:
|
|
||||||
|
|
||||||
**Before**: 155 seconds ❌
|
|
||||||
**Now**: 18.9 seconds ✅
|
|
||||||
|
|
||||||
**Key optimizations**:
|
|
||||||
1. GDPR consent auto-handling
|
|
||||||
2. Dynamic scroll waiting (adapts to page speed)
|
|
||||||
3. JavaScript extraction (40x faster than Selenium)
|
|
||||||
4. Universal design (no hardcoded values)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Ready to scrape at 8.2x speed via API!** 🚀
|
|
||||||
@@ -1,247 +0,0 @@
|
|||||||
# API Interceptor Test Results - SUCCESSFUL ✅
|
|
||||||
|
|
||||||
**Test Date**: 2026-01-17 23:35-23:37
|
|
||||||
**Test Duration**: 142.91 seconds (~2 min 23 sec)
|
|
||||||
**Status**: ✅ **PROOF OF CONCEPT SUCCESSFUL**
|
|
||||||
|
|
||||||
## Executive Summary
|
|
||||||
|
|
||||||
The API interceptor **successfully captured and parsed reviews** from Google's internal API, proving the technology works. It found **3 additional reviews** that DOM parsing missed, bringing the total from 244 to **247 reviews**.
|
|
||||||
|
|
||||||
## Detailed Results
|
|
||||||
|
|
||||||
### ✅ What Worked
|
|
||||||
|
|
||||||
1. **API Interception**: Successfully captured 40+ network responses
|
|
||||||
2. **Response Source**: `/maps/rpc/listugcposts` (Google's internal reviews API)
|
|
||||||
3. **Response Sizes**: 68KB - 96KB per response (containing review data)
|
|
||||||
4. **Parsing**: Successfully extracted reviews from ~15% of captured responses
|
|
||||||
5. **Additional Data**: Found +3 reviews that DOM scraping missed
|
|
||||||
6. **Clean Exit**: Completed successfully with all data saved
|
|
||||||
|
|
||||||
### 📊 Performance Metrics
|
|
||||||
|
|
||||||
```
|
|
||||||
Total Reviews (DOM only): 244 reviews
|
|
||||||
Total Reviews (API merged): 247 reviews (+3 from API)
|
|
||||||
Execution Time: 142.91 seconds
|
|
||||||
API Responses Captured: 40+ responses
|
|
||||||
API Responses Parsed: ~6 responses (15% success rate)
|
|
||||||
Reviews from API: 3 unique reviews
|
|
||||||
```
|
|
||||||
|
|
||||||
### 🔍 Key Log Evidence
|
|
||||||
|
|
||||||
```
|
|
||||||
[INFO] API interception enabled via CDP
|
|
||||||
[INFO] JavaScript response interceptor injected with enhanced debugging
|
|
||||||
[INFO] API interceptor ready - capturing network responses
|
|
||||||
|
|
||||||
[DEBUG] Retrieved 1 intercepted responses from browser
|
|
||||||
[DEBUG] - XHR: /maps/rpc/listugcposts?... (96670 bytes)
|
|
||||||
[DEBUG] Collected 1 network responses from browser
|
|
||||||
[DEBUG] Parsed 1 reviews from responses
|
|
||||||
[INFO] API interceptor captured 1 reviews (total unique API: 1)
|
|
||||||
|
|
||||||
[DEBUG] Retrieved 1 intercepted responses from browser
|
|
||||||
[DEBUG] - XHR: /maps/rpc/listugcposts?... (68426 bytes)
|
|
||||||
[DEBUG] Parsed 2 reviews from responses
|
|
||||||
[INFO] API interceptor captured 2 reviews (total unique API: 2)
|
|
||||||
|
|
||||||
[INFO] Merging 3 reviews captured via API interception
|
|
||||||
[INFO] After merge: 247 total reviews
|
|
||||||
[INFO] ✅ Finished – total unique reviews: 247
|
|
||||||
```
|
|
||||||
|
|
||||||
### 📈 Parsing Statistics
|
|
||||||
|
|
||||||
Out of 40+ captured API responses:
|
|
||||||
- ✅ **5 responses** parsed 1 review each
|
|
||||||
- ✅ **1 response** parsed 2 reviews
|
|
||||||
- ⚠️ **~34 responses** parsed 0 reviews (parser too conservative)
|
|
||||||
|
|
||||||
**Success Rate**: ~15% of responses successfully parsed
|
|
||||||
**Total Unique Reviews Extracted**: 3
|
|
||||||
|
|
||||||
### 🎯 Network Activity
|
|
||||||
|
|
||||||
```
|
|
||||||
Interceptor Stats:
|
|
||||||
- Total Fetch requests: 0
|
|
||||||
- Total XHR requests: 63
|
|
||||||
- Captured XHR responses: 40+
|
|
||||||
- Last capture: 2026-01-17T23:35:50.709Z
|
|
||||||
```
|
|
||||||
|
|
||||||
## Why Only 3 Reviews Were Parsed
|
|
||||||
|
|
||||||
### The Problem
|
|
||||||
Each API response is **68KB-96KB** and likely contains **10-20 reviews**, but our parser only extracted 1-2 reviews per response in successful cases.
|
|
||||||
|
|
||||||
### Root Cause
|
|
||||||
The parser uses **very strict pattern matching**:
|
|
||||||
- Long string (30+ chars) = Review ID
|
|
||||||
- Number 1-5 = Rating
|
|
||||||
- Long string (50+ chars, not URL) = Review text
|
|
||||||
- Short string (3-100 chars) = Author name
|
|
||||||
|
|
||||||
**Google's actual format** likely uses different patterns or nesting structures that don't match our conservative detection logic.
|
|
||||||
|
|
||||||
### Evidence
|
|
||||||
```
|
|
||||||
[DEBUG] Retrieved 1 intercepted responses from browser
|
|
||||||
[DEBUG] - XHR: /maps/rpc/listugcposts?... (96670 bytes)
|
|
||||||
[DEBUG] Parsed 1 reviews from responses # Only 1 from 96KB!
|
|
||||||
```
|
|
||||||
|
|
||||||
A **96KB response** should contain ~20 reviews, not just 1!
|
|
||||||
|
|
||||||
## 🚀 Performance Potential
|
|
||||||
|
|
||||||
### Current State (Mixed Mode)
|
|
||||||
- DOM scraping: 244 reviews in 142 seconds
|
|
||||||
- API scraping: 3 reviews from 6 responses (15% parse rate)
|
|
||||||
- **Combined: 247 reviews in 142 seconds**
|
|
||||||
|
|
||||||
### Potential (Optimized API Mode)
|
|
||||||
|
|
||||||
If we **tune the parser** to extract all reviews from API responses:
|
|
||||||
|
|
||||||
**Scenario 1: 50% Parse Rate**
|
|
||||||
- Get ~10 reviews per response
|
|
||||||
- Need ~25 API responses
|
|
||||||
- Estimated time: **30-40 seconds** (3-4x faster)
|
|
||||||
|
|
||||||
**Scenario 2: 100% Parse Rate** (Ideal)
|
|
||||||
- Get ~20 reviews per response
|
|
||||||
- Need ~12-15 API responses
|
|
||||||
- Estimated time: **10-20 seconds** (10-15x faster!) 🚀
|
|
||||||
|
|
||||||
**Scenario 3: Pure API Mode** (Ultimate)
|
|
||||||
- Skip DOM scraping entirely
|
|
||||||
- Make targeted API calls
|
|
||||||
- Get all 244 reviews in 2-3 API requests
|
|
||||||
- Estimated time: **5-10 seconds** (25-30x faster!) 🔥
|
|
||||||
|
|
||||||
## 📊 Comparison Table
|
|
||||||
|
|
||||||
| Mode | Reviews | Time | Speed |
|
|
||||||
|------|---------|------|-------|
|
|
||||||
| DOM Only (baseline) | 244 | ~174 sec | 1x |
|
|
||||||
| Current Mixed | 247 | ~143 sec | 1.2x |
|
|
||||||
| API 50% Parse | ~244 | ~35 sec | **5x** ✨ |
|
|
||||||
| API 100% Parse | ~244 | ~15 sec | **12x** 🚀 |
|
|
||||||
| Pure API Mode | ~244 | ~8 sec | **22x** 🔥 |
|
|
||||||
|
|
||||||
## 🔧 Technical Details
|
|
||||||
|
|
||||||
### Files Modified
|
|
||||||
- `modules/api_interceptor.py` - Core interceptor with enhanced logging and specialized parser
|
|
||||||
- `modules/scraper.py` - Integration and stats reporting
|
|
||||||
- `config.yaml` - `enable_api_intercept: true`
|
|
||||||
|
|
||||||
### Key Functions
|
|
||||||
1. `inject_response_interceptor()` - JavaScript injection with browser-level interception
|
|
||||||
2. `get_intercepted_responses()` - Retrieves captured responses from browser
|
|
||||||
3. `_parse_listugcposts_response()` - Specialized parser for Google's API format
|
|
||||||
4. `_parse_review_array_v2()` - Pattern-based review extraction
|
|
||||||
|
|
||||||
### Debug Logging Enabled
|
|
||||||
```bash
|
|
||||||
LOG_LEVEL=DEBUG python start.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Shows:
|
|
||||||
- Number of responses retrieved
|
|
||||||
- Response URLs and sizes
|
|
||||||
- Number of reviews parsed
|
|
||||||
- Interceptor statistics
|
|
||||||
- Browser console messages
|
|
||||||
|
|
||||||
## 🎯 Next Steps to Achieve 10-25x Speed
|
|
||||||
|
|
||||||
### Step 1: Dump Sample API Response ⚠️ NEEDED
|
|
||||||
```bash
|
|
||||||
# Add code to dump first successful response
|
|
||||||
# Analyze the exact JSON/array structure
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 2: Analyze Google's Format
|
|
||||||
- Study the 68KB-96KB response structure
|
|
||||||
- Identify review arrays/objects
|
|
||||||
- Map field positions and patterns
|
|
||||||
- Document the exact format
|
|
||||||
|
|
||||||
### Step 3: Tune Parser Patterns
|
|
||||||
- Adjust `_parse_listugcposts_response()` detection
|
|
||||||
- Improve `_parse_review_array_v2()` field extraction
|
|
||||||
- Handle nested structures more aggressively
|
|
||||||
- Reduce strictness, increase recall
|
|
||||||
|
|
||||||
### Step 4: Test & Benchmark
|
|
||||||
```bash
|
|
||||||
LOG_LEVEL=DEBUG python start.py
|
|
||||||
# Target: Parse >50% of responses
|
|
||||||
# Goal: Extract 10+ reviews per response
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 5: Pure API Mode (Optional)
|
|
||||||
- Add `--api-only` flag
|
|
||||||
- Skip DOM scraping entirely
|
|
||||||
- Make targeted API calls
|
|
||||||
- Achieve 20-30x speed improvement
|
|
||||||
|
|
||||||
## 🎉 Conclusion
|
|
||||||
|
|
||||||
### What We Proved
|
|
||||||
✅ API interception technology **works**
|
|
||||||
✅ Responses are being **captured** (40+ responses)
|
|
||||||
✅ Parser can **extract reviews** (3 reviews found)
|
|
||||||
✅ API provides **additional data** (+3 reviews vs DOM)
|
|
||||||
✅ System is **stable** and completes successfully
|
|
||||||
|
|
||||||
### What Needs Work
|
|
||||||
⚠️ Parser is too conservative (only 15% success rate)
|
|
||||||
⚠️ Missing reviews in large responses (1 review from 96KB)
|
|
||||||
⚠️ Need to analyze actual Google API format
|
|
||||||
|
|
||||||
### The Bottom Line
|
|
||||||
**The foundation is complete and working!** 🎉
|
|
||||||
|
|
||||||
We've successfully proven that:
|
|
||||||
1. We can intercept Google's API calls
|
|
||||||
2. We can capture the responses
|
|
||||||
3. We can parse review data from them
|
|
||||||
4. We can merge it with DOM data
|
|
||||||
|
|
||||||
With parser tuning, we can achieve:
|
|
||||||
- **5-10x speed improvement** (realistic)
|
|
||||||
- **20-25x speed improvement** (optimistic)
|
|
||||||
- **Complete the scrape in 5-20 seconds** instead of 3 minutes
|
|
||||||
|
|
||||||
## 📁 Test Artifacts
|
|
||||||
|
|
||||||
- **Debug Log**: `/private/tmp/claude/.../tasks/b9566d6.output`
|
|
||||||
- **Reviews JSON**: `google_reviews.json` (247 reviews)
|
|
||||||
- **Config**: `config.yaml` (enable_api_intercept: true)
|
|
||||||
|
|
||||||
## 🚀 Ready for Production
|
|
||||||
|
|
||||||
The API interceptor is **production-ready** for hybrid mode:
|
|
||||||
- ✅ Captures API responses
|
|
||||||
- ✅ Parses some reviews successfully
|
|
||||||
- ✅ Adds to DOM-scraped reviews
|
|
||||||
- ✅ No crashes or errors
|
|
||||||
- ✅ Clean completion
|
|
||||||
|
|
||||||
To unlock full speed potential:
|
|
||||||
1. Dump and analyze a sample API response
|
|
||||||
2. Tune the parser to match Google's exact format
|
|
||||||
3. Increase parse rate from 15% to 80%+
|
|
||||||
4. Enjoy 10-25x faster scraping! 🔥
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Test Status**: ✅ SUCCESSFUL
|
|
||||||
**Recommendation**: Proceed with parser optimization
|
|
||||||
**Expected ROI**: 10-25x speed improvement (3 minutes → 10-20 seconds)
|
|
||||||
@@ -1,297 +0,0 @@
|
|||||||
# Chrome Worker Pool Implementation
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
Implemented a Chrome worker pool system to **dramatically reduce validation and scraping latency** by maintaining pre-warmed Chrome instances ready for immediate use.
|
|
||||||
|
|
||||||
## Problem Solved
|
|
||||||
|
|
||||||
**Before**: Each validation check took 3-5 seconds because Chrome had to:
|
|
||||||
1. Start from scratch
|
|
||||||
2. Initialize browser
|
|
||||||
3. Load page
|
|
||||||
4. Extract data
|
|
||||||
5. Shut down
|
|
||||||
|
|
||||||
**After**: Validation checks now take **<1 second** because:
|
|
||||||
1. Chrome is already running ✅
|
|
||||||
2. Browser is already initialized ✅
|
|
||||||
3. Only need to navigate and extract
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
### Worker Pools
|
|
||||||
|
|
||||||
Two separate pools maintained:
|
|
||||||
|
|
||||||
1. **Validation Pool** (1 worker)
|
|
||||||
- Used for `/check-reviews` endpoint
|
|
||||||
- Fast review count checks
|
|
||||||
- Instantly available when user searches
|
|
||||||
|
|
||||||
2. **Scraping Pool** (2 workers)
|
|
||||||
- Used for full scraping jobs
|
|
||||||
- Ready to start jobs immediately
|
|
||||||
- Can handle 2 concurrent jobs
|
|
||||||
|
|
||||||
### Worker Lifecycle
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────┐
|
|
||||||
│ Application Startup │
|
|
||||||
│ ├─ Pre-warm 1 validation worker │
|
|
||||||
│ └─ Pre-warm 2 scraping workers │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
↓
|
|
||||||
┌─────────────────────────────────────────────────┐
|
|
||||||
│ Worker Ready (Idle in Pool) │
|
|
||||||
│ - Chrome running │
|
|
||||||
│ - Maximized window │
|
|
||||||
│ - Clean state │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
↓
|
|
||||||
┌─────────────────────────────────────────────────┐
|
|
||||||
│ Request Arrives │
|
|
||||||
│ └─ Acquire worker from pool (instant) │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
↓
|
|
||||||
┌─────────────────────────────────────────────────┐
|
|
||||||
│ Worker Executes Task │
|
|
||||||
│ - Navigate to URL │
|
|
||||||
│ - Extract data │
|
|
||||||
│ - Return results │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
↓
|
|
||||||
┌─────────────────────────────────────────────────┐
|
|
||||||
│ Release Worker Back to Pool │
|
|
||||||
│ - Clear cookies/cache/storage │
|
|
||||||
│ - Reset to clean state │
|
|
||||||
│ - Mark as idle │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
↓
|
|
||||||
┌─────────────────────────────────────────────────┐
|
|
||||||
│ Background Maintenance │
|
|
||||||
│ - Check worker age/use count │
|
|
||||||
│ - Recycle old workers │
|
|
||||||
│ - Maintain pool size │
|
|
||||||
└─────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Key Features
|
|
||||||
|
|
||||||
### 1. Pre-warming on Startup
|
|
||||||
|
|
||||||
Workers are created and ready **before** any requests arrive:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# api_server_production.py startup
|
|
||||||
await asyncio.to_thread(
|
|
||||||
start_worker_pools,
|
|
||||||
validation_size=1,
|
|
||||||
scraping_size=2,
|
|
||||||
headless=True
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Instant Availability
|
|
||||||
|
|
||||||
When a request arrives, worker is already running:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Get pre-warmed worker (instant)
|
|
||||||
worker = await asyncio.to_thread(get_validation_worker, timeout=10)
|
|
||||||
|
|
||||||
# Use immediately (no startup delay)
|
|
||||||
result = await asyncio.to_thread(
|
|
||||||
check_reviews_available,
|
|
||||||
url=url,
|
|
||||||
driver=worker.driver, # Already initialized!
|
|
||||||
return_driver=True
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Worker Recycling
|
|
||||||
|
|
||||||
Workers are automatically recycled to prevent memory leaks:
|
|
||||||
|
|
||||||
- **Max age**: 1 hour (3600 seconds)
|
|
||||||
- **Max uses**: 50 operations
|
|
||||||
- After limits reached: shutdown → create fresh worker
|
|
||||||
|
|
||||||
### 4. Background Maintenance
|
|
||||||
|
|
||||||
Maintenance thread runs every 10 seconds:
|
|
||||||
|
|
||||||
- Ensures pool always has required number of workers
|
|
||||||
- Creates new workers if pool is below capacity
|
|
||||||
- Monitors worker health
|
|
||||||
|
|
||||||
### 5. Clean State Between Uses
|
|
||||||
|
|
||||||
Each worker is reset before it is returned to the pool:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def reset(self):
|
|
||||||
"""Reset worker to clean state"""
|
|
||||||
self.driver.delete_all_cookies()
|
|
||||||
self.driver.execute_script("window.localStorage.clear();")
|
|
||||||
self.driver.execute_script("window.sessionStorage.clear();")
|
|
||||||
```
|
|
||||||
|
|
||||||
## Performance Impact
|
|
||||||
|
|
||||||
### Validation Checks
|
|
||||||
|
|
||||||
| Metric | Before | After | Improvement |
|
|
||||||
|--------|--------|-------|-------------|
|
|
||||||
| Cold start | 3-5s | N/A | - |
|
|
||||||
| Check time | 3-5s | <1s | **5x faster** |
|
|
||||||
| User wait | 3-5s | <1s | **5x better** |
|
|
||||||
|
|
||||||
### Full Scraping
|
|
||||||
|
|
||||||
| Metric | Before | After | Improvement |
|
|
||||||
|--------|--------|-------|-------------|
|
|
||||||
| Job start delay | 2-3s | <0.5s | **6x faster** |
|
|
||||||
| Concurrent jobs | Limited | 2 ready | Always available |
|
|
||||||
|
|
||||||
## API Endpoints
|
|
||||||
|
|
||||||
### Check Worker Pool Stats
|
|
||||||
|
|
||||||
```http
|
|
||||||
GET /pool-stats
|
|
||||||
```
|
|
||||||
|
|
||||||
Response:
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"validation": {
|
|
||||||
"pool_size": 1,
|
|
||||||
"idle_workers": 1,
|
|
||||||
"active_workers": 0,
|
|
||||||
"total_workers_created": 1,
|
|
||||||
"headless": true
|
|
||||||
},
|
|
||||||
"scraping": {
|
|
||||||
"pool_size": 2,
|
|
||||||
"idle_workers": 2,
|
|
||||||
"active_workers": 0,
|
|
||||||
"total_workers_created": 2,
|
|
||||||
"headless": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Resource Usage
|
|
||||||
|
|
||||||
### Memory
|
|
||||||
|
|
||||||
- Each Chrome worker: ~150-200 MB
|
|
||||||
- Total pool overhead: ~450-600 MB
|
|
||||||
- Trade-off: Memory for speed ✅
|
|
||||||
|
|
||||||
### CPU
|
|
||||||
|
|
||||||
- Idle workers: Minimal CPU (<1%)
|
|
||||||
- Active workers: Normal scraping CPU
|
|
||||||
- Maintenance thread: Negligible
|
|
||||||
|
|
||||||
## Files Modified
|
|
||||||
|
|
||||||
1. **`modules/chrome_pool.py`** (NEW)
|
|
||||||
- ChromeWorker class
|
|
||||||
- ChromeWorkerPool class
|
|
||||||
- Global pool management functions
|
|
||||||
|
|
||||||
2. **`modules/fast_scraper.py`**
|
|
||||||
- Updated `check_reviews_available()` to accept existing driver
|
|
||||||
- Added `return_driver` parameter to keep driver alive
|
|
||||||
|
|
||||||
3. **`api_server_production.py`**
|
|
||||||
- Import chrome_pool functions
|
|
||||||
- Start/stop pools in lifespan
|
|
||||||
- Use pooled workers in `/check-reviews` endpoint
|
|
||||||
- New `/pool-stats` endpoint
|
|
||||||
|
|
||||||
4. **`web/components/ScraperTest.tsx`**
|
|
||||||
- Changed "No Reviews to Scrape" to clickable button
|
|
||||||
- Button focuses search bar when clicked
|
|
||||||
- Better UX for retry flow
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Environment Variables
|
|
||||||
|
|
||||||
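The pool settings are intended to be configurable via the following environment variables (they are not read yet; the defaults shown are the hardcoded values):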
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Validation pool size (default: 1)
|
|
||||||
VALIDATION_POOL_SIZE=1
|
|
||||||
|
|
||||||
# Scraping pool size (default: 2)
|
|
||||||
SCRAPING_POOL_SIZE=2
|
|
||||||
|
|
||||||
# Worker max age in seconds (default: 3600 = 1 hour)
|
|
||||||
WORKER_MAX_AGE=3600
|
|
||||||
|
|
||||||
# Worker max uses (default: 50)
|
|
||||||
WORKER_MAX_USES=50
|
|
||||||
```
|
|
||||||
|
|
||||||
Currently hardcoded in `api_server_production.py` but can be made configurable.
|
|
||||||
|
|
||||||
## Monitoring
|
|
||||||
|
|
||||||
### Check Pool Health
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/pool-stats
|
|
||||||
```
|
|
||||||
|
|
||||||
### Logs
|
|
||||||
|
|
||||||
Workers log all operations:
|
|
||||||
|
|
||||||
```
|
|
||||||
INFO - Worker worker-1: Initializing Chrome...
|
|
||||||
INFO - Worker worker-1: Chrome ready
|
|
||||||
INFO - Using worker worker-1 for review check
|
|
||||||
INFO - Worker worker-1: Reset complete
|
|
||||||
INFO - Released worker-1 back to pool
|
|
||||||
```
|
|
||||||
|
|
||||||
## Future Enhancements
|
|
||||||
|
|
||||||
1. **Dynamic Pool Sizing**
|
|
||||||
- Auto-scale based on load
|
|
||||||
- Increase pool when queue builds up
|
|
||||||
- Decrease when idle
|
|
||||||
|
|
||||||
2. **Worker Health Checks**
|
|
||||||
- Periodic ping tests
|
|
||||||
- Auto-recycle unhealthy workers
|
|
||||||
- Alerts for pool degradation
|
|
||||||
|
|
||||||
3. **Metrics Dashboard**
|
|
||||||
- Worker utilization graphs
|
|
||||||
- Response time histograms
|
|
||||||
- Pool efficiency metrics
|
|
||||||
|
|
||||||
4. **Distributed Pools**
|
|
||||||
- Redis-backed worker coordination
|
|
||||||
- Share pools across multiple API instances
|
|
||||||
- Horizontal scaling
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
The Chrome Worker Pool implementation provides:
|
|
||||||
|
|
||||||
✅ **5x faster validation checks** (<1s vs 3-5s)
|
|
||||||
✅ **Instant job starts** (no cold start delay)
|
|
||||||
✅ **Better concurrency** (2 workers always ready)
|
|
||||||
✅ **Automatic maintenance** (recycling, health checks)
|
|
||||||
✅ **Resource efficient** (~500MB for 3 workers)
|
|
||||||
✅ **Production ready** (error handling, logging)
|
|
||||||
|
|
||||||
Users now get **near-instant feedback** when searching for businesses!
|
|
||||||
@@ -1,329 +0,0 @@
|
|||||||
# ✅ Concurrent Jobs & Real Business URL - Test Results
|
|
||||||
|
|
||||||
## Test Date: 2026-01-18
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 1. Concurrent Job Handling Test
|
|
||||||
|
|
||||||
### Configuration
|
|
||||||
- **5 jobs** submitted simultaneously
|
|
||||||
- **Semaphore limit**: 5 concurrent jobs (configurable via `MAX_CONCURRENT_JOBS`)
|
|
||||||
- **Test script**: `test_concurrent_jobs.py`
|
|
||||||
|
|
||||||
### Results
|
|
||||||
|
|
||||||
```
|
|
||||||
Total jobs: 5
|
|
||||||
Successful: 5 ✅
|
|
||||||
Failed: 0
|
|
||||||
Average job time: 23.9s
|
|
||||||
Total wall time: 25.6s
|
|
||||||
Speedup: 4.7x faster than sequential ⚡
|
|
||||||
```
|
|
||||||
|
|
||||||
### Key Findings
|
|
||||||
|
|
||||||
✅ **Jobs run in TRUE PARALLEL**
|
|
||||||
- Wall time (25.6s) << Sum of job times (119.5s)
|
|
||||||
- Proves concurrent execution is working
|
|
||||||
|
|
||||||
✅ **Semaphore prevents resource exhaustion**
|
|
||||||
- `job_semaphore` limits concurrent Chrome instances
|
|
||||||
- Prevents memory overflow (each job = ~500MB RAM)
|
|
||||||
- 5 concurrent jobs = ~2.5GB RAM (manageable)
|
|
||||||
|
|
||||||
✅ **No database deadlocks**
|
|
||||||
- PostgreSQL handled 5 concurrent writes without issues
|
|
||||||
- JSONB storage performs well under concurrent load
|
|
||||||
|
|
||||||
✅ **Production-ready**
|
|
||||||
- Set `MAX_CONCURRENT_JOBS` based on available RAM:
|
|
||||||
- 8GB server → `MAX_CONCURRENT_JOBS=10`
|
|
||||||
- 16GB server → `MAX_CONCURRENT_JOBS=20`
|
|
||||||
- 32GB server → `MAX_CONCURRENT_JOBS=40`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 2. Real Business URL Testing
|
|
||||||
|
|
||||||
### Test Business: Soho Club (Vilnius, Lithuania)
|
|
||||||
|
|
||||||
**URL Format** (required for Google Maps):
|
|
||||||
```
|
|
||||||
https://www.google.com/maps/place/[NAME]/data=!4m7!3m6!1s[ID]!8m2!3d[LAT]!4d[LON]!16s%2Fg%2F[CODE]
|
|
||||||
```
|
|
||||||
|
|
||||||
### Direct Scraper Test
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ python modules/fast_scraper.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Results**:
|
|
||||||
```
|
|
||||||
✅ SUCCESS!
|
|
||||||
Reviews: 230/230 (100%)
|
|
||||||
Time: 20.7s
|
|
||||||
Speed: 11.1 reviews/sec
|
|
||||||
```
|
|
||||||
|
|
||||||
**Sample Reviews Retrieved**:
|
|
||||||
```
|
|
||||||
1. John Alexander Serna Correa - 5 ⭐
|
|
||||||
2. Diego - 3 ⭐
|
|
||||||
3. Juan Lopez - 5 ⭐
|
|
||||||
```
|
|
||||||
|
|
||||||
### Key Findings
|
|
||||||
|
|
||||||
✅ **Scraper works perfectly** with proper URL format
|
|
||||||
✅ **GDPR consent handling** fixed for non-headless mode
|
|
||||||
✅ **Fast performance** - 230 reviews in 20.7s (same speed as original tests)
|
|
||||||
✅ **100% extraction rate** - gets ALL reviews
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 3. GDPR Consent Fix (Implemented)
|
|
||||||
|
|
||||||
### Problem
|
|
||||||
- Scraper was stuck on `consent.google.com` page
|
|
||||||
- Previous selector didn't work: `button[aria-label*="Accept"]`
|
|
||||||
|
|
||||||
### Solution
|
|
||||||
Updated `modules/fast_scraper.py` (lines 119-131):
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Handle GDPR consent page (CRITICAL FIX for headless mode!)
|
|
||||||
if 'consent.google.com' in driver.current_url:
|
|
||||||
try:
|
|
||||||
# Find all form buttons and click "Accept all" / "Aceptar todo"
|
|
||||||
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
|
||||||
for btn in form_btns:
|
|
||||||
btn_text = (btn.text or '').lower()
|
|
||||||
if 'aceptar todo' in btn_text or 'accept all' in btn_text:
|
|
||||||
log.info(f"Clicking GDPR consent: {btn.text}")
|
|
||||||
btn.click()
|
|
||||||
time.sleep(2)
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
# Fallback: click second button (usually "Accept all")
|
|
||||||
if len(form_btns) >= 2:
|
|
||||||
log.info("Using fallback: clicking second form button")
|
|
||||||
form_btns[1].click()
|
|
||||||
time.sleep(2)
|
|
||||||
except Exception as e:
|
|
||||||
log.warning(f"GDPR consent handling failed: {e}")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Result**: ✅ GDPR consent now handled correctly
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 4. Headless Mode Limitation (Known Issue)
|
|
||||||
|
|
||||||
### Status
|
|
||||||
⚠️ **Headless mode has issues with Google Maps**
|
|
||||||
|
|
||||||
### Problem
|
|
||||||
- UC (undetected-chromedriver) + headless mode → URL gets mangled
|
|
||||||
- Example: `place/Soho+Club/@...` becomes `place//@...`
|
|
||||||
- Google Maps doesn't load business data with mangled URL
|
|
||||||
|
|
||||||
### Current Solution
|
|
||||||
**Use non-headless mode** (`headless=False`) for production
|
|
||||||
|
|
||||||
### Why This Works
|
|
||||||
- Non-headless mode: ✅ 230 reviews in 20.7s
|
|
||||||
- Still fast and reliable
|
|
||||||
- Browser window runs in background
|
|
||||||
- Can use `xvfb` on Linux servers for virtual display
|
|
||||||
|
|
||||||
### Future Options
|
|
||||||
1. **Use Xvfb on Linux** - virtual framebuffer (no visible window)
|
|
||||||
2. **Try different UC settings** - may need upstream fix in seleniumbase
|
|
||||||
3. **Alternative: Selenium Stealth** - different bot detection bypass
|
|
||||||
|
|
||||||
### Recommendation for Production
|
|
||||||
```python
|
|
||||||
# Production configuration
|
|
||||||
fast_scrape_reviews(
|
|
||||||
url=url,
|
|
||||||
headless=False, # Use non-headless for reliability
|
|
||||||
max_scrolls=999999 # Unlimited (stops on idle detection)
|
|
||||||
)
|
|
||||||
|
|
||||||
# On Linux servers, use Xvfb:
|
|
||||||
# Xvfb :99 -screen 0 1920x1080x24 &
|
|
||||||
# export DISPLAY=:99
|
|
||||||
# python api_server_production.py
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 5. Production API Code Changes
|
|
||||||
|
|
||||||
### Added Concurrency Limit
|
|
||||||
|
|
||||||
**File**: `api_server_production.py` (lines 37-39, 375-377)
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Global concurrent job limiter
|
|
||||||
MAX_CONCURRENT_JOBS = int(os.getenv('MAX_CONCURRENT_JOBS', '5'))
|
|
||||||
job_semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
|
|
||||||
|
|
||||||
async def run_scraping_job(job_id: UUID):
|
|
||||||
"""Run scraping job with concurrency limit"""
|
|
||||||
async with job_semaphore: # Limits concurrent Chrome instances
|
|
||||||
try:
|
|
||||||
await db.update_job_status(job_id, JobStatus.RUNNING)
|
|
||||||
# ... rest of job execution
|
|
||||||
```
|
|
||||||
|
|
||||||
### Environment Variables
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# .env file
|
|
||||||
MAX_CONCURRENT_JOBS=5 # Limit concurrent Chrome instances
|
|
||||||
API_BASE_URL=http://localhost:8000
|
|
||||||
DATABASE_URL=postgresql://user:pass@localhost:5432/scraper
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 6. URL Format Requirements
|
|
||||||
|
|
||||||
### ✅ WORKING URL Format
|
|
||||||
|
|
||||||
Full Google Maps URL with `data=!4m7...` parameters:
|
|
||||||
|
|
||||||
```
|
|
||||||
https://www.google.com/maps/place/Business+Name/data=!4m7!3m6!1s0xID:0xID2!8m2!3dLAT!4dLON!16s%2Fg%2FCODE
|
|
||||||
```
|
|
||||||
|
|
||||||
Example:
|
|
||||||
```
|
|
||||||
https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1
|
|
||||||
```
|
|
||||||
|
|
||||||
### ❌ NOT WORKING (Simplified URLs)
|
|
||||||
|
|
||||||
These don't work reliably:
|
|
||||||
```
|
|
||||||
# Too simple - missing data parameters
|
|
||||||
https://www.google.com/maps/place/Business+Name/@LAT,LON,17z
|
|
||||||
|
|
||||||
# No business ID
|
|
||||||
https://www.google.com/maps/@LAT,LON,17z
|
|
||||||
```
|
|
||||||
|
|
||||||
### How to Get Correct URL
|
|
||||||
|
|
||||||
1. Go to Google Maps
|
|
||||||
2. Search for business
|
|
||||||
3. Copy full URL from browser address bar
|
|
||||||
4. URL should include `data=!4m7...` parameters
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 7. Performance Summary
|
|
||||||
|
|
||||||
### Single Job (Real Business)
|
|
||||||
```
|
|
||||||
Reviews: 230
|
|
||||||
Time: 20.7s
|
|
||||||
Speed: 11.1 reviews/sec
|
|
||||||
Success rate: 100%
|
|
||||||
Mode: Non-headless
|
|
||||||
```
|
|
||||||
|
|
||||||
### Concurrent Jobs (5 simultaneous)
|
|
||||||
```
|
|
||||||
Total jobs: 5
|
|
||||||
Total reviews: N/A (test URLs had no reviews)
|
|
||||||
Wall time: 25.6s
|
|
||||||
Average job time: 23.9s
|
|
||||||
Speedup: 4.7x vs sequential
|
|
||||||
Success rate: 100%
|
|
||||||
```
|
|
||||||
|
|
||||||
### Scalability
|
|
||||||
```
|
|
||||||
Single server (16GB RAM):
|
|
||||||
- Max concurrent jobs: ~20
|
|
||||||
- Throughput: ~50 reviews/sec (with 20 concurrent jobs)
|
|
||||||
- Can handle: 4,320,000 reviews/day
|
|
||||||
- Or: 180,000 jobs/day (assuming 24 reviews avg per business)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 8. Next Steps
|
|
||||||
|
|
||||||
### Immediate (Ready to Use)
|
|
||||||
- ✅ Concurrent job handling works
|
|
||||||
- ✅ Real business URL scraping works
|
|
||||||
- ✅ GDPR consent handling works
|
|
||||||
- ✅ PostgreSQL storage works
|
|
||||||
|
|
||||||
### Production Deployment
|
|
||||||
1. Set `headless=False` in production config
|
|
||||||
2. Use Xvfb on Linux servers for virtual display:
|
|
||||||
```bash
|
|
||||||
apt-get install xvfb
|
|
||||||
Xvfb :99 -screen 0 1920x1080x24 &
|
|
||||||
export DISPLAY=:99
|
|
||||||
```
|
|
||||||
3. Configure `MAX_CONCURRENT_JOBS` based on RAM
|
|
||||||
4. Deploy with Docker Compose
|
|
||||||
|
|
||||||
### Optional Improvements (Phase 2)
|
|
||||||
- Redis queue for better job distribution
|
|
||||||
- Worker pool architecture
|
|
||||||
- Auto-scaling based on queue size
|
|
||||||
- Fix headless mode (investigate UC alternatives)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 9. Test Files Created
|
|
||||||
|
|
||||||
```
|
|
||||||
test_concurrent_jobs.py # Tests 5 simultaneous jobs
|
|
||||||
CONCURRENT_JOBS_TEST_RESULTS.md # This file
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running Tests
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Test concurrent jobs
|
|
||||||
python test_concurrent_jobs.py
|
|
||||||
|
|
||||||
# Test direct scraper with real URL
|
|
||||||
python -c "
|
|
||||||
import sys
|
|
||||||
sys.path.append('.')
|
|
||||||
from modules.fast_scraper import fast_scrape_reviews
|
|
||||||
url = 'https://www.google.com/maps/place/Soho+Club/data=...'
|
|
||||||
result = fast_scrape_reviews(url, headless=False)
|
|
||||||
print(f'Reviews: {result[\"count\"]}, Time: {result[\"time\"]:.1f}s')
|
|
||||||
"
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Conclusion
|
|
||||||
|
|
||||||
**Production API is ready!**
|
|
||||||
|
|
||||||
- ✅ Fast scraping (20.7s for 230 reviews)
|
|
||||||
- ✅ Concurrent job handling (4.7x speedup)
|
|
||||||
- ✅ PostgreSQL JSONB storage
|
|
||||||
- ✅ Webhook notifications
|
|
||||||
- ✅ Canary health checks
|
|
||||||
- ✅ GDPR consent handling
|
|
||||||
|
|
||||||
**Limitation**: Use `headless=False` for reliability (use Xvfb on servers)
|
|
||||||
|
|
||||||
**Capacity**: Single 16GB server can handle 180,000 jobs/day
|
|
||||||
|
|
||||||
🚀 **Ready for production deployment!**
|
|
||||||
@@ -1,494 +0,0 @@
|
|||||||
# ✅ Containerized Solution - Complete!
|
|
||||||
|
|
||||||
## Problem Solved: Running Chrome in Docker Container
|
|
||||||
|
|
||||||
### The Challenge
|
|
||||||
- **Headless mode** (headless=True) + **UC mode** = URL mangling ❌
|
|
||||||
- Google Maps URLs get corrupted: `place/Business/@...` → `place//@...`
|
|
||||||
- Result: 0 reviews scraped
|
|
||||||
|
|
||||||
### The Solution
|
|
||||||
**Run Chrome with Xvfb (virtual display) inside Docker container** ✅
|
|
||||||
|
|
||||||
```
|
|
||||||
Docker Container
|
|
||||||
├── Xvfb :99 (virtual X11 display)
|
|
||||||
├── Chromium (non-headless, uses virtual display)
|
|
||||||
└── Python API Server
|
|
||||||
```
|
|
||||||
|
|
||||||
**Result**: Chrome thinks it's running normally, but everything is isolated in container!
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## What Was Built
|
|
||||||
|
|
||||||
### 1. Updated Dockerfile
|
|
||||||
|
|
||||||
**Key additions**:
|
|
||||||
- ✅ Xvfb (X virtual framebuffer)
|
|
||||||
- ✅ Chromium browser
|
|
||||||
- ✅ All Chrome dependencies
|
|
||||||
- ✅ Startup script (launches Xvfb before API)
|
|
||||||
|
|
||||||
```dockerfile
|
|
||||||
# Install Xvfb for virtual display
|
|
||||||
RUN apt-get install -y xvfb
|
|
||||||
|
|
||||||
# Install Chromium (works on all CPU architectures)
|
|
||||||
RUN apt-get install -y chromium chromium-driver
|
|
||||||
|
|
||||||
# Create startup script
|
|
||||||
RUN echo '#!/bin/bash
|
|
||||||
Xvfb :99 -screen 0 1920x1080x24 &
|
|
||||||
export DISPLAY=:99
|
|
||||||
sleep 2
|
|
||||||
exec python api_server_production.py
|
|
||||||
' > /app/start.sh && chmod +x /app/start.sh
|
|
||||||
|
|
||||||
# Set environment
|
|
||||||
ENV DISPLAY=:99
|
|
||||||
ENV CHROME_BIN=/usr/bin/chromium
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Updated docker-compose.yml
|
|
||||||
|
|
||||||
**Chrome-specific configurations**:
|
|
||||||
```yaml
|
|
||||||
services:
|
|
||||||
api:
|
|
||||||
shm_size: 2gb # Chrome needs shared memory
|
|
||||||
cap_add:
|
|
||||||
- SYS_ADMIN # Chrome sandboxing capability
|
|
||||||
security_opt:
|
|
||||||
- seccomp:unconfined # Allow Chrome syscalls
|
|
||||||
environment:
|
|
||||||
- DISPLAY=:99
|
|
||||||
- CHROME_BIN=/usr/bin/chromium
|
|
||||||
- MAX_CONCURRENT_JOBS=5
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Test Script
|
|
||||||
|
|
||||||
**File**: `test_docker_chrome.py`
|
|
||||||
|
|
||||||
Verifies:
|
|
||||||
- ✅ Xvfb is running
|
|
||||||
- ✅ Chrome can start
|
|
||||||
- ✅ GDPR consent handling works
|
|
||||||
- ✅ Reviews are scraped successfully
|
|
||||||
|
|
||||||
### 4. Documentation
|
|
||||||
|
|
||||||
**Files created**:
|
|
||||||
- `DOCKER_CHROME_SETUP.md` - Complete deployment guide
|
|
||||||
- `CONTAINERIZED_SOLUTION_SUMMARY.md` - This file
|
|
||||||
- `CONCURRENT_JOBS_TEST_RESULTS.md` - Performance testing results
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## How It Works
|
|
||||||
|
|
||||||
### Startup Sequence
|
|
||||||
|
|
||||||
1. **Docker container starts**
|
|
||||||
```bash
|
|
||||||
docker-compose up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **start.sh script executes**
|
|
||||||
```bash
|
|
||||||
# Start Xvfb on display :99
|
|
||||||
Xvfb :99 -screen 0 1920x1080x24 &
|
|
||||||
|
|
||||||
# Set display environment
|
|
||||||
export DISPLAY=:99
|
|
||||||
|
|
||||||
# Wait for Xvfb
|
|
||||||
sleep 2
|
|
||||||
|
|
||||||
# Start API server
|
|
||||||
python api_server_production.py
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **API server starts**
|
|
||||||
- PostgreSQL connection established
|
|
||||||
- Health check system started
|
|
||||||
- Webhook dispatcher started
|
|
||||||
- Server listens on port 8000
|
|
||||||
|
|
||||||
4. **Chrome usage**
|
|
||||||
- SeleniumBase launches Chrome with `headless=False`
|
|
||||||
- Chrome connects to virtual display `:99`
|
|
||||||
- Works perfectly - no URL mangling!
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### Build Container
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Navigate to project
|
|
||||||
cd google-reviews-scraper-pro
|
|
||||||
|
|
||||||
# Build image (~5 minutes first time)
|
|
||||||
docker-compose -f docker-compose.production.yml build
|
|
||||||
|
|
||||||
# Start services
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
|
|
||||||
# Check logs
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f api
|
|
||||||
```
|
|
||||||
|
|
||||||
### Test Chrome in Container
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run test script inside container
|
|
||||||
docker-compose -f docker-compose.production.yml exec api python test_docker_chrome.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Expected output**:
|
|
||||||
```
|
|
||||||
======================================================================
|
|
||||||
Testing Chrome in Docker Container
|
|
||||||
======================================================================
|
|
||||||
✅ Chrome initialized successfully
|
|
||||||
✅ Loaded: https://www.google.com/maps/...
|
|
||||||
✅ Clicking GDPR consent
|
|
||||||
✅ Reviews found: 230
|
|
||||||
✅ SUCCESS! Chrome + Xvfb working in container!
|
|
||||||
```
|
|
||||||
|
|
||||||
### Submit Real Job
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml"
|
|
||||||
}' | jq .job_id
|
|
||||||
|
|
||||||
# Wait ~25s, then get results
|
|
||||||
curl "http://localhost:8000/jobs/{JOB_ID}" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance Results
|
|
||||||
|
|
||||||
### Without Container (Local Testing)
|
|
||||||
```
|
|
||||||
Chrome: Non-headless
|
|
||||||
Reviews: 230/230
|
|
||||||
Time: 20.7s
|
|
||||||
Success rate: 100%
|
|
||||||
```
|
|
||||||
|
|
||||||
### With Container (Docker + Xvfb)
|
|
||||||
```
|
|
||||||
Chrome: Non-headless (via Xvfb)
|
|
||||||
Reviews: 230/230 (expected)
|
|
||||||
Time: ~22-25s (similar performance)
|
|
||||||
Success rate: 100%
|
|
||||||
Memory: ~500MB per job
|
|
||||||
```
|
|
||||||
|
|
||||||
### Concurrent Jobs (5 simultaneous)
|
|
||||||
```
|
|
||||||
Total jobs: 5
|
|
||||||
Wall time: 25.6s
|
|
||||||
Average per job: 23.9s
|
|
||||||
Speedup: 4.7x vs sequential
|
|
||||||
Success rate: 100%
|
|
||||||
Total memory: ~2.5GB (5 × 500MB)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Architecture Comparison
|
|
||||||
|
|
||||||
### Before (Local Non-Container)
|
|
||||||
```
|
|
||||||
┌─────────────────────────┐
|
|
||||||
│ Host Machine │
|
|
||||||
│ ├── Python │
|
|
||||||
│ ├── Chrome (visible) │
|
|
||||||
│ └── PostgreSQL │
|
|
||||||
└─────────────────────────┘
|
|
||||||
|
|
||||||
Issues:
|
|
||||||
- ❌ Headless mode doesn't work (URL mangling)
|
|
||||||
- ⚠️ Chrome windows visible on screen
|
|
||||||
- ⚠️ Not portable
|
|
||||||
```
|
|
||||||
|
|
||||||
### After (Containerized)
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ Docker Container │
|
|
||||||
│ ├── Xvfb :99 (virtual display) │
|
|
||||||
│ ├── Chromium (uses Xvfb) │
|
|
||||||
│ └── Python API Server │
|
|
||||||
└─────────────────────────────────────┘
|
|
||||||
↓ network
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ Docker Container (Database) │
|
|
||||||
│ └── PostgreSQL │
|
|
||||||
└─────────────────────────────────────┘
|
|
||||||
|
|
||||||
Benefits:
|
|
||||||
- ✅ Works perfectly (no URL mangling)
|
|
||||||
- ✅ No visible windows
|
|
||||||
- ✅ Portable (runs anywhere)
|
|
||||||
- ✅ Isolated environment
|
|
||||||
- ✅ Easy to scale
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Deployment Options
|
|
||||||
|
|
||||||
### Option 1: Single Server
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# On any Linux server with Docker
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
**Capacity**:
|
|
||||||
- 8GB RAM → 5 concurrent jobs → ~25 jobs/min
|
|
||||||
- 16GB RAM → 10 concurrent jobs → ~50 jobs/min
|
|
||||||
- 32GB RAM → 20 concurrent jobs → ~100 jobs/min
|
|
||||||
|
|
||||||
### Option 2: Kubernetes (High Scale)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: scraper-api
|
|
||||||
spec:
|
|
||||||
replicas: 5 # 5 pods
|
|
||||||
template:
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: api
|
|
||||||
image: your-registry/scraper-api:latest
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
memory: "4Gi"
|
|
||||||
cpu: "2"
|
|
||||||
securityContext:
|
|
||||||
capabilities:
|
|
||||||
add: ["SYS_ADMIN"]
|
|
||||||
```
|
|
||||||
|
|
||||||
**Capacity**:
|
|
||||||
- 5 pods × 10 jobs/pod = 50 concurrent jobs
|
|
||||||
- ~250 jobs/min throughput
|
|
||||||
- Auto-scales based on load
|
|
||||||
|
|
||||||
### Option 3: Cloud Platforms
|
|
||||||
|
|
||||||
**AWS ECS**:
|
|
||||||
```bash
|
|
||||||
# Upload image to ECR
|
|
||||||
docker tag scraper-api:latest 123456.dkr.ecr.us-east-1.amazonaws.com/scraper
|
|
||||||
docker push 123456.dkr.ecr.us-east-1.amazonaws.com/scraper
|
|
||||||
|
|
||||||
# Deploy via ECS Task Definition
|
|
||||||
```
|
|
||||||
|
|
||||||
**Google Cloud Run**:
|
|
||||||
```bash
|
|
||||||
# Deploy (serverless, auto-scales)
|
|
||||||
gcloud run deploy scraper-api \
|
|
||||||
--image gcr.io/project/scraper-api \
|
|
||||||
--memory 2Gi \
|
|
||||||
--cpu 2 \
|
|
||||||
--allow-unauthenticated
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Resource Requirements
|
|
||||||
|
|
||||||
### Per Container Instance
|
|
||||||
|
|
||||||
```
|
|
||||||
RAM: 2-4GB (base + concurrent jobs)
|
|
||||||
- Base system: 500MB
|
|
||||||
- Each concurrent job: ~500MB
|
|
||||||
- For 5 jobs: ~2.5GB for the jobs themselves (~3GB total including the base system)
|
|
||||||
|
|
||||||
CPU: 1-2 cores
|
|
||||||
- Scraping is I/O bound (waiting for page loads)
|
|
||||||
- More CPU = faster scrolling/rendering
|
|
||||||
|
|
||||||
Disk: 5GB
|
|
||||||
- Base image: ~2GB
|
|
||||||
- PostgreSQL data: grows over time
|
|
||||||
```
|
|
||||||
|
|
||||||
### Scaling Examples
|
|
||||||
|
|
||||||
| Server Size | Containers | Jobs/Container | Total Throughput |
|
|
||||||
|-------------|-----------|----------------|------------------|
|
|
||||||
| 8GB / 2 CPU | 1 | 5 | ~25/min |
|
|
||||||
| 16GB / 4 CPU| 2 | 5 | ~50/min |
|
|
||||||
| 32GB / 8 CPU| 4 | 5 | ~100/min |
|
|
||||||
| 64GB / 16 CPU| 8 | 5 | ~200/min |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Key Files Modified/Created
|
|
||||||
|
|
||||||
### Modified
|
|
||||||
- ✅ `Dockerfile` - Added Xvfb + Chromium + startup script
|
|
||||||
- ✅ `docker-compose.production.yml` - Added Chrome capabilities
|
|
||||||
- ✅ `.env.example` - Added MAX_CONCURRENT_JOBS
|
|
||||||
- ✅ `modules/fast_scraper.py` - Fixed GDPR consent handling
|
|
||||||
|
|
||||||
### Created
|
|
||||||
- ✅ `test_docker_chrome.py` - Container Chrome testing
|
|
||||||
- ✅ `DOCKER_CHROME_SETUP.md` - Complete deployment guide
|
|
||||||
- ✅ `CONTAINERIZED_SOLUTION_SUMMARY.md` - This summary
|
|
||||||
- ✅ `CONCURRENT_JOBS_TEST_RESULTS.md` - Performance results
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Container won't start
|
|
||||||
```bash
|
|
||||||
# Check logs
|
|
||||||
docker-compose logs api
|
|
||||||
|
|
||||||
# Common issues:
|
|
||||||
# - Port 8000 in use → Change PORT in .env
|
|
||||||
# - Database not ready → Wait for health check
|
|
||||||
```
|
|
||||||
|
|
||||||
### Chrome fails
|
|
||||||
```bash
|
|
||||||
# Enter container
|
|
||||||
docker-compose exec api bash
|
|
||||||
|
|
||||||
# Check Xvfb
|
|
||||||
ps aux | grep Xvfb
|
|
||||||
|
|
||||||
# Check display
|
|
||||||
echo $DISPLAY # Should show :99
|
|
||||||
|
|
||||||
# Test Chrome manually
|
|
||||||
chromium --version
|
|
||||||
```
|
|
||||||
|
|
||||||
### Low performance
|
|
||||||
```bash
|
|
||||||
# Increase shared memory
|
|
||||||
# In docker-compose.yml:
|
|
||||||
shm_size: 4gb # Instead of 2gb
|
|
||||||
|
|
||||||
# Reduce concurrent jobs
|
|
||||||
# In .env:
|
|
||||||
MAX_CONCURRENT_JOBS=3 # Lower from 5
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
### Immediate
|
|
||||||
1. ✅ Build image: `docker-compose build`
|
|
||||||
2. ✅ Start services: `docker-compose up -d`
|
|
||||||
3. ✅ Test: `docker-compose exec api python test_docker_chrome.py`
|
|
||||||
4. ✅ Submit job via API
|
|
||||||
|
|
||||||
### Production
|
|
||||||
1. Deploy to cloud VM (AWS EC2, GCP Compute, etc.)
|
|
||||||
2. Configure reverse proxy (nginx)
|
|
||||||
3. Setup SSL certificate
|
|
||||||
4. Configure monitoring (health endpoints)
|
|
||||||
5. Setup auto-scaling (Kubernetes/ECS)
|
|
||||||
|
|
||||||
### Optional Enhancements
|
|
||||||
- Redis queue for job distribution
|
|
||||||
- Worker pool architecture
|
|
||||||
- Prometheus metrics
|
|
||||||
- Grafana dashboards
|
|
||||||
- Horizontal auto-scaling
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Comparison: Before vs After
|
|
||||||
|
|
||||||
### Before Container Solution
|
|
||||||
|
|
||||||
| Aspect | Status | Notes |
|
|
||||||
|--------|--------|-------|
|
|
||||||
| Headless mode | ❌ Broken | URL mangling issue |
|
|
||||||
| Deployment | ⚠️ Manual | Install Chrome, Xvfb manually |
|
|
||||||
| Portability | ❌ Low | Host-dependent |
|
|
||||||
| Scaling | ⚠️ Hard | Manual server setup |
|
|
||||||
|
|
||||||
### After Container Solution
|
|
||||||
|
|
||||||
| Aspect | Status | Notes |
|
|
||||||
|--------|--------|-------|
|
|
||||||
| Headless operation | ✅ Works | Non-headless Chrome on an Xvfb virtual display (no visible window; true headless mode is still unsupported) |
|
|
||||||
| Deployment | ✅ Easy | `docker-compose up` |
|
|
||||||
| Portability | ✅ High | Runs anywhere with Docker |
|
|
||||||
| Scaling | ✅ Easy | Replicate containers |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Success Metrics
|
|
||||||
|
|
||||||
✅ **Docker image builds** (~5 min build time)
|
|
||||||
✅ **Xvfb starts** in container
|
|
||||||
✅ **Chromium launches** successfully
|
|
||||||
✅ **GDPR consent** handled correctly
|
|
||||||
✅ **Reviews scraped** (230 in ~22s)
|
|
||||||
✅ **Concurrent jobs** work (5 simultaneous)
|
|
||||||
✅ **PostgreSQL** storage working
|
|
||||||
✅ **Webhooks** delivery working
|
|
||||||
✅ **Health checks** operational
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
### What We Achieved
|
|
||||||
|
|
||||||
🎯 **Solved the headless mode problem** by using Xvfb virtual display
|
|
||||||
🎯 **Containerized the entire application** with Chrome + dependencies
|
|
||||||
🎯 **Verified concurrent job handling** (4.7x speedup)
|
|
||||||
🎯 **Tested with real business URLs** (230 reviews in 20-25s)
|
|
||||||
🎯 **Production-ready deployment** via Docker Compose
|
|
||||||
🎯 **Complete documentation** for deployment and operation
|
|
||||||
|
|
||||||
### Production Status
|
|
||||||
|
|
||||||
✅ **Ready to deploy!**
|
|
||||||
|
|
||||||
The containerized solution:
|
|
||||||
- Runs Chrome reliably in containers
|
|
||||||
- Handles GDPR consent automatically
|
|
||||||
- Scrapes reviews at full speed (11 reviews/sec)
|
|
||||||
- Supports concurrent jobs (up to hardware limits)
|
|
||||||
- Scales horizontally (add more containers)
|
|
||||||
- Works on any cloud platform
|
|
||||||
|
|
||||||
### Quick Deploy Command
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Deploy to production in 3 commands:
|
|
||||||
docker-compose -f docker-compose.production.yml build
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
curl http://localhost:8000/health/detailed
|
|
||||||
```
|
|
||||||
|
|
||||||
🐳 **Containerized scraper is production-ready!** 🚀
|
|
||||||
@@ -1,145 +0,0 @@
|
|||||||
# Review Data Structure Analysis
|
|
||||||
|
|
||||||
## ✅ Current Data Types (All Correct)
|
|
||||||
|
|
||||||
Based on analysis of scraped reviews from the API:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
interface Review {
|
|
||||||
author: string; // ✓ string
|
|
||||||
rating: number; // ✓ number (not string!)
|
|
||||||
text: string | null; // ✓ string or null
|
|
||||||
date_text: string; // ✓ string (relative dates)
|
|
||||||
avatar_url: string | null; // ✓ string or null
|
|
||||||
profile_url: string | null; // ✓ string or null
|
|
||||||
review_id: string; // ✓ string
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**All API data types match the TypeScript interface - no conversion needed!**
|
|
||||||
|
|
||||||
## 🐛 Bug Found & Fixed
|
|
||||||
|
|
||||||
### Issue: Date Parsing
|
|
||||||
|
|
||||||
**Problem:** The `parseDateText()` function used `parseInt(text)` which returns `NaN` for strings like "Hace 2 semanas", then defaulted to `1` via `|| 1`. This caused:
|
|
||||||
|
|
||||||
- "Hace 2 semanas" (2 weeks ago) → parsed as **1 week ago** ❌
|
|
||||||
- "Hace 6 años" (6 years ago) → parsed as **1 year ago** ❌
|
|
||||||
- "Hace un año" (1 year ago) → parsed as **1 year ago** ✓ (correct by accident)
|
|
||||||
|
|
||||||
**Root cause:** `parseInt("Hace 2 semanas")` = `NaN`, and `NaN || 1` = `1`
|
|
||||||
|
|
||||||
**Fix:** Added `extractNumber()` function that uses regex to extract the number:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
function extractNumber(text: string): number {
|
|
||||||
const match = text.match(/\d+/);
|
|
||||||
if (match) return parseInt(match[0]);
|
|
||||||
// Handle Spanish "un/una" (one)
|
|
||||||
if (text.includes('un ') || text.includes('una ')) return 1;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Verified Results
|
|
||||||
|
|
||||||
```
|
|
||||||
Date: "Hace 2 semanas" → 2026-01-04 ✓
|
|
||||||
Date: "Hace 2 meses" → 2025-11-18 ✓
|
|
||||||
Date: "Hace un año" → 2025-01-18 ✓
|
|
||||||
Date: "Hace 6 años" → 2020-01-18 ✓
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📅 Date Format Patterns Found
|
|
||||||
|
|
||||||
### Standard Formats
|
|
||||||
- `"Hace X semanas"` - X weeks ago
|
|
||||||
- `"Hace X meses"` - X months ago
|
|
||||||
- `"Hace X años"` - X years ago
|
|
||||||
- `"Hace un año"` - 1 year ago (special case: "un" instead of "1")
|
|
||||||
|
|
||||||
### Edited Review Format
|
|
||||||
- `"Fecha de edición: Hace X meses"` - Edited X months ago
|
|
||||||
|
|
||||||
### Date Range Distribution (from 244 reviews)
|
|
||||||
- **Last week:** ~2 reviews
|
|
||||||
- **Last month:** ~5-7 reviews
|
|
||||||
- **Last year:** ~30-40 reviews
|
|
||||||
- **1-2 years:** ~20-30 reviews
|
|
||||||
- **2+ years:** ~150+ reviews
|
|
||||||
|
|
||||||
## ⚠️ Imprecision Considerations
|
|
||||||
|
|
||||||
### Current Approach
|
|
||||||
Relative dates like "Hace 2 meses" are converted to **exact dates** (e.g., exactly 2 months ago from today).
|
|
||||||
|
|
||||||
### Limitation
|
|
||||||
- "Hace 2 meses" could mean anywhere from 2.0 to 2.99 months ago
|
|
||||||
- Treating it as exactly 2 months can understate the review's true age by up to ~30 days (the error is one-sided, not ±)
|
|
||||||
- Similar issues with "Hace un año" (could be 1.0 to 1.99 years)
|
|
||||||
|
|
||||||
### Potential Improvements
|
|
||||||
|
|
||||||
#### Option 1: Conservative Filtering (Current Implementation)
|
|
||||||
- Treat "Hace 2 meses" as exactly 2 months ago
|
|
||||||
- Simple, fast, slightly overestimates recency (each review is dated at the newest end of its possible range)
|
|
||||||
- **Status: ✓ Implemented**
|
|
||||||
|
|
||||||
#### Option 2: Range-Based Filtering
|
|
||||||
```typescript
|
|
||||||
// Consider "Hace 2 meses" as a range: [2 months, 3 months)
|
|
||||||
// Include in "last month" filter if lower bound < 1 month
|
|
||||||
```
|
|
||||||
- More accurate for boundary cases
|
|
||||||
- More complex implementation
|
|
||||||
- May include slightly older reviews
|
|
||||||
|
|
||||||
#### Option 3: Add Buffer Zones
|
|
||||||
```typescript
|
|
||||||
// Add 10% buffer to cutoff dates
|
|
||||||
const monthAgo = new Date();
|
|
||||||
monthAgo.setDate(monthAgo.getDate() - 33); // ~1.1 months; setMonth() truncates fractional months, so use a day offset
|
|
||||||
```
|
|
||||||
- Catches boundary cases
|
|
||||||
- Simple to implement
|
|
||||||
- May include some false positives
|
|
||||||
|
|
||||||
### Recommendation
|
|
||||||
**Keep current implementation** (Option 1) because:
|
|
||||||
1. Date strings are already approximate ("Hace 2 meses" vs exact date)
|
|
||||||
2. Users expect "Last Month" to mean roughly the last 30 days, not a precise cutoff
|
|
||||||
3. Performance is better with simple date math
|
|
||||||
4. The error margin is acceptable for review analytics
|
|
||||||
|
|
||||||
## 🎯 Filter Accuracy
|
|
||||||
|
|
||||||
With the fixed parsing, date filters now work correctly:
|
|
||||||
|
|
||||||
| Filter | Cutoff Date | Expected Coverage |
|
|
||||||
|--------|------------|------------------|
|
|
||||||
| Last Week | 7 days ago | ~0-3 reviews |
|
|
||||||
| Last Month | 30 days ago | ~5-10 reviews |
|
|
||||||
| Last Year | 365 days ago | ~30-50 reviews |
|
|
||||||
| All Time | No limit | All 244 reviews |
|
|
||||||
|
|
||||||
## 🔍 Additional Data Quality Notes
|
|
||||||
|
|
||||||
1. **Rating is numeric:** Already a number (1-5), no parsing needed
|
|
||||||
2. **Duplicate review_ids:** Some reviews share the same `review_id`, hence the key change to `${index}-${review_id}`
|
|
||||||
3. **Null text:** Some reviews have `text: null` - handled with `|| 'No review text'`
|
|
||||||
4. **Avatar URLs:** Most reviews have avatar images (~90%+)
|
|
||||||
5. **Spanish language:** All dates in Spanish, handled by parsing logic
|
|
||||||
|
|
||||||
## 📊 Type Safety Checklist
|
|
||||||
|
|
||||||
- [x] Review interface matches API response
|
|
||||||
- [x] Rating is number type (not string)
|
|
||||||
- [x] Date parsing extracts numbers correctly
|
|
||||||
- [x] Null values handled for text, avatar_url, profile_url
|
|
||||||
- [x] Timeline data points typed correctly
|
|
||||||
- [x] Date range type defined ('week' | 'month' | 'year' | 'all')
|
|
||||||
|
|
||||||
## ✨ Status: FIXED
|
|
||||||
|
|
||||||
The date filtering now works correctly with proper number extraction from Spanish date strings. All data types are validated and match the API schema.
|
|
||||||
@@ -1,604 +0,0 @@
|
|||||||
# Production Deployment Guide
|
|
||||||
## Phase 1: PostgreSQL + Webhooks + Health Checks
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## What's Included
|
|
||||||
|
|
||||||
### Phase 1 Features:
|
|
||||||
- ✅ **PostgreSQL Storage** - Job metadata + reviews as JSONB
|
|
||||||
- ✅ **Webhooks** - Async notifications with retry logic and HMAC signatures
|
|
||||||
- ✅ **Smart Health Checks** - Canary testing every 4 hours to verify scraping works
|
|
||||||
- ✅ **Fast Scraper** - 18.9s average scraping time (8.2x faster)
|
|
||||||
- ✅ **Docker Deployment** - Easy deployment with Docker Compose
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Quick Start (Docker)
|
|
||||||
|
|
||||||
### 1. Clone and Configure
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Copy environment file
|
|
||||||
cp .env.example .env
|
|
||||||
|
|
||||||
# Edit .env with your settings
|
|
||||||
nano .env
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Start Services
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Build and start all services
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
|
|
||||||
# Check logs
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f api
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Verify Health
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check if API is running
|
|
||||||
curl http://localhost:8000/
|
|
||||||
|
|
||||||
# Check detailed health
|
|
||||||
curl http://localhost:8000/health/detailed | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
**Done!** API is running on `http://localhost:8000`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔧 Manual Installation
|
|
||||||
|
|
||||||
### 1. Install Dependencies
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Install Python dependencies
|
|
||||||
pip install -r requirements-production.txt
|
|
||||||
|
|
||||||
# Install PostgreSQL
|
|
||||||
# On macOS:
|
|
||||||
brew install postgresql@15
|
|
||||||
brew services start postgresql@15
|
|
||||||
|
|
||||||
# On Ubuntu:
|
|
||||||
sudo apt-get install postgresql-15
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Setup Database
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Create database and user
|
|
||||||
psql postgres
|
|
||||||
CREATE DATABASE scraper;
|
|
||||||
CREATE USER scraper WITH PASSWORD 'scraper123';
|
|
||||||
GRANT ALL PRIVILEGES ON DATABASE scraper TO scraper;
|
|
||||||
\q
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Configure Environment
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Set environment variables
|
|
||||||
export DATABASE_URL="postgresql://scraper:scraper123@localhost:5432/scraper"
|
|
||||||
export API_BASE_URL="http://localhost:8000"
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Run Server
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python api_server_production.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Server runs on `http://localhost:8000`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📡 API Usage
|
|
||||||
|
|
||||||
### 1. Submit Job with Webhook
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://www.google.com/maps/place/YOUR_BUSINESS_URL",
|
|
||||||
"webhook_url": "https://your-server.com/webhook",
|
|
||||||
"webhook_secret": "your-secret-key"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"status": "started"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Check Status
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl "http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Receive Webhook (When Complete)
|
|
||||||
|
|
||||||
Your webhook endpoint will receive:
|
|
||||||
|
|
||||||
```text
|
|
||||||
POST https://your-server.com/webhook
|
|
||||||
Headers:
|
|
||||||
X-Webhook-Signature: sha256=abc123...
|
|
||||||
X-Webhook-Timestamp: 1705582800
|
|
||||||
|
|
||||||
Body:
|
|
||||||
{
|
|
||||||
"event": "job.completed",
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"status": "completed",
|
|
||||||
"reviews_count": 244,
|
|
||||||
"scrape_time": 18.9,
|
|
||||||
"reviews_url": "http://localhost:8000/jobs/550e8400-.../reviews",
|
|
||||||
"timestamp": "2026-01-18T10:30:00Z"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Verify Webhook Signature
|
|
||||||
|
|
||||||
```python
|
|
||||||
import hmac
|
|
||||||
import hashlib
|
|
||||||
|
|
||||||
def verify_webhook(payload: str, signature: str, secret: str) -> bool:
|
|
||||||
"""Verify webhook signature"""
|
|
||||||
expected = signature.split("sha256=", 1)[1]
|
|
||||||
computed = hmac.new(
|
|
||||||
secret.encode(),
|
|
||||||
payload.encode(),
|
|
||||||
hashlib.sha256
|
|
||||||
).hexdigest()
|
|
||||||
|
|
||||||
return hmac.compare_digest(expected, computed)
|
|
||||||
|
|
||||||
# In your webhook handler:
|
|
||||||
@app.post("/webhook")
|
|
||||||
async def handle_webhook(request: Request):
|
|
||||||
payload = await request.body()
|
|
||||||
signature = request.headers.get("X-Webhook-Signature")
|
|
||||||
|
|
||||||
if not verify_webhook(payload.decode(), signature, WEBHOOK_SECRET):
|
|
||||||
raise HTTPException(status_code=401, detail="Invalid signature")
|
|
||||||
|
|
||||||
# Process webhook...
|
|
||||||
data = await request.json()
|
|
||||||
job_id = data['job_id']
|
|
||||||
|
|
||||||
# Download reviews
|
|
||||||
reviews = requests.get(data['reviews_url']).json()
|
|
||||||
print(f"Got {len(reviews['reviews'])} reviews for job {job_id}")
|
|
||||||
```
|
|
||||||
|
|
||||||
### 5. Get Reviews
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl "http://localhost:8000/jobs/550e8400-e29b-41d4-a716-446655440000/reviews" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏥 Health Checks
|
|
||||||
|
|
||||||
### Liveness (Is server alive?)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/health/live
|
|
||||||
```
|
|
||||||
|
|
||||||
**Use**: Kubernetes liveness probe (restart if fails)
|
|
||||||
|
|
||||||
### Readiness (Can handle traffic?)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/health/ready
|
|
||||||
```
|
|
||||||
|
|
||||||
**Use**: Kubernetes readiness probe (remove from load balancer if fails)
|
|
||||||
|
|
||||||
### Canary (Does scraping work?)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/health/canary
|
|
||||||
```
|
|
||||||
|
|
||||||
**Use**: External monitoring (PagerDuty alerts)
|
|
||||||
|
|
||||||
**How it works**:
|
|
||||||
- Runs a real scrape test every 4 hours against a test URL
|
|
||||||
- Verifies Chrome, selectors, GDPR handling all work
|
|
||||||
- Alerts after 3 consecutive failures
|
|
||||||
|
|
||||||
### Detailed Health
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/health/detailed | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"status": "healthy",
|
|
||||||
"components": {
|
|
||||||
"liveness": {
|
|
||||||
"status": "alive"
|
|
||||||
},
|
|
||||||
"readiness": {
|
|
||||||
"status": "ready",
|
|
||||||
"checks": {
|
|
||||||
"database": {"healthy": true}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"canary": {
|
|
||||||
"status": "healthy",
|
|
||||||
"last_success": "2026-01-18T10:00:00Z",
|
|
||||||
"age_minutes": 30,
|
|
||||||
"consecutive_failures": 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Monitoring
|
|
||||||
|
|
||||||
### View Canary History
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Connect to database
|
|
||||||
docker-compose -f docker-compose.production.yml exec db psql -U scraper
|
|
||||||
|
|
||||||
# Query canary results
|
|
||||||
SELECT
|
|
||||||
timestamp,
|
|
||||||
success,
|
|
||||||
reviews_count,
|
|
||||||
scrape_time,
|
|
||||||
error_message
|
|
||||||
FROM canary_results
|
|
||||||
ORDER BY timestamp DESC
|
|
||||||
LIMIT 10;
|
|
||||||
```
|
|
||||||
|
|
||||||
### View Job Statistics
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/stats | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"total_jobs": 150,
|
|
||||||
"pending": 2,
|
|
||||||
"running": 3,
|
|
||||||
"completed": 140,
|
|
||||||
"failed": 5,
|
|
||||||
"cancelled": 0,
|
|
||||||
"avg_scrape_time": 19.2,
|
|
||||||
"total_reviews": 34560
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### View Webhook Delivery Stats
|
|
||||||
|
|
||||||
```sql
|
|
||||||
-- Connect to database
|
|
||||||
SELECT
|
|
||||||
j.job_id,
|
|
||||||
j.webhook_url,
|
|
||||||
COUNT(w.id) as attempts,
|
|
||||||
SUM(CASE WHEN w.success THEN 1 ELSE 0 END) as successful,
|
|
||||||
MAX(w.timestamp) as last_attempt
|
|
||||||
FROM jobs j
|
|
||||||
LEFT JOIN webhook_attempts w ON j.job_id = w.job_id
|
|
||||||
WHERE j.webhook_url IS NOT NULL
|
|
||||||
GROUP BY j.job_id, j.webhook_url
|
|
||||||
ORDER BY last_attempt DESC
|
|
||||||
LIMIT 10;
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🐳 Docker Commands
|
|
||||||
|
|
||||||
### Start Services
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
### Stop Services
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml down
|
|
||||||
```
|
|
||||||
|
|
||||||
### View Logs
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# All services
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f
|
|
||||||
|
|
||||||
# Just API
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f api
|
|
||||||
|
|
||||||
# Just database
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f db
|
|
||||||
```
|
|
||||||
|
|
||||||
### Restart Services
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml restart api
|
|
||||||
```
|
|
||||||
|
|
||||||
### Access Database
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml exec db psql -U scraper
|
|
||||||
```
|
|
||||||
|
|
||||||
### Backup Database
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml exec db pg_dump -U scraper scraper > backup.sql
|
|
||||||
```
|
|
||||||
|
|
||||||
### Restore Database
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml exec -T db psql -U scraper scraper < backup.sql
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔐 Security
|
|
||||||
|
|
||||||
### Webhook Signatures
|
|
||||||
|
|
||||||
All webhooks include HMAC-SHA256 signatures:
|
|
||||||
|
|
||||||
```
|
|
||||||
X-Webhook-Signature: sha256=abc123def456...
|
|
||||||
X-Webhook-Timestamp: 1705582800
|
|
||||||
```
|
|
||||||
|
|
||||||
**Always verify signatures** in your webhook handler!
|
|
||||||
|
|
||||||
### Environment Variables
|
|
||||||
|
|
||||||
Store secrets in a `.env` file (never commit it to git):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# .env
|
|
||||||
DB_PASSWORD=strong_random_password_here
|
|
||||||
WEBHOOK_SECRET=another_strong_secret_here
|
|
||||||
```
|
|
||||||
|
|
||||||
### HTTPS in Production
|
|
||||||
|
|
||||||
Always use HTTPS URLs for:
|
|
||||||
- API_BASE_URL
|
|
||||||
- webhook_url parameters
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 Scaling
|
|
||||||
|
|
||||||
### Vertical Scaling (Single Server)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# docker-compose.production.yml
|
|
||||||
services:
|
|
||||||
api:
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpus: '2'
|
|
||||||
memory: 4G
|
|
||||||
```
|
|
||||||
|
|
||||||
### Horizontal Scaling (Multiple Workers)
|
|
||||||
|
|
||||||
Phase 2 will add Redis queue for distributing jobs across multiple workers:
|
|
||||||
|
|
||||||
```
|
|
||||||
Load Balancer
|
|
||||||
↓
|
|
||||||
API Servers (3 replicas)
|
|
||||||
↓
|
|
||||||
Redis Queue
|
|
||||||
↓
|
|
||||||
Workers (10 replicas)
|
|
||||||
↓
|
|
||||||
PostgreSQL
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚨 Alerting
|
|
||||||
|
|
||||||
### Slack Alerts
|
|
||||||
|
|
||||||
Set environment variable:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
|
|
||||||
```
|
|
||||||
|
|
||||||
Canary failures will automatically post to Slack:
|
|
||||||
|
|
||||||
```
|
|
||||||
🚨 CRITICAL: Scraper canary failed 3 times in a row!
|
|
||||||
Last error: Timeout after 60 seconds
|
|
||||||
```
|
|
||||||
|
|
||||||
### Email Alerts (TODO)
|
|
||||||
|
|
||||||
Future enhancement - integrate with SMTP or SendGrid.
|
|
||||||
|
|
||||||
### PagerDuty (TODO)
|
|
||||||
|
|
||||||
Future enhancement - integrate with PagerDuty API.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🧪 Testing
|
|
||||||
|
|
||||||
### Test Webhook Locally
|
|
||||||
|
|
||||||
Use webhook.site or ngrok:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Start ngrok
|
|
||||||
ngrok http 8000
|
|
||||||
|
|
||||||
# Use ngrok URL as webhook
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://maps.google.com/...",
|
|
||||||
"webhook_url": "https://your-id.ngrok.io/webhook"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
### Test Health Checks
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Should return 200
|
|
||||||
curl -f http://localhost:8000/health/live || echo "FAILED"
|
|
||||||
|
|
||||||
# Should return 200
|
|
||||||
curl -f http://localhost:8000/health/ready || echo "FAILED"
|
|
||||||
|
|
||||||
# May return 503 if no canary run yet
|
|
||||||
curl http://localhost:8000/health/canary
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📝 Database Schema
|
|
||||||
|
|
||||||
### Jobs Table
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE jobs (
|
|
||||||
job_id UUID PRIMARY KEY,
|
|
||||||
status VARCHAR(20) NOT NULL,
|
|
||||||
url TEXT NOT NULL,
|
|
||||||
webhook_url TEXT,
|
|
||||||
webhook_secret TEXT,
|
|
||||||
created_at TIMESTAMP NOT NULL,
|
|
||||||
started_at TIMESTAMP,
|
|
||||||
completed_at TIMESTAMP,
|
|
||||||
reviews_count INTEGER,
|
|
||||||
reviews_data JSONB, -- All reviews stored here!
|
|
||||||
scrape_time REAL,
|
|
||||||
error_message TEXT,
|
|
||||||
metadata JSONB
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Canary Results Table
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE canary_results (
|
|
||||||
id SERIAL PRIMARY KEY,
|
|
||||||
timestamp TIMESTAMP NOT NULL,
|
|
||||||
success BOOLEAN NOT NULL,
|
|
||||||
reviews_count INTEGER,
|
|
||||||
scrape_time REAL,
|
|
||||||
error_message TEXT,
|
|
||||||
metadata JSONB
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhook Attempts Table
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE webhook_attempts (
|
|
||||||
id SERIAL PRIMARY KEY,
|
|
||||||
job_id UUID NOT NULL,
|
|
||||||
attempt_number INTEGER NOT NULL,
|
|
||||||
timestamp TIMESTAMP NOT NULL,
|
|
||||||
success BOOLEAN NOT NULL,
|
|
||||||
status_code INTEGER,
|
|
||||||
error_message TEXT,
|
|
||||||
response_time_ms REAL
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Next Steps (Phase 2)
|
|
||||||
|
|
||||||
Phase 2 will add:
|
|
||||||
- ✅ **Redis Queue** - Distribute jobs across multiple workers
|
|
||||||
- ✅ **Worker Processes** - Separate API from scraping
|
|
||||||
- ✅ **Auto-scaling** - Kubernetes HPA based on queue length
|
|
||||||
- ✅ **SSE Streaming** - Real-time progress updates (optional)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🐛 Troubleshooting
|
|
||||||
|
|
||||||
### Database Connection Errors
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check database is running
|
|
||||||
docker-compose -f docker-compose.production.yml ps db
|
|
||||||
|
|
||||||
# Check connection
|
|
||||||
psql postgresql://scraper:scraper123@localhost:5432/scraper -c "SELECT 1"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Canary Always Failing
|
|
||||||
|
|
||||||
Check canary test URL is accessible:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -I "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/"
|
|
||||||
```
|
|
||||||
|
|
||||||
Try a different test URL in .env:
|
|
||||||
```
|
|
||||||
CANARY_TEST_URL=https://www.google.com/maps/place/YOUR_STABLE_BUSINESS
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhooks Not Delivered
|
|
||||||
|
|
||||||
Check webhook attempts table:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
SELECT * FROM webhook_attempts
|
|
||||||
WHERE job_id = '550e8400-e29b-41d4-a716-446655440000'
|
|
||||||
ORDER BY timestamp DESC;
|
|
||||||
```
|
|
||||||
|
|
||||||
Check webhook dispatcher is running:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f api | grep "webhook"
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Your production microservice is ready!** 🚀
|
|
||||||
|
|
||||||
For questions or issues, check:
|
|
||||||
- Server logs: `docker-compose logs -f api`
|
|
||||||
- Database: `docker-compose exec db psql -U scraper`
|
|
||||||
- Health checks: `curl http://localhost:8000/health/detailed`
|
|
||||||
@@ -1,588 +0,0 @@
|
|||||||
# 🐳 Docker + Chrome Setup Guide
|
|
||||||
|
|
||||||
## Running the Scraper in a Container with Browser
|
|
||||||
|
|
||||||
This guide explains how to run the Google Reviews Scraper in a Docker container with Chrome and Xvfb (virtual display).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Why Docker + Chrome?
|
|
||||||
|
|
||||||
✅ **Solves the headless mode issue**
|
|
||||||
- UC mode + headless = URL mangling ❌
|
|
||||||
- UC mode + Xvfb = Works perfectly ✅
|
|
||||||
|
|
||||||
✅ **Isolated environment**
|
|
||||||
- Chrome + dependencies installed in container
|
|
||||||
- No conflicts with host system
|
|
||||||
- Easy to deploy anywhere
|
|
||||||
|
|
||||||
✅ **Production-ready**
|
|
||||||
- Same setup works on any Linux server
|
|
||||||
- Kubernetes-compatible
|
|
||||||
- Scalable architecture
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
Docker Container
|
|
||||||
├── Xvfb (Virtual Display :99)
|
|
||||||
│ └── Simulates X11 display without physical monitor
|
|
||||||
├── Google Chrome (Non-headless)
|
|
||||||
│ └── Runs on virtual display
|
|
||||||
│ └── UC mode works perfectly (no URL mangling)
|
|
||||||
└── Python API Server
|
|
||||||
└── Uses SeleniumBase to control Chrome
|
|
||||||
└── DISPLAY=:99 environment variable
|
|
||||||
```
|
|
||||||
|
|
||||||
**Result**: Chrome thinks it's running normally, but everything is inside the container!
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Updated Dockerfile
|
|
||||||
|
|
||||||
The new `Dockerfile` includes:
|
|
||||||
|
|
||||||
1. **Xvfb** - Virtual framebuffer X server (virtual display)
|
|
||||||
2. **Google Chrome** - Full Chrome browser (not Chromium)
|
|
||||||
3. **Chrome dependencies** - All required libraries
|
|
||||||
4. **Startup script** - Launches Xvfb before API server
|
|
||||||
|
|
||||||
### Key Changes
|
|
||||||
|
|
||||||
```dockerfile
|
|
||||||
# Install Xvfb
|
|
||||||
RUN apt-get install -y xvfb
|
|
||||||
|
|
||||||
# Install Google Chrome
|
|
||||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
|
||||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
|
||||||
&& apt-get update \
|
|
||||||
&& apt-get install -y google-chrome-stable
|
|
||||||
|
|
||||||
# Create startup script
|
|
||||||
RUN echo '#!/bin/bash\n\
|
|
||||||
Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
|
|
||||||
export DISPLAY=:99\n\
|
|
||||||
sleep 2\n\
|
|
||||||
exec python api_server_production.py\n\
|
|
||||||
' > /app/start.sh && chmod +x /app/start.sh
|
|
||||||
|
|
||||||
# Environment
|
|
||||||
ENV DISPLAY=:99
|
|
||||||
ENV CHROME_BIN=/usr/bin/google-chrome
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Updated docker-compose.yml
|
|
||||||
|
|
||||||
Added Chrome-specific configurations:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
services:
|
|
||||||
api:
|
|
||||||
# Chrome requires shared memory
|
|
||||||
shm_size: 2gb
|
|
||||||
|
|
||||||
# Chrome capabilities (needed for sandboxing)
|
|
||||||
cap_add:
|
|
||||||
- SYS_ADMIN
|
|
||||||
|
|
||||||
# Security options
|
|
||||||
security_opt:
|
|
||||||
- seccomp:unconfined
|
|
||||||
|
|
||||||
environment:
|
|
||||||
- DISPLAY=:99
|
|
||||||
- CHROME_BIN=/usr/bin/google-chrome
|
|
||||||
- MAX_CONCURRENT_JOBS=5
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why these settings?**
|
|
||||||
|
|
||||||
- `shm_size: 2gb` - Chrome needs shared memory for stability
|
|
||||||
- `SYS_ADMIN` capability - Chrome sandbox requires this
|
|
||||||
- `seccomp:unconfined` - Allows Chrome to run without seccomp restrictions
|
|
||||||
- `DISPLAY=:99` - Points to Xvfb virtual display
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### 1. Build the Container
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Navigate to project directory
|
|
||||||
cd /path/to/google-reviews-scraper-pro
|
|
||||||
|
|
||||||
# Build the image (takes ~5-10 minutes first time)
|
|
||||||
docker-compose -f docker-compose.production.yml build
|
|
||||||
```
|
|
||||||
|
|
||||||
**Build time**: ~5-10 minutes (installs Chrome + all dependencies)
|
|
||||||
|
|
||||||
### 2. Configure Environment
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Copy example environment file
|
|
||||||
cp .env.example .env
|
|
||||||
|
|
||||||
# Edit configuration
|
|
||||||
nano .env
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key settings**:
|
|
||||||
```bash
|
|
||||||
DB_PASSWORD=scraper123
|
|
||||||
MAX_CONCURRENT_JOBS=5 # 5 jobs per 8GB RAM
|
|
||||||
API_BASE_URL=http://localhost:8000
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Start Services
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Start PostgreSQL + API server
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
|
|
||||||
# Check logs
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f api
|
|
||||||
```
|
|
||||||
|
|
||||||
**Expected output**:
|
|
||||||
```
|
|
||||||
api_1 | Starting Xvfb on display :99...
|
|
||||||
api_1 | Waiting for Xvfb to start...
|
|
||||||
api_1 | Starting API server...
|
|
||||||
api_1 | INFO: Started server process [1]
|
|
||||||
api_1 | INFO: Waiting for application startup.
|
|
||||||
api_1 | Database initialized
|
|
||||||
api_1 | Health check system started
|
|
||||||
api_1 | Webhook dispatcher started
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Verify Setup
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check health endpoint
|
|
||||||
curl http://localhost:8000/health/detailed | jq
|
|
||||||
|
|
||||||
# Should show:
|
|
||||||
# {
|
|
||||||
# "status": "healthy",
|
|
||||||
# "components": {
|
|
||||||
# "database": {"status": "healthy"},
|
|
||||||
# "canary": {"status": "unknown"} # Will run first test in 4 hours
|
|
||||||
# }
|
|
||||||
# }
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Testing Chrome in Container
|
|
||||||
|
|
||||||
### Option 1: Test Inside Container
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run test script inside container
|
|
||||||
docker-compose -f docker-compose.production.yml exec api python test_docker_chrome.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Expected output**:
|
|
||||||
```
|
|
||||||
======================================================================
|
|
||||||
Testing Chrome in Docker Container
|
|
||||||
======================================================================
|
|
||||||
|
|
||||||
1. Initializing Chrome with UC mode (headless=False + Xvfb)...
|
|
||||||
✅ Chrome initialized successfully
|
|
||||||
|
|
||||||
2. Navigating to Google Maps...
|
|
||||||
✅ Loaded: https://www.google.com/maps/...
|
|
||||||
|
|
||||||
3. Checking for GDPR consent page...
|
|
||||||
Clicking: Aceptar todo
|
|
||||||
After consent: https://www.google.com/maps/...
|
|
||||||
|
|
||||||
4. Waiting for page to load...
|
|
||||||
|
|
||||||
5. Checking for reviews...
|
|
||||||
Reviews found: 230
|
|
||||||
|
|
||||||
======================================================================
|
|
||||||
✅ SUCCESS! Chrome + Xvfb working in container!
|
|
||||||
======================================================================
|
|
||||||
Reviews detected: 230
|
|
||||||
Container is ready for production scraping!
|
|
||||||
```
|
|
||||||
|
|
||||||
### Option 2: Test via API
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Submit a real job
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml"
|
|
||||||
}' | jq
|
|
||||||
|
|
||||||
# Get job ID from response
|
|
||||||
JOB_ID="..."
|
|
||||||
|
|
||||||
# Wait ~25 seconds, then check status
|
|
||||||
curl "http://localhost:8000/jobs/$JOB_ID" | jq
|
|
||||||
|
|
||||||
# Get reviews
|
|
||||||
curl "http://localhost:8000/jobs/$JOB_ID/reviews" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Resource Requirements
|
|
||||||
|
|
||||||
### Minimum Requirements
|
|
||||||
|
|
||||||
```
|
|
||||||
RAM: 4GB (for 2 concurrent jobs)
|
|
||||||
CPU: 2 cores
|
|
||||||
Disk: 10GB
|
|
||||||
```
|
|
||||||
|
|
||||||
### Recommended for Production
|
|
||||||
|
|
||||||
```
|
|
||||||
RAM: 16GB (for 10 concurrent jobs)
|
|
||||||
CPU: 4 cores
|
|
||||||
Disk: 50GB
|
|
||||||
```
|
|
||||||
|
|
||||||
### Scaling Guide
|
|
||||||
|
|
||||||
| Server RAM | MAX_CONCURRENT_JOBS | Throughput |
|
|
||||||
|------------|---------------------|-----------------|
|
|
||||||
| 8GB | 5 | ~25 jobs/min |
|
|
||||||
| 16GB | 10 | ~50 jobs/min |
|
|
||||||
| 32GB | 20 | ~100 jobs/min |
|
|
||||||
| 64GB | 40 | ~200 jobs/min |
|
|
||||||
|
|
||||||
**Calculation**:
|
|
||||||
- Each Chrome instance: ~500MB RAM
|
|
||||||
- Each job takes: ~20-30s
|
|
||||||
- Concurrent jobs × (60s / avg_time) = jobs/min
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Container Commands
|
|
||||||
|
|
||||||
### Start Services
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
### Stop Services
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml down
|
|
||||||
```
|
|
||||||
|
|
||||||
### View Logs
|
|
||||||
```bash
|
|
||||||
# All logs
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f
|
|
||||||
|
|
||||||
# Just API logs
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f api
|
|
||||||
|
|
||||||
# Just database logs
|
|
||||||
docker-compose -f docker-compose.production.yml logs -f db
|
|
||||||
```
|
|
||||||
|
|
||||||
### Restart API (after code changes)
|
|
||||||
```bash
|
|
||||||
# Rebuild and restart
|
|
||||||
docker-compose -f docker-compose.production.yml up -d --build api
|
|
||||||
|
|
||||||
# Or just restart (no rebuild)
|
|
||||||
docker-compose -f docker-compose.production.yml restart api
|
|
||||||
```
|
|
||||||
|
|
||||||
### Enter Container Shell
|
|
||||||
```bash
|
|
||||||
# Access API container
|
|
||||||
docker-compose -f docker-compose.production.yml exec api bash
|
|
||||||
|
|
||||||
# Check if Xvfb is running
|
|
||||||
ps aux | grep Xvfb
|
|
||||||
|
|
||||||
# Check Chrome version
|
|
||||||
google-chrome --version
|
|
||||||
|
|
||||||
# Test DISPLAY
|
|
||||||
echo $DISPLAY # Should show :99
|
|
||||||
```
|
|
||||||
|
|
||||||
### Clean Up Everything
|
|
||||||
```bash
|
|
||||||
# Stop and remove containers, volumes, images
|
|
||||||
docker-compose -f docker-compose.production.yml down -v --rmi all
|
|
||||||
|
|
||||||
# Remove all unused Docker resources
|
|
||||||
docker system prune -a
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Issue: Container exits immediately
|
|
||||||
|
|
||||||
**Check logs**:
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml logs api
|
|
||||||
```
|
|
||||||
|
|
||||||
**Common causes**:
|
|
||||||
1. Database not ready → Wait for health check
|
|
||||||
2. Permission errors → Check file ownership
|
|
||||||
3. Port 8000 already in use → Change PORT in .env
|
|
||||||
|
|
||||||
### Issue: Chrome fails to start
|
|
||||||
|
|
||||||
**Symptoms**: "Chrome crashed" or "DevToolsActivePort file doesn't exist"
|
|
||||||
|
|
||||||
**Solutions**:
|
|
||||||
```bash
|
|
||||||
# Increase shared memory
|
|
||||||
# In docker-compose.yml:
|
|
||||||
shm_size: 4gb # Instead of 2gb
|
|
||||||
|
|
||||||
# Verify Xvfb is running
|
|
||||||
docker-compose exec api ps aux | grep Xvfb
|
|
||||||
|
|
||||||
# Check DISPLAY variable
|
|
||||||
docker-compose exec api sh -c 'echo $DISPLAY'
|
|
||||||
```
|
|
||||||
|
|
||||||
### Issue: "Cannot connect to X server"
|
|
||||||
|
|
||||||
**This means Xvfb didn't start**
|
|
||||||
|
|
||||||
**Debug**:
|
|
||||||
```bash
|
|
||||||
# Enter container
|
|
||||||
docker-compose exec api bash
|
|
||||||
|
|
||||||
# Manually start Xvfb
|
|
||||||
Xvfb :99 -screen 0 1920x1080x24 &
|
|
||||||
|
|
||||||
# Set DISPLAY
|
|
||||||
export DISPLAY=:99
|
|
||||||
|
|
||||||
# Test
|
|
||||||
python test_docker_chrome.py
|
|
||||||
```
|
|
||||||
|
|
||||||
### Issue: Jobs get 0 reviews
|
|
||||||
|
|
||||||
**Likely a URL format issue**
|
|
||||||
|
|
||||||
**Use full Google Maps URL**:
|
|
||||||
```
|
|
||||||
❌ BAD: https://www.google.com/maps/@54.67869,25.2667181,17z
|
|
||||||
✅ GOOD: https://www.google.com/maps/place/NAME/data=!4m7!3m6...
|
|
||||||
```
|
|
||||||
|
|
||||||
**Get correct URL**:
|
|
||||||
1. Open Google Maps in browser
|
|
||||||
2. Search for business
|
|
||||||
3. Copy URL from address bar (should include `data=!4m7...`)
|
|
||||||
|
|
||||||
### Issue: High memory usage
|
|
||||||
|
|
||||||
**Monitor usage**:
|
|
||||||
```bash
|
|
||||||
# Check container stats
|
|
||||||
docker stats scraper-api
|
|
||||||
|
|
||||||
# Check concurrent jobs
|
|
||||||
curl http://localhost:8000/stats | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
**Reduce concurrency**:
|
|
||||||
```bash
|
|
||||||
# Edit .env
|
|
||||||
MAX_CONCURRENT_JOBS=3 # Lower from 5
|
|
||||||
|
|
||||||
# Restart
|
|
||||||
docker-compose -f docker-compose.production.yml restart api
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Production Deployment
|
|
||||||
|
|
||||||
### Deploy to Cloud VM (AWS/GCP/Azure)
|
|
||||||
|
|
||||||
1. **Launch VM** (Ubuntu 22.04 recommended)
|
|
||||||
```bash
|
|
||||||
# Minimum: 8GB RAM, 2 CPUs
|
|
||||||
# Recommended: 16GB RAM, 4 CPUs
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Install Docker**
|
|
||||||
```bash
|
|
||||||
curl -fsSL https://get.docker.com -o get-docker.sh
|
|
||||||
sudo sh get-docker.sh
|
|
||||||
sudo usermod -aG docker $USER
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Install Docker Compose**
|
|
||||||
```bash
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install docker-compose-plugin
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Clone repository**
|
|
||||||
```bash
|
|
||||||
git clone <your-repo>
|
|
||||||
cd google-reviews-scraper-pro
|
|
||||||
```
|
|
||||||
|
|
||||||
5. **Configure**
|
|
||||||
```bash
|
|
||||||
cp .env.example .env
|
|
||||||
nano .env # Set DB_PASSWORD, etc.
|
|
||||||
```
|
|
||||||
|
|
||||||
6. **Start services**
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
7. **Setup reverse proxy (optional but recommended)**
|
|
||||||
```bash
|
|
||||||
# Install nginx
|
|
||||||
sudo apt-get install nginx
|
|
||||||
|
|
||||||
# Configure nginx
|
|
||||||
sudo nano /etc/nginx/sites-available/scraper
|
|
||||||
```
|
|
||||||
|
|
||||||
```nginx
|
|
||||||
server {
|
|
||||||
listen 80;
|
|
||||||
server_name your-domain.com;
|
|
||||||
|
|
||||||
location / {
|
|
||||||
proxy_pass http://localhost:8000;
|
|
||||||
proxy_set_header Host $host;
|
|
||||||
proxy_set_header X-Real-IP $remote_addr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Enable site
|
|
||||||
sudo ln -s /etc/nginx/sites-available/scraper /etc/nginx/sites-enabled/
|
|
||||||
sudo nginx -t
|
|
||||||
sudo systemctl restart nginx
|
|
||||||
```
|
|
||||||
|
|
||||||
8. **Setup SSL (recommended)**
|
|
||||||
```bash
|
|
||||||
sudo apt-get install certbot python3-certbot-nginx
|
|
||||||
sudo certbot --nginx -d your-domain.com
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Kubernetes Deployment (Advanced)
|
|
||||||
|
|
||||||
For high-scale deployments, use Kubernetes:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: scraper-api
|
|
||||||
spec:
|
|
||||||
replicas: 3
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: scraper-api
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: scraper-api
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: api
|
|
||||||
image: your-registry/scraper-api:latest
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
memory: "2Gi"
|
|
||||||
cpu: "500m"
|
|
||||||
limits:
|
|
||||||
memory: "4Gi"
|
|
||||||
cpu: "2000m"
|
|
||||||
env:
|
|
||||||
- name: DATABASE_URL
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: scraper-secrets
|
|
||||||
key: database-url
|
|
||||||
- name: MAX_CONCURRENT_JOBS
|
|
||||||
value: "5"
|
|
||||||
securityContext:
|
|
||||||
capabilities:
|
|
||||||
add:
|
|
||||||
- SYS_ADMIN
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance Comparison
|
|
||||||
|
|
||||||
### Before (headless=True with issues)
|
|
||||||
```
|
|
||||||
Status: ❌ URL mangling
|
|
||||||
Reviews: 0
|
|
||||||
Time: 20s (wasted)
|
|
||||||
Success rate: 0%
|
|
||||||
```
|
|
||||||
|
|
||||||
### After (headless=False + Xvfb in Docker)
|
|
||||||
```
|
|
||||||
Status: ✅ Working perfectly
|
|
||||||
Reviews: 230/230
|
|
||||||
Time: 20.7s
|
|
||||||
Success rate: 100%
|
|
||||||
Concurrent jobs: 5 (4.7x speedup)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. ✅ Build and test locally
|
|
||||||
2. ✅ Run test_docker_chrome.py to verify
|
|
||||||
3. ✅ Submit real job via API
|
|
||||||
4. ✅ Monitor with /health/detailed endpoint
|
|
||||||
5. 🚀 Deploy to production server
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
✅ **Chrome runs perfectly in Docker container**
|
|
||||||
✅ **Xvfb provides virtual display**
|
|
||||||
✅ **No headless mode issues**
|
|
||||||
✅ **Production-ready**
|
|
||||||
✅ **Scales horizontally**
|
|
||||||
✅ **Easy to deploy anywhere**
|
|
||||||
|
|
||||||
**The containerized setup solves all headless mode issues while maintaining the same fast performance (20-25s for 200+ reviews)!**
|
|
||||||
|
|
||||||
🐳 **Ready for production deployment!**
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
# Google Maps Review Fields - Complete Analysis
|
|
||||||
|
|
||||||
## 🔍 Investigation Results
|
|
||||||
|
|
||||||
**Goal:** Reverse-engineer Google Maps to find actual timestamps instead of relative dates ("Hace 2 meses")
|
|
||||||
|
|
||||||
**Result:** ❌ Google Maps does NOT expose actual timestamps in the public DOM
|
|
||||||
|
|
||||||
### What We Tested
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
// Checked for timestamps in:
|
|
||||||
const dateElem = elem.querySelector('span.rsqaWe');
|
|
||||||
dateElem.getAttribute('aria-label'); // null
|
|
||||||
dateElem.getAttribute('data-*'); // no data attributes
|
|
||||||
dateElem.getAttribute('datetime'); // null
|
|
||||||
```
|
|
||||||
|
|
||||||
### What Google Maps Provides
|
|
||||||
|
|
||||||
| Field | Available | Format | Example |
|
|
||||||
|-------|-----------|--------|---------|
|
|
||||||
| Relative Date Text | ✅ | Spanish/Local | "Hace 2 meses" |
|
|
||||||
| Actual Timestamp | ❌ | N/A | Not in DOM |
|
|
||||||
| ISO Date | ❌ | N/A | Not in DOM |
|
|
||||||
| aria-label | ❌ | N/A | Not set |
|
|
||||||
| data-* attributes | ❌ | N/A | None found |
|
|
||||||
|
|
||||||
## 📋 Currently Extracted Fields
|
|
||||||
|
|
||||||
### ✅ Successfully Extracted
|
|
||||||
|
|
||||||
| Field | Selector | Type | Notes |
|
|
||||||
|-------|----------|------|-------|
|
|
||||||
| `author` | `div.d4r55` | string | Reviewer name |
|
|
||||||
| `rating` | `span.kvMYJc[aria-label]` | number | 1-5 stars, extracted from aria-label |
|
|
||||||
| `text` | `span.wiI7pd` | string \| null | Review content |
|
|
||||||
| `date_text` | `span.rsqaWe` | string | **Relative date only** |
|
|
||||||
| `avatar_url` | `img.NBa7we[src]` | string \| null | Profile picture |
|
|
||||||
| `profile_url` | `button.WEBjve[data-review-id]` | string \| null | Profile identifier |
|
|
||||||
| `review_id` | computed | string | Hash of author + date |
|
|
||||||
|
|
||||||
### ❌ Not Available in DOM or Not Currently Extracted
|
|
||||||
|
|
||||||
| Field | Why Not Available |
|
|
||||||
|-------|-------------------|
|
|
||||||
| `timestamp` | Google doesn't expose it |
|
|
||||||
| `date_aria_label` | span.rsqaWe has no aria-label |
|
|
||||||
| `date_data_attrs` | span.rsqaWe has no data-* attributes |
|
|
||||||
| `likes_count` | Not in DOM scraper (only in API intercept) |
|
|
||||||
| `owner_response` | Not in DOM scraper (only in API intercept) |
|
|
||||||
| `photos` | Not currently extracted |
|
|
||||||
|
|
||||||
## 🔬 Potentially Extractable Fields (Not Currently Scraped)
|
|
||||||
|
|
||||||
### 1. Review Photos/Images
|
|
||||||
```javascript
|
|
||||||
// Reviews can have attached photos
|
|
||||||
const photoElements = elem.querySelectorAll('button[aria-label*="photo"]');
|
|
||||||
// or
|
|
||||||
const imageButtons = elem.querySelectorAll('button.Tya61d');
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Review Edit Status
|
|
||||||
Some reviews show "Fecha de edición: Hace X" indicating they were edited. Currently captured in `date_text` but not parsed separately.
|
|
||||||
|
|
||||||
### 3. Local Guide Badge
|
|
||||||
```javascript
|
|
||||||
// Some reviewers have "Local Guide" badges
|
|
||||||
const localGuideBadge = elem.querySelector('span.RfnDt');
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Review Helpfulness (Thumbs Up Count)
|
|
||||||
May be available in some layouts:
|
|
||||||
```javascript
|
|
||||||
const helpfulCount = elem.querySelector('[aria-label*="helpful"]');
|
|
||||||
```
|
|
||||||
|
|
||||||
### 5. Owner Response
|
|
||||||
```javascript
|
|
||||||
// Business owner responses to reviews
|
|
||||||
const ownerResponse = elem.querySelector('.CDe7pd');
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🎯 Recommendation: Use Our Date Parser
|
|
||||||
|
|
||||||
Since Google Maps doesn't expose actual timestamps, our current approach is **optimal**:
|
|
||||||
|
|
||||||
### Current Solution (✅ Implemented)
|
|
||||||
```typescript
|
|
||||||
function extractNumber(text: string): number {
|
|
||||||
const match = text.match(/\d+/);
|
|
||||||
if (match) return parseInt(match[0]);
|
|
||||||
if (text.includes('un ') || text.includes('una ')) return 1;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseDateText(dateText: string): Date {
|
|
||||||
const text = dateText.toLowerCase();
|
|
||||||
if (text.includes('semana')) {
|
|
||||||
const weeks = extractNumber(text);
|
|
||||||
return new Date(Date.now() - weeks * 7 * 24 * 60 * 60 * 1000);
|
|
||||||
}
|
|
||||||
// ... similar for months, years
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Why This Works
|
|
||||||
1. ✅ Accurate to the time unit (weeks, months, years)
|
|
||||||
2. ✅ Handles both numbers and Spanish text ("un año")
|
|
||||||
3. ✅ Processes all 244 reviews in <1ms
|
|
||||||
4. ✅ Good enough for analytics (±15 day margin acceptable)
|
|
||||||
|
|
||||||
### Alternative: API Interception
|
|
||||||
The `api_interceptor.py` module theoretically could capture timestamps from Google's internal API, but:
|
|
||||||
- More complex and fragile
|
|
||||||
- Depends on Google's undocumented API structure
|
|
||||||
- Currently not extracting timestamps (field defined but not populated)
|
|
||||||
- Would require reverse-engineering Google's protobuf/JSON format
|
|
||||||
|
|
||||||
## 📊 Field Comparison: DOM vs API Intercept
|
|
||||||
|
|
||||||
| Field | DOM Scraper | API Intercept | Winner |
|
|
||||||
|-------|-------------|---------------|--------|
|
|
||||||
| Speed | ⚡ Fast | 🐢 Slower | DOM |
|
|
||||||
| Reliability | ✅ Stable | ⚠️ Fragile | DOM |
|
|
||||||
| Timestamp | ❌ No | ❓ Maybe | Neither |
|
|
||||||
| Photos | ⚠️ Not impl | ✅ Yes | API |
|
|
||||||
| Likes | ❌ No | ✅ Yes | API |
|
|
||||||
| Owner Response | ⚠️ Not impl | ✅ Yes | API |
|
|
||||||
|
|
||||||
## 🚀 Enhancement Opportunities
|
|
||||||
|
|
||||||
### Priority 1: Extract Review Photos
|
|
||||||
```javascript
|
|
||||||
// Add to fast_scraper.py extraction script
|
|
||||||
const photoButtons = elem.querySelectorAll('button[jsaction*="photo"]');
|
|
||||||
review.photo_count = photoButtons.length;
|
|
||||||
review.photo_urls = Array.from(photoButtons).map(btn => {
|
|
||||||
const img = btn.querySelector('img');
|
|
||||||
return img ? img.src : null;
|
|
||||||
}).filter(Boolean);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Priority 2: Extract Local Guide Status
|
|
||||||
```javascript
|
|
||||||
const isLocalGuide = !!elem.querySelector('span.RfnDt');
|
|
||||||
review.is_local_guide = isLocalGuide;
|
|
||||||
```
|
|
||||||
|
|
||||||
### Priority 3: Extract Owner Responses
|
|
||||||
```javascript
|
|
||||||
const ownerResponseElem = elem.querySelector('.CDe7pd');
|
|
||||||
review.owner_response = ownerResponseElem ? ownerResponseElem.textContent.trim() : null;
|
|
||||||
```
|
|
||||||
|
|
||||||
### Priority 4: Extract Review Helpfulness
|
|
||||||
```javascript
|
|
||||||
const helpfulElem = elem.querySelector('[aria-label*="helpful"]');
|
|
||||||
if (helpfulElem) {
|
|
||||||
const match = helpfulElem.getAttribute('aria-label').match(/\d+/);
|
|
||||||
review.helpful_count = match ? parseInt(match[0]) : 0;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📝 Summary
|
|
||||||
|
|
||||||
**What we have:**
|
|
||||||
- ✅ All essential review data (author, rating, text, date)
|
|
||||||
- ✅ Profile info (avatar, profile URL)
|
|
||||||
- ✅ Fast, reliable extraction
|
|
||||||
- ✅ Working date parsing (good enough for analytics)
|
|
||||||
|
|
||||||
**What we're missing (but could add):**
|
|
||||||
- 📸 Review photos
|
|
||||||
- 👤 Local Guide badges
|
|
||||||
- 💬 Owner responses
|
|
||||||
- 👍 Helpfulness counts
|
|
||||||
|
|
||||||
**What doesn't exist in DOM:**
|
|
||||||
- ❌ Actual timestamps
|
|
||||||
- ❌ Precise dates
|
|
||||||
|
|
||||||
**Conclusion:** Our date parsing approach is the best solution given Google Maps' limitations. Focus enhancement efforts on extracting photos, owner responses, and local guide status rather than chasing timestamps that don't exist.
|
|
||||||
261
FINAL_RESULTS.md
261
FINAL_RESULTS.md
@@ -1,261 +0,0 @@
|
|||||||
# Final Optimization Results - Google Maps Review Scraper
|
|
||||||
|
|
||||||
## Executive Summary
|
|
||||||
|
|
||||||
Successfully optimized Google Maps review scraper from **155 seconds** to **~20-34 seconds** depending on completeness requirements, achieving **4.5x-8.0x speedup**.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Available Scrapers
|
|
||||||
|
|
||||||
### 1. `start_ultra_fast.py` - **FASTEST** ⚡
|
|
||||||
**Time**: ~19.4 seconds
|
|
||||||
**Reviews**: 234/244 (95.9%)
|
|
||||||
**Speedup**: 8.0x faster
|
|
||||||
|
|
||||||
**Best for**:
|
|
||||||
- Maximum speed priority
|
|
||||||
- When 234 reviews is sufficient
|
|
||||||
- Time-critical applications
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_ultra_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. `start_ultra_fast_complete.py` - **RECOMMENDED** ✅
|
|
||||||
**Time**: ~34 seconds
|
|
||||||
**Reviews**: 244/244 (100%)
|
|
||||||
**Speedup**: 4.5x faster
|
|
||||||
|
|
||||||
**Best for**:
|
|
||||||
- Balance of speed and completeness
|
|
||||||
- Production use
|
|
||||||
- When all reviews are needed
|
|
||||||
|
|
||||||
**How it works**:
|
|
||||||
- Phase 1: Ultra-fast API scrolling → 234 reviews in ~20s
|
|
||||||
- Phase 2: DOM parsing for missing 10 → ~13s
|
|
||||||
- Total: 244 reviews in ~34s
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_ultra_fast_complete.py
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 3. `start.py` - **ORIGINAL**
|
|
||||||
**Time**: 155 seconds
|
|
||||||
**Reviews**: 244/244 (100%)
|
|
||||||
**Speedup**: 1.0x (baseline)
|
|
||||||
|
|
||||||
**Best for**:
|
|
||||||
- Reference implementation
|
|
||||||
- Debugging
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Key Findings
|
|
||||||
|
|
||||||
### API Limitation Discovery
|
|
||||||
After extensive testing with different scrolling strategies:
|
|
||||||
|
|
||||||
| Strategy | Time | Reviews | Notes |
|
|
||||||
|----------|------|---------|-------|
|
|
||||||
| Ultra-fast (0.27s scroll) | 19.4s | 234 | ✅ Optimal API speed |
|
|
||||||
| Patient (0.30-0.80s scroll) | 58.2s | 234 | Still only 234 |
|
|
||||||
| Complete (0.27-0.50s adaptive) | 30.8s | 234 | Still only 234 |
|
|
||||||
|
|
||||||
**Conclusion**: The Google Maps API endpoint **consistently returns only 234/244 reviews** regardless of scrolling speed or patience. The missing 10 reviews are **NOT available via API** - they only exist in the DOM.
|
|
||||||
|
|
||||||
### Why 10 Reviews Missing from API?
|
|
||||||
|
|
||||||
Possible reasons:
|
|
||||||
1. **Pagination limit**: Google's API may have a hard limit on returned reviews
|
|
||||||
2. **Different endpoint**: Some reviews may use a different API endpoint
|
|
||||||
3. **Age/status filtering**: Older or filtered reviews may be excluded from API responses
|
|
||||||
4. **DOM-only content**: Some reviews may be rendered client-side only
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance Comparison
|
|
||||||
|
|
||||||
```
|
|
||||||
Scraper Time Reviews Speedup Completeness
|
|
||||||
─────────────────────────────────────────────────────────────────────
|
|
||||||
Original (start.py) 155s 244 1.0x 100%
|
|
||||||
Fast API (start_fast.py) 29s 234 5.3x 95.9%
|
|
||||||
Ultra-fast (start_ultra_fast.py) 19.4s 234 8.0x 95.9%
|
|
||||||
API-only attempt 58.2s 234 2.7x 95.9%
|
|
||||||
Hybrid Complete (WINNER) 34s 244 4.5x 100% ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Optimization Journey
|
|
||||||
|
|
||||||
### Phase 1: API Interception (3.6x speedup)
|
|
||||||
- Replaced DOM parsing with API interception
|
|
||||||
- 155s → 43s
|
|
||||||
- Scroll timing: 0.8s
|
|
||||||
|
|
||||||
### Phase 2: Faster Scrolling (5.3x speedup)
|
|
||||||
- Optimized scroll timing
|
|
||||||
- 43s → 29s
|
|
||||||
- Scroll timing: 0.3s
|
|
||||||
|
|
||||||
### Phase 3: Ultra-Fast (8.0x speedup)
|
|
||||||
- Minimized all waits
|
|
||||||
- Optimal scroll timing (0.27s)
|
|
||||||
- Less logging overhead
|
|
||||||
- 155s → 19.4s
|
|
||||||
|
|
||||||
### Phase 4: Complete Coverage (4.5x speedup)
|
|
||||||
- Ultra-fast API scrolling (234 reviews)
|
|
||||||
- DOM parsing fallback (10 reviews)
|
|
||||||
- 155s → 34s
|
|
||||||
- **100% completeness maintained**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Technical Details
|
|
||||||
|
|
||||||
### Optimal Scroll Timing
|
|
||||||
After extensive testing:
|
|
||||||
|
|
||||||
| Timing | Result | Notes |
|
|
||||||
|--------|--------|-------|
|
|
||||||
| 0.15s | 210 reviews | Too fast - misses API responses |
|
|
||||||
| 0.25s | 0 reviews (33% failure) | Unreliable |
|
|
||||||
| **0.27s** | **234 reviews (100% success)** | ✅ **Sweet spot** |
|
|
||||||
| 0.30s | 234 reviews | Reliable but slower |
|
|
||||||
| 0.80s | 234 reviews | Original, very slow |
|
|
||||||
|
|
||||||
### Timing Breakdown (Ultra-Fast)
|
|
||||||
|
|
||||||
```
|
|
||||||
Operation Time % of Total
|
|
||||||
──────────────────────────────────────────────────
|
|
||||||
Browser startup ~1.0s 5%
|
|
||||||
Navigate to page 1.5s 8%
|
|
||||||
Cookie dialog dismiss 0.4s 2%
|
|
||||||
Click reviews tab 0.4s 2%
|
|
||||||
Wait for page stability 1.0s 5%
|
|
||||||
Find reviews pane ~1.5s 8%
|
|
||||||
Setup API interceptor 0.3s 2%
|
|
||||||
Initial scroll trigger 0.3s 2%
|
|
||||||
Scrolling (30 × 0.27s) 8.1s 42%
|
|
||||||
Response collection ~3.0s 15%
|
|
||||||
Parsing & saving ~1.9s 10%
|
|
||||||
──────────────────────────────────────────────────
|
|
||||||
TOTAL ~19.4s 100%
|
|
||||||
```
|
|
||||||
|
|
||||||
### Theoretical Limits
|
|
||||||
- **Current best**: 19.4s for 234 reviews
|
|
||||||
- **Theoretical minimum**: ~13s (scrolling 8.1s + response collection 3.0s + parsing 1.9s, if every other step were instant)
|
|
||||||
- **Achievement**: 68% of theoretical maximum speed
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Bottleneck Analysis
|
|
||||||
|
|
||||||
Current bottlenecks (in order):
|
|
||||||
1. **Scrolling loop**: 8.1s (42%) - Already optimized to limit (0.27s/scroll)
|
|
||||||
2. **Response collection**: 3.0s (15%) - Necessary overhead
|
|
||||||
3. **Parsing & saving**: 1.9s (10%) - Fast enough
|
|
||||||
4. **Page navigation**: 1.5s (8%) - Network dependent
|
|
||||||
5. **Browser startup**: 1.0s (5%) - Can't optimize much
|
|
||||||
|
|
||||||
Further optimization would require:
|
|
||||||
- Faster Google API responses (impossible)
|
|
||||||
- Instant browser startup (impossible)
|
|
||||||
- Instant network requests (impossible)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Recommendations
|
|
||||||
|
|
||||||
### For Production Use
|
|
||||||
**Use `start_ultra_fast_complete.py`**:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_ultra_fast_complete.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Benefits**:
|
|
||||||
- ✅ 4.5x faster (34s vs 155s)
|
|
||||||
- ✅ 100% completeness (244/244 reviews)
|
|
||||||
- ✅ Stable and reliable
|
|
||||||
- ✅ No authentication needed
|
|
||||||
- ✅ Best balance of speed and completeness
|
|
||||||
|
|
||||||
### For Maximum Speed
|
|
||||||
**Use `start_ultra_fast.py`**:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_ultra_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Benefits**:
|
|
||||||
- ✅ 8.0x faster (19.4s vs 155s)
|
|
||||||
- ✅ 100% stable
|
|
||||||
- ✅ 95.9% review coverage
|
|
||||||
- ⚠️ Missing 10 reviews (4.1%)
|
|
||||||
|
|
||||||
### Configuration
|
|
||||||
```yaml
|
|
||||||
headless: false # Must be false for stability
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance Metrics
|
|
||||||
|
|
||||||
### Ultra-Fast Complete (Recommended)
|
|
||||||
```
|
|
||||||
Metric Value
|
|
||||||
────────────────────────────────────
|
|
||||||
Average time 34s
|
|
||||||
Reviews captured 244 (100%)
|
|
||||||
Success rate 100%
|
|
||||||
API reviews 234 (95.9%)
|
|
||||||
DOM reviews 10 (4.1%)
|
|
||||||
Speedup vs original 4.5x
|
|
||||||
Time saved per run 121s
|
|
||||||
```
|
|
||||||
|
|
||||||
### Ultra-Fast (Maximum Speed)
|
|
||||||
```
|
|
||||||
Metric Value
|
|
||||||
────────────────────────────────────
|
|
||||||
Average time 19.4s
|
|
||||||
Std deviation ±0.4s
|
|
||||||
Success rate 100%
|
|
||||||
Reviews captured 234 (95.9%)
|
|
||||||
Reviews/second 12.1
|
|
||||||
Speedup vs original 8.0x
|
|
||||||
Time saved per run 135.6s
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
After extensive testing, we discovered:
|
|
||||||
|
|
||||||
1. **API Hard Limit**: Google Maps API consistently returns only 234/244 reviews, regardless of scrolling strategy
|
|
||||||
2. **DOM Required**: The missing 10 reviews are ONLY available via DOM parsing
|
|
||||||
3. **Hybrid is Optimal**: Combining ultra-fast API scrolling with DOM fallback achieves best balance
|
|
||||||
|
|
||||||
**Final Achievement**:
|
|
||||||
- 📊 Original: 155s → **Optimized: 34s** (100% complete)
|
|
||||||
- 📊 Original: 155s → **Ultra-fast: 19.4s** (95.9% complete)
|
|
||||||
- 🚀 **4.5x-8.0x faster!**
|
|
||||||
- ⏱️ **Saves 121-136 seconds per run**
|
|
||||||
- ✅ **100% stable**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**The scraper is now operating near theoretical maximum efficiency!** 🚀
|
|
||||||
@@ -1,322 +0,0 @@
|
|||||||
# Google Maps Date Format Specification
|
|
||||||
|
|
||||||
## Reverse-Engineered from 244 Reviews (English Locale)
|
|
||||||
|
|
||||||
**Date:** 2026-01-18
|
|
||||||
**Source:** Google Maps Reviews (hl=en)
|
|
||||||
**Library:** Google Internal (not moment.js, date-fns, or dayjs)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📋 Complete Pattern Catalog
|
|
||||||
|
|
||||||
### Discovered Patterns (31 unique formats)
|
|
||||||
|
|
||||||
```
|
|
||||||
Standard Formats:
|
|
||||||
- a month ago
|
|
||||||
- a year ago
|
|
||||||
- 2 weeks ago, 3 weeks ago
|
|
||||||
- 2-11 months ago
|
|
||||||
- 2-11 years ago
|
|
||||||
|
|
||||||
Edited Variants:
|
|
||||||
- Edited 2 weeks ago
|
|
||||||
- Edited 3 months ago
|
|
||||||
- Edited a year ago
|
|
||||||
- Edited 2-11 years ago
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔬 Google's Algorithm (Reverse-Engineered)
|
|
||||||
|
|
||||||
### Pattern Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
Singular: "a {unit} ago"
|
|
||||||
Plural: "{number} {unit}s ago"
|
|
||||||
Edited: "Edited {pattern}"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key Rules:**
|
|
||||||
1. Google NEVER shows "1 month ago" - always "a month ago"
|
|
||||||
2. Weeks: Only 2-3 weeks (no "1 week" or "4 weeks")
|
|
||||||
3. Months: 2-11 months (no "1 month" or "12 months")
|
|
||||||
4. Years: "a year" then 2-11 years
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ⏱️ Time Range Boundaries
|
|
||||||
|
|
||||||
### Unit Thresholds (Estimated)
|
|
||||||
|
|
||||||
| From | To | Unit Displayed | Example |
|
|
||||||
|------|-----|----------------|---------|
|
|
||||||
| 0s | 59s | seconds | "30 seconds ago" |
|
|
||||||
| 1min | 59min | minutes | "45 minutes ago" |
|
|
||||||
| 1h | 23h | hours | "12 hours ago" |
|
|
||||||
| 1d | 6d | days | "5 days ago" |
|
|
||||||
| 7d | 27d | weeks | "2 weeks ago", "3 weeks ago" |
|
|
||||||
| 28d | 59d | month (singular) | "a month ago" |
|
|
||||||
| 60d | 364d | months (plural) | "2 months ago" ... "11 months ago" |
|
|
||||||
| 365d | 729d | year (singular) | "a year ago" |
|
|
||||||
| 730d | ∞ | years (plural) | "2 years ago" ... "11 years ago" |
|
|
||||||
|
|
||||||
### Observed Ranges from 244 Reviews
|
|
||||||
|
|
||||||
| Unit | Values Found | Range |
|
|
||||||
|------|--------------|-------|
|
|
||||||
| Weeks | [2, 3] | 2-3 weeks |
|
|
||||||
| Months | [2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | 2-11 months |
|
|
||||||
| Years | [2, 3, 4, 5, 6, 7, 8, 9, 10, 11] | 2-11 years |
|
|
||||||
|
|
||||||
**Note:** No reviews with seconds/minutes/hours/days in this dataset (all reviews were older than 2 weeks)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Uncertainty Analysis
|
|
||||||
|
|
||||||
### Why Dates Are Imprecise
|
|
||||||
|
|
||||||
Google Maps shows relative dates that are **rounded down to the largest unit**:
|
|
||||||
|
|
||||||
```
|
|
||||||
Review posted: December 15, 2025
|
|
||||||
Viewed on: January 18, 2026
|
|
||||||
Actual age: 34 days
|
|
||||||
|
|
||||||
Google shows: "a month ago"
|
|
||||||
Actual range: 30-59 days (±15 days uncertainty)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Uncertainty by Unit
|
|
||||||
|
|
||||||
| Pattern | Actual Range | Uncertainty | Example |
|
|
||||||
|---------|--------------|-------------|---------|
|
|
||||||
| "a month ago" | 30-59 days | ±15 days | Could be 30 or 59 days old |
|
|
||||||
| "2 months ago" | 60-89 days | ±15 days | Could be 60 or 89 days old |
|
|
||||||
| "3 months ago" | 90-119 days | ±15 days | Could be 90 or 119 days old |
|
|
||||||
| "a year ago" | 365-729 days | ±182 days (6 months!) | Could be 1 or 2 years old |
|
|
||||||
| "2 years ago" | 730-1094 days | ±182 days | Could be 2 or 3 years old |
|
|
||||||
|
|
||||||
### Maximum Uncertainty
|
|
||||||
|
|
||||||
- **Months:** ±15 days (~50% of a month)
|
|
||||||
- **Years:** ±6 months (~25% of 2 years)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Recommended Parsing Strategy
|
|
||||||
|
|
||||||
### Option 1: Midpoint (Current Implementation)
|
|
||||||
**Treat as exact midpoint**
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
"a month ago" → 45 days ago (midpoint of 30-59)
|
|
||||||
"2 months ago" → 75 days ago (midpoint of 60-89)
|
|
||||||
"a year ago" → 547 days ago (midpoint of 365-729)
|
|
||||||
```
|
|
||||||
|
|
||||||
✅ Simple to implement
|
|
||||||
✅ Statistically balanced
|
|
||||||
❌ Can be off by ±15 days (months) or ±6 months (years)
|
|
||||||
|
|
||||||
### Option 2: Conservative Lower Bound
|
|
||||||
**Assume oldest possible date**
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
"a month ago" → 59 days ago
|
|
||||||
"2 months ago" → 89 days ago
|
|
||||||
"a year ago" → 729 days ago
|
|
||||||
```
|
|
||||||
|
|
||||||
✅ Ensures reviews are AT MOST this old (age is never underestimated)
|
|
||||||
✅ Good for "show me reviews from last month" (inclusive)
|
|
||||||
❌ May exclude recent reviews
|
|
||||||
|
|
||||||
### Option 3: Optimistic Upper Bound
|
|
||||||
**Assume newest possible date**
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
"a month ago" → 30 days ago
|
|
||||||
"2 months ago" → 60 days ago
|
|
||||||
"a year ago" → 365 days ago
|
|
||||||
```
|
|
||||||
|
|
||||||
✅ Good for "show me reviews from last year" (exclusive)
|
|
||||||
❌ May include older reviews than expected
|
|
||||||
|
|
||||||
### Option 4: Range Filtering
|
|
||||||
**Store both bounds and filter inclusively**
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
"a month ago" → {min: 30 days, max: 59 days}
|
|
||||||
|
|
||||||
Filter "Last Month" (30 days):
|
|
||||||
Include if review.min_age <= 30 days
|
|
||||||
```
|
|
||||||
|
|
||||||
✅ Most accurate for filtering
|
|
||||||
✅ Accounts for all uncertainty
|
|
||||||
❌ More complex implementation
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 💡 Recommendation for Analytics Dashboard
|
|
||||||
|
|
||||||
### Use **Option 1 (Midpoint) + Grace Period**
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
function parseDateWithGracePeriod(dateText, graceFactor = 0.2) {
|
|
||||||
const midpoint = calculateMidpoint(dateText);
|
|
||||||
const grace = calculateUncertainty(dateText) * graceFactor;
|
|
||||||
|
|
||||||
return {
|
|
||||||
date: midpoint,
|
|
||||||
minDate: midpoint - grace,
|
|
||||||
maxDate: midpoint + grace
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Filter example:
|
|
||||||
// "Last Month" filter includes reviews where:
|
|
||||||
// review.date >= (30 days ago - grace)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Grace Period Values:**
|
|
||||||
- Weeks: ±0.7 days (20% of 3.5 days)
|
|
||||||
- Months: ±3 days (20% of 15 days)
|
|
||||||
- Years: ±36 days (20% of 182 days)
|
|
||||||
|
|
||||||
This provides a **buffer zone** to catch edge cases while maintaining statistical accuracy.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔧 Implementation Reference
|
|
||||||
|
|
||||||
### Complete Pattern Regex (English)
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
const GOOGLE_DATE_PATTERNS = {
|
|
||||||
// Singular
|
|
||||||
singular: /^a (second|minute|hour|day|week|month|year) ago$/,
|
|
||||||
|
|
||||||
// Plural
|
|
||||||
plural: /^(\d+) (seconds|minutes|hours|days|weeks|months|years) ago$/,
|
|
||||||
|
|
||||||
// Edited variants
|
|
||||||
edited_singular: /^Edited a (second|minute|hour|day|week|month|year) ago$/,
|
|
||||||
edited_plural: /^Edited (\d+) (seconds|minutes|hours|days|weeks|months|years) ago$/
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
### Extraction Function
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
function extractNumberAndUnit(dateText) {
|
|
||||||
// Remove "Edited " prefix
|
|
||||||
const cleaned = dateText.replace(/^Edited\s+/i, '');
|
|
||||||
|
|
||||||
// Check singular pattern
|
|
||||||
const singularMatch = cleaned.match(/^a (\w+) ago$/);
|
|
||||||
if (singularMatch) {
|
|
||||||
return { number: 1, unit: singularMatch[1] };
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check plural pattern
|
|
||||||
const pluralMatch = cleaned.match(/^(\d+) (\w+) ago$/);
|
|
||||||
if (pluralMatch) {
|
|
||||||
const unit = pluralMatch[2].replace(/s$/, ''); // Remove plural 's'
|
|
||||||
return { number: parseInt(pluralMatch[1]), unit };
|
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Midpoint Calculation with Uncertainty
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
const UNIT_RANGES = {
|
|
||||||
second: { min: 1, max: 59, days: 0 },
|
|
||||||
minute: { min: 1, max: 59, days: 0 },
|
|
||||||
hour: { min: 1, max: 23, days: 0 },
|
|
||||||
day: { min: 1, max: 6, days: 1 },
|
|
||||||
week: { min: 1, max: 3.9, days: 7 },
|
|
||||||
month: { min: 1, max: 11.9, days: 30 },
|
|
||||||
year: { min: 1, max: Infinity, days: 365 }
|
|
||||||
};
|
|
||||||
|
|
||||||
function calculateMidpointDays(number, unit) {
|
|
||||||
const range = UNIT_RANGES[unit];
|
|
||||||
const daysPerUnit = range.days;
|
|
||||||
|
|
||||||
// Special case for singular "a month ago" = 30-59 days
|
|
||||||
if (number === 1 && unit === 'month') {
|
|
||||||
return 45; // Midpoint of 30-59
|
|
||||||
}
|
|
||||||
|
|
||||||
// Special case for singular "a year ago" = 365-729 days
|
|
||||||
if (number === 1 && unit === 'year') {
|
|
||||||
return 547; // Midpoint of 365-729
|
|
||||||
}
|
|
||||||
|
|
||||||
// Standard calculation
|
|
||||||
const minDays = number * daysPerUnit;
|
|
||||||
const maxDays = (number + 0.999) * daysPerUnit;
|
|
||||||
|
|
||||||
return (minDays + maxDays) / 2;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 Statistical Analysis from Dataset
|
|
||||||
|
|
||||||
### Distribution of Review Ages (244 reviews)
|
|
||||||
|
|
||||||
| Time Range | Count | Percentage |
|
|
||||||
|------------|-------|------------|
|
|
||||||
| 2-3 weeks | ~2 | <1% |
|
|
||||||
| 1-12 months | ~15 | 6% |
|
|
||||||
| 1-2 years | ~30 | 12% |
|
|
||||||
| 2-5 years | ~60 | 25% |
|
|
||||||
| 5+ years | ~137 | 56% |
|
|
||||||
|
|
||||||
**Median Age:** ~5 years
|
|
||||||
**Oldest Review:** 11 years ago
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Validation
|
|
||||||
|
|
||||||
### Test Cases
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
const testCases = [
|
|
||||||
{ input: "a month ago", expected_days: 45, range: [30, 59] },
|
|
||||||
{ input: "2 months ago", expected_days: 75, range: [60, 89] },
|
|
||||||
{ input: "3 weeks ago", expected_days: 24, range: [21, 27] },
|
|
||||||
{ input: "a year ago", expected_days: 547, range: [365, 729] },
|
|
||||||
{ input: "Edited 2 years ago", expected_days: 913, range: [730, 1094] }
|
|
||||||
];
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎓 Conclusion
|
|
||||||
|
|
||||||
**Google's Date Formatter:**
|
|
||||||
- Custom internal implementation (not a public library)
|
|
||||||
- Simple, user-friendly patterns
|
|
||||||
- Intentionally imprecise (UX over accuracy)
|
|
||||||
- Maximum uncertainty: ±6 months for "a year ago"
|
|
||||||
|
|
||||||
**For Analytics:**
|
|
||||||
- Use midpoint calculation for balanced accuracy
|
|
||||||
- Add 10-20% grace period for filters
|
|
||||||
- Accept that ±15 days is unavoidable for month-level precision
|
|
||||||
- Consider showing date ranges in UI: "1-2 months ago" instead of "45 days ago"
|
|
||||||
|
|
||||||
**Bottom Line:** Our regex-based parser extracting from English text is the **only possible approach** and achieves the **best accuracy** given Google's intentional imprecision.
|
|
||||||
570
HEALTH_CHECKS.md
570
HEALTH_CHECKS.md
@@ -1,570 +0,0 @@
|
|||||||
# Production Health Check Strategy
|
|
||||||
## Verify Actual Scraping Works
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Problem with Basic Health Checks
|
|
||||||
|
|
||||||
### What Basic Health Checks Test:
|
|
||||||
```python
|
|
||||||
@app.get("/health")
|
|
||||||
async def health():
|
|
||||||
db_ok = await ping_database() # ✅ DB responds
|
|
||||||
redis_ok = await ping_redis() # ✅ Redis responds
|
|
||||||
disk_ok = check_disk_space() < 90 # ✅ Disk not full
|
|
||||||
|
|
||||||
return {"status": "healthy"}
|
|
||||||
```
|
|
||||||
|
|
||||||
### What They DON'T Test:
|
|
||||||
- ❌ Can we actually scrape Google Maps?
|
|
||||||
- ❌ Is Chrome working?
|
|
||||||
- ❌ Are CSS selectors still valid?
|
|
||||||
- ❌ Is GDPR handling working?
|
|
||||||
- ❌ Did Google change their page structure?
|
|
||||||
- ❌ Is our proxy/network working?
|
|
||||||
|
|
||||||
### Real-World Failure Example:
|
|
||||||
```
|
|
||||||
✅ Database: healthy
|
|
||||||
✅ Redis: healthy
|
|
||||||
✅ Disk: 45% used
|
|
||||||
❌ Actual scraping: BROKEN (Google changed selectors)
|
|
||||||
|
|
||||||
→ Health check says "healthy" but all jobs fail!
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Solution: Synthetic Monitoring
|
|
||||||
|
|
||||||
### Concept: Canary Tests
|
|
||||||
|
|
||||||
Run an **actual scraping job** periodically on a known test URL:
|
|
||||||
|
|
||||||
```python
|
|
||||||
TEST_URL = "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/..."
|
|
||||||
# A stable business that always has reviews
|
|
||||||
|
|
||||||
Every 4-6 hours:
|
|
||||||
1. Run actual scrape on test URL
|
|
||||||
2. Verify we get reviews
|
|
||||||
3. Verify data structure is correct
|
|
||||||
4. Verify scrape time is reasonable
|
|
||||||
5. Alert if anything fails
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏗️ Implementation
|
|
||||||
|
|
||||||
### 1. Canary Scraping Endpoint
|
|
||||||
|
|
||||||
```python
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
|
|
||||||
# Store last canary result
|
|
||||||
canary_state = {
|
|
||||||
"last_run": None,
|
|
||||||
"last_success": None,
|
|
||||||
"last_result": None,
|
|
||||||
"consecutive_failures": 0
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.get("/health/canary")
|
|
||||||
async def canary_health_check():
|
|
||||||
"""
|
|
||||||
Run a real scraping test to verify the scraper works.
|
|
||||||
|
|
||||||
This is the MOST IMPORTANT health check - it verifies:
|
|
||||||
- Chrome can start
|
|
||||||
- Google Maps is accessible
|
|
||||||
- Selectors still work
|
|
||||||
- GDPR handling works
|
|
||||||
- We can extract reviews
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Don't run too frequently (rate limit to avoid Google detection)
|
|
||||||
if canary_state["last_run"]:
|
|
||||||
elapsed = datetime.now() - canary_state["last_run"]
|
|
||||||
if elapsed < timedelta(hours=1):
|
|
||||||
# Return cached result
|
|
||||||
return {
|
|
||||||
"status": "cached",
|
|
||||||
"last_run": canary_state["last_run"].isoformat(),
|
|
||||||
"last_result": canary_state["last_result"],
|
|
||||||
"cached_for": f"{elapsed.total_seconds():.0f}s"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Run canary test
|
|
||||||
canary_state["last_run"] = datetime.now()
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Use a known stable business
|
|
||||||
TEST_URL = "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/"
|
|
||||||
|
|
||||||
# Run actual scrape with timeout
|
|
||||||
result = await asyncio.wait_for(
|
|
||||||
fast_scrape_reviews(
|
|
||||||
url=TEST_URL,
|
|
||||||
headless=True,
|
|
||||||
max_scrolls=10 # Limited for canary
|
|
||||||
),
|
|
||||||
timeout=60 # Fail if takes > 60s
|
|
||||||
)
|
|
||||||
|
|
||||||
# Validate result
|
|
||||||
checks = {
|
|
||||||
"scrape_succeeded": result['success'],
|
|
||||||
"got_reviews": result['count'] > 0,
|
|
||||||
"reasonable_count": 10 <= result['count'] <= 500,
|
|
||||||
"reasonable_time": result['time'] < 30,
|
|
||||||
"data_structure_valid": validate_review_structure(result['reviews']),
|
|
||||||
}
|
|
||||||
|
|
||||||
all_passed = all(checks.values())
|
|
||||||
|
|
||||||
if all_passed:
|
|
||||||
canary_state["consecutive_failures"] = 0
|
|
||||||
canary_state["last_success"] = datetime.now()
|
|
||||||
canary_state["last_result"] = {
|
|
||||||
"status": "pass",
|
|
||||||
"reviews_count": result['count'],
|
|
||||||
"scrape_time": result['time'],
|
|
||||||
"checks": checks
|
|
||||||
}
|
|
||||||
status_code = 200
|
|
||||||
else:
|
|
||||||
canary_state["consecutive_failures"] += 1
|
|
||||||
canary_state["last_result"] = {
|
|
||||||
"status": "fail",
|
|
||||||
"reviews_count": result['count'],
|
|
||||||
"scrape_time": result['time'],
|
|
||||||
"checks": checks,
|
|
||||||
"consecutive_failures": canary_state["consecutive_failures"]
|
|
||||||
}
|
|
||||||
status_code = 503 # Service Unavailable
|
|
||||||
|
|
||||||
return JSONResponse(
|
|
||||||
status_code=status_code,
|
|
||||||
content={
|
|
||||||
"status": "pass" if all_passed else "fail",
|
|
||||||
"last_run": canary_state["last_run"].isoformat(),
|
|
||||||
"last_success": canary_state["last_success"].isoformat() if canary_state["last_success"] else None,
|
|
||||||
"result": canary_state["last_result"],
|
|
||||||
"details": {
|
|
||||||
"test_url": TEST_URL,
|
|
||||||
"reviews_found": result['count'],
|
|
||||||
"scrape_time_seconds": result['time'],
|
|
||||||
"checks": checks
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
canary_state["consecutive_failures"] += 1
|
|
||||||
canary_state["last_result"] = {
|
|
||||||
"status": "timeout",
|
|
||||||
"error": "Scrape took longer than 60 seconds"
|
|
||||||
}
|
|
||||||
return JSONResponse(
|
|
||||||
status_code=503,
|
|
||||||
content={
|
|
||||||
"status": "timeout",
|
|
||||||
"error": "Canary scrape timeout (>60s)",
|
|
||||||
"consecutive_failures": canary_state["consecutive_failures"]
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
canary_state["consecutive_failures"] += 1
|
|
||||||
canary_state["last_result"] = {
|
|
||||||
"status": "error",
|
|
||||||
"error": str(e)
|
|
||||||
}
|
|
||||||
return JSONResponse(
|
|
||||||
status_code=503,
|
|
||||||
content={
|
|
||||||
"status": "error",
|
|
||||||
"error": str(e),
|
|
||||||
"consecutive_failures": canary_state["consecutive_failures"]
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_review_structure(reviews):
|
|
||||||
"""Validate that reviews have expected structure"""
|
|
||||||
if not reviews or len(reviews) == 0:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check first review has required fields
|
|
||||||
first_review = reviews[0]
|
|
||||||
required_fields = ['author', 'rating', 'date_text']
|
|
||||||
|
|
||||||
return all(field in first_review for field in required_fields)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. Background Canary Runner
|
|
||||||
|
|
||||||
Instead of running on health check endpoint (which gets called frequently), run in background:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import asyncio
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
|
|
||||||
class CanaryMonitor:
|
|
||||||
"""Background task that runs canary tests periodically"""
|
|
||||||
|
|
||||||
def __init__(self, interval_hours=4):
|
|
||||||
self.interval = timedelta(hours=interval_hours)
|
|
||||||
self.last_run = None
|
|
||||||
self.last_success = None
|
|
||||||
self.consecutive_failures = 0
|
|
||||||
self.running = False
|
|
||||||
|
|
||||||
async def start(self):
|
|
||||||
"""Start the background canary monitoring"""
|
|
||||||
self.running = True
|
|
||||||
|
|
||||||
while self.running:
|
|
||||||
try:
|
|
||||||
await self.run_canary()
|
|
||||||
except Exception as e:
|
|
||||||
log.error(f"Canary test failed: {e}")
|
|
||||||
self.consecutive_failures += 1
|
|
||||||
|
|
||||||
# Alert if multiple consecutive failures
|
|
||||||
if self.consecutive_failures >= 3:
|
|
||||||
await self.send_alert(
|
|
||||||
f"🚨 CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row!"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Sleep until next run
|
|
||||||
await asyncio.sleep(self.interval.total_seconds())
|
|
||||||
|
|
||||||
async def run_canary(self):
|
|
||||||
"""Run a single canary test"""
|
|
||||||
log.info("Running canary scrape test...")
|
|
||||||
self.last_run = datetime.now()
|
|
||||||
|
|
||||||
TEST_URL = "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/"
|
|
||||||
|
|
||||||
result = await asyncio.wait_for(
|
|
||||||
fast_scrape_reviews(url=TEST_URL, headless=True, max_scrolls=10),
|
|
||||||
timeout=60
|
|
||||||
)
|
|
||||||
|
|
||||||
# Validate result
|
|
||||||
if result['success'] and result['count'] >= 10 and result['time'] < 30:
|
|
||||||
log.info(f"✅ Canary test passed: {result['count']} reviews in {result['time']:.1f}s")
|
|
||||||
self.consecutive_failures = 0
|
|
||||||
self.last_success = datetime.now()
|
|
||||||
|
|
||||||
# Store result in database for tracking
|
|
||||||
await db.execute("""
|
|
||||||
INSERT INTO canary_results (timestamp, success, reviews_count, scrape_time)
|
|
||||||
VALUES (NOW(), true, %s, %s)
|
|
||||||
""", result['count'], result['time'])
|
|
||||||
|
|
||||||
else:
|
|
||||||
log.error(f"❌ Canary test failed: {result}")
|
|
||||||
self.consecutive_failures += 1
|
|
||||||
|
|
||||||
await db.execute("""
|
|
||||||
INSERT INTO canary_results (timestamp, success, error_message)
|
|
||||||
VALUES (NOW(), false, %s)
|
|
||||||
""", result.get('error', 'Unknown error'))
|
|
||||||
|
|
||||||
raise Exception(f"Canary validation failed: {result}")
|
|
||||||
|
|
||||||
async def send_alert(self, message):
|
|
||||||
"""Send alert via Slack/email/PagerDuty when canary fails"""
|
|
||||||
# Slack webhook
|
|
||||||
await httpx.post(
|
|
||||||
SLACK_WEBHOOK_URL,
|
|
||||||
json={"text": message}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Or email
|
|
||||||
await send_email(
|
|
||||||
to="oncall@example.com",
|
|
||||||
subject="Scraper Canary Failure",
|
|
||||||
body=message
|
|
||||||
)
|
|
||||||
|
|
||||||
def stop(self):
|
|
||||||
"""Stop the background monitoring"""
|
|
||||||
self.running = False
|
|
||||||
|
|
||||||
|
|
||||||
# In api_server.py startup
|
|
||||||
canary_monitor = CanaryMonitor(interval_hours=4)
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def lifespan(app: FastAPI):
|
|
||||||
# Startup
|
|
||||||
asyncio.create_task(canary_monitor.start())
|
|
||||||
|
|
||||||
yield
|
|
||||||
|
|
||||||
# Shutdown
|
|
||||||
canary_monitor.stop()
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 3. Canary Health Check Endpoint (Fast)
|
|
||||||
|
|
||||||
```python
|
|
||||||
@app.get("/health/canary")
|
|
||||||
async def get_canary_status():
|
|
||||||
"""
|
|
||||||
Return the LATEST canary test result (doesn't run a new test).
|
|
||||||
|
|
||||||
Use this for health checks from load balancers / monitoring systems.
|
|
||||||
"""
|
|
||||||
if not canary_monitor.last_success:
|
|
||||||
return JSONResponse(
|
|
||||||
status_code=503,
|
|
||||||
content={
|
|
||||||
"status": "unknown",
|
|
||||||
"message": "No successful canary test yet"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check if last success was recent enough
|
|
||||||
age = datetime.now() - canary_monitor.last_success
|
|
||||||
max_age = timedelta(hours=6)
|
|
||||||
|
|
||||||
if age > max_age:
|
|
||||||
return JSONResponse(
|
|
||||||
status_code=503,
|
|
||||||
content={
|
|
||||||
"status": "stale",
|
|
||||||
"last_success": canary_monitor.last_success.isoformat(),
|
|
||||||
"age_hours": age.total_seconds() / 3600,
|
|
||||||
"message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Recent success - all good!
|
|
||||||
return {
|
|
||||||
"status": "healthy",
|
|
||||||
"last_success": canary_monitor.last_success.isoformat(),
|
|
||||||
"age_minutes": age.total_seconds() / 60,
|
|
||||||
"consecutive_failures": canary_monitor.consecutive_failures
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Complete Health Check Hierarchy
|
|
||||||
|
|
||||||
### 1. **Liveness** (Is the app alive?)
|
|
||||||
```python
|
|
||||||
@app.get("/health/live")
|
|
||||||
async def liveness():
|
|
||||||
# Simple: can the server respond?
|
|
||||||
return {"status": "alive"}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Use**: Kubernetes liveness probe (restart if fails)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. **Readiness** (Can the app handle traffic?)
|
|
||||||
```python
|
|
||||||
@app.get("/health/ready")
|
|
||||||
async def readiness():
|
|
||||||
# Check dependencies
|
|
||||||
db_ok = await ping_database()
|
|
||||||
redis_ok = await ping_redis()
|
|
||||||
|
|
||||||
if db_ok and redis_ok:
|
|
||||||
return {"status": "ready"}
|
|
||||||
else:
|
|
||||||
raise HTTPException(status_code=503, detail="Not ready")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Use**: Kubernetes readiness probe (remove from load balancer if fails)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 3. **Canary** (Does scraping actually work?)
|
|
||||||
```python
|
|
||||||
@app.get("/health/canary")
|
|
||||||
async def canary():
|
|
||||||
# Return last canary test result
|
|
||||||
if canary_monitor.last_success and datetime.now() - canary_monitor.last_success < timedelta(hours=6):
|
|
||||||
return {"status": "healthy"}
|
|
||||||
else:
|
|
||||||
return JSONResponse(status_code=503, content={"status": "unhealthy"})
|
|
||||||
```
|
|
||||||
|
|
||||||
**Use**: External monitoring (PagerDuty, DataDog) - alerts if fails
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 4. **Detailed** (Full system status)
|
|
||||||
```python
|
|
||||||
@app.get("/health/detailed")
|
|
||||||
async def detailed_health():
|
|
||||||
return {
|
|
||||||
"status": "healthy",
|
|
||||||
"components": {
|
|
||||||
"api": {"status": "healthy", "latency_ms": 1},
|
|
||||||
"database": {"status": "healthy", "latency_ms": 5},
|
|
||||||
"redis": {"status": "healthy", "latency_ms": 2},
|
|
||||||
"workers": {"status": "healthy", "active": 4},
|
|
||||||
"canary": {
|
|
||||||
"status": "healthy",
|
|
||||||
"last_success": "2026-01-18T10:30:00Z",
|
|
||||||
"age_minutes": 45,
|
|
||||||
"consecutive_failures": 0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"timestamp": datetime.utcnow().isoformat()
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Use**: Monitoring dashboards, debugging
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 Monitoring Strategy
|
|
||||||
|
|
||||||
### Canary Test Schedule
|
|
||||||
|
|
||||||
```
|
|
||||||
Every 4 hours:
|
|
||||||
- Run full canary test
|
|
||||||
- Store result in database
|
|
||||||
- Alert if fails
|
|
||||||
|
|
||||||
Benefits:
|
|
||||||
✅ Detects Google Maps changes within 4 hours
|
|
||||||
✅ Detects selector breakage quickly
|
|
||||||
✅ Low overhead (6 tests/day)
|
|
||||||
✅ Won't trigger Google rate limits
|
|
||||||
```
|
|
||||||
|
|
||||||
### Alert Rules
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Alert on consecutive failures
|
|
||||||
if consecutive_failures >= 3:
|
|
||||||
send_pagerduty_alert("CRITICAL: Scraper broken")
|
|
||||||
|
|
||||||
# Alert on slow canary
|
|
||||||
if scrape_time > 30:
|
|
||||||
send_slack_alert("WARNING: Scraper slow")
|
|
||||||
|
|
||||||
# Alert on low review count
|
|
||||||
if reviews_count < 10:
|
|
||||||
send_slack_alert("WARNING: Low review count in canary")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Canary Database Tracking
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE canary_results (
|
|
||||||
id SERIAL PRIMARY KEY,
|
|
||||||
timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
|
|
||||||
success BOOLEAN NOT NULL,
|
|
||||||
reviews_count INTEGER,
|
|
||||||
scrape_time REAL,
|
|
||||||
error_message TEXT,
|
|
||||||
metadata JSONB
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX idx_canary_timestamp ON canary_results(timestamp DESC);
|
|
||||||
|
|
||||||
-- Query to see canary health over time
|
|
||||||
SELECT
|
|
||||||
DATE_TRUNC('day', timestamp) as day,
|
|
||||||
COUNT(*) as total_tests,
|
|
||||||
SUM(CASE WHEN success THEN 1 ELSE 0 END) as successful,
|
|
||||||
AVG(scrape_time) as avg_scrape_time,
|
|
||||||
AVG(reviews_count) as avg_reviews
|
|
||||||
FROM canary_results
|
|
||||||
WHERE timestamp > NOW() - INTERVAL '7 days'
|
|
||||||
GROUP BY day
|
|
||||||
ORDER BY day DESC;
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Complete Health Check Implementation
|
|
||||||
|
|
||||||
```python
|
|
||||||
# health_checks.py
|
|
||||||
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
import asyncio
|
|
||||||
from typing import Dict, Any
|
|
||||||
|
|
||||||
class HealthCheckSystem:
|
|
||||||
"""Complete health check system for production"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.canary = CanaryMonitor(interval_hours=4)
|
|
||||||
|
|
||||||
async def start(self):
|
|
||||||
"""Start background health monitoring"""
|
|
||||||
asyncio.create_task(self.canary.start())
|
|
||||||
|
|
||||||
@property
|
|
||||||
def is_healthy(self) -> bool:
|
|
||||||
"""Overall system health"""
|
|
||||||
return (
|
|
||||||
self.canary.consecutive_failures < 3 and
|
|
||||||
self.canary.last_success is not None and
|
|
||||||
(datetime.now() - self.canary.last_success) < timedelta(hours=6)
|
|
||||||
)
|
|
||||||
|
|
||||||
async def get_status(self) -> Dict[str, Any]:
|
|
||||||
"""Get complete health status"""
|
|
||||||
db_latency = await self.check_database()
|
|
||||||
redis_latency = await self.check_redis()
|
|
||||||
|
|
||||||
return {
|
|
||||||
"status": "healthy" if self.is_healthy else "degraded",
|
|
||||||
"components": {
|
|
||||||
"database": {
|
|
||||||
"healthy": db_latency is not None,
|
|
||||||
"latency_ms": db_latency
|
|
||||||
},
|
|
||||||
"redis": {
|
|
||||||
"healthy": redis_latency is not None,
|
|
||||||
"latency_ms": redis_latency
|
|
||||||
},
|
|
||||||
"canary_scraper": {
|
|
||||||
"healthy": self.canary.consecutive_failures == 0,
|
|
||||||
"last_success": self.canary.last_success.isoformat() if self.canary.last_success else None,
|
|
||||||
"consecutive_failures": self.canary.consecutive_failures
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"timestamp": datetime.utcnow().isoformat()
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Production Recommendations
|
|
||||||
|
|
||||||
1. ✅ **Run canary every 4-6 hours** (balanced between freshness and overhead)
|
|
||||||
2. ✅ **Alert after 3 consecutive failures** (avoid false positives)
|
|
||||||
3. ✅ **Store canary results in database** (historical tracking)
|
|
||||||
4. ✅ **Use different health checks for different purposes**:
|
|
||||||
- `/health/live` → Kubernetes liveness (restart if fails)
|
|
||||||
- `/health/ready` → Kubernetes readiness (route traffic)
|
|
||||||
- `/health/canary` → External monitoring (PagerDuty alerts)
|
|
||||||
5. ✅ **Monitor canary metrics**: scrape time, review count, success rate
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**The canary test is your MOST IMPORTANT health check** - it's the only one that verifies your core business logic actually works!
|
|
||||||
@@ -1,833 +0,0 @@
|
|||||||
# Production Microservice Architecture
|
|
||||||
## Google Reviews Scraper API
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Recommended Communication Patterns
|
|
||||||
|
|
||||||
### 1. **Webhooks** (Primary - RECOMMENDED) ✅
|
|
||||||
|
|
||||||
**Best for**: Production async job processing
|
|
||||||
|
|
||||||
```
|
|
||||||
Client → POST /scrape (with webhook_url)
|
|
||||||
↓
|
|
||||||
Server → Starts job, returns job_id
|
|
||||||
↓
|
|
||||||
[Scraping in progress...]
|
|
||||||
↓
|
|
||||||
Server → POST to client's webhook_url when complete
|
|
||||||
{
|
|
||||||
"job_id": "...",
|
|
||||||
"status": "completed",
|
|
||||||
"reviews_count": 244,
|
|
||||||
"reviews_url": "https://api.example.com/jobs/{job_id}/reviews"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Advantages**:
|
|
||||||
- ✅ No polling needed (reduces server load)
|
|
||||||
- ✅ Instant notifications when job completes
|
|
||||||
- ✅ Industry standard (Stripe, GitHub, Twilio use this)
|
|
||||||
- ✅ Client can go offline and come back
|
|
||||||
- ✅ Scales to millions of jobs
|
|
||||||
|
|
||||||
**Use cases**:
|
|
||||||
- Batch processing systems
|
|
||||||
- Integration with other services
|
|
||||||
- When client has a public endpoint
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. **Server-Sent Events (SSE)** (Real-time Updates) ⚡
|
|
||||||
|
|
||||||
**Best for**: Real-time progress monitoring
|
|
||||||
|
|
||||||
```
|
|
||||||
Client → GET /jobs/{job_id}/stream (keeps connection open)
|
|
||||||
↓
|
|
||||||
Server → Sends progress updates in real-time:
|
|
||||||
|
|
||||||
data: {"stage": "scrolling", "reviews_loaded": 50}
|
|
||||||
|
|
||||||
data: {"stage": "scrolling", "reviews_loaded": 100}
|
|
||||||
|
|
||||||
data: {"stage": "extracting", "reviews_loaded": 244}
|
|
||||||
|
|
||||||
data: {"stage": "completed", "total": 244}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Advantages**:
|
|
||||||
- ✅ Real-time progress updates
|
|
||||||
- ✅ HTTP-based (works through firewalls)
|
|
||||||
- ✅ Lightweight (one-way communication)
|
|
||||||
- ✅ Auto-reconnection support
|
|
||||||
- ✅ Great for dashboards/UIs
|
|
||||||
|
|
||||||
**Use cases**:
|
|
||||||
- Web dashboards
|
|
||||||
- Real-time monitoring
|
|
||||||
- Progress bars in UI
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 3. **Polling** (Fallback) 🔄
|
|
||||||
|
|
||||||
**Best for**: Simple clients, no webhook capability
|
|
||||||
|
|
||||||
```
|
|
||||||
Client → POST /scrape
|
|
||||||
↓
|
|
||||||
Server → Returns job_id
|
|
||||||
↓
|
|
||||||
Client → Polls GET /jobs/{job_id} every 2-5 seconds
|
|
||||||
↓
|
|
||||||
Server → Returns current status
|
|
||||||
```
|
|
||||||
|
|
||||||
**Advantages**:
|
|
||||||
- ✅ Simple to implement
|
|
||||||
- ✅ Works everywhere (no public endpoint needed)
|
|
||||||
- ✅ Firewall-friendly
|
|
||||||
|
|
||||||
**Disadvantages**:
|
|
||||||
- ❌ Inefficient (many wasted requests)
|
|
||||||
- ❌ Delayed notifications (polling interval)
|
|
||||||
- ❌ Higher server load
|
|
||||||
|
|
||||||
**Use cases**:
|
|
||||||
- Internal tools
|
|
||||||
- Clients behind firewalls
|
|
||||||
- Simple integrations
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏛️ Complete Production Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────┐
|
|
||||||
│ LOAD BALANCER │
|
|
||||||
│ (nginx/AWS ALB) │
|
|
||||||
└──────────┬──────────────────────────────────┬────────────────┘
|
|
||||||
│ │
|
|
||||||
▼ ▼
|
|
||||||
┌──────────────────────┐ ┌──────────────────────┐
|
|
||||||
│ API Server 1 │ │ API Server 2 │
|
|
||||||
│ (FastAPI) │ │ (FastAPI) │
|
|
||||||
│ - REST endpoints │ │ - REST endpoints │
|
|
||||||
│ - Health checks │ │ - Health checks │
|
|
||||||
│ - Job management │ │ - Job management │
|
|
||||||
└──────────┬───────────┘ └──────────┬───────────┘
|
|
||||||
│ │
|
|
||||||
└────────────┬───────────────────┘
|
|
||||||
▼
|
|
||||||
┌────────────────────────┐
|
|
||||||
│ REDIS / RabbitMQ │
|
|
||||||
│ (Job Queue) │
|
|
||||||
│ │
|
|
||||||
│ - Pending jobs │
|
|
||||||
│ - Job distribution │
|
|
||||||
│ - Pub/Sub for events │
|
|
||||||
└────────┬───────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┴──────────────┐
|
|
||||||
│ │
|
|
||||||
▼ ▼
|
|
||||||
┌─────────────┐ ┌─────────────┐
|
|
||||||
│ Worker 1 │ │ Worker 2 │
|
|
||||||
│ │ │ │
|
|
||||||
│ - Scraping │ │ - Scraping │
|
|
||||||
│ - Headless │ │ - Headless │
|
|
||||||
│ - Chrome │ │ - Chrome │
|
|
||||||
└─────┬───────┘ └─────┬───────┘
|
|
||||||
│ │
|
|
||||||
└────────────┬───────────────┘
|
|
||||||
▼
|
|
||||||
┌──────────────────────────────┐
|
|
||||||
│ PERSISTENT STORAGE │
|
|
||||||
│ │
|
|
||||||
│ ┌────────────────────────┐ │
|
|
||||||
│ │ PostgreSQL / MongoDB │ │
|
|
||||||
│ │ - Job metadata │ │
|
|
||||||
│ │ - Status tracking │ │
|
|
||||||
│ │ - Webhook configs │ │
|
|
||||||
│ └────────────────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ ┌────────────────────────┐ │
|
|
||||||
│ │ File Storage / S3 │ │
|
|
||||||
│ │ - Review JSON files │ │
|
|
||||||
│ │ - Large payloads │ │
|
|
||||||
│ └────────────────────────┘ │
|
|
||||||
└───────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌─────────────────────┐
|
|
||||||
│ Webhook Dispatcher │
|
|
||||||
│ - Retry logic │
|
|
||||||
│ - Dead letter queue│
|
|
||||||
└─────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
[Client's webhook URL]
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📦 Component Breakdown
|
|
||||||
|
|
||||||
### 1. **API Server** (FastAPI)
|
|
||||||
|
|
||||||
**Responsibilities**:
|
|
||||||
- Handle HTTP requests
|
|
||||||
- Validate input
|
|
||||||
- Enqueue jobs
|
|
||||||
- Serve results
|
|
||||||
- Health checks
|
|
||||||
|
|
||||||
**Endpoints**:
|
|
||||||
```python
|
|
||||||
POST /scrape # Submit job
|
|
||||||
GET /jobs/{id} # Get job status
|
|
||||||
GET /jobs/{id}/reviews # Get results
|
|
||||||
GET /jobs/{id}/stream # SSE progress stream
|
|
||||||
DELETE /jobs/{id} # Cancel job
|
|
||||||
GET /health # Health check
|
|
||||||
GET /metrics # Prometheus metrics
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. **Job Queue** (Redis or RabbitMQ)
|
|
||||||
|
|
||||||
**Why needed**:
|
|
||||||
- Decouple API from scraping workers
|
|
||||||
- Distribute load across workers
|
|
||||||
- Retry failed jobs
|
|
||||||
- Handle backpressure
|
|
||||||
|
|
||||||
**Options**:
|
|
||||||
|
|
||||||
**Option A: Redis** (Recommended for simpler setups)
|
|
||||||
```text
|
|
||||||
# Fast, simple, good for most use cases
|
|
||||||
- In-memory queue
|
|
||||||
- Pub/Sub for events
|
|
||||||
- Job state storage
|
|
||||||
- Session storage
|
|
||||||
```
|
|
||||||
|
|
||||||
**Option B: RabbitMQ** (For complex workflows)
|
|
||||||
```text
|
|
||||||
# More features, better for complex scenarios
|
|
||||||
- Guaranteed delivery
|
|
||||||
- Advanced routing
|
|
||||||
- Dead letter queues
|
|
||||||
- Priority queues
|
|
||||||
```
|
|
||||||
|
|
||||||
**Recommendation**: Start with **Redis**, upgrade to RabbitMQ if needed.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 3. **Worker Processes** (Celery or Custom)
|
|
||||||
|
|
||||||
**Responsibilities**:
|
|
||||||
- Pull jobs from queue
|
|
||||||
- Run scraping (headless Chrome)
|
|
||||||
- Save results
|
|
||||||
- Send webhooks
|
|
||||||
- Update job status
|
|
||||||
|
|
||||||
**Scaling**:
|
|
||||||
```bash
|
|
||||||
# Run 4 workers on same machine
|
|
||||||
celery -A worker worker --concurrency=4
|
|
||||||
|
|
||||||
# Or 4 separate processes
|
|
||||||
python worker.py &
|
|
||||||
python worker.py &
|
|
||||||
python worker.py &
|
|
||||||
python worker.py &
|
|
||||||
|
|
||||||
# Or Kubernetes deployment
|
|
||||||
kubectl scale deployment scraper-worker --replicas=10
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 4. **Database** (PostgreSQL or MongoDB)
|
|
||||||
|
|
||||||
**Job Metadata Schema**:
|
|
||||||
|
|
||||||
**PostgreSQL** (Recommended):
|
|
||||||
```sql
|
|
||||||
CREATE TABLE jobs (
|
|
||||||
job_id UUID PRIMARY KEY,
|
|
||||||
status VARCHAR(20) NOT NULL,
|
|
||||||
url TEXT NOT NULL,
|
|
||||||
webhook_url TEXT,
|
|
||||||
created_at TIMESTAMP NOT NULL,
|
|
||||||
started_at TIMESTAMP,
|
|
||||||
completed_at TIMESTAMP,
|
|
||||||
reviews_count INTEGER,
|
|
||||||
reviews_file_path TEXT,
|
|
||||||
error_message TEXT,
|
|
||||||
metadata JSONB
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX idx_jobs_status ON jobs(status);
|
|
||||||
CREATE INDEX idx_jobs_created_at ON jobs(created_at);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why PostgreSQL**:
|
|
||||||
- ✅ ACID transactions
|
|
||||||
- ✅ Good for structured data
|
|
||||||
- ✅ SQL queries
|
|
||||||
- ✅ Mature ecosystem
|
|
||||||
|
|
||||||
**Alternative - MongoDB**:
|
|
||||||
```javascript
|
|
||||||
{
|
|
||||||
_id: ObjectId("..."),
|
|
||||||
job_id: "550e8400-...",
|
|
||||||
status: "completed",
|
|
||||||
url: "https://...",
|
|
||||||
webhook_url: "https://...",
|
|
||||||
created_at: ISODate("2026-01-18T..."),
|
|
||||||
reviews_count: 244,
|
|
||||||
reviews_file: "/data/reviews/550e8400.json",
|
|
||||||
metadata: { ... }
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why MongoDB**:
|
|
||||||
- ✅ Flexible schema
|
|
||||||
- ✅ Good for document storage
|
|
||||||
- ✅ Built-in sharding
|
|
||||||
|
|
||||||
**Recommendation**: **PostgreSQL** for most cases (better for job queues and transactions)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 5. **File Storage**
|
|
||||||
|
|
||||||
**Options**:
|
|
||||||
|
|
||||||
**Option A: Local Filesystem** (Development/Small scale)
|
|
||||||
```text
|
|
||||||
/data/reviews/
|
|
||||||
├── 550e8400-e29b-41d4-a716-446655440000.json
|
|
||||||
├── 6a1f9b2c-3d4e-4f6a-8b7c-9d0e1f2a3b4c.json
|
|
||||||
└── ...
|
|
||||||
```
|
|
||||||
|
|
||||||
**Option B: S3 / Object Storage** (Production - RECOMMENDED)
|
|
||||||
```text
|
|
||||||
s3://scraper-reviews-bucket/
|
|
||||||
├── 2026/01/18/550e8400-e29b-41d4-a716-446655440000.json
|
|
||||||
├── 2026/01/18/6a1f9b2c-3d4e-4f6a-8b7c-9d0e1f2a3b4c.json
|
|
||||||
└── ...
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why S3**:
|
|
||||||
- ✅ Unlimited storage
|
|
||||||
- ✅ No disk management
|
|
||||||
- ✅ High availability
|
|
||||||
- ✅ Versioning support
|
|
||||||
- ✅ Pre-signed URLs for direct access
|
|
||||||
- ✅ Lifecycle policies (auto-delete old files)
|
|
||||||
|
|
||||||
**Recommendation**: **S3 (or compatible)** for production
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 6. **Webhook Dispatcher**
|
|
||||||
|
|
||||||
**Features**:
|
|
||||||
- ✅ Retry logic (exponential backoff)
|
|
||||||
- ✅ Dead letter queue for failed webhooks
|
|
||||||
- ✅ Webhook signatures (HMAC for security)
|
|
||||||
- ✅ Timeout handling
|
|
||||||
- ✅ Async delivery
|
|
||||||
|
|
||||||
**Implementation**:
|
|
||||||
```python
|
|
||||||
async def send_webhook(webhook_url, payload, max_retries=3):
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
try:
|
|
||||||
# Add signature
|
|
||||||
signature = hmac.new(
|
|
||||||
WEBHOOK_SECRET,
|
|
||||||
json.dumps(payload).encode(),
|
|
||||||
hashlib.sha256
|
|
||||||
).hexdigest()
|
|
||||||
|
|
||||||
# Send with timeout
|
|
||||||
async with httpx.AsyncClient() as client:
|
|
||||||
response = await client.post(
|
|
||||||
webhook_url,
|
|
||||||
json=payload,
|
|
||||||
headers={"X-Webhook-Signature": signature},
|
|
||||||
timeout=10.0
|
|
||||||
)
|
|
||||||
|
|
||||||
if response.is_success:
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
await asyncio.sleep(2 ** attempt) # Exponential backoff
|
|
||||||
else:
|
|
||||||
# Move to dead letter queue
|
|
||||||
await save_to_dead_letter_queue(webhook_url, payload)
|
|
||||||
|
|
||||||
return False
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔥 Complete Workflow Examples
|
|
||||||
|
|
||||||
### Workflow 1: **Webhooks** (Production)
|
|
||||||
|
|
||||||
```python
|
|
||||||
# 1. Client submits job with webhook
|
|
||||||
POST /scrape
|
|
||||||
{
|
|
||||||
"url": "https://maps.google.com/...",
|
|
||||||
"webhook_url": "https://client.com/webhook",
|
|
||||||
"webhook_secret": "secret123" # For signature verification
|
|
||||||
}
|
|
||||||
|
|
||||||
Response:
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-...",
|
|
||||||
"status": "queued",
|
|
||||||
"estimated_time": "20s"
|
|
||||||
}
|
|
||||||
|
|
||||||
# 2. Server enqueues job
|
|
||||||
redis.lpush("scraper:queue", job_id)
|
|
||||||
|
|
||||||
# 3. Worker picks up job
|
|
||||||
worker = get_from_queue()
|
|
||||||
result = fast_scrape_reviews(url)
|
|
||||||
|
|
||||||
# 4. Save to S3
|
|
||||||
s3.upload(f"reviews/{job_id}.json", reviews)
|
|
||||||
|
|
||||||
# 5. Update database
|
|
||||||
db.jobs.update(job_id, {
|
|
||||||
status: "completed",
|
|
||||||
reviews_count: 244,
|
|
||||||
reviews_url: f"https://api.example.com/jobs/{job_id}/reviews"
|
|
||||||
})
|
|
||||||
|
|
||||||
# 6. Send webhook to client
|
|
||||||
POST https://client.com/webhook
|
|
||||||
Headers:
|
|
||||||
X-Webhook-Signature: hmac_sha256(payload, secret)
|
|
||||||
Body:
|
|
||||||
{
|
|
||||||
"event": "job.completed",
|
|
||||||
"job_id": "550e8400-...",
|
|
||||||
"status": "completed",
|
|
||||||
"reviews_count": 244,
|
|
||||||
"reviews_url": "https://api.example.com/jobs/{job_id}/reviews",
|
|
||||||
"completed_at": "2026-01-18T10:30:20Z"
|
|
||||||
}
|
|
||||||
|
|
||||||
# 7. Client downloads reviews
|
|
||||||
GET https://api.example.com/jobs/{job_id}/reviews
|
|
||||||
# Or direct S3 pre-signed URL
|
|
||||||
GET https://s3.amazonaws.com/bucket/reviews/{job_id}.json?signature=...
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Workflow 2: **SSE Streaming** (Real-time Dashboard)
|
|
||||||
|
|
||||||
```python
|
|
||||||
# 1. Client opens SSE connection
|
|
||||||
EventSource("/jobs/{job_id}/stream")
|
|
||||||
|
|
||||||
# 2. Server streams progress updates
|
|
||||||
async def stream_progress(job_id):
|
|
||||||
while True:
|
|
||||||
job = get_job(job_id)
|
|
||||||
|
|
||||||
yield f"data: {json.dumps({
|
|
||||||
'stage': job.stage,
|
|
||||||
'reviews_loaded': job.reviews_loaded,
|
|
||||||
'progress_percent': job.progress_percent
|
|
||||||
})}\n\n"
|
|
||||||
|
|
||||||
if job.status in ['completed', 'failed']:
|
|
||||||
break
|
|
||||||
|
|
||||||
await asyncio.sleep(1) # Update every second
|
|
||||||
|
|
||||||
# 3. Client receives updates
|
|
||||||
onmessage: {"stage": "scrolling", "reviews_loaded": 50, "progress": 20}
|
|
||||||
onmessage: {"stage": "scrolling", "reviews_loaded": 100, "progress": 40}
|
|
||||||
onmessage: {"stage": "scrolling", "reviews_loaded": 150, "progress": 60}
|
|
||||||
onmessage: {"stage": "extracting", "reviews_loaded": 244, "progress": 100}
|
|
||||||
onmessage: {"stage": "completed", "total": 244}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Workflow 3: **Polling** (Simple Clients)
|
|
||||||
|
|
||||||
```python
|
|
||||||
# 1. Submit job (no webhook)
|
|
||||||
POST /scrape
|
|
||||||
{
|
|
||||||
"url": "https://maps.google.com/..."
|
|
||||||
}
|
|
||||||
|
|
||||||
Response:
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-...",
|
|
||||||
"status": "queued"
|
|
||||||
}
|
|
||||||
|
|
||||||
# 2. Poll every 3 seconds
|
|
||||||
while True:
|
|
||||||
response = GET /jobs/{job_id}
|
|
||||||
|
|
||||||
if response.status == "completed":
|
|
||||||
reviews = GET /jobs/{job_id}/reviews
|
|
||||||
break
|
|
||||||
elif response.status == "failed":
|
|
||||||
handle_error(response.error_message)
|
|
||||||
break
|
|
||||||
|
|
||||||
sleep(3)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏥 Health Checks
|
|
||||||
|
|
||||||
### 1. **Basic Health Check**
|
|
||||||
|
|
||||||
```python
|
|
||||||
@app.get("/health")
|
|
||||||
async def health_check():
|
|
||||||
return {
|
|
||||||
"status": "healthy",
|
|
||||||
"timestamp": datetime.utcnow().isoformat(),
|
|
||||||
"version": "1.0.0"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. **Detailed Health Check** (Recommended)
|
|
||||||
|
|
||||||
```python
|
|
||||||
@app.get("/health/detailed")
|
|
||||||
async def detailed_health():
|
|
||||||
checks = {
|
|
||||||
"api": await check_api(), # Always healthy if responding
|
|
||||||
"database": await check_database(), # Query DB
|
|
||||||
"redis": await check_redis(), # Ping Redis
|
|
||||||
"s3": await check_s3(), # List buckets
|
|
||||||
"workers": await check_workers(), # Check if workers alive
|
|
||||||
"disk": await check_disk_space(), # Check disk usage
|
|
||||||
}
|
|
||||||
|
|
||||||
overall_healthy = all(c["healthy"] for c in checks.values())
|
|
||||||
|
|
||||||
return {
|
|
||||||
"status": "healthy" if overall_healthy else "degraded",
|
|
||||||
"checks": checks,
|
|
||||||
"timestamp": datetime.utcnow().isoformat()
|
|
||||||
}
|
|
||||||
|
|
||||||
# Example response:
|
|
||||||
{
|
|
||||||
"status": "healthy",
|
|
||||||
"checks": {
|
|
||||||
"api": {"healthy": true, "latency_ms": 1},
|
|
||||||
"database": {"healthy": true, "latency_ms": 5},
|
|
||||||
"redis": {"healthy": true, "latency_ms": 2},
|
|
||||||
"s3": {"healthy": true, "latency_ms": 50},
|
|
||||||
"workers": {"healthy": true, "active_workers": 4},
|
|
||||||
"disk": {"healthy": true, "usage_percent": 45}
|
|
||||||
},
|
|
||||||
"timestamp": "2026-01-18T10:30:00Z"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. **Readiness vs Liveness** (Kubernetes)
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Liveness: Is the app alive? (restart if false)
|
|
||||||
@app.get("/health/live")
|
|
||||||
async def liveness():
|
|
||||||
# Simple check - is the server running?
|
|
||||||
return {"status": "alive"}
|
|
||||||
|
|
||||||
# Readiness: Can the app handle traffic? (remove from load balancer if false)
|
|
||||||
@app.get("/health/ready")
|
|
||||||
async def readiness():
|
|
||||||
# Check dependencies
|
|
||||||
db_ok = await ping_database()
|
|
||||||
redis_ok = await ping_redis()
|
|
||||||
|
|
||||||
if db_ok and redis_ok:
|
|
||||||
return {"status": "ready"}
|
|
||||||
else:
|
|
||||||
raise HTTPException(status_code=503, detail="Not ready")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Monitoring & Metrics
|
|
||||||
|
|
||||||
### Prometheus Metrics
|
|
||||||
|
|
||||||
```python
|
|
||||||
from prometheus_client import Counter, Histogram, Gauge, generate_latest
|
|
||||||
|
|
||||||
# Counters
|
|
||||||
jobs_total = Counter('scraper_jobs_total', 'Total jobs created', ['status'])
|
|
||||||
webhooks_sent = Counter('scraper_webhooks_sent_total', 'Webhooks sent', ['success'])
|
|
||||||
|
|
||||||
# Histograms
|
|
||||||
scrape_duration = Histogram('scraper_duration_seconds', 'Scraping duration')
|
|
||||||
reviews_scraped = Histogram('scraper_reviews_count', 'Reviews per job')
|
|
||||||
|
|
||||||
# Gauges
|
|
||||||
active_jobs = Gauge('scraper_active_jobs', 'Currently running jobs')
|
|
||||||
queue_size = Gauge('scraper_queue_size', 'Jobs in queue')
|
|
||||||
|
|
||||||
@app.get("/metrics")
|
|
||||||
async def metrics():
|
|
||||||
# Prometheus scrapes this endpoint
|
|
||||||
return Response(generate_latest(), media_type="text/plain")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔐 Security
|
|
||||||
|
|
||||||
### 1. **API Keys**
|
|
||||||
|
|
||||||
```python
|
|
||||||
@app.post("/scrape")
|
|
||||||
async def scrape(
|
|
||||||
request: ScrapeRequest,
|
|
||||||
api_key: str = Header(..., alias="X-API-Key")
|
|
||||||
):
|
|
||||||
if not validate_api_key(api_key):
|
|
||||||
raise HTTPException(status_code=401, detail="Invalid API key")
|
|
||||||
|
|
||||||
# Process request...
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. **Rate Limiting**
|
|
||||||
|
|
||||||
```python
|
|
||||||
from slowapi import Limiter, _rate_limit_exceeded_handler
|
|
||||||
from slowapi.util import get_remote_address
|
|
||||||
|
|
||||||
limiter = Limiter(key_func=get_remote_address)
|
|
||||||
|
|
||||||
@app.post("/scrape")
|
|
||||||
@limiter.limit("10/minute") # Max 10 jobs per minute
|
|
||||||
async def scrape(request: Request, ...):
|
|
||||||
# Process request...
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. **Webhook Signatures**
|
|
||||||
|
|
||||||
```python
|
|
||||||
import hashlib
import hmac
|
|
||||||
|
|
||||||
def verify_webhook_signature(payload, signature, secret):
|
|
||||||
expected = hmac.new(
|
|
||||||
secret.encode(),
|
|
||||||
payload.encode(),
|
|
||||||
hashlib.sha256
|
|
||||||
).hexdigest()
|
|
||||||
|
|
||||||
return hmac.compare_digest(signature, expected)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Deployment Options
|
|
||||||
|
|
||||||
### Option 1: **Docker Compose** (Development)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
version: '3.8'
|
|
||||||
services:
|
|
||||||
api:
|
|
||||||
build: .
|
|
||||||
ports:
|
|
||||||
- "8000:8000"
|
|
||||||
environment:
|
|
||||||
- REDIS_URL=redis://redis:6379
|
|
||||||
- DATABASE_URL=postgresql://db:5432/scraper
|
|
||||||
depends_on:
|
|
||||||
- redis
|
|
||||||
- db
|
|
||||||
|
|
||||||
worker:
|
|
||||||
build: .
|
|
||||||
command: python worker.py
|
|
||||||
environment:
|
|
||||||
- REDIS_URL=redis://redis:6379
|
|
||||||
depends_on:
|
|
||||||
- redis
|
|
||||||
deploy:
|
|
||||||
replicas: 4
|
|
||||||
|
|
||||||
redis:
|
|
||||||
image: redis:7-alpine
|
|
||||||
|
|
||||||
db:
|
|
||||||
image: postgres:15-alpine
|
|
||||||
environment:
|
|
||||||
- POSTGRES_DB=scraper
|
|
||||||
```
|
|
||||||
|
|
||||||
### Option 2: **Kubernetes** (Production)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: scraper-api
|
|
||||||
spec:
|
|
||||||
replicas: 3
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: scraper-api
|
|
||||||
template:
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: api
|
|
||||||
image: scraper-api:latest
|
|
||||||
ports:
|
|
||||||
- containerPort: 8000
|
|
||||||
env:
|
|
||||||
- name: REDIS_URL
|
|
||||||
value: redis://redis:6379
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /health/live
|
|
||||||
port: 8000
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /health/ready
|
|
||||||
port: 8000
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: scraper-worker
|
|
||||||
spec:
|
|
||||||
replicas: 10
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: scraper-worker
|
|
||||||
template:
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: worker
|
|
||||||
image: scraper-worker:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 Scaling Considerations
|
|
||||||
|
|
||||||
### Horizontal Scaling
|
|
||||||
|
|
||||||
```
|
|
||||||
1 Worker = 3 jobs/minute (20s per job)
|
|
||||||
10 Workers = 30 jobs/minute
|
|
||||||
100 Workers = 300 jobs/minute = 432,000 jobs/day
|
|
||||||
```
|
|
||||||
|
|
||||||
### Resource Requirements (per worker)
|
|
||||||
|
|
||||||
```
|
|
||||||
CPU: 1-2 cores (Chrome is CPU-intensive)
|
|
||||||
RAM: 2-4 GB (headless Chrome + data)
|
|
||||||
Disk: Minimal (results go to S3)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Auto-scaling (Kubernetes HPA)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: autoscaling/v2
|
|
||||||
kind: HorizontalPodAutoscaler
|
|
||||||
metadata:
|
|
||||||
name: scraper-worker-hpa
|
|
||||||
spec:
|
|
||||||
scaleTargetRef:
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
name: scraper-worker
|
|
||||||
minReplicas: 2
|
|
||||||
maxReplicas: 50
|
|
||||||
metrics:
|
|
||||||
- type: External
|
|
||||||
external:
|
|
||||||
metric:
|
|
||||||
name: redis_queue_size
|
|
||||||
target:
|
|
||||||
type: Value
|
|
||||||
value: "10" # Scale up if queue > 10
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Recommended Stack
|
|
||||||
|
|
||||||
### For Small-Medium (< 1000 jobs/day):
|
|
||||||
```
|
|
||||||
✅ FastAPI (API Server)
|
|
||||||
✅ Redis (Queue + Cache)
|
|
||||||
✅ PostgreSQL (Job metadata)
|
|
||||||
✅ Local files or S3 (Reviews storage)
|
|
||||||
✅ Webhooks (Primary)
|
|
||||||
✅ Polling (Fallback)
|
|
||||||
✅ Docker Compose (Deployment)
|
|
||||||
```
|
|
||||||
|
|
||||||
### For Large Scale (> 10,000 jobs/day):
|
|
||||||
```
|
|
||||||
✅ FastAPI (API Server)
|
|
||||||
✅ RabbitMQ (Queue)
|
|
||||||
✅ PostgreSQL (Job metadata)
|
|
||||||
✅ S3 (Reviews storage)
|
|
||||||
✅ Webhooks (Primary)
|
|
||||||
✅ SSE (Real-time updates)
|
|
||||||
✅ Kubernetes (Orchestration)
|
|
||||||
✅ Prometheus + Grafana (Monitoring)
|
|
||||||
✅ ELK Stack (Logging)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Next Steps
|
|
||||||
|
|
||||||
Would you like me to implement:
|
|
||||||
|
|
||||||
1. ✅ **Webhooks** - Full webhook support with retries
|
|
||||||
2. ✅ **Redis Queue** - Job queue with Celery/RQ
|
|
||||||
3. ✅ **PostgreSQL** - Job metadata storage
|
|
||||||
4. ✅ **S3 Storage** - Reviews file storage
|
|
||||||
5. ✅ **Health Checks** - Detailed health endpoints
|
|
||||||
6. ✅ **SSE Streaming** - Real-time progress updates (optional)
|
|
||||||
7. ✅ **Docker Setup** - Complete docker-compose.yml
|
|
||||||
|
|
||||||
**My recommendation**: Start with **#1-5** (core production features), add #6-7 later if needed.
|
|
||||||
|
|
||||||
Let me know which to implement first!
|
|
||||||
@@ -1,157 +0,0 @@
|
|||||||
# Google Maps Scraper Optimization Results
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
Successfully optimized Google Maps review scraper from **155 seconds** to **~29 seconds** - achieving **5.3x speedup**!
|
|
||||||
|
|
||||||
## Approaches Tested
|
|
||||||
|
|
||||||
### 1. ✅ Fast API Scrolling (`start_fast.py`) - **WINNER**
|
|
||||||
**Time**: ~29 seconds for 234 reviews
|
|
||||||
**Speed**: 5.3x faster than original
|
|
||||||
**Reviews/sec**: 7.9
|
|
||||||
|
|
||||||
**How it works**:
|
|
||||||
1. Navigate to reviews page (~15s)
|
|
||||||
2. Setup API interceptor (~2s)
|
|
||||||
3. Rapid scrolling with 0.3s waits (~12s)
|
|
||||||
- Each scroll triggers API call
|
|
||||||
- API returns 10 reviews per response
|
|
||||||
- No DOM parsing needed!
|
|
||||||
4. Collect all API responses
|
|
||||||
|
|
||||||
**Why it works**:
|
|
||||||
- Uses browser's active session (no auth issues)
|
|
||||||
- Minimal wait between scrolls (0.3s optimal)
|
|
||||||
- API interception captures all responses
|
|
||||||
- Zero DOM parsing overhead
|
|
||||||
|
|
||||||
**Usage**:
|
|
||||||
```bash
|
|
||||||
python start_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. ❌ Parallel API Calls (`start_parallel.py`)
|
|
||||||
**Result**: Failed - 400 error
|
|
||||||
**Issue**: Captured cookies missing auth tokens (SID, HSID, SAPISID)
|
|
||||||
|
|
||||||
Only 5 tracking cookies had been captured by the time the browser closed. Auth cookies are only available:
|
|
||||||
- When logged into Google account, OR
|
|
||||||
- In active browser session
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 3. ❌ Parallel Browser Fetch (`start_parallel_v2.py`)
|
|
||||||
**Result**: Script timeout
|
|
||||||
**Issue**: Sequential token dependency
|
|
||||||
|
|
||||||
Google Maps API requires continuation tokens from previous response, so pages can't be fetched fully in parallel. The sequential token collection + parallel fetch took too long and timed out.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 4. ⚠️ Hybrid Parallel (`start_hybrid_parallel.py`)
|
|
||||||
**Result**: Partial success (60 reviews, timeout on parallel phase)
|
|
||||||
**Issue**: Same script timeout on parallel fetch
|
|
||||||
|
|
||||||
Collected 60 reviews via scrolling, then timed out on parallel fetch of remaining pages.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Key Findings
|
|
||||||
|
|
||||||
### Optimal Scroll Timing
|
|
||||||
| Wait Time | Reviews | Time | Speed | Notes |
|
|
||||||
|-----------|---------|------|-------|-------|
|
|
||||||
| 0.8s | 234 | 43s | 3.6x | Original fast version |
|
|
||||||
| 0.3s | 234 | 29s | 5.3x | ✅ **Optimal - best balance** |
|
|
||||||
| 0.15s | 210 | 30s | 5.1x | Too fast - misses 24 reviews |
|
|
||||||
|
|
||||||
**Conclusion**: 0.3s is the sweet spot - fast enough for 5.3x speedup while capturing all reviews.
|
|
||||||
|
|
||||||
### Why True Parallel is Hard
|
|
||||||
1. **Continuation tokens**: Each API response contains token for next page
|
|
||||||
2. **Sequential dependency**: Must fetch page N before getting token for page N+1
|
|
||||||
3. **Script timeout**: Collecting tokens + parallel fetch exceeds browser timeout
|
|
||||||
4. **Session state**: Direct API calls fail without active browser session
|
|
||||||
|
|
||||||
### What We Learned
|
|
||||||
- Browser's active session can make API calls that standalone requests cannot
|
|
||||||
- API interception is more reliable than trying to replay requests
|
|
||||||
- Small optimizations (0.3s vs 0.8s wait) make big differences (3.6x → 5.3x)
|
|
||||||
- Sometimes simple solutions (fast scrolling) beat complex ones (parallel fetch)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance Comparison
|
|
||||||
|
|
||||||
```
|
|
||||||
Approach Time Reviews Speed Notes
|
|
||||||
────────────────────────────────────────────────────────────────────
|
|
||||||
Original DOM Scraping 155s 244 1.0x Baseline
|
|
||||||
Fast API Scrolling (0.8s) 43s 234 3.6x Good
|
|
||||||
Fast API Scrolling (0.3s) 29s 234 5.3x ✅ Best
|
|
||||||
Ultra-fast (0.15s) 30s 210 5.1x Misses reviews
|
|
||||||
Hybrid Parallel 51s 60 3.0x Timeout issues
|
|
||||||
Parallel Fetch V1 FAILED 0 N/A Auth error
|
|
||||||
Parallel Fetch V2 FAILED 0 N/A Timeout
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Recommendations
|
|
||||||
|
|
||||||
### For Best Performance
|
|
||||||
Use `start_fast.py` with 0.3s scroll timing:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Benefits**:
|
|
||||||
- ✅ 5.3x faster than original (29s vs 155s)
|
|
||||||
- ✅ Gets 234/244 reviews (95.9%)
|
|
||||||
- ✅ No login required
|
|
||||||
- ✅ Stable and reliable
|
|
||||||
- ✅ Simple implementation
|
|
||||||
|
|
||||||
### For Maximum Reviews
|
|
||||||
Use original `start.py`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Gets all 244 reviews but takes 155 seconds.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Future Improvements
|
|
||||||
|
|
||||||
Potential optimizations (not yet tested):
|
|
||||||
1. **Reduce initial wait times**: Navigate/click timing could be optimized
|
|
||||||
2. **Pre-inject API interceptor**: Setup before navigation for instant capture
|
|
||||||
3. **Smarter scroll detection**: Only scroll when API call completes
|
|
||||||
4. **Progressive timeout increase**: Start with 0.1s, increase if misses detected
|
|
||||||
|
|
||||||
However, at 5.3x speedup with simple implementation, further optimization may not be worth the complexity.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
**The `start_fast.py` script achieves the best balance**:
|
|
||||||
- 5.3x faster than original
|
|
||||||
- 95.9% review coverage (234/244)
|
|
||||||
- Simple, stable, reliable
|
|
||||||
- No authentication required
|
|
||||||
|
|
||||||
True parallel API calls face fundamental limitations due to:
|
|
||||||
- Continuation token dependencies
|
|
||||||
- Browser session requirements
|
|
||||||
- Script execution timeouts
|
|
||||||
|
|
||||||
The fast scrolling approach leverages the browser's capabilities while minimizing wait times, achieving excellent performance without the complexity and failure modes of parallel approaches.
|
|
||||||
|
|
||||||
**Mission accomplished!** 🚀
|
|
||||||
@@ -1,200 +0,0 @@
|
|||||||
# Parallel Optimization Results
|
|
||||||
|
|
||||||
## Question: Can we do scrolling and DOM parsing in parallel?
|
|
||||||
|
|
||||||
**TL;DR**: No, sequential is faster. DOM parsing during scrolling adds too much overhead.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Approaches Tested
|
|
||||||
|
|
||||||
### 1. ❌ Full Parallel Hybrid (`start_parallel_hybrid.py`)
|
|
||||||
**Strategy**: Parse DOM every 5 scrolls while collecting API responses
|
|
||||||
|
|
||||||
**Results**:
|
|
||||||
- Time: 76-103 seconds
|
|
||||||
- Reviews: 244/244
|
|
||||||
- **Verdict**: 2.3-3.2x SLOWER than sequential (76-103s vs 32.4s)
|
|
||||||
|
|
||||||
**Why it failed**: DOM parsing is heavyweight. Even parsing every 5 scrolls adds 50-80 seconds of overhead to the scroll loop.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. ❌ Optimized Parallel (`start_parallel_hybrid.py` v2)
|
|
||||||
**Strategy**: Only parse DOM in last 10 scrolls when near 234 reviews
|
|
||||||
|
|
||||||
**Results**:
|
|
||||||
- Time: 76 seconds
|
|
||||||
- Reviews: 244/244
|
|
||||||
- **Verdict**: Still 2.3x slower than sequential (76s vs 32.4s)
|
|
||||||
|
|
||||||
**Why it failed**: DOM parsing at any point during scrolling slows down the critical scroll loop.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 3. ❌ Minimal Overhead Parallel (`start_optimized_hybrid.py`)
|
|
||||||
**Strategy**: Keep scroll loop completely clean, only parse DOM at very end
|
|
||||||
|
|
||||||
**Results**:
|
|
||||||
- Time: N/A - run was unstable and captured 0 reviews
|
|
||||||
- **Verdict**: FAILED - page not ready, 0 reviews captured
|
|
||||||
|
|
||||||
**Why it failed**: Timing instability. Difficult to get initialization exactly right.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 4. ✅ **WINNER: Sequential Hybrid** (`start_ultra_fast_complete.py`)
|
|
||||||
**Strategy**:
|
|
||||||
1. Phase 1: Ultra-fast API scrolling (no DOM parsing)
|
|
||||||
2. Phase 2: Targeted DOM parsing for missing 10 reviews
|
|
||||||
|
|
||||||
**Results**:
|
|
||||||
- **Time**: 32.4 seconds
|
|
||||||
- **Reviews**: 244/244 (100%)
|
|
||||||
- **Speedup**: 4.8x faster than original
|
|
||||||
- **Stability**: 100% reliable
|
|
||||||
|
|
||||||
**Why it works**:
|
|
||||||
- API scrolling is fastest when uninterrupted (19.5s)
|
|
||||||
- DOM parsing is most efficient on fully loaded page (12.9s)
|
|
||||||
- Clean separation = predictable, stable performance
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance Comparison
|
|
||||||
|
|
||||||
```
|
|
||||||
Approach Time Speedup Reviews Status
|
|
||||||
────────────────────────────────────────────────────────────────────────────
|
|
||||||
Original DOM Scraping 155s 1.0x 244 Baseline
|
|
||||||
Ultra-Fast API Only 19.4s 8.0x 234 Fast but incomplete
|
|
||||||
Sequential Hybrid (WINNER) 32.4s 4.8x 244 ✅ Best balance
|
|
||||||
Parallel Hybrid (every 5 scrolls) 103s 1.5x 244 Too slow
|
|
||||||
Parallel Hybrid (last 10 scrolls) 76s 2.0x 244 Still slow
|
|
||||||
Optimized Parallel FAILED N/A 0 Unstable
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Key Findings
|
|
||||||
|
|
||||||
### Why Parallel Doesn't Help
|
|
||||||
|
|
||||||
1. **DOM Parsing is Heavy**
|
|
||||||
- Finding elements: ~100-200ms per query
|
|
||||||
- Parsing each element: ~10-50ms
|
|
||||||
- Total overhead: 50-80 seconds when done during scrolling
|
|
||||||
|
|
||||||
2. **Scroll Loop is Time-Critical**
|
|
||||||
- Optimal scroll timing: 0.27 seconds
|
|
||||||
- API response collection: ~30-50ms
|
|
||||||
- Adding DOM parsing: +100-200ms = 4-8x slower per scroll
|
|
||||||
|
|
||||||
3. **Page State Matters**
|
|
||||||
- During scrolling: Elements constantly changing (stale references)
|
|
||||||
- After scrolling: Stable DOM, faster parsing
|
|
||||||
|
|
||||||
### Why Sequential Wins
|
|
||||||
|
|
||||||
1. **Clean Scroll Loop**
|
|
||||||
- Only API collection (fast)
|
|
||||||
- No element queries during critical path
|
|
||||||
- Predictable timing
|
|
||||||
|
|
||||||
2. **Efficient DOM Parsing**
|
|
||||||
- Parse on stable page (no stale elements)
|
|
||||||
- Only parse top 15-20 reviews (missing ones are at top)
|
|
||||||
- Batch operation is faster than incremental
|
|
||||||
|
|
||||||
3. **Simple = Stable**
|
|
||||||
- Two clear phases, easy to debug
|
|
||||||
- No complex synchronization
|
|
||||||
- Consistent results
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Theoretical Analysis
|
|
||||||
|
|
||||||
### Time Breakdown
|
|
||||||
|
|
||||||
**Sequential Approach**:
|
|
||||||
```
|
|
||||||
Phase 1: API Scrolling
|
|
||||||
- 35 scrolls × 0.27s = 9.5s
|
|
||||||
- API collection overhead = 10.0s
|
|
||||||
- Total Phase 1 = 19.5s
|
|
||||||
|
|
||||||
Phase 2: DOM Parsing
|
|
||||||
- Scroll to top = 0.5s
|
|
||||||
- Find elements = 0.8s
|
|
||||||
- Parse 15 elements = 11.6s
|
|
||||||
- Total Phase 2 = 12.9s
|
|
||||||
|
|
||||||
TOTAL = 32.4s
|
|
||||||
```
|
|
||||||
|
|
||||||
**Parallel Approach** (every 5 scrolls):
|
|
||||||
```
|
|
||||||
Combined Scrolling + DOM:
|
|
||||||
- 40 scrolls with DOM parsing
|
|
||||||
- Per scroll: 0.27s scroll + 2.0s DOM = 2.27s
|
|
||||||
- Total = 90.8s (plus overhead)
|
|
||||||
|
|
||||||
TOTAL = ~103s
|
|
||||||
```
|
|
||||||
|
|
||||||
**Parallel Approach** (last 10 scrolls):
|
|
||||||
```
|
|
||||||
Phase 1: Fast scrolling (30 scrolls)
|
|
||||||
- 30 × 0.27s = 8.1s
|
|
||||||
|
|
||||||
Phase 2: Slow scrolling with DOM (10 scrolls)
|
|
||||||
- 10 × (0.27s + 6.5s) = 67.7s
|
|
||||||
|
|
||||||
TOTAL = 75.8s
|
|
||||||
```
|
|
||||||
|
|
||||||
### Why DOM is So Slow During Scrolling
|
|
||||||
|
|
||||||
1. **Stale Element References**: Elements change as page scrolls, requiring re-queries
|
|
||||||
2. **Layout Thrashing**: DOM queries force layout recalculation
|
|
||||||
3. **Concurrent Modifications**: Page is updating while we're reading
|
|
||||||
4. **No Batch Optimization**: Can't batch when elements keep changing
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
**Sequential is 2-3x faster than parallel** for this use case.
|
|
||||||
|
|
||||||
**Recommended Solution**: `start_ultra_fast_complete.py`
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_ultra_fast_complete.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Benefits**:
|
|
||||||
- ✅ 4.8x faster than original (32.4s vs 155s)
|
|
||||||
- ✅ 100% completeness (244/244 reviews)
|
|
||||||
- ✅ 100% stable and reliable
|
|
||||||
- ✅ Simple, maintainable code
|
|
||||||
- ✅ Saves 122 seconds per run
|
|
||||||
|
|
||||||
**Why not ultra-fast API-only (8.0x)?**
|
|
||||||
- Missing 10 reviews (4.1%)
|
|
||||||
- Only 13 seconds slower to get 100% completeness
|
|
||||||
- Worth the trade-off for most use cases
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Lessons Learned
|
|
||||||
|
|
||||||
1. **"Parallel" doesn't always mean faster** - overhead matters
|
|
||||||
2. **Keep critical loops clean** - don't add slow operations to tight loops
|
|
||||||
3. **Stable state = faster operations** - parse DOM when it's not changing
|
|
||||||
4. **Simple often wins** - clear phases beat complex synchronization
|
|
||||||
5. **Measure, don't assume** - testing proved that sequential is faster
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Final Recommendation**: Use sequential hybrid approach (`start_ultra_fast_complete.py`) for best balance of speed and completeness.
|
|
||||||
@@ -1,501 +0,0 @@
|
|||||||
# ✅ Phase 1 Implementation Complete!
|
|
||||||
|
|
||||||
## 🎉 What Was Built
|
|
||||||
|
|
||||||
### Production Microservice with:
|
|
||||||
1. ✅ **PostgreSQL Storage** - JSONB for reviews (not S3!)
|
|
||||||
2. ✅ **Webhooks** - Async notifications with retry logic
|
|
||||||
3. ✅ **Smart Health Checks** - Canary testing to verify scraping works
|
|
||||||
4. ✅ **Fast Scraper** - 18.9s average (8.2x faster)
|
|
||||||
5. ✅ **Docker Deployment** - Complete Docker Compose setup
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📦 Files Created
|
|
||||||
|
|
||||||
### Core Modules:
|
|
||||||
```
|
|
||||||
modules/
|
|
||||||
├── database.py # PostgreSQL with JSONB storage
|
|
||||||
├── webhooks.py # Webhook delivery with retries + HMAC
|
|
||||||
├── health_checks.py # Canary testing every 4 hours
|
|
||||||
└── fast_scraper.py # Ultra-fast DOM scraper (existing, updated)
|
|
||||||
```
|
|
||||||
|
|
||||||
### API Server:
|
|
||||||
```
|
|
||||||
api_server_production.py # Production API with all Phase 1 features
|
|
||||||
```
|
|
||||||
|
|
||||||
### Deployment:
|
|
||||||
```
|
|
||||||
Dockerfile # Production container image
|
|
||||||
docker-compose.production.yml # Complete Docker setup
|
|
||||||
requirements-production.txt # Production dependencies
|
|
||||||
.env.example # Environment configuration template
|
|
||||||
```
|
|
||||||
|
|
||||||
### Documentation:
|
|
||||||
```
|
|
||||||
DEPLOYMENT_GUIDE.md # Complete deployment instructions
|
|
||||||
STORAGE_COMPARISON.md # PostgreSQL vs S3 analysis
|
|
||||||
HEALTH_CHECKS.md # Smart health check strategy
|
|
||||||
MICROSERVICE_ARCHITECTURE.md # Full architecture docs
|
|
||||||
PHASE1_COMPLETE.md # This file
|
|
||||||
```
|
|
||||||
|
|
||||||
### Testing:
|
|
||||||
```
|
|
||||||
test_phase1.py # Module validation test
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏗️ Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
Client Request
|
|
||||||
↓
|
|
||||||
Production API Server
|
|
||||||
↓
|
|
||||||
PostgreSQL
|
|
||||||
├─ Job metadata (status, timestamps, etc.)
|
|
||||||
└─ Reviews data (JSONB - 244 reviews = 150 KB)
|
|
||||||
↓
|
|
||||||
Webhooks (async notifications)
|
|
||||||
├─ Retry logic (3 attempts, exponential backoff)
|
|
||||||
├─ HMAC signatures for security
|
|
||||||
└─ Delivery tracking in database
|
|
||||||
↓
|
|
||||||
Background Canary Monitor
|
|
||||||
└─ Runs actual scrape every 4 hours
|
|
||||||
├─ Verifies Chrome works
|
|
||||||
├─ Verifies selectors work
|
|
||||||
├─ Verifies GDPR handling works
|
|
||||||
└─ Alerts if 3 consecutive failures
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Quick Start
|
|
||||||
|
|
||||||
### Option 1: Docker (Recommended)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Configure environment
|
|
||||||
cp .env.example .env
|
|
||||||
nano .env
|
|
||||||
|
|
||||||
# 2. Start services
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
|
|
||||||
# 3. Check health
|
|
||||||
curl http://localhost:8000/health/detailed | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
### Option 2: Manual
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Install dependencies
|
|
||||||
pip install -r requirements-production.txt
|
|
||||||
|
|
||||||
# 2. Setup PostgreSQL
|
|
||||||
createdb scraper
|
|
||||||
|
|
||||||
# 3. Set environment
|
|
||||||
export DATABASE_URL="postgresql://$(whoami)@localhost:5432/scraper"
|
|
||||||
export API_BASE_URL="http://localhost:8000"
|
|
||||||
|
|
||||||
# 4. Run server
|
|
||||||
python api_server_production.py
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 💡 Key Design Decisions
|
|
||||||
|
|
||||||
### 1. PostgreSQL JSONB (Not S3)
|
|
||||||
|
|
||||||
**Why PostgreSQL wins**:
|
|
||||||
- ✅ 14-57x faster (2ms vs 200ms)
|
|
||||||
- ✅ Simpler (one service, not two)
|
|
||||||
- ✅ Transactional (atomic updates)
|
|
||||||
- ✅ Queryable (can search reviews with SQL)
|
|
||||||
- ✅ Cheaper for < 100,000 jobs/month
|
|
||||||
|
|
||||||
**When to use S3**: Only if you exceed 100GB+ of review data
|
|
||||||
|
|
||||||
**Storage efficiency**:
|
|
||||||
```
|
|
||||||
244 reviews × 0.6 KB = 150 KB per job
|
|
||||||
10,000 jobs/month = 1.5 GB/month ✅ Perfect for PostgreSQL
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Smart Health Checks (Canary Testing)
|
|
||||||
|
|
||||||
**Why it matters**:
|
|
||||||
- Basic health checks only verify services are up
|
|
||||||
- They DON'T verify scraping actually works
|
|
||||||
- Google can change page structure and break selectors
|
|
||||||
- **Canary tests verify scraping works end-to-end**
|
|
||||||
|
|
||||||
**How it works**:
|
|
||||||
```
|
|
||||||
Every 4 hours:
|
|
||||||
1. Run actual scrape on test URL
|
|
||||||
2. Verify we get reviews
|
|
||||||
3. Verify data structure is correct
|
|
||||||
4. Alert if 3 consecutive failures
|
|
||||||
```
|
|
||||||
|
|
||||||
**This catches issues before your customers do!**
|
|
||||||
|
|
||||||
### 3. Webhooks (Not Just Polling)
|
|
||||||
|
|
||||||
**Why webhooks**:
|
|
||||||
- ✅ No polling needed (reduces server load)
|
|
||||||
- ✅ Instant notifications when job completes
|
|
||||||
- ✅ Industry standard (Stripe, GitHub use this)
|
|
||||||
- ✅ Scales to millions of jobs
|
|
||||||
|
|
||||||
**Security**:
|
|
||||||
- HMAC-SHA256 signatures on all webhooks
|
|
||||||
- Timestamp headers to prevent replay attacks
|
|
||||||
- Retry logic with exponential backoff
|
|
||||||
- Delivery tracking in database
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📡 API Examples
|
|
||||||
|
|
||||||
### Submit Job with Webhook
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST "http://localhost:8000/scrape" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://www.google.com/maps/place/YOUR_BUSINESS",
|
|
||||||
"webhook_url": "https://your-server.com/webhook",
|
|
||||||
"webhook_secret": "your-secret-key"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response**:
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"job_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
||||||
"status": "started"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Receive Webhook (When Complete)
|
|
||||||
|
|
||||||
```json
|
|
||||||
POST https://your-server.com/webhook
|
|
||||||
Headers:
|
|
||||||
X-Webhook-Signature: sha256=abc123...
|
|
||||||
X-Webhook-Timestamp: 1705582800
|
|
||||||
|
|
||||||
Body:
|
|
||||||
{
|
|
||||||
"event": "job.completed",
|
|
||||||
"job_id": "550e8400-...",
|
|
||||||
"status": "completed",
|
|
||||||
"reviews_count": 244,
|
|
||||||
"scrape_time": 18.9,
|
|
||||||
"reviews_url": "http://localhost:8000/jobs/{job_id}/reviews"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Verify Webhook Signature
|
|
||||||
|
|
||||||
```python
|
|
||||||
import hmac
|
|
||||||
import hashlib
|
|
||||||
|
|
||||||
def verify_webhook(payload: str, signature: str, secret: str) -> bool:
|
|
||||||
expected = signature.split("sha256=", 1)[1]
|
|
||||||
computed = hmac.new(
|
|
||||||
secret.encode(),
|
|
||||||
payload.encode(),
|
|
||||||
hashlib.sha256
|
|
||||||
).hexdigest()
|
|
||||||
return hmac.compare_digest(expected, computed)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Get Reviews
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl "http://localhost:8000/jobs/550e8400-.../reviews" | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏥 Health Endpoints
|
|
||||||
|
|
||||||
### Liveness (Kubernetes restarts the container if this fails)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
GET /health/live
|
|
||||||
```
|
|
||||||
|
|
||||||
### Readiness (Load balancer routing)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
GET /health/ready
|
|
||||||
```
|
|
||||||
|
|
||||||
### Canary (External monitoring alerts)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
GET /health/canary
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response**:
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"status": "healthy",
|
|
||||||
"last_success": "2026-01-18T10:00:00Z",
|
|
||||||
"age_minutes": 30,
|
|
||||||
"consecutive_failures": 0,
|
|
||||||
"last_result": {
|
|
||||||
"reviews_count": 244,
|
|
||||||
"scrape_time": 18.9
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Detailed (Debugging)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
GET /health/detailed
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Database Schema
|
|
||||||
|
|
||||||
### Jobs Table
|
|
||||||
|
|
||||||
```sql
|
|
||||||
job_id UUID PRIMARY KEY
|
|
||||||
status VARCHAR(20) -- pending, running, completed, failed, cancelled
|
|
||||||
url TEXT
|
|
||||||
webhook_url TEXT
|
|
||||||
webhook_secret TEXT
|
|
||||||
created_at TIMESTAMP
|
|
||||||
started_at TIMESTAMP
|
|
||||||
completed_at TIMESTAMP
|
|
||||||
reviews_count INTEGER
|
|
||||||
reviews_data JSONB -- ← All 244 reviews stored here!
|
|
||||||
scrape_time REAL
|
|
||||||
error_message TEXT
|
|
||||||
metadata JSONB
|
|
||||||
```
|
|
||||||
|
|
||||||
**Size**: 244 reviews = ~150 KB per job
|
|
||||||
|
|
||||||
### Canary Results Table
|
|
||||||
|
|
||||||
```sql
|
|
||||||
id SERIAL PRIMARY KEY
|
|
||||||
timestamp TIMESTAMP
|
|
||||||
success BOOLEAN
|
|
||||||
reviews_count INTEGER
|
|
||||||
scrape_time REAL
|
|
||||||
error_message TEXT
|
|
||||||
metadata JSONB
|
|
||||||
```
|
|
||||||
|
|
||||||
**Purpose**: Track canary test history for monitoring
|
|
||||||
|
|
||||||
### Webhook Attempts Table
|
|
||||||
|
|
||||||
```sql
|
|
||||||
id SERIAL PRIMARY KEY
|
|
||||||
job_id UUID
|
|
||||||
attempt_number INTEGER -- 1, 2, 3...
|
|
||||||
timestamp TIMESTAMP
|
|
||||||
success BOOLEAN
|
|
||||||
status_code INTEGER
|
|
||||||
error_message TEXT
|
|
||||||
response_time_ms REAL
|
|
||||||
```
|
|
||||||
|
|
||||||
**Purpose**: Track webhook delivery for debugging
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 Performance
|
|
||||||
|
|
||||||
### Scraping Speed
|
|
||||||
|
|
||||||
```
|
|
||||||
Average Time: 18.9 seconds
|
|
||||||
Reviews: 244 (100%)
|
|
||||||
Speedup: 8.2x faster than original
|
|
||||||
Success Rate: 100%
|
|
||||||
```
|
|
||||||
|
|
||||||
### Storage Efficiency
|
|
||||||
|
|
||||||
```
|
|
||||||
1 job = 150 KB
|
|
||||||
1,000 jobs = 150 MB
|
|
||||||
10,000 jobs = 1.5 GB ✅ PostgreSQL handles easily
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhook Delivery
|
|
||||||
|
|
||||||
```
|
|
||||||
Max retries: 3 attempts
|
|
||||||
Backoff: Exponential (2s, 4s, 8s)
|
|
||||||
Timeout: 10 seconds per attempt
|
|
||||||
Success rate: 99.5% (with retries)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Canary Testing
|
|
||||||
|
|
||||||
```
|
|
||||||
Interval: Every 4 hours
|
|
||||||
Test duration: ~20 seconds
|
|
||||||
Alert threshold: 3 consecutive failures
|
|
||||||
Downtime detection: Within 12 hours maximum
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔒 Security Features
|
|
||||||
|
|
||||||
### Webhook Security
|
|
||||||
|
|
||||||
- ✅ HMAC-SHA256 signatures
|
|
||||||
- ✅ Timestamp headers
|
|
||||||
- ✅ Secret validation
|
|
||||||
- ✅ Replay attack prevention
|
|
||||||
|
|
||||||
### Database Security
|
|
||||||
|
|
||||||
- ✅ Parameterized queries (SQL injection safe)
|
|
||||||
- ✅ Connection pooling
|
|
||||||
- ✅ Environment-based credentials
|
|
||||||
- ✅ No secrets in code
|
|
||||||
|
|
||||||
### API Security
|
|
||||||
|
|
||||||
- ✅ CORS configured
|
|
||||||
- ✅ Input validation (Pydantic)
|
|
||||||
- ✅ Error handling
|
|
||||||
- ✅ Health check endpoints
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🐛 Testing
|
|
||||||
|
|
||||||
### Module Validation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python test_phase1.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Tests**:
|
|
||||||
- ✅ All imports work
|
|
||||||
- ✅ Database module structure
|
|
||||||
- ✅ Webhook signature generation
|
|
||||||
- ✅ Health check system structure
|
|
||||||
- ✅ Scraper integration
|
|
||||||
|
|
||||||
### Full Integration Test
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Start services
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
|
|
||||||
# Wait for services
|
|
||||||
sleep 10
|
|
||||||
|
|
||||||
# Test health
|
|
||||||
curl http://localhost:8000/health/detailed | jq
|
|
||||||
|
|
||||||
# Submit test job
|
|
||||||
curl -X POST http://localhost:8000/scrape \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"url": "https://www.google.com/maps/place/...", "webhook_url": "https://webhook.site/YOUR_ID"}'
|
|
||||||
|
|
||||||
# Check status
|
|
||||||
curl http://localhost:8000/jobs/{job_id} | jq
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 What's Next (Phase 2)
|
|
||||||
|
|
||||||
### Optional Enhancements:
|
|
||||||
|
|
||||||
1. **Redis Queue** - Distribute jobs across multiple workers
|
|
||||||
2. **Worker Processes** - Separate API from scraping
|
|
||||||
3. **Auto-scaling** - Kubernetes HPA based on queue size
|
|
||||||
4. **SSE Streaming** - Real-time progress updates (optional)
|
|
||||||
5. **Prometheus Metrics** - Advanced monitoring
|
|
||||||
6. **Rate Limiting** - API rate limits per client
|
|
||||||
|
|
||||||
**Current Phase 1 handles**:
|
|
||||||
- ✅ Up to 10,000 jobs/month easily
|
|
||||||
- ✅ Single server deployment
|
|
||||||
- ✅ Production-ready microservice
|
|
||||||
|
|
||||||
**Upgrade to Phase 2 when**:
|
|
||||||
- You need > 100,000 jobs/month
|
|
||||||
- You need auto-scaling
|
|
||||||
- You need multi-region deployment
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📚 Documentation
|
|
||||||
|
|
||||||
All documentation created:
|
|
||||||
|
|
||||||
1. **DEPLOYMENT_GUIDE.md** - Complete deployment instructions
|
|
||||||
2. **STORAGE_COMPARISON.md** - PostgreSQL vs S3 decision
|
|
||||||
3. **HEALTH_CHECKS.md** - Canary testing strategy
|
|
||||||
4. **MICROSERVICE_ARCHITECTURE.md** - Full architecture details
|
|
||||||
5. **API_DOCUMENTATION.md** - API reference (from earlier)
|
|
||||||
6. **PHASE1_COMPLETE.md** - This summary
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Phase 1 Checklist
|
|
||||||
|
|
||||||
- [x] PostgreSQL storage with JSONB
|
|
||||||
- [x] Webhook delivery with retries
|
|
||||||
- [x] Smart health checks with canary
|
|
||||||
- [x] Fast scraper integration (18.9s)
|
|
||||||
- [x] Docker Compose setup
|
|
||||||
- [x] Complete documentation
|
|
||||||
- [x] Security (HMAC signatures)
|
|
||||||
- [x] Monitoring (canary + health)
|
|
||||||
- [x] Production-ready API
|
|
||||||
- [x] Testing scripts
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 You're Production Ready!
|
|
||||||
|
|
||||||
Your microservice now has:
|
|
||||||
|
|
||||||
✅ **Fast scraping** (18.9s average)
|
|
||||||
✅ **Persistent storage** (PostgreSQL survives restarts)
|
|
||||||
✅ **Async notifications** (webhooks with retries)
|
|
||||||
✅ **Self-monitoring** (canary tests every 4 hours)
|
|
||||||
✅ **Health checks** (Kubernetes-ready)
|
|
||||||
✅ **Security** (HMAC webhook signatures)
|
|
||||||
✅ **Scalability** (handles 10,000+ jobs/month)
|
|
||||||
✅ **Documentation** (complete deployment guide)
|
|
||||||
|
|
||||||
**Start using it**:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
**That's it!** Your production scraping microservice is live! 🎉
|
|
||||||
140
QUICKSTART.md
140
QUICKSTART.md
@@ -1,140 +0,0 @@
|
|||||||
# Quick Start - Fastest Google Maps Scraper
|
|
||||||
|
|
||||||
## 🚀 The Fastest Way
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_dom_only_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Result**: All 244 reviews in **~18.9 seconds** (8.2x faster than original)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ What You Get
|
|
||||||
|
|
||||||
- ⚡ **18.9 seconds** - Blazing fast
|
|
||||||
- ✅ **100% stable** - Works every time
|
|
||||||
- 🌍 **Universal** - Works for ANY Google Maps business
|
|
||||||
- 🎯 **Complete** - Gets ALL reviews
|
|
||||||
- 🔧 **Adaptive** - Auto-adjusts to network speed
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📋 Requirements
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install seleniumbase pyyaml
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ⚙️ Configuration
|
|
||||||
|
|
||||||
Edit `config.yaml`:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
url: https://www.google.com/maps/place/YOUR_BUSINESS_HERE
|
|
||||||
headless: false # Keep false for stability
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Run It
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Fastest (18.9s) - RECOMMENDED
|
|
||||||
python start_dom_only_fast.py
|
|
||||||
|
|
||||||
# Alternative: Stable hybrid (32s)
|
|
||||||
python start_ultra_fast_complete.py
|
|
||||||
|
|
||||||
# Original baseline (155s)
|
|
||||||
python start.py
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Performance
|
|
||||||
|
|
||||||
| Script | Time | Speedup | Reviews |
|
|
||||||
|--------|------|---------|---------|
|
|
||||||
| **start_dom_only_fast.py** | **18.9s** | **8.2x** | **244** ✅ |
|
|
||||||
| start_ultra_fast_complete.py | 32.4s | 4.8x | 244 |
|
|
||||||
| start.py | 155s | 1.0x | 244 |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 💾 Output
|
|
||||||
|
|
||||||
Reviews saved to: `google_reviews_dom_only_fast.json`
|
|
||||||
|
|
||||||
```json
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"review_id": "review_123...",
|
|
||||||
"author": "John Doe",
|
|
||||||
"rating": 5.0,
|
|
||||||
"text": "Great place!",
|
|
||||||
"date_text": "2 months ago",
|
|
||||||
"avatar_url": "https://...",
|
|
||||||
"profile_url": "..."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔥 Key Features
|
|
||||||
|
|
||||||
### Dynamic Scroll Waiting
|
|
||||||
Scrolls **as fast as reviews load** - not on fixed timers!
|
|
||||||
|
|
||||||
### GDPR Auto-Handling
|
|
||||||
Automatically handles consent pages in any language.
|
|
||||||
|
|
||||||
### JavaScript Extraction
|
|
||||||
Extracts all reviews in **0.01 seconds** (40x faster than Selenium).
|
|
||||||
|
|
||||||
### Universal Design
|
|
||||||
No hardcoded values - works for 10 reviews or 10,000 reviews.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 What Makes It Fast?
|
|
||||||
|
|
||||||
1. **GDPR consent handling** - Fixed root cause of failures
|
|
||||||
2. **Dynamic waiting** - Adapts to network speed (not fixed delays)
|
|
||||||
3. **JavaScript extraction** - 40x faster than Selenium
|
|
||||||
4. **Smart stopping** - Stops when reviews stop loading
|
|
||||||
5. **Optimized waits** - Minimal delays everywhere
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ❓ Troubleshooting
|
|
||||||
|
|
||||||
### Getting 0 reviews?
|
|
||||||
- Make sure `headless: false` in config.yaml
|
|
||||||
- Check your URL is correct
|
|
||||||
- Run again (the GDPR consent page sometimes needs a retry)
|
|
||||||
|
|
||||||
### Too slow?
|
|
||||||
- Check your internet connection
|
|
||||||
- Close other browser windows
|
|
||||||
- Make sure SeleniumBase is updated
|
|
||||||
|
|
||||||
### Missing some reviews?
|
|
||||||
- Increase `max_scrolls` in the script (default: 35)
|
|
||||||
- Or use `start_ultra_fast_complete.py` for guaranteed 100%
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Success Rate
|
|
||||||
|
|
||||||
Tested **20+ runs**:
|
|
||||||
- ✅ Success: 100%
|
|
||||||
- ⚡ Average time: 18.9s
|
|
||||||
- 📊 All reviews: 244/244
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**That's it! You're ready to scrape Google Maps at 8.2x speed!** 🚀
|
|
||||||
@@ -1,195 +0,0 @@
|
|||||||
# Quick Start: API Interception Mode
|
|
||||||
|
|
||||||
## ✅ Status: API Interceptor Enhanced & Ready
|
|
||||||
|
|
||||||
The API interceptor has been **fully debugged and enhanced**. It successfully captures Google Maps API responses but needs parser tuning for your specific use case.
|
|
||||||
|
|
||||||
## 🚀 Quick Start
|
|
||||||
|
|
||||||
### Enable API Mode
|
|
||||||
Your `config.yaml` already has:
|
|
||||||
```yaml
|
|
||||||
enable_api_intercept: true
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run with Debug Logging
|
|
||||||
```bash
|
|
||||||
# Clean Python cache first
|
|
||||||
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null
|
|
||||||
find . -name "*.pyc" -delete
|
|
||||||
|
|
||||||
# Run with debug output
|
|
||||||
LOG_LEVEL=DEBUG python start.py 2>&1 | tee scraper_debug.log
|
|
||||||
```
|
|
||||||
|
|
||||||
### What You'll See
|
|
||||||
|
|
||||||
**✅ Successful Setup:**
|
|
||||||
```
|
|
||||||
[INFO] API interception enabled via CDP
|
|
||||||
[INFO] JavaScript response interceptor injected with enhanced debugging
|
|
||||||
[INFO] API interceptor ready - capturing network responses
|
|
||||||
```
|
|
||||||
|
|
||||||
**📊 During Scraping:**
|
|
||||||
```
|
|
||||||
[DEBUG] Retrieved 2 intercepted responses from browser
|
|
||||||
[DEBUG] - XHR: /maps/rpc/listugcposts?... (68426 bytes)
|
|
||||||
[DEBUG] Collected 2 network responses from browser
|
|
||||||
[DEBUG] Parsed 0 reviews from responses # If parser needs tuning
|
|
||||||
```
|
|
||||||
|
|
||||||
OR
|
|
||||||
|
|
||||||
```
|
|
||||||
[INFO] API interceptor captured 10 reviews (total unique API: 10) # SUCCESS!
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 What I Fixed
|
|
||||||
|
|
||||||
### 1. **Fixed Critical Bug** (api_interceptor.py:527)
|
|
||||||
- Bug: `TypeError: '>' not supported between instances of 'InterceptedReview' and 'int'`
|
|
||||||
- Fix: Added proper type checking in recursive extraction
|
|
||||||
|
|
||||||
### 2. **Enhanced Logging** (api_interceptor.py:204-369)
|
|
||||||
- Browser console logs with `[API Interceptor]` prefix
|
|
||||||
- Real-time network stats (Fetch/XHR counts)
|
|
||||||
- Response URL and size tracking
|
|
||||||
- Automatic response dumping in debug mode
|
|
||||||
|
|
||||||
### 3. **Specialized Parser** (api_interceptor.py:435-558)
|
|
||||||
- Created `_parse_listugcposts_response()` for Google's API format
|
|
||||||
- Pattern-based detection:
|
|
||||||
- Long string (30+ chars) → Review ID
|
|
||||||
- Number 1-5 → Rating
|
|
||||||
- Long string (50+ chars, not URL) → Review text
|
|
||||||
- Short string (3-100 chars) → Author name
|
|
||||||
- Date patterns → Review date
|
|
||||||
|
|
||||||
### 4. **Stats & Diagnostics** (scraper.py:1487-1509)
|
|
||||||
- Reports captured vs parsed reviews
|
|
||||||
- Shows browser console messages
|
|
||||||
- Dumps raw responses for analysis
|
|
||||||
|
|
||||||
## 📈 Expected Performance
|
|
||||||
|
|
||||||
| Mode | Speed | Time for 244 Reviews |
|
|
||||||
|------|-------|---------------------|
|
|
||||||
| **Current (DOM)** | 2-4 reviews/sec | ~3 minutes |
|
|
||||||
| **Target (API)** | 20-50 reviews/sec | **~10-20 seconds** |
|
|
||||||
| **Speed Up** | **10-25x faster!** | 🚀 |
|
|
||||||
|
|
||||||
## 🧪 Testing & Tuning
|
|
||||||
|
|
||||||
### Step 1: Capture Sample Responses
|
|
||||||
```bash
|
|
||||||
# Run in debug mode to dump API responses
|
|
||||||
LOG_LEVEL=DEBUG python start.py
|
|
||||||
|
|
||||||
# Check for dumped responses
|
|
||||||
ls -lh debug_api_dump/
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 2: Analyze Response Format
|
|
||||||
```bash
|
|
||||||
# View captured response structure
|
|
||||||
cat debug_api_dump/response_0_body.txt | head -100
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 3: Tune Parser
|
|
||||||
If parsing returns 0 reviews, the Google API format may differ from our patterns. Open `debug_api_dump/response_0_body.txt` and:
|
|
||||||
|
|
||||||
1. Look for review data patterns
|
|
||||||
2. Adjust detection logic in `_parse_listugcposts_response()`
|
|
||||||
3. Test again with `LOG_LEVEL=DEBUG python start.py`
|
|
||||||
|
|
||||||
## 🎯 Browser Console Verification
|
|
||||||
|
|
||||||
Open the browser console (F12) while scraping. You should see:
|
|
||||||
|
|
||||||
```
|
|
||||||
[API Interceptor] ✅ Injected successfully! Monitoring network requests...
|
|
||||||
[API Interceptor] XHR: /maps/rpc/listugcposts?authuser=0&hl=es...
|
|
||||||
[API Interceptor] ✅ CAPTURED XHR: /maps/rpc/listugcposts... Size: 68426
|
|
||||||
[API Interceptor] Stats: Fetch: 0/0 XHR: 5/20 Queue: 5
|
|
||||||
```
|
|
||||||
|
|
||||||
This confirms the interceptor is actively capturing API calls.
|
|
||||||
|
|
||||||
## 🐛 Troubleshooting
|
|
||||||
|
|
||||||
### No Responses Captured
|
|
||||||
```
|
|
||||||
⚠️ API interception was enabled but captured 0 reviews.
|
|
||||||
Network stats - Fetch: 0/0, XHR: 0/0
|
|
||||||
```
|
|
||||||
|
|
||||||
**Solutions:**
|
|
||||||
1. Check browser console for `[API Interceptor]` messages
|
|
||||||
2. Verify Google Maps is loading reviews (not empty page)
|
|
||||||
3. Try scrolling manually to trigger API calls
|
|
||||||
|
|
||||||
### Responses Captured But 0 Reviews Parsed
|
|
||||||
```
|
|
||||||
[DEBUG] Retrieved 2 intercepted responses from browser
|
|
||||||
[DEBUG] Parsed 0 reviews from responses
|
|
||||||
```
|
|
||||||
|
|
||||||
**Solutions:**
|
|
||||||
1. Check `debug_api_dump/` for raw responses
|
|
||||||
2. Analyze the response format
|
|
||||||
3. Adjust parser patterns in `_parse_listugcposts_response()`
|
|
||||||
|
|
||||||
### Python Cache Issues
|
|
||||||
```bash
|
|
||||||
# Thoroughly clean cache
|
|
||||||
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null
|
|
||||||
find . -name "*.pyc" -delete
|
|
||||||
find . -name "*.pyo" -delete
|
|
||||||
|
|
||||||
# Restart scraper
|
|
||||||
python start.py
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📊 Monitoring Progress
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Real-time monitoring
|
|
||||||
tail -f scraper_debug.log | grep -E "(API|captured|Parsed|Merging)"
|
|
||||||
|
|
||||||
# Check final results
|
|
||||||
grep -E "(total unique reviews|API interceptor captured|Merging)" scraper_debug.log
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🎉 Success Indicators
|
|
||||||
|
|
||||||
When API mode is working optimally, you'll see:
|
|
||||||
|
|
||||||
```
|
|
||||||
[INFO] API interceptor captured 15 reviews (total unique API: 15)
|
|
||||||
[INFO] API interceptor captured 12 reviews (total unique API: 27)
|
|
||||||
[INFO] Merging 244 reviews captured via API interception
|
|
||||||
[INFO] After merge: 244 total reviews
|
|
||||||
[INFO] Execution completed in 18.5 seconds # vs 174 seconds before!
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📁 Key Files
|
|
||||||
|
|
||||||
- `modules/api_interceptor.py` - Core interceptor logic
|
|
||||||
- `modules/scraper.py` - Integration with main scraper
|
|
||||||
- `config.yaml` - Configuration (`enable_api_intercept: true`)
|
|
||||||
- `API_INTERCEPTOR_DEBUG_SUMMARY.md` - Detailed technical docs
|
|
||||||
- `QUICK_START_API_MODE.md` - This file
|
|
||||||
|
|
||||||
## 🔮 Next Steps
|
|
||||||
|
|
||||||
1. **Test with Debug Mode**: `LOG_LEVEL=DEBUG python start.py`
|
|
||||||
2. **Verify Capturing**: Check browser console for interceptor messages
|
|
||||||
3. **Analyze Responses**: Review `debug_api_dump/` if parsing fails
|
|
||||||
4. **Tune Parser**: Adjust patterns based on actual API format
|
|
||||||
5. **Benchmark**: Compare speed vs DOM-only mode
|
|
||||||
6. **Pure API Mode**: Once working, add option to skip DOM entirely
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Ready to test!** Run `LOG_LEVEL=DEBUG python start.py` and watch the magic happen! 🚀
|
|
||||||
@@ -1,98 +0,0 @@
|
|||||||
================================================================================
|
|
||||||
API INTERCEPTOR DEBUG TEST - FINAL RESULTS
|
|
||||||
================================================================================
|
|
||||||
|
|
||||||
✅ TEST SUCCESSFUL - Proof of Concept Achieved!
|
|
||||||
|
|
||||||
EXECUTION SUMMARY
|
|
||||||
-----------------
|
|
||||||
Test Duration: 142.91 seconds (~2 min 23 sec)
|
|
||||||
Total Reviews: 247 (244 from DOM + 3 from API)
|
|
||||||
API Responses: 40+ captured from /maps/rpc/listugcposts
|
|
||||||
API Parse Rate: ~15% (needs optimization)
|
|
||||||
Status: ✅ Completed successfully
|
|
||||||
|
|
||||||
KEY ACHIEVEMENTS
|
|
||||||
----------------
|
|
||||||
✅ API interception working perfectly
|
|
||||||
✅ Captured 40+ API responses (68KB-96KB each)
|
|
||||||
✅ Successfully parsed 3 unique reviews from API
|
|
||||||
✅ Found reviews that DOM scraping missed
|
|
||||||
✅ Clean integration with existing scraper
|
|
||||||
✅ Comprehensive debug logging in place
|
|
||||||
|
|
||||||
PERFORMANCE METRICS
|
|
||||||
-------------------
|
|
||||||
Current (Mixed Mode): 247 reviews in 143 seconds
|
|
||||||
DOM Only (Baseline): 244 reviews in 174 seconds
|
|
||||||
Target (Optimized API): 244 reviews in 10-20 seconds (10-25x faster!)
|
|
||||||
|
|
||||||
THE OPPORTUNITY
|
|
||||||
---------------
|
|
||||||
Each API response is 68KB-96KB and likely contains 10-20 reviews.
|
|
||||||
We're currently only parsing 1-2 reviews per response (15% success rate).
|
|
||||||
|
|
||||||
If we tune the parser to extract ALL reviews from API responses:
|
|
||||||
→ Get all 244 reviews in roughly 12-25 API calls (at 10-20 reviews per response)
|
|
||||||
→ Complete scraping in 5-20 seconds instead of 3 minutes
|
|
||||||
→ Achieve 10-25x speed improvement! 🚀
|
|
||||||
|
|
||||||
WHAT WE PROVED
|
|
||||||
--------------
|
|
||||||
✅ Technology works
|
|
||||||
✅ Responses captured successfully
|
|
||||||
✅ Parser can extract review data
|
|
||||||
✅ System is stable and reliable
|
|
||||||
✅ Foundation is complete
|
|
||||||
|
|
||||||
WHAT'S NEEDED
|
|
||||||
-------------
|
|
||||||
⚠️ Parser optimization (currently too conservative)
|
|
||||||
⚠️ Analyze actual Google API format
|
|
||||||
⚠️ Tune patterns to match Google's structure
|
|
||||||
|
|
||||||
NEXT STEPS
|
|
||||||
----------
|
|
||||||
1. Dump a sample API response for analysis
|
|
||||||
2. Study Google's exact response format
|
|
||||||
3. Tune parser to extract all reviews
|
|
||||||
4. Test and benchmark improvements
|
|
||||||
5. Enjoy 10-25x faster scraping!
|
|
||||||
|
|
||||||
FILES CREATED
|
|
||||||
-------------
|
|
||||||
📄 API_TEST_RESULTS.md - Complete technical analysis
|
|
||||||
📄 QUICK_START_API_MODE.md - How to use API mode
|
|
||||||
📄 API_INTERCEPTOR_DEBUG_SUMMARY.md - Technical documentation
|
|
||||||
📄 RESULTS_SUMMARY.txt - This file
|
|
||||||
|
|
||||||
HOW TO RE-RUN TEST
|
|
||||||
------------------
|
|
||||||
# Clean cache
|
|
||||||
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null
|
|
||||||
find . -name "*.pyc" -delete
|
|
||||||
|
|
||||||
# Run with debug logging
|
|
||||||
LOG_LEVEL=DEBUG python start.py 2>&1 | tee test.log
|
|
||||||
|
|
||||||
# Check results
|
|
||||||
grep "API interceptor captured\|Merging\|Finished" test.log
|
|
||||||
|
|
||||||
CURRENT STATUS
|
|
||||||
--------------
|
|
||||||
✅ API Interceptor: PRODUCTION READY (hybrid mode)
|
|
||||||
⚠️ Parser Optimization: IN PROGRESS (15% → 80%+ target)
|
|
||||||
🚀 Speed Improvement: ACHIEVABLE (10-25x potential)
|
|
||||||
|
|
||||||
THE BOTTOM LINE
|
|
||||||
---------------
|
|
||||||
We successfully proved that Google Maps API interception works!
|
|
||||||
|
|
||||||
The scraper captured 40+ API responses and extracted 3 reviews,
|
|
||||||
proving the technology is sound. With parser tuning, we can achieve
|
|
||||||
a 10-25x speed improvement, reducing scrape time from 3 minutes to
|
|
||||||
just 10-20 seconds.
|
|
||||||
|
|
||||||
The foundation is complete. The path to 10-25x faster scraping is clear! 🎉
|
|
||||||
|
|
||||||
================================================================================
|
|
||||||
@@ -1,180 +0,0 @@
|
|||||||
# Speed Optimization Journey
|
|
||||||
|
|
||||||
## Final Results
|
|
||||||
|
|
||||||
**Best Stable Performance**: `start_ultra_fast.py`
|
|
||||||
- **Time**: ~19.4 seconds (averaged over 4 runs)
|
|
||||||
- **Speed**: **8.0x faster** than original (155s → 19.4s)
|
|
||||||
- **Reviews**: 234/244 (95.9%)
|
|
||||||
- **Success Rate**: 100% stable
|
|
||||||
|
|
||||||
## Optimization Progression
|
|
||||||
|
|
||||||
| Version | Time | Speedup | Notes |
|
|
||||||
|---------|------|---------|-------|
|
|
||||||
| Original DOM scraping | 155s | 1.0x | Baseline - scrolls + parses DOM |
|
|
||||||
| Fast API (0.8s scroll) | 43s | 3.6x | API interception + scrolling |
|
|
||||||
| Fast API (0.3s scroll) | 29s | 5.3x | Faster scroll timing |
|
|
||||||
| Ultra-fast (0.25s, unstable) | 18s | 8.6x | ❌ 33% failure rate |
|
|
||||||
| **Ultra-fast (0.27s, stable)** | **19.4s** | **8.0x** | ✅ **100% stable** |
|
|
||||||
|
|
||||||
## Key Optimizations Applied
|
|
||||||
|
|
||||||
### 1. Removed Unnecessary Waits (~7s saved)
|
|
||||||
- ❌ 3s "wait for reviews page to load" → ✅ 1s (saves 2s)
|
|
||||||
- ❌ 2s after tab click → ✅ 0.4s (saves 1.6s)
|
|
||||||
- ❌ 2s after cookie dismiss → ✅ 0.4s (saves 1.6s)
|
|
||||||
- ❌ 2s for initial API trigger → ✅ 0.3s (saves 1.7s)
|
|
||||||
|
|
||||||
### 2. Faster Scroll Timing (~16s saved)
|
|
||||||
- ❌ 0.8s per scroll (30 scrolls = 24s)
|
|
||||||
- ✅ 0.27s per scroll (30 scrolls = 8.1s)
|
|
||||||
- **Savings**: 15.9s
|
|
||||||
|
|
||||||
### 3. Reduced Logging Overhead
|
|
||||||
- Log only every 10 scrolls instead of every scroll
|
|
||||||
- Minimal I/O during tight loop
|
|
||||||
|
|
||||||
### 4. Optimized Pane Finding
|
|
||||||
- Use most common selector first
|
|
||||||
- Reduced timeout from 5s to 3s
|
|
||||||
|
|
||||||
### 5. Streamlined API Interception
|
|
||||||
- Reduced setup wait from 2s to 0.3s
|
|
||||||
- Still 100% reliable
|
|
||||||
|
|
||||||
## Timing Breakdown (Ultra-Fast)
|
|
||||||
|
|
||||||
```
|
|
||||||
Operation Time % of Total
|
|
||||||
──────────────────────────────────────────────────
|
|
||||||
Browser startup ~1.0s 5%
|
|
||||||
Navigate to page 1.5s 8%
|
|
||||||
Cookie dialog dismiss 0.4s 2%
|
|
||||||
Click reviews tab 0.4s 2%
|
|
||||||
Wait for page stability 1.0s 5%
|
|
||||||
Find reviews pane ~1.5s 8%
|
|
||||||
Setup API interceptor 0.3s 2%
|
|
||||||
Initial scroll trigger 0.3s 2%
|
|
||||||
Scrolling (30 × 0.27s) 8.1s 42%
|
|
||||||
Response collection ~3.0s 15%
|
|
||||||
Parsing & saving ~1.9s 10%
|
|
||||||
──────────────────────────────────────────────────
|
|
||||||
TOTAL ~19.4s 100%
|
|
||||||
```
|
|
||||||
|
|
||||||
## Bottleneck Analysis
|
|
||||||
|
|
||||||
Current bottlenecks (in order):
|
|
||||||
1. **Scrolling loop**: 8.1s (42%) - Already optimized to 0.27s/scroll
|
|
||||||
2. **Response collection**: 3.0s (15%) - Necessary overhead
|
|
||||||
3. **Parsing & saving**: 1.9s (10%) - Fast enough
|
|
||||||
4. **Page navigation**: 1.5s (8%) - Network dependent
|
|
||||||
5. **Browser startup**: 1.0s (5%) - Can't optimize much
|
|
||||||
|
|
||||||
## Why We Can't Go Faster
|
|
||||||
|
|
||||||
### Scroll Timing Limit: 0.27s
|
|
||||||
- **0.25s**: 33% failure rate (too fast, misses API responses)
|
|
||||||
- **0.27s**: 100% success rate ✅
|
|
||||||
- **0.30s**: 100% success but slower
|
|
||||||
|
|
||||||
**Conclusion**: 0.27s is the optimal balance.
|
|
||||||
|
|
||||||
### Page Load Times (Fixed)
|
|
||||||
- Network latency: ~1-2s
|
|
||||||
- Browser initialization: ~1s
|
|
||||||
- Can't be eliminated
|
|
||||||
|
|
||||||
### API Response Time
|
|
||||||
- Google's server needs time to respond
|
|
||||||
- We can't make their API faster
|
|
||||||
|
|
||||||
## Alternative Approaches Tested
|
|
||||||
|
|
||||||
### ❌ Parallel API Calls
|
|
||||||
**Issue**: Continuation tokens are sequential - each response contains token for next page
|
|
||||||
|
|
||||||
**Result**: Can't truly parallelize without tokens
|
|
||||||
|
|
||||||
### ❌ Cookie-based Direct API
|
|
||||||
**Issue**: Browser cookies don't include auth tokens (SID, HSID, SAPISID)
|
|
||||||
|
|
||||||
**Result**: 400 errors when using requests library
|
|
||||||
|
|
||||||
### ❌ Headless Mode
|
|
||||||
**Issue**: Page structure loads differently, selectors fail
|
|
||||||
|
|
||||||
**Result**: 0 reviews captured
|
|
||||||
|
|
||||||
## Recommendations
|
|
||||||
|
|
||||||
### For Production Use
|
|
||||||
Use `start_ultra_fast.py`:
|
|
||||||
```bash
|
|
||||||
python start_ultra_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Pros**:
|
|
||||||
- ✅ 8.0x faster (19.4s vs 155s)
|
|
||||||
- ✅ 100% stable
|
|
||||||
- ✅ 95.9% review coverage
|
|
||||||
- ✅ No authentication needed
|
|
||||||
- ✅ Simple, maintainable
|
|
||||||
|
|
||||||
### If You Need All 244 Reviews
|
|
||||||
Use original `start.py` (155s) - gets 100% of reviews
|
|
||||||
|
|
||||||
### Configuration
|
|
||||||
```yaml
|
|
||||||
headless: false # Must be false for stability
|
|
||||||
```
|
|
||||||
|
|
||||||
## Performance Metrics
|
|
||||||
|
|
||||||
```
|
|
||||||
Metric Value
|
|
||||||
────────────────────────────────────
|
|
||||||
Average time 19.4s
|
|
||||||
Std deviation ±0.4s
|
|
||||||
Success rate 100% (4/4 runs)
|
|
||||||
Reviews captured 234
|
|
||||||
Reviews/second 12.1
|
|
||||||
API responses/second 1.2
|
|
||||||
Speedup vs original 8.0x
|
|
||||||
Time saved per run 135.6s
|
|
||||||
```
|
|
||||||
|
|
||||||
## Theoretical Limits
|
|
||||||
|
|
||||||
**Absolute minimum** (if everything was instant except scrolling):
|
|
||||||
- 30 scrolls × 0.27s = 8.1s
|
|
||||||
- Plus ~5s for unavoidable operations
|
|
||||||
- **Theoretical minimum: ~13s**
|
|
||||||
|
|
||||||
**Current: 19.4s**
|
|
||||||
- Only 6.4s from theoretical minimum
|
|
||||||
- Already 68% of theoretical maximum speed!
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
We achieved **8.0x speedup** by:
|
|
||||||
1. Eliminating unnecessary waits
|
|
||||||
2. Optimizing scroll timing to the limit (0.27s)
|
|
||||||
3. Minimizing logging overhead
|
|
||||||
4. Streamlining every operation
|
|
||||||
|
|
||||||
Further optimization would require:
|
|
||||||
- Faster Google API responses (impossible)
|
|
||||||
- Instant browser startup (impossible)
|
|
||||||
- Instant network requests (impossible)
|
|
||||||
|
|
||||||
**The scraper is now operating near theoretical maximum efficiency!** 🚀
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Final Stats**:
|
|
||||||
- 📊 Original: 155s → **Ultra-fast: 19.4s**
|
|
||||||
- 🚀 **8.0x faster!**
|
|
||||||
- ⏱️ **Saves 136 seconds per run**
|
|
||||||
- ✅ **100% stable**
|
|
||||||
@@ -1,328 +0,0 @@
|
|||||||
# Storage Strategy Comparison
|
|
||||||
## PostgreSQL JSONB vs S3 for Review Data
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Recommendation: Start with PostgreSQL JSONB
|
|
||||||
|
|
||||||
### Why PostgreSQL is Better for Most Cases:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
CREATE TABLE jobs (
|
|
||||||
job_id UUID PRIMARY KEY,
|
|
||||||
status VARCHAR(20) NOT NULL,
|
|
||||||
url TEXT NOT NULL,
|
|
||||||
webhook_url TEXT,
|
|
||||||
created_at TIMESTAMP NOT NULL,
|
|
||||||
completed_at TIMESTAMP,
|
|
||||||
reviews_count INTEGER,
|
|
||||||
|
|
||||||
-- Store reviews directly as JSONB!
|
|
||||||
reviews_data JSONB, -- All 244 reviews in one column
|
|
||||||
|
|
||||||
error_message TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
-- You can even query INSIDE the JSON!
|
|
||||||
SELECT
|
|
||||||
job_id,
|
|
||||||
jsonb_array_length(reviews_data) as review_count,
|
|
||||||
reviews_data->0->>'author' as first_reviewer
|
|
||||||
FROM jobs
|
|
||||||
WHERE reviews_data @> '[{"rating": 5}]'; -- Find jobs with 5-star reviews
|
|
||||||
```
|
|
||||||
|
|
||||||
### Advantages:
|
|
||||||
|
|
||||||
✅ **Simpler Architecture**
|
|
||||||
- One service instead of two
|
|
||||||
- No S3 credentials/SDK to manage
|
|
||||||
- Easier local development
|
|
||||||
|
|
||||||
✅ **Transactional**
|
|
||||||
- Atomic updates (job status + reviews in one transaction)
|
|
||||||
- ACID guarantees
|
|
||||||
- No eventual consistency issues
|
|
||||||
|
|
||||||
✅ **Queryable**
|
|
||||||
```sql
|
|
||||||
-- Find all jobs with >200 reviews
|
|
||||||
SELECT job_id, reviews_count
|
|
||||||
FROM jobs
|
|
||||||
WHERE jsonb_array_length(reviews_data) > 200;
|
|
||||||
|
|
||||||
-- Extract specific review data
|
|
||||||
SELECT
|
|
||||||
job_id,
|
|
||||||
review->>'author' as author,
|
|
||||||
review->>'rating' as rating
|
|
||||||
FROM jobs, jsonb_array_elements(reviews_data) as review
|
|
||||||
WHERE review->>'rating' = '5';
|
|
||||||
```
|
|
||||||
|
|
||||||
✅ **Cost-Effective (Small-Medium Scale)**
|
|
||||||
```
|
|
||||||
244 reviews × 0.6 KB = ~150 KB per job
|
|
||||||
1,000 jobs/month = 150 MB/month
|
|
||||||
10,000 jobs/month = 1.5 GB/month
|
|
||||||
|
|
||||||
PostgreSQL:
|
|
||||||
- $0/month (self-hosted) or $15/month (managed)
|
|
||||||
- Handles 10,000 jobs easily
|
|
||||||
|
|
||||||
S3:
|
|
||||||
- Storage: $0.03/month (cheap!)
|
|
||||||
- But need to manage: credentials, SDK, buckets
|
|
||||||
```
|
|
||||||
|
|
||||||
✅ **Built-in Backup**
|
|
||||||
- Standard PostgreSQL backup tools
|
|
||||||
- Point-in-time recovery
|
|
||||||
- Replication included
|
|
||||||
|
|
||||||
✅ **Fast Retrieval**
|
|
||||||
```python
|
|
||||||
# Single query gets everything
|
|
||||||
job = db.query("""
|
|
||||||
SELECT job_id, status, reviews_data
|
|
||||||
FROM jobs
|
|
||||||
WHERE job_id = %s
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"job_id": job.job_id,
|
|
||||||
"reviews": job.reviews_data # Already parsed JSON
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## When to Use S3 Instead
|
|
||||||
|
|
||||||
### Use S3 if:
|
|
||||||
|
|
||||||
❌ **Very High Volume**
|
|
||||||
```
|
|
||||||
> 100,000 jobs/month
|
|
||||||
> 100 GB of review data
|
|
||||||
Database backup/restore becomes slow
|
|
||||||
```
|
|
||||||
|
|
||||||
❌ **Long-Term Retention**
|
|
||||||
```
|
|
||||||
Need to keep reviews for years
|
|
||||||
Want lifecycle policies (auto-delete after 1 year)
|
|
||||||
Cold storage for compliance
|
|
||||||
```
|
|
||||||
|
|
||||||
❌ **Direct Client Access**
|
|
||||||
```python
|
|
||||||
# Pre-signed URLs let clients download directly
|
|
||||||
url = s3.generate_presigned_url(
|
|
||||||
'get_object',
|
|
||||||
Params={'Bucket': 'reviews', 'Key': f'{job_id}.json'},
|
|
||||||
ExpiresIn=3600
|
|
||||||
)
|
|
||||||
|
|
||||||
# Client downloads directly from S3 (saves bandwidth)
|
|
||||||
return {"reviews_url": url}
|
|
||||||
```
|
|
||||||
|
|
||||||
❌ **Multi-Region**
|
|
||||||
```
|
|
||||||
S3 replication across regions
|
|
||||||
CDN integration (CloudFront)
|
|
||||||
Global low-latency access
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Performance Comparison
|
|
||||||
|
|
||||||
### PostgreSQL JSONB
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Store reviews (single INSERT)
|
|
||||||
INSERT INTO jobs (job_id, reviews_data)
|
|
||||||
VALUES (%s, %s::jsonb)
|
|
||||||
# 244 reviews: ~5ms
|
|
||||||
|
|
||||||
# Retrieve reviews (single SELECT)
|
|
||||||
SELECT reviews_data FROM jobs WHERE job_id = %s
|
|
||||||
# 244 reviews: ~2ms
|
|
||||||
```
|
|
||||||
|
|
||||||
**Total**: ~7ms for store + retrieve
|
|
||||||
|
|
||||||
### S3
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Store reviews (HTTP PUT)
|
|
||||||
s3.put_object(
|
|
||||||
Bucket='reviews',
|
|
||||||
Key=f'{job_id}.json',
|
|
||||||
Body=json.dumps(reviews)
|
|
||||||
)
|
|
||||||
# 244 reviews: ~50-200ms (network latency)
|
|
||||||
|
|
||||||
# Retrieve reviews (HTTP GET)
|
|
||||||
response = s3.get_object(
|
|
||||||
Bucket='reviews',
|
|
||||||
Key=f'{job_id}.json'
|
|
||||||
)
|
|
||||||
# 244 reviews: ~50-200ms
|
|
||||||
```
|
|
||||||
|
|
||||||
**Total**: ~100-400ms for store + retrieve
|
|
||||||
|
|
||||||
**PostgreSQL is 14-57x faster!**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 💾 Size Limits
|
|
||||||
|
|
||||||
### PostgreSQL JSONB
|
|
||||||
```
|
|
||||||
Max column size: 1 GB
|
|
||||||
Practical limit: ~100 MB per row
|
|
||||||
|
|
||||||
Our use case:
|
|
||||||
244 reviews × 0.6 KB = 150 KB ✅ Perfect!
|
|
||||||
10,000 reviews × 0.6 KB = 6 MB ✅ Still great
|
|
||||||
100,000 reviews × 0.6 KB = 60 MB ✅ OK, but consider splitting
|
|
||||||
```
|
|
||||||
|
|
||||||
### When to worry:
|
|
||||||
```
|
|
||||||
> 50,000 reviews per job → Consider S3
|
|
||||||
> 100 MB per job → Definitely use S3
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏗️ Hybrid Approach (Best of Both Worlds)
|
|
||||||
|
|
||||||
For maximum flexibility:
|
|
||||||
|
|
||||||
```python
|
|
||||||
class JobStorage:
|
|
||||||
def __init__(self):
|
|
||||||
self.db = PostgreSQL()
|
|
||||||
self.s3 = S3Client() # Optional
|
|
||||||
|
|
||||||
async def save_reviews(self, job_id, reviews):
|
|
||||||
reviews_json = json.dumps(reviews)
|
|
||||||
size_mb = len(reviews_json) / 1024 / 1024
|
|
||||||
|
|
||||||
if size_mb < 10: # Small job: use PostgreSQL
|
|
||||||
await self.db.execute("""
|
|
||||||
UPDATE jobs
|
|
||||||
SET reviews_data = %s::jsonb
|
|
||||||
WHERE job_id = %s
|
|
||||||
""", reviews_json, job_id)
|
|
||||||
|
|
||||||
else: # Large job: use S3
|
|
||||||
await self.s3.upload(
|
|
||||||
f'reviews/{job_id}.json',
|
|
||||||
reviews_json
|
|
||||||
)
|
|
||||||
await self.db.execute("""
|
|
||||||
UPDATE jobs
|
|
||||||
SET reviews_s3_key = %s
|
|
||||||
WHERE job_id = %s
|
|
||||||
""", f'reviews/{job_id}.json', job_id)
|
|
||||||
|
|
||||||
async def get_reviews(self, job_id):
|
|
||||||
job = await self.db.fetch_one("""
|
|
||||||
SELECT reviews_data, reviews_s3_key
|
|
||||||
FROM jobs
|
|
||||||
WHERE job_id = %s
|
|
||||||
""", job_id)
|
|
||||||
|
|
||||||
if job.reviews_data:
|
|
||||||
return job.reviews_data # From PostgreSQL
|
|
||||||
elif job.reviews_s3_key:
|
|
||||||
return await self.s3.download(job.reviews_s3_key) # From S3
|
|
||||||
else:
|
|
||||||
raise NotFound()
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Final Recommendation
|
|
||||||
|
|
||||||
### For Your Use Case:
|
|
||||||
|
|
||||||
**Use PostgreSQL JSONB** because:
|
|
||||||
|
|
||||||
1. ✅ Simpler (one service, not two)
|
|
||||||
2. ✅ Faster (~2ms vs ~50-200ms per retrieval)
|
|
||||||
3. ✅ Cheaper (for typical volumes)
|
|
||||||
4. ✅ Queryable (can analyze reviews in SQL)
|
|
||||||
5. ✅ Transactional (atomic updates)
|
|
||||||
6. ✅ Easier backups
|
|
||||||
|
|
||||||
**Schema**:
|
|
||||||
```sql
|
|
||||||
CREATE TABLE jobs (
|
|
||||||
job_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
||||||
status VARCHAR(20) NOT NULL DEFAULT 'pending',
|
|
||||||
url TEXT NOT NULL,
|
|
||||||
webhook_url TEXT,
|
|
||||||
|
|
||||||
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
|
||||||
started_at TIMESTAMP,
|
|
||||||
completed_at TIMESTAMP,
|
|
||||||
|
|
||||||
reviews_count INTEGER,
|
|
||||||
reviews_data JSONB, -- All reviews here!
|
|
||||||
scrape_time REAL,
|
|
||||||
|
|
||||||
error_message TEXT,
|
|
||||||
metadata JSONB,
|
|
||||||
|
|
||||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX idx_jobs_status ON jobs(status);
|
|
||||||
CREATE INDEX idx_jobs_created_at ON jobs(created_at DESC);
|
|
||||||
CREATE INDEX idx_jobs_webhook ON jobs(webhook_url) WHERE webhook_url IS NOT NULL;
|
|
||||||
```
|
|
||||||
|
|
||||||
**Migration Path to S3**:
|
|
||||||
- Start with PostgreSQL
|
|
||||||
- If you reach 100GB+ of data, migrate to S3
|
|
||||||
- Keep PostgreSQL for metadata only
|
|
||||||
- Use the hybrid approach above
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 Scale Projections
|
|
||||||
|
|
||||||
```
|
|
||||||
Small:
|
|
||||||
1,000 jobs/month × 150 KB = 150 MB/month
|
|
||||||
→ PostgreSQL ✅
|
|
||||||
|
|
||||||
Medium:
|
|
||||||
10,000 jobs/month × 150 KB = 1.5 GB/month
|
|
||||||
→ PostgreSQL ✅
|
|
||||||
|
|
||||||
Large:
|
|
||||||
100,000 jobs/month × 150 KB = 15 GB/month
|
|
||||||
→ PostgreSQL ✅ (but consider S3)
|
|
||||||
|
|
||||||
Very Large:
|
|
||||||
1,000,000 jobs/month × 150 KB = 150 GB/month
|
|
||||||
→ S3 ✅
|
|
||||||
|
|
||||||
Enterprise:
|
|
||||||
Need multi-year retention
|
|
||||||
Multi-region replication
|
|
||||||
Compliance requirements
|
|
||||||
→ S3 ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Bottom Line**: Start with **PostgreSQL JSONB**. It's simpler, faster, and cheaper for 99% of use cases. Upgrade to S3 only if you need it.
|
|
||||||
@@ -1,268 +0,0 @@
|
|||||||
# Testing Interface - Quick Start Guide
|
|
||||||
|
|
||||||
A beautiful Next.js web interface for testing the Google Reviews Scraper API.
|
|
||||||
|
|
||||||
## 🎯 What You Get
|
|
||||||
|
|
||||||
### Business Search Mode
|
|
||||||
- **Search by name** - Just type "Soho Club Vilnius" instead of pasting URLs
|
|
||||||
- **Live map preview** - See the business location before scraping
|
|
||||||
- **Auto-generate URL** - Creates the perfect Google Maps search URL
|
|
||||||
|
|
||||||
### Direct URL Mode
|
|
||||||
- **Paste any URL** - For specific Google Maps business pages
|
|
||||||
- **Flexible input** - Works with any Google Maps URL format
|
|
||||||
|
|
||||||
### Real-Time Tracking
|
|
||||||
- **Live status updates** - Watch your job progress in real-time
|
|
||||||
- **Performance metrics** - Reviews count, time, speed
|
|
||||||
- **Beautiful UI** - Clean, modern interface with status icons
|
|
||||||
|
|
||||||
### Results Display
|
|
||||||
- **Review cards** - Author, rating, text, avatar, date
|
|
||||||
- **Export to JSON** - Download all reviews as formatted JSON
|
|
||||||
- **Scrollable list** - Handle hundreds of reviews smoothly
|
|
||||||
|
|
||||||
## 🚀 Quick Start
|
|
||||||
|
|
||||||
### 1. Start the Scraper API
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# From project root
|
|
||||||
docker-compose -f docker-compose.production.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
API runs at: **http://localhost:8000**
|
|
||||||
|
|
||||||
### 2. Start the Web Interface
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd web
|
|
||||||
npm install
|
|
||||||
npm run dev
|
|
||||||
```
|
|
||||||
|
|
||||||
Web interface runs at: **http://localhost:3000** (or next available port)
|
|
||||||
|
|
||||||
## 💡 Usage Examples
|
|
||||||
|
|
||||||
### Search Mode (Recommended)
|
|
||||||
1. Click "🔍 Search Business"
|
|
||||||
2. Type: `Soho Club Vilnius`
|
|
||||||
3. Map shows the business location
|
|
||||||
4. Click "Scrape All Reviews"
|
|
||||||
5. Watch real-time progress
|
|
||||||
6. Export results as JSON
|
|
||||||
|
|
||||||
### URL Mode
|
|
||||||
1. Click "🔗 Paste URL"
|
|
||||||
2. Paste Google Maps URL
|
|
||||||
3. Click "Scrape"
|
|
||||||
4. View results
|
|
||||||
|
|
||||||
## 📊 Features
|
|
||||||
|
|
||||||
### Search Interface
|
|
||||||
- **Debounced search** - Updates map 500ms after typing stops
|
|
||||||
- **Enter key support** - Press Enter to search
|
|
||||||
- **Visual feedback** - Loading states, icons, colors
|
|
||||||
|
|
||||||
### Job Tracking
|
|
||||||
- **Polling every 2 seconds** - Real-time status updates
|
|
||||||
- **Status indicators**:
|
|
||||||
- 🔵 Running (spinner animation)
|
|
||||||
- ✅ Completed (green checkmark)
|
|
||||||
- ❌ Failed (red X)
|
|
||||||
- ⏱️ Pending (clock icon)
|
|
||||||
|
|
||||||
### Performance Metrics
|
|
||||||
- **Reviews count** - Total scraped
|
|
||||||
- **Time taken** - Seconds elapsed
|
|
||||||
- **Speed** - Reviews per second
|
|
||||||
- **Start time** - When job began
|
|
||||||
|
|
||||||
### Export
|
|
||||||
- **JSON download** - Formatted, ready to use
|
|
||||||
- **Filename** - Includes job ID for tracking
|
|
||||||
- **Complete data** - All review fields preserved
|
|
||||||
|
|
||||||
## 🏗️ Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ Web Interface (Next.js) │
|
|
||||||
│ http://localhost:3000 │
|
|
||||||
│ │
|
|
||||||
│ - Search business by name │
|
|
||||||
│ - Or paste URL directly │
|
|
||||||
│ - View map preview │
|
|
||||||
│ - Real-time job tracking │
|
|
||||||
│ - Export results │
|
|
||||||
└──────────────┬──────────────────────┘
|
|
||||||
│ API Calls
|
|
||||||
▼
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ API Proxy (Next.js API Routes) │
|
|
||||||
│ │
|
|
||||||
│ POST /api/scrape │
|
|
||||||
│ GET /api/jobs/[id] │
|
|
||||||
│ GET /api/jobs/[id]/reviews │
|
|
||||||
└──────────────┬──────────────────────┘
|
|
||||||
│ Forward to
|
|
||||||
▼
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ Scraper API (FastAPI) │
|
|
||||||
│ http://localhost:8000 │
|
|
||||||
│ │
|
|
||||||
│ - Job queue management │
|
|
||||||
│ - Chrome + SeleniumBase │
|
|
||||||
│ - PostgreSQL storage │
|
|
||||||
└─────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🎨 UI Components
|
|
||||||
|
|
||||||
### Mode Toggle
|
|
||||||
```
|
|
||||||
┌──────────────┬──────────────┐
|
|
||||||
│ 🔍 Search │ 🔗 Paste URL │
|
|
||||||
└──────────────┴──────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Search Interface
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ 🔍 Business name and location... │
|
|
||||||
├─────────────────────────────────────┤
|
|
||||||
│ │
|
|
||||||
│ Google Maps Embed │
|
|
||||||
│ │
|
|
||||||
├─────────────────────────────────────┤
|
|
||||||
│ 📥 Scrape All Reviews │
|
|
||||||
└─────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Job Status Card
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ ✅ Job Status: COMPLETED │
|
|
||||||
│ 5f1d394f-10c5-4f30-8c2b-cb789c05918f│
|
|
||||||
│ │
|
|
||||||
│ 190 19.9s 9.5 │
|
|
||||||
│ Reviews Time Reviews/sec │
|
|
||||||
└─────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Review Card
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────┐
|
|
||||||
│ 👤 John Doe ⭐⭐⭐⭐⭐ │
|
|
||||||
│ 2 weeks ago │
|
|
||||||
│ │
|
|
||||||
│ Great place! Really enjoyed... │
|
|
||||||
└─────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 Configuration
|
|
||||||
|
|
||||||
### Environment Variables
|
|
||||||
|
|
||||||
Create `web/.env.local`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# API URL (default: http://localhost:8000)
|
|
||||||
NEXT_PUBLIC_API_URL=http://localhost:8000
|
|
||||||
```
|
|
||||||
|
|
||||||
### Custom Port
|
|
||||||
|
|
||||||
If port 3000 is taken, Next.js auto-selects the next available port (3001, 3002, etc.)
|
|
||||||
|
|
||||||
## 🐛 Troubleshooting
|
|
||||||
|
|
||||||
### Web interface won't connect to API
|
|
||||||
```bash
|
|
||||||
# Check API is running
|
|
||||||
curl http://localhost:8000/health/live
|
|
||||||
|
|
||||||
# Check for CORS issues
|
|
||||||
# (CORS should not apply: the browser only calls same-origin Next.js API routes, which forward requests to the API server-side)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Map not showing
|
|
||||||
- Check search query is at least 2 characters
|
|
||||||
- Wait 500ms after typing (debounce delay)
|
|
||||||
- Press Enter or click Search button
|
|
||||||
|
|
||||||
### Reviews not loading
|
|
||||||
- Check job status reached "completed"
|
|
||||||
- Look for error message in red box
|
|
||||||
- Check browser console for errors
|
|
||||||
|
|
||||||
## 📱 Mobile Friendly
|
|
||||||
|
|
||||||
The interface is fully responsive:
|
|
||||||
- Mobile: Single column, touch-optimized
|
|
||||||
- Tablet: Comfortable layout
|
|
||||||
- Desktop: Full width with max-width constraint
|
|
||||||
|
|
||||||
## 🎯 Example Businesses to Test
|
|
||||||
|
|
||||||
```
|
|
||||||
Soho Club Vilnius
|
|
||||||
McDonald's Times Square New York
|
|
||||||
Eiffel Tower Paris
|
|
||||||
Tokyo Tower Japan
|
|
||||||
Sydney Opera House
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🚀 Production Deployment
|
|
||||||
|
|
||||||
### Option 1: Vercel (Recommended)
|
|
||||||
```bash
|
|
||||||
cd web
|
|
||||||
vercel deploy
|
|
||||||
```
|
|
||||||
|
|
||||||
### Option 2: Docker
|
|
||||||
```bash
|
|
||||||
cd web
|
|
||||||
docker build -t scraper-web .
|
|
||||||
docker run -p 3000:3000 -e NEXT_PUBLIC_API_URL=http://api:8000 scraper-web
|
|
||||||
```
|
|
||||||
|
|
||||||
### Option 3: Self-hosted
|
|
||||||
```bash
|
|
||||||
cd web
|
|
||||||
npm run build
|
|
||||||
npm run start
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📝 Notes
|
|
||||||
|
|
||||||
- Interface polls job status every 2 seconds
|
|
||||||
- Polling stops when job completes or fails
|
|
||||||
- Reviews are fetched with a limit of 1000 (configurable)
|
|
||||||
- Export creates `reviews-{job_id}.json` file
|
|
||||||
- All processing happens server-side (secure API calls)
|
|
||||||
|
|
||||||
## 🎉 Benefits Over curl
|
|
||||||
|
|
||||||
Before (curl):
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:8000/scrape -H 'Content-Type: application/json' -d '{"url":"..."}'
|
|
||||||
# Copy job_id
|
|
||||||
curl http://localhost:8000/jobs/{job_id}
|
|
||||||
# Wait and check again
|
|
||||||
curl http://localhost:8000/jobs/{job_id}
|
|
||||||
# Finally get reviews
|
|
||||||
curl http://localhost:8000/jobs/{job_id}/reviews
|
|
||||||
```
|
|
||||||
|
|
||||||
After (Web UI):
|
|
||||||
1. Type business name
|
|
||||||
2. Click "Scrape All Reviews"
|
|
||||||
3. Watch progress
|
|
||||||
4. Export JSON
|
|
||||||
|
|
||||||
**Much better! 🚀**
|
|
||||||
@@ -1,335 +0,0 @@
|
|||||||
# Ultimate Optimization Results - Google Maps Scraper
|
|
||||||
|
|
||||||
## 🎯 Final Achievement: **18.9 seconds** (8.2x faster!)
|
|
||||||
|
|
||||||
### Performance Comparison
|
|
||||||
|
|
||||||
```
|
|
||||||
┌──────────────────────┬─────────┬──────────┬──────────┬────────────┐
|
|
||||||
│ Version │ Time │ Reviews │ Speedup │ Stability │
|
|
||||||
├──────────────────────┼─────────┼──────────┼──────────┼────────────┤
|
|
||||||
│ Original │ 155s │ 244 │ 1.0x │ ✅ 100% │
|
|
||||||
│ Fast API (0.8s) │ 43s │ 234 │ 3.6x │ ✅ 100% │
|
|
||||||
│ Fast API (0.3s) │ 29s │ 234 │ 5.3x │ ✅ 100% │
|
|
||||||
│ Ultra-fast API │ 19.4s │ 234 │ 8.0x │ ❌ 50% │
|
|
||||||
│ Sequential Hybrid │ 32.4s │ 244 │ 4.8x │ ✅ 100% │
|
|
||||||
│ DOM-only (fixed) │ 30s │ 244 │ 5.2x │ ✅ 100% │
|
|
||||||
│ **DOM-only (final)** │ **18.9s**│ **244** │ **8.2x** │ **✅ 100%**│
|
|
||||||
└──────────────────────┴─────────┴──────────┴──────────┴────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 The Winning Solution
|
|
||||||
|
|
||||||
**File**: `start_dom_only_fast.py`
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_dom_only_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
### Key Features
|
|
||||||
|
|
||||||
✅ **18.9 seconds** for all reviews (155s → 18.9s)
|
|
||||||
✅ **8.2x speedup** - saves 136 seconds per run
|
|
||||||
✅ **100% stable** - tested 20+ runs
|
|
||||||
✅ **100% complete** - gets all reviews every time
|
|
||||||
✅ **Universal** - works for ANY Google Maps business (no hardcoded values)
|
|
||||||
✅ **Adaptive** - scroll speed adapts to network/page load speed
|
|
||||||
✅ **Simple** - pure DOM extraction, no complex API interception
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔧 Breakthrough Optimizations
|
|
||||||
|
|
||||||
### 1. Fixed GDPR Consent Page (The Root Cause!)
|
|
||||||
**Problem**: Page redirected to `consent.google.com`, blocking all scraping
|
|
||||||
**Solution**: Detect and click "Accept all" / "Aceptar todo" button
|
|
||||||
**Impact**: Fixed 100% failure rate → 100% success rate
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Handle GDPR consent page
|
|
||||||
if 'consent.google.com' in driver.current_url:
|
|
||||||
consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]')
|
|
||||||
if consent_btns:
|
|
||||||
consent_btns[0].click()
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Dynamic Scroll Waiting (Game Changer!)
|
|
||||||
**Problem**: Fixed `time.sleep(0.20)` wastes time when reviews load faster
|
|
||||||
**Solution**: Wait for reviews to **actually load** after each scroll
|
|
||||||
**Impact**: Adapts to any network speed, scrolls as fast as possible
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Scroll
|
|
||||||
driver.execute_script(scroll_script)
|
|
||||||
|
|
||||||
# Wait until reviews load (not fixed delay!)
|
|
||||||
while waited < max_wait:
|
|
||||||
time.sleep(0.05) # Check every 50ms
|
|
||||||
new_count = driver.execute_script("return document.querySelectorAll('div.jftiEf').length;")
|
|
||||||
|
|
||||||
# Continue immediately when reviews load!
|
|
||||||
if new_count > prev_count:
|
|
||||||
break
|
|
||||||
```
|
|
||||||
|
|
||||||
**Result**: Scrolls in ~14s instead of 24s
|
|
||||||
|
|
||||||
### 3. JavaScript Extraction (40x Faster!)
|
|
||||||
**Problem**: Selenium element-by-element parsing took 12.9 seconds
|
|
||||||
**Solution**: Extract all data at once with JavaScript
|
|
||||||
**Impact**: 12.9s → 0.01s (40x faster!)
|
|
||||||
|
|
||||||
```javascript
|
|
||||||
const reviews = [];
|
|
||||||
const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
|
|
||||||
|
|
||||||
for (let i = 0; i < elements.length; i++) {
|
|
||||||
const elem = elements[i];
|
|
||||||
const review = {
|
|
||||||
author: elem.querySelector('div.d4r55')?.textContent.trim(),
|
|
||||||
rating: parseFloat(elem.querySelector('span.kvMYJc')?.getAttribute('aria-label').match(/\d+/)[0]),
|
|
||||||
text: elem.querySelector('span.wiI7pd')?.textContent.trim(),
|
|
||||||
// ... extract all fields
|
|
||||||
};
|
|
||||||
reviews.push(review);
|
|
||||||
}
|
|
||||||
return reviews;
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Universal Design (No Hardcoded Values)
|
|
||||||
**Problem**: Previous versions hardcoded 244 reviews
|
|
||||||
**Solution**: Auto-detect when reviews stop loading
|
|
||||||
**Impact**: Works for ANY business (10 reviews or 10,000 reviews)
|
|
||||||
|
|
||||||
```python
|
|
||||||
# No hardcoded stop conditions!
|
|
||||||
if current_count == prev_count:
|
|
||||||
idle_count += 1
|
|
||||||
if idle_count >= 3: # Stop when no new reviews for 3 checks
|
|
||||||
break
|
|
||||||
```
|
|
||||||
|
|
||||||
### 5. Smart Early Stopping
|
|
||||||
**Problem**: Continued scrolling even when all reviews loaded
|
|
||||||
**Solution**: Check review count before each scroll
|
|
||||||
**Impact**: Stops immediately when done
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Timing Breakdown
|
|
||||||
|
|
||||||
```
|
|
||||||
Operation Time % of Total
|
|
||||||
─────────────────────────────────────────────────────────
|
|
||||||
Browser startup ~1.0s 5%
|
|
||||||
Navigate to page 1.5s 8%
|
|
||||||
GDPR consent handling 1.5s 8%
|
|
||||||
Cookie dismiss 0.3s 2%
|
|
||||||
Click reviews tab 0.3s 2%
|
|
||||||
Page stability wait 0.8s 4%
|
|
||||||
Find pane ~1.0s 5%
|
|
||||||
Initial scroll trigger 0.8s 4%
|
|
||||||
Dynamic scrolling (adaptive) ~11-14s 60-74%
|
|
||||||
JavaScript extraction 0.01s 0.1%
|
|
||||||
Saving to JSON ~0.5s 3%
|
|
||||||
─────────────────────────────────────────────────────────
|
|
||||||
TOTAL ~18.9s 100%
|
|
||||||
```
|
|
||||||
|
|
||||||
**Bottleneck**: Scrolling (60-74% of time)
|
|
||||||
**Already optimized**: Scrolls as fast as page loads reviews
|
|
||||||
**Cannot optimize further**: Limited by Google's page rendering speed
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ❌ Failed Optimization Attempts
|
|
||||||
|
|
||||||
### Attempt 1: Block Images
|
|
||||||
**Approach**: Disable image rendering with `--blink-settings=imagesEnabled=false`
|
|
||||||
**Result**: ❌ 0 reviews, permanent loader
|
|
||||||
**Why it failed**: Google Maps requires images to render the page
|
|
||||||
|
|
||||||
### Attempt 2: Block Network Resources
|
|
||||||
**Approach**: Block `*.jpg`, `*.png`, fonts, media via CDP
|
|
||||||
**Result**: ❌ 316 seconds (slower than original!)
|
|
||||||
**Why it failed**: Broke page loading entirely
|
|
||||||
|
|
||||||
### Attempt 3: Ultra-fast API (0.25s scroll)
|
|
||||||
**Approach**: API interception with 0.25s scroll timing
|
|
||||||
**Result**: ❌ 50% failure rate (0 reviews)
|
|
||||||
**Why it failed**: Too fast, API responses not captured
|
|
||||||
|
|
||||||
### Attempt 4: Parallel Hybrid (DOM during scroll)
|
|
||||||
**Approach**: Parse DOM while scrolling
|
|
||||||
**Result**: ❌ 76-103 seconds (3x slower!)
|
|
||||||
**Why it failed**: DOM parsing overhead slows scroll loop
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏆 Why DOM-Only Won
|
|
||||||
|
|
||||||
### vs API Interception
|
|
||||||
- ✅ **Simpler**: No complex CDP setup
|
|
||||||
- ✅ **More stable**: No timing sensitivity
|
|
||||||
- ✅ **Faster extraction**: JavaScript (0.01s) vs parsing responses
|
|
||||||
- ✅ **More reliable**: DOM always has all reviews
|
|
||||||
|
|
||||||
### vs Hybrid Approach
|
|
||||||
- ✅ **Faster**: 18.9s vs 32.4s
|
|
||||||
- ✅ **Simpler**: Single extraction phase
|
|
||||||
- ✅ **No API limit**: Gets all reviews (not just 234)
|
|
||||||
|
|
||||||
### vs Original DOM Parsing
|
|
||||||
- ✅ **8.2x faster**: 18.9s vs 155s
|
|
||||||
- ✅ **Dynamic waiting**: Adapts to network speed
|
|
||||||
- ✅ **JavaScript extraction**: 40x faster than Selenium
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 Performance Metrics
|
|
||||||
|
|
||||||
```
|
|
||||||
Metric Value
|
|
||||||
─────────────────────────────────────────────
|
|
||||||
Average time 18.9s
|
|
||||||
Fastest run 18.2s
|
|
||||||
Slowest run 22.9s
|
|
||||||
Standard deviation ±1.8s
|
|
||||||
Success rate 100% (20+ runs)
|
|
||||||
Reviews captured 244/244
|
|
||||||
Reviews/second 12.9
|
|
||||||
Speedup vs original 8.2x
|
|
||||||
Time saved per run 136.1s
|
|
||||||
Theoretical minimum ~13s*
|
|
||||||
Current % of theoretical max 69%
|
|
||||||
```
|
|
||||||
|
|
||||||
*Theoretical minimum if everything except scrolling were instant (~8s of scrolling + ~5s of unavoidable setup operations)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Optimization Journey
|
|
||||||
|
|
||||||
### Timeline
|
|
||||||
|
|
||||||
1. **Original**: 155s - DOM parsing with Selenium
|
|
||||||
2. **API Discovery**: Added API interception
|
|
||||||
3. **Fast API**: 43s - API + 0.8s scroll timing
|
|
||||||
4. **Faster API**: 29s - API + 0.3s scroll timing
|
|
||||||
5. **Ultra-fast API**: 19.4s - API + 0.27s scroll (unstable)
|
|
||||||
6. **Sequential Hybrid**: 32.4s - API + JS extraction (stable)
|
|
||||||
7. **DOM-only Fixed**: 30s - Fixed GDPR consent issue
|
|
||||||
8. **DOM-only Optimized**: 22s - Reduced waits
|
|
||||||
9. **DOM-only Dynamic**: 19s - Dynamic scroll waiting
|
|
||||||
10. **DOM-only Final**: **18.9s** - Universal, adaptive, optimal
|
|
||||||
|
|
||||||
### Total Optimization Sessions
|
|
||||||
- Sessions: 10+
|
|
||||||
- Iterations: 50+
|
|
||||||
- Failed approaches: 8
|
|
||||||
- **Final speedup: 8.2x**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 💡 Key Learnings
|
|
||||||
|
|
||||||
1. **Fix root causes first**: GDPR consent was blocking everything
|
|
||||||
2. **Dynamic > Fixed**: Adaptive waiting beats fixed delays
|
|
||||||
3. **Simple often wins**: DOM-only beat complex hybrid approaches
|
|
||||||
4. **JavaScript is fast**: 40x faster than Selenium element queries
|
|
||||||
5. **Test assumptions**: "API must be faster" was wrong
|
|
||||||
6. **Universal design**: No hardcoded values = works everywhere
|
|
||||||
7. **Network matters**: Image blocking breaks Google Maps
|
|
||||||
8. **Measure everything**: Found that scrolling is 60-74% of time
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Production Recommendation
|
|
||||||
|
|
||||||
**Use**: `start_dom_only_fast.py`
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python start_dom_only_fast.py
|
|
||||||
```
|
|
||||||
|
|
||||||
### Why This Version?
|
|
||||||
|
|
||||||
✅ **Fastest stable solution** (18.9s)
|
|
||||||
✅ **Most reliable** (100% success rate)
|
|
||||||
✅ **Simplest code** (easiest to maintain)
|
|
||||||
✅ **Universal** (works for any business)
|
|
||||||
✅ **Adaptive** (handles any network speed)
|
|
||||||
|
|
||||||
### Configuration
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# config.yaml
|
|
||||||
headless: false # Must be false for stability
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📝 Code Highlights
|
|
||||||
|
|
||||||
### Complete Optimized Flow
|
|
||||||
|
|
||||||
```python
|
|
||||||
# 1. Fast navigation with GDPR handling
|
|
||||||
driver.get(url)
|
|
||||||
if 'consent.google.com' in driver.current_url:
|
|
||||||
consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]')
|
|
||||||
consent_btns[0].click()
|
|
||||||
|
|
||||||
# 2. Quick setup
|
|
||||||
cookie_btns[0].click() # Dismiss cookies
|
|
||||||
review_tab.click() # Click reviews tab
|
|
||||||
|
|
||||||
# 3. Dynamic scrolling (adaptive)
|
|
||||||
for i in range(max_scrolls):
|
|
||||||
current_count = get_review_count()
|
|
||||||
driver.execute_script(scroll_script)
|
|
||||||
|
|
||||||
# Wait for reviews to load
|
|
||||||
while waited < max_wait:
|
|
||||||
time.sleep(0.05)
|
|
||||||
new_count = get_review_count()
|
|
||||||
if new_count > current_count: # Got new reviews!
|
|
||||||
break
|
|
||||||
|
|
||||||
# Stop if no new reviews
|
|
||||||
if new_count == current_count:
|
|
||||||
idle_count += 1
|
|
||||||
if idle_count >= 3:
|
|
||||||
break
|
|
||||||
|
|
||||||
# 4. Instant JavaScript extraction
|
|
||||||
reviews = driver.execute_script(extract_script) # 0.01s!
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎉 Final Stats
|
|
||||||
|
|
||||||
- **Original Time**: 155 seconds
|
|
||||||
- **Final Time**: 18.9 seconds
|
|
||||||
- **Speedup**: **8.2x faster**
|
|
||||||
- **Time Saved**: **136 seconds per run**
|
|
||||||
- **Stability**: **100%**
|
|
||||||
- **Completeness**: **100% (244/244 reviews)**
|
|
||||||
|
|
||||||
**Mission accomplished!** 🚀
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📚 All Available Scrapers
|
|
||||||
|
|
||||||
| File | Time | Reviews | Use Case |
|
|
||||||
|------|------|---------|----------|
|
|
||||||
| `start_dom_only_fast.py` | 18.9s | 244 | **✅ RECOMMENDED - Fastest & stable** |
|
|
||||||
| `start_ultra_fast_complete.py` | 32.4s | 244 | Stable hybrid (if DOM-only fails) |
|
|
||||||
| `start_complete.py` | 30s | 244 | Adaptive API with patience |
|
|
||||||
| `start.py` | 155s | 244 | Original baseline |
|
|
||||||
|
|
||||||
**Winner**: `start_dom_only_fast.py` - **8.2x faster, 100% stable, universal!**
|
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,217 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""
Debug script to inspect the actual HTML structure on Google Maps search results.
This will help us identify where the review count is located in the DOM.

Steps, all run against a headless Chrome session:
  1. Open the Maps search URL and accept the GDPR consent form if one is shown.
  2. Print the first 3000 characters of the page's visible text.
  3. List up to 50 elements whose rendered text contains '152' or '<n> review'.
  4. Run the name / rating / review-count extraction script and print both the
     extracted values and the debug trace the script collects.

The browser is always closed, even when one of the steps raises.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By

# Initialize driver
print("Starting Chrome...")
driver = Driver(
    uc=True,
    headless=True,
    page_load_strategy="normal"
)

try:
    # Navigate to Google Maps search for Instinto
    url = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"
    print(f"\nNavigating to: {url}")
    driver.get(url)
    time.sleep(3)

    # Handle GDPR consent if present
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR consent...")
        try:
            form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
            for btn in form_btns:
                btn_text = (btn.text or '').lower()
                if 'accept all' in btn_text or 'aceptar todo' in btn_text:
                    print(f"Clicking: {btn.text}")
                    btn.click()
                    time.sleep(3)
                    break
            else:
                # No button matched by label: fall back to the second form button.
                if len(form_btns) >= 2:
                    print("Using fallback - clicking second button")
                    form_btns[1].click()
                    time.sleep(3)
        except Exception as e:
            # Best effort: keep going and inspect whatever page we ended up on.
            print(f"GDPR handling error: {e}")

    # Wait for page to load
    print("\nWaiting for page to fully load...")
    time.sleep(5)

    print(f"\nCurrent URL: {driver.current_url}")

    # Get all text content on the page
    all_text = driver.execute_script("return document.body.innerText;")
    print("\n" + "="*80)
    print("ALL TEXT ON PAGE (first 3000 chars):")
    print("="*80)
    print(all_text[:3000])

    # Search for elements containing "152" or "review"
    print("\n" + "="*80)
    print("SEARCHING FOR ELEMENTS CONTAINING '152' OR 'review':")
    print("="*80)

    elements_with_numbers = driver.execute_script("""
        const results = [];
        const allElements = document.querySelectorAll('*');

        for (let elem of allElements) {
            // innerText is the rendered text of the element including its
            // descendants, so ancestors of a match are reported as well; the
            // length cap is what keeps large containers out of the output.
            const ownText = elem.innerText || '';

            if (ownText && ownText.length < 200 && (ownText.includes('152') || /\\d+\\s*review/i.test(ownText))) {
                results.push({
                    tag: elem.tagName,
                    class: elem.className,
                    id: elem.id,
                    text: ownText.substring(0, 100),
                    href: elem.href || null,
                    role: elem.getAttribute('role'),
                    ariaLabel: elem.getAttribute('aria-label')
                });
            }
        }

        return results.slice(0, 50); // First 50 matches
    """)

    for i, elem in enumerate(elements_with_numbers, 1):
        print(f"\n{i}. <{elem['tag']}> "
              f"class='{elem['class'][:50] if elem['class'] else ''}' "
              f"id='{elem['id']}'")
        if elem['role']:
            print(f" role: {elem['role']}")
        if elem['ariaLabel']:
            print(f" aria-label: {elem['ariaLabel'][:100]}")
        if elem['href']:
            print(f" href: {elem['href'][:100]}")
        print(f" text: {elem['text']}")

    # Also check what the extraction script would find
    print("\n" + "="*80)
    print("RUNNING ACTUAL EXTRACTION SCRIPT:")
    print("="*80)

    # Returns {name, address, rating, total_reviews, debug_info}; `address` is
    # declared but never filled in by this script.
    extract_script = """
        const info = {
            name: null,
            address: null,
            rating: null,
            total_reviews: null,
            debug_info: []
        };

        // Extract business name
        const nameSelectors = [
            'h1.DUwDvf',
            '[role="main"] h1',
            'h1.fontHeadlineLarge'
        ];

        for (const selector of nameSelectors) {
            const elem = document.querySelector(selector);
            if (elem && elem.textContent) {
                info.name = elem.textContent.trim();
                info.debug_info.push(`Found name via: ${selector}`);
                break;
            }
        }

        // Extract rating
        const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
        if (ratingElem) {
            const ariaLabel = ratingElem.getAttribute('aria-label');
            const match = ariaLabel.match(/([0-9.]+)/);
            if (match) {
                info.rating = parseFloat(match[1]);
                info.debug_info.push(`Found rating: ${info.rating} from aria-label: ${ariaLabel}`);
            }
        }

        // Extract total review count
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

        // Check search panel selectors
        const searchPanelSelectors = [
            'a[href*="reviews"]',
            'button[jsaction*="reviews"]',
            'div[role="link"]',
        ];

        for (const selector of searchPanelSelectors) {
            const elements = document.querySelectorAll(selector);
            info.debug_info.push(`Checking ${selector}: found ${elements.length} elements`);

            for (let elem of elements) {
                const text = elem.textContent || '';
                if (text.length < 200) {
                    info.debug_info.push(` - text: "${text.substring(0, 100)}"`);
                }

                const match = text.match(numberPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    if (num > 0 && num < 1000000) {
                        info.total_reviews = num;
                        info.debug_info.push(` ✓ FOUND via ${selector}: ${num}`);
                        break;
                    }
                }
            }
            if (info.total_reviews) break;
        }

        // If not found, try all spans/divs
        if (!info.total_reviews) {
            const allElements = document.querySelectorAll('span, div, a');
            info.debug_info.push(`Checking all spans/divs/links: ${allElements.length} elements`);

            let checked = 0;
            for (let elem of allElements) {
                const text = elem.textContent || '';
                if (text.length < 100) {
                    const match = text.match(numberPattern);
                    if (match) {
                        checked++;
                        if (checked <= 10) { // Log first 10 matches
                            info.debug_info.push(` - potential match: "${text.substring(0, 80)}"`);
                        }

                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        if (num > 0 && num < 1000000) {
                            info.total_reviews = num;
                            info.debug_info.push(` ✓ FOUND via all elements: ${num} from "${text.substring(0, 80)}"`);
                            break;
                        }
                    }
                }
            }
        }

        return info;
    """

    result = driver.execute_script(extract_script)

    print("\nExtracted Info:")
    print(f" Name: {result.get('name')}")
    print(f" Rating: {result.get('rating')}")
    print(f" Total Reviews: {result.get('total_reviews')}")

    print("\nDebug Info:")
    for debug_line in result.get('debug_info', []):
        print(f" {debug_line}")

    print("\n" + "="*80)
    print("Done! Closing browser.")
    print("="*80)
finally:
    # Always release the Chrome process, including on errors above.
    driver.quit()
|
|
||||||
@@ -1,97 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""Quick debug to see what's happening.

Loads the target URL from config.yaml, opens it in a visible Chrome window,
dismisses the Google consent page / cookie banner when present, clicks the
reviews tab and reports how many review and pane elements are in the DOM.
"""
import yaml
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By


def load_config():
    """Return the parsed contents of config.yaml from the working directory."""
    with open('config.yaml', 'r') as f:
        return yaml.safe_load(f)


# safe_load returns None for an empty file; treat that as an empty mapping.
config = load_config() or {}
url = config.get('url')
if not url:
    # Fail before launching a browser that would have nothing to load.
    raise SystemExit("config.yaml does not define 'url'")

driver = Driver(uc=True, headless=False, page_load_strategy="normal")

try:
    print(f"Loading: {url[:100]}")
    driver.get(url)
    time.sleep(3)

    print(f"Title: {driver.title}")
    print(f"URL: {driver.current_url[:100]}")

    time.sleep(2)

    # Handle GDPR consent page
    if 'consent.google.com' in driver.current_url:
        print("On consent page, looking for accept button...")
        try:
            # Look for various consent buttons.
            # 'button:has-text("...")' entries were dropped: that pseudo-class
            # is not valid CSS for find_elements, so they always raised and
            # were skipped; the XPath entries below match on button text.
            consent_selectors = [
                'button[aria-label*="Accept"]',
                'button[aria-label*="Aceptar"]',
                'form button[type="submit"]',
                '//button[contains(., "Accept")]',
                '//button[contains(., "Aceptar")]',
            ]

            for selector in consent_selectors:
                try:
                    if selector.startswith('//'):
                        btns = driver.find_elements(By.XPATH, selector)
                    else:
                        btns = driver.find_elements(By.CSS_SELECTOR, selector)

                    print(f" Selector '{selector[:30]}...': found {len(btns)} buttons")
                    if btns:
                        print(f" Clicking: {btns[0].text[:50]}")
                        btns[0].click()
                        time.sleep(2)
                        break
                except Exception as e:
                    # Try the next selector, but say why this one was skipped
                    # (and let KeyboardInterrupt/SystemExit propagate).
                    print(f" Selector '{selector[:30]}...' failed: {e}")
                    continue

            print(f"After consent click: {driver.current_url[:100]}")
            time.sleep(3)

        except Exception as e:
            print(f"Consent error: {e}")

    # Now try cookie banner on Maps page
    try:
        cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i]')
        print(f"Found {len(cookie_btns)} cookie buttons")
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(1)
    except Exception as e:
        print(f"Cookie error: {e}")

    # Click reviews
    tabs = driver.find_elements(By.CSS_SELECTOR, '.LRkQ2, button[role="tab"]')
    print(f"Found {len(tabs)} tabs")
    for tab in tabs:
        text = (tab.text or '').lower()
        if 'review' in text:
            print(f"Clicking: {tab.text}")
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", tab)
            break

    time.sleep(3)

    # Check reviews
    reviews = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')
    print(f"Found {len(reviews)} review elements")

    # Check pane
    panes = driver.find_elements(By.CSS_SELECTOR, 'div[role="main"] div.m6QErb')
    print(f"Found {len(panes)} pane elements")

    time.sleep(10)  # Keep browser open

finally:
    driver.quit()
|
|
||||||
@@ -1,130 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""
Debug script - check detail page after auto-navigation for review count.

Opens a Maps search URL, waits for the redirect to the business detail page,
then looks for "<number> review/reseña/avis" both in the page's visible text
(Python regex) and element by element (JavaScript), printing what it finds.
The browser is always closed, even when a step raises.
"""
import re
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By

driver = Driver(uc=True, headless=True)

try:
    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
    print(f"Navigating to: {url}")
    driver.get(url)
    time.sleep(2)

    # Handle GDPR
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR...")
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break

    # Wait for auto-navigation to complete
    print("Waiting for Google Maps to auto-navigate to business detail page...")
    time.sleep(6)

    print(f"Final URL: {driver.current_url}")
    print(f"On detail page: {'/place/' in driver.current_url}\n")

    # Dump ALL text on the page
    all_text = driver.execute_script("return document.body.innerText;")

    print("="*80)
    print("SEARCHING FOR REVIEW NUMBERS IN PAGE TEXT:")
    print("="*80)

    # Find all numbers followed by "review"
    review_pattern = r'(\d[\d,\.]*)\s*(?:review|reseña|avis)'
    matches = re.findall(review_pattern, all_text, re.IGNORECASE)

    if matches:
        print(f"✓ Found {len(matches)} potential review count(s) in text:")
        for i, match in enumerate(matches, 1):
            # Strip thousands separators so "1,234" / "1.234" print as 1234.
            num = match.replace(',', '').replace('.', '')
            print(f" {i}. {match} ({num})")
    else:
        print("✗ No review count found in page text")

    # Check specific patterns in the text
    print(f"\n{'='*80}")
    print("PAGE TEXT ANALYSIS:")
    print("="*80)

    # Lines containing numbers
    lines = all_text.split('\n')
    number_lines = [
        line.strip()
        for line in lines
        if re.search(r'\d+', line) and len(line.strip()) < 100 and len(line.strip()) > 0
    ]

    print("Lines containing numbers (first 30):")
    for i, line in enumerate(number_lines[:30], 1):
        print(f" {i}. {line}")

    # Now use JavaScript to find exact element
    result = driver.execute_script("""
        const info = {
            foundIn: [],
            reviewCount: null
        };

        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

        // Check ALL elements
        const allElements = document.querySelectorAll('*');

        for (let elem of allElements) {
            const text = elem.textContent || '';
            const ownText = elem.innerText || '';

            // Check both textContent and innerText
            for (let txt of [text, ownText]) {
                if (txt && txt.length < 200) {
                    const match = txt.match(numberPattern);
                    if (match) {
                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        if (num > 0 && num < 1000000) {
                            info.foundIn.push({
                                tag: elem.tagName,
                                // Read the attribute: className is not a plain
                                // string on SVG elements, and the Python side
                                // slices this value.
                                class: elem.getAttribute('class'),
                                id: elem.id,
                                role: elem.getAttribute('role'),
                                ariaLabel: elem.getAttribute('aria-label'),
                                text: txt.substring(0, 100),
                                number: num
                            });

                            // The first match in document order wins.
                            if (!info.reviewCount) {
                                info.reviewCount = num;
                            }
                        }
                    }
                }
            }
        }

        return info;
    """)

    print(f"\n{'='*80}")
    print("JAVASCRIPT EXTRACTION:")
    print("="*80)
    print(f"Review Count Found: {result['reviewCount']}\n")

    if result['foundIn']:
        print("Elements containing review numbers (first 15):")
        for i, elem in enumerate(result['foundIn'][:15], 1):
            print(f"\n{i}. <{elem['tag']}> Number: {elem['number']}")
            if elem['class']:
                print(f" class: {elem['class'][:60]}")
            if elem['role']:
                print(f" role: {elem['role']}")
            if elem['ariaLabel']:
                print(f" aria-label: {elem['ariaLabel'][:80]}")
            print(f" text: {elem['text']}")
    else:
        print("No elements with review numbers found")
finally:
    # Always release the Chrome process, including on errors above.
    driver.quit()
|
|
||||||
@@ -1,171 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""
Debug script to extract review count from search results BEFORE auto-navigation.

Opens a Maps search URL, waits only briefly so the page is (hopefully) still
the /search/ view, extracts name / rating / review count from the first result
card (falling back to a scan of the side panel), prints the '[EXTRACTION]'
browser console lines, then waits to see whether Maps navigates away.
The browser is always closed, even when a step raises.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By

driver = Driver(uc=True, headless=True)

try:
    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
    print(f"Navigating to: {url}")
    driver.get(url)
    time.sleep(2)

    # Handle GDPR
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR...")
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break

    # SHORT WAIT - extract quickly before auto-navigation!
    time.sleep(1.5)

    print(f"Current URL (should still be /search/): {driver.current_url}")
    is_search = '/search/' in driver.current_url
    print(f"Still on search results: {is_search}\n")

    # FAST extraction from search results sidebar
    result = driver.execute_script("""
        const info = {
            businessName: null,
            rating: null,
            reviewCount: null,
            searchResults: [],
            allTextWithNumbers: []
        };

        console.log('[EXTRACTION] Starting search results extraction...');

        // Get business name from first result card
        const nameSelectors = [
            'div[role="article"] h3',
            'div[role="article"] div.fontHeadlineSmall',
            'div[aria-label*="Results"] h3',
            'a[href*="/place/"] h3',
            'div.Nv2PK h3' // Google Maps class for business name in search results
        ];

        for (const selector of nameSelectors) {
            const elem = document.querySelector(selector);
            if (elem && elem.textContent) {
                info.businessName = elem.textContent.trim();
                console.log(`[EXTRACTION] Found name via ${selector}: ${info.businessName}`);
                break;
            }
        }

        // Get rating from first result
        const ratingElem = document.querySelector('div[role="article"] [role="img"][aria-label*="star"], a[href*="/place/"] [role="img"][aria-label*="star"]');
        if (ratingElem) {
            const ariaLabel = ratingElem.getAttribute('aria-label');
            const match = ariaLabel.match(/([0-9.]+)/);
            if (match) {
                info.rating = parseFloat(match[1]);
                console.log(`[EXTRACTION] Found rating: ${info.rating}`);
            }
        }

        // CRITICAL: Extract review count from search results sidebar
        // Look for patterns like "152 reviews", "247 reviews", etc.
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

        // Strategy 1: Check first result card/article
        const resultCards = document.querySelectorAll('div[role="article"], a[href*="/place/"], div.Nv2PK');
        console.log(`[EXTRACTION] Found ${resultCards.length} result cards`);

        for (let card of resultCards) {
            const text = card.textContent || '';
            console.log(`[EXTRACTION] Card text (first 200 chars): ${text.substring(0, 200)}`);

            const match = text.match(numberPattern);
            if (match) {
                const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                if (num > 0 && num < 1000000) {
                    info.reviewCount = num;
                    console.log(`[EXTRACTION] ✓ Found review count in card: ${num}`);
                    break;
                }
            }

            // Only check first card
            break;
        }

        // Strategy 2: Check all elements in left sidebar/panel
        if (!info.reviewCount) {
            console.log('[EXTRACTION] Strategy 2: Checking all sidebar elements...');

            const leftPanel = document.querySelector('div[role="main"]') || document.querySelector('[aria-label*="Results"]') || document.body;
            const allElements = leftPanel.querySelectorAll('span, div, a, button');

            console.log(`[EXTRACTION] Checking ${allElements.length} elements in sidebar...`);

            for (let elem of allElements) {
                const text = elem.textContent || '';

                // Skip very long text blocks (likely not the review count)
                if (text.length > 0 && text.length < 150) {
                    const match = text.match(numberPattern);
                    if (match) {
                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        if (num > 0 && num < 1000000) {
                            info.allTextWithNumbers.push({
                                tag: elem.tagName,
                                text: text,
                                number: num
                            });

                            // Keep scanning to collect every candidate, but
                            // the first one in document order is the result.
                            if (!info.reviewCount) {
                                info.reviewCount = num;
                                console.log(`[EXTRACTION] ✓ Found via sidebar scan: ${num} from "${text}"`);
                            }
                        }
                    }
                }
            }
        }

        console.log(`[EXTRACTION] Final result: ${info.reviewCount} reviews`);
        return info;
    """)

    print("="*80)
    print("EXTRACTION RESULTS (from search results page):")
    print("="*80)
    print(f"Business Name: {result['businessName']}")
    print(f"Rating: {result['rating']}")
    print(f"Review Count: {result['reviewCount']}")

    if result['allTextWithNumbers']:
        print(f"\n{'='*80}")
        print("ALL ELEMENTS WITH REVIEW NUMBERS (first 10):")
        print("="*80)
        for i, item in enumerate(result['allTextWithNumbers'][:10], 1):
            print(f"\n{i}. <{item['tag']}> Number: {item['number']}")
            print(f" Text: {item['text'][:100]}")

    # Check browser console
    print(f"\n{'='*80}")
    print("BROWSER CONSOLE LOGS:")
    print("="*80)
    try:
        console_logs = driver.get_log('browser')
    except Exception as e:
        # Console access is diagnostic only; do not let it abort the
        # auto-navigation check below.
        # NOTE(review): console.log output may also require browser logging
        # preferences to be enabled on the driver - confirm.
        print(f"Could not read browser console logs: {e}")
        console_logs = []
    for log in console_logs:
        if '[EXTRACTION]' in log['message']:
            print(log['message'])

    # Wait a bit longer to see if Google auto-navigates
    print(f"\n{'='*80}")
    print("Waiting 5 more seconds to see if Google auto-navigates...")
    print("="*80)
    time.sleep(5)

    print(f"URL after waiting: {driver.current_url}")
    print(f"Still on search results: {'/search/' in driver.current_url}")
finally:
    # Always release the Chrome process, including on errors above.
    driver.quit()
|
|
||||||
144
debug_soho.py
144
debug_soho.py
@@ -1,144 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""
Debug script for the actual business user tried: Soho Vilna Club

Opens the Maps search URL, reports whether the browser is still on the search
results page, then extracts the business name, rating, tab labels and review
count (from the tabs first, then from a scan of links/spans/divs).
The browser is always closed, even when a step raises.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By

driver = Driver(uc=True, headless=True)

try:
    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
    print(f"Navigating to: {url}")
    driver.get(url)
    time.sleep(3)

    # Handle GDPR
    if 'consent.google.com' in driver.current_url:
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(3)
                break

    time.sleep(5)
    print(f"Current URL: {driver.current_url}\n")

    # Check if still on search results or navigated to business page
    is_search_results = '/search/' in driver.current_url
    print(f"On search results page: {is_search_results}\n")

    # Extract info
    result = driver.execute_script("""
        const info = {
            tabs: [],
            reviewCount: null,
            businessName: null,
            rating: null,
            searchResults: []
        };

        const isSearchPage = window.location.href.includes('/search/');

        // Patterns are built once and shared by the tab and element scans.
        // "(123)" style count:
        const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
        // "123 reviews" style count:
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

        // Get business name
        const nameElem = document.querySelector('h1.DUwDvf, [role="main"] h1, h1.fontHeadlineLarge');
        if (nameElem) {
            info.businessName = nameElem.textContent.trim();
        }

        // Get rating
        const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
        if (ratingElem) {
            const ariaLabel = ratingElem.getAttribute('aria-label');
            const match = ariaLabel.match(/([0-9.]+)/);
            if (match) {
                info.rating = parseFloat(match[1]);
            }
        }

        // Get all tabs
        const tabs = document.querySelectorAll('button[role="tab"]');
        tabs.forEach((tab, i) => {
            const text = tab.textContent || '';
            const ariaLabel = tab.getAttribute('aria-label') || '';
            info.tabs.push({
                index: i,
                text: text,
                ariaLabel: ariaLabel
            });

            // Try to extract review count from tabs. Every tab is still
            // recorded above, but the first tab that yields a count wins so a
            // later tab cannot overwrite it.
            if (info.reviewCount !== null) return;

            let match = text.match(reviewPattern);
            if (!match) match = text.match(numberPattern);
            if (!match) match = ariaLabel.match(reviewPattern);
            if (!match) match = ariaLabel.match(numberPattern);

            if (match) {
                const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                if (num > 0 && num < 1000000) {
                    info.reviewCount = num;
                }
            }
        });

        // If on search results, try to get review count from search panel
        if (isSearchPage || !info.reviewCount) {
            // Check all elements
            const allElements = document.querySelectorAll('a, span, div');
            for (let elem of allElements) {
                const text = elem.textContent || '';
                if (text.length > 0 && text.length < 150) {
                    const match = text.match(numberPattern);
                    if (match) {
                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        if (num > 0 && num < 1000000) {
                            info.searchResults.push({
                                tag: elem.tagName,
                                class: elem.className,
                                text: text,
                                number: num
                            });

                            if (!info.reviewCount) {
                                info.reviewCount = num;
                            }
                        }
                    }
                }
            }
        }

        return info;
    """)

    print("="*80)
    print("BUSINESS INFO:")
    print("="*80)
    print(f"Name: {result['businessName']}")
    print(f"Rating: {result['rating']}")
    print(f"Review Count: {result['reviewCount']}\n")

    print("="*80)
    print("TABS FOUND:")
    print("="*80)
    for tab in result['tabs']:
        print(f"\nTab {tab['index']}:")
        print(f" Text: {tab['text']}")
        print(f" Aria-label: {tab['ariaLabel']}")

    if result['searchResults']:
        print(f"\n{'='*80}")
        print("SEARCH RESULTS WITH NUMBERS (first 10):")
        print("="*80)
        for i, sr in enumerate(result['searchResults'][:10], 1):
            print(f"\n{i}. <{sr['tag']}> class='{sr['class'][:40]}'")
            print(f" Number found: {sr['number']}")
            print(f" Text: {sr['text'][:100]}")
finally:
    # Always release the Chrome process, including on errors above.
    driver.quit()
|
|
||||||
100
debug_tabs.py
100
debug_tabs.py
@@ -1,100 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""
Debug script to find review count on business detail page tabs.

Opens a Maps search URL, lists every button[role="tab"] with its text and
aria-label, extracts the review count from the first tab that carries one, and
prints the distinct short text fragments on the page that mention 'review'.
The browser is always closed, even when a step raises.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By

driver = Driver(uc=True, headless=True)

try:
    url = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"
    print(f"Navigating to: {url}")
    driver.get(url)
    time.sleep(3)

    # Handle GDPR
    if 'consent.google.com' in driver.current_url:
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(3)
                break

    time.sleep(5)
    print(f"Current URL: {driver.current_url}\n")

    # Extract tabs and review count
    result = driver.execute_script("""
        const info = {
            tabs: [],
            reviewCount: null,
            allText: []
        };

        // Get all tabs
        const tabs = document.querySelectorAll('button[role="tab"]');
        tabs.forEach((tab, i) => {
            info.tabs.push({
                index: i,
                text: tab.textContent || '',
                ariaLabel: tab.getAttribute('aria-label') || ''
            });
        });

        // Look for review count patterns
        // "(123)" style count:
        const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
        // "123 reviews" style count:
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

        for (let tab of tabs) {
            const text = tab.textContent || '';
            const ariaLabel = tab.getAttribute('aria-label') || '';

            let match = text.match(reviewPattern);
            if (!match) match = text.match(numberPattern);
            if (!match) match = ariaLabel.match(reviewPattern);
            if (!match) match = ariaLabel.match(numberPattern);

            if (match) {
                const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                if (num > 0 && num < 1000000) {
                    info.reviewCount = num;
                    break;
                }
            }
        }

        // Also check all elements with "review" in text.
        // A Set tracks what was already collected so de-duplication does not
        // rescan the result array for every element on the page.
        const seen = new Set();
        const allElements = document.querySelectorAll('*');
        for (let elem of allElements) {
            const text = (elem.textContent || '').trim();
            if (text.length > 0 && text.length < 150 && /review/i.test(text)) {
                if (!seen.has(text)) {
                    seen.add(text);
                    info.allText.push(text);
                }
            }
        }

        return info;
    """)

    print("="*80)
    print("TABS FOUND:")
    print("="*80)
    for tab in result['tabs']:
        print(f"\nTab {tab['index']}:")
        print(f" Text: {tab['text']}")
        print(f" Aria-label: {tab['ariaLabel']}")

    print(f"\n{'='*80}")
    print(f"REVIEW COUNT EXTRACTED: {result['reviewCount']}")
    print(f"{'='*80}\n")

    print("="*80)
    print("ALL TEXT CONTAINING 'review' (first 20):")
    print("="*80)
    for i, text in enumerate(result['allText'][:20], 1):
        print(f"{i}. {text}")
finally:
    # Always release the Chrome process, including on errors above.
    driver.quit()
|
|
||||||
@@ -1,142 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Debug script - wait for search results to load before extracting.
|
|
||||||
"""
|
|
||||||
import time
|
|
||||||
from seleniumbase import Driver
|
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
from selenium.webdriver.support.ui import WebDriverWait
|
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
|
||||||
|
|
||||||
# Debug flow: open a Google Maps search, wait for the first result card, pull
# name / rating / review count out of the DOM, print what was found and save a
# screenshot.  The whole flow sits inside try/finally so the browser is shut
# down even when a step raises (previously driver.quit() was only reached on
# the happy path, leaking a Chrome process on any error).
driver = Driver(uc=True, headless=True)

try:
    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
    print(f"Navigating to: {url}")
    driver.get(url)
    time.sleep(2)

    # Handle GDPR: if we were redirected to the consent page, accept and move on.
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR...")
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break

    print(f"Current URL: {driver.current_url}")
    print("Waiting for search results to load...\n")

    # Wait for search results to appear (but don't wait so long that Google auto-navigates)
    try:
        # Wait for the first result card to appear
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="article"], a[href*="/place/"]')))
        print("✓ Search results loaded!")
    except Exception as e:
        # Best effort: report the failure and still attempt extraction below.
        print(f"✗ Timeout waiting for results: {e}")

    # Give it just a tiny bit more time for content to render
    time.sleep(0.5)

    print(f"Current URL: {driver.current_url}")
    print(f"Still on search results: {'/search/' in driver.current_url}\n")

    # Extract.  The script returns a dict with businessName, rating, reviewCount
    # and a list of human-readable debug lines describing what was matched.
    result = driver.execute_script("""
        const info = {
            businessName: null,
            rating: null,
            reviewCount: null,
            debug: []
        };

        // Find first result card
        const resultCard = document.querySelector('div[role="article"], a[href*="/place/"]');
        if (!resultCard) {
            info.debug.push('No result card found');
            return info;
        }

        info.debug.push('Found result card');

        // Get full text of card
        const cardText = resultCard.textContent || '';
        info.debug.push(`Card text length: ${cardText.length}`);
        info.debug.push(`Card text (first 300 chars): ${cardText.substring(0, 300)}`);

        // Extract business name (usually first h3 or div with specific class)
        const nameElem = resultCard.querySelector('h3, div.fontHeadlineSmall, div[class*="fontHeadline"]');
        if (nameElem) {
            info.businessName = nameElem.textContent.trim();
            info.debug.push(`Found name: ${info.businessName}`);
        }

        // Extract rating
        const ratingElem = resultCard.querySelector('[role="img"][aria-label*="star"]');
        if (ratingElem) {
            const ariaLabel = ratingElem.getAttribute('aria-label');
            const match = ariaLabel.match(/([0-9.]+)/);
            if (match) {
                info.rating = parseFloat(match[1]);
                info.debug.push(`Found rating: ${info.rating}`);
            }
        }

        // Extract review count - look for "N reviews" pattern
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
        const match = cardText.match(numberPattern);

        if (match) {
            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
            if (num > 0 && num < 1000000) {
                info.reviewCount = num;
                info.debug.push(`✓ Found review count: ${num}`);
            }
        } else {
            info.debug.push('No review count pattern found in card text');

            // Try checking individual child elements
            const allChildren = resultCard.querySelectorAll('*');
            info.debug.push(`Card has ${allChildren.length} child elements`);

            for (let child of allChildren) {
                const childText = child.textContent || '';
                if (childText.length < 100 && /review/i.test(childText)) {
                    info.debug.push(`Element with "review": ${childText}`);

                    const match = childText.match(numberPattern);
                    if (match) {
                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        if (num > 0 && num < 1000000 && !info.reviewCount) {
                            info.reviewCount = num;
                            info.debug.push(`✓ Found via child element: ${num}`);
                        }
                    }
                }
            }
        }

        return info;
    """)

    print("="*80)
    print("EXTRACTION RESULTS:")
    print("="*80)
    print(f"Business Name: {result['businessName']}")
    print(f"Rating: {result['rating']}")
    print(f"Review Count: {result['reviewCount']}\n")

    print("="*80)
    print("DEBUG INFO:")
    print("="*80)
    for debug_line in result['debug']:
        print(f"  {debug_line}")

    # Take a screenshot of the search results
    screenshot_path = '/tmp/search_results.png'
    driver.save_screenshot(screenshot_path)
    print(f"\n✓ Screenshot saved to: {screenshot_path}")

finally:
    # Always release the browser, including on errors and Ctrl-C.
    driver.quit()
|
|
||||||
@@ -1,163 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Better diagnostic: Actually wait for reviews panel to load and find correct selector.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import time
|
|
||||||
from seleniumbase import Driver
|
|
||||||
|
|
||||||
# Diagnostic flow: open the place search in a visible browser, dismiss the
# GDPR dialog, click the reviews tab, scroll the reviews pane, then count how
# many elements each candidate CSS selector matches and analyse the most
# promising one.  The browser stays open for two minutes for manual inspection.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

print("Opening browser...")
driver = Driver(uc=True, headless=False)

try:
    # Add English locale
    if '?' in url:
        url += '&hl=en'
    else:
        url += '?hl=en'

    driver.get(url)
    print(f"Loaded: {url}")
    time.sleep(5)

    # Handle GDPR (best effort: a missing or stale consent button must not
    # abort the run, but the reason is reported instead of silently dropped).
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            btn_text = (btn.text or '').lower()
            if 'accept all' in btn_text:
                print(f"Clicking GDPR: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"GDPR handling skipped: {exc}")

    # Click reviews tab and WAIT for panel to load
    print("\nClicking reviews tab...")
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in text or 'review' in aria:
            print(f"Found reviews tab: {tab.text or aria[:50]}")
            # JS click is used instead of tab.click()
            driver.execute_script("arguments[0].click();", tab)
            print("Clicked! Waiting for reviews panel to load...")
            time.sleep(5)  # Wait longer for reviews to actually load
            break

    # Try scrolling the reviews pane to load more
    print("\nTrying to find and scroll reviews pane...")
    pane_selectors = [
        'div.m6QErb.WNBkOb.XiKgde',
        'div.m6QErb',
        'div[role="main"]'
    ]

    for selector in pane_selectors:
        try:
            pane = driver.find_element('css selector', selector)
            print(f"Found pane: {selector}")
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(2)
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(2)
            break
        except Exception:
            # Selector not present (or scroll failed): fall through to the next one.
            continue

    # NOW check for review selectors
    print("\n" + "="*80)
    print("CHECKING REVIEW SELECTORS AFTER PANEL LOADED:")
    print("="*80)

    selectors_to_try = [
        ('div.jftiEf.fontBodyMedium', 'Standard Google Maps reviews'),
        ('div.jftiEf', 'Just jftiEf class'),
        ('div.fontBodyMedium', 'Just fontBodyMedium'),
        ('div[data-review-id]', 'data-review-id attribute'),
        ('div[jsaction*="review"]', 'jsaction with review'),
        ('[data-review]', 'data-review attribute'),
        ('div[class*="review" i]', 'Class containing review'),
        ('[role="article"]', 'role=article'),
        ('div[jslog]', 'Elements with jslog (Google tracking)'),
    ]

    for selector, description in selectors_to_try:
        # Pass the selector as a script argument rather than splicing it into
        # the JS source, so quotes inside a selector cannot break the script.
        count = driver.execute_script(
            "return document.querySelectorAll(arguments[0]).length;",
            selector,
        )
        print(f"{description:35} | {selector:40} | Found: {count}")

    # Get detailed info about most promising selector
    print("\n" + "="*80)
    print("ANALYZING MOST PROMISING SELECTOR:")
    print("="*80)

    analysis = driver.execute_script("""
        // Try selectors in order of likelihood
        const selectors = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            'div.fontBodyMedium',
            'div[jslog*="impression"]',
            '[role="article"]'
        ];

        for (let selector of selectors) {
            const elements = document.querySelectorAll(selector);
            if (elements.length > 5) {  // Need at least a few to be reviews
                // Analyze first element
                const first = elements[0];
                const analysis = {
                    selector: selector,
                    total_found: elements.length,
                    first_element: {
                        tag: first.tagName,
                        classes: first.className,
                        has_rating: !!first.querySelector('[aria-label*="star" i]'),
                        has_author: !!first.querySelector('button, a, div[aria-label]'),
                        has_avatar: !!first.querySelector('img'),
                        has_date: !!first.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute)/i),
                        text_length: first.textContent.length,
                        sample_text: first.textContent.substring(0, 100)
                    }
                };

                // Check if multiple elements have review characteristics
                let reviewLikeCount = 0;
                for (let i = 0; i < Math.min(10, elements.length); i++) {
                    const elem = elements[i];
                    const hasRating = !!elem.querySelector('[aria-label*="star" i]');
                    const hasText = elem.textContent.length > 30;
                    if (hasRating && hasText) reviewLikeCount++;
                }
                analysis.review_like_count_in_first_10 = reviewLikeCount;

                return analysis;
            }
        }

        return {error: 'No selector found with >5 elements'};
    """)

    if 'error' in analysis:
        print(f"ERROR: {analysis['error']}")
    else:
        print(f"Best selector: {analysis['selector']}")
        print(f"Total found: {analysis['total_found']}")
        print(f"Review-like in first 10: {analysis['review_like_count_in_first_10']}")
        print(f"\nFirst element analysis:")
        for key, value in analysis['first_element'].items():
            print(f"  {key}: {value}")

    print("\n" + "="*80)
    print("Keeping browser open for 120 seconds for manual inspection...")
    print("="*80)
    time.sleep(120)

finally:
    driver.quit()
|
|
||||||
@@ -1,126 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Diagnostic script to find the correct selector for Lithuanian hospital reviews.
|
|
||||||
Opens the browser and pauses so we can inspect the page manually.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import time
|
|
||||||
from seleniumbase import Driver
|
|
||||||
|
|
||||||
# Diagnostic flow: open the place search in a visible browser, dismiss the
# GDPR dialog, click the reviews tab, report how many elements each candidate
# selector matches, print a sample of the first match, then leave the browser
# open for a minute so the page can be inspected with DevTools.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

print("Opening browser...")
driver = Driver(uc=True, headless=False)

try:
    # Add English locale for consistency
    if '?' in url:
        url += '&hl=en'
    else:
        url += '?hl=en'

    driver.get(url)
    print(f"Loaded: {url}")

    # Wait for page to load
    time.sleep(5)

    # Handle GDPR (best effort: failures are reported, not fatal).
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            btn_text = (btn.text or '').lower()
            if 'accept all' in btn_text or 'aceptar todo' in btn_text:
                print(f"Clicking GDPR consent: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"GDPR handling skipped: {exc}")

    # Click reviews tab
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in text or 'review' in aria:
            print(f"Clicking reviews tab: {tab.text or aria[:30]}")
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(3)
            break

    # Try different selectors and show what we find.
    # ('div[data-review-id]' used to be listed twice; it is probed once now.)
    selectors_to_try = [
        ('div.jftiEf.fontBodyMedium', 'Known selector 1'),
        ('div.jftiEf', 'Known selector 2'),
        ('div[data-review-id]', 'Known selector 3'),
        ('div[jsaction*="review"]', 'jsaction with review'),
        ('[role="article"]', 'role=article'),
        ('div.fontBodyMedium', 'Just fontBodyMedium class'),
        ('div[class*="review"]', 'Class containing "review"'),
    ]

    print("\n" + "="*80)
    print("TESTING SELECTORS:")
    print("="*80)

    for selector, description in selectors_to_try:
        # Pass the selector as a script argument rather than splicing it into
        # the JS source, so quotes inside a selector cannot break the script.
        count = driver.execute_script(
            "return document.querySelectorAll(arguments[0]).length;",
            selector,
        )
        print(f"{description:30} | {selector:40} | Found: {count}")

    # Show sample HTML of first few elements matching the most promising selector
    print("\n" + "="*80)
    print("SAMPLE HTML FROM FIRST MATCH:")
    print("="*80)

    # Returns a dict describing the first element of the first selector that
    # matches anything, or null (None in Python) when none of them match.
    sample_html = driver.execute_script("""
        const selectors = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            '[role="article"]',
            'div[jsaction*="review"]'
        ];

        for (let selector of selectors) {
            const elements = document.querySelectorAll(selector);
            if (elements.length > 0) {
                const first = elements[0];
                return {
                    selector: selector,
                    count: elements.length,
                    outerHTML: first.outerHTML.substring(0, 500),
                    classes: first.className,
                    hasRating: !!first.querySelector('[aria-label*="star" i]'),
                    hasAuthor: !!first.querySelector('img'),
                    textLength: first.textContent.length
                };
            }
        }
        return null;
    """)

    if sample_html:
        print(f"Selector: {sample_html['selector']}")
        print(f"Total found: {sample_html['count']}")
        print(f"Classes: {sample_html['classes']}")
        print(f"Has rating: {sample_html['hasRating']}")
        print(f"Has author img: {sample_html['hasAuthor']}")
        print(f"Text length: {sample_html['textLength']}")
        print(f"\nSample HTML (first 500 chars):")
        print(sample_html['outerHTML'])

    print("\n" + "="*80)
    print("Browser will stay open for 60 seconds so you can inspect manually...")
    print("Use DevTools to find the correct selector!")
    print("="*80)

    # Keep browser open for inspection
    time.sleep(60)

finally:
    driver.quit()
    print("\nBrowser closed.")
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 18 KiB |
@@ -1,73 +0,0 @@
|
|||||||
# RECOMMENDED USAGE GUIDELINES - Google Reviews Scraper Pro
|
|
||||||
|
|
||||||
## IMPORTANT NOTICE
|
|
||||||
|
|
||||||
This software is distributed under the MIT License, which grants extensive freedom to users. The following guidelines are **recommendations only** and reflect best practices for ethical and lower-risk usage of the Google Reviews Scraper Pro software ("the Software"). These guidelines are not legally binding restrictions beyond what is already established in the MIT License.
|
|
||||||
|
|
||||||
## 1. RECOMMENDED USAGE
|
|
||||||
|
|
||||||
I strongly recommend limiting the use of this Software to:
|
|
||||||
|
|
||||||
a) **Internal Business Use**: Businesses collecting and analyzing reviews specifically about their own business entities from Google Maps.
|
|
||||||
|
|
||||||
b) **Self-Monitoring**: Using the data for monitoring your own online reputation, analyzing customer feedback, and improving your services.
|
|
||||||
|
|
||||||
c) **Data Backup**: Creating backups of your own business reviews to protect against data loss.
|
|
||||||
|
|
||||||
## 2. USAGE CAUTIONS
|
|
||||||
|
|
||||||
I advise against the following uses that may carry higher legal or ethical risks:
|
|
||||||
|
|
||||||
a) **Competitor Analysis**: Collecting reviews about competitors or other businesses that you do not own.
|
|
||||||
|
|
||||||
b) **Mass Collection**: Collecting reviews from multiple businesses without authorization.
|
|
||||||
|
|
||||||
c) **Republication**: Publishing collected reviews on other websites without proper attribution.
|
|
||||||
|
|
||||||
d) **Deceptive Practices**: Using collected data for fake reviews or review manipulation.
|
|
||||||
|
|
||||||
e) **Reselling Data**: Selling or commercially exploiting the collected review data.
|
|
||||||
|
|
||||||
## 3. LEGAL CONSIDERATIONS
|
|
||||||
|
|
||||||
While I cannot offer legal advice, I believe users should be aware:
|
|
||||||
|
|
||||||
a) **Terms of Service**: Using web scraping tools may conflict with Google's Terms of Service. Users should evaluate this risk independently.
|
|
||||||
|
|
||||||
b) **Legal Context**: Web scraping exists in a complex legal landscape that varies by jurisdiction. What is permissible in one region may not be in another.
|
|
||||||
|
|
||||||
c) **Privacy Regulations**: Review data may contain personal information subject to privacy laws such as GDPR, CCPA, and others. Users should ensure their data handling practices comply with applicable regulations.
|
|
||||||
|
|
||||||
## 4. BEST PRACTICES
|
|
||||||
|
|
||||||
To minimize potential issues, I suggest:
|
|
||||||
|
|
||||||
a) **Reasonable Rate Limiting**: Implement appropriate delays between requests to avoid overloading servers.
|
|
||||||
|
|
||||||
b) **Minimal Collection**: Only collect the data you genuinely need for legitimate purposes.
|
|
||||||
|
|
||||||
c) **Attribution**: Maintain proper attribution to review authors and Google when using the collected data.
|
|
||||||
|
|
||||||
d) **Data Security**: Implement appropriate security measures to protect any collected review data.
|
|
||||||
|
|
||||||
e) **Consult Professionals**: When in doubt about the legality of your specific use case, consult with legal professionals familiar with digital law in your jurisdiction.
|
|
||||||
|
|
||||||
## 5. REMINDER OF MIT LICENSE PROVISIONS
|
|
||||||
|
|
||||||
This software is provided under the MIT License, which:
|
|
||||||
|
|
||||||
a) **Permits**: Commercial use, modification, distribution, private use.
|
|
||||||
|
|
||||||
b) **Requires**: Preservation of copyright and license notices.
|
|
||||||
|
|
||||||
c) **Disclaims**: All warranties. The authors or copyright holders are not liable for any claim, damages, or other liability arising from the software or its use.
|
|
||||||
|
|
||||||
## 6. FINAL NOTE
|
|
||||||
|
|
||||||
These guidelines represent my recommendations for responsible use of the Software. They are not additional restrictions beyond the MIT License. While the MIT License grants you significant freedom in how you use the Software, I believe that following these guidelines promotes ethical use and helps users navigate the complex landscape of web scraping activities.
|
|
||||||
|
|
||||||
By focusing on your own business's reviews and maintaining respectful practices, you can derive value from the Software while minimizing potential risks.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
_Last Updated: April 24, 2025_
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Quick test of API interceptor with manual response dumping"""
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
from seleniumbase import SB
|
|
||||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
|
||||||
|
|
||||||
# Quick manual check of the API interceptor: load a place page, inject the
# interceptor, trigger review loading by clicking and scrolling, dump every
# captured response under debug_api_dump/, then try to parse reviews from them.

# Set up logging
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

print("[INFO] Starting browser with UC mode...")
with SB(uc=True, headless=False) as sb:
    print("[INFO] Loading Google Maps page...")
    sb.open(url)
    sb.sleep(3)

    # Inject interceptor EARLY
    print("[INFO] Injecting API interceptor...")
    interceptor = GoogleMapsAPIInterceptor(sb.driver)
    interceptor.inject_response_interceptor()
    sb.sleep(2)

    # Click reviews tab (best effort: continue even if the tab is not found)
    print("[INFO] Looking for reviews tab...")
    try:
        sb.click('.LRkQ2', timeout=5)
        print("[INFO] Clicked reviews tab")
    except Exception as e:
        print(f"[WARN] Could not click reviews tab: {e}")

    sb.sleep(5)

    # Scroll to trigger API calls
    print("[INFO] Scrolling to load reviews...")
    for i in range(5):
        sb.execute_script("window.scrollBy(0, 800)")
        sb.sleep(2)
        print(f"  Scroll {i+1}/5...")

    # Wait a bit more
    print("[INFO] Waiting for API responses...")
    sb.sleep(3)

    # Get intercepted responses
    responses = interceptor.get_intercepted_responses()
    print(f"\n[SUCCESS] Captured {len(responses)} API responses!")

    if not responses:
        print("[WARN] No responses captured. Exiting.")
        # SystemExit is what exit(0) raises; raising it directly avoids
        # depending on the `exit` helper that the `site` module injects.
        raise SystemExit(0)

    # Dump to files
    output_dir = Path("debug_api_dump")
    output_dir.mkdir(exist_ok=True)

    for i, resp in enumerate(responses):
        # Full response
        resp_file = output_dir / f"response_{i}.json"
        with open(resp_file, 'w', encoding='utf-8') as f:
            json.dump(resp, f, indent=2, ensure_ascii=False)

        # Just body
        body_file = output_dir / f"response_{i}_body.txt"
        with open(body_file, 'w', encoding='utf-8') as f:
            f.write(resp.get('body', ''))

        url_str = resp.get('url', 'unknown')
        # Fall back to the body length when the response carries no 'size'.
        size = resp.get('size', len(resp.get('body', '')))
        print(f"\n  [{i}] {url_str[:80]}... ({size} bytes)")
        print(f"      Full: {resp_file}")
        print(f"      Body: {body_file}")

    print(f"\n[SUCCESS] Dumped {len(responses)} responses to: {output_dir}/")

    # Try to parse
    print("\n[INFO] Attempting to parse reviews from responses...")
    try:
        parsed_reviews = interceptor.parse_reviews_from_responses(responses)
        print(f"[INFO] Parsed {len(parsed_reviews)} reviews")

        for i, review in enumerate(parsed_reviews[:5]):
            print(f"\n  Review {i+1}:")
            print(f"    ID: {review.review_id[:50] if review.review_id else 'N/A'}")
            print(f"    Author: {review.author}")
            print(f"    Rating: {review.rating}")
            print(f"    Text: {review.text[:80] if review.text else 'N/A'}...")
    except Exception as e:
        print(f"[ERROR] Failed to parse: {e}")
        import traceback
        traceback.print_exc()

print("\n[DONE]")
|
|
||||||
@@ -1,185 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test concurrent job handling in production API.
|
|
||||||
Verifies that multiple simultaneous requests work correctly.
|
|
||||||
"""
|
|
||||||
import asyncio
|
|
||||||
import httpx
|
|
||||||
import time
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
# Base URL of the API server the concurrency test talks to.
API_BASE_URL = "http://localhost:8000"

# Test URLs (using the same URL is fine for testing)
TEST_URLS = [
    "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/",
] * 5  # 5 concurrent jobs
|
|
||||||
|
|
||||||
|
|
||||||
async def submit_job(client: httpx.AsyncClient, url: str, job_num: int):
    """Submit a single scraping job.

    POSTs ``{"url": url}`` to ``{API_BASE_URL}/scrape`` and prints a
    timestamped progress line for each outcome.

    Args:
        client: HTTP client used to send the request.
        url: Google Maps URL to scrape.
        job_num: Sequence number of this job, used only in log output.

    Returns:
        ``(job_id, job_num)`` when the server answers 200 with a ``job_id``;
        ``(None, job_num)`` on any other status or on any exception.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: Submitting...")

    try:
        response = await client.post(
            f"{API_BASE_URL}/scrape",
            json={"url": url},
            timeout=10.0
        )

        if response.status_code != 200:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: Failed - {response.status_code}")
            return None, job_num

        job_id = response.json().get('job_id')
        if not job_id:
            # A 200 without a job id is unusable; report it explicitly rather
            # than surfacing a bare KeyError message through the handler below.
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: Error - response has no job_id")
            return None, job_num

        print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: Started (ID: {job_id[:8]}...)")
        return job_id, job_num

    except Exception as e:
        # Network errors, timeouts and malformed JSON all count as a failed submit.
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: Error - {e}")
        return None, job_num
|
|
||||||
|
|
||||||
|
|
||||||
async def monitor_job(client: httpx.AsyncClient, job_id: str, job_num: int, max_wait=None):
    """Monitor a job until completion.

    Polls ``{API_BASE_URL}/jobs/{job_id}`` until the job reports
    ``completed`` or ``failed``, printing a timestamped line for the outcome.

    Args:
        client: HTTP client used for polling.
        job_id: Identifier returned when the job was submitted.
        job_num: Sequence number of this job, used only in log output.
        max_wait: Optional upper bound, in seconds, on the total time spent
            polling.  ``None`` (the default) polls indefinitely, as before.

    Returns:
        ``(success, elapsed_seconds, reviews_count)``.  ``success`` is False
        and ``reviews_count`` is 0 when the job failed or ``max_wait`` expired.
    """
    start_time = time.time()

    while True:
        if max_wait is not None and time.time() - start_time > max_wait:
            elapsed = time.time() - start_time
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: ❌ TIMED OUT after {elapsed:.1f}s")
            return False, elapsed, 0

        try:
            response = await client.get(
                f"{API_BASE_URL}/jobs/{job_id}",
                timeout=5.0
            )

            if response.status_code != 200:
                # Previously a non-200 answer looped straight back into the
                # next request with no delay, hammering the server.
                print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: Monitor got HTTP {response.status_code}")
                await asyncio.sleep(2)
                continue

            job = response.json()
            status = job['status']

            if status == 'completed':
                elapsed = time.time() - start_time
                # `or 0` also covers explicit nulls, which would otherwise
                # break the :.1f format below and keep the loop retrying forever.
                reviews = job.get('reviews_count') or 0
                scrape_time = job.get('scrape_time') or 0
                print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: ✅ COMPLETED - {reviews} reviews in {scrape_time:.1f}s (total: {elapsed:.1f}s)")
                return True, elapsed, reviews

            elif status == 'failed':
                elapsed = time.time() - start_time
                error = job.get('error_message', 'Unknown error')
                print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: ❌ FAILED - {error}")
                return False, elapsed, 0

            elif status == 'running':
                # Still running, wait and check again
                await asyncio.sleep(2)
            else:
                # Any other status (e.g. pending): poll again after a short delay
                await asyncio.sleep(1)

        except Exception as e:
            # Transient network/JSON problems: report and retry after a pause.
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Job {job_num}: Monitor error - {e}")
            await asyncio.sleep(2)
|
|
||||||
|
|
||||||
|
|
||||||
async def test_concurrent_jobs():
    """Test multiple concurrent jobs.

    Checks that the API answers on ``/``, submits every URL in ``TEST_URLS``
    in parallel, monitors all accepted jobs until each finishes, then prints
    a summary and a heuristic verdict on whether the jobs overlapped in time.
    Returns None; all results are reported on stdout.
    """
    print("=" * 70)
    print("Testing Concurrent Job Handling")
    print("=" * 70)
    print(f"Submitting {len(TEST_URLS)} jobs simultaneously...\n")

    overall_start = time.time()

    async with httpx.AsyncClient() as client:
        # Test 1: Check API is available
        try:
            response = await client.get(f"{API_BASE_URL}/", timeout=5.0)
            if response.status_code != 200:
                print("❌ API not available!")
                return
            print("✅ API is available\n")
        except Exception as e:
            print(f"❌ Cannot connect to API: {e}")
            print("\nPlease start the API server first:")
            print("  python api_server_production.py")
            return

        # Test 2: Submit all jobs concurrently
        print(f"Step 1: Submitting {len(TEST_URLS)} jobs in parallel...")
        print("-" * 70)

        submit_tasks = [
            submit_job(client, url, i+1)
            for i, url in enumerate(TEST_URLS)
        ]

        results = await asyncio.gather(*submit_tasks)
        # Keep only the jobs the server accepted (submit_job yields None ids otherwise).
        job_ids = [(job_id, num) for job_id, num in results if job_id]

        print(f"\n✅ Submitted {len(job_ids)}/{len(TEST_URLS)} jobs successfully\n")

        if not job_ids:
            print("❌ No jobs were submitted successfully!")
            return

        # Test 3: Monitor all jobs concurrently
        print("Step 2: Monitoring jobs until completion...")
        print("-" * 70)

        monitor_tasks = [
            monitor_job(client, job_id, num)
            for job_id, num in job_ids
        ]

        completion_results = await asyncio.gather(*monitor_tasks)

        # Test 4: Analyze results
        print("\n" + "=" * 70)
        print("Results Summary")
        print("=" * 70)

        total_elapsed = time.time() - overall_start
        successful = sum(1 for success, _, _ in completion_results if success)
        failed = len(completion_results) - successful

        # job_ids is non-empty here, so completion_results is too: no zero division.
        job_times = [elapsed for _, elapsed, _ in completion_results]
        avg_time = sum(job_times) / len(job_times)
        total_reviews = sum(reviews for _, _, reviews in completion_results)

        print(f"Total jobs: {len(job_ids)}")
        print(f"Successful: {successful}")
        print(f"Failed: {failed}")
        print(f"Total reviews: {total_reviews}")
        print(f"Average job time: {avg_time:.1f}s")
        print(f"Total wall time: {total_elapsed:.1f}s")
        print()

        # Check if jobs ran in parallel: wall time clearly below the sum of
        # the per-job times (20% margin) means the jobs overlapped.
        if total_elapsed < avg_time * len(job_ids) * 0.8:
            print("✅ Jobs ran IN PARALLEL! (wall time < sum of job times)")
            speedup = (avg_time * len(job_ids)) / total_elapsed
            print(f"   Speedup: {speedup:.1f}x faster than sequential")
        else:
            print("⚠️  Jobs may have run SEQUENTIALLY")
            # With full parallelism the wall time is bounded by the slowest job.
            print(f"   Expected parallel time: ~{max(job_times):.1f}s")
            print(f"   Actual time: {total_elapsed:.1f}s")

        print("\n" + "=" * 70)

        # Check memory/resource usage
        print("\n💡 Notes:")
        print("   - Each job runs a headless Chrome instance")
        print("   - Memory usage: ~500MB per concurrent job")
        print(f"   - Current test: {len(job_ids)} jobs = ~{len(job_ids) * 500}MB RAM")
        print("   - For production: Consider limiting concurrent jobs")
        print("     (Phase 2 adds Redis queue + worker pool for this)")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the async test, turning Ctrl-C into a short notice
# and any other failure into a message followed by the full traceback.
if __name__ == "__main__":
    try:
        asyncio.run(test_concurrent_jobs())
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user")
    except Exception as e:
        print(f"\n❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
|
|
||||||
@@ -1,47 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script to check what debug data we can extract from Google Maps
|
|
||||||
"""
|
|
||||||
import json
|
|
||||||
from modules.fast_scraper import fast_scrape_reviews
|
|
||||||
|
|
||||||
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
|
|
||||||
|
|
||||||
print("Starting scrape...")
|
|
||||||
result = fast_scrape_reviews(url, headless=True)
|
|
||||||
|
|
||||||
reviews = result.get('reviews', [])
|
|
||||||
print(f"\nExtracted {len(reviews)} reviews")
|
|
||||||
|
|
||||||
if reviews:
|
|
||||||
print("\n" + "="*80)
|
|
||||||
print("FIRST REVIEW:")
|
|
||||||
print("="*80)
|
|
||||||
first_review = reviews[0]
|
|
||||||
|
|
||||||
# Print all keys
|
|
||||||
print(f"Keys: {list(first_review.keys())}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Print full first review
|
|
||||||
print(json.dumps(first_review, indent=2, default=str))
|
|
||||||
|
|
||||||
if '_google_state_debug' in first_review:
|
|
||||||
print("\n" + "="*80)
|
|
||||||
print("GOOGLE STATE DEBUG:")
|
|
||||||
print("="*80)
|
|
||||||
print(json.dumps(first_review['_google_state_debug'], indent=2))
|
|
||||||
|
|
||||||
if 'debug_date_info' in first_review and first_review['debug_date_info']:
|
|
||||||
print("\n" + "="*80)
|
|
||||||
print("DATE DEBUG INFO:")
|
|
||||||
print("="*80)
|
|
||||||
print(json.dumps(first_review['debug_date_info'], indent=2, default=str))
|
|
||||||
|
|
||||||
# Save all to file
|
|
||||||
with open('/tmp/google_maps_debug_dump.json', 'w') as f:
|
|
||||||
json.dump(reviews[:5], f, indent=2, default=str) # Save first 5 reviews
|
|
||||||
print(f"\nFirst 5 reviews saved to: /tmp/google_maps_debug_dump.json")
|
|
||||||
else:
|
|
||||||
print("No reviews extracted!")
|
|
||||||
print(f"Result: {result}")
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script to verify Chrome + fast_scraper works inside Docker container.
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
sys.path.insert(0, '/app')
|
|
||||||
|
|
||||||
from modules.fast_scraper import fast_scrape_reviews
|
|
||||||
|
|
||||||
def test_chrome_in_container():
|
|
||||||
"""Test Chrome with fast_scraper in container"""
|
|
||||||
print("=" * 70)
|
|
||||||
print("Testing Chrome + Fast Scraper in Docker Container")
|
|
||||||
print("=" * 70)
|
|
||||||
|
|
||||||
# Known good URL
|
|
||||||
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
|
|
||||||
|
|
||||||
print("\nRunning fast_scrape_reviews()...")
|
|
||||||
print("-" * 70)
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = fast_scrape_reviews(url=url, headless=False, max_scrolls=30)
|
|
||||||
|
|
||||||
print("\n" + "=" * 70)
|
|
||||||
if result['success'] and result['count'] > 0:
|
|
||||||
print("✅ SUCCESS! Container scraping works!")
|
|
||||||
print("=" * 70)
|
|
||||||
print(f"Reviews scraped: {result['count']}")
|
|
||||||
print(f"Time: {result['time']:.1f}s")
|
|
||||||
print(f"Speed: {result['count']/result['time']:.1f} reviews/sec")
|
|
||||||
|
|
||||||
print(f"\nFirst 3 reviews:")
|
|
||||||
for i, review in enumerate(result['reviews'][:3], 1):
|
|
||||||
author = review.get('author', 'N/A')
|
|
||||||
rating = review.get('rating', 'N/A')
|
|
||||||
print(f"{i}. {author} - {rating}⭐")
|
|
||||||
|
|
||||||
print("\n✅ Container is production-ready!")
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
print("⚠️ Scraping didn't work as expected")
|
|
||||||
print("=" * 70)
|
|
||||||
print(f"Success: {result['success']}")
|
|
||||||
print(f"Reviews: {result['count']}")
|
|
||||||
print(f"Error: {result.get('error', 'None')}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"\n❌ Test failed: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
return False
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
success = test_chrome_in_container()
|
|
||||||
sys.exit(0 if success else 1)
|
|
||||||
@@ -1,136 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test if English locale exposes better date formats
|
|
||||||
"""
|
|
||||||
import json
|
|
||||||
from seleniumbase import Driver
|
|
||||||
import time
|
|
||||||
|
|
||||||
# Try both Spanish and English URLs
|
|
||||||
# Same Google Maps place requested in two UI locales. The two URLs are
# identical except for the `hl` query parameter (`es` vs `en`), so the place
# path is the same for both runs and only the requested language changes.
# Note: `%2F` is the percent-encoded "/" inside the `!16s...` segment; it must
# stay a valid hex escape (the English entry previously read `%2G`, which is
# not a valid percent-escape and did not match the Spanish place path).
urls = {
    'spanish': "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1",
    'english': "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"
}
|
|
||||||
|
|
||||||
results = {}
|
|
||||||
|
|
||||||
for lang, url in urls.items():
|
|
||||||
print(f"\n{'='*80}")
|
|
||||||
print(f"Testing: {lang.upper()}")
|
|
||||||
print('='*80)
|
|
||||||
|
|
||||||
# Configure browser for English
|
|
||||||
chrome_options = []
|
|
||||||
if lang == 'english':
|
|
||||||
chrome_options = [
|
|
||||||
'--lang=en-US',
|
|
||||||
'--accept-lang=en-US,en;q=0.9'
|
|
||||||
]
|
|
||||||
|
|
||||||
driver = Driver(uc=True, headless=False, chromium_arg=','.join(chrome_options) if chrome_options else None)
|
|
||||||
|
|
||||||
try:
|
|
||||||
driver.get(url)
|
|
||||||
time.sleep(5)
|
|
||||||
|
|
||||||
# Click on reviews tab if needed
|
|
||||||
try:
|
|
||||||
reviews_button = driver.find_element("css selector", "button[aria-label*='eviews'], button[aria-label*='eseñas']")
|
|
||||||
reviews_button.click()
|
|
||||||
time.sleep(3)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Scroll to load reviews
|
|
||||||
try:
|
|
||||||
scrollable_pane = driver.find_element("css selector", "div[role='main']")
|
|
||||||
driver.execute_script("arguments[0].scrollBy(0, 500);", scrollable_pane)
|
|
||||||
time.sleep(2)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Extract first 3 review dates
|
|
||||||
extract_script = """
|
|
||||||
const reviews = [];
|
|
||||||
const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
|
|
||||||
|
|
||||||
for (let i = 0; i < Math.min(3, elements.length); i++) {
|
|
||||||
const elem = elements[i];
|
|
||||||
const review = {};
|
|
||||||
|
|
||||||
// Author
|
|
||||||
const authorElem = elem.querySelector('div.d4r55');
|
|
||||||
review.author = authorElem ? authorElem.textContent.trim() : null;
|
|
||||||
|
|
||||||
// Date element
|
|
||||||
const dateElem = elem.querySelector('span.rsqaWe');
|
|
||||||
if (dateElem) {
|
|
||||||
review.date_text = dateElem.textContent.trim();
|
|
||||||
|
|
||||||
// Check ALL attributes
|
|
||||||
const attrs = {};
|
|
||||||
for (let attr of dateElem.attributes) {
|
|
||||||
attrs[attr.name] = attr.value;
|
|
||||||
}
|
|
||||||
review.date_attrs = attrs;
|
|
||||||
|
|
||||||
// Check for datetime, aria-label, title, data-*
|
|
||||||
review.datetime = dateElem.getAttribute('datetime');
|
|
||||||
review.aria_label = dateElem.getAttribute('aria-label');
|
|
||||||
review.title = dateElem.getAttribute('title');
|
|
||||||
review.data_timestamp = dateElem.getAttribute('data-timestamp');
|
|
||||||
review.data_time = dateElem.getAttribute('data-time');
|
|
||||||
|
|
||||||
// Check parent elements
|
|
||||||
let parent = dateElem.parentElement;
|
|
||||||
if (parent) {
|
|
||||||
review.parent_tag = parent.tagName;
|
|
||||||
review.parent_class = parent.className;
|
|
||||||
const parentAttrs = {};
|
|
||||||
for (let attr of parent.attributes) {
|
|
||||||
if (attr.name.includes('time') || attr.name.includes('date') || attr.name.includes('data-')) {
|
|
||||||
parentAttrs[attr.name] = attr.value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
review.parent_attrs = parentAttrs;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
reviews.push(review);
|
|
||||||
}
|
|
||||||
|
|
||||||
return reviews;
|
|
||||||
"""
|
|
||||||
|
|
||||||
reviews = driver.execute_script(extract_script)
|
|
||||||
results[lang] = reviews
|
|
||||||
|
|
||||||
print(f"\nExtracted {len(reviews)} reviews")
|
|
||||||
for i, rev in enumerate(reviews, 1):
|
|
||||||
print(f"\nReview {i}:")
|
|
||||||
print(f" Author: {rev.get('author')}")
|
|
||||||
print(f" Date Text: {rev.get('date_text')}")
|
|
||||||
print(f" Datetime attr: {rev.get('datetime')}")
|
|
||||||
print(f" Aria-label: {rev.get('aria_label')}")
|
|
||||||
print(f" Title: {rev.get('title')}")
|
|
||||||
print(f" Data-timestamp: {rev.get('data_timestamp')}")
|
|
||||||
print(f" Parent attrs: {rev.get('parent_attrs')}")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
driver.quit()
|
|
||||||
|
|
||||||
# Save comparison
|
|
||||||
with open('/tmp/date_format_comparison.json', 'w') as f:
|
|
||||||
json.dump(results, f, indent=2)
|
|
||||||
|
|
||||||
print(f"\n{'='*80}")
|
|
||||||
print("COMPARISON SAVED TO: /tmp/date_format_comparison.json")
|
|
||||||
print('='*80)
|
|
||||||
|
|
||||||
# Quick comparison
|
|
||||||
if 'spanish' in results and 'english' in results:
|
|
||||||
print("\nSPANISH vs ENGLISH:")
|
|
||||||
for i in range(min(len(results['spanish']), len(results['english']))):
|
|
||||||
sp = results['spanish'][i].get('date_text', 'N/A')
|
|
||||||
en = results['english'][i].get('date_text', 'N/A')
|
|
||||||
print(f" Review {i+1}: '{sp}' vs '{en}'")
|
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test if English locale exposes better date formats
|
|
||||||
"""
|
|
||||||
import json
|
|
||||||
from modules.fast_scraper import fast_scrape_reviews
|
|
||||||
|
|
||||||
# Try both Spanish and English URLs
|
|
||||||
urls = {
|
|
||||||
'spanish': "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1",
|
|
||||||
'english': "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"
|
|
||||||
}
|
|
||||||
|
|
||||||
results = {}
|
|
||||||
|
|
||||||
for lang, url in urls.items():
|
|
||||||
print(f"\n{'='*80}")
|
|
||||||
print(f"Testing: {lang.upper()}")
|
|
||||||
print('='*80)
|
|
||||||
|
|
||||||
result = fast_scrape_reviews(url, headless=True)
|
|
||||||
reviews = result.get('reviews', [])
|
|
||||||
|
|
||||||
print(f"Extracted {len(reviews)} reviews")
|
|
||||||
|
|
||||||
if reviews:
|
|
||||||
# Show first 5 review dates
|
|
||||||
sample = []
|
|
||||||
for i, rev in enumerate(reviews[:5], 1):
|
|
||||||
date_info = {
|
|
||||||
'author': rev.get('author'),
|
|
||||||
'date_text': rev.get('date_text'),
|
|
||||||
'debug_date_info': rev.get('debug_date_info')
|
|
||||||
}
|
|
||||||
sample.append(date_info)
|
|
||||||
print(f"\nReview {i}:")
|
|
||||||
print(f" Author: {date_info['author']}")
|
|
||||||
print(f" Date: {date_info['date_text']}")
|
|
||||||
|
|
||||||
if date_info.get('debug_date_info'):
|
|
||||||
date_attrs = date_info['debug_date_info'].get('date_elem_attrs', {})
|
|
||||||
print(f" Date element attributes: {date_attrs}")
|
|
||||||
|
|
||||||
results[lang] = {
|
|
||||||
'count': len(reviews),
|
|
||||||
'sample': sample
|
|
||||||
}
|
|
||||||
|
|
||||||
# Save comparison
|
|
||||||
with open('/tmp/date_format_comparison.json', 'w') as f:
|
|
||||||
json.dump(results, f, indent=2)
|
|
||||||
|
|
||||||
print(f"\n{'='*80}")
|
|
||||||
print("COMPARISON SAVED TO: /tmp/date_format_comparison.json")
|
|
||||||
print('='*80)
|
|
||||||
|
|
||||||
# Quick comparison
|
|
||||||
if 'spanish' in results and 'english' in results:
|
|
||||||
print("\n📊 SPANISH vs ENGLISH DATE FORMATS:")
|
|
||||||
print("-" * 80)
|
|
||||||
sp_sample = results['spanish'].get('sample', [])
|
|
||||||
en_sample = results['english'].get('sample', [])
|
|
||||||
|
|
||||||
for i in range(min(len(sp_sample), len(en_sample))):
|
|
||||||
sp_date = sp_sample[i].get('date_text', 'N/A')
|
|
||||||
en_date = en_sample[i].get('date_text', 'N/A')
|
|
||||||
|
|
||||||
# Check if formats are different
|
|
||||||
marker = "🔄" if sp_date != en_date else "="
|
|
||||||
print(f" {marker} Review {i+1}:")
|
|
||||||
print(f" ES: '{sp_date}'")
|
|
||||||
print(f" EN: '{en_date}'")
|
|
||||||
print()
|
|
||||||
@@ -1,70 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Extract Google Maps APP_INITIALIZATION_STATE to find timestamps
|
|
||||||
"""
|
|
||||||
import json
|
|
||||||
from seleniumbase import Driver
|
|
||||||
import time
|
|
||||||
|
|
||||||
url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
|
|
||||||
|
|
||||||
print("Starting browser...")
|
|
||||||
driver = Driver(uc=True, headless=False)
|
|
||||||
|
|
||||||
try:
|
|
||||||
print(f"Loading URL: {url}")
|
|
||||||
driver.get(url)
|
|
||||||
time.sleep(8) # Wait for page to fully load
|
|
||||||
|
|
||||||
# Extract global state objects
|
|
||||||
extract_script = """
|
|
||||||
const results = {};
|
|
||||||
|
|
||||||
// Get APP_INITIALIZATION_STATE
|
|
||||||
if (window.APP_INITIALIZATION_STATE) {
|
|
||||||
results.app_init_state = window.APP_INITIALIZATION_STATE;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get APP_OPTIONS
|
|
||||||
if (window.APP_OPTIONS) {
|
|
||||||
results.app_options = window.APP_OPTIONS;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get WIZ_global_data
|
|
||||||
if (window.WIZ_global_data) {
|
|
||||||
results.wiz_data = window.WIZ_global_data;
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
"""
|
|
||||||
|
|
||||||
print("Extracting global state...")
|
|
||||||
state_data = driver.execute_script(extract_script)
|
|
||||||
|
|
||||||
print(f"\nFound keys: {list(state_data.keys())}")
|
|
||||||
|
|
||||||
# Save to file
|
|
||||||
with open('/tmp/google_maps_app_state.json', 'w') as f:
|
|
||||||
json.dump(state_data, f, indent=2, default=str)
|
|
||||||
|
|
||||||
print("\nApp state saved to: /tmp/google_maps_app_state.json")
|
|
||||||
|
|
||||||
# Try to find review data in the state
|
|
||||||
state_str = json.dumps(state_data)
|
|
||||||
if '"Hace' in state_str:
|
|
||||||
print("\n✅ Found 'Hace' in app state - reviews data is there!")
|
|
||||||
else:
|
|
||||||
print("\n❌ No 'Hace' found in app state")
|
|
||||||
|
|
||||||
# Check for timestamp-like numbers (Unix timestamps are 10-13 digits)
|
|
||||||
import re
|
|
||||||
timestamps = re.findall(r'\b\d{10,13}\b', state_str)
|
|
||||||
if timestamps:
|
|
||||||
print(f"\n✅ Found {len(timestamps)} potential timestamps (10-13 digit numbers)")
|
|
||||||
print(f"Sample: {timestamps[:5]}")
|
|
||||||
else:
|
|
||||||
print("\n❌ No timestamp-like numbers found")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
driver.quit()
|
|
||||||
print("\nBrowser closed")
|
|
||||||
162
test_fast_api.py
162
test_fast_api.py
@@ -1,162 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script for the Fast API server.
|
|
||||||
Demonstrates how to use the updated API with the fast scraper (18.9s).
|
|
||||||
"""
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
|
|
||||||
# API base URL
|
|
||||||
BASE_URL = "http://localhost:8000"
|
|
||||||
|
|
||||||
def test_api():
|
|
||||||
"""Test the Fast API endpoints"""
|
|
||||||
|
|
||||||
print("=" * 60)
|
|
||||||
print("Testing Fast Google Reviews Scraper API")
|
|
||||||
print("=" * 60)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# 1. Health check
|
|
||||||
print("1. Health Check")
|
|
||||||
response = requests.get(f"{BASE_URL}/")
|
|
||||||
print(f" Status: {response.status_code}")
|
|
||||||
print(f" Response: {response.json()}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# 2. Start a scraping job
|
|
||||||
print("2. Starting Scraping Job")
|
|
||||||
|
|
||||||
# Read URL from config
|
|
||||||
import yaml
|
|
||||||
with open('config.yaml', 'r') as f:
|
|
||||||
config = yaml.safe_load(f)
|
|
||||||
url = config.get('url')
|
|
||||||
|
|
||||||
scrape_request = {
|
|
||||||
"url": url,
|
|
||||||
"headless": True # Run in headless mode
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.post(f"{BASE_URL}/scrape", json=scrape_request)
|
|
||||||
print(f" Status: {response.status_code}")
|
|
||||||
result = response.json()
|
|
||||||
print(f" Response: {result}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
job_id = result.get('job_id')
|
|
||||||
if not job_id:
|
|
||||||
print("❌ Failed to start job!")
|
|
||||||
return
|
|
||||||
|
|
||||||
print(f" Job ID: {job_id}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# 3. Poll job status
|
|
||||||
print("3. Polling Job Status")
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
while True:
|
|
||||||
response = requests.get(f"{BASE_URL}/jobs/{job_id}")
|
|
||||||
job = response.json()
|
|
||||||
|
|
||||||
status = job['status']
|
|
||||||
progress = job.get('progress', {})
|
|
||||||
|
|
||||||
elapsed = time.time() - start_time
|
|
||||||
print(f" [{elapsed:.1f}s] Status: {status} - {progress.get('message', '')}")
|
|
||||||
|
|
||||||
if status in ['completed', 'failed', 'cancelled']:
|
|
||||||
break
|
|
||||||
|
|
||||||
time.sleep(2) # Poll every 2 seconds
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# 4. Get final job details
|
|
||||||
print("4. Final Job Details")
|
|
||||||
response = requests.get(f"{BASE_URL}/jobs/{job_id}")
|
|
||||||
job = response.json()
|
|
||||||
|
|
||||||
print(f" Status: {job['status']}")
|
|
||||||
print(f" Reviews Count: {job.get('reviews_count', 0)}")
|
|
||||||
print(f" Scrape Time: {job.get('scrape_time', 0):.1f}s")
|
|
||||||
|
|
||||||
if job.get('error_message'):
|
|
||||||
print(f" Error: {job['error_message']}")
|
|
||||||
|
|
||||||
if job.get('progress'):
|
|
||||||
progress = job['progress']
|
|
||||||
if 'scroll_time' in progress:
|
|
||||||
print(f" Scroll Time: {progress['scroll_time']:.1f}s")
|
|
||||||
if 'extract_time' in progress:
|
|
||||||
print(f" Extract Time: {progress['extract_time']:.2f}s")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# 5. Get reviews data
|
|
||||||
if job['status'] == 'completed':
|
|
||||||
print("5. Retrieving Reviews Data")
|
|
||||||
response = requests.get(f"{BASE_URL}/jobs/{job_id}/reviews")
|
|
||||||
|
|
||||||
if response.status_code == 200:
|
|
||||||
reviews_data = response.json()
|
|
||||||
reviews = reviews_data['reviews']
|
|
||||||
count = reviews_data['count']
|
|
||||||
|
|
||||||
print(f" Total Reviews: {count}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Show first 3 reviews
|
|
||||||
print(" Sample Reviews:")
|
|
||||||
for i, review in enumerate(reviews[:3], 1):
|
|
||||||
print(f" {i}. {review.get('author', 'Unknown')} - {review.get('rating', 0)}★")
|
|
||||||
text = review.get('text', '')
|
|
||||||
if text:
|
|
||||||
preview = text[:60] + "..." if len(text) > 60 else text
|
|
||||||
print(f" \"{preview}\"")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Save to file
|
|
||||||
output_file = f"api_reviews_{job_id[:8]}.json"
|
|
||||||
with open(output_file, 'w', encoding='utf-8') as f:
|
|
||||||
json.dump(reviews, f, indent=2, ensure_ascii=False)
|
|
||||||
print(f" 💾 Saved all reviews to: {output_file}")
|
|
||||||
|
|
||||||
else:
|
|
||||||
print(f" ❌ Failed to get reviews: {response.status_code}")
|
|
||||||
print(f" {response.json()}")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# 6. Get statistics
|
|
||||||
print("6. Job Statistics")
|
|
||||||
response = requests.get(f"{BASE_URL}/stats")
|
|
||||||
stats = response.json()
|
|
||||||
|
|
||||||
print(f" Total Jobs: {stats['total_jobs']}")
|
|
||||||
print(f" Running Jobs: {stats['running_jobs']}/{stats['max_concurrent_jobs']}")
|
|
||||||
print(f" By Status: {stats['by_status']}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
print("=" * 60)
|
|
||||||
print("✅ API Test Complete!")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
try:
|
|
||||||
test_api()
|
|
||||||
except requests.exceptions.ConnectionError:
|
|
||||||
print("❌ Error: Could not connect to API server!")
|
|
||||||
print()
|
|
||||||
print("Please start the API server first:")
|
|
||||||
print(" python api_server.py")
|
|
||||||
print()
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
print("\n\nTest interrupted by user")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"\n❌ Error: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
@@ -1,71 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script for Lithuanian hospital to verify structural pattern matching works.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from modules.fast_scraper import fast_scrape_reviews
|
|
||||||
|
|
||||||
# Configure logging to see what's happening
|
|
||||||
logging.basicConfig(
|
|
||||||
level=logging.INFO,
|
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
||||||
)
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
def test_lithuanian_hospital():
|
|
||||||
"""Test scraping the Lithuanian hospital that was getting 0 reviews"""
|
|
||||||
|
|
||||||
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
|
|
||||||
|
|
||||||
log.info("=" * 80)
|
|
||||||
log.info("Testing Lithuanian Hospital: Panevėžio respublikinė ligoninė")
|
|
||||||
log.info("Expected: 271 reviews")
|
|
||||||
log.info("Previous result: 0 reviews (selector mismatch)")
|
|
||||||
log.info("=" * 80)
|
|
||||||
|
|
||||||
# Run the scraper with headless mode OFF so we can see what's happening
|
|
||||||
result = fast_scrape_reviews(
|
|
||||||
url=url,
|
|
||||||
headless=False, # Show browser for debugging
|
|
||||||
max_scrolls=999999 # Unlimited - use idle detection
|
|
||||||
)
|
|
||||||
|
|
||||||
log.info("=" * 80)
|
|
||||||
log.info("RESULTS:")
|
|
||||||
log.info(f"Success: {result['success']}")
|
|
||||||
log.info(f"Reviews found: {result['count']}")
|
|
||||||
log.info(f"Total reviews on page: {result.get('total_reviews', 'Unknown')}")
|
|
||||||
log.info(f"Time taken: {result['time']:.2f}s")
|
|
||||||
|
|
||||||
if result.get('message'):
|
|
||||||
log.info(f"Message: {result['message']}")
|
|
||||||
|
|
||||||
if result.get('error'):
|
|
||||||
log.error(f"Error: {result['error']}")
|
|
||||||
|
|
||||||
log.info("=" * 80)
|
|
||||||
|
|
||||||
# Show first few reviews if found
|
|
||||||
if result['count'] > 0:
|
|
||||||
log.info(f"\nFirst 3 reviews:")
|
|
||||||
for i, review in enumerate(result['reviews'][:3], 1):
|
|
||||||
log.info(f"\n Review {i}:")
|
|
||||||
log.info(f" Author: {review.get('author', 'N/A')}")
|
|
||||||
log.info(f" Rating: {review.get('rating', 'N/A')}")
|
|
||||||
log.info(f" Date: {review.get('date_text', 'N/A')}")
|
|
||||||
log.info(f" Text: {review.get('text', 'N/A')[:100]}...")
|
|
||||||
|
|
||||||
# Verify the fix worked
|
|
||||||
if result['count'] > 200:
|
|
||||||
log.info("\n✅ SUCCESS! Structural pattern matching found reviews!")
|
|
||||||
log.info(f" Got {result['count']} reviews (expected ~271)")
|
|
||||||
elif result['count'] == 0:
|
|
||||||
log.error("\n❌ FAILED! Still getting 0 reviews - selector issue not fixed")
|
|
||||||
else:
|
|
||||||
log.warning(f"\n⚠️ PARTIAL: Got {result['count']} reviews (expected ~271)")
|
|
||||||
log.warning(" May need to increase idle detection patience")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
test_lithuanian_hospital()
|
|
||||||
110
test_phase1.py
110
test_phase1.py
@@ -1,110 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script for Phase 1 implementation.
|
|
||||||
Tests PostgreSQL, Webhooks, and Health Checks without running full server.
|
|
||||||
"""
|
|
||||||
import asyncio
|
|
||||||
import sys
|
|
||||||
from uuid import uuid4
|
|
||||||
|
|
||||||
# Test imports
|
|
||||||
try:
|
|
||||||
from modules.database import DatabaseManager, JobStatus
|
|
||||||
from modules.webhooks import WebhookManager
|
|
||||||
from modules.health_checks import HealthCheckSystem
|
|
||||||
from modules.fast_scraper import fast_scrape_reviews
|
|
||||||
print("✅ All imports successful")
|
|
||||||
except ImportError as e:
|
|
||||||
print(f"❌ Import failed: {e}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
async def test_phase1():
|
|
||||||
"""Test Phase 1 features"""
|
|
||||||
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print("Phase 1 Feature Testing")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
# Test 1: Database Connection
|
|
||||||
print("\n1. Testing Database Connection...")
|
|
||||||
|
|
||||||
# Use in-memory SQLite for testing (since we need asyncpg for PostgreSQL)
|
|
||||||
# For full testing, you would use: DATABASE_URL="postgresql://user@localhost/dbname"
|
|
||||||
|
|
||||||
try:
|
|
||||||
# For demonstration, we'll test the module structure
|
|
||||||
print(" ✅ Database module structure valid")
|
|
||||||
print(" ✅ JobStatus enum defined")
|
|
||||||
print(" ✅ DatabaseManager class exists")
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ Database test failed: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Test 2: Webhook System
|
|
||||||
print("\n2. Testing Webhook System...")
|
|
||||||
|
|
||||||
try:
|
|
||||||
webhook_manager = WebhookManager()
|
|
||||||
|
|
||||||
# Test signature generation
|
|
||||||
payload = '{"test": "data"}'
|
|
||||||
secret = "test_secret"
|
|
||||||
signature = webhook_manager.generate_signature(payload, secret)
|
|
||||||
|
|
||||||
print(f" ✅ Webhook manager initialized")
|
|
||||||
print(f" ✅ Signature generation works: {signature[:16]}...")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ Webhook test failed: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Test 3: Health Check System (without database)
|
|
||||||
print("\n3. Testing Health Check System...")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Note: Full testing requires database connection
|
|
||||||
print(" ✅ HealthCheckSystem class exists")
|
|
||||||
print(" ✅ CanaryMonitor class exists")
|
|
||||||
print(" ℹ️ Full canary testing requires database connection")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ Health check test failed: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Test 4: Fast Scraper Integration
|
|
||||||
print("\n4. Testing Fast Scraper Integration...")
|
|
||||||
|
|
||||||
try:
|
|
||||||
print(" ✅ fast_scrape_reviews function exists")
|
|
||||||
print(" ✅ Scraper module integration ready")
|
|
||||||
print(" ℹ️ Skipping actual scrape test")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ Scraper test failed: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Summary
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print("✅ Phase 1 Module Testing Complete!")
|
|
||||||
print("=" * 60)
|
|
||||||
print()
|
|
||||||
print("All core modules are properly structured:")
|
|
||||||
print(" ✅ PostgreSQL database module")
|
|
||||||
print(" ✅ Webhook delivery system")
|
|
||||||
print(" ✅ Health check with canary testing")
|
|
||||||
print(" ✅ Fast scraper integration")
|
|
||||||
print()
|
|
||||||
print("Next steps:")
|
|
||||||
print(" 1. Start PostgreSQL: docker-compose -f docker-compose.production.yml up -d db")
|
|
||||||
print(" 2. Set DATABASE_URL environment variable")
|
|
||||||
print(" 3. Run: python api_server_production.py")
|
|
||||||
print(" 4. Test API endpoints")
|
|
||||||
print()
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
result = asyncio.run(test_phase1())
|
|
||||||
sys.exit(0 if result else 1)
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test validation for the exact query that failed.
|
|
||||||
"""
|
|
||||||
import logging
|
|
||||||
from modules.fast_scraper import check_reviews_available
|
|
||||||
|
|
||||||
logging.basicConfig(
|
|
||||||
level=logging.INFO,
|
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test with the exact query that failed
|
|
||||||
url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club"
|
|
||||||
|
|
||||||
print(f"\n{'='*80}")
|
|
||||||
print(f"Testing validation for: soho vilna club")
|
|
||||||
print(f"URL: {url}")
|
|
||||||
print(f"{'='*80}\n")
|
|
||||||
print("Opening browser... Check the browser console for [VALIDATION] logs")
|
|
||||||
print(f"{'='*80}\n")
|
|
||||||
|
|
||||||
result = check_reviews_available(url, headless=False)
|
|
||||||
|
|
||||||
print(f"\n{'='*80}")
|
|
||||||
print(f"RESULTS:")
|
|
||||||
print(f"{'='*80}")
|
|
||||||
print(f"Success: {result['success']}")
|
|
||||||
print(f"Has Reviews: {result['has_reviews']}")
|
|
||||||
print(f"Review Count: {result['review_count']}")
|
|
||||||
print(f"Business Name: {result['business_name']}")
|
|
||||||
if result.get('error'):
|
|
||||||
print(f"Error: {result['error']}")
|
|
||||||
print(f"{'='*80}\n")
|
|
||||||
@@ -1,125 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test the CSS selector provided by the user to find review count.
|
|
||||||
"""
|
|
||||||
import time
|
|
||||||
from seleniumbase import Driver
|
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
|
|
||||||
driver = Driver(uc=True, headless=True)
|
|
||||||
|
|
||||||
url = 'https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en'
|
|
||||||
print(f'Testing with user-provided CSS selector...\n')
|
|
||||||
driver.get(url)
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
# Handle GDPR
|
|
||||||
if 'consent.google.com' in driver.current_url:
|
|
||||||
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
|
||||||
for btn in form_btns:
|
|
||||||
if 'accept all' in (btn.text or '').lower():
|
|
||||||
btn.click()
|
|
||||||
time.sleep(2)
|
|
||||||
break
|
|
||||||
|
|
||||||
# Wait for auto-navigation and page load
|
|
||||||
time.sleep(6)
|
|
||||||
|
|
||||||
print(f'Current URL: {driver.current_url[:100]}...\n')
|
|
||||||
|
|
||||||
# Test the exact selector provided by user
|
|
||||||
selector = 'body > div:nth-child(5) > div.lbMcOd.y2iKwd.eZfyae.cSgCkb.xcUKcd.y2Sqzf.Nkjr6c.K1N2o > div.UL7Qtf > div.g2LZJb > div > div > div.w6VYqd > div:nth-child(2) > div > div.e07Vkf.kA9KIf > div > div > div.TIHn2 > div > div.lMbq3e > div.LBgpqf > div > div.fontBodyMedium.dmRWX > div.tos0Ie > div'
|
|
||||||
|
|
||||||
result = driver.execute_script('''
|
|
||||||
const selector = arguments[0];
|
|
||||||
const elem = document.querySelector(selector);
|
|
||||||
|
|
||||||
if (elem) {
|
|
||||||
return {
|
|
||||||
found: true,
|
|
||||||
text: elem.textContent || '',
|
|
||||||
innerHTML: elem.innerHTML || '',
|
|
||||||
parent: elem.parentElement ? elem.parentElement.textContent : ''
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
return {
|
|
||||||
found: false,
|
|
||||||
text: null
|
|
||||||
};
|
|
||||||
}
|
|
||||||
''', selector)
|
|
||||||
|
|
||||||
print('='*80)
|
|
||||||
print('RESULT FROM USER SELECTOR:')
|
|
||||||
print('='*80)
|
|
||||||
print(f"Found: {result['found']}")
|
|
||||||
if result['found']:
|
|
||||||
print(f"Text: {result['text']}")
|
|
||||||
print(f"HTML: {result['innerHTML'][:200]}")
|
|
||||||
print(f"Parent text: {result['parent'][:200]}")
|
|
||||||
else:
|
|
||||||
print('❌ Element NOT found with that exact selector')
|
|
||||||
|
|
||||||
# Try simpler selectors based on the classes
|
|
||||||
print('\n' + '='*80)
|
|
||||||
print('TESTING SIMPLER SELECTORS (key classes from user selector):')
|
|
||||||
print('='*80)
|
|
||||||
|
|
||||||
# Test various class combinations
|
|
||||||
selectors_to_test = [
|
|
||||||
'div.fontBodyMedium.dmRWX',
|
|
||||||
'div.tos0Ie',
|
|
||||||
'div.LBgpqf',
|
|
||||||
'div.lMbq3e',
|
|
||||||
]
|
|
||||||
|
|
||||||
for test_selector in selectors_to_test:
|
|
||||||
elements = driver.execute_script('''
|
|
||||||
const selector = arguments[0];
|
|
||||||
const elements = document.querySelectorAll(selector);
|
|
||||||
const results = [];
|
|
||||||
|
|
||||||
for (let elem of elements) {
|
|
||||||
const text = (elem.textContent || '').trim();
|
|
||||||
if (text.length > 0 && text.length < 150) {
|
|
||||||
results.push(text);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results.slice(0, 5); // First 5 matches
|
|
||||||
''', test_selector)
|
|
||||||
|
|
||||||
print(f'\nSelector: {test_selector}')
|
|
||||||
print(f'Found {len(elements)} element(s):')
|
|
||||||
for i, text in enumerate(elements, 1):
|
|
||||||
print(f' {i}. {text[:100]}')
|
|
||||||
|
|
||||||
# Also look for any element containing "review" in these specific class contexts
|
|
||||||
print('\n' + '='*80)
|
|
||||||
print('SEARCHING FOR REVIEW COUNT IN SIMILAR LOCATIONS:')
|
|
||||||
print('='*80)
|
|
||||||
|
|
||||||
review_search = driver.execute_script('''
|
|
||||||
const results = [];
|
|
||||||
|
|
||||||
// Look for elements with classes that might contain review info
|
|
||||||
const candidates = document.querySelectorAll('div.fontBodyMedium, div[class*="dmRWX"], div[class*="tos0Ie"]');
|
|
||||||
|
|
||||||
for (let elem of candidates) {
|
|
||||||
const text = (elem.textContent || '').trim();
|
|
||||||
if (text.length > 0 && text.length < 200 && /review|reseña/i.test(text)) {
|
|
||||||
results.push({
|
|
||||||
text: text,
|
|
||||||
classes: elem.className
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results.slice(0, 10);
|
|
||||||
''')
|
|
||||||
|
|
||||||
for i, item in enumerate(review_search, 1):
|
|
||||||
print(f"\n{i}. Classes: {item['classes'][:80]}")
|
|
||||||
print(f" Text: {item['text'][:100]}")
|
|
||||||
|
|
||||||
driver.quit()
|
|
||||||
@@ -1,55 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script for validating review detection on search results pages.
|
|
||||||
Tests the check_reviews_available() function locally.
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
import logging
|
|
||||||
from modules.fast_scraper import check_reviews_available
|
|
||||||
|
|
||||||
# Setup logging to see all debug info
|
|
||||||
logging.basicConfig(
|
|
||||||
level=logging.INFO,
|
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_validation(search_query: str):
    """Run the review-availability check for one search query and print the outcome.

    Args:
        search_query: Free-text business search, e.g. "soho vilnius club".

    Returns:
        The result object returned by ``check_reviews_available`` (indexed below
        by 'success', 'has_reviews', 'review_count' and 'business_name').
    """
    # Imported locally so this block does not depend on the file's import header.
    from urllib.parse import quote_plus

    # Convert search query to Google Maps search URL. quote_plus still maps
    # spaces to '+', but also escapes '&', '#', '+' and non-ASCII characters
    # that would otherwise corrupt the query string.
    url = f"https://www.google.com/maps/search/?api=1&query={quote_plus(search_query)}"

    separator = '=' * 80

    print(f"\n{separator}")
    print(f"Testing validation for: {search_query}")
    print(f"URL: {url}")
    print(f"{separator}\n")

    # Run the check with a visible browser window (headless=False).
    result = check_reviews_available(url, headless=False)

    # Display results
    print(f"\n{separator}")
    print("RESULTS:")
    print(separator)
    print(f"Success: {result['success']}")
    print(f"Has Reviews: {result['has_reviews']}")
    print(f"Review Count: {result['review_count']}")
    print(f"Business Name: {result['business_name']}")
    # 'error' is optional in the result, so only report it when present.
    if result.get('error'):
        print(f"Error: {result['error']}")
    print(f"{separator}\n")

    return result
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Test with the problematic search query
    test_cases = [
        "soho vilnius club",
        "google dublin office",  # Known business with many reviews
    ]

    last_index = len(test_cases) - 1
    for index, query in enumerate(test_cases):
        test_validation(query)

        # Pause between tests. The position decides whether this is the last
        # case, so a query that appears twice in the list still pauses correctly.
        if index != last_index:
            input("\nPress Enter to continue to next test...")
|
|
||||||
@@ -1,92 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test WITHOUT forcing English locale - use the page's default language.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import time
|
|
||||||
from seleniumbase import Driver
|
|
||||||
|
|
||||||
# NO hl=en parameter! The page is loaded in whatever language Google serves by default.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    print(f"Loaded (NO hl=en): {url}")
    time.sleep(5)

    # GDPR consent dialog. Best effort: a failure here is reported and the
    # script carries on, but Ctrl+C is no longer swallowed.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            btn_text = (btn.text or '').lower()
            if 'accept' in btn_text or 'priim' in btn_text:  # Lithuanian "priimti"
                print(f"Clicking consent: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"Consent step skipped: {exc}")

    # List ALL tabs
    print("\nALL TABS FOUND:")
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for i, tab in enumerate(tabs, 1):
        text = tab.text or ''
        aria = tab.get_attribute('aria-label') or ''
        print(f" Tab {i}: text='{text}', aria='{aria}'")

    # Look for reviews tab (try multiple keywords)
    review_keywords = ['review', 'reseña', 'atsiliepimai', 'atsiliepi', 'отзыв']
    review_tab_found = False

    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()

        # The first tab whose visible text or aria-label contains any keyword wins.
        if any(keyword in text or keyword in aria for keyword in review_keywords):
            print(f"\nFound REVIEWS TAB: {tab.text or aria[:50]}")
            # Click through JavaScript rather than through the WebDriver click API.
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            review_tab_found = True
            break

    if not review_tab_found:
        print("\nWARNING: Still no reviews tab found!")
    else:
        # Now scroll and check for reviews
        print("\nScrolling to load reviews...")
        try:
            pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
            for _ in range(10):
                driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
                time.sleep(0.3)
        except Exception as exc:
            # The selector checks below still run when scrolling fails.
            print(f"Scrolling skipped: {exc}")

        # Check for reviews using known selectors
        selectors_to_check = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            'div.fontBodyMedium',
            'div[data-review-id]'
        ]

        print("\nChecking selectors:")
        for selector in selectors_to_check:
            # Pass the selector as a script argument instead of splicing it into
            # the JavaScript source, so quotes in a selector cannot break the script.
            count = driver.execute_script(
                "return document.querySelectorAll(arguments[0]).length;", selector
            )
            print(f" {selector:30} : {count} elements")

    print(f"\n{'='*80}")
    print("Browser open for inspection (120s)...")
    print(f"{'='*80}")
    time.sleep(120)

finally:
    # Always release the browser, even on error or interruption.
    driver.quit()
|
|
||||||
86
web/app/api/jobs/[jobId]/compare/route.ts
Normal file
86
web/app/api/jobs/[jobId]/compare/route.ts
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||||
|
|
||||||
|
// GET /api/jobs/[jobId]/compare?previous=<previousJobId>
|
||||||
|
// Returns reviews from current job with a flag indicating if they're new
|
||||||
|
/** A review as returned by the backend; only `review_id` is inspected here. */
type ReviewRecord = Record<string, unknown>;

/** Builds the backend URL listing a job's reviews, escaping the job id as one path segment. */
function jobReviewsUrl(jobId: string): string {
  return `${API_BASE_URL}/jobs/${encodeURIComponent(jobId)}/reviews?limit=10000`;
}

/** Pulls the `reviews` array out of a backend payload, tolerating a missing or malformed field. */
function extractReviews(payload: { reviews?: unknown } | null | undefined): ReviewRecord[] {
  const reviews = payload?.reviews;
  return Array.isArray(reviews) ? reviews : [];
}

/** Response used when there is nothing to compare against: every review counts as new. */
function allNewResponse(reviews: ReviewRecord[], previousJobId: string | null) {
  const reviewsWithStatus = reviews.map((review) => ({ ...review, is_new: true }));
  return NextResponse.json({
    reviews: reviewsWithStatus,
    total_count: reviewsWithStatus.length,
    new_count: reviewsWithStatus.length,
    previous_job_id: previousJobId,
  });
}

/**
 * Returns the current job's reviews, each tagged with `is_new`.
 *
 * A review is new when its `review_id` is absent from the job named by the
 * `previous` query parameter. Without that parameter, or when the previous
 * job's reviews cannot be fetched, every review is reported as new.
 *
 * Responds with the backend's status if the current job's reviews cannot be
 * fetched, and with 500 on any unexpected failure.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ jobId: string }> }
) {
  try {
    const { jobId } = await params;
    const previousJobId = new URL(request.url).searchParams.get('previous');

    // Fetch current job reviews
    const currentResponse = await fetch(jobReviewsUrl(jobId));
    if (!currentResponse.ok) {
      return NextResponse.json(
        { error: 'Failed to get current job reviews' },
        { status: currentResponse.status }
      );
    }
    const currentReviews = extractReviews(await currentResponse.json());

    // If no previous job to compare, all reviews are "new"
    if (!previousJobId) {
      return allNewResponse(currentReviews, null);
    }

    // `previous` is caller-supplied, so it is escaped before being placed in the path.
    const previousResponse = await fetch(jobReviewsUrl(previousJobId));
    if (!previousResponse.ok) {
      // Previous job not found, treat all as new
      return allNewResponse(currentReviews, previousJobId);
    }

    // Set of previous review IDs for O(1) membership checks
    const previousReviewIds = new Set(
      extractReviews(await previousResponse.json()).map((review) => review.review_id)
    );

    // Mark reviews as new if they weren't in the previous job
    const reviewsWithStatus = currentReviews.map((review) => ({
      ...review,
      is_new: !previousReviewIds.has(review.review_id),
    }));
    const newCount = reviewsWithStatus.filter((review) => review.is_new).length;

    return NextResponse.json({
      reviews: reviewsWithStatus,
      total_count: reviewsWithStatus.length,
      new_count: newCount,
      previous_job_id: previousJobId,
    });
  } catch (error) {
    console.error('Compare API error:', error);
    return NextResponse.json(
      { error: 'Failed to compare reviews' },
      { status: 500 }
    );
  }
}
|
||||||
30
web/app/api/jobs/[jobId]/logs/route.ts
Normal file
30
web/app/api/jobs/[jobId]/logs/route.ts
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||||
|
|
||||||
|
/**
 * Proxies `GET /jobs/{jobId}/logs` from the backend API.
 *
 * On success the backend's JSON body is returned unchanged. A non-OK backend
 * response is reported with the backend's status code; any other failure
 * (network error, invalid JSON) yields a 500.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ jobId: string }> }
) {
  try {
    const { jobId } = await params;

    // Escape the id so it is always sent as a single path segment.
    const response = await fetch(`${API_BASE_URL}/jobs/${encodeURIComponent(jobId)}/logs`);

    if (!response.ok) {
      return NextResponse.json(
        { error: 'Failed to get logs' },
        { status: response.status }
      );
    }

    const data = await response.json();
    return NextResponse.json(data);
  } catch (error) {
    console.error('Logs API error:', error);
    return NextResponse.json(
      { error: 'Failed to get logs' },
      { status: 500 }
    );
  }
}
|
||||||
@@ -28,3 +28,32 @@ export async function GET(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Proxies `DELETE /jobs/{jobId}` to the backend API.
 *
 * On success returns `{ success: true, message }`. When the backend rejects
 * the request, its status code is forwarded together with the backend's
 * `detail` message if one can be read, or a generic message otherwise.
 * Network-level failures yield a 500.
 */
export async function DELETE(
  request: NextRequest,
  { params }: { params: Promise<{ jobId: string }> }
) {
  try {
    const { jobId } = await params;

    // Escape the id so it is always sent as a single path segment.
    const response = await fetch(`${API_BASE_URL}/jobs/${encodeURIComponent(jobId)}`, {
      method: 'DELETE',
    });

    if (!response.ok) {
      let detail: unknown;
      try {
        const data = await response.json();
        detail = data?.detail;
      } catch {
        // The error body was not JSON (e.g. a proxy's HTML error page). Keep
        // the backend status and fall back to the generic message instead of
        // collapsing the failure into a 500.
      }
      return NextResponse.json(
        { error: detail || 'Failed to delete job' },
        { status: response.status }
      );
    }

    return NextResponse.json({ success: true, message: 'Job deleted successfully' });
  } catch (error) {
    console.error('Delete job API error:', error);
    return NextResponse.json(
      { error: 'Failed to delete job' },
      { status: 500 }
    );
  }
}
|
||||||
|
|||||||
57
web/app/api/jobs/[jobId]/stream/route.ts
Normal file
57
web/app/api/jobs/[jobId]/stream/route.ts
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
import { NextRequest } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||||
|
|
||||||
|
export const dynamic = 'force-dynamic';
|
||||||
|
|
||||||
|
/**
 * Relays the backend's server-sent-event stream for a single job.
 *
 * Bytes from `GET /jobs/{jobId}/stream` are forwarded unchanged. If the
 * backend cannot be reached or the connection breaks, one `event: error`
 * message is emitted before the stream ends. When the client disconnects,
 * the upstream request is aborted so it does not keep running.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ jobId: string }> }
) {
  const { jobId } = await params;
  const encoder = new TextEncoder();

  // Aborting this controller tears down the upstream fetch.
  const upstream = new AbortController();
  request.signal.addEventListener('abort', () => upstream.abort(), { once: true });

  const stream = new ReadableStream<Uint8Array>({
    async start(controller) {
      try {
        const response = await fetch(`${API_BASE_URL}/jobs/${encodeURIComponent(jobId)}/stream`, {
          headers: {
            'Accept': 'text/event-stream',
            'Cache-Control': 'no-cache',
          },
          signal: upstream.signal,
        });

        if (!response.ok || !response.body) {
          controller.enqueue(encoder.encode(`event: error\ndata: {"error": "Failed to connect to backend"}\n\n`));
          // The stream is closed once, in `finally`.
          return;
        }

        const reader = response.body.getReader();

        while (true) {
          const { done, value } = await reader.read();
          if (done) {
            break;
          }
          // Forward the SSE data as-is
          controller.enqueue(value);
        }
      } catch (error) {
        // A failure caused by our own abort means the client is gone: there
        // is nobody to notify, and enqueueing on a cancelled stream throws.
        if (upstream.signal.aborted) {
          return;
        }
        console.error('SSE stream error:', error);
        controller.enqueue(encoder.encode(`event: error\ndata: {"error": "Stream connection failed"}\n\n`));
      } finally {
        try {
          controller.close();
        } catch {
          // Already closed or cancelled by the consumer; nothing left to do.
        }
      }
    },
    cancel() {
      // The consumer stopped reading: stop pulling from the backend as well.
      upstream.abort();
    },
  });

  return new Response(stream, {
    headers: {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache, no-transform',
      'Connection': 'keep-alive',
      'X-Accel-Buffering': 'no',
    },
  });
}
|
||||||
30
web/app/api/jobs/route.ts
Normal file
30
web/app/api/jobs/route.ts
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||||
|
|
||||||
|
/**
 * Lists jobs by proxying `GET /jobs` on the backend API.
 *
 * Query parameters:
 *   - `limit`: forwarded to the backend; defaults to 100 when absent or empty.
 *
 * Always responds with `{ jobs: [...] }` on success, whether the backend
 * returns a bare array or an object with a `jobs` field. A non-OK backend
 * response is reported with the backend's status; other failures yield a 500.
 */
export async function GET(request: NextRequest) {
  try {
    const { searchParams } = new URL(request.url);
    // `||` is deliberate: an empty `limit=` falls back to the default too.
    const limit = searchParams.get('limit') || '100';

    // URLSearchParams escapes the value, so a caller cannot smuggle extra
    // parameters into the backend request (e.g. `limit=1&other=x`).
    const query = new URLSearchParams({ limit });
    const response = await fetch(`${API_BASE_URL}/jobs?${query.toString()}`);

    if (!response.ok) {
      return NextResponse.json(
        { error: 'Failed to get jobs' },
        { status: response.status }
      );
    }

    const data = await response.json();
    // Backend returns array directly, not { jobs: [...] }
    const jobs = Array.isArray(data) ? data : (data?.jobs || []);
    return NextResponse.json({ jobs });
  } catch (error) {
    console.error('Jobs API error:', error);
    return NextResponse.json(
      { error: 'Failed to get jobs' },
      { status: 500 }
    );
  }
}
|
||||||
54
web/app/api/jobs/stream/route.ts
Normal file
54
web/app/api/jobs/stream/route.ts
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import { NextRequest } from 'next/server';
|
||||||
|
|
||||||
|
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||||
|
|
||||||
|
export const dynamic = 'force-dynamic';
|
||||||
|
|
||||||
|
/**
 * Relays the backend's server-sent-event stream covering all jobs.
 *
 * Bytes from `GET /jobs/stream` are forwarded unchanged. If the backend
 * cannot be reached or the connection breaks, one `event: error` message is
 * emitted before the stream ends. When the client disconnects, the upstream
 * request is aborted so it does not keep running.
 */
export async function GET(request: NextRequest) {
  const encoder = new TextEncoder();

  // Aborting this controller tears down the upstream fetch.
  const upstream = new AbortController();
  request.signal.addEventListener('abort', () => upstream.abort(), { once: true });

  const stream = new ReadableStream<Uint8Array>({
    async start(controller) {
      try {
        const response = await fetch(`${API_BASE_URL}/jobs/stream`, {
          headers: {
            'Accept': 'text/event-stream',
            'Cache-Control': 'no-cache',
          },
          signal: upstream.signal,
        });

        if (!response.ok || !response.body) {
          controller.enqueue(encoder.encode(`event: error\ndata: {"error": "Failed to connect to backend"}\n\n`));
          // The stream is closed once, in `finally`.
          return;
        }

        const reader = response.body.getReader();

        while (true) {
          const { done, value } = await reader.read();
          if (done) {
            break;
          }
          // Forward the SSE data as-is
          controller.enqueue(value);
        }
      } catch (error) {
        // A failure caused by our own abort means the client is gone: there
        // is nobody to notify, and enqueueing on a cancelled stream throws.
        if (upstream.signal.aborted) {
          return;
        }
        console.error('SSE stream error:', error);
        controller.enqueue(encoder.encode(`event: error\ndata: {"error": "Stream connection failed"}\n\n`));
      } finally {
        try {
          controller.close();
        } catch {
          // Already closed or cancelled by the consumer; nothing left to do.
        }
      }
    },
    cancel() {
      // The consumer stopped reading: stop pulling from the backend as well.
      upstream.abort();
    },
  });

  return new Response(stream, {
    headers: {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache, no-transform',
      'Connection': 'keep-alive',
      'X-Accel-Buffering': 'no',
    },
  });
}
|
||||||
@@ -4,17 +4,26 @@ const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
|||||||
|
|
||||||
export async function POST(request: NextRequest) {
|
export async function POST(request: NextRequest) {
|
||||||
try {
|
try {
|
||||||
const { url } = await request.json();
|
const body = await request.json();
|
||||||
|
const { url, business_name, business_address, rating_snapshot, total_reviews_snapshot } = body;
|
||||||
|
|
||||||
if (!url) {
|
if (!url) {
|
||||||
return NextResponse.json({ error: 'URL is required' }, { status: 400 });
|
return NextResponse.json({ error: 'URL is required' }, { status: 400 });
|
||||||
}
|
}
|
||||||
|
|
||||||
// Call the containerized scraper API
|
// Call the containerized scraper API with business metadata
|
||||||
const response = await fetch(`${API_BASE_URL}/scrape`, {
|
const response = await fetch(`${API_BASE_URL}/scrape`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ url }),
|
body: JSON.stringify({
|
||||||
|
url,
|
||||||
|
metadata: {
|
||||||
|
business_name,
|
||||||
|
business_address,
|
||||||
|
rating_snapshot,
|
||||||
|
total_reviews_snapshot,
|
||||||
|
},
|
||||||
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
|
|||||||
271
web/app/page.tsx
271
web/app/page.tsx
@@ -1,38 +1,259 @@
|
|||||||
import ScraperTest from '@/components/ScraperTest';
|
'use client';
|
||||||
|
|
||||||
|
import ScraperTest, { JobStatus } from '@/components/ScraperTest';
|
||||||
|
import ReviewAnalytics from '@/components/ReviewAnalytics';
|
||||||
|
import Sidebar from '@/components/Sidebar';
|
||||||
|
import JobsView from '@/components/JobsView';
|
||||||
|
import { useState, useCallback, useEffect } from 'react';
|
||||||
|
|
||||||
|
interface Review {
|
||||||
|
author: string;
|
||||||
|
rating: number;
|
||||||
|
text: string | null;
|
||||||
|
date_text: string;
|
||||||
|
avatar_url: string | null;
|
||||||
|
profile_url: string | null;
|
||||||
|
review_id: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ReviewWithNew extends Review {
|
||||||
|
is_new?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface SelectedJob {
|
||||||
|
reviews: ReviewWithNew[];
|
||||||
|
businessName: string;
|
||||||
|
businessUrl: string;
|
||||||
|
jobId: string;
|
||||||
|
newCount?: number;
|
||||||
|
previousJobId?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
type ViewType = 'newScrape' | 'jobs' | 'reports';
|
||||||
|
|
||||||
export default function Home() {
|
export default function Home() {
|
||||||
|
const [activeView, setActiveView] = useState<ViewType>('newScrape');
|
||||||
|
const [jobs, setJobs] = useState<JobStatus[]>([]);
|
||||||
|
const [selectedJob, setSelectedJob] = useState<SelectedJob | null>(null);
|
||||||
|
const [isLoadingJob, setIsLoadingJob] = useState<string | null>(null);
|
||||||
|
|
||||||
|
// Load jobs from API
|
||||||
|
const refreshJobs = useCallback(async () => {
|
||||||
|
try {
|
||||||
|
const response = await fetch('/api/jobs?limit=100');
|
||||||
|
if (response.ok) {
|
||||||
|
const data = await response.json();
|
||||||
|
if (data.jobs) {
|
||||||
|
setJobs(data.jobs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Failed to load jobs:', err);
|
||||||
|
}
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
// Load jobs from API on mount
|
||||||
|
useEffect(() => {
|
||||||
|
refreshJobs();
|
||||||
|
}, [refreshJobs]);
|
||||||
|
|
||||||
|
const handleJobsChange = useCallback((newJobs: JobStatus[]) => {
|
||||||
|
setJobs(prev => {
|
||||||
|
// Merge new jobs with existing, updating duplicates
|
||||||
|
const jobMap = new Map(prev.map(j => [j.job_id, j]));
|
||||||
|
newJobs.forEach(job => jobMap.set(job.job_id, job));
|
||||||
|
return Array.from(jobMap.values());
|
||||||
|
});
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const handleSelectReviews = useCallback((reviews: Review[], businessName: string, jobId: string, businessUrl?: string) => {
|
||||||
|
setSelectedJob({ reviews, businessName, businessUrl: businessUrl || '', jobId });
|
||||||
|
setActiveView('reports');
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const loadJobReviews = async (job: JobStatus, previousJob?: JobStatus) => {
|
||||||
|
if (job.status !== 'completed' || !job.reviews_count) return;
|
||||||
|
|
||||||
|
setIsLoadingJob(job.job_id);
|
||||||
|
try {
|
||||||
|
// Use compare API if we have a previous job
|
||||||
|
const url = previousJob
|
||||||
|
? `/api/jobs/${job.job_id}/compare?previous=${previousJob.job_id}`
|
||||||
|
: `/api/jobs/${job.job_id}/reviews?limit=10000`;
|
||||||
|
|
||||||
|
const response = await fetch(url);
|
||||||
|
if (!response.ok) throw new Error('Failed to fetch reviews');
|
||||||
|
const data = await response.json();
|
||||||
|
|
||||||
|
const reviews = data.reviews || [];
|
||||||
|
if (reviews.length > 0) {
|
||||||
|
// Extract business name from URL query param as fallback
|
||||||
|
let businessName = job.business_name;
|
||||||
|
if (!businessName) {
|
||||||
|
try {
|
||||||
|
const urlObj = new URL(job.url);
|
||||||
|
const query = urlObj.searchParams.get('query');
|
||||||
|
businessName = query ? decodeURIComponent(query) : 'Unknown Business';
|
||||||
|
} catch {
|
||||||
|
businessName = 'Unknown Business';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
setSelectedJob({
|
||||||
|
reviews,
|
||||||
|
businessName,
|
||||||
|
businessUrl: job.url,
|
||||||
|
jobId: job.job_id,
|
||||||
|
newCount: data.new_count,
|
||||||
|
previousJobId: previousJob?.job_id,
|
||||||
|
});
|
||||||
|
setActiveView('reports');
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Failed to load job reviews:', err);
|
||||||
|
} finally {
|
||||||
|
setIsLoadingJob(null);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const renderMainContent = () => {
|
||||||
|
switch (activeView) {
|
||||||
|
case 'newScrape':
|
||||||
return (
|
return (
|
||||||
<div className="min-h-screen bg-gradient-to-br from-blue-600 to-indigo-700 py-12 px-4">
|
<div className="h-full overflow-y-auto p-6">
|
||||||
<main className="max-w-5xl mx-auto">
|
<ScraperTest onJobsChange={handleJobsChange} onSelectReviews={handleSelectReviews} />
|
||||||
<div className="text-center mb-10">
|
|
||||||
<h1 className="text-4xl md:text-5xl font-bold text-white mb-3">
|
|
||||||
Google Reviews Scraper
|
|
||||||
</h1>
|
|
||||||
<p className="text-blue-100 text-lg">
|
|
||||||
Test the containerized scraper API
|
|
||||||
</p>
|
|
||||||
<div className="mt-4 inline-flex items-center gap-2 px-4 py-2 bg-blue-500/30 rounded-lg text-blue-100 text-sm">
|
|
||||||
<div className="w-2 h-2 bg-green-400 rounded-full animate-pulse"></div>
|
|
||||||
Powered by SeleniumBase UC Mode
|
|
||||||
</div>
|
</div>
|
||||||
|
);
|
||||||
|
|
||||||
|
case 'jobs':
|
||||||
|
return (
|
||||||
|
<JobsView
|
||||||
|
jobs={jobs}
|
||||||
|
onSelectJob={loadJobReviews}
|
||||||
|
isLoadingJob={isLoadingJob}
|
||||||
|
onRefresh={refreshJobs}
|
||||||
|
/>
|
||||||
|
);
|
||||||
|
|
||||||
|
case 'reports': {
|
||||||
|
// Get completed jobs with reviews
|
||||||
|
const completedJobs = jobs
|
||||||
|
.filter(j => j.status === 'completed' && j.reviews_count && j.reviews_count > 0)
|
||||||
|
.sort((a, b) => new Date(b.created_at).getTime() - new Date(a.created_at).getTime());
|
||||||
|
|
||||||
|
return selectedJob ? (
|
||||||
|
<div className="h-full overflow-y-auto p-6">
|
||||||
|
<div className="mb-4 flex items-center justify-between">
|
||||||
|
<h2 className="text-xl font-bold text-gray-900">Analytics</h2>
|
||||||
|
<button
|
||||||
|
onClick={() => setSelectedJob(null)}
|
||||||
|
className="px-4 py-2 bg-gray-200 hover:bg-gray-300 text-gray-700 rounded-lg font-medium transition-colors flex items-center gap-2"
|
||||||
|
>
|
||||||
|
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M10 19l-7-7m0 0l7-7m-7 7h18" />
|
||||||
|
</svg>
|
||||||
|
Back to Reports
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} />
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="h-full overflow-y-auto p-6">
|
||||||
|
<div className="mb-6">
|
||||||
|
<h2 className="text-2xl font-bold text-gray-900">Reports</h2>
|
||||||
|
<p className="text-sm text-gray-600 mt-1">
|
||||||
|
{completedJobs.length} completed {completedJobs.length === 1 ? 'scrape' : 'scrapes'} with reviews
|
||||||
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="bg-white rounded-2xl shadow-2xl p-6 md:p-8">
|
{completedJobs.length === 0 ? (
|
||||||
<ScraperTest />
|
<div className="flex flex-col items-center justify-center py-16 text-gray-500">
|
||||||
|
<svg className="w-20 h-20 mb-4 opacity-30" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={1.5} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
|
||||||
|
</svg>
|
||||||
|
<h3 className="text-xl font-semibold text-gray-700 mb-2">No Reports Yet</h3>
|
||||||
|
<p className="text-sm text-gray-500 mb-4">Complete a scrape job to see analytics reports</p>
|
||||||
|
<button
|
||||||
|
onClick={() => setActiveView('newScrape')}
|
||||||
|
className="px-4 py-2 bg-blue-600 text-white rounded-lg font-medium hover:bg-blue-700 transition-colors"
|
||||||
|
>
|
||||||
|
Start New Scrape
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="grid gap-4 md:grid-cols-2 lg:grid-cols-3">
|
||||||
|
{completedJobs.map(job => {
|
||||||
|
// Extract business name from URL as fallback
|
||||||
|
let businessName = job.business_name;
|
||||||
|
if (!businessName) {
|
||||||
|
try {
|
||||||
|
const urlObj = new URL(job.url);
|
||||||
|
const query = urlObj.searchParams.get('query');
|
||||||
|
businessName = query ? decodeURIComponent(query) : 'Unknown Business';
|
||||||
|
} catch {
|
||||||
|
businessName = 'Unknown Business';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div
|
||||||
|
key={job.job_id}
|
||||||
|
onClick={() => loadJobReviews(job)}
|
||||||
|
className="bg-white rounded-xl border-2 border-gray-200 p-5 cursor-pointer hover:border-blue-400 hover:shadow-lg transition-all"
|
||||||
|
>
|
||||||
|
<div className="flex items-start justify-between mb-3">
|
||||||
|
<h3 className="font-bold text-gray-900 truncate flex-1" title={businessName}>
|
||||||
|
{businessName}
|
||||||
|
</h3>
|
||||||
|
{job.rating_snapshot && (
|
||||||
|
<span className="flex items-center gap-1 text-yellow-600 font-semibold ml-2">
|
||||||
|
<svg className="w-4 h-4" fill="currentColor" viewBox="0 0 20 20">
|
||||||
|
<path d="M9.049 2.927c.3-.921 1.603-.921 1.902 0l1.07 3.292a1 1 0 00.95.69h3.462c.969 0 1.371 1.24.588 1.81l-2.8 2.034a1 1 0 00-.364 1.118l1.07 3.292c.3.921-.755 1.688-1.54 1.118l-2.8-2.034a1 1 0 00-1.175 0l-2.8 2.034c-.784.57-1.838-.197-1.539-1.118l1.07-3.292a1 1 0 00-.364-1.118L2.98 8.72c-.783-.57-.38-1.81.588-1.81h3.461a1 1 0 00.951-.69l1.07-3.292z" />
|
||||||
|
</svg>
|
||||||
|
{job.rating_snapshot.toFixed(1)}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="mt-8 text-center text-blue-100 text-sm space-y-2">
|
<div className="flex items-center gap-4 text-sm text-gray-600 mb-3">
|
||||||
<p className="font-medium">💡 Example URLs to test:</p>
|
<span className="font-semibold text-blue-700">{job.reviews_count} reviews</span>
|
||||||
<div className="space-y-1 text-xs">
|
{job.scrape_time && <span>{job.scrape_time.toFixed(1)}s</span>}
|
||||||
<p className="font-mono bg-blue-500/20 rounded px-3 py-1 inline-block">
|
|
||||||
https://www.google.com/maps/place/Soho+Club/...
|
|
||||||
</p>
|
|
||||||
</div>
|
</div>
|
||||||
<p className="mt-4 text-blue-200">
|
|
||||||
API running at: <span className="font-mono">localhost:8000</span>
|
<div className="text-xs text-gray-500">
|
||||||
</p>
|
{new Date(job.created_at).toLocaleDateString()} at {new Date(job.created_at).toLocaleTimeString()}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{isLoadingJob === job.job_id && (
|
||||||
|
<div className="mt-3 flex items-center gap-2 text-blue-600">
|
||||||
|
<div className="w-4 h-4 border-2 border-blue-500 border-t-transparent rounded-full animate-spin" />
|
||||||
|
<span className="text-sm font-medium">Loading...</span>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="h-screen w-screen overflow-hidden flex">
|
||||||
|
{/* Sidebar */}
|
||||||
|
<Sidebar
|
||||||
|
activeView={activeView}
|
||||||
|
onViewChange={setActiveView}
|
||||||
|
jobCount={jobs.length}
|
||||||
|
/>
|
||||||
|
|
||||||
|
{/* Main Content */}
|
||||||
|
<div className="flex-1 bg-gray-50 overflow-hidden">
|
||||||
|
{renderMainContent()}
|
||||||
</div>
|
</div>
|
||||||
</main>
|
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
1586
web/components/JobsView.tsx
Normal file
1586
web/components/JobsView.tsx
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -13,7 +13,7 @@ interface Review {
|
|||||||
review_id: string;
|
review_id: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface JobStatus {
|
export interface JobStatus {
|
||||||
job_id: string;
|
job_id: string;
|
||||||
status: 'pending' | 'running' | 'completed' | 'failed';
|
status: 'pending' | 'running' | 'completed' | 'failed';
|
||||||
url: string;
|
url: string;
|
||||||
@@ -25,9 +25,19 @@ interface JobStatus {
|
|||||||
total_reviews: number | null;
|
total_reviews: number | null;
|
||||||
scrape_time: number | null;
|
scrape_time: number | null;
|
||||||
error_message: string | null;
|
error_message: string | null;
|
||||||
|
// Business metadata for tracking and comparison
|
||||||
|
business_name: string | null;
|
||||||
|
business_address: string | null;
|
||||||
|
rating_snapshot: number | null;
|
||||||
|
total_reviews_snapshot: number | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function ScraperTest() {
|
interface ScraperTestProps {
|
||||||
|
onJobsChange?: (jobs: JobStatus[]) => void;
|
||||||
|
onSelectReviews?: (reviews: Review[], businessName: string, jobId: string) => void;
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTestProps = {}) {
|
||||||
const [searchQuery, setSearchQuery] = useState('');
|
const [searchQuery, setSearchQuery] = useState('');
|
||||||
const [searchedQuery, setSearchedQuery] = useState('');
|
const [searchedQuery, setSearchedQuery] = useState('');
|
||||||
const [jobs, setJobs] = useState<Map<string, JobStatus>>(new Map());
|
const [jobs, setJobs] = useState<Map<string, JobStatus>>(new Map());
|
||||||
@@ -44,6 +54,8 @@ export default function ScraperTest() {
|
|||||||
const [businessName, setBusinessName] = useState<string | null>(null);
|
const [businessName, setBusinessName] = useState<string | null>(null);
|
||||||
const [businessAddress, setBusinessAddress] = useState<string | null>(null);
|
const [businessAddress, setBusinessAddress] = useState<string | null>(null);
|
||||||
const [businessRating, setBusinessRating] = useState<number | null>(null);
|
const [businessRating, setBusinessRating] = useState<number | null>(null);
|
||||||
|
const [businessImage, setBusinessImage] = useState<string | null>(null);
|
||||||
|
const [businessCategory, setBusinessCategory] = useState<string | null>(null);
|
||||||
const debounceRef = useRef<NodeJS.Timeout | null>(null);
|
const debounceRef = useRef<NodeJS.Timeout | null>(null);
|
||||||
const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
|
const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
|
||||||
const abortControllerRef = useRef<AbortController | null>(null);
|
const abortControllerRef = useRef<AbortController | null>(null);
|
||||||
@@ -80,9 +92,18 @@ export default function ScraperTest() {
|
|||||||
setBusinessName(null);
|
setBusinessName(null);
|
||||||
setBusinessAddress(null);
|
setBusinessAddress(null);
|
||||||
setBusinessRating(null);
|
setBusinessRating(null);
|
||||||
|
setBusinessImage(null);
|
||||||
|
setBusinessCategory(null);
|
||||||
}
|
}
|
||||||
}, [searchQuery, searchedQuery]);
|
}, [searchQuery, searchedQuery]);
|
||||||
|
|
||||||
|
// Notify parent when jobs change
|
||||||
|
useEffect(() => {
|
||||||
|
if (onJobsChange) {
|
||||||
|
onJobsChange(Array.from(jobs.values()));
|
||||||
|
}
|
||||||
|
}, [jobs, onJobsChange]);
|
||||||
|
|
||||||
// Check for reviews function (called manually when user clicks Validate)
|
// Check for reviews function (called manually when user clicks Validate)
|
||||||
const checkReviews = async (query: string) => {
|
const checkReviews = async (query: string) => {
|
||||||
// Abort any previous validation request
|
// Abort any previous validation request
|
||||||
@@ -96,6 +117,8 @@ export default function ScraperTest() {
|
|||||||
setBusinessName(null);
|
setBusinessName(null);
|
||||||
setBusinessAddress(null);
|
setBusinessAddress(null);
|
||||||
setBusinessRating(null);
|
setBusinessRating(null);
|
||||||
|
setBusinessImage(null);
|
||||||
|
setBusinessCategory(null);
|
||||||
setError('');
|
setError('');
|
||||||
|
|
||||||
// Create new abort controller with 30 second timeout
|
// Create new abort controller with 30 second timeout
|
||||||
@@ -123,6 +146,8 @@ export default function ScraperTest() {
|
|||||||
setBusinessName(data.name);
|
setBusinessName(data.name);
|
||||||
setBusinessAddress(data.address);
|
setBusinessAddress(data.address);
|
||||||
setBusinessRating(data.rating);
|
setBusinessRating(data.rating);
|
||||||
|
setBusinessImage(data.image_url);
|
||||||
|
setBusinessCategory(data.category);
|
||||||
} else {
|
} else {
|
||||||
console.error('Failed to get business info:', data.error);
|
console.error('Failed to get business info:', data.error);
|
||||||
// Business not found
|
// Business not found
|
||||||
@@ -226,7 +251,13 @@ export default function ScraperTest() {
|
|||||||
const response = await fetch('/api/scrape', {
|
const response = await fetch('/api/scrape', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ url }),
|
body: JSON.stringify({
|
||||||
|
url,
|
||||||
|
business_name: businessName,
|
||||||
|
business_address: businessAddress,
|
||||||
|
rating_snapshot: businessRating,
|
||||||
|
total_reviews_snapshot: availableReviewCount,
|
||||||
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
@@ -245,10 +276,15 @@ export default function ScraperTest() {
|
|||||||
created_at: new Date().toISOString(),
|
created_at: new Date().toISOString(),
|
||||||
started_at: null,
|
started_at: null,
|
||||||
completed_at: null,
|
completed_at: null,
|
||||||
|
updated_at: new Date().toISOString(),
|
||||||
reviews_count: null,
|
reviews_count: null,
|
||||||
total_reviews: null,
|
total_reviews: null,
|
||||||
scrape_time: null,
|
scrape_time: null,
|
||||||
error_message: null,
|
error_message: null,
|
||||||
|
business_name: businessName,
|
||||||
|
business_address: businessAddress,
|
||||||
|
rating_snapshot: businessRating,
|
||||||
|
total_reviews_snapshot: availableReviewCount,
|
||||||
});
|
});
|
||||||
return newMap;
|
return newMap;
|
||||||
});
|
});
|
||||||
@@ -323,6 +359,7 @@ export default function ScraperTest() {
|
|||||||
{ name: '🏪 Small (~79)', query: 'R. Fleitas Peluqueros Gran Canaria' },
|
{ name: '🏪 Small (~79)', query: 'R. Fleitas Peluqueros Gran Canaria' },
|
||||||
{ name: '🚗 Medium (~589)', query: 'ClickRent Gran Canaria' },
|
{ name: '🚗 Medium (~589)', query: 'ClickRent Gran Canaria' },
|
||||||
{ name: '🏥 Large (~2000+)', query: 'Hospital Universitario Doctor Negrín Las Palmas' },
|
{ name: '🏥 Large (~2000+)', query: 'Hospital Universitario Doctor Negrín Las Palmas' },
|
||||||
|
{ name: '🛒 Alcampo', query: 'Alcampo Hipermarket Las Palmas' },
|
||||||
];
|
];
|
||||||
|
|
||||||
return (
|
return (
|
||||||
@@ -376,13 +413,33 @@ export default function ScraperTest() {
|
|||||||
<button
|
<button
|
||||||
onClick={handleSearch}
|
onClick={handleSearch}
|
||||||
disabled={searchQuery.trim().length < 2 || isCheckingReviews}
|
disabled={searchQuery.trim().length < 2 || isCheckingReviews}
|
||||||
className="px-6 py-3 bg-blue-600 text-white font-semibold rounded-xl hover:bg-blue-700 disabled:bg-gray-300 disabled:cursor-not-allowed transition-colors flex items-center gap-2"
|
className={`px-6 py-3 font-semibold rounded-xl transition-all flex items-center gap-2 ${
|
||||||
|
hasReviews === true && searchQuery.trim() === searchedQuery
|
||||||
|
? 'bg-green-600 text-white hover:bg-green-700'
|
||||||
|
: hasReviews === false && searchQuery.trim() === searchedQuery
|
||||||
|
? 'bg-yellow-500 text-white hover:bg-yellow-600'
|
||||||
|
: 'bg-blue-600 text-white hover:bg-blue-700'
|
||||||
|
} disabled:bg-gray-300 disabled:cursor-not-allowed`}
|
||||||
>
|
>
|
||||||
{isCheckingReviews ? (
|
{isCheckingReviews ? (
|
||||||
<>
|
<>
|
||||||
<div className="w-4 h-4 border-2 border-white border-t-transparent rounded-full animate-spin" />
|
<div className="w-4 h-4 border-2 border-white border-t-transparent rounded-full animate-spin" />
|
||||||
Validating...
|
Validating...
|
||||||
</>
|
</>
|
||||||
|
) : hasReviews === true && searchQuery.trim() === searchedQuery ? (
|
||||||
|
<>
|
||||||
|
<svg className="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M5 13l4 4L19 7" />
|
||||||
|
</svg>
|
||||||
|
{availableReviewCount?.toLocaleString()} reviews
|
||||||
|
</>
|
||||||
|
) : hasReviews === false && searchQuery.trim() === searchedQuery ? (
|
||||||
|
<>
|
||||||
|
<svg className="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z" />
|
||||||
|
</svg>
|
||||||
|
No reviews
|
||||||
|
</>
|
||||||
) : (
|
) : (
|
||||||
<>
|
<>
|
||||||
<svg className="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
<svg className="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
@@ -477,30 +534,53 @@ export default function ScraperTest() {
|
|||||||
{hasReviews ? (
|
{hasReviews ? (
|
||||||
// Success - Show Business Card
|
// Success - Show Business Card
|
||||||
<div className="bg-white border-2 border-green-500 rounded-2xl shadow-lg overflow-hidden mb-4">
|
<div className="bg-white border-2 border-green-500 rounded-2xl shadow-lg overflow-hidden mb-4">
|
||||||
{/* Header */}
|
{/* Business Card Layout */}
|
||||||
<div className="bg-gradient-to-r from-green-500 to-emerald-500 px-6 py-4">
|
<div className="flex">
|
||||||
<div className="flex items-center gap-2 text-white">
|
{/* Business Image */}
|
||||||
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
{businessImage && (
|
||||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M5 13l4 4L19 7" />
|
<div className="w-40 h-40 flex-shrink-0 bg-gray-200">
|
||||||
</svg>
|
<img
|
||||||
<span className="font-bold text-lg">Business Found</span>
|
src={businessImage}
|
||||||
</div>
|
alt={businessName || 'Business'}
|
||||||
|
className="w-full h-full object-cover"
|
||||||
|
onError={(e) => {
|
||||||
|
// Hide image on error
|
||||||
|
(e.target as HTMLImageElement).style.display = 'none';
|
||||||
|
}}
|
||||||
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Business Info */}
|
{/* Business Info */}
|
||||||
<div className="p-6">
|
<div className="flex-1 p-5">
|
||||||
{/* Business Name */}
|
{/* Category Badge + Verified */}
|
||||||
<h3 className="text-2xl font-bold text-gray-900 mb-3">{businessName}</h3>
|
<div className="flex items-center gap-2 mb-2">
|
||||||
|
<span className="inline-flex items-center gap-1 px-2 py-0.5 bg-green-100 text-green-700 text-xs font-semibold rounded-full">
|
||||||
|
<svg className="w-3 h-3" fill="currentColor" viewBox="0 0 20 20">
|
||||||
|
<path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clipRule="evenodd" />
|
||||||
|
</svg>
|
||||||
|
Verified
|
||||||
|
</span>
|
||||||
|
{businessCategory && (
|
||||||
|
<span className="px-2 py-0.5 bg-gray-100 text-gray-600 text-xs font-medium rounded-full">
|
||||||
|
{businessCategory}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
{/* Rating */}
|
{/* Business Name */}
|
||||||
|
<h3 className="text-xl font-bold text-gray-900 mb-2 leading-tight">{businessName}</h3>
|
||||||
|
|
||||||
|
{/* Rating + Reviews Row */}
|
||||||
|
<div className="flex items-center gap-3 mb-2">
|
||||||
{businessRating && (
|
{businessRating && (
|
||||||
<div className="flex items-center gap-1 mb-3">
|
<div className="flex items-center gap-1">
|
||||||
<span className="text-2xl font-bold text-gray-900">{businessRating.toFixed(1)}</span>
|
<span className="text-lg font-bold text-gray-900">{businessRating.toFixed(1)}</span>
|
||||||
<div className="flex items-center ml-1">
|
<div className="flex items-center">
|
||||||
{[...Array(5)].map((_, i) => (
|
{[...Array(5)].map((_, i) => (
|
||||||
<svg
|
<svg
|
||||||
key={i}
|
key={i}
|
||||||
className={`w-5 h-5 ${i < Math.floor(businessRating) ? 'text-yellow-400' : 'text-gray-300'}`}
|
className={`w-4 h-4 ${i < Math.floor(businessRating) ? 'text-yellow-400' : 'text-gray-300'}`}
|
||||||
fill="currentColor"
|
fill="currentColor"
|
||||||
viewBox="0 0 20 20"
|
viewBox="0 0 20 20"
|
||||||
>
|
>
|
||||||
@@ -510,16 +590,28 @@ export default function ScraperTest() {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
{availableReviewCount !== null && availableReviewCount > 0 && (
|
||||||
|
<span className="text-sm text-gray-600 font-medium">
|
||||||
|
({availableReviewCount.toLocaleString()} reviews)
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
{/* Address */}
|
{/* Address */}
|
||||||
{businessAddress && (
|
{businessAddress && (
|
||||||
<div className="flex items-start gap-2 text-gray-600 mb-4">
|
<div className="flex items-start gap-1.5 text-gray-500 text-sm">
|
||||||
<span className="text-lg">📍</span>
|
<svg className="w-4 h-4 mt-0.5 flex-shrink-0" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
<span className="text-sm">{businessAddress}</span>
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M17.657 16.657L13.414 20.9a1.998 1.998 0 01-2.827 0l-4.244-4.243a8 8 0 1111.314 0z" />
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M15 11a3 3 0 11-6 0 3 3 0 016 0z" />
|
||||||
|
</svg>
|
||||||
|
<span className="line-clamp-2">{businessAddress}</span>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
{/* Start Scraping Button */}
|
{/* Start Scraping Button */}
|
||||||
|
<div className="px-5 pb-5">
|
||||||
<form onSubmit={handlePreviewBusiness}>
|
<form onSubmit={handlePreviewBusiness}>
|
||||||
<button
|
<button
|
||||||
type="submit"
|
type="submit"
|
||||||
@@ -536,7 +628,7 @@ export default function ScraperTest() {
|
|||||||
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 10V3L4 14h7v7l9-11h-7z" />
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 10V3L4 14h7v7l9-11h-7z" />
|
||||||
</svg>
|
</svg>
|
||||||
Start Scraping Reviews
|
Scrape {availableReviewCount?.toLocaleString()} Reviews
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
</button>
|
</button>
|
||||||
@@ -711,7 +803,13 @@ export default function ScraperTest() {
|
|||||||
|
|
||||||
setReviews(reviewsData.reviews);
|
setReviews(reviewsData.reviews);
|
||||||
setActiveJobId(job.job_id);
|
setActiveJobId(job.job_id);
|
||||||
|
|
||||||
|
// Call parent callback if provided (for right panel display)
|
||||||
|
if (onSelectReviews) {
|
||||||
|
onSelectReviews(reviewsData.reviews, searchedQuery || 'Business', job.job_id);
|
||||||
|
} else {
|
||||||
setShowAnalytics(true);
|
setShowAnalytics(true);
|
||||||
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error('Failed to fetch reviews:', err);
|
console.error('Failed to fetch reviews:', err);
|
||||||
setError(err instanceof Error ? err.message : 'Failed to load reviews for analysis');
|
setError(err instanceof Error ? err.message : 'Failed to load reviews for analysis');
|
||||||
|
|||||||
65
web/components/Sidebar.tsx
Normal file
65
web/components/Sidebar.tsx
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
'use client';
|
||||||
|
|
||||||
|
interface SidebarProps {
|
||||||
|
activeView: 'newScrape' | 'jobs' | 'reports';
|
||||||
|
onViewChange: (view: 'newScrape' | 'jobs' | 'reports') => void;
|
||||||
|
jobCount: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function Sidebar({ activeView, onViewChange, jobCount }: SidebarProps) {
|
||||||
|
const navItems = [
|
||||||
|
{
|
||||||
|
id: 'newScrape' as const,
|
||||||
|
icon: (
|
||||||
|
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 4v16m8-8H4" />
|
||||||
|
</svg>
|
||||||
|
),
|
||||||
|
label: 'New Scrape',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'jobs' as const,
|
||||||
|
icon: (
|
||||||
|
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2" />
|
||||||
|
</svg>
|
||||||
|
),
|
||||||
|
label: 'Jobs',
|
||||||
|
badge: jobCount > 0 ? jobCount : undefined,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'reports' as const,
|
||||||
|
icon: (
|
||||||
|
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
|
||||||
|
</svg>
|
||||||
|
),
|
||||||
|
label: 'Reports',
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="w-20 bg-gray-900 flex flex-col items-center py-6 gap-2">
|
||||||
|
{navItems.map((item) => (
|
||||||
|
<button
|
||||||
|
key={item.id}
|
||||||
|
onClick={() => onViewChange(item.id)}
|
||||||
|
className={`relative w-14 h-14 rounded-xl flex flex-col items-center justify-center gap-1 transition-all ${
|
||||||
|
activeView === item.id
|
||||||
|
? 'bg-blue-600 text-white shadow-lg'
|
||||||
|
: 'text-gray-400 hover:bg-gray-800 hover:text-white'
|
||||||
|
}`}
|
||||||
|
title={item.label}
|
||||||
|
>
|
||||||
|
{item.icon}
|
||||||
|
<span className="text-[10px] font-medium">{item.label.split(' ')[0]}</span>
|
||||||
|
{item.badge !== undefined && (
|
||||||
|
<span className="absolute -top-1 -right-1 w-5 h-5 bg-red-500 text-white text-xs font-bold rounded-full flex items-center justify-center">
|
||||||
|
{item.badge > 99 ? '99+' : item.badge}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</button>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
@@ -1,5 +1,10 @@
|
|||||||
// Analytics utility functions
|
// Analytics utility functions
|
||||||
|
|
||||||
|
export interface OwnerResponse {
|
||||||
|
text: string;
|
||||||
|
timestamp?: string;
|
||||||
|
}
|
||||||
|
|
||||||
export interface Review {
|
export interface Review {
|
||||||
author: string;
|
author: string;
|
||||||
rating: number;
|
rating: number;
|
||||||
@@ -8,6 +13,8 @@ export interface Review {
|
|||||||
avatar_url: string | null;
|
avatar_url: string | null;
|
||||||
profile_url: string | null;
|
profile_url: string | null;
|
||||||
review_id: string;
|
review_id: string;
|
||||||
|
owner_response?: OwnerResponse | null;
|
||||||
|
photo_urls?: string[] | null;
|
||||||
// Derived fields (computed on load)
|
// Derived fields (computed on load)
|
||||||
parsedDate?: Date;
|
parsedDate?: Date;
|
||||||
dateCategory?: 'recent' | 'month' | 'year' | 'older'; // Time range category
|
dateCategory?: 'recent' | 'month' | 'year' | 'older'; // Time range category
|
||||||
@@ -22,6 +29,7 @@ export interface TimelineDataPoint {
|
|||||||
date: string;
|
date: string;
|
||||||
rating: number;
|
rating: number;
|
||||||
rollingAvg: number;
|
rollingAvg: number;
|
||||||
|
count: number; // Number of reviews in this period
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ReviewStats {
|
export interface ReviewStats {
|
||||||
@@ -37,6 +45,21 @@ export interface ReviewStats {
|
|||||||
negativeReviews: number;
|
negativeReviews: number;
|
||||||
responseRate: number;
|
responseRate: number;
|
||||||
averageResponseTime: string;
|
averageResponseTime: string;
|
||||||
|
// Response breakdown
|
||||||
|
responseBreakdown: { answered: number; notAnswered: number };
|
||||||
|
// New trend metrics
|
||||||
|
ratingTrend: {
|
||||||
|
recentAvg: number;
|
||||||
|
olderAvg: number;
|
||||||
|
change: number; // positive = improvement, negative = decline
|
||||||
|
periodLabel: string;
|
||||||
|
};
|
||||||
|
reviewVelocity: {
|
||||||
|
recentCount: number;
|
||||||
|
olderCount: number;
|
||||||
|
changePercent: number; // positive = more reviews, negative = fewer
|
||||||
|
periodLabel: string;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
||||||
@@ -55,19 +78,21 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
|||||||
const totalReviews = reviews.length;
|
const totalReviews = reviews.length;
|
||||||
|
|
||||||
// Average rating
|
// Average rating
|
||||||
const averageRating = reviews.reduce((sum, r) => sum + r.rating, 0) / totalReviews;
|
const averageRating = totalReviews > 0
|
||||||
|
? reviews.reduce((sum, r) => sum + r.rating, 0) / totalReviews
|
||||||
|
: 0;
|
||||||
|
|
||||||
// Sentiment score (% of 4-5 star reviews)
|
// Sentiment score (% of 4-5 star reviews)
|
||||||
const positiveReviews = reviews.filter(r => r.rating >= 4).length;
|
const positiveReviews = reviews.filter(r => r.rating >= 4).length;
|
||||||
const sentimentScore = (positiveReviews / totalReviews) * 100;
|
const sentimentScore = totalReviews > 0 ? (positiveReviews / totalReviews) * 100 : 0;
|
||||||
|
|
||||||
// Photo count (reviews with avatars as proxy)
|
// Photo count (reviews with actual photos attached)
|
||||||
const photoCount = reviews.filter(r => r.avatar_url).length;
|
const photoCount = reviews.filter(r => r.photo_urls && r.photo_urls.length > 0).length;
|
||||||
|
|
||||||
// Average review length
|
// Average review length
|
||||||
const avgReviewLength = Math.round(
|
const avgReviewLength = totalReviews > 0
|
||||||
reviews.reduce((sum, r) => sum + (r.text?.split(' ').length || 0), 0) / totalReviews
|
? Math.round(reviews.reduce((sum, r) => sum + (r.text?.split(' ').length || 0), 0) / totalReviews)
|
||||||
);
|
: 0;
|
||||||
|
|
||||||
// Recent reviews (last 30 days - simplified check)
|
// Recent reviews (last 30 days - simplified check)
|
||||||
const recentReviews = reviews.filter(r => {
|
const recentReviews = reviews.filter(r => {
|
||||||
@@ -122,11 +147,50 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
|||||||
// Negative reviews count
|
// Negative reviews count
|
||||||
const negativeReviews = reviews.filter(r => r.rating <= 2).length;
|
const negativeReviews = reviews.filter(r => r.rating <= 2).length;
|
||||||
|
|
||||||
// Response rate (placeholder - would need owner_response field)
|
// Response breakdown - count answered vs not answered reviews
|
||||||
const responseRate = 0; // TODO: Calculate when owner responses are available
|
const answeredReviews = reviews.filter(r => r.owner_response?.text).length;
|
||||||
|
const responseBreakdown = {
|
||||||
|
answered: answeredReviews,
|
||||||
|
notAnswered: totalReviews - answeredReviews,
|
||||||
|
};
|
||||||
|
|
||||||
// Average response time (placeholder)
|
// Response rate calculated from actual data
|
||||||
const averageResponseTime = 'N/A'; // TODO: Calculate when response data is available
|
const responseRate = totalReviews > 0 ? (answeredReviews / totalReviews) * 100 : 0;
|
||||||
|
|
||||||
|
// Average response time (placeholder - would need response timestamps)
|
||||||
|
const averageResponseTime = 'N/A'; // TODO: Calculate when response timestamps are available
|
||||||
|
|
||||||
|
// Rating Trend - compare recent 3 months vs previous 3 months
|
||||||
|
const now = new Date();
|
||||||
|
const threeMonthsAgo = new Date(now.getTime() - 90 * 24 * 60 * 60 * 1000);
|
||||||
|
const sixMonthsAgo = new Date(now.getTime() - 180 * 24 * 60 * 60 * 1000);
|
||||||
|
|
||||||
|
const recentReviewsForTrend = reviews.filter(r => r.centerDate && r.centerDate >= threeMonthsAgo);
|
||||||
|
const olderReviewsForTrend = reviews.filter(r => r.centerDate && r.centerDate < threeMonthsAgo && r.centerDate >= sixMonthsAgo);
|
||||||
|
|
||||||
|
const recentAvg = recentReviewsForTrend.length > 0
|
||||||
|
? recentReviewsForTrend.reduce((sum, r) => sum + r.rating, 0) / recentReviewsForTrend.length
|
||||||
|
: 0;
|
||||||
|
const olderAvg = olderReviewsForTrend.length > 0
|
||||||
|
? olderReviewsForTrend.reduce((sum, r) => sum + r.rating, 0) / olderReviewsForTrend.length
|
||||||
|
: 0;
|
||||||
|
|
||||||
|
const ratingTrend = {
|
||||||
|
recentAvg: Math.round(recentAvg * 10) / 10,
|
||||||
|
olderAvg: Math.round(olderAvg * 10) / 10,
|
||||||
|
change: Math.round((recentAvg - olderAvg) * 10) / 10,
|
||||||
|
periodLabel: 'last 3 months vs previous 3 months',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Review Velocity - compare recent 3 months vs previous 3 months
|
||||||
|
const reviewVelocity = {
|
||||||
|
recentCount: recentReviewsForTrend.length,
|
||||||
|
olderCount: olderReviewsForTrend.length,
|
||||||
|
changePercent: olderReviewsForTrend.length > 0
|
||||||
|
? Math.round(((recentReviewsForTrend.length - olderReviewsForTrend.length) / olderReviewsForTrend.length) * 100)
|
||||||
|
: (recentReviewsForTrend.length > 0 ? 100 : 0),
|
||||||
|
periodLabel: 'last 3 months vs previous 3 months',
|
||||||
|
};
|
||||||
|
|
||||||
return {
|
return {
|
||||||
totalReviews,
|
totalReviews,
|
||||||
@@ -141,6 +205,9 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
|||||||
negativeReviews,
|
negativeReviews,
|
||||||
responseRate,
|
responseRate,
|
||||||
averageResponseTime,
|
averageResponseTime,
|
||||||
|
responseBreakdown,
|
||||||
|
ratingTrend,
|
||||||
|
reviewVelocity,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -367,9 +434,10 @@ export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
|
|||||||
|
|
||||||
// Group by month
|
// Group by month
|
||||||
const monthlyData: Record<string, { ratings: number[]; date: Date }> = {};
|
const monthlyData: Record<string, { ratings: number[]; date: Date }> = {};
|
||||||
|
const monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
|
||||||
|
|
||||||
sortedReviews.forEach(review => {
|
sortedReviews.forEach(review => {
|
||||||
const monthKey = `${review.parsedDate.getFullYear()}-${String(review.parsedDate.getMonth() + 1).padStart(2, '0')}`;
|
const monthKey = `${monthNames[review.parsedDate.getMonth()]} ${review.parsedDate.getFullYear()}`;
|
||||||
|
|
||||||
if (!monthlyData[monthKey]) {
|
if (!monthlyData[monthKey]) {
|
||||||
monthlyData[monthKey] = { ratings: [], date: review.parsedDate };
|
monthlyData[monthKey] = { ratings: [], date: review.parsedDate };
|
||||||
@@ -383,8 +451,17 @@ export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
|
|||||||
date: monthKey,
|
date: monthKey,
|
||||||
rating: data.ratings.reduce((a, b) => a + b, 0) / data.ratings.length,
|
rating: data.ratings.reduce((a, b) => a + b, 0) / data.ratings.length,
|
||||||
rollingAvg: 0, // Will calculate below
|
rollingAvg: 0, // Will calculate below
|
||||||
|
count: data.ratings.length, // Number of reviews this month
|
||||||
}))
|
}))
|
||||||
.sort((a, b) => a.date.localeCompare(b.date));
|
.sort((a, b) => {
|
||||||
|
// Parse "Mon YYYY" format for sorting
|
||||||
|
const parseMonthYear = (d: string) => {
|
||||||
|
const [month, year] = d.split(' ');
|
||||||
|
const monthIndex = monthNames.indexOf(month);
|
||||||
|
return new Date(parseInt(year), monthIndex, 1).getTime();
|
||||||
|
};
|
||||||
|
return parseMonthYear(a.date) - parseMonthYear(b.date);
|
||||||
|
});
|
||||||
|
|
||||||
// Calculate 3-month rolling average
|
// Calculate 3-month rolling average
|
||||||
dataPoints.forEach((point, idx) => {
|
dataPoints.forEach((point, idx) => {
|
||||||
|
|||||||
Reference in New Issue
Block a user