Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

View File

@@ -7,10 +7,12 @@ Google's internal API responses for faster, more reliable data extraction.
import base64
import json
import logging
import os
import re
import threading
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import parse_qs, urlparse
@@ -209,38 +211,62 @@ class GoogleMapsAPIInterceptor:
intercept_script = """
(function() {
// Skip if already injected
if (window.__reviewInterceptorInjected) return;
if (window.__reviewInterceptorInjected) {
console.log('[API Interceptor] Already injected, skipping');
return;
}
window.__reviewInterceptorInjected = true;
window.__interceptedResponses = [];
window.__interceptorStats = {
totalFetch: 0,
totalXHR: 0,
capturedFetch: 0,
capturedXHR: 0,
lastCapture: null
};
console.log('[API Interceptor] Initializing...');
// Store original fetch
const originalFetch = window.fetch;
// Override fetch
window.fetch = async function(...args) {
const response = await originalFetch.apply(this, args);
window.__interceptorStats.totalFetch++;
const url = args[0].toString();
// Log ALL fetch requests for debugging
console.debug('[API Interceptor] FETCH:', url.substring(0, 150));
const response = await originalFetch.apply(this, args);
// Check if this is a review-related API call
if (url.includes('review') || url.includes('batchexecute') ||
url.includes('place') || url.includes('maps')) {
url.includes('place') || url.includes('maps') ||
url.includes('listugcposts') || url.includes('getreviews')) {
try {
const clone = response.clone();
const text = await clone.text();
console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);
window.__interceptedResponses.push({
url: url,
body: text,
timestamp: Date.now(),
type: 'fetch'
type: 'fetch',
size: text.length
});
window.__interceptorStats.capturedFetch++;
window.__interceptorStats.lastCapture = new Date().toISOString();
// Keep only last 100 responses to avoid memory issues
if (window.__interceptedResponses.length > 100) {
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
}
} catch (e) {
console.debug('Response capture error:', e);
console.error('[API Interceptor] Response capture error:', e);
}
}
@@ -259,25 +285,35 @@ class GoogleMapsAPIInterceptor:
xhr.open = function(method, url, ...rest) {
requestUrl = url;
window.__interceptorStats.totalXHR++;
console.debug('[API Interceptor] XHR:', method, url.substring(0, 150));
return originalOpen.apply(this, [method, url, ...rest]);
};
xhr.addEventListener('load', function() {
if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
requestUrl.includes('place') || requestUrl.includes('maps')) {
requestUrl.includes('place') || requestUrl.includes('maps') ||
requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
try {
console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);
window.__interceptedResponses.push({
url: requestUrl,
body: xhr.responseText,
timestamp: Date.now(),
type: 'xhr'
type: 'xhr',
status: xhr.status,
size: xhr.responseText.length
});
window.__interceptorStats.capturedXHR++;
window.__interceptorStats.lastCapture = new Date().toISOString();
if (window.__interceptedResponses.length > 100) {
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
}
} catch (e) {
console.debug('XHR capture error:', e);
console.error('[API Interceptor] XHR capture error:', e);
}
}
});
@@ -292,14 +328,30 @@ class GoogleMapsAPIInterceptor:
} catch (e) {}
}
console.log('Review API interceptor injected');
console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');
// Log stats every 10 seconds
setInterval(() => {
if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
console.log('[API Interceptor] Stats:',
'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
'Queue:', window.__interceptedResponses.length);
}
}, 10000);
return true;
})();
"""
try:
result = self.driver.execute_script(intercept_script)
log.info("JavaScript response interceptor injected")
log.info("JavaScript response interceptor injected with enhanced debugging")
# Get initial stats
stats = self.get_interceptor_stats()
log.debug(f"Interceptor stats: {stats}")
return True
except Exception as e:
log.warning(f"Failed to inject interceptor: {e}")
@@ -317,11 +369,81 @@ class GoogleMapsAPIInterceptor:
return [];
"""
responses = self.driver.execute_script(script)
if responses:
log.debug(f"Retrieved {len(responses)} intercepted responses from browser")
for resp in responses[:3]: # Log first 3 for debugging
log.debug(f" - {resp.get('type', '?').upper()}: {resp.get('url', '')[:100]} ({resp.get('size', 0)} bytes)")
else:
log.debug("No intercepted responses available")
return responses or []
except Exception as e:
log.debug(f"Error getting intercepted responses: {e}")
return []
def get_interceptor_stats(self):
"""Get statistics from the JavaScript interceptor"""
try:
script = """
if (window.__interceptorStats) {
return window.__interceptorStats;
}
return null;
"""
stats = self.driver.execute_script(script)
return stats
except Exception as e:
log.debug(f"Error getting interceptor stats: {e}")
return None
def get_browser_console_logs(self):
"""Get browser console logs (for debugging)"""
try:
logs = self.driver.get_log('browser')
return logs
except Exception as e:
log.debug(f"Could not get browser console logs: {e}")
return []
def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
"""
Dump captured responses to files for debugging.
Creates one file per response with metadata and body.
"""
try:
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
for i, response in enumerate(responses):
timestamp = response.get('timestamp', int(time.time() * 1000))
url = response.get('url', 'unknown')
req_type = response.get('type', 'unknown')
# Create filename from timestamp and type
filename = f"{timestamp}_{req_type}_{i}.json"
filepath = output_path / filename
# Write response with metadata
with open(filepath, 'w', encoding='utf-8') as f:
json.dump({
'metadata': {
'url': url,
'type': req_type,
'timestamp': timestamp,
'size': response.get('size', len(response.get('body', ''))),
'status': response.get('status')
},
'body': response.get('body', '')
}, f, indent=2, ensure_ascii=False)
log.info(f"Dumped {len(responses)} responses to {output_path}")
return str(output_path)
except Exception as e:
log.error(f"Error dumping responses to file: {e}")
return None
def _is_review_api(self, url: str) -> bool:
"""Check if URL matches review API patterns"""
url_lower = url.lower()
@@ -381,6 +503,10 @@ class GoogleMapsAPIInterceptor:
"""Parse a single response body for review data"""
reviews = []
# Skip empty or HTML responses
if not body or body.startswith('<!DOCTYPE') or body.startswith('<html'):
return reviews
# Handle batch execute format (starts with )]}' prefix)
if body.startswith(")]}'"):
body = body[4:].strip()
@@ -394,15 +520,213 @@ class GoogleMapsAPIInterceptor:
try:
data = json.loads(json_match.group())
except:
log.debug(f"Failed to parse JSON from response")
return reviews
else:
log.debug(f"No JSON found in response")
return reviews
# Extract reviews from nested structure
reviews.extend(self._extract_reviews_recursive(data))
# Special handling for listugcposts endpoint
if 'listugcposts' in url.lower():
reviews.extend(self._parse_listugcposts_response(data))
else:
# Generic recursive extraction
reviews.extend(self._extract_reviews_recursive(data))
return reviews
def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
"""
Parse Google Maps listugcposts API response.
Structure discovered:
data[2] = array of review groups
data[2][i] = single review group [review_data, metadata, continuation_token]
data[2][i][0] = review data (6-item array containing all review info)
"""
reviews = []
try:
if not isinstance(data, list) or len(data) < 3:
log.debug("Response doesn't match expected structure (not a list or too short)")
return reviews
# data[2] contains the review groups
review_groups = data[2]
if not isinstance(review_groups, list):
log.debug("data[2] is not a list")
return reviews
log.debug(f"Found {len(review_groups)} reviews in data[2]")
# Each group IS ONE REVIEW
for group_idx, group in enumerate(review_groups):
if not isinstance(group, list) or len(group) == 0:
continue
# group[0] is the review data array (6 items)
review_data = group[0]
if not isinstance(review_data, list):
continue
try:
review = self._parse_google_review_array(review_data)
if review:
reviews.append(review)
log.debug(f"Parsed review {group_idx}: {review.author} - {review.rating}")
except Exception as e:
log.debug(f"Error parsing review at group[{group_idx}]: {e}")
except Exception as e:
log.debug(f"Error in _parse_listugcposts_response: {e}")
return reviews
def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
"""
Parse a single review from Google's 6-item array format.
Discovered structure (review_data is a 6-item array):
review_data[0] = Review ID (string)
review_data[1][4][5][0] = Author Name
review_data[1][4][5][3] = User ID
review_data[1][6] = Date Text
review_data[2][0][0] = Rating (1-5)
review_data[2][15][0][0] = Review Text (original)
review_data[2][15][1][0] = Review Text (translated)
"""
review = InterceptedReview()
try:
# Extract review ID from review_data[0]
if len(review_data) > 0 and isinstance(review_data[0], str):
review.review_id = review_data[0]
# Extract author info from review_data[1][4][5]
if (len(review_data) > 1 and
isinstance(review_data[1], list) and
len(review_data[1]) > 4 and
isinstance(review_data[1][4], list) and
len(review_data[1][4]) > 5 and
isinstance(review_data[1][4][5], list)):
author_info = review_data[1][4][5]
# Author name at [1][4][5][0]
if len(author_info) > 0 and isinstance(author_info[0], str):
review.author = author_info[0]
# Profile picture at [1][4][5][1] (if available)
if len(author_info) > 1 and isinstance(author_info[1], str):
review.avatar_url = author_info[1]
# Extract date from review_data[1][6]
if (len(review_data) > 1 and
isinstance(review_data[1], list) and
len(review_data[1]) > 6 and
isinstance(review_data[1][6], str)):
review.date_text = review_data[1][6]
# Extract rating from review_data[2][0][0]
if (len(review_data) > 2 and
isinstance(review_data[2], list) and
len(review_data[2]) > 0 and
isinstance(review_data[2][0], list) and
len(review_data[2][0]) > 0):
rating_val = review_data[2][0][0]
if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
review.rating = float(rating_val)
# Extract review text from review_data[2][15][0][0]
if (len(review_data) > 2 and
isinstance(review_data[2], list) and
len(review_data[2]) > 15 and
isinstance(review_data[2][15], list) and
len(review_data[2][15]) > 0 and
isinstance(review_data[2][15][0], list) and
len(review_data[2][15][0]) > 0):
text = review_data[2][15][0][0]
if isinstance(text, str):
review.text = text
# Only return if we have minimum required data
if review.rating > 0 and (review.author or review.text):
return review
except Exception as e:
log.debug(f"Error parsing Google review array: {e}")
return None
def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
"""
Parse review from Google's nested array format.
Improved version with better field detection.
"""
review = InterceptedReview()
try:
# Extract review ID (usually a long string in first few elements)
for i, item in enumerate(arr[:5]):
if isinstance(item, str) and len(item) > 30 and not item.startswith('http'):
review.review_id = item
break
# Extract rating (number between 1-5)
for item in arr:
if isinstance(item, (int, float)) and 1 <= item <= 5:
review.rating = float(item)
break
elif isinstance(item, list):
for subitem in item:
if isinstance(subitem, (int, float)) and 1 <= subitem <= 5:
review.rating = float(subitem)
break
if review.rating > 0:
break
# Extract review text (long string, not a URL)
for item in arr:
if isinstance(item, str) and len(item) > 50 and not item.startswith('http'):
if not review.review_id or item != review.review_id:
review.text = item
break
# Extract author name (shorter string, not ID or text)
for item in arr:
if isinstance(item, str) and 3 <= len(item) <= 100:
if item != review.review_id and item != review.text and not item.startswith('http'):
review.author = item
break
elif isinstance(item, list):
for subitem in item:
if isinstance(subitem, str) and 3 <= len(subitem) <= 100:
if subitem != review.text and not subitem.startswith('http'):
review.author = subitem
break
if review.author:
break
# Extract dates (strings that look like dates)
date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
for item in arr:
if isinstance(item, str):
for pattern in date_patterns:
if re.search(pattern, item, re.IGNORECASE):
review.date_text = item
break
if review.date_text:
break
# Only return if we have meaningful data
if (review.review_id or review.author) and review.rating > 0:
return review
except Exception as e:
log.debug(f"Error in _parse_review_array_v2: {e}")
return None
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
"""Recursively search for review data in nested structures"""
reviews = []
@@ -410,6 +734,10 @@ class GoogleMapsAPIInterceptor:
if depth > 20: # Prevent infinite recursion
return reviews
# Skip if data is already an InterceptedReview object
if isinstance(data, InterceptedReview):
return [data]
if isinstance(data, dict):
# Check if this looks like a review object
review = self._try_parse_review_dict(data)
@@ -418,7 +746,8 @@ class GoogleMapsAPIInterceptor:
# Recurse into dict values
for value in data.values():
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
if not isinstance(value, InterceptedReview):
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
elif isinstance(data, list):
# Check if this array looks like a review array
@@ -428,7 +757,8 @@ class GoogleMapsAPIInterceptor:
# Recurse into list items
for item in data:
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
if not isinstance(item, InterceptedReview):
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
return reviews

359
modules/chrome_pool.py Normal file
View File

@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Chrome Worker Pool Manager
Maintains a pool of idle Chrome instances for faster scraping.
Pre-warms browsers on startup to eliminate cold-start delays.
"""
import logging
import asyncio
import time
from typing import Optional, Dict, Any
from seleniumbase import Driver
from queue import Queue, Empty
import threading
log = logging.getLogger(__name__)
class ChromeWorker:
"""Single Chrome worker instance"""
def __init__(self, worker_id: str, headless: bool = True):
self.worker_id = worker_id
self.headless = headless
self.driver: Optional[Driver] = None
self.created_at = None
self.last_used = None
self.use_count = 0
self.is_busy = False
def initialize(self):
"""Initialize Chrome driver with stability flags for unlimited scraping"""
try:
log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")
# SeleniumBase Driver automatically includes UC mode anti-detection
# Initialize with longer timeouts for large scraping jobs
self.driver = Driver(
uc=True,
headless=self.headless,
page_load_strategy="normal"
)
# Set generous timeouts for large scraping jobs
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
self.driver.set_script_timeout(60) # 1 minute for complex extraction
self.driver.maximize_window()
self.created_at = time.time()
self.last_used = time.time()
log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
return True
except Exception as e:
log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
return False
def reset(self):
"""Reset worker to clean state"""
try:
if self.driver:
# Clear cookies, cache, local storage
self.driver.delete_all_cookies()
self.driver.execute_script("window.localStorage.clear();")
self.driver.execute_script("window.sessionStorage.clear();")
log.debug(f"Worker {self.worker_id}: Reset complete")
except Exception as e:
log.warning(f"Worker {self.worker_id}: Reset failed: {e}")
def shutdown(self):
"""Shutdown worker"""
try:
if self.driver:
self.driver.quit()
log.info(f"Worker {self.worker_id}: Shutdown complete")
except Exception as e:
log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
finally:
self.driver = None
def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50):
"""Check if worker should be recycled"""
if not self.driver:
return True
age = time.time() - self.created_at if self.created_at else 0
if age > max_age_seconds:
log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
return True
if self.use_count > max_uses:
log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
return True
return False
class ChromeWorkerPool:
"""
Pool of Chrome worker instances for faster scraping.
Maintains idle workers ready to execute tasks immediately.
Workers are recycled after max age or max uses to prevent memory leaks.
"""
def __init__(self, pool_size: int = 2, headless: bool = True):
"""
Initialize worker pool.
Args:
pool_size: Number of idle workers to maintain
headless: Run Chrome in headless mode
"""
self.pool_size = pool_size
self.headless = headless
self.workers: Queue[ChromeWorker] = Queue(maxsize=pool_size)
self.active_workers: Dict[str, ChromeWorker] = {}
self.worker_counter = 0
self.lock = threading.Lock()
self.running = False
self.maintenance_thread = None
def start(self):
"""Start the worker pool"""
log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
self.running = True
# Pre-warm workers
for _ in range(self.pool_size):
self._create_worker()
# Start maintenance thread
self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
self.maintenance_thread.start()
log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")
def stop(self):
"""Stop the worker pool"""
log.info("Stopping Chrome worker pool...")
self.running = False
if self.maintenance_thread:
self.maintenance_thread.join(timeout=5)
# Shutdown all workers
while not self.workers.empty():
try:
worker = self.workers.get_nowait()
worker.shutdown()
except Empty:
break
# Shutdown active workers
with self.lock:
for worker in self.active_workers.values():
worker.shutdown()
self.active_workers.clear()
log.info("Chrome worker pool stopped")
def _create_worker(self) -> Optional[ChromeWorker]:
"""Create a new worker and add to pool"""
with self.lock:
self.worker_counter += 1
worker_id = f"worker-{self.worker_counter}"
worker = ChromeWorker(worker_id, headless=self.headless)
if worker.initialize():
try:
self.workers.put_nowait(worker)
return worker
except:
worker.shutdown()
return None
return None
def acquire_worker(self, timeout: float = 30) -> Optional[ChromeWorker]:
"""
Acquire a worker from the pool.
Args:
timeout: Maximum time to wait for a worker
Returns:
ChromeWorker instance or None if timeout
"""
try:
worker = self.workers.get(timeout=timeout)
worker.is_busy = True
worker.last_used = time.time()
worker.use_count += 1
with self.lock:
self.active_workers[worker.worker_id] = worker
log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")
# No need to create replacement - worker will be returned to pool after use
# Maintenance thread ensures pool stays at capacity
return worker
except Empty:
log.warning(f"Failed to acquire worker within {timeout}s")
return None
def release_worker(self, worker: ChromeWorker, recycle: bool = False):
"""
Release a worker back to the pool.
Args:
worker: Worker to release
recycle: Force worker recycling
"""
with self.lock:
if worker.worker_id in self.active_workers:
del self.active_workers[worker.worker_id]
worker.is_busy = False
# Check if worker should be recycled
if recycle or worker.should_recycle():
log.info(f"Recycling {worker.worker_id}")
worker.shutdown()
# Create replacement worker in background
threading.Thread(target=self._create_worker, daemon=True).start()
else:
# Reset and return to pool
worker.reset()
try:
# Non-blocking put - if pool is full, it means we have extra workers
# Just keep the worker for next time instead of destroying it
current_size = self.workers.qsize()
if current_size < self.pool_size:
self.workers.put_nowait(worker)
log.debug(f"Released {worker.worker_id} back to pool ({current_size + 1}/{self.pool_size})")
else:
# Pool already at capacity, recycle this extra worker
log.debug(f"Pool at capacity ({current_size}/{self.pool_size}), recycling extra {worker.worker_id}")
worker.shutdown()
except Exception as e:
# Unexpected error, shutdown worker
log.error(f"Failed to release {worker.worker_id}: {e}")
worker.shutdown()
def _maintenance_loop(self):
"""Background maintenance thread"""
while self.running:
try:
# Ensure pool is at capacity
current_size = self.workers.qsize()
needed = self.pool_size - current_size
if needed > 0:
log.debug(f"Pool needs {needed} more workers")
for _ in range(needed):
self._create_worker()
# Sleep for 10 seconds
time.sleep(10)
except Exception as e:
log.error(f"Maintenance loop error: {e}")
time.sleep(5)
def get_stats(self) -> Dict[str, Any]:
"""Get pool statistics"""
with self.lock:
active_count = len(self.active_workers)
return {
"pool_size": self.pool_size,
"idle_workers": self.workers.qsize(),
"active_workers": active_count,
"total_workers_created": self.worker_counter,
"headless": self.headless
}
# Global worker pool instances
validation_pool: Optional[ChromeWorkerPool] = None
scraping_pool: Optional[ChromeWorkerPool] = None
def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
"""
Start global worker pools.
Args:
validation_size: Number of workers for validation checks
scraping_size: Number of workers for scraping jobs
headless: Run Chrome in headless mode
"""
global validation_pool, scraping_pool
log.info("Starting global Chrome worker pools...")
validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
validation_pool.start()
scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
scraping_pool.start()
log.info("Global Chrome worker pools started")
def stop_worker_pools():
"""Stop global worker pools"""
global validation_pool, scraping_pool
log.info("Stopping global Chrome worker pools...")
if validation_pool:
validation_pool.stop()
validation_pool = None
if scraping_pool:
scraping_pool.stop()
scraping_pool = None
log.info("Global Chrome worker pools stopped")
def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
"""Get a worker for validation check"""
if validation_pool:
return validation_pool.acquire_worker(timeout=timeout)
return None
def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
"""Release a validation worker"""
if validation_pool:
validation_pool.release_worker(worker, recycle=recycle)
def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
"""Get a worker for scraping"""
if scraping_pool:
return scraping_pool.acquire_worker(timeout=timeout)
return None
def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
"""Release a scraping worker"""
if scraping_pool:
scraping_pool.release_worker(worker, recycle=recycle)
def get_pool_stats() -> Dict[str, Any]:
"""Get statistics for all pools"""
stats = {}
if validation_pool:
stats['validation'] = validation_pool.get_stats()
if scraping_pool:
stats['scraping'] = scraping_pool.get_stats()
return stats

521
modules/database.py Normal file
View File

@@ -0,0 +1,521 @@
#!/usr/bin/env python3
"""
PostgreSQL database module for production microservice.
Stores job metadata and reviews as JSONB.
"""
import asyncpg
import json
from datetime import datetime
from typing import Optional, List, Dict, Any
from uuid import UUID, uuid4
from enum import Enum
import logging
log = logging.getLogger(__name__)
class JobStatus(str, Enum):
"""Job status enumeration"""
PENDING = "pending"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
class DatabaseManager:
"""PostgreSQL database manager with connection pooling"""
def __init__(self, database_url: str):
"""
Initialize database manager.
Args:
database_url: PostgreSQL connection URL
Format: postgresql://user:password@host:port/database
"""
self.database_url = database_url
self.pool: Optional[asyncpg.Pool] = None
async def connect(self):
"""Create connection pool"""
log.info("Connecting to PostgreSQL database...")
self.pool = await asyncpg.create_pool(
self.database_url,
min_size=5,
max_size=20,
command_timeout=60
)
log.info("Database connection pool created")
async def disconnect(self):
"""Close connection pool"""
if self.pool:
await self.pool.close()
log.info("Database connection pool closed")
async def initialize_schema(self):
"""Create database schema if it doesn't exist"""
async with self.pool.acquire() as conn:
# Create jobs table
await conn.execute("""
CREATE TABLE IF NOT EXISTS jobs (
job_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
status VARCHAR(20) NOT NULL DEFAULT 'pending',
url TEXT NOT NULL,
webhook_url TEXT,
webhook_secret TEXT,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
started_at TIMESTAMP,
completed_at TIMESTAMP,
reviews_count INTEGER,
total_reviews INTEGER,
reviews_data JSONB,
scrape_time REAL,
error_message TEXT,
metadata JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
""")
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at DESC);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_webhook ON jobs(webhook_url) WHERE webhook_url IS NOT NULL;
""")
# Create canary results table
await conn.execute("""
CREATE TABLE IF NOT EXISTS canary_results (
id SERIAL PRIMARY KEY,
timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
success BOOLEAN NOT NULL,
reviews_count INTEGER,
scrape_time REAL,
error_message TEXT,
metadata JSONB
);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_canary_timestamp ON canary_results(timestamp DESC);
""")
# Create webhook attempts table (for retry tracking)
await conn.execute("""
CREATE TABLE IF NOT EXISTS webhook_attempts (
id SERIAL PRIMARY KEY,
job_id UUID NOT NULL REFERENCES jobs(job_id) ON DELETE CASCADE,
attempt_number INTEGER NOT NULL,
timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
success BOOLEAN NOT NULL,
status_code INTEGER,
error_message TEXT,
response_time_ms REAL
);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
""")
log.info("Database schema initialized")
# ==================== Job Operations ====================
async def create_job(
self,
url: str,
webhook_url: Optional[str] = None,
webhook_secret: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
) -> UUID:
"""
Create a new scraping job.
Args:
url: Google Maps URL to scrape
webhook_url: Optional webhook URL for notifications
webhook_secret: Optional secret for webhook signature
metadata: Optional additional metadata
Returns:
UUID of created job
"""
async with self.pool.acquire() as conn:
job_id = await conn.fetchval("""
INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
VALUES ($1, $2, $3, $4)
RETURNING job_id
""", url, webhook_url, webhook_secret, json.dumps(metadata) if metadata else None)
log.info(f"Created job {job_id} for URL: {url[:80]}...")
return job_id
async def get_job(self, job_id: UUID) -> Optional[Dict[str, Any]]:
"""
Get job by ID.
Args:
job_id: Job UUID
Returns:
Job dictionary or None if not found
"""
async with self.pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT
job_id,
status,
url,
webhook_url,
created_at,
started_at,
completed_at,
reviews_count,
reviews_data,
scrape_time,
error_message,
metadata
FROM jobs
WHERE job_id = $1
""", job_id)
if not row:
return None
return dict(row)
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
"""
Get reviews for a specific job.
Args:
job_id: Job UUID
Returns:
List of reviews or None if not found/not completed
"""
async with self.pool.acquire() as conn:
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status = 'completed'
""", job_id)
if not reviews_data:
return None
# asyncpg returns JSONB as string, need to parse it
if isinstance(reviews_data, str):
return json.loads(reviews_data)
return reviews_data
async def update_job_status(
self,
job_id: UUID,
status: JobStatus,
**kwargs
):
"""
Update job status and optional fields.
Args:
job_id: Job UUID
status: New status
**kwargs: Additional fields to update (started_at, completed_at, error_message, etc.)
"""
# Build dynamic UPDATE query
set_clauses = ["status = $2"]
params = [job_id, status.value]
param_idx = 3
if status == JobStatus.RUNNING and 'started_at' not in kwargs:
kwargs['started_at'] = datetime.now()
elif status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED] and 'completed_at' not in kwargs:
kwargs['completed_at'] = datetime.now()
for key, value in kwargs.items():
set_clauses.append(f"{key} = ${param_idx}")
params.append(value)
param_idx += 1
query = f"""
UPDATE jobs
SET {', '.join(set_clauses)}
WHERE job_id = $1
"""
async with self.pool.acquire() as conn:
await conn.execute(query, *params)
async def save_job_result(
self,
job_id: UUID,
reviews: List[Dict[str, Any]],
scrape_time: float,
total_reviews: Optional[int] = None
):
"""
Save scraping results to database.
Args:
job_id: Job UUID
reviews: List of review dictionaries
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
"""
async with self.pool.acquire() as conn:
await conn.execute("""
UPDATE jobs
SET
status = 'completed',
completed_at = NOW(),
reviews_count = $2,
total_reviews = $3,
reviews_data = $4::jsonb,
scrape_time = $5
WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
async def list_jobs(
self,
status: Optional[JobStatus] = None,
limit: int = 100,
offset: int = 0
) -> List[Dict[str, Any]]:
"""
List jobs with optional filtering.
Args:
status: Optional status filter
limit: Maximum number of jobs to return
offset: Number of jobs to skip
Returns:
List of job dictionaries
"""
async with self.pool.acquire() as conn:
if status:
rows = await conn.fetch("""
SELECT
job_id,
status,
url,
created_at,
completed_at,
reviews_count,
scrape_time,
error_message
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
LIMIT $2 OFFSET $3
""", status.value, limit, offset)
else:
rows = await conn.fetch("""
SELECT
job_id,
status,
url,
created_at,
completed_at,
reviews_count,
scrape_time,
error_message
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
""", limit, offset)
return [dict(row) for row in rows]
async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:
"""
Get completed jobs that have webhooks pending delivery.
Args:
limit: Maximum number of jobs to return
Returns:
List of job dictionaries with webhook info
"""
async with self.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
job_id,
status,
url,
webhook_url,
webhook_secret,
reviews_count,
scrape_time,
error_message,
completed_at
FROM jobs
WHERE webhook_url IS NOT NULL
AND status IN ('completed', 'failed')
AND job_id NOT IN (
SELECT job_id
FROM webhook_attempts
WHERE success = true
)
ORDER BY completed_at ASC
LIMIT $1
""", limit)
return [dict(row) for row in rows]
async def delete_job(self, job_id: UUID) -> bool:
"""
Delete a job from the database.
Args:
job_id: Job UUID
Returns:
True if deleted, False if not found
"""
async with self.pool.acquire() as conn:
result = await conn.execute("""
DELETE FROM jobs WHERE job_id = $1
""", job_id)
deleted = result.split()[-1] == "1"
if deleted:
log.info(f"Deleted job {job_id}")
return deleted
async def cleanup_old_jobs(self, max_age_days: int = 30):
"""
Delete old completed/failed jobs.
Args:
max_age_days: Maximum age in days before deletion
"""
async with self.pool.acquire() as conn:
result = await conn.execute("""
DELETE FROM jobs
WHERE status IN ('completed', 'failed', 'cancelled')
AND completed_at < NOW() - INTERVAL '%s days'
""", max_age_days)
deleted_count = int(result.split()[-1])
if deleted_count > 0:
log.info(f"Cleaned up {deleted_count} old jobs")
# ==================== Statistics ====================
async def get_stats(self) -> Dict[str, Any]:
"""
Get job statistics.
Returns:
Statistics dictionary
"""
async with self.pool.acquire() as conn:
stats = await conn.fetchrow("""
SELECT
COUNT(*) as total_jobs,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
COUNT(*) FILTER (WHERE status = 'cancelled') as cancelled,
AVG(scrape_time) FILTER (WHERE status = 'completed') as avg_scrape_time,
SUM(reviews_count) FILTER (WHERE status = 'completed') as total_reviews
FROM jobs
""")
return dict(stats)
# ==================== Canary Operations ====================
async def save_canary_result(
self,
success: bool,
reviews_count: Optional[int] = None,
scrape_time: Optional[float] = None,
error_message: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
):
"""
Save canary test result.
Args:
success: Whether canary test succeeded
reviews_count: Number of reviews scraped
scrape_time: Time taken in seconds
error_message: Error message if failed
metadata: Additional metadata
"""
async with self.pool.acquire() as conn:
await conn.execute("""
INSERT INTO canary_results (success, reviews_count, scrape_time, error_message, metadata)
VALUES ($1, $2, $3, $4, $5)
""", success, reviews_count, scrape_time, error_message, json.dumps(metadata) if metadata else None)
async def get_canary_history(self, limit: int = 100) -> List[Dict[str, Any]]:
"""
Get canary test history.
Args:
limit: Maximum number of results to return
Returns:
List of canary result dictionaries
"""
async with self.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
timestamp,
success,
reviews_count,
scrape_time,
error_message
FROM canary_results
ORDER BY timestamp DESC
LIMIT $1
""", limit)
return [dict(row) for row in rows]
# ==================== Webhook Attempts ====================
async def log_webhook_attempt(
self,
job_id: UUID,
attempt_number: int,
success: bool,
status_code: Optional[int] = None,
error_message: Optional[str] = None,
response_time_ms: Optional[float] = None
):
"""
Log a webhook delivery attempt.
Args:
job_id: Job UUID
attempt_number: Attempt number (1, 2, 3...)
success: Whether delivery succeeded
status_code: HTTP status code
error_message: Error message if failed
response_time_ms: Response time in milliseconds
"""
async with self.pool.acquire() as conn:
await conn.execute("""
INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
VALUES ($1, $2, $3, $4, $5, $6)
""", job_id, attempt_number, success, status_code, error_message, response_time_ms)

1280
modules/fast_scraper.py Normal file

File diff suppressed because it is too large Load Diff

411
modules/health_checks.py Normal file
View File

@@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Smart health check system with canary testing.
Verifies that scraping actually works, not just that services are up.
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
import os
log = logging.getLogger(__name__)
class CanaryMonitor:
"""
Background canary test monitor.
Runs actual scraping tests periodically to verify the scraper works.
This catches issues like:
- Google Maps page structure changes
- Broken CSS selectors
- GDPR consent handling issues
- Network/proxy problems
- Chrome/browser issues
"""
def __init__(
self,
db,
interval_hours: int = 4,
test_url: Optional[str] = None
):
"""
Initialize canary monitor.
Args:
db: Database manager instance
interval_hours: How often to run canary tests
test_url: Optional test URL (defaults to Soho Factory in Vilnius)
"""
self.db = db
self.interval = timedelta(hours=interval_hours)
self.test_url = test_url or os.getenv(
'CANARY_TEST_URL',
'https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/'
)
self.running = False
self.last_run: Optional[datetime] = None
self.last_success: Optional[datetime] = None
self.consecutive_failures = 0
self.last_result: Optional[Dict[str, Any]] = None
async def start(self):
"""Start the background canary monitoring"""
self.running = True
log.info(f"Canary monitor started (interval: {self.interval.total_seconds()/3600:.1f}h)")
while self.running:
try:
await self.run_canary_test()
except Exception as e:
log.error(f"Canary test failed with exception: {e}")
self.consecutive_failures += 1
# Alert if multiple consecutive failures
if self.consecutive_failures >= 3:
await self.send_alert(
f"🚨 CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row! "
f"Last error: {str(e)[:200]}"
)
# Sleep until next run
await asyncio.sleep(self.interval.total_seconds())
def stop(self):
"""Stop the background monitoring"""
self.running = False
log.info("Canary monitor stopped")
async def run_canary_test(self):
"""
Run a single canary test.
This performs an actual scrape on a known test URL and validates:
- Scraping succeeds
- Reviews are extracted
- Review count is reasonable
- Scrape time is reasonable
- Data structure is valid
"""
from modules.fast_scraper import fast_scrape_reviews
log.info(f"Running canary scrape test on {self.test_url[:60]}...")
self.last_run = datetime.now()
try:
# Run actual scrape with timeout
result = await asyncio.wait_for(
asyncio.to_thread(
fast_scrape_reviews,
url=self.test_url,
headless=True,
max_scrolls=10 # Limited for canary
),
timeout=60 # Fail if takes > 60s
)
# Validate result
checks = {
"scrape_succeeded": result['success'],
"got_reviews": result['count'] > 0,
"reasonable_count": 10 <= result['count'] <= 500,
"reasonable_time": result['time'] < 30,
"data_structure_valid": self._validate_review_structure(result.get('reviews', []))
}
all_passed = all(checks.values())
if all_passed:
# Success!
log.info(
f"✅ Canary test PASSED: {result['count']} reviews in {result['time']:.1f}s"
)
self.consecutive_failures = 0
self.last_success = datetime.now()
self.last_result = {
"status": "pass",
"reviews_count": result['count'],
"scrape_time": result['time'],
"checks": checks
}
# Save to database
await self.db.save_canary_result(
success=True,
reviews_count=result['count'],
scrape_time=result['time'],
metadata={"checks": checks}
)
else:
# Validation failed
failed_checks = [k for k, v in checks.items() if not v]
log.error(
f"❌ Canary test FAILED: validation failed on {failed_checks}"
)
self.consecutive_failures += 1
self.last_result = {
"status": "fail",
"reviews_count": result['count'],
"scrape_time": result['time'],
"checks": checks,
"failed_checks": failed_checks
}
# Save to database
await self.db.save_canary_result(
success=False,
reviews_count=result['count'],
scrape_time=result['time'],
error_message=f"Validation failed: {failed_checks}",
metadata={"checks": checks}
)
# Alert on failure
if self.consecutive_failures >= 3:
await self.send_alert(
f"🚨 CRITICAL: Canary validation failed {self.consecutive_failures} times! "
f"Failed checks: {failed_checks}"
)
except asyncio.TimeoutError:
log.error("❌ Canary test TIMEOUT (>60s)")
self.consecutive_failures += 1
self.last_result = {
"status": "timeout",
"error": "Scrape took longer than 60 seconds"
}
await self.db.save_canary_result(
success=False,
error_message="Timeout after 60 seconds"
)
if self.consecutive_failures >= 3:
await self.send_alert(
f"🚨 CRITICAL: Canary timeout {self.consecutive_failures} times!"
)
except Exception as e:
log.error(f"❌ Canary test ERROR: {e}")
self.consecutive_failures += 1
self.last_result = {
"status": "error",
"error": str(e)
}
await self.db.save_canary_result(
success=False,
error_message=str(e)
)
raise # Re-raise to trigger alert in main loop
def _validate_review_structure(self, reviews) -> bool:
"""
Validate that reviews have expected structure.
Args:
reviews: List of review dictionaries
Returns:
True if structure is valid
"""
if not reviews or len(reviews) == 0:
return False
# Check first review has required fields
first_review = reviews[0]
required_fields = ['author', 'rating', 'date_text']
return all(field in first_review for field in required_fields)
async def send_alert(self, message: str):
"""
Send alert via configured channels.
Args:
message: Alert message to send
"""
log.critical(message)
# TODO: Integrate with alerting systems
# Examples:
# Slack
slack_webhook = os.getenv('SLACK_WEBHOOK_URL')
if slack_webhook:
try:
import httpx
async with httpx.AsyncClient() as client:
await client.post(
slack_webhook,
json={"text": message},
timeout=5.0
)
log.info("Alert sent to Slack")
except Exception as e:
log.error(f"Failed to send Slack alert: {e}")
# Email (example with SMTP)
# smtp_config = os.getenv('SMTP_CONFIG')
# if smtp_config:
# await send_email(
# to=os.getenv('ALERT_EMAIL'),
# subject="Scraper Canary Alert",
# body=message
# )
# PagerDuty
# pagerduty_key = os.getenv('PAGERDUTY_KEY')
# if pagerduty_key:
# await trigger_pagerduty(message)
def get_status(self) -> Dict[str, Any]:
"""
Get current canary status.
Returns:
Status dictionary
"""
if not self.last_success:
return {
"status": "unknown",
"message": "No canary tests run yet",
"last_run": self.last_run.isoformat() if self.last_run else None
}
age = datetime.now() - self.last_success
max_age = timedelta(hours=6) # Alert if no success in 6 hours
if age > max_age:
return {
"status": "stale",
"last_success": self.last_success.isoformat(),
"age_hours": age.total_seconds() / 3600,
"consecutive_failures": self.consecutive_failures,
"message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
}
return {
"status": "healthy",
"last_success": self.last_success.isoformat(),
"last_run": self.last_run.isoformat() if self.last_run else None,
"age_minutes": age.total_seconds() / 60,
"consecutive_failures": self.consecutive_failures,
"last_result": self.last_result
}
class HealthCheckSystem:
"""
Complete health check system for production.
Provides multiple levels of health checks:
- Liveness: Is the server alive?
- Readiness: Can it handle traffic?
- Canary: Does scraping actually work?
"""
def __init__(self, db):
"""
Initialize health check system.
Args:
db: Database manager instance
"""
self.db = db
self.canary = CanaryMonitor(db, interval_hours=4)
async def start(self):
"""Start background health monitoring"""
asyncio.create_task(self.canary.start())
def stop(self):
"""Stop background health monitoring"""
self.canary.stop()
async def check_liveness(self) -> Dict[str, Any]:
"""
Liveness check: Is the server alive?
This is a simple check that always succeeds if the server is running.
Used by Kubernetes liveness probe - restart container if fails.
Returns:
Liveness status
"""
return {
"status": "alive",
"timestamp": datetime.utcnow().isoformat()
}
async def check_readiness(self) -> Dict[str, Any]:
"""
Readiness check: Can the server handle traffic?
Checks if dependencies are available.
Used by Kubernetes readiness probe - remove from load balancer if fails.
Returns:
Readiness status
"""
checks = {}
# Check database
try:
await self.db.pool.fetchval("SELECT 1")
checks["database"] = {"healthy": True}
except Exception as e:
checks["database"] = {"healthy": False, "error": str(e)}
# Overall readiness
all_healthy = all(c.get("healthy", False) for c in checks.values())
return {
"status": "ready" if all_healthy else "not_ready",
"checks": checks,
"timestamp": datetime.utcnow().isoformat()
}
async def check_canary(self) -> Dict[str, Any]:
"""
Canary check: Does scraping actually work?
Returns the latest canary test result.
Used by external monitoring (PagerDuty, DataDog) for alerts.
Returns:
Canary status
"""
return self.canary.get_status()
async def get_detailed_health(self) -> Dict[str, Any]:
"""
Get detailed health status of all components.
Returns:
Complete health status
"""
liveness = await self.check_liveness()
readiness = await self.check_readiness()
canary = await self.check_canary()
overall_healthy = (
liveness["status"] == "alive" and
readiness["status"] == "ready" and
canary["status"] in ["healthy", "unknown"] # Unknown is OK (first run)
)
return {
"status": "healthy" if overall_healthy else "degraded",
"components": {
"liveness": liveness,
"readiness": readiness,
"canary": canary
},
"timestamp": datetime.utcnow().isoformat()
}

View File

@@ -15,6 +15,8 @@ from dataclasses import dataclass, asdict
from modules.config import load_config
from modules.scraper import GoogleReviewsScraper
from modules.fast_scraper import fast_scrape_reviews
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
log = logging.getLogger("scraper")
@@ -38,18 +40,32 @@ class ScrapingJob:
created_at: datetime
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
updated_at: Optional[datetime] = None # Last update time (for progress tracking)
error_message: Optional[str] = None
reviews_count: Optional[int] = None
total_reviews: Optional[int] = None # Total reviews available (from page counter)
images_count: Optional[int] = None
progress: Dict[str, Any] = None
reviews_data: Optional[List[Dict[str, Any]]] = None # Store actual review data
scrape_time: Optional[float] = None # Time taken to scrape
def to_dict(self) -> Dict[str, Any]:
"""Convert job to dictionary for JSON serialization"""
def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
"""
Convert job to dictionary for JSON serialization
Args:
include_reviews: Whether to include the full reviews data (default: False)
"""
data = asdict(self)
# Convert datetime objects to ISO strings
for field in ['created_at', 'started_at', 'completed_at']:
if data[field]:
data[field] = data[field].isoformat()
# Exclude reviews_data by default (can be large)
if not include_reviews:
data.pop('reviews_data', None)
return data
@@ -126,6 +142,7 @@ class JobManager:
job.status = JobStatus.RUNNING
job.started_at = datetime.now()
job.updated_at = datetime.now()
job.progress = {"stage": "starting", "message": "Initializing scraper"}
# Submit job to thread pool
@@ -137,61 +154,139 @@ class JobManager:
def _run_scraping_job(self, job_id: str):
"""
Run the actual scraping job in background thread.
Args:
job_id: Job ID to run
"""
def progress_callback(current_count: int, total_count: int):
"""Update job progress during scraping"""
with self.lock:
job = self.jobs.get(job_id)
if job:
job.reviews_count = current_count
job.total_reviews = total_count
job.updated_at = datetime.now() # Update last update time
# Calculate percentage for better UX
percentage = int((current_count / total_count * 100)) if total_count > 0 else 0
job.progress = {
"stage": "scraping",
"message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
"percentage": percentage
}
worker = None
try:
with self.lock:
job = self.jobs[job_id]
job.progress = {"stage": "initializing", "message": "Setting up scraper"}
# Create scraper with job config
scraper = GoogleReviewsScraper(job.config)
# Hook into scraper progress (if available)
# This would require modifying the scraper to report progress
job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}
# Get a worker from the scraping pool
worker = get_scraping_worker(timeout=30)
if not worker:
raise Exception("No Chrome workers available. Pool may be at capacity.")
log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")
# Get config
url = job.config.get('url')
headless = job.config.get('headless', True) # Default to headless
max_scrolls = job.config.get('max_scrolls', 999999) # Effectively unlimited - relies on idle detection
with self.lock:
job.progress = {"stage": "scraping", "message": "Scraping reviews in progress"}
# Run the scraping
scraper.scrape()
# Mark job as completed
job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}
# Run the FAST scraping with progress callback using pooled worker
result = fast_scrape_reviews(
url=url,
headless=headless,
max_scrolls=max_scrolls,
progress_callback=progress_callback,
driver=worker.driver, # Use worker's driver
return_driver=True # Don't close the driver
)
# Pop the driver from result before storing
result.pop('driver', None)
# Mark job as completed or failed
with self.lock:
job.status = JobStatus.COMPLETED
job.completed_at = datetime.now()
job.progress = {"stage": "completed", "message": "Scraping completed successfully"}
# Try to get results count if available
# This would require scraper to return results
job.reviews_count = getattr(scraper, 'total_reviews', None)
job.images_count = getattr(scraper, 'total_images', None)
log.info(f"Completed scraping job {job_id}")
if result['success']:
job.status = JobStatus.COMPLETED
job.completed_at = datetime.now()
job.updated_at = datetime.now()
job.reviews_count = result['count']
job.total_reviews = result.get('total_reviews') # Store total review count from page
job.reviews_data = result['reviews'] # Store the actual reviews
job.scrape_time = result['time']
job.progress = {
"stage": "completed",
"message": f"Scraping completed successfully in {result['time']:.1f}s",
"scroll_time": result.get('scroll_time'),
"extract_time": result.get('extract_time')
}
log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
else:
job.status = JobStatus.FAILED
job.completed_at = datetime.now()
job.updated_at = datetime.now()
job.error_message = result.get('error', 'Unknown error')
job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
log.error(f"Failed scraping job {job_id}: {result.get('error')}")
except Exception as e:
log.error(f"Error in scraping job {job_id}: {e}")
import traceback
traceback.print_exc()
with self.lock:
job = self.jobs[job_id]
job.status = JobStatus.FAILED
job.completed_at = datetime.now()
job.updated_at = datetime.now()
job.error_message = str(e)
job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}
# Recycle worker on error
if worker:
log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
release_scraping_worker(worker, recycle=True)
worker = None # Mark as released
finally:
# Release worker back to pool if not already released
if worker:
log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
release_scraping_worker(worker, recycle=False)
def get_job(self, job_id: str) -> Optional[ScrapingJob]:
"""
Get job by ID.
Args:
job_id: Job ID
Returns:
Job object or None if not found
"""
with self.lock:
return self.jobs.get(job_id)
def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
"""
Get reviews data for a specific job.
Args:
job_id: Job ID
Returns:
List of reviews or None if not found/not completed
"""
with self.lock:
job = self.jobs.get(job_id)
if job and job.status == JobStatus.COMPLETED:
return job.reviews_data
return None
def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
"""
@@ -235,6 +330,7 @@ class JobManager:
job.status = JobStatus.CANCELLED
job.completed_at = datetime.now()
job.updated_at = datetime.now()
job.progress = {"stage": "cancelled", "message": "Job was cancelled"}
log.info(f"Cancelled scraping job {job_id}")

View File

@@ -1420,14 +1420,65 @@ class GoogleReviewsScraper:
try:
responses = self.api_interceptor.get_intercepted_responses()
if responses:
log.debug(f"Collected {len(responses)} network responses from browser")
# Dump first few responses for analysis
if not hasattr(self, '_dumped_responses'):
self._dumped_responses = 0
if self._dumped_responses < 5: # Dump first 5 responses
from pathlib import Path
import json
output_dir = Path("api_response_samples")
output_dir.mkdir(exist_ok=True)
for resp in responses:
if self._dumped_responses >= 5:
break
idx = self._dumped_responses
body = resp.get('body', '')
# Save full response
full_file = output_dir / f"response_{idx:02d}_full.json"
with open(full_file, 'w', encoding='utf-8') as f:
json.dump(resp, f, indent=2, ensure_ascii=False)
# Save body
body_file = output_dir / f"response_{idx:02d}_body.txt"
with open(body_file, 'w', encoding='utf-8') as f:
f.write(body)
# Try to parse and save
clean_body = body[4:].strip() if body.startswith(")]}'") else body
try:
parsed_data = json.loads(clean_body)
parsed_file = output_dir / f"response_{idx:02d}_parsed.json"
with open(parsed_file, 'w', encoding='utf-8') as f:
json.dump(parsed_data, f, indent=2, ensure_ascii=False)
log.info(f"Dumped API response {idx} to {output_dir}/ ({len(body)} bytes)")
except:
log.debug(f"Response {idx} is not JSON")
self._dumped_responses += 1
parsed = self.api_interceptor.parse_reviews_from_responses(responses)
log.debug(f"Parsed {len(parsed)} reviews from responses")
for intercepted in parsed:
if intercepted.review_id and intercepted.review_id not in api_reviews:
api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted)
if parsed:
log.debug(f"API interceptor captured {len(parsed)} reviews (total unique: {len(api_reviews)})")
log.info(f"API interceptor captured {len(parsed)} reviews (total unique API: {len(api_reviews)})")
# Log stats every 10 iterations
if attempts % 10 == 0:
stats = self.api_interceptor.get_interceptor_stats()
if stats:
log.debug(f"Interceptor stats - Fetch: {stats.get('totalFetch', 0)}/{stats.get('capturedFetch', 0)}, "
f"XHR: {stats.get('totalXHR', 0)}/{stats.get('capturedXHR', 0)}, "
f"Last: {stats.get('lastCapture', 'never')}")
except Exception as api_err:
log.debug(f"API interception error: {api_err}")
log.warning(f"API interception error: {api_err}", exc_info=True)
# Dynamic sleep: sleep less when processing many reviews, more when finding none
if len(fresh_cards) > 5:
@@ -1470,6 +1521,35 @@ class GoogleReviewsScraper:
if key not in existing or not existing.get(key):
existing[key] = value
log.info(f"After merge: {len(docs)} total reviews")
elif self.enable_api_intercept:
# Log final stats even if no reviews captured
if self.api_interceptor:
stats = self.api_interceptor.get_interceptor_stats()
if stats:
log.warning(f"⚠️ API interception was enabled but captured 0 reviews. "
f"Network stats - Fetch requests: {stats.get('capturedFetch', 0)}/{stats.get('totalFetch', 0)}, "
f"XHR requests: {stats.get('capturedXHR', 0)}/{stats.get('totalXHR', 0)}")
# Get browser console logs for debugging
console_logs = self.api_interceptor.get_browser_console_logs()
api_logs = [log_entry for log_entry in console_logs
if 'API Interceptor' in log_entry.get('message', '')]
if api_logs:
log.info(f"Found {len(api_logs)} API interceptor console messages")
for entry in api_logs[:10]: # Show first 10
log.debug(f" Console: {entry.get('message', '')[:200]}")
else:
log.debug("No API interceptor console messages found")
# In debug mode, try to dump any responses that were collected
if log.level <= logging.DEBUG:
all_responses = self.api_interceptor.get_intercepted_responses()
if all_responses:
dump_path = self.api_interceptor.dump_responses_to_file(all_responses)
if dump_path:
log.info(f"Raw responses dumped to: {dump_path}")
else:
log.warning("API interceptor stats not available")
# Save to MongoDB if enabled
if self.use_mongodb and self.mongodb:

373
modules/webhooks.py Normal file
View File

@@ -0,0 +1,373 @@
#!/usr/bin/env python3
"""
Webhook delivery system with retry logic and security.
"""
import asyncio
import hmac
import hashlib
import json
import logging
from typing import Dict, Any, Optional
from datetime import datetime
import httpx
from uuid import UUID
log = logging.getLogger(__name__)
class WebhookDeliveryError(Exception):
"""Raised when webhook delivery fails after all retries"""
pass
class WebhookManager:
"""
Manages webhook delivery with retry logic and security.
Features:
- Exponential backoff retry (3 attempts)
- HMAC signature for security
- Timeout handling
- Async delivery
- Logging of all attempts
"""
def __init__(
self,
max_retries: int = 3,
timeout: float = 10.0,
initial_retry_delay: float = 2.0
):
"""
Initialize webhook manager.
Args:
max_retries: Maximum number of delivery attempts
timeout: Request timeout in seconds
initial_retry_delay: Initial delay between retries (exponential backoff)
"""
self.max_retries = max_retries
self.timeout = timeout
self.initial_retry_delay = initial_retry_delay
def generate_signature(self, payload: str, secret: str) -> str:
"""
Generate HMAC-SHA256 signature for webhook payload.
Args:
payload: JSON string payload
secret: Webhook secret
Returns:
Hex-encoded signature
"""
return hmac.new(
secret.encode('utf-8'),
payload.encode('utf-8'),
hashlib.sha256
).hexdigest()
async def send_webhook(
self,
webhook_url: str,
payload: Dict[str, Any],
secret: Optional[str] = None,
job_id: Optional[UUID] = None,
db=None
) -> bool:
"""
Send webhook with retry logic.
Args:
webhook_url: URL to send webhook to
payload: Webhook payload dictionary
secret: Optional webhook secret for HMAC signature
job_id: Optional job ID for logging attempts
db: Optional database manager for logging
Returns:
True if delivery succeeded, False otherwise
"""
payload_json = json.dumps(payload, default=str)
for attempt in range(1, self.max_retries + 1):
try:
start_time = datetime.now()
# Prepare headers
headers = {
"Content-Type": "application/json",
"User-Agent": "GoogleReviewsScraper-Webhook/1.0"
}
# Add signature if secret provided
if secret:
signature = self.generate_signature(payload_json, secret)
headers["X-Webhook-Signature"] = f"sha256={signature}"
headers["X-Webhook-Timestamp"] = str(int(datetime.now().timestamp()))
# Send webhook
async with httpx.AsyncClient() as client:
response = await client.post(
webhook_url,
content=payload_json,
headers=headers,
timeout=self.timeout
)
response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
# Check response
if response.status_code in [200, 201, 202, 204]:
# Success
log.info(
f"Webhook delivered successfully to {webhook_url} "
f"(attempt {attempt}, {response_time_ms:.0f}ms, status {response.status_code})"
)
# Log successful attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=True,
status_code=response.status_code,
response_time_ms=response_time_ms
)
return True
else:
# Non-2xx response
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
log.warning(
f"Webhook delivery failed to {webhook_url} "
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
)
# Log failed attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=False,
status_code=response.status_code,
error_message=error_msg,
response_time_ms=response_time_ms
)
except httpx.TimeoutException as e:
error_msg = f"Timeout after {self.timeout}s"
log.warning(
f"Webhook delivery timeout to {webhook_url} "
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
)
# Log timeout attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=False,
error_message=error_msg
)
except Exception as e:
error_msg = f"{type(e).__name__}: {str(e)}"
log.error(
f"Webhook delivery error to {webhook_url} "
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
)
# Log error attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=False,
error_message=error_msg
)
# Retry with exponential backoff
if attempt < self.max_retries:
retry_delay = self.initial_retry_delay * (2 ** (attempt - 1))
log.info(f"Retrying in {retry_delay:.1f}s...")
await asyncio.sleep(retry_delay)
# All retries failed
log.error(
f"Webhook delivery failed to {webhook_url} after {self.max_retries} attempts"
)
return False
async def send_job_completed_webhook(
self,
webhook_url: str,
job_id: UUID,
status: str,
reviews_count: Optional[int] = None,
scrape_time: Optional[float] = None,
error_message: Optional[str] = None,
reviews_url: Optional[str] = None,
secret: Optional[str] = None,
db=None
) -> bool:
"""
Send job completion webhook.
Args:
webhook_url: URL to send webhook to
job_id: Job UUID
status: Job status ('completed' or 'failed')
reviews_count: Number of reviews scraped
scrape_time: Time taken in seconds
error_message: Error message if failed
reviews_url: URL to retrieve reviews
secret: Webhook secret
db: Database manager for logging
Returns:
True if delivery succeeded
"""
payload = {
"event": f"job.{status}",
"job_id": str(job_id),
"status": status,
"timestamp": datetime.utcnow().isoformat() + "Z"
}
if status == "completed":
payload.update({
"reviews_count": reviews_count,
"scrape_time": scrape_time,
"reviews_url": reviews_url
})
elif status == "failed":
payload["error_message"] = error_message
return await self.send_webhook(
webhook_url=webhook_url,
payload=payload,
secret=secret,
job_id=job_id,
db=db
)
class WebhookDispatcher:
"""
Background webhook dispatcher that processes pending webhooks.
Runs in background and delivers webhooks for completed jobs.
"""
def __init__(self, db, interval_seconds: int = 30):
"""
Initialize webhook dispatcher.
Args:
db: Database manager instance
interval_seconds: How often to check for pending webhooks
"""
self.db = db
self.interval = interval_seconds
self.webhook_manager = WebhookManager()
self.running = False
async def start(self):
"""Start the background webhook dispatcher"""
self.running = True
log.info("Webhook dispatcher started")
while self.running:
try:
await self.process_pending_webhooks()
except Exception as e:
log.error(f"Error in webhook dispatcher: {e}")
await asyncio.sleep(self.interval)
def stop(self):
"""Stop the background webhook dispatcher"""
self.running = False
log.info("Webhook dispatcher stopped")
async def process_pending_webhooks(self):
"""
Process all pending webhooks.
Fetches jobs with pending webhooks and delivers them.
"""
# Get jobs with pending webhooks
jobs = await self.db.get_pending_jobs_with_webhooks(limit=100)
if not jobs:
return
log.info(f"Processing {len(jobs)} pending webhooks...")
for job in jobs:
try:
job_id = job['job_id']
webhook_url = job['webhook_url']
webhook_secret = job.get('webhook_secret')
status = job['status']
# Build reviews URL (assuming API base URL from environment)
import os
api_base_url = os.getenv('API_BASE_URL', 'http://localhost:8000')
reviews_url = f"{api_base_url}/jobs/{job_id}/reviews"
# Send webhook
await self.webhook_manager.send_job_completed_webhook(
webhook_url=webhook_url,
job_id=job_id,
status=status,
reviews_count=job.get('reviews_count'),
scrape_time=job.get('scrape_time'),
error_message=job.get('error_message'),
reviews_url=reviews_url if status == 'completed' else None,
secret=webhook_secret,
db=self.db
)
except Exception as e:
log.error(f"Error processing webhook for job {job['job_id']}: {e}")
log.info(f"Processed {len(jobs)} webhooks")
# Webhook verification helper for client implementations
def verify_webhook_signature(payload: str, signature: str, secret: str) -> bool:
"""
Verify webhook signature (for client-side verification).
Args:
payload: Raw JSON payload string
signature: Signature from X-Webhook-Signature header (format: "sha256=...")
secret: Webhook secret
Returns:
True if signature is valid
Example:
@app.post("/webhook")
async def handle_webhook(request: Request):
payload = await request.body()
signature = request.headers.get("X-Webhook-Signature")
if not verify_webhook_signature(payload.decode(), signature, WEBHOOK_SECRET):
raise HTTPException(status_code=401, detail="Invalid signature")
# Process webhook...
"""
if not signature or not signature.startswith("sha256="):
return False
expected_signature = signature.split("sha256=", 1)[1]
computed_signature = hmac.new(
secret.encode('utf-8'),
payload.encode('utf-8'),
hashlib.sha256
).hexdigest()
return hmac.compare_digest(expected_signature, computed_signature)