Optimize scraper performance and add fallback selectors for robustness
Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -7,10 +7,12 @@ Google's internal API responses for faster, more reliable data extraction.
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
@@ -209,38 +211,62 @@ class GoogleMapsAPIInterceptor:
|
||||
intercept_script = """
|
||||
(function() {
|
||||
// Skip if already injected
|
||||
if (window.__reviewInterceptorInjected) return;
|
||||
if (window.__reviewInterceptorInjected) {
|
||||
console.log('[API Interceptor] Already injected, skipping');
|
||||
return;
|
||||
}
|
||||
window.__reviewInterceptorInjected = true;
|
||||
window.__interceptedResponses = [];
|
||||
window.__interceptorStats = {
|
||||
totalFetch: 0,
|
||||
totalXHR: 0,
|
||||
capturedFetch: 0,
|
||||
capturedXHR: 0,
|
||||
lastCapture: null
|
||||
};
|
||||
|
||||
console.log('[API Interceptor] Initializing...');
|
||||
|
||||
// Store original fetch
|
||||
const originalFetch = window.fetch;
|
||||
|
||||
// Override fetch
|
||||
window.fetch = async function(...args) {
|
||||
const response = await originalFetch.apply(this, args);
|
||||
window.__interceptorStats.totalFetch++;
|
||||
const url = args[0].toString();
|
||||
|
||||
// Log ALL fetch requests for debugging
|
||||
console.debug('[API Interceptor] FETCH:', url.substring(0, 150));
|
||||
|
||||
const response = await originalFetch.apply(this, args);
|
||||
|
||||
// Check if this is a review-related API call
|
||||
if (url.includes('review') || url.includes('batchexecute') ||
|
||||
url.includes('place') || url.includes('maps')) {
|
||||
url.includes('place') || url.includes('maps') ||
|
||||
url.includes('listugcposts') || url.includes('getreviews')) {
|
||||
try {
|
||||
const clone = response.clone();
|
||||
const text = await clone.text();
|
||||
|
||||
console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);
|
||||
|
||||
window.__interceptedResponses.push({
|
||||
url: url,
|
||||
body: text,
|
||||
timestamp: Date.now(),
|
||||
type: 'fetch'
|
||||
type: 'fetch',
|
||||
size: text.length
|
||||
});
|
||||
|
||||
window.__interceptorStats.capturedFetch++;
|
||||
window.__interceptorStats.lastCapture = new Date().toISOString();
|
||||
|
||||
// Keep only last 100 responses to avoid memory issues
|
||||
if (window.__interceptedResponses.length > 100) {
|
||||
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||
}
|
||||
} catch (e) {
|
||||
console.debug('Response capture error:', e);
|
||||
console.error('[API Interceptor] Response capture error:', e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -259,25 +285,35 @@ class GoogleMapsAPIInterceptor:
|
||||
|
||||
xhr.open = function(method, url, ...rest) {
|
||||
requestUrl = url;
|
||||
window.__interceptorStats.totalXHR++;
|
||||
console.debug('[API Interceptor] XHR:', method, url.substring(0, 150));
|
||||
return originalOpen.apply(this, [method, url, ...rest]);
|
||||
};
|
||||
|
||||
xhr.addEventListener('load', function() {
|
||||
if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
|
||||
requestUrl.includes('place') || requestUrl.includes('maps')) {
|
||||
requestUrl.includes('place') || requestUrl.includes('maps') ||
|
||||
requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
|
||||
try {
|
||||
console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);
|
||||
|
||||
window.__interceptedResponses.push({
|
||||
url: requestUrl,
|
||||
body: xhr.responseText,
|
||||
timestamp: Date.now(),
|
||||
type: 'xhr'
|
||||
type: 'xhr',
|
||||
status: xhr.status,
|
||||
size: xhr.responseText.length
|
||||
});
|
||||
|
||||
window.__interceptorStats.capturedXHR++;
|
||||
window.__interceptorStats.lastCapture = new Date().toISOString();
|
||||
|
||||
if (window.__interceptedResponses.length > 100) {
|
||||
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||
}
|
||||
} catch (e) {
|
||||
console.debug('XHR capture error:', e);
|
||||
console.error('[API Interceptor] XHR capture error:', e);
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -292,14 +328,30 @@ class GoogleMapsAPIInterceptor:
|
||||
} catch (e) {}
|
||||
}
|
||||
|
||||
console.log('Review API interceptor injected');
|
||||
console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');
|
||||
|
||||
// Log stats every 10 seconds
|
||||
setInterval(() => {
|
||||
if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
|
||||
console.log('[API Interceptor] Stats:',
|
||||
'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
|
||||
'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
|
||||
'Queue:', window.__interceptedResponses.length);
|
||||
}
|
||||
}, 10000);
|
||||
|
||||
return true;
|
||||
})();
|
||||
"""
|
||||
|
||||
try:
|
||||
result = self.driver.execute_script(intercept_script)
|
||||
log.info("JavaScript response interceptor injected")
|
||||
log.info("JavaScript response interceptor injected with enhanced debugging")
|
||||
|
||||
# Get initial stats
|
||||
stats = self.get_interceptor_stats()
|
||||
log.debug(f"Interceptor stats: {stats}")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
log.warning(f"Failed to inject interceptor: {e}")
|
||||
@@ -317,11 +369,81 @@ class GoogleMapsAPIInterceptor:
|
||||
return [];
|
||||
"""
|
||||
responses = self.driver.execute_script(script)
|
||||
|
||||
if responses:
|
||||
log.debug(f"Retrieved {len(responses)} intercepted responses from browser")
|
||||
for resp in responses[:3]: # Log first 3 for debugging
|
||||
log.debug(f" - {resp.get('type', '?').upper()}: {resp.get('url', '')[:100]} ({resp.get('size', 0)} bytes)")
|
||||
else:
|
||||
log.debug("No intercepted responses available")
|
||||
|
||||
return responses or []
|
||||
except Exception as e:
|
||||
log.debug(f"Error getting intercepted responses: {e}")
|
||||
return []
|
||||
|
||||
def get_interceptor_stats(self):
|
||||
"""Get statistics from the JavaScript interceptor"""
|
||||
try:
|
||||
script = """
|
||||
if (window.__interceptorStats) {
|
||||
return window.__interceptorStats;
|
||||
}
|
||||
return null;
|
||||
"""
|
||||
stats = self.driver.execute_script(script)
|
||||
return stats
|
||||
except Exception as e:
|
||||
log.debug(f"Error getting interceptor stats: {e}")
|
||||
return None
|
||||
|
||||
def get_browser_console_logs(self):
|
||||
"""Get browser console logs (for debugging)"""
|
||||
try:
|
||||
logs = self.driver.get_log('browser')
|
||||
return logs
|
||||
except Exception as e:
|
||||
log.debug(f"Could not get browser console logs: {e}")
|
||||
return []
|
||||
|
||||
def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
|
||||
"""
|
||||
Dump captured responses to files for debugging.
|
||||
Creates one file per response with metadata and body.
|
||||
"""
|
||||
try:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(exist_ok=True)
|
||||
|
||||
for i, response in enumerate(responses):
|
||||
timestamp = response.get('timestamp', int(time.time() * 1000))
|
||||
url = response.get('url', 'unknown')
|
||||
req_type = response.get('type', 'unknown')
|
||||
|
||||
# Create filename from timestamp and type
|
||||
filename = f"{timestamp}_{req_type}_{i}.json"
|
||||
filepath = output_path / filename
|
||||
|
||||
# Write response with metadata
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
json.dump({
|
||||
'metadata': {
|
||||
'url': url,
|
||||
'type': req_type,
|
||||
'timestamp': timestamp,
|
||||
'size': response.get('size', len(response.get('body', ''))),
|
||||
'status': response.get('status')
|
||||
},
|
||||
'body': response.get('body', '')
|
||||
}, f, indent=2, ensure_ascii=False)
|
||||
|
||||
log.info(f"Dumped {len(responses)} responses to {output_path}")
|
||||
return str(output_path)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error dumping responses to file: {e}")
|
||||
return None
|
||||
|
||||
def _is_review_api(self, url: str) -> bool:
|
||||
"""Check if URL matches review API patterns"""
|
||||
url_lower = url.lower()
|
||||
@@ -381,6 +503,10 @@ class GoogleMapsAPIInterceptor:
|
||||
"""Parse a single response body for review data"""
|
||||
reviews = []
|
||||
|
||||
# Skip empty or HTML responses
|
||||
if not body or body.startswith('<!DOCTYPE') or body.startswith('<html'):
|
||||
return reviews
|
||||
|
||||
# Handle batch execute format (starts with )]}' prefix)
|
||||
if body.startswith(")]}'"):
|
||||
body = body[4:].strip()
|
||||
@@ -394,15 +520,213 @@ class GoogleMapsAPIInterceptor:
|
||||
try:
|
||||
data = json.loads(json_match.group())
|
||||
except:
|
||||
log.debug(f"Failed to parse JSON from response")
|
||||
return reviews
|
||||
else:
|
||||
log.debug(f"No JSON found in response")
|
||||
return reviews
|
||||
|
||||
# Extract reviews from nested structure
|
||||
reviews.extend(self._extract_reviews_recursive(data))
|
||||
# Special handling for listugcposts endpoint
|
||||
if 'listugcposts' in url.lower():
|
||||
reviews.extend(self._parse_listugcposts_response(data))
|
||||
else:
|
||||
# Generic recursive extraction
|
||||
reviews.extend(self._extract_reviews_recursive(data))
|
||||
|
||||
return reviews
|
||||
|
||||
def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
|
||||
"""
|
||||
Parse Google Maps listugcposts API response.
|
||||
|
||||
Structure discovered:
|
||||
data[2] = array of review groups
|
||||
data[2][i] = single review group [review_data, metadata, continuation_token]
|
||||
data[2][i][0] = review data (6-item array containing all review info)
|
||||
"""
|
||||
reviews = []
|
||||
|
||||
try:
|
||||
if not isinstance(data, list) or len(data) < 3:
|
||||
log.debug("Response doesn't match expected structure (not a list or too short)")
|
||||
return reviews
|
||||
|
||||
# data[2] contains the review groups
|
||||
review_groups = data[2]
|
||||
if not isinstance(review_groups, list):
|
||||
log.debug("data[2] is not a list")
|
||||
return reviews
|
||||
|
||||
log.debug(f"Found {len(review_groups)} reviews in data[2]")
|
||||
|
||||
# Each group IS ONE REVIEW
|
||||
for group_idx, group in enumerate(review_groups):
|
||||
if not isinstance(group, list) or len(group) == 0:
|
||||
continue
|
||||
|
||||
# group[0] is the review data array (6 items)
|
||||
review_data = group[0]
|
||||
if not isinstance(review_data, list):
|
||||
continue
|
||||
|
||||
try:
|
||||
review = self._parse_google_review_array(review_data)
|
||||
if review:
|
||||
reviews.append(review)
|
||||
log.debug(f"Parsed review {group_idx}: {review.author} - {review.rating}★")
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing review at group[{group_idx}]: {e}")
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error in _parse_listugcposts_response: {e}")
|
||||
|
||||
return reviews
|
||||
|
||||
def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
|
||||
"""
|
||||
Parse a single review from Google's 6-item array format.
|
||||
|
||||
Discovered structure (review_data is a 6-item array):
|
||||
review_data[0] = Review ID (string)
|
||||
review_data[1][4][5][0] = Author Name
|
||||
review_data[1][4][5][3] = User ID
|
||||
review_data[1][6] = Date Text
|
||||
review_data[2][0][0] = Rating (1-5)
|
||||
review_data[2][15][0][0] = Review Text (original)
|
||||
review_data[2][15][1][0] = Review Text (translated)
|
||||
"""
|
||||
review = InterceptedReview()
|
||||
|
||||
try:
|
||||
# Extract review ID from review_data[0]
|
||||
if len(review_data) > 0 and isinstance(review_data[0], str):
|
||||
review.review_id = review_data[0]
|
||||
|
||||
# Extract author info from review_data[1][4][5]
|
||||
if (len(review_data) > 1 and
|
||||
isinstance(review_data[1], list) and
|
||||
len(review_data[1]) > 4 and
|
||||
isinstance(review_data[1][4], list) and
|
||||
len(review_data[1][4]) > 5 and
|
||||
isinstance(review_data[1][4][5], list)):
|
||||
|
||||
author_info = review_data[1][4][5]
|
||||
|
||||
# Author name at [1][4][5][0]
|
||||
if len(author_info) > 0 and isinstance(author_info[0], str):
|
||||
review.author = author_info[0]
|
||||
|
||||
# Profile picture at [1][4][5][1] (if available)
|
||||
if len(author_info) > 1 and isinstance(author_info[1], str):
|
||||
review.avatar_url = author_info[1]
|
||||
|
||||
# Extract date from review_data[1][6]
|
||||
if (len(review_data) > 1 and
|
||||
isinstance(review_data[1], list) and
|
||||
len(review_data[1]) > 6 and
|
||||
isinstance(review_data[1][6], str)):
|
||||
review.date_text = review_data[1][6]
|
||||
|
||||
# Extract rating from review_data[2][0][0]
|
||||
if (len(review_data) > 2 and
|
||||
isinstance(review_data[2], list) and
|
||||
len(review_data[2]) > 0 and
|
||||
isinstance(review_data[2][0], list) and
|
||||
len(review_data[2][0]) > 0):
|
||||
rating_val = review_data[2][0][0]
|
||||
if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
|
||||
review.rating = float(rating_val)
|
||||
|
||||
# Extract review text from review_data[2][15][0][0]
|
||||
if (len(review_data) > 2 and
|
||||
isinstance(review_data[2], list) and
|
||||
len(review_data[2]) > 15 and
|
||||
isinstance(review_data[2][15], list) and
|
||||
len(review_data[2][15]) > 0 and
|
||||
isinstance(review_data[2][15][0], list) and
|
||||
len(review_data[2][15][0]) > 0):
|
||||
text = review_data[2][15][0][0]
|
||||
if isinstance(text, str):
|
||||
review.text = text
|
||||
|
||||
# Only return if we have minimum required data
|
||||
if review.rating > 0 and (review.author or review.text):
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing Google review array: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
|
||||
"""
|
||||
Parse review from Google's nested array format.
|
||||
Improved version with better field detection.
|
||||
"""
|
||||
review = InterceptedReview()
|
||||
|
||||
try:
|
||||
# Extract review ID (usually a long string in first few elements)
|
||||
for i, item in enumerate(arr[:5]):
|
||||
if isinstance(item, str) and len(item) > 30 and not item.startswith('http'):
|
||||
review.review_id = item
|
||||
break
|
||||
|
||||
# Extract rating (number between 1-5)
|
||||
for item in arr:
|
||||
if isinstance(item, (int, float)) and 1 <= item <= 5:
|
||||
review.rating = float(item)
|
||||
break
|
||||
elif isinstance(item, list):
|
||||
for subitem in item:
|
||||
if isinstance(subitem, (int, float)) and 1 <= subitem <= 5:
|
||||
review.rating = float(subitem)
|
||||
break
|
||||
if review.rating > 0:
|
||||
break
|
||||
|
||||
# Extract review text (long string, not a URL)
|
||||
for item in arr:
|
||||
if isinstance(item, str) and len(item) > 50 and not item.startswith('http'):
|
||||
if not review.review_id or item != review.review_id:
|
||||
review.text = item
|
||||
break
|
||||
|
||||
# Extract author name (shorter string, not ID or text)
|
||||
for item in arr:
|
||||
if isinstance(item, str) and 3 <= len(item) <= 100:
|
||||
if item != review.review_id and item != review.text and not item.startswith('http'):
|
||||
review.author = item
|
||||
break
|
||||
elif isinstance(item, list):
|
||||
for subitem in item:
|
||||
if isinstance(subitem, str) and 3 <= len(subitem) <= 100:
|
||||
if subitem != review.text and not subitem.startswith('http'):
|
||||
review.author = subitem
|
||||
break
|
||||
if review.author:
|
||||
break
|
||||
|
||||
# Extract dates (strings that look like dates)
|
||||
date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
|
||||
for item in arr:
|
||||
if isinstance(item, str):
|
||||
for pattern in date_patterns:
|
||||
if re.search(pattern, item, re.IGNORECASE):
|
||||
review.date_text = item
|
||||
break
|
||||
if review.date_text:
|
||||
break
|
||||
|
||||
# Only return if we have meaningful data
|
||||
if (review.review_id or review.author) and review.rating > 0:
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error in _parse_review_array_v2: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
|
||||
"""Recursively search for review data in nested structures"""
|
||||
reviews = []
|
||||
@@ -410,6 +734,10 @@ class GoogleMapsAPIInterceptor:
|
||||
if depth > 20: # Prevent infinite recursion
|
||||
return reviews
|
||||
|
||||
# Skip if data is already an InterceptedReview object
|
||||
if isinstance(data, InterceptedReview):
|
||||
return [data]
|
||||
|
||||
if isinstance(data, dict):
|
||||
# Check if this looks like a review object
|
||||
review = self._try_parse_review_dict(data)
|
||||
@@ -418,7 +746,8 @@ class GoogleMapsAPIInterceptor:
|
||||
|
||||
# Recurse into dict values
|
||||
for value in data.values():
|
||||
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
|
||||
if not isinstance(value, InterceptedReview):
|
||||
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
|
||||
|
||||
elif isinstance(data, list):
|
||||
# Check if this array looks like a review array
|
||||
@@ -428,7 +757,8 @@ class GoogleMapsAPIInterceptor:
|
||||
|
||||
# Recurse into list items
|
||||
for item in data:
|
||||
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
|
||||
if not isinstance(item, InterceptedReview):
|
||||
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
|
||||
|
||||
return reviews
|
||||
|
||||
|
||||
359
modules/chrome_pool.py
Normal file
359
modules/chrome_pool.py
Normal file
@@ -0,0 +1,359 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Chrome Worker Pool Manager
|
||||
|
||||
Maintains a pool of idle Chrome instances for faster scraping.
|
||||
Pre-warms browsers on startup to eliminate cold-start delays.
|
||||
"""
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional, Dict, Any
|
||||
from seleniumbase import Driver
|
||||
from queue import Queue, Empty
|
||||
import threading
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ChromeWorker:
|
||||
"""Single Chrome worker instance"""
|
||||
|
||||
def __init__(self, worker_id: str, headless: bool = True):
|
||||
self.worker_id = worker_id
|
||||
self.headless = headless
|
||||
self.driver: Optional[Driver] = None
|
||||
self.created_at = None
|
||||
self.last_used = None
|
||||
self.use_count = 0
|
||||
self.is_busy = False
|
||||
|
||||
def initialize(self):
|
||||
"""Initialize Chrome driver with stability flags for unlimited scraping"""
|
||||
try:
|
||||
log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")
|
||||
|
||||
# SeleniumBase Driver automatically includes UC mode anti-detection
|
||||
# Initialize with longer timeouts for large scraping jobs
|
||||
self.driver = Driver(
|
||||
uc=True,
|
||||
headless=self.headless,
|
||||
page_load_strategy="normal"
|
||||
)
|
||||
|
||||
# Set generous timeouts for large scraping jobs
|
||||
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
|
||||
self.driver.set_script_timeout(60) # 1 minute for complex extraction
|
||||
|
||||
self.driver.maximize_window()
|
||||
self.created_at = time.time()
|
||||
self.last_used = time.time()
|
||||
log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
|
||||
return True
|
||||
except Exception as e:
|
||||
log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
|
||||
return False
|
||||
|
||||
def reset(self):
|
||||
"""Reset worker to clean state"""
|
||||
try:
|
||||
if self.driver:
|
||||
# Clear cookies, cache, local storage
|
||||
self.driver.delete_all_cookies()
|
||||
self.driver.execute_script("window.localStorage.clear();")
|
||||
self.driver.execute_script("window.sessionStorage.clear();")
|
||||
log.debug(f"Worker {self.worker_id}: Reset complete")
|
||||
except Exception as e:
|
||||
log.warning(f"Worker {self.worker_id}: Reset failed: {e}")
|
||||
|
||||
def shutdown(self):
|
||||
"""Shutdown worker"""
|
||||
try:
|
||||
if self.driver:
|
||||
self.driver.quit()
|
||||
log.info(f"Worker {self.worker_id}: Shutdown complete")
|
||||
except Exception as e:
|
||||
log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
|
||||
finally:
|
||||
self.driver = None
|
||||
|
||||
def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50):
|
||||
"""Check if worker should be recycled"""
|
||||
if not self.driver:
|
||||
return True
|
||||
|
||||
age = time.time() - self.created_at if self.created_at else 0
|
||||
if age > max_age_seconds:
|
||||
log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
|
||||
return True
|
||||
|
||||
if self.use_count > max_uses:
|
||||
log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
class ChromeWorkerPool:
|
||||
"""
|
||||
Pool of Chrome worker instances for faster scraping.
|
||||
|
||||
Maintains idle workers ready to execute tasks immediately.
|
||||
Workers are recycled after max age or max uses to prevent memory leaks.
|
||||
"""
|
||||
|
||||
def __init__(self, pool_size: int = 2, headless: bool = True):
|
||||
"""
|
||||
Initialize worker pool.
|
||||
|
||||
Args:
|
||||
pool_size: Number of idle workers to maintain
|
||||
headless: Run Chrome in headless mode
|
||||
"""
|
||||
self.pool_size = pool_size
|
||||
self.headless = headless
|
||||
self.workers: Queue[ChromeWorker] = Queue(maxsize=pool_size)
|
||||
self.active_workers: Dict[str, ChromeWorker] = {}
|
||||
self.worker_counter = 0
|
||||
self.lock = threading.Lock()
|
||||
self.running = False
|
||||
self.maintenance_thread = None
|
||||
|
||||
def start(self):
|
||||
"""Start the worker pool"""
|
||||
log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
|
||||
self.running = True
|
||||
|
||||
# Pre-warm workers
|
||||
for _ in range(self.pool_size):
|
||||
self._create_worker()
|
||||
|
||||
# Start maintenance thread
|
||||
self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
|
||||
self.maintenance_thread.start()
|
||||
|
||||
log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")
|
||||
|
||||
def stop(self):
|
||||
"""Stop the worker pool"""
|
||||
log.info("Stopping Chrome worker pool...")
|
||||
self.running = False
|
||||
|
||||
if self.maintenance_thread:
|
||||
self.maintenance_thread.join(timeout=5)
|
||||
|
||||
# Shutdown all workers
|
||||
while not self.workers.empty():
|
||||
try:
|
||||
worker = self.workers.get_nowait()
|
||||
worker.shutdown()
|
||||
except Empty:
|
||||
break
|
||||
|
||||
# Shutdown active workers
|
||||
with self.lock:
|
||||
for worker in self.active_workers.values():
|
||||
worker.shutdown()
|
||||
self.active_workers.clear()
|
||||
|
||||
log.info("Chrome worker pool stopped")
|
||||
|
||||
def _create_worker(self) -> Optional[ChromeWorker]:
|
||||
"""Create a new worker and add to pool"""
|
||||
with self.lock:
|
||||
self.worker_counter += 1
|
||||
worker_id = f"worker-{self.worker_counter}"
|
||||
|
||||
worker = ChromeWorker(worker_id, headless=self.headless)
|
||||
if worker.initialize():
|
||||
try:
|
||||
self.workers.put_nowait(worker)
|
||||
return worker
|
||||
except:
|
||||
worker.shutdown()
|
||||
return None
|
||||
return None
|
||||
|
||||
def acquire_worker(self, timeout: float = 30) -> Optional[ChromeWorker]:
|
||||
"""
|
||||
Acquire a worker from the pool.
|
||||
|
||||
Args:
|
||||
timeout: Maximum time to wait for a worker
|
||||
|
||||
Returns:
|
||||
ChromeWorker instance or None if timeout
|
||||
"""
|
||||
try:
|
||||
worker = self.workers.get(timeout=timeout)
|
||||
worker.is_busy = True
|
||||
worker.last_used = time.time()
|
||||
worker.use_count += 1
|
||||
|
||||
with self.lock:
|
||||
self.active_workers[worker.worker_id] = worker
|
||||
|
||||
log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")
|
||||
|
||||
# No need to create replacement - worker will be returned to pool after use
|
||||
# Maintenance thread ensures pool stays at capacity
|
||||
|
||||
return worker
|
||||
except Empty:
|
||||
log.warning(f"Failed to acquire worker within {timeout}s")
|
||||
return None
|
||||
|
||||
def release_worker(self, worker: ChromeWorker, recycle: bool = False):
|
||||
"""
|
||||
Release a worker back to the pool.
|
||||
|
||||
Args:
|
||||
worker: Worker to release
|
||||
recycle: Force worker recycling
|
||||
"""
|
||||
with self.lock:
|
||||
if worker.worker_id in self.active_workers:
|
||||
del self.active_workers[worker.worker_id]
|
||||
|
||||
worker.is_busy = False
|
||||
|
||||
# Check if worker should be recycled
|
||||
if recycle or worker.should_recycle():
|
||||
log.info(f"Recycling {worker.worker_id}")
|
||||
worker.shutdown()
|
||||
# Create replacement worker in background
|
||||
threading.Thread(target=self._create_worker, daemon=True).start()
|
||||
else:
|
||||
# Reset and return to pool
|
||||
worker.reset()
|
||||
try:
|
||||
# Non-blocking put - if pool is full, it means we have extra workers
|
||||
# Just keep the worker for next time instead of destroying it
|
||||
current_size = self.workers.qsize()
|
||||
if current_size < self.pool_size:
|
||||
self.workers.put_nowait(worker)
|
||||
log.debug(f"Released {worker.worker_id} back to pool ({current_size + 1}/{self.pool_size})")
|
||||
else:
|
||||
# Pool already at capacity, recycle this extra worker
|
||||
log.debug(f"Pool at capacity ({current_size}/{self.pool_size}), recycling extra {worker.worker_id}")
|
||||
worker.shutdown()
|
||||
except Exception as e:
|
||||
# Unexpected error, shutdown worker
|
||||
log.error(f"Failed to release {worker.worker_id}: {e}")
|
||||
worker.shutdown()
|
||||
|
||||
def _maintenance_loop(self):
|
||||
"""Background maintenance thread"""
|
||||
while self.running:
|
||||
try:
|
||||
# Ensure pool is at capacity
|
||||
current_size = self.workers.qsize()
|
||||
needed = self.pool_size - current_size
|
||||
|
||||
if needed > 0:
|
||||
log.debug(f"Pool needs {needed} more workers")
|
||||
for _ in range(needed):
|
||||
self._create_worker()
|
||||
|
||||
# Sleep for 10 seconds
|
||||
time.sleep(10)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Maintenance loop error: {e}")
|
||||
time.sleep(5)
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get pool statistics"""
|
||||
with self.lock:
|
||||
active_count = len(self.active_workers)
|
||||
|
||||
return {
|
||||
"pool_size": self.pool_size,
|
||||
"idle_workers": self.workers.qsize(),
|
||||
"active_workers": active_count,
|
||||
"total_workers_created": self.worker_counter,
|
||||
"headless": self.headless
|
||||
}
|
||||
|
||||
|
||||
# Global worker pool instances
|
||||
validation_pool: Optional[ChromeWorkerPool] = None
|
||||
scraping_pool: Optional[ChromeWorkerPool] = None
|
||||
|
||||
|
||||
def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
|
||||
"""
|
||||
Start global worker pools.
|
||||
|
||||
Args:
|
||||
validation_size: Number of workers for validation checks
|
||||
scraping_size: Number of workers for scraping jobs
|
||||
headless: Run Chrome in headless mode
|
||||
"""
|
||||
global validation_pool, scraping_pool
|
||||
|
||||
log.info("Starting global Chrome worker pools...")
|
||||
|
||||
validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
|
||||
validation_pool.start()
|
||||
|
||||
scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
|
||||
scraping_pool.start()
|
||||
|
||||
log.info("Global Chrome worker pools started")
|
||||
|
||||
|
||||
def stop_worker_pools():
|
||||
"""Stop global worker pools"""
|
||||
global validation_pool, scraping_pool
|
||||
|
||||
log.info("Stopping global Chrome worker pools...")
|
||||
|
||||
if validation_pool:
|
||||
validation_pool.stop()
|
||||
validation_pool = None
|
||||
|
||||
if scraping_pool:
|
||||
scraping_pool.stop()
|
||||
scraping_pool = None
|
||||
|
||||
log.info("Global Chrome worker pools stopped")
|
||||
|
||||
|
||||
def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
|
||||
"""Get a worker for validation check"""
|
||||
if validation_pool:
|
||||
return validation_pool.acquire_worker(timeout=timeout)
|
||||
return None
|
||||
|
||||
|
||||
def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
|
||||
"""Release a validation worker"""
|
||||
if validation_pool:
|
||||
validation_pool.release_worker(worker, recycle=recycle)
|
||||
|
||||
|
||||
def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
|
||||
"""Get a worker for scraping"""
|
||||
if scraping_pool:
|
||||
return scraping_pool.acquire_worker(timeout=timeout)
|
||||
return None
|
||||
|
||||
|
||||
def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
|
||||
"""Release a scraping worker"""
|
||||
if scraping_pool:
|
||||
scraping_pool.release_worker(worker, recycle=recycle)
|
||||
|
||||
|
||||
def get_pool_stats() -> Dict[str, Any]:
|
||||
"""Get statistics for all pools"""
|
||||
stats = {}
|
||||
|
||||
if validation_pool:
|
||||
stats['validation'] = validation_pool.get_stats()
|
||||
|
||||
if scraping_pool:
|
||||
stats['scraping'] = scraping_pool.get_stats()
|
||||
|
||||
return stats
|
||||
521
modules/database.py
Normal file
521
modules/database.py
Normal file
@@ -0,0 +1,521 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PostgreSQL database module for production microservice.
|
||||
Stores job metadata and reviews as JSONB.
|
||||
"""
|
||||
import asyncpg
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from uuid import UUID, uuid4
|
||||
from enum import Enum
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
|
||||
"""Job status enumeration"""
|
||||
PENDING = "pending"
|
||||
RUNNING = "running"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
CANCELLED = "cancelled"
|
||||
|
||||
|
||||
class DatabaseManager:
|
||||
"""PostgreSQL database manager with connection pooling"""
|
||||
|
||||
def __init__(self, database_url: str):
|
||||
"""
|
||||
Initialize database manager.
|
||||
|
||||
Args:
|
||||
database_url: PostgreSQL connection URL
|
||||
Format: postgresql://user:password@host:port/database
|
||||
"""
|
||||
self.database_url = database_url
|
||||
self.pool: Optional[asyncpg.Pool] = None
|
||||
|
||||
async def connect(self):
|
||||
"""Create connection pool"""
|
||||
log.info("Connecting to PostgreSQL database...")
|
||||
self.pool = await asyncpg.create_pool(
|
||||
self.database_url,
|
||||
min_size=5,
|
||||
max_size=20,
|
||||
command_timeout=60
|
||||
)
|
||||
log.info("Database connection pool created")
|
||||
|
||||
async def disconnect(self):
|
||||
"""Close connection pool"""
|
||||
if self.pool:
|
||||
await self.pool.close()
|
||||
log.info("Database connection pool closed")
|
||||
|
||||
async def initialize_schema(self):
|
||||
"""Create database schema if it doesn't exist"""
|
||||
async with self.pool.acquire() as conn:
|
||||
# Create jobs table
|
||||
await conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS jobs (
|
||||
job_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
status VARCHAR(20) NOT NULL DEFAULT 'pending',
|
||||
url TEXT NOT NULL,
|
||||
webhook_url TEXT,
|
||||
webhook_secret TEXT,
|
||||
|
||||
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
|
||||
reviews_count INTEGER,
|
||||
total_reviews INTEGER,
|
||||
reviews_data JSONB,
|
||||
scrape_time REAL,
|
||||
|
||||
error_message TEXT,
|
||||
metadata JSONB,
|
||||
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
||||
);
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||
""")
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at DESC);
|
||||
""")
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_webhook ON jobs(webhook_url) WHERE webhook_url IS NOT NULL;
|
||||
""")
|
||||
|
||||
# Create canary results table
|
||||
await conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS canary_results (
|
||||
id SERIAL PRIMARY KEY,
|
||||
timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
|
||||
success BOOLEAN NOT NULL,
|
||||
reviews_count INTEGER,
|
||||
scrape_time REAL,
|
||||
error_message TEXT,
|
||||
metadata JSONB
|
||||
);
|
||||
""")
|
||||
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_canary_timestamp ON canary_results(timestamp DESC);
|
||||
""")
|
||||
|
||||
# Create webhook attempts table (for retry tracking)
|
||||
await conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS webhook_attempts (
|
||||
id SERIAL PRIMARY KEY,
|
||||
job_id UUID NOT NULL REFERENCES jobs(job_id) ON DELETE CASCADE,
|
||||
attempt_number INTEGER NOT NULL,
|
||||
timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
|
||||
success BOOLEAN NOT NULL,
|
||||
status_code INTEGER,
|
||||
error_message TEXT,
|
||||
response_time_ms REAL
|
||||
);
|
||||
""")
|
||||
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
|
||||
""")
|
||||
|
||||
log.info("Database schema initialized")
|
||||
|
||||
# ==================== Job Operations ====================
|
||||
|
||||
async def create_job(
|
||||
self,
|
||||
url: str,
|
||||
webhook_url: Optional[str] = None,
|
||||
webhook_secret: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
) -> UUID:
|
||||
"""
|
||||
Create a new scraping job.
|
||||
|
||||
Args:
|
||||
url: Google Maps URL to scrape
|
||||
webhook_url: Optional webhook URL for notifications
|
||||
webhook_secret: Optional secret for webhook signature
|
||||
metadata: Optional additional metadata
|
||||
|
||||
Returns:
|
||||
UUID of created job
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
job_id = await conn.fetchval("""
|
||||
INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
RETURNING job_id
|
||||
""", url, webhook_url, webhook_secret, json.dumps(metadata) if metadata else None)
|
||||
|
||||
log.info(f"Created job {job_id} for URL: {url[:80]}...")
|
||||
return job_id
|
||||
|
||||
async def get_job(self, job_id: UUID) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get job by ID.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
|
||||
Returns:
|
||||
Job dictionary or None if not found
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
row = await conn.fetchrow("""
|
||||
SELECT
|
||||
job_id,
|
||||
status,
|
||||
url,
|
||||
webhook_url,
|
||||
created_at,
|
||||
started_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
reviews_data,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
FROM jobs
|
||||
WHERE job_id = $1
|
||||
""", job_id)
|
||||
|
||||
if not row:
|
||||
return None
|
||||
|
||||
return dict(row)
|
||||
|
||||
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
|
||||
"""
|
||||
Get reviews for a specific job.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
|
||||
Returns:
|
||||
List of reviews or None if not found/not completed
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
reviews_data = await conn.fetchval("""
|
||||
SELECT reviews_data
|
||||
FROM jobs
|
||||
WHERE job_id = $1 AND status = 'completed'
|
||||
""", job_id)
|
||||
|
||||
if not reviews_data:
|
||||
return None
|
||||
|
||||
# asyncpg returns JSONB as string, need to parse it
|
||||
if isinstance(reviews_data, str):
|
||||
return json.loads(reviews_data)
|
||||
|
||||
return reviews_data
|
||||
|
||||
async def update_job_status(
|
||||
self,
|
||||
job_id: UUID,
|
||||
status: JobStatus,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Update job status and optional fields.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
status: New status
|
||||
**kwargs: Additional fields to update (started_at, completed_at, error_message, etc.)
|
||||
"""
|
||||
# Build dynamic UPDATE query
|
||||
set_clauses = ["status = $2"]
|
||||
params = [job_id, status.value]
|
||||
param_idx = 3
|
||||
|
||||
if status == JobStatus.RUNNING and 'started_at' not in kwargs:
|
||||
kwargs['started_at'] = datetime.now()
|
||||
elif status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED] and 'completed_at' not in kwargs:
|
||||
kwargs['completed_at'] = datetime.now()
|
||||
|
||||
for key, value in kwargs.items():
|
||||
set_clauses.append(f"{key} = ${param_idx}")
|
||||
params.append(value)
|
||||
param_idx += 1
|
||||
|
||||
query = f"""
|
||||
UPDATE jobs
|
||||
SET {', '.join(set_clauses)}
|
||||
WHERE job_id = $1
|
||||
"""
|
||||
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute(query, *params)
|
||||
|
||||
async def save_job_result(
|
||||
self,
|
||||
job_id: UUID,
|
||||
reviews: List[Dict[str, Any]],
|
||||
scrape_time: float,
|
||||
total_reviews: Optional[int] = None
|
||||
):
|
||||
"""
|
||||
Save scraping results to database.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
reviews: List of review dictionaries
|
||||
scrape_time: Time taken to scrape in seconds
|
||||
total_reviews: Total reviews available (from page counter)
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET
|
||||
status = 'completed',
|
||||
completed_at = NOW(),
|
||||
reviews_count = $2,
|
||||
total_reviews = $3,
|
||||
reviews_data = $4::jsonb,
|
||||
scrape_time = $5
|
||||
WHERE job_id = $1
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
|
||||
|
||||
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
async def list_jobs(
|
||||
self,
|
||||
status: Optional[JobStatus] = None,
|
||||
limit: int = 100,
|
||||
offset: int = 0
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
List jobs with optional filtering.
|
||||
|
||||
Args:
|
||||
status: Optional status filter
|
||||
limit: Maximum number of jobs to return
|
||||
offset: Number of jobs to skip
|
||||
|
||||
Returns:
|
||||
List of job dictionaries
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
if status:
|
||||
rows = await conn.fetch("""
|
||||
SELECT
|
||||
job_id,
|
||||
status,
|
||||
url,
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
scrape_time,
|
||||
error_message
|
||||
FROM jobs
|
||||
WHERE status = $1
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $2 OFFSET $3
|
||||
""", status.value, limit, offset)
|
||||
else:
|
||||
rows = await conn.fetch("""
|
||||
SELECT
|
||||
job_id,
|
||||
status,
|
||||
url,
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
scrape_time,
|
||||
error_message
|
||||
FROM jobs
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
""", limit, offset)
|
||||
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get completed jobs that have webhooks pending delivery.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of jobs to return
|
||||
|
||||
Returns:
|
||||
List of job dictionaries with webhook info
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
rows = await conn.fetch("""
|
||||
SELECT
|
||||
job_id,
|
||||
status,
|
||||
url,
|
||||
webhook_url,
|
||||
webhook_secret,
|
||||
reviews_count,
|
||||
scrape_time,
|
||||
error_message,
|
||||
completed_at
|
||||
FROM jobs
|
||||
WHERE webhook_url IS NOT NULL
|
||||
AND status IN ('completed', 'failed')
|
||||
AND job_id NOT IN (
|
||||
SELECT job_id
|
||||
FROM webhook_attempts
|
||||
WHERE success = true
|
||||
)
|
||||
ORDER BY completed_at ASC
|
||||
LIMIT $1
|
||||
""", limit)
|
||||
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def delete_job(self, job_id: UUID) -> bool:
|
||||
"""
|
||||
Delete a job from the database.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
|
||||
Returns:
|
||||
True if deleted, False if not found
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
result = await conn.execute("""
|
||||
DELETE FROM jobs WHERE job_id = $1
|
||||
""", job_id)
|
||||
|
||||
deleted = result.split()[-1] == "1"
|
||||
if deleted:
|
||||
log.info(f"Deleted job {job_id}")
|
||||
return deleted
|
||||
|
||||
async def cleanup_old_jobs(self, max_age_days: int = 30):
|
||||
"""
|
||||
Delete old completed/failed jobs.
|
||||
|
||||
Args:
|
||||
max_age_days: Maximum age in days before deletion
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
result = await conn.execute("""
|
||||
DELETE FROM jobs
|
||||
WHERE status IN ('completed', 'failed', 'cancelled')
|
||||
AND completed_at < NOW() - INTERVAL '%s days'
|
||||
""", max_age_days)
|
||||
|
||||
deleted_count = int(result.split()[-1])
|
||||
if deleted_count > 0:
|
||||
log.info(f"Cleaned up {deleted_count} old jobs")
|
||||
|
||||
# ==================== Statistics ====================
|
||||
|
||||
async def get_stats(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get job statistics.
|
||||
|
||||
Returns:
|
||||
Statistics dictionary
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
stats = await conn.fetchrow("""
|
||||
SELECT
|
||||
COUNT(*) as total_jobs,
|
||||
COUNT(*) FILTER (WHERE status = 'pending') as pending,
|
||||
COUNT(*) FILTER (WHERE status = 'running') as running,
|
||||
COUNT(*) FILTER (WHERE status = 'completed') as completed,
|
||||
COUNT(*) FILTER (WHERE status = 'failed') as failed,
|
||||
COUNT(*) FILTER (WHERE status = 'cancelled') as cancelled,
|
||||
AVG(scrape_time) FILTER (WHERE status = 'completed') as avg_scrape_time,
|
||||
SUM(reviews_count) FILTER (WHERE status = 'completed') as total_reviews
|
||||
FROM jobs
|
||||
""")
|
||||
|
||||
return dict(stats)
|
||||
|
||||
# ==================== Canary Operations ====================
|
||||
|
||||
async def save_canary_result(
|
||||
self,
|
||||
success: bool,
|
||||
reviews_count: Optional[int] = None,
|
||||
scrape_time: Optional[float] = None,
|
||||
error_message: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
):
|
||||
"""
|
||||
Save canary test result.
|
||||
|
||||
Args:
|
||||
success: Whether canary test succeeded
|
||||
reviews_count: Number of reviews scraped
|
||||
scrape_time: Time taken in seconds
|
||||
error_message: Error message if failed
|
||||
metadata: Additional metadata
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
INSERT INTO canary_results (success, reviews_count, scrape_time, error_message, metadata)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
""", success, reviews_count, scrape_time, error_message, json.dumps(metadata) if metadata else None)
|
||||
|
||||
async def get_canary_history(self, limit: int = 100) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get canary test history.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of results to return
|
||||
|
||||
Returns:
|
||||
List of canary result dictionaries
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
rows = await conn.fetch("""
|
||||
SELECT
|
||||
timestamp,
|
||||
success,
|
||||
reviews_count,
|
||||
scrape_time,
|
||||
error_message
|
||||
FROM canary_results
|
||||
ORDER BY timestamp DESC
|
||||
LIMIT $1
|
||||
""", limit)
|
||||
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
# ==================== Webhook Attempts ====================
|
||||
|
||||
async def log_webhook_attempt(
|
||||
self,
|
||||
job_id: UUID,
|
||||
attempt_number: int,
|
||||
success: bool,
|
||||
status_code: Optional[int] = None,
|
||||
error_message: Optional[str] = None,
|
||||
response_time_ms: Optional[float] = None
|
||||
):
|
||||
"""
|
||||
Log a webhook delivery attempt.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
attempt_number: Attempt number (1, 2, 3...)
|
||||
success: Whether delivery succeeded
|
||||
status_code: HTTP status code
|
||||
error_message: Error message if failed
|
||||
response_time_ms: Response time in milliseconds
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
""", job_id, attempt_number, success, status_code, error_message, response_time_ms)
|
||||
1280
modules/fast_scraper.py
Normal file
1280
modules/fast_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
411
modules/health_checks.py
Normal file
411
modules/health_checks.py
Normal file
@@ -0,0 +1,411 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smart health check system with canary testing.
|
||||
Verifies that scraping actually works, not just that services are up.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional
|
||||
import os
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CanaryMonitor:
|
||||
"""
|
||||
Background canary test monitor.
|
||||
|
||||
Runs actual scraping tests periodically to verify the scraper works.
|
||||
This catches issues like:
|
||||
- Google Maps page structure changes
|
||||
- Broken CSS selectors
|
||||
- GDPR consent handling issues
|
||||
- Network/proxy problems
|
||||
- Chrome/browser issues
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
db,
|
||||
interval_hours: int = 4,
|
||||
test_url: Optional[str] = None
|
||||
):
|
||||
"""
|
||||
Initialize canary monitor.
|
||||
|
||||
Args:
|
||||
db: Database manager instance
|
||||
interval_hours: How often to run canary tests
|
||||
test_url: Optional test URL (defaults to Soho Factory in Vilnius)
|
||||
"""
|
||||
self.db = db
|
||||
self.interval = timedelta(hours=interval_hours)
|
||||
self.test_url = test_url or os.getenv(
|
||||
'CANARY_TEST_URL',
|
||||
'https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/'
|
||||
)
|
||||
|
||||
self.running = False
|
||||
self.last_run: Optional[datetime] = None
|
||||
self.last_success: Optional[datetime] = None
|
||||
self.consecutive_failures = 0
|
||||
self.last_result: Optional[Dict[str, Any]] = None
|
||||
|
||||
async def start(self):
|
||||
"""Start the background canary monitoring"""
|
||||
self.running = True
|
||||
log.info(f"Canary monitor started (interval: {self.interval.total_seconds()/3600:.1f}h)")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
await self.run_canary_test()
|
||||
except Exception as e:
|
||||
log.error(f"Canary test failed with exception: {e}")
|
||||
self.consecutive_failures += 1
|
||||
|
||||
# Alert if multiple consecutive failures
|
||||
if self.consecutive_failures >= 3:
|
||||
await self.send_alert(
|
||||
f"🚨 CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row! "
|
||||
f"Last error: {str(e)[:200]}"
|
||||
)
|
||||
|
||||
# Sleep until next run
|
||||
await asyncio.sleep(self.interval.total_seconds())
|
||||
|
||||
def stop(self):
|
||||
"""Stop the background monitoring"""
|
||||
self.running = False
|
||||
log.info("Canary monitor stopped")
|
||||
|
||||
async def run_canary_test(self):
|
||||
"""
|
||||
Run a single canary test.
|
||||
|
||||
This performs an actual scrape on a known test URL and validates:
|
||||
- Scraping succeeds
|
||||
- Reviews are extracted
|
||||
- Review count is reasonable
|
||||
- Scrape time is reasonable
|
||||
- Data structure is valid
|
||||
"""
|
||||
from modules.fast_scraper import fast_scrape_reviews
|
||||
|
||||
log.info(f"Running canary scrape test on {self.test_url[:60]}...")
|
||||
self.last_run = datetime.now()
|
||||
|
||||
try:
|
||||
# Run actual scrape with timeout
|
||||
result = await asyncio.wait_for(
|
||||
asyncio.to_thread(
|
||||
fast_scrape_reviews,
|
||||
url=self.test_url,
|
||||
headless=True,
|
||||
max_scrolls=10 # Limited for canary
|
||||
),
|
||||
timeout=60 # Fail if takes > 60s
|
||||
)
|
||||
|
||||
# Validate result
|
||||
checks = {
|
||||
"scrape_succeeded": result['success'],
|
||||
"got_reviews": result['count'] > 0,
|
||||
"reasonable_count": 10 <= result['count'] <= 500,
|
||||
"reasonable_time": result['time'] < 30,
|
||||
"data_structure_valid": self._validate_review_structure(result.get('reviews', []))
|
||||
}
|
||||
|
||||
all_passed = all(checks.values())
|
||||
|
||||
if all_passed:
|
||||
# Success!
|
||||
log.info(
|
||||
f"✅ Canary test PASSED: {result['count']} reviews in {result['time']:.1f}s"
|
||||
)
|
||||
self.consecutive_failures = 0
|
||||
self.last_success = datetime.now()
|
||||
self.last_result = {
|
||||
"status": "pass",
|
||||
"reviews_count": result['count'],
|
||||
"scrape_time": result['time'],
|
||||
"checks": checks
|
||||
}
|
||||
|
||||
# Save to database
|
||||
await self.db.save_canary_result(
|
||||
success=True,
|
||||
reviews_count=result['count'],
|
||||
scrape_time=result['time'],
|
||||
metadata={"checks": checks}
|
||||
)
|
||||
|
||||
else:
|
||||
# Validation failed
|
||||
failed_checks = [k for k, v in checks.items() if not v]
|
||||
log.error(
|
||||
f"❌ Canary test FAILED: validation failed on {failed_checks}"
|
||||
)
|
||||
self.consecutive_failures += 1
|
||||
self.last_result = {
|
||||
"status": "fail",
|
||||
"reviews_count": result['count'],
|
||||
"scrape_time": result['time'],
|
||||
"checks": checks,
|
||||
"failed_checks": failed_checks
|
||||
}
|
||||
|
||||
# Save to database
|
||||
await self.db.save_canary_result(
|
||||
success=False,
|
||||
reviews_count=result['count'],
|
||||
scrape_time=result['time'],
|
||||
error_message=f"Validation failed: {failed_checks}",
|
||||
metadata={"checks": checks}
|
||||
)
|
||||
|
||||
# Alert on failure
|
||||
if self.consecutive_failures >= 3:
|
||||
await self.send_alert(
|
||||
f"🚨 CRITICAL: Canary validation failed {self.consecutive_failures} times! "
|
||||
f"Failed checks: {failed_checks}"
|
||||
)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
log.error("❌ Canary test TIMEOUT (>60s)")
|
||||
self.consecutive_failures += 1
|
||||
self.last_result = {
|
||||
"status": "timeout",
|
||||
"error": "Scrape took longer than 60 seconds"
|
||||
}
|
||||
|
||||
await self.db.save_canary_result(
|
||||
success=False,
|
||||
error_message="Timeout after 60 seconds"
|
||||
)
|
||||
|
||||
if self.consecutive_failures >= 3:
|
||||
await self.send_alert(
|
||||
f"🚨 CRITICAL: Canary timeout {self.consecutive_failures} times!"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"❌ Canary test ERROR: {e}")
|
||||
self.consecutive_failures += 1
|
||||
self.last_result = {
|
||||
"status": "error",
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
await self.db.save_canary_result(
|
||||
success=False,
|
||||
error_message=str(e)
|
||||
)
|
||||
|
||||
raise # Re-raise to trigger alert in main loop
|
||||
|
||||
def _validate_review_structure(self, reviews) -> bool:
|
||||
"""
|
||||
Validate that reviews have expected structure.
|
||||
|
||||
Args:
|
||||
reviews: List of review dictionaries
|
||||
|
||||
Returns:
|
||||
True if structure is valid
|
||||
"""
|
||||
if not reviews or len(reviews) == 0:
|
||||
return False
|
||||
|
||||
# Check first review has required fields
|
||||
first_review = reviews[0]
|
||||
required_fields = ['author', 'rating', 'date_text']
|
||||
|
||||
return all(field in first_review for field in required_fields)
|
||||
|
||||
async def send_alert(self, message: str):
|
||||
"""
|
||||
Send alert via configured channels.
|
||||
|
||||
Args:
|
||||
message: Alert message to send
|
||||
"""
|
||||
log.critical(message)
|
||||
|
||||
# TODO: Integrate with alerting systems
|
||||
# Examples:
|
||||
|
||||
# Slack
|
||||
slack_webhook = os.getenv('SLACK_WEBHOOK_URL')
|
||||
if slack_webhook:
|
||||
try:
|
||||
import httpx
|
||||
async with httpx.AsyncClient() as client:
|
||||
await client.post(
|
||||
slack_webhook,
|
||||
json={"text": message},
|
||||
timeout=5.0
|
||||
)
|
||||
log.info("Alert sent to Slack")
|
||||
except Exception as e:
|
||||
log.error(f"Failed to send Slack alert: {e}")
|
||||
|
||||
# Email (example with SMTP)
|
||||
# smtp_config = os.getenv('SMTP_CONFIG')
|
||||
# if smtp_config:
|
||||
# await send_email(
|
||||
# to=os.getenv('ALERT_EMAIL'),
|
||||
# subject="Scraper Canary Alert",
|
||||
# body=message
|
||||
# )
|
||||
|
||||
# PagerDuty
|
||||
# pagerduty_key = os.getenv('PAGERDUTY_KEY')
|
||||
# if pagerduty_key:
|
||||
# await trigger_pagerduty(message)
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get current canary status.
|
||||
|
||||
Returns:
|
||||
Status dictionary
|
||||
"""
|
||||
if not self.last_success:
|
||||
return {
|
||||
"status": "unknown",
|
||||
"message": "No canary tests run yet",
|
||||
"last_run": self.last_run.isoformat() if self.last_run else None
|
||||
}
|
||||
|
||||
age = datetime.now() - self.last_success
|
||||
max_age = timedelta(hours=6) # Alert if no success in 6 hours
|
||||
|
||||
if age > max_age:
|
||||
return {
|
||||
"status": "stale",
|
||||
"last_success": self.last_success.isoformat(),
|
||||
"age_hours": age.total_seconds() / 3600,
|
||||
"consecutive_failures": self.consecutive_failures,
|
||||
"message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
|
||||
}
|
||||
|
||||
return {
|
||||
"status": "healthy",
|
||||
"last_success": self.last_success.isoformat(),
|
||||
"last_run": self.last_run.isoformat() if self.last_run else None,
|
||||
"age_minutes": age.total_seconds() / 60,
|
||||
"consecutive_failures": self.consecutive_failures,
|
||||
"last_result": self.last_result
|
||||
}
|
||||
|
||||
|
||||
class HealthCheckSystem:
|
||||
"""
|
||||
Complete health check system for production.
|
||||
|
||||
Provides multiple levels of health checks:
|
||||
- Liveness: Is the server alive?
|
||||
- Readiness: Can it handle traffic?
|
||||
- Canary: Does scraping actually work?
|
||||
"""
|
||||
|
||||
def __init__(self, db):
|
||||
"""
|
||||
Initialize health check system.
|
||||
|
||||
Args:
|
||||
db: Database manager instance
|
||||
"""
|
||||
self.db = db
|
||||
self.canary = CanaryMonitor(db, interval_hours=4)
|
||||
|
||||
async def start(self):
|
||||
"""Start background health monitoring"""
|
||||
asyncio.create_task(self.canary.start())
|
||||
|
||||
def stop(self):
|
||||
"""Stop background health monitoring"""
|
||||
self.canary.stop()
|
||||
|
||||
async def check_liveness(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Liveness check: Is the server alive?
|
||||
|
||||
This is a simple check that always succeeds if the server is running.
|
||||
Used by Kubernetes liveness probe - restart container if fails.
|
||||
|
||||
Returns:
|
||||
Liveness status
|
||||
"""
|
||||
return {
|
||||
"status": "alive",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
async def check_readiness(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Readiness check: Can the server handle traffic?
|
||||
|
||||
Checks if dependencies are available.
|
||||
Used by Kubernetes readiness probe - remove from load balancer if fails.
|
||||
|
||||
Returns:
|
||||
Readiness status
|
||||
"""
|
||||
checks = {}
|
||||
|
||||
# Check database
|
||||
try:
|
||||
await self.db.pool.fetchval("SELECT 1")
|
||||
checks["database"] = {"healthy": True}
|
||||
except Exception as e:
|
||||
checks["database"] = {"healthy": False, "error": str(e)}
|
||||
|
||||
# Overall readiness
|
||||
all_healthy = all(c.get("healthy", False) for c in checks.values())
|
||||
|
||||
return {
|
||||
"status": "ready" if all_healthy else "not_ready",
|
||||
"checks": checks,
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
async def check_canary(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Canary check: Does scraping actually work?
|
||||
|
||||
Returns the latest canary test result.
|
||||
Used by external monitoring (PagerDuty, DataDog) for alerts.
|
||||
|
||||
Returns:
|
||||
Canary status
|
||||
"""
|
||||
return self.canary.get_status()
|
||||
|
||||
async def get_detailed_health(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get detailed health status of all components.
|
||||
|
||||
Returns:
|
||||
Complete health status
|
||||
"""
|
||||
liveness = await self.check_liveness()
|
||||
readiness = await self.check_readiness()
|
||||
canary = await self.check_canary()
|
||||
|
||||
overall_healthy = (
|
||||
liveness["status"] == "alive" and
|
||||
readiness["status"] == "ready" and
|
||||
canary["status"] in ["healthy", "unknown"] # Unknown is OK (first run)
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "healthy" if overall_healthy else "degraded",
|
||||
"components": {
|
||||
"liveness": liveness,
|
||||
"readiness": readiness,
|
||||
"canary": canary
|
||||
},
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
@@ -15,6 +15,8 @@ from dataclasses import dataclass, asdict
|
||||
|
||||
from modules.config import load_config
|
||||
from modules.scraper import GoogleReviewsScraper
|
||||
from modules.fast_scraper import fast_scrape_reviews
|
||||
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
|
||||
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
@@ -38,18 +40,32 @@ class ScrapingJob:
|
||||
created_at: datetime
|
||||
started_at: Optional[datetime] = None
|
||||
completed_at: Optional[datetime] = None
|
||||
updated_at: Optional[datetime] = None # Last update time (for progress tracking)
|
||||
error_message: Optional[str] = None
|
||||
reviews_count: Optional[int] = None
|
||||
total_reviews: Optional[int] = None # Total reviews available (from page counter)
|
||||
images_count: Optional[int] = None
|
||||
progress: Dict[str, Any] = None
|
||||
reviews_data: Optional[List[Dict[str, Any]]] = None # Store actual review data
|
||||
scrape_time: Optional[float] = None # Time taken to scrape
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert job to dictionary for JSON serialization"""
|
||||
def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert job to dictionary for JSON serialization
|
||||
|
||||
Args:
|
||||
include_reviews: Whether to include the full reviews data (default: False)
|
||||
"""
|
||||
data = asdict(self)
|
||||
# Convert datetime objects to ISO strings
|
||||
for field in ['created_at', 'started_at', 'completed_at']:
|
||||
if data[field]:
|
||||
data[field] = data[field].isoformat()
|
||||
|
||||
# Exclude reviews_data by default (can be large)
|
||||
if not include_reviews:
|
||||
data.pop('reviews_data', None)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@@ -126,6 +142,7 @@ class JobManager:
|
||||
|
||||
job.status = JobStatus.RUNNING
|
||||
job.started_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.progress = {"stage": "starting", "message": "Initializing scraper"}
|
||||
|
||||
# Submit job to thread pool
|
||||
@@ -137,61 +154,139 @@ class JobManager:
|
||||
def _run_scraping_job(self, job_id: str):
|
||||
"""
|
||||
Run the actual scraping job in background thread.
|
||||
|
||||
|
||||
Args:
|
||||
job_id: Job ID to run
|
||||
"""
|
||||
def progress_callback(current_count: int, total_count: int):
|
||||
"""Update job progress during scraping"""
|
||||
with self.lock:
|
||||
job = self.jobs.get(job_id)
|
||||
if job:
|
||||
job.reviews_count = current_count
|
||||
job.total_reviews = total_count
|
||||
job.updated_at = datetime.now() # Update last update time
|
||||
# Calculate percentage for better UX
|
||||
percentage = int((current_count / total_count * 100)) if total_count > 0 else 0
|
||||
job.progress = {
|
||||
"stage": "scraping",
|
||||
"message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
|
||||
"percentage": percentage
|
||||
}
|
||||
|
||||
worker = None
|
||||
try:
|
||||
with self.lock:
|
||||
job = self.jobs[job_id]
|
||||
job.progress = {"stage": "initializing", "message": "Setting up scraper"}
|
||||
|
||||
# Create scraper with job config
|
||||
scraper = GoogleReviewsScraper(job.config)
|
||||
|
||||
# Hook into scraper progress (if available)
|
||||
# This would require modifying the scraper to report progress
|
||||
|
||||
job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}
|
||||
|
||||
# Get a worker from the scraping pool
|
||||
worker = get_scraping_worker(timeout=30)
|
||||
|
||||
if not worker:
|
||||
raise Exception("No Chrome workers available. Pool may be at capacity.")
|
||||
|
||||
log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")
|
||||
|
||||
# Get config
|
||||
url = job.config.get('url')
|
||||
headless = job.config.get('headless', True) # Default to headless
|
||||
max_scrolls = job.config.get('max_scrolls', 999999) # Effectively unlimited - relies on idle detection
|
||||
|
||||
with self.lock:
|
||||
job.progress = {"stage": "scraping", "message": "Scraping reviews in progress"}
|
||||
|
||||
# Run the scraping
|
||||
scraper.scrape()
|
||||
|
||||
# Mark job as completed
|
||||
job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}
|
||||
|
||||
# Run the FAST scraping with progress callback using pooled worker
|
||||
result = fast_scrape_reviews(
|
||||
url=url,
|
||||
headless=headless,
|
||||
max_scrolls=max_scrolls,
|
||||
progress_callback=progress_callback,
|
||||
driver=worker.driver, # Use worker's driver
|
||||
return_driver=True # Don't close the driver
|
||||
)
|
||||
|
||||
# Pop the driver from result before storing
|
||||
result.pop('driver', None)
|
||||
|
||||
# Mark job as completed or failed
|
||||
with self.lock:
|
||||
job.status = JobStatus.COMPLETED
|
||||
job.completed_at = datetime.now()
|
||||
job.progress = {"stage": "completed", "message": "Scraping completed successfully"}
|
||||
|
||||
# Try to get results count if available
|
||||
# This would require scraper to return results
|
||||
job.reviews_count = getattr(scraper, 'total_reviews', None)
|
||||
job.images_count = getattr(scraper, 'total_images', None)
|
||||
|
||||
log.info(f"Completed scraping job {job_id}")
|
||||
|
||||
if result['success']:
|
||||
job.status = JobStatus.COMPLETED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.reviews_count = result['count']
|
||||
job.total_reviews = result.get('total_reviews') # Store total review count from page
|
||||
job.reviews_data = result['reviews'] # Store the actual reviews
|
||||
job.scrape_time = result['time']
|
||||
job.progress = {
|
||||
"stage": "completed",
|
||||
"message": f"Scraping completed successfully in {result['time']:.1f}s",
|
||||
"scroll_time": result.get('scroll_time'),
|
||||
"extract_time": result.get('extract_time')
|
||||
}
|
||||
log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
|
||||
else:
|
||||
job.status = JobStatus.FAILED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.error_message = result.get('error', 'Unknown error')
|
||||
job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
|
||||
log.error(f"Failed scraping job {job_id}: {result.get('error')}")
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error in scraping job {job_id}: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
with self.lock:
|
||||
job = self.jobs[job_id]
|
||||
job.status = JobStatus.FAILED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.error_message = str(e)
|
||||
job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}
|
||||
|
||||
# Recycle worker on error
|
||||
if worker:
|
||||
log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
|
||||
release_scraping_worker(worker, recycle=True)
|
||||
worker = None # Mark as released
|
||||
|
||||
finally:
|
||||
# Release worker back to pool if not already released
|
||||
if worker:
|
||||
log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
|
||||
release_scraping_worker(worker, recycle=False)
|
||||
|
||||
def get_job(self, job_id: str) -> Optional[ScrapingJob]:
|
||||
"""
|
||||
Get job by ID.
|
||||
|
||||
|
||||
Args:
|
||||
job_id: Job ID
|
||||
|
||||
|
||||
Returns:
|
||||
Job object or None if not found
|
||||
"""
|
||||
with self.lock:
|
||||
return self.jobs.get(job_id)
|
||||
|
||||
def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
|
||||
"""
|
||||
Get reviews data for a specific job.
|
||||
|
||||
Args:
|
||||
job_id: Job ID
|
||||
|
||||
Returns:
|
||||
List of reviews or None if not found/not completed
|
||||
"""
|
||||
with self.lock:
|
||||
job = self.jobs.get(job_id)
|
||||
if job and job.status == JobStatus.COMPLETED:
|
||||
return job.reviews_data
|
||||
return None
|
||||
|
||||
def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
|
||||
"""
|
||||
@@ -235,6 +330,7 @@ class JobManager:
|
||||
|
||||
job.status = JobStatus.CANCELLED
|
||||
job.completed_at = datetime.now()
|
||||
job.updated_at = datetime.now()
|
||||
job.progress = {"stage": "cancelled", "message": "Job was cancelled"}
|
||||
|
||||
log.info(f"Cancelled scraping job {job_id}")
|
||||
|
||||
@@ -1420,14 +1420,65 @@ class GoogleReviewsScraper:
|
||||
try:
|
||||
responses = self.api_interceptor.get_intercepted_responses()
|
||||
if responses:
|
||||
log.debug(f"Collected {len(responses)} network responses from browser")
|
||||
|
||||
# Dump first few responses for analysis
|
||||
if not hasattr(self, '_dumped_responses'):
|
||||
self._dumped_responses = 0
|
||||
|
||||
if self._dumped_responses < 5: # Dump first 5 responses
|
||||
from pathlib import Path
|
||||
import json
|
||||
output_dir = Path("api_response_samples")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
for resp in responses:
|
||||
if self._dumped_responses >= 5:
|
||||
break
|
||||
|
||||
idx = self._dumped_responses
|
||||
body = resp.get('body', '')
|
||||
|
||||
# Save full response
|
||||
full_file = output_dir / f"response_{idx:02d}_full.json"
|
||||
with open(full_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(resp, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Save body
|
||||
body_file = output_dir / f"response_{idx:02d}_body.txt"
|
||||
with open(body_file, 'w', encoding='utf-8') as f:
|
||||
f.write(body)
|
||||
|
||||
# Try to parse and save
|
||||
clean_body = body[4:].strip() if body.startswith(")]}'") else body
|
||||
try:
|
||||
parsed_data = json.loads(clean_body)
|
||||
parsed_file = output_dir / f"response_{idx:02d}_parsed.json"
|
||||
with open(parsed_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(parsed_data, f, indent=2, ensure_ascii=False)
|
||||
log.info(f"Dumped API response {idx} to {output_dir}/ ({len(body)} bytes)")
|
||||
except:
|
||||
log.debug(f"Response {idx} is not JSON")
|
||||
|
||||
self._dumped_responses += 1
|
||||
|
||||
parsed = self.api_interceptor.parse_reviews_from_responses(responses)
|
||||
log.debug(f"Parsed {len(parsed)} reviews from responses")
|
||||
for intercepted in parsed:
|
||||
if intercepted.review_id and intercepted.review_id not in api_reviews:
|
||||
api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted)
|
||||
if parsed:
|
||||
log.debug(f"API interceptor captured {len(parsed)} reviews (total unique: {len(api_reviews)})")
|
||||
log.info(f"API interceptor captured {len(parsed)} reviews (total unique API: {len(api_reviews)})")
|
||||
|
||||
# Log stats every 10 iterations
|
||||
if attempts % 10 == 0:
|
||||
stats = self.api_interceptor.get_interceptor_stats()
|
||||
if stats:
|
||||
log.debug(f"Interceptor stats - Fetch: {stats.get('totalFetch', 0)}/{stats.get('capturedFetch', 0)}, "
|
||||
f"XHR: {stats.get('totalXHR', 0)}/{stats.get('capturedXHR', 0)}, "
|
||||
f"Last: {stats.get('lastCapture', 'never')}")
|
||||
except Exception as api_err:
|
||||
log.debug(f"API interception error: {api_err}")
|
||||
log.warning(f"API interception error: {api_err}", exc_info=True)
|
||||
|
||||
# Dynamic sleep: sleep less when processing many reviews, more when finding none
|
||||
if len(fresh_cards) > 5:
|
||||
@@ -1470,6 +1521,35 @@ class GoogleReviewsScraper:
|
||||
if key not in existing or not existing.get(key):
|
||||
existing[key] = value
|
||||
log.info(f"After merge: {len(docs)} total reviews")
|
||||
elif self.enable_api_intercept:
|
||||
# Log final stats even if no reviews captured
|
||||
if self.api_interceptor:
|
||||
stats = self.api_interceptor.get_interceptor_stats()
|
||||
if stats:
|
||||
log.warning(f"⚠️ API interception was enabled but captured 0 reviews. "
|
||||
f"Network stats - Fetch requests: {stats.get('capturedFetch', 0)}/{stats.get('totalFetch', 0)}, "
|
||||
f"XHR requests: {stats.get('capturedXHR', 0)}/{stats.get('totalXHR', 0)}")
|
||||
|
||||
# Get browser console logs for debugging
|
||||
console_logs = self.api_interceptor.get_browser_console_logs()
|
||||
api_logs = [log_entry for log_entry in console_logs
|
||||
if 'API Interceptor' in log_entry.get('message', '')]
|
||||
if api_logs:
|
||||
log.info(f"Found {len(api_logs)} API interceptor console messages")
|
||||
for entry in api_logs[:10]: # Show first 10
|
||||
log.debug(f" Console: {entry.get('message', '')[:200]}")
|
||||
else:
|
||||
log.debug("No API interceptor console messages found")
|
||||
|
||||
# In debug mode, try to dump any responses that were collected
|
||||
if log.level <= logging.DEBUG:
|
||||
all_responses = self.api_interceptor.get_intercepted_responses()
|
||||
if all_responses:
|
||||
dump_path = self.api_interceptor.dump_responses_to_file(all_responses)
|
||||
if dump_path:
|
||||
log.info(f"Raw responses dumped to: {dump_path}")
|
||||
else:
|
||||
log.warning("API interceptor stats not available")
|
||||
|
||||
# Save to MongoDB if enabled
|
||||
if self.use_mongodb and self.mongodb:
|
||||
|
||||
373
modules/webhooks.py
Normal file
373
modules/webhooks.py
Normal file
@@ -0,0 +1,373 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Webhook delivery system with retry logic and security.
|
||||
"""
|
||||
import asyncio
|
||||
import hmac
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
import httpx
|
||||
from uuid import UUID
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebhookDeliveryError(Exception):
|
||||
"""Raised when webhook delivery fails after all retries"""
|
||||
pass
|
||||
|
||||
|
||||
class WebhookManager:
|
||||
"""
|
||||
Manages webhook delivery with retry logic and security.
|
||||
|
||||
Features:
|
||||
- Exponential backoff retry (3 attempts)
|
||||
- HMAC signature for security
|
||||
- Timeout handling
|
||||
- Async delivery
|
||||
- Logging of all attempts
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_retries: int = 3,
|
||||
timeout: float = 10.0,
|
||||
initial_retry_delay: float = 2.0
|
||||
):
|
||||
"""
|
||||
Initialize webhook manager.
|
||||
|
||||
Args:
|
||||
max_retries: Maximum number of delivery attempts
|
||||
timeout: Request timeout in seconds
|
||||
initial_retry_delay: Initial delay between retries (exponential backoff)
|
||||
"""
|
||||
self.max_retries = max_retries
|
||||
self.timeout = timeout
|
||||
self.initial_retry_delay = initial_retry_delay
|
||||
|
||||
def generate_signature(self, payload: str, secret: str) -> str:
|
||||
"""
|
||||
Generate HMAC-SHA256 signature for webhook payload.
|
||||
|
||||
Args:
|
||||
payload: JSON string payload
|
||||
secret: Webhook secret
|
||||
|
||||
Returns:
|
||||
Hex-encoded signature
|
||||
"""
|
||||
return hmac.new(
|
||||
secret.encode('utf-8'),
|
||||
payload.encode('utf-8'),
|
||||
hashlib.sha256
|
||||
).hexdigest()
|
||||
|
||||
async def send_webhook(
|
||||
self,
|
||||
webhook_url: str,
|
||||
payload: Dict[str, Any],
|
||||
secret: Optional[str] = None,
|
||||
job_id: Optional[UUID] = None,
|
||||
db=None
|
||||
) -> bool:
|
||||
"""
|
||||
Send webhook with retry logic.
|
||||
|
||||
Args:
|
||||
webhook_url: URL to send webhook to
|
||||
payload: Webhook payload dictionary
|
||||
secret: Optional webhook secret for HMAC signature
|
||||
job_id: Optional job ID for logging attempts
|
||||
db: Optional database manager for logging
|
||||
|
||||
Returns:
|
||||
True if delivery succeeded, False otherwise
|
||||
"""
|
||||
payload_json = json.dumps(payload, default=str)
|
||||
|
||||
for attempt in range(1, self.max_retries + 1):
|
||||
try:
|
||||
start_time = datetime.now()
|
||||
|
||||
# Prepare headers
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"User-Agent": "GoogleReviewsScraper-Webhook/1.0"
|
||||
}
|
||||
|
||||
# Add signature if secret provided
|
||||
if secret:
|
||||
signature = self.generate_signature(payload_json, secret)
|
||||
headers["X-Webhook-Signature"] = f"sha256={signature}"
|
||||
headers["X-Webhook-Timestamp"] = str(int(datetime.now().timestamp()))
|
||||
|
||||
# Send webhook
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
webhook_url,
|
||||
content=payload_json,
|
||||
headers=headers,
|
||||
timeout=self.timeout
|
||||
)
|
||||
|
||||
response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
|
||||
|
||||
# Check response
|
||||
if response.status_code in [200, 201, 202, 204]:
|
||||
# Success
|
||||
log.info(
|
||||
f"Webhook delivered successfully to {webhook_url} "
|
||||
f"(attempt {attempt}, {response_time_ms:.0f}ms, status {response.status_code})"
|
||||
)
|
||||
|
||||
# Log successful attempt
|
||||
if db and job_id:
|
||||
await db.log_webhook_attempt(
|
||||
job_id=job_id,
|
||||
attempt_number=attempt,
|
||||
success=True,
|
||||
status_code=response.status_code,
|
||||
response_time_ms=response_time_ms
|
||||
)
|
||||
|
||||
return True
|
||||
else:
|
||||
# Non-2xx response
|
||||
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
|
||||
log.warning(
|
||||
f"Webhook delivery failed to {webhook_url} "
|
||||
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
|
||||
)
|
||||
|
||||
# Log failed attempt
|
||||
if db and job_id:
|
||||
await db.log_webhook_attempt(
|
||||
job_id=job_id,
|
||||
attempt_number=attempt,
|
||||
success=False,
|
||||
status_code=response.status_code,
|
||||
error_message=error_msg,
|
||||
response_time_ms=response_time_ms
|
||||
)
|
||||
|
||||
except httpx.TimeoutException as e:
|
||||
error_msg = f"Timeout after {self.timeout}s"
|
||||
log.warning(
|
||||
f"Webhook delivery timeout to {webhook_url} "
|
||||
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
|
||||
)
|
||||
|
||||
# Log timeout attempt
|
||||
if db and job_id:
|
||||
await db.log_webhook_attempt(
|
||||
job_id=job_id,
|
||||
attempt_number=attempt,
|
||||
success=False,
|
||||
error_message=error_msg
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"{type(e).__name__}: {str(e)}"
|
||||
log.error(
|
||||
f"Webhook delivery error to {webhook_url} "
|
||||
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
|
||||
)
|
||||
|
||||
# Log error attempt
|
||||
if db and job_id:
|
||||
await db.log_webhook_attempt(
|
||||
job_id=job_id,
|
||||
attempt_number=attempt,
|
||||
success=False,
|
||||
error_message=error_msg
|
||||
)
|
||||
|
||||
# Retry with exponential backoff
|
||||
if attempt < self.max_retries:
|
||||
retry_delay = self.initial_retry_delay * (2 ** (attempt - 1))
|
||||
log.info(f"Retrying in {retry_delay:.1f}s...")
|
||||
await asyncio.sleep(retry_delay)
|
||||
|
||||
# All retries failed
|
||||
log.error(
|
||||
f"Webhook delivery failed to {webhook_url} after {self.max_retries} attempts"
|
||||
)
|
||||
return False
|
||||
|
||||
async def send_job_completed_webhook(
|
||||
self,
|
||||
webhook_url: str,
|
||||
job_id: UUID,
|
||||
status: str,
|
||||
reviews_count: Optional[int] = None,
|
||||
scrape_time: Optional[float] = None,
|
||||
error_message: Optional[str] = None,
|
||||
reviews_url: Optional[str] = None,
|
||||
secret: Optional[str] = None,
|
||||
db=None
|
||||
) -> bool:
|
||||
"""
|
||||
Send job completion webhook.
|
||||
|
||||
Args:
|
||||
webhook_url: URL to send webhook to
|
||||
job_id: Job UUID
|
||||
status: Job status ('completed' or 'failed')
|
||||
reviews_count: Number of reviews scraped
|
||||
scrape_time: Time taken in seconds
|
||||
error_message: Error message if failed
|
||||
reviews_url: URL to retrieve reviews
|
||||
secret: Webhook secret
|
||||
db: Database manager for logging
|
||||
|
||||
Returns:
|
||||
True if delivery succeeded
|
||||
"""
|
||||
payload = {
|
||||
"event": f"job.{status}",
|
||||
"job_id": str(job_id),
|
||||
"status": status,
|
||||
"timestamp": datetime.utcnow().isoformat() + "Z"
|
||||
}
|
||||
|
||||
if status == "completed":
|
||||
payload.update({
|
||||
"reviews_count": reviews_count,
|
||||
"scrape_time": scrape_time,
|
||||
"reviews_url": reviews_url
|
||||
})
|
||||
elif status == "failed":
|
||||
payload["error_message"] = error_message
|
||||
|
||||
return await self.send_webhook(
|
||||
webhook_url=webhook_url,
|
||||
payload=payload,
|
||||
secret=secret,
|
||||
job_id=job_id,
|
||||
db=db
|
||||
)
|
||||
|
||||
|
||||
class WebhookDispatcher:
|
||||
"""
|
||||
Background webhook dispatcher that processes pending webhooks.
|
||||
|
||||
Runs in background and delivers webhooks for completed jobs.
|
||||
"""
|
||||
|
||||
def __init__(self, db, interval_seconds: int = 30):
|
||||
"""
|
||||
Initialize webhook dispatcher.
|
||||
|
||||
Args:
|
||||
db: Database manager instance
|
||||
interval_seconds: How often to check for pending webhooks
|
||||
"""
|
||||
self.db = db
|
||||
self.interval = interval_seconds
|
||||
self.webhook_manager = WebhookManager()
|
||||
self.running = False
|
||||
|
||||
async def start(self):
|
||||
"""Start the background webhook dispatcher"""
|
||||
self.running = True
|
||||
log.info("Webhook dispatcher started")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
await self.process_pending_webhooks()
|
||||
except Exception as e:
|
||||
log.error(f"Error in webhook dispatcher: {e}")
|
||||
|
||||
await asyncio.sleep(self.interval)
|
||||
|
||||
def stop(self):
|
||||
"""Stop the background webhook dispatcher"""
|
||||
self.running = False
|
||||
log.info("Webhook dispatcher stopped")
|
||||
|
||||
async def process_pending_webhooks(self):
|
||||
"""
|
||||
Process all pending webhooks.
|
||||
|
||||
Fetches jobs with pending webhooks and delivers them.
|
||||
"""
|
||||
# Get jobs with pending webhooks
|
||||
jobs = await self.db.get_pending_jobs_with_webhooks(limit=100)
|
||||
|
||||
if not jobs:
|
||||
return
|
||||
|
||||
log.info(f"Processing {len(jobs)} pending webhooks...")
|
||||
|
||||
for job in jobs:
|
||||
try:
|
||||
job_id = job['job_id']
|
||||
webhook_url = job['webhook_url']
|
||||
webhook_secret = job.get('webhook_secret')
|
||||
status = job['status']
|
||||
|
||||
# Build reviews URL (assuming API base URL from environment)
|
||||
import os
|
||||
api_base_url = os.getenv('API_BASE_URL', 'http://localhost:8000')
|
||||
reviews_url = f"{api_base_url}/jobs/{job_id}/reviews"
|
||||
|
||||
# Send webhook
|
||||
await self.webhook_manager.send_job_completed_webhook(
|
||||
webhook_url=webhook_url,
|
||||
job_id=job_id,
|
||||
status=status,
|
||||
reviews_count=job.get('reviews_count'),
|
||||
scrape_time=job.get('scrape_time'),
|
||||
error_message=job.get('error_message'),
|
||||
reviews_url=reviews_url if status == 'completed' else None,
|
||||
secret=webhook_secret,
|
||||
db=self.db
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error processing webhook for job {job['job_id']}: {e}")
|
||||
|
||||
log.info(f"Processed {len(jobs)} webhooks")
|
||||
|
||||
|
||||
# Webhook verification helper for client implementations
|
||||
def verify_webhook_signature(payload: str, signature: str, secret: str) -> bool:
|
||||
"""
|
||||
Verify webhook signature (for client-side verification).
|
||||
|
||||
Args:
|
||||
payload: Raw JSON payload string
|
||||
signature: Signature from X-Webhook-Signature header (format: "sha256=...")
|
||||
secret: Webhook secret
|
||||
|
||||
Returns:
|
||||
True if signature is valid
|
||||
|
||||
Example:
|
||||
@app.post("/webhook")
|
||||
async def handle_webhook(request: Request):
|
||||
payload = await request.body()
|
||||
signature = request.headers.get("X-Webhook-Signature")
|
||||
|
||||
if not verify_webhook_signature(payload.decode(), signature, WEBHOOK_SECRET):
|
||||
raise HTTPException(status_code=401, detail="Invalid signature")
|
||||
|
||||
# Process webhook...
|
||||
"""
|
||||
if not signature or not signature.startswith("sha256="):
|
||||
return False
|
||||
|
||||
expected_signature = signature.split("sha256=", 1)[1]
|
||||
computed_signature = hmac.new(
|
||||
secret.encode('utf-8'),
|
||||
payload.encode('utf-8'),
|
||||
hashlib.sha256
|
||||
).hexdigest()
|
||||
|
||||
return hmac.compare_digest(expected_signature, computed_signature)
|
||||
Reference in New Issue
Block a user