Phase 0: Project restructure to ReviewIQ platform architecture
New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,388 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Chrome Worker Pool Manager
|
||||
|
||||
Maintains a pool of idle Chrome instances for faster scraping.
|
||||
Pre-warms browsers on startup to eliminate cold-start delays.
|
||||
"""
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional, Dict, Any
|
||||
from seleniumbase import Driver
|
||||
from queue import Queue, Empty
|
||||
import threading
|
||||
|
||||
log = logging.getLogger(__name__)


class ChromeWorker:
    """One Chrome browser instance together with its usage bookkeeping.

    ``driver`` is ``None`` until :meth:`initialize` succeeds and again after
    :meth:`shutdown`. ``created_at`` and ``last_used`` hold ``time.time()``
    stamps (``None`` before initialization). ``use_count`` and ``is_busy`` are
    plain attributes that the code handing the worker out is expected to
    maintain.
    """

    def __init__(self, worker_id: str, headless: bool = True):
        self.worker_id = worker_id
        self.headless = headless
        self.driver: Optional[Driver] = None
        self.created_at = None
        self.last_used = None
        self.use_count = 0
        self.is_busy = False

    def initialize(self) -> bool:
        """Launch Chrome and configure it for scraping.

        Returns:
            True when the browser is ready. False when any step failed; in
            that case any driver this worker holds is quit first, so a
            half-configured browser process is not left running.
        """
        try:
            log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")

            # Chrome arguments for Docker stability
            chrome_args = [
                "--disable-dev-shm-usage",  # Use /tmp instead of /dev/shm (critical for Docker)
                "--disable-gpu",  # Disable GPU acceleration
                "--no-sandbox",  # Required for Docker
                "--disable-software-rasterizer",
                "--disable-extensions",
                "--disable-background-networking",
                "--disable-default-apps",
                "--disable-sync",
                "--metrics-recording-only",
                "--mute-audio",
                "--no-first-run",
                "--safebrowsing-disable-auto-update",
            ]

            # uc=True asks SeleniumBase for its UC (anti-detection) mode;
            # multiple Chromium flags are passed as one comma-separated string.
            self.driver = Driver(
                uc=True,
                headless=self.headless,
                page_load_strategy="normal",
                chromium_arg=",".join(chrome_args)
            )

            # Generous timeouts for large scraping jobs
            self.driver.set_page_load_timeout(120)  # 2 minutes for slow networks
            self.driver.set_script_timeout(60)  # 1 minute for complex extraction

            # Pin geolocation to Boston, MA so Google Maps results do not vary
            # with the host's location. Best effort: a failure here is only
            # logged and does not fail initialization.
            try:
                self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': 42.3601,
                    'longitude': -71.0589,
                    'accuracy': 100
                })
                log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
            except Exception as e:
                log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")

            self.driver.maximize_window()
            self.created_at = time.time()
            self.last_used = time.time()
            log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
            return True
        except Exception as e:
            log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
            # The failure may have happened after Driver() succeeded (timeouts,
            # maximize_window, ...). Quit the browser instead of leaking it.
            if self.driver is not None:
                self.shutdown()
            return False

    def reset(self):
        """Clear cookies, localStorage and sessionStorage; failures are only logged."""
        try:
            if self.driver:
                self.driver.delete_all_cookies()
                self.driver.execute_script("window.localStorage.clear();")
                self.driver.execute_script("window.sessionStorage.clear();")
                log.debug(f"Worker {self.worker_id}: Reset complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Reset failed: {e}")

    def shutdown(self):
        """Quit the browser (if any) and always drop the driver reference."""
        try:
            if self.driver:
                self.driver.quit()
                log.info(f"Worker {self.worker_id}: Shutdown complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
        finally:
            self.driver = None

    def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50) -> bool:
        """Report whether this worker should be replaced.

        Args:
            max_age_seconds: Recycle once the worker is older than this.
            max_uses: Recycle once ``use_count`` exceeds this.

        Returns:
            True when there is no driver, or when the age or use limit has
            been exceeded; False otherwise.
        """
        if not self.driver:
            return True

        # A worker without a creation stamp is treated as brand new.
        age = time.time() - self.created_at if self.created_at else 0
        if age > max_age_seconds:
            log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
            return True

        if self.use_count > max_uses:
            log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
            return True

        return False
|
||||
|
||||
|
||||
class ChromeWorkerPool:
    """
    Pool of Chrome worker instances for faster scraping.

    Idle workers wait in a bounded queue so a task can start immediately.
    Workers are recycled after max age or max uses (see
    ``ChromeWorker.should_recycle``) to prevent memory leaks. Once the pool
    has been stopped it neither creates nor re-queues workers.
    """

    def __init__(self, pool_size: int = 2, headless: bool = True):
        """
        Initialize worker pool (no browsers are launched until ``start()``).

        Args:
            pool_size: Number of idle workers to maintain
            headless: Run Chrome in headless mode
        """
        self.pool_size = pool_size
        self.headless = headless
        self.workers: Queue[ChromeWorker] = Queue(maxsize=pool_size)  # idle workers
        self.active_workers: Dict[str, ChromeWorker] = {}  # checked-out workers by id
        self.worker_counter = 0
        self.lock = threading.Lock()  # guards active_workers and worker_counter
        self.running = False
        self.maintenance_thread = None

    def start(self):
        """Pre-warm ``pool_size`` workers and start the maintenance thread."""
        log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
        self.running = True

        # Pre-warm workers
        for _ in range(self.pool_size):
            self._create_worker()

        # Start maintenance thread
        self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.maintenance_thread.start()

        log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")

    def stop(self):
        """Stop maintenance and shut down every idle and active worker."""
        log.info("Stopping Chrome worker pool...")
        self.running = False

        # The maintenance loop may be sleeping longer than this timeout; that
        # is fine because _create_worker() refuses to add workers once
        # ``running`` is False.
        if self.maintenance_thread:
            self.maintenance_thread.join(timeout=5)

        # Shutdown all idle workers
        while not self.workers.empty():
            try:
                worker = self.workers.get_nowait()
                worker.shutdown()
            except Empty:
                break

        # Shutdown active workers
        with self.lock:
            for worker in self.active_workers.values():
                worker.shutdown()
            self.active_workers.clear()

        log.info("Chrome worker pool stopped")

    def _create_worker(self) -> Optional[ChromeWorker]:
        """Create a new worker and add it to the idle queue.

        Returns:
            The queued worker, or None when the pool is not running, the
            browser failed to initialize, or the queue rejected the worker
            (in which case the worker is shut down).
        """
        if not self.running:
            # Never launch Chrome for a pool that is stopped or not started.
            return None

        with self.lock:
            self.worker_counter += 1
            worker_id = f"worker-{self.worker_counter}"

        worker = ChromeWorker(worker_id, headless=self.headless)
        if not worker.initialize():
            return None

        # Launching Chrome is slow; stop() may have run in the meantime.
        if not self.running:
            worker.shutdown()
            return None

        try:
            self.workers.put_nowait(worker)
        except Exception as e:
            # Typically queue.Full: another thread refilled the pool first.
            log.debug(f"Could not queue {worker_id}, discarding it: {e}")
            worker.shutdown()
            return None
        return worker

    def acquire_worker(self, timeout: float = 30) -> Optional[ChromeWorker]:
        """
        Acquire a worker from the pool.

        Args:
            timeout: Maximum time to wait for a worker

        Returns:
            ChromeWorker instance or None if timeout
        """
        try:
            worker = self.workers.get(timeout=timeout)
        except Empty:
            log.warning(f"Failed to acquire worker within {timeout}s")
            return None

        worker.is_busy = True
        worker.last_used = time.time()
        worker.use_count += 1

        with self.lock:
            self.active_workers[worker.worker_id] = worker

        log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")

        # No replacement is created here: the worker comes back through
        # release_worker(), and the maintenance thread tops up the queue.
        return worker

    def release_worker(self, worker: ChromeWorker, recycle: bool = False):
        """
        Release a worker back to the pool.

        Args:
            worker: Worker to release
            recycle: Force worker recycling
        """
        with self.lock:
            self.active_workers.pop(worker.worker_id, None)

        worker.is_busy = False

        if not self.running:
            # stop() already ran (and may have quit this worker's driver).
            # Re-queueing it or spawning a replacement would orphan a browser
            # in a pool nobody will stop again.
            log.debug(f"Pool stopped, shutting down {worker.worker_id}")
            worker.shutdown()
            return

        # Check if worker should be recycled
        if recycle or worker.should_recycle():
            log.info(f"Recycling {worker.worker_id}")
            worker.shutdown()
            # Create replacement worker in background
            threading.Thread(target=self._create_worker, daemon=True).start()
            return

        # Reset and return to pool
        worker.reset()
        try:
            current_size = self.workers.qsize()
            if current_size < self.pool_size:
                # Non-blocking put; a racing refill raises and is handled below.
                self.workers.put_nowait(worker)
                log.debug(f"Released {worker.worker_id} back to pool ({current_size + 1}/{self.pool_size})")
            else:
                # Pool already at capacity, recycle this extra worker
                log.debug(f"Pool at capacity ({current_size}/{self.pool_size}), recycling extra {worker.worker_id}")
                worker.shutdown()
        except Exception as e:
            # Unexpected error, shutdown worker
            log.error(f"Failed to release {worker.worker_id}: {e}")
            worker.shutdown()

    def _maintenance_loop(self):
        """Background loop that tops the idle queue up to ``pool_size`` every 10s."""
        while self.running:
            try:
                # NOTE(review): only idle workers are counted, so workers that
                # are checked out for longer than one cycle get replacements
                # and become "extra" when released - confirm this is intended.
                current_size = self.workers.qsize()
                needed = self.pool_size - current_size

                if needed > 0:
                    log.debug(f"Pool needs {needed} more workers")
                    for _ in range(needed):
                        self._create_worker()

                # Sleep for 10 seconds
                time.sleep(10)

            except Exception as e:
                log.error(f"Maintenance loop error: {e}")
                time.sleep(5)

    def get_stats(self) -> Dict[str, Any]:
        """Return pool size, idle/active counts, total workers created and headless flag."""
        with self.lock:
            active_count = len(self.active_workers)

        return {
            "pool_size": self.pool_size,
            "idle_workers": self.workers.qsize(),
            "active_workers": active_count,
            "total_workers_created": self.worker_counter,
            "headless": self.headless
        }
|
||||
|
||||
|
||||
# Global worker pool instances (None until start_worker_pools() runs)
validation_pool: Optional[ChromeWorkerPool] = None
scraping_pool: Optional[ChromeWorkerPool] = None


def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
    """
    Start global worker pools.

    Pools that are already running are stopped first, so calling this more
    than once replaces them instead of leaving their browsers running with no
    remaining reference.

    Args:
        validation_size: Number of workers for validation checks
        scraping_size: Number of workers for scraping jobs
        headless: Run Chrome in headless mode
    """
    global validation_pool, scraping_pool

    # Overwriting the globals would make the old pools unreachable while
    # their Chrome processes keep running.
    for existing in (validation_pool, scraping_pool):
        if existing is not None:
            existing.stop()
    validation_pool = None
    scraping_pool = None

    log.info("Starting global Chrome worker pools...")

    validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
    validation_pool.start()

    scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
    scraping_pool.start()

    log.info("Global Chrome worker pools started")
|
||||
|
||||
|
||||
def stop_worker_pools():
    """Stop whichever global pools exist and reset their module references to None."""
    global validation_pool, scraping_pool

    log.info("Stopping global Chrome worker pools...")

    # Validation pool first, then scraping pool; each reference is cleared
    # only after its stop() call has returned.
    if validation_pool:
        validation_pool.stop()
        validation_pool = None

    if scraping_pool:
        scraping_pool.stop()
        scraping_pool = None

    log.info("Global Chrome worker pools stopped")
|
||||
|
||||
|
||||
def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
    """Acquire a worker from the validation pool.

    Returns None when the pool has not been started or when no worker became
    available within ``timeout`` seconds.
    """
    if not validation_pool:
        return None
    return validation_pool.acquire_worker(timeout=timeout)
|
||||
|
||||
|
||||
def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
    """Hand ``worker`` back to the validation pool; does nothing if the pool is not started.

    Args:
        worker: Worker previously obtained from the validation pool.
        recycle: Forwarded to the pool to force recycling of the worker.
    """
    if not validation_pool:
        return
    validation_pool.release_worker(worker, recycle=recycle)
|
||||
|
||||
|
||||
def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
    """Acquire a worker from the scraping pool.

    Returns None when the pool has not been started or when no worker became
    available within ``timeout`` seconds.
    """
    if not scraping_pool:
        return None
    return scraping_pool.acquire_worker(timeout=timeout)
|
||||
|
||||
|
||||
def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
    """Hand ``worker`` back to the scraping pool; does nothing if the pool is not started.

    Args:
        worker: Worker previously obtained from the scraping pool.
        recycle: Forwarded to the pool to force recycling of the worker.
    """
    if not scraping_pool:
        return
    scraping_pool.release_worker(worker, recycle=recycle)
|
||||
|
||||
|
||||
def get_pool_stats() -> Dict[str, Any]:
    """Collect ``get_stats()`` from each started global pool.

    Returns:
        A dict with a ``'validation'`` and/or ``'scraping'`` entry; a pool
        that has not been started is simply absent.
    """
    pools = (
        ('validation', validation_pool),
        ('scraping', scraping_pool),
    )
    return {label: pool.get_stats() for label, pool in pools if pool}
|
||||
@@ -1,82 +0,0 @@
|
||||
"""
|
||||
Configuration management for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
|
||||
import yaml
|
||||
|
||||
# Configure logging - can be overridden by environment variable
|
||||
import os
|
||||
# Resolve the level name from LOG_LEVEL (case-insensitive); a name the logging
# module does not define falls back to INFO.
log_level = getattr(
    logging,
    os.environ.get('LOG_LEVEL', 'INFO').upper(),
    logging.INFO,
)
logging.basicConfig(format="[%(asctime)s] %(levelname)s: %(message)s", level=log_level)

# Logger used by this configuration module.
log = logging.getLogger("scraper")
|
||||
|
||||
# Where load_config() looks for (and, if missing, creates) the YAML config.
DEFAULT_CONFIG_PATH = Path("config.yaml")

# Built-in defaults; values from the config file are merged over these.
DEFAULT_CONFIG = {
    # --- scrape target and browser ---
    "url": "https://maps.app.goo.gl/6tkNMDjcj3SS6LJe9",
    "headless": True,
    "sort_by": "relevance",
    "stop_on_match": False,
    "overwrite_existing": False,

    # --- storage ---
    "use_mongodb": True,
    "mongodb": {
        "uri": "mongodb://localhost:27017",
        "database": "reviews",
        "collection": "google_reviews",
    },
    "backup_to_json": True,
    "json_path": "google_reviews.json",
    "seen_ids_path": "google_reviews.ids",
    "convert_dates": True,

    # --- images ---
    "download_images": True,
    "image_dir": "review_images",
    "download_threads": 4,
    "store_local_paths": True,  # whether local image paths are stored

    # --- URL rewriting ---
    "replace_urls": False,  # whether image URLs are replaced
    "custom_url_base": "https://mycustomurl.com",  # base URL for replacement
    "custom_url_profiles": "/profiles/",  # path for profile images
    "custom_url_reviews": "/reviews/",  # path for review images
    "preserve_original_urls": True,  # whether original URLs are kept

    # --- extra fields added to each document (example values) ---
    "custom_params": {
        "company": "Thaitours",
        "source": "Google Maps",
    },
}
|
||||
|
||||
|
||||
def load_config(config_path: Path = DEFAULT_CONFIG_PATH) -> Dict[str, Any]:
    """Load configuration from a YAML file merged over the built-in defaults.

    If ``config_path`` exists, its contents are merged recursively over a
    private copy of ``DEFAULT_CONFIG``. If reading or merging fails, the
    error is logged and pure defaults are returned. If the file does not
    exist, defaults are returned and a default file is written as a
    best-effort convenience.

    Args:
        config_path: Location of the YAML configuration file.

    Returns:
        A configuration dict that shares no nested dicts with
        ``DEFAULT_CONFIG``.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    import copy

    def deep_update(d, u):
        """Recursively merge mapping ``u`` into dict ``d`` in place."""
        for k, v in u.items():
            if isinstance(v, dict) and k in d and isinstance(d[k], dict):
                deep_update(d[k], v)
            else:
                d[k] = v

    # A shallow .copy() would share the nested "mongodb"/"custom_params" dicts
    # with DEFAULT_CONFIG, and the merge below would then rewrite the defaults.
    config = copy.deepcopy(DEFAULT_CONFIG)

    if not config_path.exists():
        log.info(f"Config file {config_path} not found, using default configuration")
        # Create a default config file for future use. An unwritable location
        # must not prevent the caller from getting the defaults.
        try:
            with open(config_path, 'w', encoding='utf-8') as f:
                yaml.dump(config, f, default_flow_style=False)
        except OSError as e:
            log.warning(f"Could not create default configuration file at {config_path}: {e}")
        else:
            log.info(f"Created default configuration file at {config_path}")
        return config

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            user_config = yaml.safe_load(f)
        if user_config:
            # Merge configs, with nested dictionary support
            deep_update(config, user_config)
        log.info(f"Loaded configuration from {config_path}")
    except Exception as e:
        log.error(f"Error loading config from {config_path}: {e}")
        log.info("Using default configuration")
        # Drop anything merged before the failure so the message above is true.
        config = copy.deepcopy(DEFAULT_CONFIG)

    return config
|
||||
@@ -1,666 +0,0 @@
|
||||
"""
|
||||
Crash Pattern Analyzer Module
|
||||
|
||||
Provides deep analysis of scraper crashes with pattern detection,
|
||||
confidence scoring, and auto-fix parameter suggestions.
|
||||
|
||||
Builds on top of the basic classify_crash function in scraper_clean.py
|
||||
with more sophisticated pattern matching and multi-signal analysis.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
import re
|
||||
|
||||
|
||||
@dataclass
class CrashAnalysis:
    """
    Outcome of analysing one crash report.

    Attributes:
        pattern: Identified crash pattern, e.g. "memory_exhaustion",
            "dom_bloat" or "rate_limited".
        confidence: Score between 0.0 and 1.0 combining several signals.
        description: Human-readable explanation of the likely cause.
        suggested_fix: Recommended action to avoid the crash.
        auto_fix_params: Parameters that can be applied automatically to
            prevent recurrence, or None.
    """

    pattern: str
    confidence: float
    description: str
    suggested_fix: str
    auto_fix_params: Optional[Dict[str, Any]]
|
||||
|
||||
|
||||
# Detection thresholds used by the pattern checks
MEMORY_EXHAUSTION_THRESHOLD_MB = 1500  # 1.5GB expressed in MB
MEMORY_GROWTH_RATE_THRESHOLD_MB_S = 10  # MB per second
DOM_BLOAT_THRESHOLD = 50000  # DOM node count
SCROLL_TIMEOUT_MIN_SCROLLS = 10  # minimum scrolls before considering scroll_timeout


# Suggested auto-fix parameters, keyed by crash pattern name
AUTO_FIX_PARAMS = {
    "memory_exhaustion": {"max_reviews": 500, "restart_browser_after": 200},
    "dom_bloat": {"scroll_cleanup": True, "lazy_load": True},
    "rate_limited": {"delay_multiplier": 2.0, "use_different_proxy": True},
    "consent_loop": {"skip_consent_retries": True},
    "scroll_timeout": {"reduce_target": True, "target_reviews": "current - 10%"},
    "element_stale": {"retry_with_fresh_elements": True},
}
|
||||
|
||||
|
||||
def _calculate_memory_growth_rate(metrics_history: List[Dict]) -> Optional[float]:
|
||||
"""
|
||||
Calculate memory growth rate in MB/s from metrics history.
|
||||
|
||||
Args:
|
||||
metrics_history: List of metric samples with timestamp_ms and memory_mb
|
||||
|
||||
Returns:
|
||||
Growth rate in MB/s, or None if cannot be calculated
|
||||
"""
|
||||
if not metrics_history or len(metrics_history) < 2:
|
||||
return None
|
||||
|
||||
# Filter samples that have valid memory readings
|
||||
valid_samples = [
|
||||
m for m in metrics_history
|
||||
if m.get('memory_mb') is not None and m.get('timestamp_ms') is not None
|
||||
]
|
||||
|
||||
if len(valid_samples) < 2:
|
||||
return None
|
||||
|
||||
# Use first and last valid samples
|
||||
first = valid_samples[0]
|
||||
last = valid_samples[-1]
|
||||
|
||||
time_delta_s = (last['timestamp_ms'] - first['timestamp_ms']) / 1000
|
||||
if time_delta_s <= 0:
|
||||
return None
|
||||
|
||||
memory_delta_mb = last['memory_mb'] - first['memory_mb']
|
||||
return memory_delta_mb / time_delta_s
|
||||
|
||||
|
||||
def _get_max_memory(metrics_history: List[Dict]) -> Optional[int]:
|
||||
"""Get maximum memory usage from metrics history."""
|
||||
if not metrics_history:
|
||||
return None
|
||||
|
||||
memories = [m.get('memory_mb') for m in metrics_history if m.get('memory_mb') is not None]
|
||||
return max(memories) if memories else None
|
||||
|
||||
|
||||
def _get_max_dom_nodes(metrics_history: List[Dict]) -> Optional[int]:
|
||||
"""Get maximum DOM node count from metrics history."""
|
||||
if not metrics_history:
|
||||
return None
|
||||
|
||||
nodes = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes') is not None]
|
||||
return max(nodes) if nodes else None
|
||||
|
||||
|
||||
def _check_memory_exhaustion(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for memory exhaustion pattern.

    Signals and weights: peak memory at or above the threshold (+0.5) or
    within 80% of it (+0.3); growth rate at or above the threshold (+0.3);
    a memory-related keyword in the error (+0.2); a memory warning in the
    logs (+0.1). Missing or None messages are treated as empty text rather
    than raising.

    Args:
        error_message: The exception message (None is tolerated).
        metrics_history: Metric samples with ``memory_mb`` / ``timestamp_ms``.
        logs: Log entries; only their ``message`` value is inspected.

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Peak memory: the two tiers are mutually exclusive
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None:
        if max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB:
            confidence += 0.5
            signals.append(f"Memory reached {max_memory}MB (threshold: {MEMORY_EXHAUSTION_THRESHOLD_MB}MB)")
        elif max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB * 0.8:
            confidence += 0.3
            signals.append(f"Memory at {max_memory}MB approaching threshold")

    # Rapid memory growth
    growth_rate = _calculate_memory_growth_rate(metrics_history)
    if growth_rate is not None and growth_rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S:
        confidence += 0.3
        signals.append(f"Memory growing at {growth_rate:.1f}MB/s (threshold: {MEMORY_GROWTH_RATE_THRESHOLD_MB_S}MB/s)")

    # Error keywords: only the first match counts.
    # NOTE(review): plain substring matching, so 'oom' also matches e.g.
    # "zoom" - confirm that this is acceptable.
    error_lower = str(error_message or "").lower()
    memory_keywords = ['memory', 'heap', 'out of memory', 'oom', 'aw, snap', 'status_access_violation']
    matched = next((keyword for keyword in memory_keywords if keyword in error_lower), None)
    if matched is not None:
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Log warnings: counted once. ``or ''`` guards entries whose message is None.
    for log_entry in logs or ():
        msg = str(log_entry.get('message') or '').lower()
        if 'memory' in msg and ('high' in msg or 'warning' in msg or 'exceeded' in msg):
            confidence += 0.1
            signals.append("Memory warning found in logs")
            break

    description = "; ".join(signals) if signals else "No memory exhaustion signals detected"
    return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_dom_bloat(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for DOM bloat pattern.

    Signals and weights: peak DOM node count at or above the threshold (+0.6)
    or within 80% of it (+0.3); a DOM-related keyword in the error (+0.2);
    peak memory of at least 800MB (+0.1); a DOM size/cleanup message in the
    logs (+0.1). Missing or None messages are treated as empty text rather
    than raising.

    Args:
        error_message: The exception message (None is tolerated).
        metrics_history: Metric samples with ``dom_nodes`` / ``memory_mb``.
        logs: Log entries; only their ``message`` value is inspected.

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Peak DOM node count: the two tiers are mutually exclusive
    max_nodes = _get_max_dom_nodes(metrics_history)
    if max_nodes is not None:
        if max_nodes >= DOM_BLOAT_THRESHOLD:
            confidence += 0.6
            signals.append(f"DOM nodes reached {max_nodes} (threshold: {DOM_BLOAT_THRESHOLD})")
        elif max_nodes >= DOM_BLOAT_THRESHOLD * 0.8:
            confidence += 0.3
            signals.append(f"DOM nodes at {max_nodes} approaching threshold")

    # Error keywords: only the first match counts.
    # NOTE(review): plain substring matching, so 'dom' also matches e.g.
    # "random" or "domain" - confirm that this is acceptable.
    error_lower = str(error_message or "").lower()
    dom_keywords = ['dom', 'element', 'node', 'render', 'paint', 'layout']
    matched = next((keyword for keyword in dom_keywords if keyword in error_lower), None)
    if matched is not None:
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Elevated memory supports the diagnosis (DOM bloat often causes memory issues)
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None and max_memory >= 800:  # 800MB
        confidence += 0.1
        signals.append(f"Memory also elevated ({max_memory}MB)")

    # Log messages: counted once. ``or ''`` guards entries whose message is None.
    for log_entry in logs or ():
        msg = str(log_entry.get('message') or '').lower()
        if 'dom' in msg and ('large' in msg or 'cleanup' in msg or 'remove' in msg):
            confidence += 0.1
            signals.append("DOM warning found in logs")
            break

    description = "; ".join(signals) if signals else "No DOM bloat signals detected"
    return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_rate_limited(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for rate limiting pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for rate limit indicators
|
||||
error_lower = error_message.lower()
|
||||
if '429' in error_message:
|
||||
confidence += 0.6
|
||||
signals.append("HTTP 429 status code in error")
|
||||
|
||||
rate_keywords = ['rate limit', 'too many requests', 'unusual traffic', 'captcha', 'blocked']
|
||||
for keyword in rate_keywords:
|
||||
if keyword in error_lower:
|
||||
confidence += 0.4
|
||||
signals.append(f"Error contains '{keyword}'")
|
||||
break
|
||||
|
||||
# Check logs for rate limiting signals
|
||||
rate_log_count = 0
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
network = log_entry.get('network', {})
|
||||
status = network.get('status')
|
||||
|
||||
if status == 429:
|
||||
rate_log_count += 1
|
||||
confidence += 0.2
|
||||
|
||||
if 'unusual traffic' in msg or 'rate' in msg or 'blocked' in msg:
|
||||
rate_log_count += 1
|
||||
confidence += 0.1
|
||||
|
||||
if rate_log_count > 0:
|
||||
signals.append(f"Found {rate_log_count} rate-limiting indicators in logs")
|
||||
|
||||
description = "; ".join(signals) if signals else "No rate limiting signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_consent_loop(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for consent popup loop pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for consent keywords
|
||||
error_lower = error_message.lower()
|
||||
if 'consent' in error_lower:
|
||||
confidence += 0.3
|
||||
signals.append("Error mentions consent")
|
||||
|
||||
# Count consent-related log entries
|
||||
consent_count = 0
|
||||
consent_messages = []
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
if 'consent' in msg:
|
||||
consent_count += 1
|
||||
consent_messages.append(msg[:50])
|
||||
|
||||
# Multiple consent messages indicate a loop
|
||||
if consent_count >= 3:
|
||||
confidence += 0.5
|
||||
signals.append(f"Consent popup appeared {consent_count} times in logs")
|
||||
elif consent_count >= 2:
|
||||
confidence += 0.3
|
||||
signals.append(f"Consent popup appeared {consent_count} times")
|
||||
elif consent_count == 1:
|
||||
confidence += 0.1
|
||||
signals.append("Single consent popup detected")
|
||||
|
||||
# Check for timeout after consent handling
|
||||
if 'timeout' in error_lower and consent_count > 0:
|
||||
confidence += 0.2
|
||||
signals.append("Timeout occurred with consent activity")
|
||||
|
||||
description = "; ".join(signals) if signals else "No consent loop signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_scroll_timeout(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict],
    state: Optional[Dict] = None
) -> tuple[float, str]:
    """
    Check for scroll timeout pattern (no new reviews after many scrolls).

    Signals and weights: "timeout" in the error (+0.2); "recovery attempt"
    log messages (at least SCROLL_TIMEOUT_MIN_SCROLLS: +0.5, at least 5:
    +0.3); any "no new"/"stuck" log message (+0.2); the non-zero
    ``reviews_count`` values among the last five metric samples all being
    equal, with at least two such values (+0.2). Missing or None messages
    are treated as empty text rather than raising.

    Args:
        error_message: The exception message (None is tolerated).
        metrics_history: Metric samples with optional ``reviews_count``.
        logs: Log entries; only their ``message`` value is inspected.
        state: Accepted for interface compatibility; no signal is currently
            derived from it.

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    error_lower = str(error_message or "").lower()
    if 'timeout' in error_lower:
        confidence += 0.2
        signals.append("Timeout in error message")

    # Recovery attempts and "nothing new" messages both point to stuck scrolling
    recovery_count = 0
    no_new_count = 0
    for log_entry in logs or ():
        msg = str(log_entry.get('message') or '').lower()
        if 'recovery attempt' in msg:
            recovery_count += 1
        if 'no new' in msg or 'stuck' in msg:
            no_new_count += 1

    if recovery_count >= SCROLL_TIMEOUT_MIN_SCROLLS:
        confidence += 0.5
        signals.append(f"Made {recovery_count} recovery attempts")
    elif recovery_count >= 5:
        confidence += 0.3
        signals.append(f"Made {recovery_count} recovery attempts")

    if no_new_count > 0:
        confidence += 0.2
        signals.append(f"Found {no_new_count} 'no new reviews' log entries")

    # Plateau check over the last five samples
    if metrics_history and len(metrics_history) >= 5:
        recent_counts = [
            m.get('reviews_count', 0)
            for m in metrics_history[-5:]
            if m.get('reviews_count')
        ]
        # A single reading cannot show a plateau; require at least two equal ones.
        if len(recent_counts) >= 2 and len(set(recent_counts)) == 1:
            confidence += 0.2
            signals.append(f"Review count stuck at {recent_counts[0]}")

    description = "; ".join(signals) if signals else "No scroll timeout signals detected"
    return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_element_stale(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for stale element reference pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for stale element indicators
|
||||
error_lower = error_message.lower()
|
||||
stale_keywords = [
|
||||
'stale element', 'staleelement', 'stale_element',
|
||||
'element is not attached', 'element reference',
|
||||
'no such element', 'element not found',
|
||||
'element is no longer valid'
|
||||
]
|
||||
|
||||
for keyword in stale_keywords:
|
||||
if keyword in error_lower:
|
||||
confidence += 0.6
|
||||
signals.append(f"Error contains '{keyword}'")
|
||||
break
|
||||
|
||||
# Check logs for stale element patterns
|
||||
stale_log_count = 0
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
for keyword in stale_keywords:
|
||||
if keyword in msg:
|
||||
stale_log_count += 1
|
||||
break
|
||||
|
||||
if stale_log_count > 0:
|
||||
confidence += 0.2
|
||||
signals.append(f"Found {stale_log_count} stale element references in logs")
|
||||
|
||||
# Check if DOM was changing rapidly (indicates dynamic page)
|
||||
if metrics_history and len(metrics_history) >= 3:
|
||||
dom_counts = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes')]
|
||||
if len(dom_counts) >= 3:
|
||||
# Calculate variance
|
||||
avg = sum(dom_counts) / len(dom_counts)
|
||||
variance = sum((x - avg) ** 2 for x in dom_counts) / len(dom_counts)
|
||||
std_dev = variance ** 0.5
|
||||
# High variance indicates rapidly changing DOM
|
||||
if std_dev > 1000:
|
||||
confidence += 0.2
|
||||
signals.append(f"High DOM variability (std dev: {std_dev:.0f})")
|
||||
|
||||
description = "; ".join(signals) if signals else "No stale element signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def analyze_crash(crash_report: Dict) -> CrashAnalysis:
    """
    Analyze a crash report to determine the most likely crash pattern.

    Every pattern checker is run against the report's error message, sampled
    metrics and pre-crash logs; the pattern with the highest confidence wins
    (ties go to the checker that ran first). If no checker reaches a
    confidence of 0.2, the basic crash type is mapped onto a pattern with a
    fixed confidence of 0.3, or the result is 'unknown' with confidence 0.0.

    Args:
        crash_report: Dictionary containing:
            - error_message: str - The exception message
            - metrics_history: List[Dict] - Sampled metrics with timestamp_ms, memory_mb, dom_nodes
            - logs_before_crash: List[Dict] - Recent log entries before the crash
            - state: Optional[Dict] - Scraper state (reviews_extracted, scroll_count, etc.)
            - crash_type: Optional[str] - Basic crash classification from classify_crash()
            A key that is missing and a key that holds None are treated alike.

    Returns:
        CrashAnalysis with the highest-confidence pattern match, a
        human-readable suggested fix and the matching auto-fix parameters
        (None when the pattern has none registered).
    """
    # Use `or` instead of .get() defaults: a key that is present but None
    # would otherwise reach the checkers and break on .lower() / iteration.
    error_message = crash_report.get('error_message') or ''
    metrics_history = crash_report.get('metrics_history') or []
    logs = crash_report.get('logs_before_crash') or []
    state = crash_report.get('state') or {}
    basic_type = crash_report.get('crash_type') or 'unknown'

    # Run all pattern checks. Insertion order matters: max() below keeps the
    # first entry when several patterns share the top confidence.
    pattern_results = {
        'memory_exhaustion': _check_memory_exhaustion(error_message, metrics_history, logs),
        'dom_bloat': _check_dom_bloat(error_message, metrics_history, logs),
        'rate_limited': _check_rate_limited(error_message, metrics_history, logs),
        'consent_loop': _check_consent_loop(error_message, metrics_history, logs),
        'scroll_timeout': _check_scroll_timeout(error_message, metrics_history, logs, state),
        'element_stale': _check_element_stale(error_message, metrics_history, logs),
    }

    # Find the pattern with highest confidence
    pattern_name, (confidence, description) = max(
        pattern_results.items(), key=lambda item: item[1][0]
    )

    # If confidence is too low, fall back to basic classification
    if confidence < 0.2:
        # Map basic crash types to our patterns
        basic_to_pattern = {
            'memory_exhaustion': 'memory_exhaustion',
            'tab_crash': 'memory_exhaustion',  # Tab crashes often from memory
            'timeout': 'scroll_timeout',
            'element_not_found': 'element_stale',
            'rate_limited': 'rate_limited',
            'network_failure': 'rate_limited',  # Could be blocking
        }

        if basic_type in basic_to_pattern:
            pattern_name = basic_to_pattern[basic_type]
            confidence = 0.3  # Low confidence fallback
            description = f"Inferred from basic crash type '{basic_type}'"
        else:
            pattern_name = 'unknown'
            confidence = 0.0
            description = f"Unable to determine crash pattern (basic type: {basic_type})"

    # Generate suggested fix based on pattern
    suggested_fixes = {
        'memory_exhaustion': (
            "Reduce batch size and restart browser more frequently. "
            "Consider limiting max_reviews to 500 and restarting browser after every 200 reviews."
        ),
        'dom_bloat': (
            "Enable DOM cleanup during scrolling. "
            "Hide processed review cards and remove separator elements to keep DOM light."
        ),
        'rate_limited': (
            "Increase delays between requests and consider rotating proxies. "
            "Double the delay multiplier and switch to a different proxy if available."
        ),
        'consent_loop': (
            "Skip consent handling after initial attempt to avoid infinite loops. "
            "The consent popup may be appearing due to cookie clearing or navigation issues."
        ),
        'scroll_timeout': (
            "The page may have stopped loading new reviews. "
            "Try reducing the target review count by 10% and accepting partial results."
        ),
        'element_stale': (
            "Page elements are being removed/replaced during scraping. "
            "Retry operations with freshly-located elements and add defensive waits."
        ),
        'unknown': (
            "Unable to determine specific crash cause. "
            "Review logs and consider restarting with fresh browser session."
        ),
    }

    return CrashAnalysis(
        pattern=pattern_name,
        confidence=confidence,
        description=description,
        suggested_fix=suggested_fixes.get(pattern_name, suggested_fixes['unknown']),
        auto_fix_params=AUTO_FIX_PARAMS.get(pattern_name),
    )
|
||||
|
||||
|
||||
def get_auto_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """
    Look up the auto-fix parameters registered for a crash pattern.

    Args:
        pattern: The crash pattern name

    Returns:
        The entry stored in AUTO_FIX_PARAMS for this pattern (the stored
        object itself, not a copy), or None if the pattern is not registered
    """
    registered = AUTO_FIX_PARAMS.get(pattern)
    return registered
|
||||
|
||||
|
||||
def apply_auto_fix(pattern: str, current_params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a copy of the scraper parameters with a pattern's auto-fix applied.

    The input dictionary is not modified; a shallow copy is patched and
    returned. Patterns without registered fixes yield an unchanged copy.

    Args:
        pattern: The crash pattern name
        current_params: Current scraper parameters to modify

    Returns:
        Updated parameters dictionary with fixes applied
    """
    patched = current_params.copy()

    for name, setting in AUTO_FIX_PARAMS.get(pattern, {}).items():
        if name == 'target_reviews' and setting == 'current - 10%':
            # Symbolic fix: shrink the review target by 10% (default base 1000).
            patched['max_reviews'] = int(patched.get('max_reviews', 1000) * 0.9)
            continue

        if name == 'delay_multiplier':
            # Relative fix: scale the existing scroll delay (default base 1.0).
            patched['scroll_delay'] = patched.get('scroll_delay', 1.0) * setting
            continue

        # Everything else is a plain override.
        patched[name] = setting

    return patched
|
||||
|
||||
|
||||
def summarize_crash_patterns(crash_reports: List[Dict]) -> Dict[str, Any]:
    """
    Aggregate the analysis of several crash reports into pattern statistics.

    Each report is classified with analyze_crash(); the results are grouped
    by pattern name.

    Args:
        crash_reports: List of crash report dictionaries

    Returns:
        Dictionary with:
            - total_crashes: number of reports
            - patterns: per-pattern count, percentage and average confidence
            - most_common: pattern seen most often (None when there are no reports)
            - recommendations: auto-fix parameters for every pattern that
              occurred at least twice and has a registered fix, most
              frequent first
    """
    if not crash_reports:
        return {
            'total_crashes': 0,
            'patterns': {},
            'most_common': None,
            'recommendations': []
        }

    # Group confidences by pattern; the list length doubles as the count.
    confidences_by_pattern: Dict[str, List[float]] = {}
    for report in crash_reports:
        analysis = analyze_crash(report)
        confidences_by_pattern.setdefault(analysis.pattern, []).append(analysis.confidence)

    total = len(crash_reports)
    patterns_summary = {
        name: {
            'count': len(scores),
            'percentage': len(scores) / total * 100,
            'avg_confidence': sum(scores) / len(scores)
        }
        for name, scores in confidences_by_pattern.items()
    }

    # First-seen pattern wins a tie, because max() keeps the earliest maximum.
    if patterns_summary:
        most_common = max(patterns_summary, key=lambda name: patterns_summary[name]['count'])
    else:
        most_common = None

    by_frequency = sorted(
        patterns_summary.items(), key=lambda item: item[1]['count'], reverse=True
    )
    recommendations = []
    for name, stats in by_frequency:
        if stats['count'] < 2:
            # A single occurrence is not treated as a recurring pattern.
            continue
        fix_params = AUTO_FIX_PARAMS.get(name)
        if not fix_params:
            continue
        recommendations.append({
            'pattern': name,
            'occurrences': stats['count'],
            'auto_fix_params': fix_params
        })

    return {
        'total_crashes': total,
        'patterns': patterns_summary,
        'most_common': most_common,
        'recommendations': recommendations
    }
|
||||
@@ -1,882 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PostgreSQL database module for production microservice.
|
||||
Stores job metadata and reviews as JSONB.
|
||||
"""
|
||||
import asyncpg
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from uuid import UUID, uuid4
|
||||
from enum import Enum
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
    """
    Lifecycle states of a scraping job.

    Members are also plain strings, so they compare equal to their raw
    values (e.g. JobStatus.PENDING == "pending").
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    # Job crashed but has partial reviews saved
    PARTIAL = "partial"
|
||||
|
||||
|
||||
class DatabaseManager:
|
||||
"""PostgreSQL database manager with connection pooling"""
|
||||
|
||||
def __init__(self, database_url: str):
|
||||
"""
|
||||
Initialize database manager.
|
||||
|
||||
Args:
|
||||
database_url: PostgreSQL connection URL
|
||||
Format: postgresql://user:password@host:port/database
|
||||
"""
|
||||
self.database_url = database_url
|
||||
self.pool: Optional[asyncpg.Pool] = None
|
||||
|
||||
async def connect(self):
    """Create the asyncpg connection pool and keep it on self.pool."""
    log.info("Connecting to PostgreSQL database...")

    # Pool sizing and per-command timeout (seconds) passed to asyncpg.
    pool_options = {
        'min_size': 5,
        'max_size': 20,
        'command_timeout': 60,
    }
    self.pool = await asyncpg.create_pool(self.database_url, **pool_options)

    log.info("Database connection pool created")
|
||||
|
||||
async def disconnect(self):
|
||||
"""Close connection pool"""
|
||||
if self.pool:
|
||||
await self.pool.close()
|
||||
log.info("Database connection pool closed")
|
||||
|
||||
async def initialize_schema(self):
    """
    Create database schema if it doesn't exist.

    Creates the jobs, canary_results, webhook_attempts and crash_reports
    tables with their indexes, and applies the additive column/constraint
    migrations needed by databases created with an older schema.

    All statements run inside a single transaction, so a failure part-way
    through (for example between dropping and re-adding the valid_status
    constraint) rolls everything back instead of leaving a half-migrated
    schema.
    """
    # Executed strictly in this order; later statements depend on earlier ones.
    statements = (
        # Create jobs table
        """
        CREATE TABLE IF NOT EXISTS jobs (
            job_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            status VARCHAR(20) NOT NULL DEFAULT 'pending',
            url TEXT NOT NULL,
            webhook_url TEXT,
            webhook_secret TEXT,

            created_at TIMESTAMP NOT NULL DEFAULT NOW(),
            started_at TIMESTAMP,
            completed_at TIMESTAMP,
            updated_at TIMESTAMP,

            reviews_count INTEGER,
            total_reviews INTEGER,
            reviews_data JSONB,
            scrape_time REAL,

            error_message TEXT,
            metadata JSONB,
            scrape_logs JSONB,

            CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
        );
        """,
        # Add scrape_logs column if it doesn't exist (for existing databases)
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
        """,
        # Add updated_at column if it doesn't exist (for incremental progress tracking)
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
        """,
        # Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
        """,
        # Update constraint to include 'partial' status (for existing databases)
        """
        ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
        """,
        """
        ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
        """,
        # Create indexes
        """
        CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at DESC);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_jobs_webhook ON jobs(webhook_url) WHERE webhook_url IS NOT NULL;
        """,
        # Create canary results table
        """
        CREATE TABLE IF NOT EXISTS canary_results (
            id SERIAL PRIMARY KEY,
            timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
            success BOOLEAN NOT NULL,
            reviews_count INTEGER,
            scrape_time REAL,
            error_message TEXT,
            metadata JSONB
        );
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_canary_timestamp ON canary_results(timestamp DESC);
        """,
        # Create webhook attempts table (for retry tracking)
        """
        CREATE TABLE IF NOT EXISTS webhook_attempts (
            id SERIAL PRIMARY KEY,
            job_id UUID NOT NULL REFERENCES jobs(job_id) ON DELETE CASCADE,
            attempt_number INTEGER NOT NULL,
            timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
            success BOOLEAN NOT NULL,
            status_code INTEGER,
            error_message TEXT,
            response_time_ms REAL
        );
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
        """,
        # Add session_fingerprint and metrics_history columns to jobs table
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS session_fingerprint JSONB;
        """,
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS metrics_history JSONB;
        """,
        # Create crash_reports table
        """
        CREATE TABLE IF NOT EXISTS crash_reports (
            crash_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            job_id UUID REFERENCES jobs(job_id) ON DELETE CASCADE,
            created_at TIMESTAMP NOT NULL DEFAULT NOW(),
            crash_type VARCHAR(50) NOT NULL,
            error_message TEXT,
            state JSONB NOT NULL,
            metrics_history JSONB,
            logs_before_crash JSONB,
            analysis JSONB,
            screenshot_url TEXT,
            dom_snapshot_id UUID
        );
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_crash_reports_job ON crash_reports(job_id);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_crash_reports_type ON crash_reports(crash_type);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_crash_reports_created ON crash_reports(created_at DESC);
        """,
    )

    async with self.pool.acquire() as conn:
        # PostgreSQL DDL is transactional: either the whole schema is in
        # place afterwards or nothing changed.
        async with conn.transaction():
            for statement in statements:
                await conn.execute(statement)

    log.info("Database schema initialized")
|
||||
|
||||
# ==================== Job Operations ====================
|
||||
|
||||
async def create_job(
    self,
    url: str,
    webhook_url: Optional[str] = None,
    webhook_secret: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None
) -> UUID:
    """
    Insert a new job row and return its generated ID.

    The remaining columns take their database defaults (status starts as
    'pending').

    Args:
        url: Google Maps URL to scrape
        webhook_url: Optional webhook URL for notifications
        webhook_secret: Optional secret for webhook signature
        metadata: Optional additional metadata; stored as JSON, or NULL
            when empty/None

    Returns:
        UUID of created job
    """
    metadata_json = json.dumps(metadata) if metadata else None
    insert_sql = """
        INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
        VALUES ($1, $2, $3, $4)
        RETURNING job_id
    """

    async with self.pool.acquire() as conn:
        job_id = await conn.fetchval(
            insert_sql, url, webhook_url, webhook_secret, metadata_json
        )

    log.info(f"Created job {job_id} for URL: {url[:80]}...")
    return job_id
|
||||
|
||||
async def get_job(self, job_id: UUID) -> Optional[Dict[str, Any]]:
    """
    Fetch a single job row.

    The webhook secret is deliberately not part of the selected columns.
    Column values are returned exactly as the driver delivers them.

    Args:
        job_id: Job UUID

    Returns:
        Job dictionary or None if not found
    """
    select_sql = """
        SELECT
            job_id,
            status,
            url,
            webhook_url,
            created_at,
            started_at,
            completed_at,
            updated_at,
            reviews_count,
            total_reviews,
            reviews_data,
            scrape_time,
            error_message,
            metadata,
            scrape_logs,
            review_topics
        FROM jobs
        WHERE job_id = $1
    """

    async with self.pool.acquire() as conn:
        record = await conn.fetchrow(select_sql, job_id)

    return dict(record) if record else None
|
||||
|
||||
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
    """
    Load the stored reviews of a job.

    Args:
        job_id: Job UUID
        include_partial: If True, also return reviews for running and partial jobs;
            if False, only completed jobs qualify

    Returns:
        List of reviews, or None if the job does not exist, is in a status
        that does not qualify, or has no (or empty) review data
    """
    # Return reviews for completed, running, or partial jobs
    any_progress_sql = """
        SELECT reviews_data
        FROM jobs
        WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
    """
    # Only return reviews for completed jobs
    completed_only_sql = """
        SELECT reviews_data
        FROM jobs
        WHERE job_id = $1 AND status = 'completed'
    """
    select_sql = any_progress_sql if include_partial else completed_only_sql

    async with self.pool.acquire() as conn:
        payload = await conn.fetchval(select_sql, job_id)

    if not payload:
        return None

    # The driver may hand JSONB back as text; decode it in that case.
    return json.loads(payload) if isinstance(payload, str) else payload
|
||||
|
||||
async def update_job_status(
    self,
    job_id: UUID,
    status: JobStatus,
    **kwargs
):
    """
    Update job status and optional fields.

    A timestamp is filled in automatically unless the caller supplies it:
    started_at when moving to RUNNING, completed_at when moving to
    COMPLETED, FAILED, CANCELLED or PARTIAL.

    Args:
        job_id: Job UUID
        status: New status
        **kwargs: Additional columns to update (started_at, completed_at,
            error_message, etc.). Values for JSONB columns may be passed as
            Python objects (serialized here) or as ready-made JSON strings.

    Raises:
        ValueError: If a keyword is not a plain identifier; column names are
            interpolated into the SQL text and must never carry SQL syntax.
    """
    # JSONB columns of the jobs table; the driver wants these as JSON text.
    jsonb_columns = {
        'reviews_data', 'metadata', 'scrape_logs',
        'review_topics', 'session_fingerprint', 'metrics_history',
    }
    # Statuses after which the job no longer runs.
    finished = (
        JobStatus.COMPLETED, JobStatus.FAILED,
        JobStatus.CANCELLED, JobStatus.PARTIAL,
    )

    if status == JobStatus.RUNNING:
        kwargs.setdefault('started_at', datetime.now())
    elif status in finished:
        kwargs.setdefault('completed_at', datetime.now())

    # Build dynamic UPDATE query; $1 is the job id, $2 the status.
    set_clauses = ["status = $2"]
    params = [job_id, status.value]

    for param_idx, (key, value) in enumerate(kwargs.items(), start=3):
        if not key.isidentifier():
            raise ValueError(f"Invalid column name: {key!r}")

        if key in jsonb_columns and value is not None:
            set_clauses.append(f"{key} = ${param_idx}::jsonb")
            params.append(value if isinstance(value, str) else json.dumps(value))
        else:
            set_clauses.append(f"{key} = ${param_idx}")
            params.append(value)

    query = f"""
        UPDATE jobs
        SET {', '.join(set_clauses)}
        WHERE job_id = $1
    """

    async with self.pool.acquire() as conn:
        await conn.execute(query, *params)
|
||||
|
||||
async def save_job_result(
    self,
    job_id: UUID,
    reviews: List[Dict[str, Any]],
    scrape_time: float,
    total_reviews: Optional[int] = None,
    scrape_logs: Optional[List[Dict[str, Any]]] = None,
    review_topics: Optional[List[Dict[str, Any]]] = None
):
    """
    Mark a job as completed and store its final results.

    When `reviews` is empty but the job row already holds a positive
    reviews_count (written by incremental saves during scraping), the stored
    reviews are left untouched and only the completion fields are written.

    Args:
        job_id: Job UUID
        reviews: List of review dictionaries
        scrape_time: Time taken to scrape in seconds
        total_reviews: Total reviews available (from page counter)
        scrape_logs: List of log entries from the scraper
        review_topics: List of topic filter dictionaries with topic and count
    """
    logs_json = json.dumps(scrape_logs) if scrape_logs else None
    topics_json = json.dumps(review_topics) if review_topics else None

    async with self.pool.acquire() as conn:
        if not reviews:
            # Nothing handed in: check for progress flushed incrementally.
            saved_count = await conn.fetchval(
                "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
            )
            if saved_count and saved_count > 0:
                # Keep reviews_data / reviews_count as they are.
                await conn.execute(
                    """
                    UPDATE jobs
                    SET
                        status = 'completed',
                        completed_at = NOW(),
                        total_reviews = COALESCE($2, total_reviews),
                        scrape_time = $3,
                        scrape_logs = $4::jsonb,
                        review_topics = $5::jsonb
                    WHERE job_id = $1
                    """,
                    job_id, total_reviews, scrape_time, logs_json, topics_json
                )
                log.info(f"Completed job {job_id} with {saved_count} reviews (from incremental saves)")
                return

        # Regular path: overwrite the stored reviews with the final list.
        await conn.execute(
            """
            UPDATE jobs
            SET
                status = 'completed',
                completed_at = NOW(),
                reviews_count = $2,
                total_reviews = $3,
                reviews_data = $4::jsonb,
                scrape_time = $5,
                scrape_logs = $6::jsonb,
                review_topics = $7::jsonb
            WHERE job_id = $1
            """,
            job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
            logs_json, topics_json
        )

    log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
async def save_reviews_incremental(
    self,
    job_id: UUID,
    reviews: List[Dict[str, Any]],
    total_reviews: Optional[int] = None
):
    """
    Persist the reviews collected so far while a job is still running.

    Called on each flush to preserve progress in case of crash. The row is
    only touched while its status is 'running'; a passed-in total of None
    keeps the stored total.

    Args:
        job_id: Job UUID
        reviews: ALL reviews collected so far (not just new ones)
        total_reviews: Total reviews available (from page counter)
    """
    collected = len(reviews)
    snapshot_json = json.dumps(reviews)
    update_sql = """
        UPDATE jobs
        SET
            reviews_count = $2,
            total_reviews = COALESCE($3, total_reviews),
            reviews_data = $4::jsonb,
            updated_at = NOW()
        WHERE job_id = $1 AND status = 'running'
    """

    async with self.pool.acquire() as conn:
        await conn.execute(update_sql, job_id, collected, total_reviews, snapshot_json)

    log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
async def update_session_fingerprint(
    self,
    job_id: UUID,
    session_fingerprint: Dict[str, Any]
):
    """
    Store the browser session fingerprint on a job row.

    Intended to be called early in the scraping process, once the browser
    fingerprint has been captured, so browser characteristics are available
    for bot detection analysis. Also refreshes updated_at.

    Args:
        job_id: Job UUID
        session_fingerprint: Dictionary containing browser fingerprint data:
            - user_agent: Browser user agent string
            - platform: OS platform
            - language: Primary language
            - languages: List of accepted languages
            - timezone: Timezone string
            - screen: {width, height, colorDepth}
            - viewport: {width, height}
            - webgl_vendor: WebGL vendor string
            - webgl_renderer: WebGL renderer string
            - canvas_fingerprint: Canvas fingerprint hash
            - hardware_concurrency: Number of CPU cores
            - device_memory: Device memory in GB
            - bot_detection_tests: {webdriver_hidden, chrome_runtime, permissions_query}
            - captured_at: ISO timestamp when fingerprint was captured
    """
    fingerprint_json = json.dumps(session_fingerprint)
    update_sql = """
        UPDATE jobs
        SET
            session_fingerprint = $2::jsonb,
            updated_at = NOW()
        WHERE job_id = $1
    """

    async with self.pool.acquire() as conn:
        await conn.execute(update_sql, job_id, fingerprint_json)

    log.debug(f"Updated session fingerprint for job {job_id}")
|
||||
|
||||
async def mark_job_partial(
    self,
    job_id: UUID,
    error_message: str,
    scrape_logs: Optional[List[Dict[str, Any]]] = None
):
    """
    Flag a job as 'partial': it crashed, but some reviews were saved.

    Sets the status, stamps completed_at and records the error and logs;
    the stored reviews are not modified.

    Args:
        job_id: Job UUID
        error_message: Error that caused the crash
        scrape_logs: Log entries from the scraper
    """
    logs_json = json.dumps(scrape_logs) if scrape_logs else None
    update_sql = """
        UPDATE jobs
        SET
            status = 'partial',
            completed_at = NOW(),
            error_message = $2,
            scrape_logs = $3::jsonb
        WHERE job_id = $1
    """

    async with self.pool.acquire() as conn:
        await conn.execute(update_sql, job_id, error_message, logs_json)

    log.info(f"Marked job {job_id} as partial due to: {error_message}")
|
||||
|
||||
async def list_jobs(
    self,
    status: Optional[JobStatus] = None,
    limit: int = 100,
    offset: int = 0
) -> List[Dict[str, Any]]:
    """
    Return a page of jobs, newest first.

    Only summary columns are selected; the review payload itself is not
    included.

    Args:
        status: Optional status filter
        limit: Maximum number of jobs to return
        offset: Number of jobs to skip

    Returns:
        List of job dictionaries
    """
    if status:
        select_sql = """
            SELECT
                job_id,
                status,
                url,
                created_at,
                completed_at,
                reviews_count,
                total_reviews,
                scrape_time,
                error_message,
                metadata,
                review_topics
            FROM jobs
            WHERE status = $1
            ORDER BY created_at DESC
            LIMIT $2 OFFSET $3
        """
        args = (status.value, limit, offset)
    else:
        select_sql = """
            SELECT
                job_id,
                status,
                url,
                created_at,
                completed_at,
                reviews_count,
                total_reviews,
                scrape_time,
                error_message,
                metadata,
                review_topics
            FROM jobs
            ORDER BY created_at DESC
            LIMIT $1 OFFSET $2
        """
        args = (limit, offset)

    async with self.pool.acquire() as conn:
        records = await conn.fetch(select_sql, *args)

    return [dict(record) for record in records]
|
||||
|
||||
async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:
    """
    Get finished jobs whose webhook has not been delivered successfully yet.

    Selects jobs that have a webhook URL, are in status 'completed' or
    'failed', and have no successful row in webhook_attempts. The oldest
    completion comes first.

    Args:
        limit: Maximum number of jobs to return

    Returns:
        List of job dictionaries with webhook info (including the secret)
    """
    select_sql = """
        SELECT
            job_id,
            status,
            url,
            webhook_url,
            webhook_secret,
            reviews_count,
            scrape_time,
            error_message,
            completed_at
        FROM jobs
        WHERE webhook_url IS NOT NULL
          AND status IN ('completed', 'failed')
          AND job_id NOT IN (
              SELECT job_id
              FROM webhook_attempts
              WHERE success = true
          )
        ORDER BY completed_at ASC
        LIMIT $1
    """

    async with self.pool.acquire() as conn:
        records = await conn.fetch(select_sql, limit)

    return [dict(record) for record in records]
|
||||
|
||||
async def delete_job(self, job_id: UUID) -> bool:
    """
    Remove a job row.

    Args:
        job_id: Job UUID

    Returns:
        True if deleted, False if not found
    """
    async with self.pool.acquire() as conn:
        status_tag = await conn.execute("""
            DELETE FROM jobs WHERE job_id = $1
        """, job_id)

    # The last word of the command status is the affected row count.
    was_deleted = status_tag.split()[-1] == "1"
    if was_deleted:
        log.info(f"Deleted job {job_id}")
    return was_deleted
|
||||
|
||||
async def cleanup_old_jobs(self, max_age_days: int = 30):
    """
    Delete old completed/failed/cancelled jobs.

    A job qualifies when its completed_at lies more than `max_age_days`
    days in the past.

    Args:
        max_age_days: Maximum age in days before deletion
    """
    async with self.pool.acquire() as conn:
        # The age must be a real bind parameter ($1). A printf-style '%s'
        # inside an INTERVAL literal is never substituted by asyncpg, so the
        # interval is built server-side from the integer instead.
        result = await conn.execute("""
            DELETE FROM jobs
            WHERE status IN ('completed', 'failed', 'cancelled')
              AND completed_at < NOW() - make_interval(days => $1)
        """, max_age_days)

    # Command status looks like "DELETE <n>".
    deleted_count = int(result.split()[-1])
    if deleted_count > 0:
        log.info(f"Cleaned up {deleted_count} old jobs")
|
||||
|
||||
# ==================== Statistics ====================
|
||||
|
||||
async def get_stats(self) -> Dict[str, Any]:
    """
    Compute aggregate job statistics in a single query.

    Returns:
        Statistics dictionary with total_jobs, per-status counts (pending,
        running, completed, failed, cancelled), avg_scrape_time and
        total_reviews; the last two consider completed jobs only.
    """
    # NOTE: jobs in status 'partial' have no counter of their own; they are
    # only reflected in total_jobs.
    stats_sql = """
        SELECT
            COUNT(*) as total_jobs,
            COUNT(*) FILTER (WHERE status = 'pending') as pending,
            COUNT(*) FILTER (WHERE status = 'running') as running,
            COUNT(*) FILTER (WHERE status = 'completed') as completed,
            COUNT(*) FILTER (WHERE status = 'failed') as failed,
            COUNT(*) FILTER (WHERE status = 'cancelled') as cancelled,
            AVG(scrape_time) FILTER (WHERE status = 'completed') as avg_scrape_time,
            SUM(reviews_count) FILTER (WHERE status = 'completed') as total_reviews
        FROM jobs
    """

    async with self.pool.acquire() as conn:
        record = await conn.fetchrow(stats_sql)

    return dict(record)
|
||||
|
||||
# ==================== Canary Operations ====================
|
||||
|
||||
async def save_canary_result(
    self,
    success: bool,
    reviews_count: Optional[int] = None,
    scrape_time: Optional[float] = None,
    error_message: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None
):
    """
    Record the outcome of one canary test run.

    Args:
        success: Whether canary test succeeded
        reviews_count: Number of reviews scraped
        scrape_time: Time taken in seconds
        error_message: Error message if failed
        metadata: Additional metadata; stored as JSON, or NULL when empty/None
    """
    metadata_json = json.dumps(metadata) if metadata else None
    insert_sql = """
        INSERT INTO canary_results (success, reviews_count, scrape_time, error_message, metadata)
        VALUES ($1, $2, $3, $4, $5)
    """

    async with self.pool.acquire() as conn:
        await conn.execute(
            insert_sql, success, reviews_count, scrape_time, error_message, metadata_json
        )
|
||||
|
||||
async def get_canary_history(self, limit: int = 100) -> List[Dict[str, Any]]:
    """
    Return the most recent canary test results, newest first.

    The metadata column is not part of the result.

    Args:
        limit: Maximum number of results to return

    Returns:
        List of canary result dictionaries
    """
    select_sql = """
        SELECT
            timestamp,
            success,
            reviews_count,
            scrape_time,
            error_message
        FROM canary_results
        ORDER BY timestamp DESC
        LIMIT $1
    """

    async with self.pool.acquire() as conn:
        records = await conn.fetch(select_sql, limit)

    return [dict(record) for record in records]
|
||||
|
||||
# ==================== Webhook Attempts ====================
|
||||
|
||||
async def log_webhook_attempt(
    self,
    job_id: UUID,
    attempt_number: int,
    success: bool,
    status_code: Optional[int] = None,
    error_message: Optional[str] = None,
    response_time_ms: Optional[float] = None
):
    """
    Insert one row describing a webhook delivery attempt.

    Args:
        job_id: Job UUID
        attempt_number: Attempt number (1, 2, 3...)
        success: Whether delivery succeeded
        status_code: HTTP status code
        error_message: Error message if failed
        response_time_ms: Response time in milliseconds
    """
    insert_sql = """
        INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
        VALUES ($1, $2, $3, $4, $5, $6)
    """
    row_values = (
        job_id, attempt_number, success,
        status_code, error_message, response_time_ms,
    )

    async with self.pool.acquire() as conn:
        await conn.execute(insert_sql, *row_values)
|
||||
|
||||
# ==================== Crash Reports ====================
|
||||
|
||||
async def save_crash_report(self, job_id: str, crash_data: dict) -> str:
    """
    Save a crash report and return the crash_id.

    Args:
        job_id: Job UUID as string (a UUID object is passed through unchanged)
        crash_data: Dictionary containing crash report data:
            - crash_type: Type of crash (required)
            - error_message: Error message (optional)
            - state: Current state at crash time (required)
            - metrics_history: Historical metrics (optional)
            - logs_before_crash: Log entries before crash (optional)
            - analysis: Crash analysis data (optional)
            - screenshot_url: URL to screenshot (optional)
            - dom_snapshot_id: UUID of DOM snapshot, as string or UUID (optional)

    Returns:
        UUID of created crash report as string

    Raises:
        ValueError: If job_id or dom_snapshot_id is a malformed UUID string
    """

    def _json_or_null(key):
        # Missing or empty optional sections are stored as NULL, not as "null"/"[]".
        payload = crash_data.get(key)
        return json.dumps(payload) if payload else None

    # Convert job_id string to UUID
    job_uuid = UUID(job_id) if isinstance(job_id, str) else job_id

    # Accept the snapshot id in either form, like job_id; UUID(<UUID object>) would raise.
    raw_snapshot_id = crash_data.get('dom_snapshot_id')
    if not raw_snapshot_id:
        snapshot_uuid = None
    elif isinstance(raw_snapshot_id, UUID):
        snapshot_uuid = raw_snapshot_id
    else:
        snapshot_uuid = UUID(raw_snapshot_id)

    async with self.pool.acquire() as conn:
        crash_id = await conn.fetchval("""
            INSERT INTO crash_reports (
                job_id,
                crash_type,
                error_message,
                state,
                metrics_history,
                logs_before_crash,
                analysis,
                screenshot_url,
                dom_snapshot_id
            )
            VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb, $8, $9)
            RETURNING crash_id
        """,
            job_uuid,
            crash_data.get('crash_type'),
            crash_data.get('error_message'),
            json.dumps(crash_data.get('state', {})),
            _json_or_null('metrics_history'),
            _json_or_null('logs_before_crash'),
            _json_or_null('analysis'),
            crash_data.get('screenshot_url'),
            snapshot_uuid
        )

        log.info(f"Saved crash report {crash_id} for job {job_id}, type: {crash_data.get('crash_type')}")
        return str(crash_id)
|
||||
|
||||
async def get_crash_report(self, job_id: str) -> Optional[dict]:
    """
    Look up the newest crash report stored for a job.

    Args:
        job_id: Job UUID as string (a non-string value is used as-is)

    Returns:
        The crash report as a dictionary with crash_id, job_id and (when
        set) dom_snapshot_id converted to strings, or None when the job has
        no crash report
    """
    lookup_key = job_id if not isinstance(job_id, str) else UUID(job_id)

    async with self.pool.acquire() as connection:
        record = await connection.fetchrow("""
            SELECT
                crash_id,
                job_id,
                created_at,
                crash_type,
                error_message,
                state,
                metrics_history,
                logs_before_crash,
                analysis,
                screenshot_url,
                dom_snapshot_id
            FROM crash_reports
            WHERE job_id = $1
            ORDER BY created_at DESC
            LIMIT 1
        """, lookup_key)

    if not record:
        return None

    report = dict(record)

    # UUID objects are not JSON serializable, so hand them out as strings.
    for uuid_column in ('crash_id', 'job_id'):
        report[uuid_column] = str(report[uuid_column])
    if report.get('dom_snapshot_id'):
        report['dom_snapshot_id'] = str(report['dom_snapshot_id'])

    return report
|
||||
|
||||
async def get_crash_stats(self, days: int = 7) -> dict:
    """
    Get crash statistics for the last N days.

    The look-back window is passed as a real query parameter ($1) and turned
    into an interval with make_interval(). The driver is called with
    positional arguments and does not interpolate '%s' into the SQL text,
    so the previous "INTERVAL '%s days'" form could not execute.

    Args:
        days: Number of days to look back (default: 7)

    Returns:
        Dictionary with:
            - total: Total number of crashes
            - by_type: Dict mapping crash type to count
            - by_day: List of dicts with date and count
    """
    async with self.pool.acquire() as conn:
        # Get total count
        total = await conn.fetchval("""
            SELECT COUNT(*)
            FROM crash_reports
            WHERE created_at >= NOW() - make_interval(days => $1)
        """, days)

        # Get counts by type
        type_rows = await conn.fetch("""
            SELECT crash_type, COUNT(*) as count
            FROM crash_reports
            WHERE created_at >= NOW() - make_interval(days => $1)
            GROUP BY crash_type
            ORDER BY count DESC
        """, days)

        by_type = {row['crash_type']: row['count'] for row in type_rows}

        # Get counts by day
        day_rows = await conn.fetch("""
            SELECT DATE(created_at) as date, COUNT(*) as count
            FROM crash_reports
            WHERE created_at >= NOW() - make_interval(days => $1)
            GROUP BY DATE(created_at)
            ORDER BY date DESC
        """, days)

        # Dates are stringified so the result can be JSON-encoded directly.
        by_day = [{'date': str(row['date']), 'count': row['count']} for row in day_rows]

        return {
            'total': total or 0,
            'by_type': by_type,
            'by_day': by_day
        }
|
||||
@@ -1,391 +0,0 @@
|
||||
"""
|
||||
Date conversion utilities for Google Maps reviews.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
def relative_to_datetime(date_str: str, lang: str = "en") -> Optional[datetime]:
    """
    Convert a relative date string to a datetime object.

    The string is tried in the requested language first and then in the
    other supported languages. try_parse_date() is called directly instead
    of parse_relative_date(), because the latter substitutes a random date
    for unparseable input and so can never signal failure; going through it
    made this function return fabricated datetimes instead of None.

    Args:
        date_str: The relative date string (e.g., "2 years ago")
        lang: Language code ("en", "he" or "th")

    Returns:
        datetime object (relative to the current UTC time) or None if
        conversion fails
    """
    if not date_str:
        return None

    try:
        now = datetime.utcnow()
        requested = lang.lower()
        candidates = [requested] + [alt for alt in ("en", "he", "th") if alt != requested]

        for candidate in candidates:
            iso_date = try_parse_date(date_str, candidate, now)
            # try_parse_date hands the input back unchanged when it cannot parse it
            if iso_date != date_str:
                return datetime.fromisoformat(iso_date)

        return None
    except Exception as e:
        log.debug(f"Failed to convert relative date '{date_str}': {e}")
        return None
|
||||
|
||||
|
||||
class DateConverter:
    """Turns string dates inside review documents into datetime objects."""

    # Top-level document fields that are expected to hold dates.
    _DATE_FIELDS = ("created_date", "last_modified_date", "review_date")

    @staticmethod
    def _description_lang(doc: Dict[str, Any]) -> str:
        """Return the first language key of doc["description"], or "en" when there is none."""
        return next(iter(doc.get("description", {}).keys()), "en")

    @staticmethod
    def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert the string dates of a single document in place.

        The raw "date" field is removed; when the document has no usable
        "review_date" the raw value is used to derive one. Each known date
        field holding a string is parsed as ISO 8601 first and as a relative
        date second. The "date" entry of every owner response is dropped.

        Args:
            doc: Review document with string dates

        Returns:
            The same document object, with dates converted where possible
        """
        if "date" in doc:
            raw_date = doc.pop("date")

            # Fall back to the raw text only when review_date is missing or empty
            if not doc.get("review_date"):
                parsed = relative_to_datetime(raw_date, DateConverter._description_lang(doc))
                if parsed:
                    doc["review_date"] = parsed

        for name in DateConverter._DATE_FIELDS:
            if name not in doc or not isinstance(doc[name], str):
                continue
            try:
                # ISO format first; a trailing "Z" is rewritten as an explicit UTC offset
                doc[name] = datetime.fromisoformat(doc[name].replace('Z', '+00:00'))
            except (ValueError, TypeError):
                # Not ISO: treat the value as a relative date instead
                parsed = relative_to_datetime(doc[name], DateConverter._description_lang(doc))
                if parsed:
                    doc[name] = parsed

        responses = doc.get("owner_responses") if "owner_responses" in doc else None
        if isinstance(responses, dict):
            for response in responses.values():
                # Owner responses keep their text but lose the raw date string
                if isinstance(response, dict) and "date" in response:
                    del response["date"]

        return doc

    @staticmethod
    def convert_dates_in_reviews(reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Convert the string dates of every review in a mapping.

        Args:
            reviews: Dictionary of review documents keyed by review id

        Returns:
            The same mapping, each value run through convert_dates_in_document
        """
        log.info("Converting string dates to datetime objects...")

        for review_id, review in reviews.items():
            reviews[review_id] = DateConverter.convert_dates_in_document(review)

        return reviews
|
||||
|
||||
|
||||
def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
    """
    Convert a relative review date such as "a week ago" or "לפני 7 שנים"
    into an ISO 8601 datetime string.

    The requested language is tried first; if it does not match, the other
    supported languages ("en", "he", "th") are tried in that order.

    Examples of accepted input:
        - English: "a day ago", "3 weeks ago", "4 months ago", "2 years ago"
        - Hebrew: "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים",
          "לפני חודש", "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים"

    Parameters:
        - date_str (str): the relative date string.
        - lang (str): language code to try first.
        - now (Optional[datetime]): reference datetime; defaults to the
          current UTC time (naive datetime).

    Returns:
        The calculated absolute datetime in ISO 8601 format.
        NOTE: when no language matches, the result is a *random* date between
        1 and 365 days before `now`, so callers cannot detect a parse failure
        from the return value.
    """
    import random

    reference = datetime.utcnow() if now is None else now

    # Requested language first
    parsed = try_parse_date(date_str, lang, reference)
    if parsed != date_str:
        return parsed

    # Then every other supported language
    requested = lang.lower()
    for fallback_lang in ("en", "he", "th"):
        if fallback_lang == requested:
            continue
        parsed = try_parse_date(date_str, fallback_lang, reference)
        if parsed != date_str:
            return parsed

    # Nothing matched: fabricate a date 1 to 365 days in the past
    days_back = random.randint(1, 365)
    return (reference - timedelta(days=days_back)).isoformat()
|
||||
|
||||
|
||||
# Relative-date patterns, compiled once at import time.
# Existing alternatives keep their original order; hour/minute units are appended.
_EN_RELATIVE_RE = re.compile(
    r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year|hour|minute)s?\s+ago',
    re.IGNORECASE,
)
_HE_RELATIVE_RE = re.compile(
    r'(?P<num>\d+|אחד|אחת)?\s*'
    r'(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות|שעה|שעות|דקה|דקות)',
    re.IGNORECASE,
)
_TH_RELATIVE_RE = re.compile(
    r'(?P<num>\d+)?\s*(?P<unit>วัน|สัปดาห์|เดือน|ปี|ชั่วโมง|นาที)ที่แล้ว',
    re.IGNORECASE,
)

# Hebrew dual forms, where the number two is part of the word itself.
_HE_DUAL_FORMS = {
    "חודשיים": (2, "month"),
    "שבועיים": (2, "week"),
    "יומיים": (2, "day"),
    "שנתיים": (2, "year"),
    "שעתיים": (2, "hour"),
}

# Hebrew singular/plural unit words mapped to English unit names.
_HE_UNITS = {
    "יום": "day", "ימים": "day",
    "שבוע": "week", "שבועות": "week",
    "חודש": "month", "חודשים": "month",
    "שנה": "year", "שנים": "year",
    "שעה": "hour", "שעות": "hour",
    "דקה": "minute", "דקות": "minute",
}

# Thai unit words mapped to English unit names.
_TH_UNITS = {
    "วัน": "day",
    "สัปดาห์": "week",
    "เดือน": "month",
    "ปี": "year",
    "ชั่วโมง": "hour",
    "นาที": "minute",
}


def _relative_delta(num: int, unit: str) -> timedelta:
    """Return the timedelta for `num` units; months and years are approximated as 30 and 365 days."""
    if unit == "minute":
        return timedelta(minutes=num)
    if unit == "hour":
        return timedelta(hours=num)
    if unit == "day":
        return timedelta(days=num)
    if unit == "week":
        return timedelta(weeks=num)
    if unit == "month":
        return timedelta(days=30 * num)  # approximate
    if unit == "year":
        return timedelta(days=365 * num)  # approximate
    return timedelta(0)


def try_parse_date(date_str: str, lang: str, now: datetime) -> str:
    """
    Helper function that attempts to parse a date string in a specific language.

    Supported units are minutes, hours, days, weeks, months and years in
    English ("en"), Hebrew ("he") and Thai ("th"). A missing number means one
    unit (English "a"/"an", Hebrew "אחד"/"אחת" or no number, Thai no number).

    Args:
        date_str: The relative date text, e.g. "3 weeks ago"
        lang: Language code selecting the pattern set (case-insensitive)
        now: Reference datetime the offset is subtracted from

    Returns:
        The ISO formatted date if successful, or the original string if not
        (unknown language or no pattern match).
    """
    language = lang.lower()
    num = None
    unit = None

    if language == "en":
        m = _EN_RELATIVE_RE.search(date_str)
        if m:
            num_str = m.group("num").lower()
            num = 1 if num_str in ("a", "an") else int(num_str)
            unit = m.group("unit").lower()

    elif language == "he":
        # Remove the "לפני" ("ago") prefix if present
        text = date_str.strip()
        if text.startswith("לפני"):
            text = text[len("לפני"):].strip()

        if text in _HE_DUAL_FORMS:
            num, unit = _HE_DUAL_FORMS[text]
        else:
            m = _HE_RELATIVE_RE.search(text)
            if m:
                num_str = m.group("num")
                if not num_str:
                    num = 1
                else:
                    try:
                        num = int(num_str)
                    except ValueError:
                        # the spelled-out "one" (אחד / אחת)
                        num = 1
                unit = _HE_UNITS.get(m.group("unit"), "day")

    elif language == "th":
        # Thai patterns like "3 วันที่แล้ว" (3 days ago)
        m = _TH_RELATIVE_RE.search(date_str)
        if m:
            num_str = m.group("num")
            num = 1 if not num_str else int(num_str)
            unit = _TH_UNITS.get(m.group("unit"), "day")

    # Return the calculated date if parsing was successful, otherwise the original string
    if unit is None:
        return date_str
    return (now - _relative_delta(num, unit)).isoformat()
|
||||
|
||||
|
||||
# def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
|
||||
# """
|
||||
# Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
|
||||
# into an ISO formatted datetime string (UTC).
|
||||
#
|
||||
# For English, supported formats include:
|
||||
# - "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
|
||||
# For Hebrew, supported formats include:
|
||||
# - "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
|
||||
# "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
|
||||
#
|
||||
# Parameters:
|
||||
# - date_str (str): the relative date string.
|
||||
# - lang (str): "en" for English or "he" for Hebrew.
|
||||
# - now (Optional[datetime]): reference datetime; if None, current local time is used.
|
||||
#
|
||||
# Returns:
|
||||
# A string representing the calculated absolute datetime in ISO 8601 format,
|
||||
# or the original date_str if parsing fails.
|
||||
# """
|
||||
# if now is None:
|
||||
# now = datetime.utcnow() # use UTC for consistency
|
||||
#
|
||||
# delta = timedelta(0)
|
||||
#
|
||||
# if lang.lower() == "en":
|
||||
# # Pattern: capture number or "a"/"an", then unit.
|
||||
# pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
|
||||
# m = pattern.search(date_str)
|
||||
# if m:
|
||||
# num_str = m.group("num").lower()
|
||||
# num = 1 if num_str in ("a", "an") else int(num_str)
|
||||
# unit = m.group("unit").lower()
|
||||
# if unit == "day":
|
||||
# delta = timedelta(days=num)
|
||||
# elif unit == "week":
|
||||
# delta = timedelta(weeks=num)
|
||||
# elif unit == "month":
|
||||
# delta = timedelta(days=30 * num) # approximate
|
||||
# elif unit == "year":
|
||||
# delta = timedelta(days=365 * num) # approximate
|
||||
# else:
|
||||
# return date_str # return original if not matched
|
||||
# elif lang.lower() == "he":
|
||||
# # Remove the "לפני" prefix if present
|
||||
# text = date_str.strip()
|
||||
# if text.startswith("לפני"):
|
||||
# text = text[len("לפני"):].strip()
|
||||
#
|
||||
# # Handle special cases where the number and unit are combined:
|
||||
# special = {
|
||||
# "חודשיים": (2, "month"),
|
||||
# "שבועיים": (2, "week"),
|
||||
# "יומיים": (2, "day"),
|
||||
# }
|
||||
# if text in special:
|
||||
# num, unit = special[text]
|
||||
# else:
|
||||
# # Match optional number (or assume 1) and then a unit.
|
||||
# pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
|
||||
# re.IGNORECASE)
|
||||
# m = pattern.search(text)
|
||||
# if m:
|
||||
# num_str = m.group("num")
|
||||
# if not num_str:
|
||||
# num = 1
|
||||
# else:
|
||||
# try:
|
||||
# num = int(num_str)
|
||||
# except ValueError:
|
||||
# num = 1
|
||||
# unit_he = m.group("unit")
|
||||
# # Map the Hebrew unit (both singular and plural) to English unit names
|
||||
# if unit_he in ("יום", "ימים"):
|
||||
# unit = "day"
|
||||
# elif unit_he in ("שבוע", "שבועות"):
|
||||
# unit = "week"
|
||||
# elif unit_he in ("חודש", "חודשים"):
|
||||
# unit = "month"
|
||||
# elif unit_he in ("שנה", "שנים"):
|
||||
# unit = "year"
|
||||
# else:
|
||||
# unit = "day" # fallback
|
||||
# else:
|
||||
# return date_str # if nothing matches, return original text
|
||||
#
|
||||
# if unit == "day":
|
||||
# delta = timedelta(days=num)
|
||||
# elif unit == "week":
|
||||
# delta = timedelta(weeks=num)
|
||||
# elif unit == "month":
|
||||
# delta = timedelta(days=30 * num) # approximate
|
||||
# elif unit == "year":
|
||||
# delta = timedelta(days=365 * num) # approximate
|
||||
#
|
||||
# result = now - delta
|
||||
# return result.isoformat()
|
||||
|
||||
|
||||
# --- Example usage ---
|
||||
if __name__ == "__main__":
    # Pin the reference time so the printed results are reproducible.
    reference_time = datetime(2025, 2, 5, 12, 0, 0)

    # (text, language) pairs; the first one deliberately uses the wrong
    # language code, so it only parses via the other-language fallback.
    samples = (
        ("a week ago", "he"),
        ("4 weeks ago", "en"),
        ("לפני 7 שנים", "he"),
        ("לפני חודשיים", "he"),
    )

    for phrase, language in samples:
        converted = parse_relative_date(phrase, language, now=reference_time)
        print(f"Original: {phrase} ({language}) => ISO: {converted}")
|
||||
@@ -1,411 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smart health check system with canary testing.
|
||||
Verifies that scraping actually works, not just that services are up.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional
|
||||
import os
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CanaryMonitor:
    """
    Background canary test monitor.

    Runs actual scraping tests periodically to verify the scraper works.
    This catches issues like:
    - Google Maps page structure changes
    - Broken CSS selectors
    - GDPR consent handling issues
    - Network/proxy problems
    - Chrome/browser issues

    Failure accounting: every failed run increments consecutive_failures
    exactly once, inside run_canary_test(). The start() loop only logs and
    alerts on re-raised errors and does not count them a second time.
    """

    def __init__(
        self,
        db,
        interval_hours: int = 4,
        test_url: Optional[str] = None
    ):
        """
        Initialize canary monitor.

        Args:
            db: Database manager instance (must provide save_canary_result)
            interval_hours: How often to run canary tests
            test_url: Optional test URL; falls back to the CANARY_TEST_URL
                environment variable, then to a built-in Google Maps place URL
        """
        self.db = db
        self.interval = timedelta(hours=interval_hours)
        self.test_url = test_url or os.getenv(
            'CANARY_TEST_URL',
            'https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/'
        )

        self.running = False
        self.last_run: Optional[datetime] = None
        self.last_success: Optional[datetime] = None
        self.consecutive_failures = 0
        self.last_result: Optional[Dict[str, Any]] = None

    async def start(self):
        """Start the background canary monitoring loop; runs until stop() clears the flag."""
        self.running = True
        log.info(f"Canary monitor started (interval: {self.interval.total_seconds()/3600:.1f}h)")

        while self.running:
            try:
                await self.run_canary_test()
            except Exception as e:
                # run_canary_test() has already counted this failure before
                # re-raising; incrementing again here would double-count it
                # and trip the alert threshold after two failures, not three.
                log.error(f"Canary test failed with exception: {e}")

                # Alert if multiple consecutive failures
                if self.consecutive_failures >= 3:
                    await self.send_alert(
                        f"🚨 CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row! "
                        f"Last error: {str(e)[:200]}"
                    )

            # Sleep until next run
            await asyncio.sleep(self.interval.total_seconds())

    def stop(self):
        """Stop the background monitoring (takes effect when the current sleep or test ends)."""
        self.running = False
        log.info("Canary monitor stopped")

    async def run_canary_test(self):
        """
        Run a single canary test.

        This performs an actual scrape on a known test URL and validates:
        - Scraping succeeds
        - Reviews are extracted
        - Review count is reasonable (10-500)
        - Scrape time is reasonable (< 30s)
        - Data structure is valid

        The outcome is stored in last_result and persisted through
        db.save_canary_result(). On any failure consecutive_failures is
        incremented; validation failures and timeouts are handled here,
        other errors are recorded and then re-raised.

        Raises:
            Exception: any non-timeout error raised while importing the
                scraper, scraping, validating or saving the result
        """
        log.info(f"Running canary scrape test on {self.test_url[:60]}...")
        self.last_run = datetime.now()

        try:
            # Imported inside the try so that a broken scraper module is
            # counted and recorded like any other canary failure.
            from modules.scraper_clean import fast_scrape_reviews

            # Run actual scrape with timeout.
            # NOTE(review): wait_for only stops waiting; the worker thread
            # started by to_thread presumably keeps running after a timeout.
            # Confirm the scraper cleans up its browser in that case.
            result = await asyncio.wait_for(
                asyncio.to_thread(
                    fast_scrape_reviews,
                    url=self.test_url,
                    headless=True,
                    max_scrolls=10  # Limited for canary
                ),
                timeout=60  # Fail if takes > 60s
            )

            # Validate result
            checks = {
                "scrape_succeeded": result['success'],
                "got_reviews": result['count'] > 0,
                "reasonable_count": 10 <= result['count'] <= 500,
                "reasonable_time": result['time'] < 30,
                "data_structure_valid": self._validate_review_structure(result.get('reviews', []))
            }

            all_passed = all(checks.values())

            if all_passed:
                # Success!
                log.info(
                    f"✅ Canary test PASSED: {result['count']} reviews in {result['time']:.1f}s"
                )
                self.consecutive_failures = 0
                self.last_success = datetime.now()
                self.last_result = {
                    "status": "pass",
                    "reviews_count": result['count'],
                    "scrape_time": result['time'],
                    "checks": checks
                }

                # Save to database
                await self.db.save_canary_result(
                    success=True,
                    reviews_count=result['count'],
                    scrape_time=result['time'],
                    metadata={"checks": checks}
                )

            else:
                # Validation failed
                failed_checks = [k for k, v in checks.items() if not v]
                log.error(
                    f"❌ Canary test FAILED: validation failed on {failed_checks}"
                )
                self.consecutive_failures += 1
                self.last_result = {
                    "status": "fail",
                    "reviews_count": result['count'],
                    "scrape_time": result['time'],
                    "checks": checks,
                    "failed_checks": failed_checks
                }

                # Save to database
                await self.db.save_canary_result(
                    success=False,
                    reviews_count=result['count'],
                    scrape_time=result['time'],
                    error_message=f"Validation failed: {failed_checks}",
                    metadata={"checks": checks}
                )

                # Alert on failure
                if self.consecutive_failures >= 3:
                    await self.send_alert(
                        f"🚨 CRITICAL: Canary validation failed {self.consecutive_failures} times! "
                        f"Failed checks: {failed_checks}"
                    )

        except asyncio.TimeoutError:
            log.error("❌ Canary test TIMEOUT (>60s)")
            self.consecutive_failures += 1
            self.last_result = {
                "status": "timeout",
                "error": "Scrape took longer than 60 seconds"
            }

            await self.db.save_canary_result(
                success=False,
                error_message="Timeout after 60 seconds"
            )

            if self.consecutive_failures >= 3:
                await self.send_alert(
                    f"🚨 CRITICAL: Canary timeout {self.consecutive_failures} times!"
                )

        except Exception as e:
            log.error(f"❌ Canary test ERROR: {e}")
            self.consecutive_failures += 1
            self.last_result = {
                "status": "error",
                "error": str(e)
            }

            await self.db.save_canary_result(
                success=False,
                error_message=str(e)
            )

            raise  # Re-raise to trigger alert in main loop

    def _validate_review_structure(self, reviews) -> bool:
        """
        Validate that reviews have expected structure.

        Only the first review is inspected.

        Args:
            reviews: List of review dictionaries

        Returns:
            True if the list is non-empty and its first review has the
            'author', 'rating' and 'date_text' fields
        """
        if not reviews:
            return False

        # Check first review has required fields
        first_review = reviews[0]
        required_fields = ['author', 'rating', 'date_text']

        return all(field in first_review for field in required_fields)

    async def send_alert(self, message: str):
        """
        Send alert via configured channels.

        The message is always logged at CRITICAL level. When the
        SLACK_WEBHOOK_URL environment variable is set it is also posted to
        Slack; a Slack failure is logged and never raised.

        Args:
            message: Alert message to send
        """
        log.critical(message)

        # TODO: Integrate with alerting systems
        # Examples:

        # Slack
        slack_webhook = os.getenv('SLACK_WEBHOOK_URL')
        if slack_webhook:
            try:
                import httpx
                async with httpx.AsyncClient() as client:
                    await client.post(
                        slack_webhook,
                        json={"text": message},
                        timeout=5.0
                    )
                log.info("Alert sent to Slack")
            except Exception as e:
                log.error(f"Failed to send Slack alert: {e}")

        # Email (example with SMTP)
        # smtp_config = os.getenv('SMTP_CONFIG')
        # if smtp_config:
        #     await send_email(
        #         to=os.getenv('ALERT_EMAIL'),
        #         subject="Scraper Canary Alert",
        #         body=message
        #     )

        # PagerDuty
        # pagerduty_key = os.getenv('PAGERDUTY_KEY')
        # if pagerduty_key:
        #     await trigger_pagerduty(message)

    def get_status(self) -> Dict[str, Any]:
        """
        Get current canary status.

        Returns:
            Status dictionary whose "status" is one of:
            - "unknown": no test has finished yet
            - "failing": tests have failed and none has ever succeeded
            - "stale": the last success is more than 6 hours old
            - "healthy": a test succeeded within the last 6 hours
        """
        if not self.last_success:
            if self.consecutive_failures > 0:
                # Runs have happened and all of them failed. Reporting
                # "unknown" here would let a never-working scraper pass
                # health checks that tolerate the first-run state.
                return {
                    "status": "failing",
                    "message": (
                        f"No successful canary test yet "
                        f"({self.consecutive_failures} consecutive failures)"
                    ),
                    "last_run": self.last_run.isoformat() if self.last_run else None,
                    "consecutive_failures": self.consecutive_failures,
                    "last_result": self.last_result
                }

            return {
                "status": "unknown",
                "message": "No canary tests run yet",
                "last_run": self.last_run.isoformat() if self.last_run else None
            }

        age = datetime.now() - self.last_success
        max_age = timedelta(hours=6)  # Alert if no success in 6 hours

        if age > max_age:
            return {
                "status": "stale",
                "last_success": self.last_success.isoformat(),
                "age_hours": age.total_seconds() / 3600,
                "consecutive_failures": self.consecutive_failures,
                "message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
            }

        return {
            "status": "healthy",
            "last_success": self.last_success.isoformat(),
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "age_minutes": age.total_seconds() / 60,
            "consecutive_failures": self.consecutive_failures,
            "last_result": self.last_result
        }
|
||||
|
||||
|
||||
class HealthCheckSystem:
    """
    Complete health check system for production.

    Provides multiple levels of health checks:
    - Liveness: Is the server alive?
    - Readiness: Can it handle traffic?
    - Canary: Does scraping actually work?
    """

    def __init__(self, db):
        """
        Initialize health check system.

        Args:
            db: Database manager instance
        """
        self.db = db
        self.canary = CanaryMonitor(db, interval_hours=4)
        # Strong reference to the background task. The event loop only keeps
        # weak references, so an unreferenced task may be garbage-collected
        # while it is still running.
        self._canary_task: Optional[asyncio.Task] = None

    async def start(self):
        """Start background health monitoring (no-op if it is already running)."""
        if self._canary_task is not None and not self._canary_task.done():
            return
        self._canary_task = asyncio.create_task(self.canary.start())

    def stop(self):
        """Stop background health monitoring and cancel the pending canary task."""
        self.canary.stop()

        # The canary loop sleeps for its whole interval between runs, so
        # cancel the task instead of waiting for it to notice the stop flag.
        task, self._canary_task = self._canary_task, None
        if task is not None and not task.done():
            task.cancel()

    async def check_liveness(self) -> Dict[str, Any]:
        """
        Liveness check: Is the server alive?

        This is a simple check that always succeeds if the server is running.
        Used by Kubernetes liveness probe - restart container if fails.

        Returns:
            Liveness status with a UTC timestamp
        """
        return {
            "status": "alive",
            "timestamp": datetime.utcnow().isoformat()
        }

    async def check_readiness(self) -> Dict[str, Any]:
        """
        Readiness check: Can the server handle traffic?

        Checks if dependencies are available (currently a "SELECT 1"
        against the database pool).
        Used by Kubernetes readiness probe - remove from load balancer if fails.

        Returns:
            Readiness status: "ready" only when every check is healthy
        """
        checks = {}

        # Check database
        try:
            await self.db.pool.fetchval("SELECT 1")
            checks["database"] = {"healthy": True}
        except Exception as e:
            # Any failure is reported in the payload and never raised to the probe
            checks["database"] = {"healthy": False, "error": str(e)}

        # Overall readiness
        all_healthy = all(c.get("healthy", False) for c in checks.values())

        return {
            "status": "ready" if all_healthy else "not_ready",
            "checks": checks,
            "timestamp": datetime.utcnow().isoformat()
        }

    async def check_canary(self) -> Dict[str, Any]:
        """
        Canary check: Does scraping actually work?

        Returns the latest canary test result.
        Used by external monitoring (PagerDuty, DataDog) for alerts.

        Returns:
            Canary status
        """
        return self.canary.get_status()

    async def get_detailed_health(self) -> Dict[str, Any]:
        """
        Get detailed health status of all components.

        Returns:
            Complete health status: "healthy" when the server is alive and
            ready and the canary is "healthy" or "unknown", else "degraded"
        """
        liveness = await self.check_liveness()
        readiness = await self.check_readiness()
        canary = await self.check_canary()

        overall_healthy = (
            liveness["status"] == "alive" and
            readiness["status"] == "ready" and
            canary["status"] in ["healthy", "unknown"]  # Unknown is OK (first run)
        )

        return {
            "status": "healthy" if overall_healthy else "degraded",
            "components": {
                "liveness": liveness,
                "readiness": readiness,
                "canary": canary
            },
            "timestamp": datetime.utcnow().isoformat()
        }
|
||||
@@ -1,93 +0,0 @@
|
||||
"""
|
||||
Data models for Google Maps Reviews Scraper.
|
||||
"""
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
|
||||
from modules.utils import (try_find, first_text, first_attr, safe_int, detect_lang, parse_date_to_iso)
|
||||
|
||||
|
||||
@dataclass
class RawReview:
    """
    A single review as read from a Google Maps review card.

    Every field has an empty default, so a card that is missing some
    elements still produces a complete record.
    """
    id: str = ""
    author: str = ""
    rating: float = 0.0
    date: str = ""  # date text exactly as shown on the card
    lang: str = "und"
    text: str = ""
    likes: int = 0
    photos: list[str] = field(default_factory=list)
    profile: str = ""
    avatar: str = ""  # URL of the reviewer's profile picture
    owner_date: str = ""
    owner_text: str = ""
    review_date: str = ""  # `date` converted by parse_date_to_iso

    # Translations keyed by language code
    translations: dict = field(default_factory=dict)

    # CSS selectors used by from_card (plain class attributes, not fields)
    MORE_BTN = "button.kyuRq"
    LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
    PHOTO_BTN = "button.Tya61d"
    OWNER_RESP = "div.CDe7pd"

    @classmethod
    def from_card(cls, card: WebElement) -> "RawReview":
        """Build a RawReview from the elements found inside one review card."""
        # Expand truncated text first; a click that fails is ignored so the
        # rest of the card can still be read.
        for more_button in try_find(card, cls.MORE_BTN, all=True):
            try:
                more_button.click()
            except Exception:
                pass

        # The review id sits on the card itself or, failing that, on a
        # descendant carrying the same attribute.
        review_id = card.get_attribute("data-review-id") or ""
        if not review_id:
            id_holders = try_find(card, "[data-review-id]")
            if id_holders:
                review_id = id_holders[0].get_attribute("data-review-id") or ""

        author_name = first_text(card, 'div[class*="d4r55"]')
        profile_url = first_attr(card, 'button[data-review-id]', "data-href")
        avatar_url = first_attr(card, 'button[data-review-id] img', "src")

        # Rating is the first number in the aria-label; a decimal comma is
        # normalised to a dot before parsing.
        stars_label = first_attr(card, 'span[role="img"]', "aria-label")
        rating_value = 0.0
        if stars_label:
            number = re.search(r"[\d\.]+", stars_label.replace(",", "."))
            if number:
                rating_value = float(number.group())

        date_text = first_text(card, 'span[class*="rsqaWe"]')
        iso_date = parse_date_to_iso(date_text)

        # Try each known text selector in order and keep the first hit.
        text_selectors = (
            'span[jsname="bN97Pc"]',
            'span[jsname="fbQN7e"]',
            'div.MyEned span.wiI7pd',
        )
        body = next(
            (found for sel in text_selectors if (found := first_text(card, sel))),
            "",
        )
        language = detect_lang(body)

        like_count = 0
        like_buttons = try_find(card, cls.LIKE_BTN)
        if like_buttons:
            thumb = like_buttons[0]
            like_count = safe_int(thumb.text or thumb.get_attribute("aria-label"))

        # Photo URLs are embedded in each photo button's inline style.
        photo_urls: list[str] = [
            hit.group(1)
            for photo_button in try_find(card, cls.PHOTO_BTN, all=True)
            if (hit := re.search(r'url\("([^"]+)"',
                                 photo_button.get_attribute("style") or ""))
        ]

        reply_date = reply_text = ""
        reply_boxes = try_find(card, cls.OWNER_RESP)
        if reply_boxes:
            reply_box = reply_boxes[0]
            reply_date = first_text(reply_box, "span.DZSIDd")
            reply_text = first_text(reply_box, "div.wiI7pd")

        return cls(
            id=review_id,
            author=author_name,
            rating=rating_value,
            date=date_text,
            lang=language,
            text=body,
            likes=like_count,
            photos=photo_urls,
            profile=profile_url,
            avatar=avatar_url,
            owner_date=reply_date,
            owner_text=reply_text,
            review_date=iso_date,
        )
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,250 +0,0 @@
|
||||
"""
|
||||
Structured Logger Module
|
||||
|
||||
Provides a thread-safe, structured logging system with JSON-serializable output.
|
||||
Designed to replace the LogCapture class with enhanced categorization and metrics support.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Literal, Optional
|
||||
import threading
|
||||
import time
|
||||
|
||||
|
||||
# Allowed values for the `level` and `category` fields of a log entry.
LogLevel = Literal['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL']
LogCategory = Literal['scraper', 'browser', 'network', 'system']


@dataclass
class LogEntry:
    """One structured log record: when, how severe, which area, and what."""
    timestamp: str  # ISO 8601 with Z suffix
    timestamp_ms: int  # Unix milliseconds
    level: LogLevel
    category: LogCategory
    message: str
    metrics: Optional[Dict] = None  # memory_mb, reviews_count, scroll_position, dom_nodes, etc.
    network: Optional[Dict] = None  # url, method, status, size_bytes, duration_ms
    snapshot_id: Optional[str] = None

    def to_dict(self) -> Dict:
        """
        Return the entry as a plain dictionary.

        The five required fields are always present; `metrics`, `network`
        and `snapshot_id` appear only when they are not None.
        """
        payload = {
            'timestamp': self.timestamp,
            'timestamp_ms': self.timestamp_ms,
            'level': self.level,
            'category': self.category,
            'message': self.message,
        }
        for optional_key in ('metrics', 'network', 'snapshot_id'):
            value = getattr(self, optional_key)
            if value is not None:
                payload[optional_key] = value
        return payload
|
||||
|
||||
|
||||
class StructuredLogger:
    """
    In-memory structured logger guarded by a lock, with a capped history.

    Entries are kept in insertion order; once the cap is exceeded the oldest
    entries are dropped in batches.

    Example usage:
        logger = StructuredLogger()
        logger.info('browser', 'Navigating to URL', metrics={'memory_mb': 245})
        logger.warn('network', 'Rate limit detected', network={'status': 429, 'url': '...'})
        logger.error('system', 'Chrome crashed', metrics={'memory_mb': 489, 'dom_nodes': 12000})
    """

    def __init__(self, max_entries: int = 10000):
        """
        Create an empty logger.

        Args:
            max_entries: Number of entries to retain (default 10000). When
                the count goes above this, the oldest entries are pruned.
        """
        self._max_entries = max_entries
        self._lock = threading.Lock()
        self._entries: List[LogEntry] = []

    def _create_entry(
        self,
        level: LogLevel,
        category: LogCategory,
        message: str,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> LogEntry:
        """Build a LogEntry stamped with the current UTC time."""
        moment = datetime.now(timezone.utc)
        # Millisecond precision: seconds from strftime, then zero-padded millis.
        seconds_part = moment.strftime('%Y-%m-%dT%H:%M:%S.')
        millis_part = f'{moment.microsecond // 1000:03d}Z'
        return LogEntry(
            timestamp=seconds_part + millis_part,
            timestamp_ms=int(moment.timestamp() * 1000),
            level=level,
            category=category,
            message=message,
            metrics=metrics,
            network=network,
            snapshot_id=snapshot_id,
        )

    def _add_entry(self, entry: LogEntry) -> None:
        """Append an entry under the lock, pruning the oldest ones on overflow."""
        with self._lock:
            self._entries.append(entry)
            if len(self._entries) <= self._max_entries:
                return
            # Drop a tenth of the cap (at least one) so pruning is not
            # triggered again on every following append.
            drop = max(1, self._max_entries // 10)
            self._entries = self._entries[drop:]

    def _collect(self, wanted=None) -> List[Dict]:
        """Return dictionaries for all entries, or only those `wanted` accepts."""
        with self._lock:
            return [
                entry.to_dict()
                for entry in self._entries
                if wanted is None or wanted(entry)
            ]

    def debug(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a DEBUG entry."""
        self._add_entry(
            self._create_entry('DEBUG', category, message, metrics, network, snapshot_id)
        )

    def info(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record an INFO entry."""
        self._add_entry(
            self._create_entry('INFO', category, message, metrics, network, snapshot_id)
        )

    def warn(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a WARN entry."""
        self._add_entry(
            self._create_entry('WARN', category, message, metrics, network, snapshot_id)
        )

    def error(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record an ERROR entry."""
        self._add_entry(
            self._create_entry('ERROR', category, message, metrics, network, snapshot_id)
        )

    def fatal(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a FATAL entry."""
        self._add_entry(
            self._create_entry('FATAL', category, message, metrics, network, snapshot_id)
        )

    def log(self, message: str, level: str = 'INFO') -> None:
        """
        Legacy entry point kept for older callers.

        The entry is always filed under the 'system' category.

        Args:
            message: The log message
            level: Level name in any letter case (DEBUG, INFO, WARN, ERROR,
                FATAL); anything else is recorded as INFO.
        """
        known_levels = ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL')
        normalized = level.upper()
        effective = normalized if normalized in known_levels else 'INFO'
        self._add_entry(self._create_entry(effective, 'system', message))

    def get_logs(self) -> List[Dict]:
        """
        Return every stored entry as a dictionary.

        Returns:
            List of log entry dictionaries, oldest first.
        """
        return self._collect()

    def get_logs_by_category(self, category: LogCategory) -> List[Dict]:
        """
        Return the entries recorded under one category.

        Args:
            category: The category to filter by ('scraper', 'browser', 'network', 'system')

        Returns:
            List of log entry dictionaries whose category equals `category`.
        """
        return self._collect(lambda entry: entry.category == category)

    def get_logs_by_level(self, level: LogLevel) -> List[Dict]:
        """
        Return the entries recorded at one level.

        Args:
            level: The level to filter by ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL')

        Returns:
            List of log entry dictionaries whose level equals `level`.
        """
        return self._collect(lambda entry: entry.level == level)

    def get_logs_since(self, timestamp_ms: int) -> List[Dict]:
        """
        Return the entries recorded at or after a point in time.

        Args:
            timestamp_ms: Unix timestamp in milliseconds

        Returns:
            List of log entry dictionaries with timestamp_ms >= `timestamp_ms`.
        """
        return self._collect(lambda entry: entry.timestamp_ms >= timestamp_ms)

    def clear(self) -> None:
        """Remove every stored entry."""
        with self._lock:
            self._entries.clear()

    def count(self) -> int:
        """Return how many entries are currently stored."""
        with self._lock:
            return len(self._entries)

    def __len__(self) -> int:
        """Return how many entries are currently stored (same as count())."""
        return self.count()
|
||||
307
modules/utils.py
307
modules/utils.py
@@ -1,307 +0,0 @@
|
||||
"""
|
||||
Utility functions for Google Maps Reviews Scraper.
|
||||
"""
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from datetime import timezone
|
||||
from functools import lru_cache
|
||||
from typing import List
|
||||
|
||||
from selenium.common.exceptions import (NoSuchElementException,
|
||||
StaleElementReferenceException,
|
||||
TimeoutException)
|
||||
from selenium.webdriver import Chrome
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
# Character ranges used for language detection
# (U+0590-U+05FF and U+0E00-U+0E7F respectively)
HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")


@lru_cache(maxsize=1024)
def detect_lang(txt: str) -> str:
    """
    Guess a language code from the characters present in `txt`.

    Returns "he" if any character falls in the HEB_CHARS range, otherwise
    "th" if any falls in the THAI_CHARS range, otherwise "en" (including
    for empty text). Results are memoised per input string.
    """
    for script_pattern, code in ((HEB_CHARS, "he"), (THAI_CHARS, "th")):
        if script_pattern.search(txt):
            return code
    return "en"
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def safe_int(s: str | None) -> int:
|
||||
"""Safely convert string to integer, returning 0 if not possible"""
|
||||
m = re.search(r"\d+", s or "")
|
||||
return int(m.group()) if m else 0
|
||||
|
||||
|
||||
def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
    """
    Look up elements under `el` by CSS selector, returning a list either way.

    With ``all=True`` every match is returned; otherwise at most one. A
    missing or stale element results in an empty list instead of an
    exception.
    """
    try:
        if not all:
            single = el.find_element(By.CSS_SELECTOR, css)
            return [single] if single else []
        return el.find_elements(By.CSS_SELECTOR, css)
    except (NoSuchElementException, StaleElementReferenceException):
        return []
|
||||
|
||||
|
||||
def first_text(el: WebElement, css: str) -> str:
    """
    Return the stripped text of the first match for `css` that has any text.

    Matches that have gone stale are skipped; "" is returned when no match
    carries text.
    """
    for candidate in try_find(el, css, all=True):
        try:
            stripped = candidate.text.strip()
        except StaleElementReferenceException:
            continue
        if stripped:
            return stripped
    return ""
|
||||
|
||||
|
||||
def parse_date_to_iso(date_str: str) -> str:
    """
    Convert a review date such as "2 weeks ago" into an ISO-8601 UTC string.

    Relative dates ("<n> <unit> ago", or "a <unit> ago" which counts as 1)
    are subtracted from the current UTC time, truncated to whole seconds.
    Months are approximated as 30 days and years as 365 days. Text that is
    not a recognised relative date, including absolute dates such as
    "January 2023", falls back to the current time; this is a known
    simplification.

    Args:
        date_str: Date text as shown on the review.

    Returns:
        A timezone-aware ISO string, or "" when `date_str` is empty or
        cannot be processed.
    """
    # Imported here because the module-level name `datetime` is the module
    # (not the class) and `timedelta` is not imported at file level. The
    # previous code called `datetime.now` / `timezone.timedelta`, which
    # always raised and made this function return "" for every input.
    from datetime import datetime, timedelta, timezone

    if not date_str:
        return ""

    # Checked in this order; the first unit found in the text wins.
    unit_spans = (
        ("minute", timedelta(minutes=1)),
        ("hour", timedelta(hours=1)),
        ("day", timedelta(days=1)),
        ("week", timedelta(weeks=1)),
        ("month", timedelta(days=30)),   # approximation
        ("year", timedelta(days=365)),   # approximation
    )

    try:
        lowered = date_str.lower()
        now = datetime.now(timezone.utc).replace(microsecond=0)

        if "ago" in lowered:
            count_match = re.search(r'\d+', date_str)
            amount = int(count_match.group()) if count_match else 1
            for unit, span in unit_spans:
                if unit in lowered:
                    return (now - amount * span).isoformat()

        # No recognised relative unit: best effort is the current time.
        return now.isoformat()
    except (AttributeError, OverflowError, TypeError, ValueError):
        # Non-string input or an out-of-range offset: report "unparseable".
        return ""
|
||||
|
||||
|
||||
def first_attr(el: WebElement, css: str, attr: str) -> str:
    """
    Return the stripped `attr` value of the first match for `css` that has one.

    Matches that have gone stale are skipped; "" is returned when no match
    carries a non-empty value.
    """
    for candidate in try_find(el, css, all=True):
        try:
            value = (candidate.get_attribute(attr) or "").strip()
        except StaleElementReferenceException:
            continue
        if value:
            return value
    return ""
|
||||
|
||||
|
||||
def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
    """
    Click the first usable element matching `css`, if there is one.

    The elements already present are tried first; if none of them could be
    clicked, the function waits up to `timeout` seconds for a clickable
    match. Any error is treated as "not clicked".

    Args:
        driver: WebDriver instance
        css: CSS selector for the element to click
        delay: Pause after a successful click (seconds)
        timeout: Longest wait for a clickable element (seconds)

    Returns:
        True if an element was clicked, False otherwise
    """
    try:
        candidates = driver.find_elements(By.CSS_SELECTOR, css)
        # Nothing matches at all: no point in waiting.
        if not candidates:
            return False

        # Pass 1: click the first match that is displayed and enabled.
        for candidate in candidates:
            try:
                if not (candidate.is_displayed() and candidate.is_enabled()):
                    continue
                candidate.click()
                time.sleep(delay)
                return True
            except Exception:
                # This match could not be used; move on to the next one.
                continue

        # Pass 2: none of the current matches worked, so wait for one.
        try:
            clickable = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, css))
            )
            clickable.click()
            time.sleep(delay)
            return True
        except TimeoutException:
            return False

    except Exception as e:
        log.debug(f"Error in click_if: {str(e)}")
        return False
|
||||
|
||||
|
||||
def get_current_iso_date() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    # Local import under an alias so the module-level `datetime` name is
    # left untouched.
    import datetime as _dt

    now_utc = _dt.datetime.now(_dt.timezone.utc)
    return now_utc.isoformat()
|
||||
|
||||
# """
|
||||
# Utility functions for Google Maps Reviews Scraper.
|
||||
# """
|
||||
#
|
||||
# import re
|
||||
# import time
|
||||
# import logging
|
||||
# from datetime import datetime, timezone
|
||||
# from functools import lru_cache
|
||||
# from typing import List, Optional
|
||||
#
|
||||
# from selenium.common.exceptions import (NoSuchElementException,
|
||||
# StaleElementReferenceException,
|
||||
# TimeoutException)
|
||||
# from selenium.webdriver import Chrome
|
||||
# from selenium.webdriver.common.by import By
|
||||
# from selenium.webdriver.remote.webelement import WebElement
|
||||
# from selenium.webdriver.support import expected_conditions as EC
|
||||
# from selenium.webdriver.support.ui import WebDriverWait
|
||||
#
|
||||
# # Constants for language detection
|
||||
# HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
|
||||
# THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
|
||||
#
|
||||
# # Logger
|
||||
# log = logging.getLogger("scraper")
|
||||
#
|
||||
#
|
||||
# @lru_cache(maxsize=1024)
|
||||
# def detect_lang(txt: str) -> str:
|
||||
# """Detect language based on character sets"""
|
||||
# if HEB_CHARS.search(txt): return "he"
|
||||
# if THAI_CHARS.search(txt): return "th"
|
||||
# return "en"
|
||||
#
|
||||
#
|
||||
# @lru_cache(maxsize=128)
|
||||
# def safe_int(s: str | None) -> int:
|
||||
# """Safely convert string to integer, returning 0 if not possible"""
|
||||
# m = re.search(r"\d+", s or "")
|
||||
# return int(m.group()) if m else 0
|
||||
#
|
||||
#
|
||||
# def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
|
||||
# """Safely find elements by CSS selector without raising exceptions"""
|
||||
# try:
|
||||
# if all:
|
||||
# return el.find_elements(By.CSS_SELECTOR, css)
|
||||
# obj = el.find_element(By.CSS_SELECTOR, css)
|
||||
# return [obj] if obj else []
|
||||
# except (NoSuchElementException, StaleElementReferenceException):
|
||||
# return []
|
||||
#
|
||||
#
|
||||
# def first_text(el: WebElement, css: str) -> str:
|
||||
# """Get text from the first matching element that has non-empty text"""
|
||||
# for e in try_find(el, css, all=True):
|
||||
# if (t := e.text.strip()):
|
||||
# return t
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def first_attr(el: WebElement, css: str, attr: str) -> str:
|
||||
# """Get attribute value from the first matching element that has a non-empty value"""
|
||||
# for e in try_find(el, css, all=True):
|
||||
# if (v := (e.get_attribute(attr) or "").strip()):
|
||||
# return v
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
|
||||
# """Click element if it exists and is clickable, with timeout"""
|
||||
# try:
|
||||
# WebDriverWait(driver, timeout).until(
|
||||
# EC.element_to_be_clickable((By.CSS_SELECTOR, css))
|
||||
# ).click()
|
||||
# time.sleep(delay)
|
||||
# return True
|
||||
# except TimeoutException:
|
||||
# return False
|
||||
#
|
||||
#
|
||||
# def parse_date_to_iso(date_str: str) -> str:
|
||||
# """
|
||||
# Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.
|
||||
# Returns a best-effort ISO string, or empty string if parsing fails.
|
||||
# """
|
||||
# if not date_str:
|
||||
# return ""
|
||||
#
|
||||
# try:
|
||||
# now = datetime.now(timezone.utc)
|
||||
#
|
||||
# # Handle relative dates
|
||||
# if "ago" in date_str.lower():
|
||||
# # For simplicity, map to approximate dates
|
||||
# if "minute" in date_str.lower():
|
||||
# minutes = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(minutes=minutes)
|
||||
# elif "hour" in date_str.lower():
|
||||
# hours = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(hours=hours)
|
||||
# elif "day" in date_str.lower():
|
||||
# days = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=days)
|
||||
# elif "week" in date_str.lower():
|
||||
# weeks = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(weeks=weeks)
|
||||
# elif "month" in date_str.lower():
|
||||
# months = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# # Approximate months as 30 days
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=30 * months)
|
||||
# elif "year" in date_str.lower():
|
||||
# years = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# # Approximate years as 365 days
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=365 * years)
|
||||
# else:
|
||||
# # Default to current time if can't parse
|
||||
# dt = now.replace(microsecond=0)
|
||||
# else:
|
||||
# # Handle absolute dates (month year format)
|
||||
# # This is a simplification - would need more robust parsing for production
|
||||
# dt = now.replace(microsecond=0)
|
||||
#
|
||||
# return dt.isoformat()
|
||||
# except Exception:
|
||||
# # If parsing fails, return empty string
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def get_current_iso_date() -> str:
|
||||
# """Return current UTC time in ISO format."""
|
||||
# return datetime.now(timezone.utc).isoformat()
|
||||
@@ -1,373 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Webhook delivery system with retry logic and security.
|
||||
"""
|
||||
import asyncio
|
||||
import hmac
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
import httpx
|
||||
from uuid import UUID
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebhookDeliveryError(Exception):
    """Signals that a webhook could not be delivered after every retry."""
|
||||
|
||||
|
||||
class WebhookManager:
    """
    Delivers webhooks over HTTP with retries and optional signing.

    Features:
    - Exponential backoff retry (3 attempts by default)
    - HMAC-SHA256 signature of the JSON payload when a secret is given
    - Per-request timeout
    - Async delivery
    - Logging of all attempts (and optional persistence through `db`)
    """

    def __init__(
        self,
        max_retries: int = 3,
        timeout: float = 10.0,
        initial_retry_delay: float = 2.0
    ):
        """
        Initialize webhook manager.

        Args:
            max_retries: Maximum number of delivery attempts
            timeout: Request timeout in seconds
            initial_retry_delay: Delay before the second attempt; it doubles
                after every further failed attempt
        """
        self.max_retries = max_retries
        self.timeout = timeout
        self.initial_retry_delay = initial_retry_delay

    def generate_signature(self, payload: str, secret: str) -> str:
        """
        Generate HMAC-SHA256 signature for webhook payload.

        Args:
            payload: JSON string payload
            secret: Webhook secret

        Returns:
            Hex-encoded signature
        """
        return hmac.new(
            secret.encode('utf-8'),
            payload.encode('utf-8'),
            hashlib.sha256
        ).hexdigest()

    async def _log_attempt(self, db, job_id, attempt: int, success: bool, **details) -> None:
        """
        Record one delivery attempt through `db`, never raising.

        Does nothing unless both `db` and `job_id` are set. `details` is
        forwarded unchanged (status_code, error_message, response_time_ms).
        A failure of the database call is logged and swallowed so that
        bookkeeping can neither turn a successful delivery into a retry nor
        abort the retry loop.
        """
        if not (db and job_id):
            return
        try:
            await db.log_webhook_attempt(
                job_id=job_id,
                attempt_number=attempt,
                success=success,
                **details
            )
        except Exception as exc:
            log.error(
                f"Failed to record webhook attempt {attempt} for job {job_id}: "
                f"{type(exc).__name__}: {exc}"
            )

    async def send_webhook(
        self,
        webhook_url: str,
        payload: Dict[str, Any],
        secret: Optional[str] = None,
        job_id: Optional[UUID] = None,
        db=None
    ) -> bool:
        """
        Send webhook with retry logic.

        The payload is serialised once (non-JSON values are stringified) and
        POSTed up to `max_retries` times. Only 200, 201, 202 and 204 count
        as success; any other status, a timeout, or any other error triggers
        a retry after an exponentially growing delay.

        When `secret` is given, the request carries an X-Webhook-Signature
        header computed over the payload, plus an X-Webhook-Timestamp
        header. The timestamp is not covered by the signature.

        Args:
            webhook_url: URL to send webhook to
            payload: Webhook payload dictionary
            secret: Optional webhook secret for HMAC signature
            job_id: Optional job ID for logging attempts
            db: Optional database manager for logging

        Returns:
            True if delivery succeeded, False otherwise
        """
        payload_json = json.dumps(payload, default=str)

        for attempt in range(1, self.max_retries + 1):
            try:
                start_time = datetime.now()

                # Prepare headers
                headers = {
                    "Content-Type": "application/json",
                    "User-Agent": "GoogleReviewsScraper-Webhook/1.0"
                }

                # Add signature if secret provided
                if secret:
                    signature = self.generate_signature(payload_json, secret)
                    headers["X-Webhook-Signature"] = f"sha256={signature}"
                    headers["X-Webhook-Timestamp"] = str(int(datetime.now().timestamp()))

                # Send webhook
                async with httpx.AsyncClient() as client:
                    response = await client.post(
                        webhook_url,
                        content=payload_json,
                        headers=headers,
                        timeout=self.timeout
                    )

                response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
                status_code = response.status_code
                delivered = status_code in [200, 201, 202, 204]
                error_msg = None if delivered else f"HTTP {status_code}: {response.text[:200]}"

            except httpx.TimeoutException:
                error_msg = f"Timeout after {self.timeout}s"
                log.warning(
                    f"Webhook delivery timeout to {webhook_url} "
                    f"(attempt {attempt}/{self.max_retries}): {error_msg}"
                )
                await self._log_attempt(db, job_id, attempt, False, error_message=error_msg)

            except Exception as e:
                error_msg = f"{type(e).__name__}: {str(e)}"
                log.error(
                    f"Webhook delivery error to {webhook_url} "
                    f"(attempt {attempt}/{self.max_retries}): {error_msg}"
                )
                await self._log_attempt(db, job_id, attempt, False, error_message=error_msg)

            else:
                # The HTTP exchange itself completed. Bookkeeping happens
                # outside the try so a database error cannot be mistaken
                # for a delivery failure and cause a duplicate delivery.
                if delivered:
                    log.info(
                        f"Webhook delivered successfully to {webhook_url} "
                        f"(attempt {attempt}, {response_time_ms:.0f}ms, status {status_code})"
                    )
                    await self._log_attempt(
                        db, job_id, attempt, True,
                        status_code=status_code,
                        response_time_ms=response_time_ms
                    )
                    return True

                # Non-success status
                log.warning(
                    f"Webhook delivery failed to {webhook_url} "
                    f"(attempt {attempt}/{self.max_retries}): {error_msg}"
                )
                await self._log_attempt(
                    db, job_id, attempt, False,
                    status_code=status_code,
                    error_message=error_msg,
                    response_time_ms=response_time_ms
                )

            # Retry with exponential backoff
            if attempt < self.max_retries:
                retry_delay = self.initial_retry_delay * (2 ** (attempt - 1))
                log.info(f"Retrying in {retry_delay:.1f}s...")
                await asyncio.sleep(retry_delay)

        # All retries failed
        log.error(
            f"Webhook delivery failed to {webhook_url} after {self.max_retries} attempts"
        )
        return False

    async def send_job_completed_webhook(
        self,
        webhook_url: str,
        job_id: UUID,
        status: str,
        reviews_count: Optional[int] = None,
        scrape_time: Optional[float] = None,
        error_message: Optional[str] = None,
        reviews_url: Optional[str] = None,
        secret: Optional[str] = None,
        db=None
    ) -> bool:
        """
        Send job completion webhook.

        The payload always carries "event" ("job.<status>"), "job_id",
        "status" and a UTC "timestamp". A "completed" job adds the review
        count, scrape time and reviews URL; a "failed" job adds the error
        message. Any other status sends only the common fields.

        Args:
            webhook_url: URL to send webhook to
            job_id: Job UUID
            status: Job status ('completed' or 'failed')
            reviews_count: Number of reviews scraped
            scrape_time: Time taken in seconds
            error_message: Error message if failed
            reviews_url: URL to retrieve reviews
            secret: Webhook secret
            db: Database manager for logging

        Returns:
            True if delivery succeeded
        """
        payload = {
            "event": f"job.{status}",
            "job_id": str(job_id),
            "status": status,
            "timestamp": datetime.utcnow().isoformat() + "Z"
        }

        if status == "completed":
            payload.update({
                "reviews_count": reviews_count,
                "scrape_time": scrape_time,
                "reviews_url": reviews_url
            })
        elif status == "failed":
            payload["error_message"] = error_message

        return await self.send_webhook(
            webhook_url=webhook_url,
            payload=payload,
            secret=secret,
            job_id=job_id,
            db=db
        )
|
||||
|
||||
|
||||
class WebhookDispatcher:
    """
    Background loop that delivers webhooks for jobs reported as pending.

    Each cycle asks the database for pending jobs, sends one webhook per
    job through a WebhookManager, then sleeps for the configured interval.
    """

    def __init__(self, db, interval_seconds: int = 30):
        """
        Initialize webhook dispatcher.

        Args:
            db: Database manager instance
            interval_seconds: How often to check for pending webhooks
        """
        self.db = db
        self.interval = interval_seconds
        self.webhook_manager = WebhookManager()
        self.running = False

    async def start(self):
        """
        Run the dispatch loop until stop() is called.

        An error in one cycle is logged and does not end the loop.
        """
        self.running = True
        log.info("Webhook dispatcher started")

        while self.running:
            try:
                await self.process_pending_webhooks()
            except Exception as e:
                log.error(f"Error in webhook dispatcher: {e}")

            await asyncio.sleep(self.interval)

    def stop(self):
        """
        Ask the dispatch loop to stop.

        The flag is checked once per cycle, so the loop exits after the
        current pass and sleep have finished.
        """
        self.running = False
        log.info("Webhook dispatcher stopped")

    async def process_pending_webhooks(self):
        """
        Process all pending webhooks.

        Fetches up to 100 jobs with pending webhooks and sends one webhook
        per job. A failure on one job is logged and the remaining jobs are
        still processed. The boolean delivery result is not inspected here.
        """
        import os

        # Get jobs with pending webhooks
        jobs = await self.db.get_pending_jobs_with_webhooks(limit=100)

        if not jobs:
            return

        log.info(f"Processing {len(jobs)} pending webhooks...")

        # Base URL for the reviews link; identical for every job in a batch.
        api_base_url = os.getenv('API_BASE_URL', 'http://localhost:8000')

        for job in jobs:
            # Bound before the try so the error handler never has to index
            # `job` again; the failing lookup may be 'job_id' itself, and a
            # second KeyError there would abort the rest of the batch.
            job_id = None
            try:
                job_id = job['job_id']
                webhook_url = job['webhook_url']
                webhook_secret = job.get('webhook_secret')
                status = job['status']

                reviews_url = f"{api_base_url}/jobs/{job_id}/reviews"

                # Send webhook
                await self.webhook_manager.send_job_completed_webhook(
                    webhook_url=webhook_url,
                    job_id=job_id,
                    status=status,
                    reviews_count=job.get('reviews_count'),
                    scrape_time=job.get('scrape_time'),
                    error_message=job.get('error_message'),
                    reviews_url=reviews_url if status == 'completed' else None,
                    secret=webhook_secret,
                    db=self.db
                )

            except Exception as e:
                log.error(f"Error processing webhook for job {job_id}: {e}")

        log.info(f"Processed {len(jobs)} webhooks")
|
||||
|
||||
|
||||
# Webhook verification helper for client implementations
|
||||
def verify_webhook_signature(payload: str, signature: str, secret: str) -> bool:
    """
    Verify webhook signature (for client-side verification).

    Args:
        payload: Raw JSON payload, as a string or as the raw request body
            bytes. Strings are encoded as UTF-8 before hashing.
        signature: Signature from X-Webhook-Signature header (format: "sha256=...")
        secret: Webhook secret

    Returns:
        True if signature is valid. A missing, malformed or non-matching
        signature returns False.

    Example:
        @app.post("/webhook")
        async def handle_webhook(request: Request):
            payload = await request.body()
            signature = request.headers.get("X-Webhook-Signature")

            if not verify_webhook_signature(payload, signature, WEBHOOK_SECRET):
                raise HTTPException(status_code=401, detail="Invalid signature")

            # Process webhook...
    """
    prefix = "sha256="
    if not signature or not signature.startswith(prefix):
        return False

    expected_signature = signature[len(prefix):]

    # Hash the body bytes as received; only str payloads need encoding.
    if isinstance(payload, (bytes, bytearray)):
        body = bytes(payload)
    else:
        body = payload.encode('utf-8')

    computed_signature = hmac.new(
        secret.encode('utf-8'),
        body,
        hashlib.sha256
    ).hexdigest()

    # Compare as bytes: hmac.compare_digest raises TypeError for str
    # arguments containing non-ASCII characters, and the header value is
    # attacker-controlled. Unencodable characters become '?', which can never
    # match a hex digest.
    return hmac.compare_digest(
        expected_signature.encode('utf-8', 'replace'),
        computed_signature.encode('ascii')
    )
|
||||
Reference in New Issue
Block a user