Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions

View File

@@ -1,388 +0,0 @@
#!/usr/bin/env python3
"""
Chrome Worker Pool Manager
Maintains a pool of idle Chrome instances for faster scraping.
Pre-warms browsers on startup to eliminate cold-start delays.
"""
import logging
import asyncio
import time
from typing import Optional, Dict, Any
from seleniumbase import Driver
from queue import Queue, Empty
import threading
log = logging.getLogger(__name__)
class ChromeWorker:
    """One Chrome browser instance plus the bookkeeping the pool needs.

    The constructor does not start a browser; call ``initialize()`` to launch
    it and ``shutdown()`` to quit it.
    """

    def __init__(self, worker_id: str, headless: bool = True):
        """
        Record the worker's identity and zero its usage counters.

        Args:
            worker_id: Label used in every log message of this worker
            headless: Passed to the driver when the browser is launched
        """
        self.worker_id = worker_id
        self.headless = headless
        self.driver: Optional[Driver] = None
        self.created_at: Optional[float] = None  # time.time() of a successful initialize()
        self.last_used: Optional[float] = None
        self.use_count = 0
        self.is_busy = False

    def initialize(self) -> bool:
        """
        Launch Chrome and apply timeouts, geolocation and window size.

        Returns:
            True when the browser is ready, False when any step failed. On
            failure a browser that was already started is quit again so that
            no Chrome process is left behind.
        """
        try:
            log.info(f"Worker {self.worker_id}: Initializing Chrome for unlimited review scraping...")

            # SeleniumBase Driver automatically includes UC mode anti-detection
            # Chrome arguments for Docker stability
            chrome_args = [
                "--disable-dev-shm-usage",  # Use /tmp instead of /dev/shm (critical for Docker)
                "--disable-gpu",  # Disable GPU acceleration
                "--no-sandbox",  # Required for Docker
                "--disable-software-rasterizer",
                "--disable-extensions",
                "--disable-background-networking",
                "--disable-default-apps",
                "--disable-sync",
                "--metrics-recording-only",
                "--mute-audio",
                "--no-first-run",
                "--safebrowsing-disable-auto-update",
            ]
            self.driver = Driver(
                uc=True,
                headless=self.headless,
                page_load_strategy="normal",
                chromium_arg=",".join(chrome_args)
            )

            # Set generous timeouts for large scraping jobs
            self.driver.set_page_load_timeout(120)  # 2 minutes for slow networks
            self.driver.set_script_timeout(60)  # 1 minute for complex extraction

            # Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
            # This prevents location-based variations in search results
            try:
                self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': 42.3601,
                    'longitude': -71.0589,
                    'accuracy': 100
                })
                log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
            except Exception as e:
                # Geolocation is best-effort; the worker is still usable without it
                log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")

            self.driver.maximize_window()
            self.created_at = time.time()
            self.last_used = time.time()
            log.info(f"Worker {self.worker_id}: Chrome ready for unlimited scraping")
            return True
        except Exception as e:
            log.error(f"Worker {self.worker_id}: Failed to initialize: {e}")
            # The browser may already be running when a later setup step
            # (timeouts, maximize) failed; quit it instead of leaking it.
            self.shutdown()
            return False

    def reset(self):
        """Clear cookies, localStorage and sessionStorage; failures are only logged."""
        try:
            if self.driver:
                self.driver.delete_all_cookies()
                self.driver.execute_script("window.localStorage.clear();")
                self.driver.execute_script("window.sessionStorage.clear();")
                log.debug(f"Worker {self.worker_id}: Reset complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Reset failed: {e}")

    def shutdown(self):
        """Quit the browser if one is attached; ``driver`` is always None afterwards."""
        try:
            if self.driver:
                self.driver.quit()
                log.info(f"Worker {self.worker_id}: Shutdown complete")
        except Exception as e:
            log.warning(f"Worker {self.worker_id}: Shutdown error: {e}")
        finally:
            self.driver = None

    def should_recycle(self, max_age_seconds: int = 3600, max_uses: int = 50):
        """
        Check if worker should be recycled.

        Args:
            max_age_seconds: Age (since initialize) above which the worker is recycled
            max_uses: Use count above which the worker is recycled

        Returns:
            True when there is no driver, or the age or use limit is exceeded
        """
        if not self.driver:
            return True

        # A worker that never recorded a creation time is treated as brand new
        age = time.time() - self.created_at if self.created_at else 0
        if age > max_age_seconds:
            log.info(f"Worker {self.worker_id}: Recycling due to age ({age:.0f}s)")
            return True

        if self.use_count > max_uses:
            log.info(f"Worker {self.worker_id}: Recycling due to use count ({self.use_count})")
            return True

        return False
class ChromeWorkerPool:
    """
    Pool of Chrome worker instances for faster scraping.

    Idle workers wait in a bounded queue; workers handed out by
    ``acquire_worker`` are tracked in ``active_workers`` until released.
    Workers are recycled after max age or max uses to prevent memory leaks.
    """

    def __init__(self, pool_size: int = 2, headless: bool = True):
        """
        Initialize worker pool (no browsers are started until ``start()``).

        Args:
            pool_size: Number of idle workers to maintain
            headless: Run Chrome in headless mode
        """
        self.pool_size = pool_size
        self.headless = headless
        self.workers: Queue[ChromeWorker] = Queue(maxsize=pool_size)
        self.active_workers: Dict[str, ChromeWorker] = {}
        self.worker_counter = 0
        self.lock = threading.Lock()  # guards worker_counter and active_workers
        self.running = False
        self.maintenance_thread = None
        # Set by stop() so the maintenance thread wakes up immediately
        # instead of finishing its sleep interval.
        self._stop_event = threading.Event()

    def start(self):
        """Pre-warm ``pool_size`` workers and start the maintenance thread."""
        log.info(f"Starting Chrome worker pool (size={self.pool_size}, headless={self.headless})")
        self.running = True
        self._stop_event.clear()

        # Pre-warm workers
        for _ in range(self.pool_size):
            self._create_worker()

        # Start maintenance thread
        self.maintenance_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.maintenance_thread.start()
        log.info(f"Chrome worker pool started with {self.workers.qsize()} ready workers")

    def stop(self):
        """Stop the maintenance thread and shut down idle and active workers."""
        log.info("Stopping Chrome worker pool...")
        self.running = False
        self._stop_event.set()
        if self.maintenance_thread:
            # May still time out if the thread is in the middle of starting a
            # browser; _create_worker discards that worker once it notices
            # the pool is no longer running.
            self.maintenance_thread.join(timeout=5)

        # Shutdown all idle workers
        while not self.workers.empty():
            try:
                worker = self.workers.get_nowait()
                worker.shutdown()
            except Empty:
                break

        # Shutdown active workers
        with self.lock:
            for worker in self.active_workers.values():
                worker.shutdown()
            self.active_workers.clear()
        log.info("Chrome worker pool stopped")

    def _create_worker(self) -> Optional["ChromeWorker"]:
        """
        Create a new worker and add it to the idle queue.

        Returns:
            The queued worker, or None when initialization failed, the pool
            was stopped in the meantime, or the queue was already full (the
            surplus worker is shut down in the last two cases).
        """
        # Local import: the module imports only Queue and Empty from queue
        from queue import Full

        with self.lock:
            self.worker_counter += 1
            worker_id = f"worker-{self.worker_counter}"

        # Browser start-up is slow, so it happens outside the lock
        worker = ChromeWorker(worker_id, headless=self.headless)
        if not worker.initialize():
            return None

        if not self.running:
            # stop() ran while Chrome was starting; queuing the worker now
            # would leave a browser nobody ever shuts down.
            worker.shutdown()
            return None

        try:
            self.workers.put_nowait(worker)
        except Full:
            worker.shutdown()
            return None
        return worker

    def acquire_worker(self, timeout: float = 30) -> Optional["ChromeWorker"]:
        """
        Acquire a worker from the pool.

        Args:
            timeout: Maximum time to wait for a worker

        Returns:
            ChromeWorker instance or None if timeout
        """
        try:
            worker = self.workers.get(timeout=timeout)
        except Empty:
            log.warning(f"Failed to acquire worker within {timeout}s")
            return None

        worker.is_busy = True
        worker.last_used = time.time()
        worker.use_count += 1
        with self.lock:
            self.active_workers[worker.worker_id] = worker
        log.debug(f"Acquired {worker.worker_id} (uses: {worker.use_count}, pool: {self.workers.qsize()}/{self.pool_size})")
        return worker

    def release_worker(self, worker: "ChromeWorker", recycle: bool = False):
        """
        Release a worker back to the pool.

        Args:
            worker: Worker to release
            recycle: Force worker recycling
        """
        with self.lock:
            self.active_workers.pop(worker.worker_id, None)
        worker.is_busy = False

        # Check if worker should be recycled
        if recycle or worker.should_recycle():
            log.info(f"Recycling {worker.worker_id}")
            worker.shutdown()
            # Create replacement worker in background
            threading.Thread(target=self._create_worker, daemon=True).start()
            return

        # Reset and return to pool
        worker.reset()
        try:
            current_size = self.workers.qsize()
            if current_size < self.pool_size:
                self.workers.put_nowait(worker)
                log.debug(f"Released {worker.worker_id} back to pool ({current_size + 1}/{self.pool_size})")
            else:
                # Pool already at capacity (the maintenance loop refilled it
                # while this worker was checked out), so drop the extra one
                log.debug(f"Pool at capacity ({current_size}/{self.pool_size}), recycling extra {worker.worker_id}")
                worker.shutdown()
        except Exception as e:
            # Includes queue.Full when another thread filled the last slot
            # between the size check and the put
            log.error(f"Failed to release {worker.worker_id}: {e}")
            worker.shutdown()

    def _maintenance_loop(self):
        """
        Background loop that tops the idle queue up to ``pool_size``.

        Only idle workers are counted, so replacements are started while
        workers are checked out; release_worker shuts down the surplus.
        """
        while self.running:
            try:
                needed = self.pool_size - self.workers.qsize()
                if needed > 0:
                    log.debug(f"Pool needs {needed} more workers")
                    for _ in range(needed):
                        if not self.running:
                            break
                        self._create_worker()
                pause = 10
            except Exception as e:
                log.error(f"Maintenance loop error: {e}")
                pause = 5
            # Interruptible sleep: returns as soon as stop() sets the event
            self._stop_event.wait(pause)

    def get_stats(self) -> Dict[str, Any]:
        """Return pool size, idle/active counts, total workers created and headless flag."""
        with self.lock:
            active_count = len(self.active_workers)
        return {
            "pool_size": self.pool_size,
            "idle_workers": self.workers.qsize(),
            "active_workers": active_count,
            "total_workers_created": self.worker_counter,
            "headless": self.headless
        }
# Global worker pool instances
# Both start as None; start_worker_pools() assigns them and
# stop_worker_pools() sets them back to None.
validation_pool: Optional[ChromeWorkerPool] = None
scraping_pool: Optional[ChromeWorkerPool] = None
def start_worker_pools(validation_size: int = 1, scraping_size: int = 2, headless: bool = True):
    """
    Start global worker pools.

    If pools are already running they are stopped first; otherwise rebinding
    the globals would orphan them together with their Chrome processes.

    Args:
        validation_size: Number of workers for validation checks
        scraping_size: Number of workers for scraping jobs
        headless: Run Chrome in headless mode
    """
    global validation_pool, scraping_pool

    if validation_pool or scraping_pool:
        log.warning("Global Chrome worker pools already running; restarting them")
        stop_worker_pools()

    log.info("Starting global Chrome worker pools...")

    validation_pool = ChromeWorkerPool(pool_size=validation_size, headless=headless)
    validation_pool.start()

    scraping_pool = ChromeWorkerPool(pool_size=scraping_size, headless=headless)
    scraping_pool.start()

    log.info("Global Chrome worker pools started")
def stop_worker_pools():
    """Stop whichever global pools exist and reset both globals to None."""
    global validation_pool, scraping_pool

    log.info("Stopping global Chrome worker pools...")

    # Validation pool first, then scraping pool; each global is cleared only
    # after its pool has been stopped.
    if validation_pool:
        validation_pool.stop()
    validation_pool = None

    if scraping_pool:
        scraping_pool.stop()
    scraping_pool = None

    log.info("Global Chrome worker pools stopped")
def get_validation_worker(timeout: float = 10) -> Optional[ChromeWorker]:
    """
    Acquire a worker from the global validation pool.

    Args:
        timeout: Maximum time to wait for a worker

    Returns:
        The acquired worker, or None when the pool is not started or no
        worker became available in time
    """
    if not validation_pool:
        return None
    return validation_pool.acquire_worker(timeout=timeout)
def release_validation_worker(worker: ChromeWorker, recycle: bool = False):
    """
    Hand a worker back to the global validation pool.

    Does nothing when the validation pool is not started.

    Args:
        worker: Worker to release
        recycle: Force worker recycling
    """
    if not validation_pool:
        return
    validation_pool.release_worker(worker, recycle=recycle)
def get_scraping_worker(timeout: float = 30) -> Optional[ChromeWorker]:
    """
    Acquire a worker from the global scraping pool.

    Args:
        timeout: Maximum time to wait for a worker

    Returns:
        The acquired worker, or None when the pool is not started or no
        worker became available in time
    """
    if not scraping_pool:
        return None
    return scraping_pool.acquire_worker(timeout=timeout)
def release_scraping_worker(worker: ChromeWorker, recycle: bool = False):
    """
    Hand a worker back to the global scraping pool.

    Does nothing when the scraping pool is not started.

    Args:
        worker: Worker to release
        recycle: Force worker recycling
    """
    if not scraping_pool:
        return
    scraping_pool.release_worker(worker, recycle=recycle)
def get_pool_stats() -> Dict[str, Any]:
    """
    Collect get_stats() from every global pool that is currently set.

    Returns:
        Dict with a 'validation' and/or 'scraping' entry; empty when no pool
        has been started
    """
    labelled_pools = (
        ('validation', validation_pool),
        ('scraping', scraping_pool),
    )
    return {label: pool.get_stats() for label, pool in labelled_pools if pool}

View File

@@ -1,82 +0,0 @@
"""
Configuration management for Google Maps Reviews Scraper.
"""
import logging
from pathlib import Path
from typing import Dict, Any
import yaml
# Configure logging at import time. The level name is read from the LOG_LEVEL
# environment variable (upper-cased before lookup); a name that is not an
# attribute of the logging module falls back to INFO.
import os
log_level = getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper(), logging.INFO)
logging.basicConfig(level=log_level, format="[%(asctime)s] %(levelname)s: %(message)s")
# Logger used by this module's functions
log = logging.getLogger("scraper")
# Default configuration path (relative, so it resolves against the current working directory)
DEFAULT_CONFIG_PATH = Path("config.yaml")
# Default configuration - load_config() overlays values from the config file on top of these
DEFAULT_CONFIG = {
    "url": "https://maps.app.goo.gl/6tkNMDjcj3SS6LJe9",
    "headless": True,
    "sort_by": "relevance",
    "stop_on_match": False,
    "overwrite_existing": False,
    "use_mongodb": True,
    "mongodb": {
        "uri": "mongodb://localhost:27017",
        "database": "reviews",
        "collection": "google_reviews"
    },
    "backup_to_json": True,
    "json_path": "google_reviews.json",
    "seen_ids_path": "google_reviews.ids",
    "convert_dates": True,
    "download_images": True,
    "image_dir": "review_images",
    "download_threads": 4,
    "store_local_paths": True,  # Option to control storing local image paths
    "replace_urls": False,  # Option to control URL replacement
    "custom_url_base": "https://mycustomurl.com",  # Base URL for replacement
    "custom_url_profiles": "/profiles/",  # Path for profile images
    "custom_url_reviews": "/reviews/",  # Path for review images
    "preserve_original_urls": True,  # Option to preserve original URLs
    "custom_params": {  # Custom parameters to add to each document
        "company": "Thaitours",  # Default example
        "source": "Google Maps"  # Default example
    }
}
def load_config(config_path: Path = DEFAULT_CONFIG_PATH) -> Dict[str, Any]:
    """
    Load configuration from a YAML file layered over DEFAULT_CONFIG.

    When the file exists, its values override the defaults; nested dicts are
    merged key by key. When it cannot be read or parsed, the error is logged
    and the defaults are returned. When it does not exist, a file holding the
    defaults is written on a best-effort basis.

    Args:
        config_path: Location of the YAML config file

    Returns:
        A new config dict; DEFAULT_CONFIG itself is never modified
    """
    import copy  # local import: not among the module-level imports

    def deep_update(target, overrides):
        """Recursively merge ``overrides`` into ``target`` in place."""
        for key, value in overrides.items():
            if isinstance(value, dict) and isinstance(target.get(key), dict):
                deep_update(target[key], value)
            else:
                target[key] = value

    # Deep copy: a shallow copy would share the nested "mongodb" and
    # "custom_params" dicts, and the merge below would then rewrite the
    # module-level defaults for every later caller.
    config = copy.deepcopy(DEFAULT_CONFIG)

    if config_path.exists():
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                user_config = yaml.safe_load(f)
            if user_config:
                # Merge configs, with nested dictionary support
                deep_update(config, user_config)
                log.info(f"Loaded configuration from {config_path}")
        except Exception as e:
            log.error(f"Error loading config from {config_path}: {e}")
            log.info("Using default configuration")
            # Discard anything a failed merge may have applied already
            config = copy.deepcopy(DEFAULT_CONFIG)
    else:
        log.info(f"Config file {config_path} not found, using default configuration")
        # Create a default config file for future use. This is a convenience
        # only, so an unwritable location must not prevent start-up.
        try:
            with open(config_path, 'w', encoding='utf-8') as f:
                yaml.dump(config, f, default_flow_style=False)
            log.info(f"Created default configuration file at {config_path}")
        except OSError as e:
            log.warning(f"Could not create default configuration file at {config_path}: {e}")

    return config

View File

@@ -1,666 +0,0 @@
"""
Crash Pattern Analyzer Module
Provides deep analysis of scraper crashes with pattern detection,
confidence scoring, and auto-fix parameter suggestions.
Builds on top of the basic classify_crash function in scraper_clean.py
with more sophisticated pattern matching and multi-signal analysis.
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import re
@dataclass
class CrashAnalysis:
    """
    Outcome of matching one crash report against the known crash patterns.

    Fields:
        pattern: Name of the matched pattern, e.g. "memory_exhaustion",
            "dom_bloat" or "rate_limited"
        confidence: Score between 0.0 and 1.0 built from the matched signals
        description: Human-readable summary of why the pattern matched
        suggested_fix: Recommended action to avoid the crash next time
        auto_fix_params: Parameters that can be applied automatically, or
            None when there are none for the pattern
    """

    pattern: str
    confidence: float
    description: str
    suggested_fix: str
    auto_fix_params: Optional[Dict[str, Any]]
# Thresholds for pattern detection
MEMORY_EXHAUSTION_THRESHOLD_MB = 1500  # 1.5GB in MB
MEMORY_GROWTH_RATE_THRESHOLD_MB_S = 10  # 10MB/s
DOM_BLOAT_THRESHOLD = 50000  # 50000 nodes
SCROLL_TIMEOUT_MIN_SCROLLS = 10  # Minimum scrolls before considering scroll_timeout
# Auto-fix parameters for each crash pattern, keyed by pattern name.
# apply_auto_fix() copies most entries into the scraper parameters as-is but
# gives two keys special treatment: "delay_multiplier" scales the existing
# 'scroll_delay', and "target_reviews" with the value "current - 10%" lowers
# 'max_reviews' by 10%.
AUTO_FIX_PARAMS = {
    "memory_exhaustion": {
        "max_reviews": 500,
        "restart_browser_after": 200
    },
    "dom_bloat": {
        "scroll_cleanup": True,
        "lazy_load": True
    },
    "rate_limited": {
        "delay_multiplier": 2.0,
        "use_different_proxy": True
    },
    "consent_loop": {
        "skip_consent_retries": True
    },
    "scroll_timeout": {
        "reduce_target": True,
        "target_reviews": "current - 10%"
    },
    "element_stale": {
        "retry_with_fresh_elements": True
    }
}
def _calculate_memory_growth_rate(metrics_history: List[Dict]) -> Optional[float]:
    """
    Estimate how fast memory grew over the sampled period, in MB/s.

    Only samples carrying both 'memory_mb' and 'timestamp_ms' are used, and
    the rate is taken between the first and the last of them.

    Args:
        metrics_history: List of metric samples with timestamp_ms and memory_mb

    Returns:
        Growth rate in MB/s (negative when memory shrank), or None when fewer
        than two usable samples exist or no time elapsed between them
    """
    if not metrics_history or len(metrics_history) < 2:
        return None

    usable = []
    for sample in metrics_history:
        if sample.get('memory_mb') is None or sample.get('timestamp_ms') is None:
            continue
        usable.append(sample)
    if len(usable) < 2:
        return None

    start, end = usable[0], usable[-1]
    elapsed_s = (end['timestamp_ms'] - start['timestamp_ms']) / 1000
    if elapsed_s <= 0:
        return None

    return (end['memory_mb'] - start['memory_mb']) / elapsed_s
def _get_max_memory(metrics_history: List[Dict]) -> Optional[int]:
    """
    Return the largest 'memory_mb' value found in the metrics history.

    Returns None when the history is empty or no sample has a reading.
    """
    if not metrics_history:
        return None

    readings = (sample.get('memory_mb') for sample in metrics_history)
    return max((value for value in readings if value is not None), default=None)
def _get_max_dom_nodes(metrics_history: List[Dict]) -> Optional[int]:
    """
    Return the largest 'dom_nodes' value found in the metrics history.

    Returns None when the history is empty or no sample has a node count.
    """
    if not metrics_history:
        return None

    counts = []
    for sample in metrics_history:
        node_count = sample.get('dom_nodes')
        if node_count is not None:
            counts.append(node_count)

    if not counts:
        return None
    return max(counts)
def _check_memory_exhaustion(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for memory exhaustion pattern.

    Signals and weights:
        - peak memory at/above the threshold (+0.5) or within 80% of it (+0.3)
        - memory growth rate at/above the threshold (+0.3)
        - a memory-related keyword in the error message (+0.2)
        - a memory warning in the logs (+0.1)

    Args:
        error_message: Exception text; None is treated as empty
        metrics_history: Metric samples with memory_mb and timestamp_ms
        logs: Log entries with a 'message' field; None (for the list or for a
            message) is treated as empty

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Check for high memory usage
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None:
        if max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB:
            confidence += 0.5
            signals.append(f"Memory reached {max_memory}MB (threshold: {MEMORY_EXHAUSTION_THRESHOLD_MB}MB)")
        elif max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB * 0.8:
            confidence += 0.3
            signals.append(f"Memory at {max_memory}MB approaching threshold")

    # Check for rapid memory growth
    growth_rate = _calculate_memory_growth_rate(metrics_history)
    if growth_rate is not None and growth_rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S:
        confidence += 0.3
        signals.append(f"Memory growing at {growth_rate:.1f}MB/s (threshold: {MEMORY_GROWTH_RATE_THRESHOLD_MB_S}MB/s)")

    # Check error message for memory-related keywords (first match only)
    error_lower = (error_message or '').lower()
    memory_keywords = ['memory', 'heap', 'out of memory', 'oom', 'aw, snap', 'status_access_violation']
    matched = next((kw for kw in memory_keywords if kw in error_lower), None)
    if matched is not None:
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Check logs for memory warnings (counted once however many entries match)
    for log_entry in logs or []:
        msg = (log_entry.get('message') or '').lower()
        if 'memory' in msg and any(word in msg for word in ('high', 'warning', 'exceeded')):
            confidence += 0.1
            signals.append("Memory warning found in logs")
            break

    description = "; ".join(signals) if signals else "No memory exhaustion signals detected"
    return min(confidence, 1.0), description
def _check_dom_bloat(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for DOM bloat pattern.

    Signals and weights:
        - peak DOM node count at/above the threshold (+0.6) or within 80% (+0.3)
        - a DOM-related keyword in the error message (+0.2)
        - peak memory of at least 800MB (+0.1)
        - a DOM size/cleanup message in the logs (+0.1)

    Args:
        error_message: Exception text; None is treated as empty
        metrics_history: Metric samples with dom_nodes and memory_mb
        logs: Log entries with a 'message' field; None (for the list or for a
            message) is treated as empty

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Check for high DOM node count
    max_nodes = _get_max_dom_nodes(metrics_history)
    if max_nodes is not None:
        if max_nodes >= DOM_BLOAT_THRESHOLD:
            confidence += 0.6
            signals.append(f"DOM nodes reached {max_nodes} (threshold: {DOM_BLOAT_THRESHOLD})")
        elif max_nodes >= DOM_BLOAT_THRESHOLD * 0.8:
            confidence += 0.3
            signals.append(f"DOM nodes at {max_nodes} approaching threshold")

    # Check error message for DOM-related keywords (first match only)
    error_lower = (error_message or '').lower()
    dom_keywords = ['dom', 'element', 'node', 'render', 'paint', 'layout']
    matched = next((kw for kw in dom_keywords if kw in error_lower), None)
    if matched is not None:
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Check if memory is high too (DOM bloat often causes memory issues)
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None and max_memory >= 800:  # 800MB
        confidence += 0.1
        signals.append(f"Memory also elevated ({max_memory}MB)")

    # Check logs for DOM-related messages (counted once)
    for log_entry in logs or []:
        msg = (log_entry.get('message') or '').lower()
        if 'dom' in msg and any(word in msg for word in ('large', 'cleanup', 'remove')):
            confidence += 0.1
            signals.append("DOM warning found in logs")
            break

    description = "; ".join(signals) if signals else "No DOM bloat signals detected"
    return min(confidence, 1.0), description
def _check_rate_limited(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for rate limiting pattern.

    Signals and weights:
        - "429" anywhere in the error message (+0.6)
        - a rate-limit keyword in the error message (+0.4, first match only)
        - each log entry with network status 429 (+0.2)
        - each log entry whose message mentions rate/blocked/unusual traffic (+0.1)

    Args:
        error_message: Exception text; None is treated as empty
        metrics_history: Not used by this check; accepted so that all checks
            share one signature
        logs: Log entries with optional 'message' and 'network' fields; None
            (for the list or for either field) is treated as empty

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Check error message for rate limit indicators
    error_text = error_message or ''
    error_lower = error_text.lower()
    if '429' in error_text:
        confidence += 0.6
        signals.append("HTTP 429 status code in error")

    rate_keywords = ['rate limit', 'too many requests', 'unusual traffic', 'captcha', 'blocked']
    matched = next((kw for kw in rate_keywords if kw in error_lower), None)
    if matched is not None:
        confidence += 0.4
        signals.append(f"Error contains '{matched}'")

    # Check logs for rate limiting signals; one entry can count twice
    # (once for its status code and once for its message)
    rate_log_count = 0
    for log_entry in logs or []:
        msg = (log_entry.get('message') or '').lower()
        network = log_entry.get('network') or {}
        if network.get('status') == 429:
            rate_log_count += 1
            confidence += 0.2
        if 'unusual traffic' in msg or 'rate' in msg or 'blocked' in msg:
            rate_log_count += 1
            confidence += 0.1

    if rate_log_count > 0:
        signals.append(f"Found {rate_log_count} rate-limiting indicators in logs")

    description = "; ".join(signals) if signals else "No rate limiting signals detected"
    return min(confidence, 1.0), description
def _check_consent_loop(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for consent popup loop pattern.

    Signals and weights:
        - "consent" in the error message (+0.3)
        - number of log messages mentioning consent: 3 or more (+0.5),
          exactly 2 (+0.3), exactly 1 (+0.1)
        - "timeout" in the error message together with any consent log (+0.2)

    Args:
        error_message: Exception text; None is treated as empty
        metrics_history: Not used by this check; accepted so that all checks
            share one signature
        logs: Log entries with a 'message' field; None (for the list or for a
            message) is treated as empty

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Check error message for consent keywords
    error_lower = (error_message or '').lower()
    if 'consent' in error_lower:
        confidence += 0.3
        signals.append("Error mentions consent")

    # Count consent-related log entries
    consent_count = sum(
        1 for log_entry in logs or []
        if 'consent' in (log_entry.get('message') or '').lower()
    )

    # Multiple consent messages indicate a loop
    if consent_count >= 3:
        confidence += 0.5
        signals.append(f"Consent popup appeared {consent_count} times in logs")
    elif consent_count >= 2:
        confidence += 0.3
        signals.append(f"Consent popup appeared {consent_count} times")
    elif consent_count == 1:
        confidence += 0.1
        signals.append("Single consent popup detected")

    # Check for timeout after consent handling
    if 'timeout' in error_lower and consent_count > 0:
        confidence += 0.2
        signals.append("Timeout occurred with consent activity")

    description = "; ".join(signals) if signals else "No consent loop signals detected"
    return min(confidence, 1.0), description
def _check_scroll_timeout(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict],
    state: Optional[Dict] = None
) -> tuple[float, str]:
    """
    Check for scroll timeout pattern (no new reviews after many scrolls).

    Signals and weights:
        - "timeout" in the error message (+0.2)
        - "recovery attempt" log messages: at least
          SCROLL_TIMEOUT_MIN_SCROLLS (+0.5) or at least 5 (+0.3)
        - any "no new" / "stuck" log message (+0.2)
        - an unchanged review count across the last five samples (+0.2)

    Args:
        error_message: Exception text; None is treated as empty
        metrics_history: Metric samples with reviews_count
        logs: Log entries with a 'message' field; None (for the list or for a
            message) is treated as empty
        state: Scraper state; accepted for interface compatibility, no signal
            is currently derived from it

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Check error for timeout indicators
    error_lower = (error_message or '').lower()
    if 'timeout' in error_lower:
        confidence += 0.2
        signals.append("Timeout in error message")

    # Count recovery attempts in logs (indicate stuck scrolling)
    recovery_count = 0
    no_new_count = 0
    for log_entry in logs or []:
        msg = (log_entry.get('message') or '').lower()
        if 'recovery attempt' in msg:
            recovery_count += 1
        if 'no new' in msg or 'stuck' in msg:
            no_new_count += 1

    if recovery_count >= SCROLL_TIMEOUT_MIN_SCROLLS:
        confidence += 0.5
        signals.append(f"Made {recovery_count} recovery attempts")
    elif recovery_count >= 5:
        confidence += 0.3
        signals.append(f"Made {recovery_count} recovery attempts")

    if no_new_count > 0:
        confidence += 0.2
        signals.append(f"Found {no_new_count} 'no new reviews' log entries")

    # Check if the review count plateaued over the most recent samples
    if metrics_history and len(metrics_history) >= 5:
        recent_counts = [
            m.get('reviews_count') for m in metrics_history[-5:] if m.get('reviews_count')
        ]
        # A plateau needs at least two readings: a single value is trivially
        # "all equal" and would otherwise be reported as stuck.
        if len(recent_counts) >= 2 and len(set(recent_counts)) == 1:
            confidence += 0.2
            signals.append(f"Review count stuck at {recent_counts[0]}")

    description = "; ".join(signals) if signals else "No scroll timeout signals detected"
    return min(confidence, 1.0), description
def _check_element_stale(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for stale element reference pattern.

    Signals and weights:
        - a stale-element keyword in the error message (+0.6, first match only)
        - one or more log messages containing such a keyword (+0.2)
        - DOM node counts with a standard deviation above 1000 across at
          least three samples (+0.2)

    Args:
        error_message: Exception text; None is treated as empty
        metrics_history: Metric samples with dom_nodes
        logs: Log entries with a 'message' field; None (for the list or for a
            message) is treated as empty

    Returns:
        Tuple of (confidence capped at 1.0, description of matched signals)
    """
    confidence = 0.0
    signals = []

    # Check error message for stale element indicators
    error_lower = (error_message or '').lower()
    stale_keywords = [
        'stale element', 'staleelement', 'stale_element',
        'element is not attached', 'element reference',
        'no such element', 'element not found',
        'element is no longer valid'
    ]
    matched = next((kw for kw in stale_keywords if kw in error_lower), None)
    if matched is not None:
        confidence += 0.6
        signals.append(f"Error contains '{matched}'")

    # Check logs for stale element patterns (each entry counts at most once)
    stale_log_count = 0
    for log_entry in logs or []:
        msg = (log_entry.get('message') or '').lower()
        if any(kw in msg for kw in stale_keywords):
            stale_log_count += 1

    if stale_log_count > 0:
        confidence += 0.2
        signals.append(f"Found {stale_log_count} stale element references in logs")

    # Check if DOM was changing rapidly (indicates dynamic page)
    if metrics_history and len(metrics_history) >= 3:
        dom_counts = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes')]
        if len(dom_counts) >= 3:
            # Population standard deviation of the node counts
            avg = sum(dom_counts) / len(dom_counts)
            variance = sum((x - avg) ** 2 for x in dom_counts) / len(dom_counts)
            std_dev = variance ** 0.5
            # High variance indicates rapidly changing DOM
            if std_dev > 1000:
                confidence += 0.2
                signals.append(f"High DOM variability (std dev: {std_dev:.0f})")

    description = "; ".join(signals) if signals else "No stale element signals detected"
    return min(confidence, 1.0), description
def analyze_crash(crash_report: Dict) -> CrashAnalysis:
    """
    Analyze a crash report to determine the most likely crash pattern.

    Runs every pattern check and keeps the one with the highest confidence
    (the earliest check wins a tie). When that confidence is below 0.2 the
    basic crash type is mapped to a pattern with confidence 0.3, or the
    result is 'unknown' with confidence 0.0.

    Args:
        crash_report: Dictionary containing:
            - error_message: str - The exception message
            - metrics_history: List[Dict] - Sampled metrics with timestamp_ms, memory_mb, dom_nodes
            - logs_before_crash: List[Dict] - Recent log entries before the crash
            - state: Optional[Dict] - Scraper state (reviews_extracted, scroll_count, etc.)
            - crash_type: Optional[str] - Basic crash classification from classify_crash()
          Missing keys and keys holding None are both replaced by empty
          defaults ('unknown' for crash_type).

    Returns:
        CrashAnalysis with the highest-confidence pattern match
    """
    # `or` rather than a .get() default: a key that is present but None must
    # not reach the checks, which call .lower() and iterate these values.
    error_message = crash_report.get('error_message') or ''
    metrics_history = crash_report.get('metrics_history') or []
    logs = crash_report.get('logs_before_crash') or []
    state = crash_report.get('state') or {}
    basic_type = crash_report.get('crash_type') or 'unknown'

    # Run all pattern checks; insertion order decides ties in max() below
    pattern_results = {
        'memory_exhaustion': _check_memory_exhaustion(error_message, metrics_history, logs),
        'dom_bloat': _check_dom_bloat(error_message, metrics_history, logs),
        'rate_limited': _check_rate_limited(error_message, metrics_history, logs),
        'consent_loop': _check_consent_loop(error_message, metrics_history, logs),
        'scroll_timeout': _check_scroll_timeout(error_message, metrics_history, logs, state),
        'element_stale': _check_element_stale(error_message, metrics_history, logs),
    }

    # Find the pattern with highest confidence
    pattern_name, (confidence, description) = max(
        pattern_results.items(), key=lambda item: item[1][0]
    )

    # If confidence is too low, fall back to basic classification
    if confidence < 0.2:
        # Map basic crash types to our patterns
        basic_to_pattern = {
            'memory_exhaustion': 'memory_exhaustion',
            'tab_crash': 'memory_exhaustion',  # Tab crashes often from memory
            'timeout': 'scroll_timeout',
            'element_not_found': 'element_stale',
            'rate_limited': 'rate_limited',
            'network_failure': 'rate_limited',  # Could be blocking
        }
        if basic_type in basic_to_pattern:
            pattern_name = basic_to_pattern[basic_type]
            confidence = 0.3  # Low confidence fallback
            description = f"Inferred from basic crash type '{basic_type}'"
        else:
            pattern_name = 'unknown'
            confidence = 0.0
            description = f"Unable to determine crash pattern (basic type: {basic_type})"

    # Generate suggested fix based on pattern
    suggested_fixes = {
        'memory_exhaustion': (
            "Reduce batch size and restart browser more frequently. "
            "Consider limiting max_reviews to 500 and restarting browser after every 200 reviews."
        ),
        'dom_bloat': (
            "Enable DOM cleanup during scrolling. "
            "Hide processed review cards and remove separator elements to keep DOM light."
        ),
        'rate_limited': (
            "Increase delays between requests and consider rotating proxies. "
            "Double the delay multiplier and switch to a different proxy if available."
        ),
        'consent_loop': (
            "Skip consent handling after initial attempt to avoid infinite loops. "
            "The consent popup may be appearing due to cookie clearing or navigation issues."
        ),
        'scroll_timeout': (
            "The page may have stopped loading new reviews. "
            "Try reducing the target review count by 10% and accepting partial results."
        ),
        'element_stale': (
            "Page elements are being removed/replaced during scraping. "
            "Retry operations with freshly-located elements and add defensive waits."
        ),
        'unknown': (
            "Unable to determine specific crash cause. "
            "Review logs and consider restarting with fresh browser session."
        )
    }

    return CrashAnalysis(
        pattern=pattern_name,
        confidence=confidence,
        description=description,
        suggested_fix=suggested_fixes.get(pattern_name, suggested_fixes['unknown']),
        auto_fix_params=AUTO_FIX_PARAMS.get(pattern_name)
    )
def get_auto_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """
    Look up the auto-fix parameters registered for a crash pattern.

    Args:
        pattern: The crash pattern name

    Returns:
        The AUTO_FIX_PARAMS entry for the pattern (the stored dict itself,
        not a copy), or None if pattern not recognized
    """
    if pattern in AUTO_FIX_PARAMS:
        return AUTO_FIX_PARAMS[pattern]
    return None
def apply_auto_fix(pattern: str, current_params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply auto-fix parameters to current scraper parameters.

    Most fix entries are copied into the result unchanged. Two are
    interpreted instead:
        - 'target_reviews' == 'current - 10%' lowers 'max_reviews' by 10%
          (starting from 1000 when no limit is set)
        - 'delay_multiplier' multiplies 'scroll_delay' (starting from 1.0
          when no delay is set)

    Args:
        pattern: The crash pattern name; unknown names apply no fixes
        current_params: Current scraper parameters (left unmodified)

    Returns:
        Updated parameters dictionary with fixes applied (a shallow copy)
    """
    fix_params = AUTO_FIX_PARAMS.get(pattern, {})
    updated = current_params.copy()

    for key, value in fix_params.items():
        if key == 'target_reviews' and value == 'current - 10%':
            # Special case: reduce target by 10%. An explicit None (no limit
            # configured) is treated like a missing key; the arithmetic
            # below would otherwise raise TypeError.
            current_target = updated.get('max_reviews')
            if current_target is None:
                current_target = 1000
            updated['max_reviews'] = int(current_target * 0.9)
        elif key == 'delay_multiplier':
            # Multiply existing delay, with the same None handling as above
            current_delay = updated.get('scroll_delay')
            if current_delay is None:
                current_delay = 1.0
            updated['scroll_delay'] = current_delay * value
        else:
            updated[key] = value

    return updated
def summarize_crash_patterns(crash_reports: List[Dict]) -> Dict[str, Any]:
    """
    Aggregate the analysis of several crash reports into one summary.

    Each report is run through analyze_crash(); the resulting patterns are
    counted, their confidences averaged, and auto-fix recommendations are
    produced for patterns seen at least twice.

    Args:
        crash_reports: List of crash report dictionaries

    Returns:
        Dictionary with 'total_crashes', 'patterns' (count, percentage and
        average confidence per pattern), 'most_common' and 'recommendations'
    """
    if not crash_reports:
        return {
            'total_crashes': 0,
            'patterns': {},
            'most_common': None,
            'recommendations': []
        }

    total = len(crash_reports)

    # Confidence values grouped by pattern, in first-seen order; the number
    # of values collected for a pattern is its occurrence count.
    confidences: Dict[str, List[float]] = {}
    for report in crash_reports:
        analysis = analyze_crash(report)
        confidences.setdefault(analysis.pattern, []).append(analysis.confidence)

    patterns_summary = {
        pattern: {
            'count': len(values),
            'percentage': len(values) / total * 100,
            'avg_confidence': sum(values) / len(values)
        }
        for pattern, values in confidences.items()
    }

    # Ties resolve to the pattern that was encountered first.
    most_common = (
        max(patterns_summary, key=lambda name: patterns_summary[name]['count'])
        if patterns_summary else None
    )

    # Most frequent patterns first; only recurring ones get a recommendation.
    ranked = sorted(patterns_summary.items(), key=lambda item: item[1]['count'], reverse=True)
    recommendations = []
    for pattern, stats in ranked:
        if stats['count'] < 2:
            continue
        fix_params = AUTO_FIX_PARAMS.get(pattern)
        if not fix_params:
            continue
        recommendations.append({
            'pattern': pattern,
            'occurrences': stats['count'],
            'auto_fix_params': fix_params
        })

    return {
        'total_crashes': total,
        'patterns': patterns_summary,
        'most_common': most_common,
        'recommendations': recommendations
    }

View File

@@ -1,882 +0,0 @@
#!/usr/bin/env python3
"""
PostgreSQL database module for production microservice.
Stores job metadata and reviews as JSONB.
"""
import asyncpg
import json
from datetime import datetime
from typing import Optional, List, Dict, Any
from uuid import UUID, uuid4
from enum import Enum
import logging
log = logging.getLogger(__name__)
class JobStatus(str, Enum):
    """
    Status values a scraping job can be stored with.

    Members are also plain strings, so they compare equal to their values.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    # Job crashed but has partial reviews saved
    PARTIAL = "partial"
class DatabaseManager:
"""PostgreSQL database manager with connection pooling"""
def __init__(self, database_url: str):
"""
Initialize database manager.
Args:
database_url: PostgreSQL connection URL
Format: postgresql://user:password@host:port/database
"""
self.database_url = database_url
self.pool: Optional[asyncpg.Pool] = None
async def connect(self):
    """Create the asyncpg connection pool and store it on self.pool."""
    log.info("Connecting to PostgreSQL database...")

    # Pool sizing and the per-command timeout passed to asyncpg.
    pool_options = {
        "min_size": 5,
        "max_size": 20,
        "command_timeout": 60,
    }
    self.pool = await asyncpg.create_pool(self.database_url, **pool_options)

    log.info("Database connection pool created")
async def disconnect(self):
    """Close the connection pool; does nothing when no pool exists."""
    if not self.pool:
        return

    await self.pool.close()
    log.info("Database connection pool closed")
async def initialize_schema(self):
    """
    Create or upgrade the database schema.

    Creates the jobs, canary_results, webhook_attempts and crash_reports
    tables with their indexes, and adds later-introduced columns to
    existing jobs tables. All statements are idempotent
    (IF NOT EXISTS / DROP ... IF EXISTS).

    The statements run inside a single transaction so that a failure
    part-way through (for example between dropping and re-adding the
    valid_status constraint) rolls everything back instead of leaving a
    half-migrated schema.
    """
    statements = (
        # Create jobs table
        """
        CREATE TABLE IF NOT EXISTS jobs (
            job_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            status VARCHAR(20) NOT NULL DEFAULT 'pending',
            url TEXT NOT NULL,
            webhook_url TEXT,
            webhook_secret TEXT,
            created_at TIMESTAMP NOT NULL DEFAULT NOW(),
            started_at TIMESTAMP,
            completed_at TIMESTAMP,
            updated_at TIMESTAMP,
            reviews_count INTEGER,
            total_reviews INTEGER,
            reviews_data JSONB,
            scrape_time REAL,
            error_message TEXT,
            metadata JSONB,
            scrape_logs JSONB,
            CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
        );
        """,
        # Add scrape_logs column if it doesn't exist (for existing databases)
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
        """,
        # Add updated_at column if it doesn't exist (for incremental progress tracking)
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
        """,
        # Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
        """,
        # Update constraint to include 'partial' status (for existing databases)
        """
        ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
        """,
        """
        ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
        """,
        # Create indexes
        """
        CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at DESC);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_jobs_webhook ON jobs(webhook_url) WHERE webhook_url IS NOT NULL;
        """,
        # Create canary results table
        """
        CREATE TABLE IF NOT EXISTS canary_results (
            id SERIAL PRIMARY KEY,
            timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
            success BOOLEAN NOT NULL,
            reviews_count INTEGER,
            scrape_time REAL,
            error_message TEXT,
            metadata JSONB
        );
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_canary_timestamp ON canary_results(timestamp DESC);
        """,
        # Create webhook attempts table (for retry tracking)
        """
        CREATE TABLE IF NOT EXISTS webhook_attempts (
            id SERIAL PRIMARY KEY,
            job_id UUID NOT NULL REFERENCES jobs(job_id) ON DELETE CASCADE,
            attempt_number INTEGER NOT NULL,
            timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
            success BOOLEAN NOT NULL,
            status_code INTEGER,
            error_message TEXT,
            response_time_ms REAL
        );
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
        """,
        # Add session_fingerprint and metrics_history columns to jobs table
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS session_fingerprint JSONB;
        """,
        """
        ALTER TABLE jobs ADD COLUMN IF NOT EXISTS metrics_history JSONB;
        """,
        # Create crash_reports table
        """
        CREATE TABLE IF NOT EXISTS crash_reports (
            crash_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            job_id UUID REFERENCES jobs(job_id) ON DELETE CASCADE,
            created_at TIMESTAMP NOT NULL DEFAULT NOW(),
            crash_type VARCHAR(50) NOT NULL,
            error_message TEXT,
            state JSONB NOT NULL,
            metrics_history JSONB,
            logs_before_crash JSONB,
            analysis JSONB,
            screenshot_url TEXT,
            dom_snapshot_id UUID
        );
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_crash_reports_job ON crash_reports(job_id);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_crash_reports_type ON crash_reports(crash_type);
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_crash_reports_created ON crash_reports(created_at DESC);
        """,
    )

    async with self.pool.acquire() as conn:
        # PostgreSQL DDL is transactional: either every statement applies
        # or none does.
        async with conn.transaction():
            for statement in statements:
                await conn.execute(statement)

    log.info("Database schema initialized")
# ==================== Job Operations ====================
async def create_job(
    self,
    url: str,
    webhook_url: Optional[str] = None,
    webhook_secret: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None
) -> UUID:
    """
    Insert a new scraping job row and return its generated ID.

    Args:
        url: Google Maps URL to scrape
        webhook_url: Optional webhook URL for notifications
        webhook_secret: Optional secret for webhook signature
        metadata: Optional additional metadata (stored as JSON; an empty
            or missing value is stored as NULL)

    Returns:
        UUID of created job
    """
    metadata_json = json.dumps(metadata) if metadata else None

    async with self.pool.acquire() as conn:
        job_id = await conn.fetchval(
            """
            INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
            VALUES ($1, $2, $3, $4)
            RETURNING job_id
            """,
            url,
            webhook_url,
            webhook_secret,
            metadata_json,
        )

        # Only the first 80 characters of the URL are logged.
        log.info(f"Created job {job_id} for URL: {url[:80]}...")
        return job_id
async def get_job(self, job_id: UUID) -> Optional[Dict[str, Any]]:
"""
Get job by ID.
Args:
job_id: Job UUID
Returns:
Job dictionary or None if not found
"""
async with self.pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT
job_id,
status,
url,
webhook_url,
created_at,
started_at,
completed_at,
updated_at,
reviews_count,
total_reviews,
reviews_data,
scrape_time,
error_message,
metadata,
scrape_logs,
review_topics
FROM jobs
WHERE job_id = $1
""", job_id)
if not row:
return None
return dict(row)
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
"""
Get reviews for a specific job.
Args:
job_id: Job UUID
include_partial: If True, also return reviews for running and partial jobs
Returns:
List of reviews or None if not found/no reviews
"""
async with self.pool.acquire() as conn:
if include_partial:
# Return reviews for completed, running, or partial jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
""", job_id)
else:
# Only return reviews for completed jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status = 'completed'
""", job_id)
if not reviews_data:
return None
# asyncpg returns JSONB as string, need to parse it
if isinstance(reviews_data, str):
return json.loads(reviews_data)
return reviews_data
async def update_job_status(
    self,
    job_id: UUID,
    status: JobStatus,
    **kwargs
):
    """
    Update job status and optional fields.

    Switching to RUNNING stamps started_at, and switching to COMPLETED,
    FAILED or CANCELLED stamps completed_at, unless the caller supplies
    those values explicitly.

    Args:
        job_id: Job UUID
        status: New status
        **kwargs: Additional columns to update (started_at, completed_at,
            error_message, etc.). Keys are used as column names and must be
            plain ASCII identifiers. Non-None values for JSONB columns may
            be passed as Python objects or as pre-serialized JSON strings.

    Raises:
        ValueError: If a keyword is not a plain ASCII identifier
    """
    # JSONB columns of the jobs table; their values need JSON encoding and
    # an explicit cast.
    jsonb_columns = {
        'reviews_data',
        'metadata',
        'scrape_logs',
        'review_topics',
        'session_fingerprint',
        'metrics_history',
    }

    fields = dict(kwargs)
    # NOTE(review): datetime.now() is naive local time while the schema
    # defaults use NOW() on the server — confirm both agree on the timezone.
    if status == JobStatus.RUNNING:
        fields.setdefault('started_at', datetime.now())
    elif status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
        fields.setdefault('completed_at', datetime.now())

    # Build dynamic UPDATE query; $1 and $2 are reserved for id and status.
    set_clauses = ["status = $2"]
    params = [job_id, status.value]

    for key, value in fields.items():
        # Column names cannot be sent as bind parameters, so they are
        # interpolated into the SQL text; refuse anything that is not a
        # simple identifier to rule out SQL injection through keyword names.
        if not (key.isascii() and key.isidentifier()):
            raise ValueError(f"Invalid column name: {key!r}")

        placeholder = f"${len(params) + 1}"
        if key in jsonb_columns and value is not None:
            set_clauses.append(f"{key} = {placeholder}::jsonb")
            params.append(value if isinstance(value, str) else json.dumps(value))
        else:
            set_clauses.append(f"{key} = {placeholder}")
            params.append(value)

    query = f"""
        UPDATE jobs
        SET {', '.join(set_clauses)}
        WHERE job_id = $1
    """

    async with self.pool.acquire() as conn:
        await conn.execute(query, *params)
async def save_job_result(
    self,
    job_id: UUID,
    reviews: List[Dict[str, Any]],
    scrape_time: float,
    total_reviews: Optional[int] = None,
    scrape_logs: Optional[List[Dict[str, Any]]] = None,
    review_topics: Optional[List[Dict[str, Any]]] = None
):
    """
    Mark a job as completed and store its results.

    When called with no reviews for a job that already has a positive
    reviews_count (written by incremental saves during scraping), the
    stored review data is left untouched and only the completion fields
    are updated.

    Args:
        job_id: Job UUID
        reviews: List of review dictionaries
        scrape_time: Time taken to scrape in seconds
        total_reviews: Total reviews available (from page counter)
        scrape_logs: List of log entries from the scraper
        review_topics: List of topic filter dictionaries with topic and count
    """
    # Empty or missing logs/topics are stored as NULL.
    logs_json = json.dumps(scrape_logs) if scrape_logs else None
    topics_json = json.dumps(review_topics) if review_topics else None

    async with self.pool.acquire() as conn:
        if not reviews:
            # An empty list may mean the reviews were already flushed
            # incrementally; check before overwriting anything.
            existing = await conn.fetchval(
                "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
            )
            if existing and existing > 0:
                # Keep reviews_data and reviews_count from the incremental saves.
                await conn.execute(
                    """
                    UPDATE jobs
                    SET
                        status = 'completed',
                        completed_at = NOW(),
                        total_reviews = COALESCE($2, total_reviews),
                        scrape_time = $3,
                        scrape_logs = $4::jsonb,
                        review_topics = $5::jsonb
                    WHERE job_id = $1
                    """,
                    job_id,
                    total_reviews,
                    scrape_time,
                    logs_json,
                    topics_json,
                )
                log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
                return

        await conn.execute(
            """
            UPDATE jobs
            SET
                status = 'completed',
                completed_at = NOW(),
                reviews_count = $2,
                total_reviews = $3,
                reviews_data = $4::jsonb,
                scrape_time = $5,
                scrape_logs = $6::jsonb,
                review_topics = $7::jsonb
            WHERE job_id = $1
            """,
            job_id,
            len(reviews),
            total_reviews,
            json.dumps(reviews),
            scrape_time,
            logs_json,
            topics_json,
        )
        log.info(f"Saved {len(reviews)} reviews for job {job_id}")
async def save_reviews_incremental(
    self,
    job_id: UUID,
    reviews: List[Dict[str, Any]],
    total_reviews: Optional[int] = None
):
    """
    Persist the reviews collected so far while a job is still running.

    Called on each flush to preserve progress in case of crash. The update
    only matches jobs whose status is 'running'.

    Args:
        job_id: Job UUID
        reviews: ALL reviews collected so far (not just new ones)
        total_reviews: Total reviews available (from page counter); the
            stored value is kept when this is None
    """
    collected = len(reviews)
    payload = json.dumps(reviews)

    async with self.pool.acquire() as conn:
        await conn.execute(
            """
            UPDATE jobs
            SET
                reviews_count = $2,
                total_reviews = COALESCE($3, total_reviews),
                reviews_data = $4::jsonb,
                updated_at = NOW()
            WHERE job_id = $1 AND status = 'running'
            """,
            job_id,
            collected,
            total_reviews,
            payload,
        )
        log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
async def update_session_fingerprint(
    self,
    job_id: UUID,
    session_fingerprint: Dict[str, Any]
):
    """
    Store the browser session fingerprint of a job.

    Intended to be called early in the scraping process, once the browser
    fingerprint has been captured, so the browser characteristics are
    available for bot detection analysis. Also refreshes updated_at.

    Args:
        job_id: Job UUID
        session_fingerprint: Dictionary containing browser fingerprint data:
            - user_agent: Browser user agent string
            - platform: OS platform
            - language: Primary language
            - languages: List of accepted languages
            - timezone: Timezone string
            - screen: {width, height, colorDepth}
            - viewport: {width, height}
            - webgl_vendor: WebGL vendor string
            - webgl_renderer: WebGL renderer string
            - canvas_fingerprint: Canvas fingerprint hash
            - hardware_concurrency: Number of CPU cores
            - device_memory: Device memory in GB
            - bot_detection_tests: {webdriver_hidden, chrome_runtime, permissions_query}
            - captured_at: ISO timestamp when fingerprint was captured
    """
    fingerprint_json = json.dumps(session_fingerprint)

    async with self.pool.acquire() as conn:
        await conn.execute(
            """
            UPDATE jobs
            SET
                session_fingerprint = $2::jsonb,
                updated_at = NOW()
            WHERE job_id = $1
            """,
            job_id,
            fingerprint_json,
        )
        log.debug(f"Updated session fingerprint for job {job_id}")
async def mark_job_partial(
    self,
    job_id: UUID,
    error_message: str,
    scrape_logs: Optional[List[Dict[str, Any]]] = None
):
    """
    Flag a job as 'partial': it crashed, but some reviews were saved.

    Sets the status, the completion time, the error message and the logs.

    Args:
        job_id: Job UUID
        error_message: Error that caused the crash
        scrape_logs: Log entries from the scraper (stored as NULL when
            empty or missing)
    """
    logs_json = json.dumps(scrape_logs) if scrape_logs else None

    async with self.pool.acquire() as conn:
        await conn.execute(
            """
            UPDATE jobs
            SET
                status = 'partial',
                completed_at = NOW(),
                error_message = $2,
                scrape_logs = $3::jsonb
            WHERE job_id = $1
            """,
            job_id,
            error_message,
            logs_json,
        )
        log.info(f"Marked job {job_id} as partial due to: {error_message}")
async def list_jobs(
    self,
    status: Optional[JobStatus] = None,
    limit: int = 100,
    offset: int = 0
) -> List[Dict[str, Any]]:
    """
    Return a page of jobs, newest first.

    The bulky reviews_data and scrape_logs columns are not selected.

    Args:
        status: Optional status filter
        limit: Maximum number of jobs to return
        offset: Number of jobs to skip

    Returns:
        List of job dictionaries
    """
    if status:
        query = """
            SELECT
                job_id,
                status,
                url,
                created_at,
                completed_at,
                reviews_count,
                total_reviews,
                scrape_time,
                error_message,
                metadata,
                review_topics
            FROM jobs
            WHERE status = $1
            ORDER BY created_at DESC
            LIMIT $2 OFFSET $3
        """
        args = (status.value, limit, offset)
    else:
        query = """
            SELECT
                job_id,
                status,
                url,
                created_at,
                completed_at,
                reviews_count,
                total_reviews,
                scrape_time,
                error_message,
                metadata,
                review_topics
            FROM jobs
            ORDER BY created_at DESC
            LIMIT $1 OFFSET $2
        """
        args = (limit, offset)

    async with self.pool.acquire() as conn:
        rows = await conn.fetch(query, *args)

    return [dict(row) for row in rows]
async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:
"""
Get completed jobs that have webhooks pending delivery.
Args:
limit: Maximum number of jobs to return
Returns:
List of job dictionaries with webhook info
"""
async with self.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
job_id,
status,
url,
webhook_url,
webhook_secret,
reviews_count,
scrape_time,
error_message,
completed_at
FROM jobs
WHERE webhook_url IS NOT NULL
AND status IN ('completed', 'failed')
AND job_id NOT IN (
SELECT job_id
FROM webhook_attempts
WHERE success = true
)
ORDER BY completed_at ASC
LIMIT $1
""", limit)
return [dict(row) for row in rows]
async def delete_job(self, job_id: UUID) -> bool:
"""
Delete a job from the database.
Args:
job_id: Job UUID
Returns:
True if deleted, False if not found
"""
async with self.pool.acquire() as conn:
result = await conn.execute("""
DELETE FROM jobs WHERE job_id = $1
""", job_id)
deleted = result.split()[-1] == "1"
if deleted:
log.info(f"Deleted job {job_id}")
return deleted
async def cleanup_old_jobs(self, max_age_days: int = 30):
"""
Delete old completed/failed jobs.
Args:
max_age_days: Maximum age in days before deletion
"""
async with self.pool.acquire() as conn:
result = await conn.execute("""
DELETE FROM jobs
WHERE status IN ('completed', 'failed', 'cancelled')
AND completed_at < NOW() - INTERVAL '%s days'
""", max_age_days)
deleted_count = int(result.split()[-1])
if deleted_count > 0:
log.info(f"Cleaned up {deleted_count} old jobs")
# ==================== Statistics ====================
async def get_stats(self) -> Dict[str, Any]:
"""
Get job statistics.
Returns:
Statistics dictionary
"""
async with self.pool.acquire() as conn:
stats = await conn.fetchrow("""
SELECT
COUNT(*) as total_jobs,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
COUNT(*) FILTER (WHERE status = 'cancelled') as cancelled,
AVG(scrape_time) FILTER (WHERE status = 'completed') as avg_scrape_time,
SUM(reviews_count) FILTER (WHERE status = 'completed') as total_reviews
FROM jobs
""")
return dict(stats)
# ==================== Canary Operations ====================
async def save_canary_result(
self,
success: bool,
reviews_count: Optional[int] = None,
scrape_time: Optional[float] = None,
error_message: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
):
"""
Save canary test result.
Args:
success: Whether canary test succeeded
reviews_count: Number of reviews scraped
scrape_time: Time taken in seconds
error_message: Error message if failed
metadata: Additional metadata
"""
async with self.pool.acquire() as conn:
await conn.execute("""
INSERT INTO canary_results (success, reviews_count, scrape_time, error_message, metadata)
VALUES ($1, $2, $3, $4, $5)
""", success, reviews_count, scrape_time, error_message, json.dumps(metadata) if metadata else None)
async def get_canary_history(self, limit: int = 100) -> List[Dict[str, Any]]:
"""
Get canary test history.
Args:
limit: Maximum number of results to return
Returns:
List of canary result dictionaries
"""
async with self.pool.acquire() as conn:
rows = await conn.fetch("""
SELECT
timestamp,
success,
reviews_count,
scrape_time,
error_message
FROM canary_results
ORDER BY timestamp DESC
LIMIT $1
""", limit)
return [dict(row) for row in rows]
# ==================== Webhook Attempts ====================
async def log_webhook_attempt(
self,
job_id: UUID,
attempt_number: int,
success: bool,
status_code: Optional[int] = None,
error_message: Optional[str] = None,
response_time_ms: Optional[float] = None
):
"""
Log a webhook delivery attempt.
Args:
job_id: Job UUID
attempt_number: Attempt number (1, 2, 3...)
success: Whether delivery succeeded
status_code: HTTP status code
error_message: Error message if failed
response_time_ms: Response time in milliseconds
"""
async with self.pool.acquire() as conn:
await conn.execute("""
INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
VALUES ($1, $2, $3, $4, $5, $6)
""", job_id, attempt_number, success, status_code, error_message, response_time_ms)
# ==================== Crash Reports ====================
async def save_crash_report(self, job_id: str, crash_data: dict) -> str:
    """
    Save a crash report and return the crash_id.

    Args:
        job_id: Job UUID, as a string or a UUID instance
        crash_data: Dictionary containing crash report data:
            - crash_type: Type of crash (required; the column is NOT NULL)
            - error_message: Error message (optional)
            - state: Current state at crash time (required; an empty
              object is stored when the key is missing)
            - metrics_history: Historical metrics (optional)
            - logs_before_crash: Log entries before crash (optional)
            - analysis: Crash analysis data (optional)
            - screenshot_url: URL to screenshot (optional)
            - dom_snapshot_id: UUID of DOM snapshot, as a string or a
              UUID instance (optional)

    Returns:
        UUID of created crash report as string
    """
    def as_uuid(value):
        # UUID() only accepts text; an existing UUID instance is passed through.
        return UUID(value) if isinstance(value, str) else value

    def as_optional_json(value):
        # Empty or missing optional sections are stored as NULL.
        return json.dumps(value) if value else None

    job_uuid = as_uuid(job_id)
    dom_snapshot_id = crash_data.get('dom_snapshot_id')
    dom_snapshot_uuid = as_uuid(dom_snapshot_id) if dom_snapshot_id else None

    async with self.pool.acquire() as conn:
        crash_id = await conn.fetchval(
            """
            INSERT INTO crash_reports (
                job_id,
                crash_type,
                error_message,
                state,
                metrics_history,
                logs_before_crash,
                analysis,
                screenshot_url,
                dom_snapshot_id
            )
            VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb, $8, $9)
            RETURNING crash_id
            """,
            job_uuid,
            crash_data.get('crash_type'),
            crash_data.get('error_message'),
            json.dumps(crash_data.get('state', {})),
            as_optional_json(crash_data.get('metrics_history')),
            as_optional_json(crash_data.get('logs_before_crash')),
            as_optional_json(crash_data.get('analysis')),
            crash_data.get('screenshot_url'),
            dom_snapshot_uuid,
        )

        log.info(f"Saved crash report {crash_id} for job {job_id}, type: {crash_data.get('crash_type')}")
        return str(crash_id)
async def get_crash_report(self, job_id: str) -> Optional[dict]:
"""
Get crash report for a job, if any.
Args:
job_id: Job UUID as string
Returns:
Crash report dictionary or None if not found
"""
async with self.pool.acquire() as conn:
job_uuid = UUID(job_id) if isinstance(job_id, str) else job_id
row = await conn.fetchrow("""
SELECT
crash_id,
job_id,
created_at,
crash_type,
error_message,
state,
metrics_history,
logs_before_crash,
analysis,
screenshot_url,
dom_snapshot_id
FROM crash_reports
WHERE job_id = $1
ORDER BY created_at DESC
LIMIT 1
""", job_uuid)
if not row:
return None
result = dict(row)
# Convert UUIDs to strings for JSON serialization
result['crash_id'] = str(result['crash_id'])
result['job_id'] = str(result['job_id'])
if result.get('dom_snapshot_id'):
result['dom_snapshot_id'] = str(result['dom_snapshot_id'])
return result
async def get_crash_stats(self, days: int = 7) -> dict:
"""
Get crash statistics for the last N days.
Args:
days: Number of days to look back (default: 7)
Returns:
Dictionary with:
- total: Total number of crashes
- by_type: Dict mapping crash type to count
- by_day: List of dicts with date and count
"""
async with self.pool.acquire() as conn:
# Get total count
total = await conn.fetchval("""
SELECT COUNT(*)
FROM crash_reports
WHERE created_at >= NOW() - INTERVAL '%s days'
""", days)
# Get counts by type
type_rows = await conn.fetch("""
SELECT crash_type, COUNT(*) as count
FROM crash_reports
WHERE created_at >= NOW() - INTERVAL '%s days'
GROUP BY crash_type
ORDER BY count DESC
""", days)
by_type = {row['crash_type']: row['count'] for row in type_rows}
# Get counts by day
day_rows = await conn.fetch("""
SELECT DATE(created_at) as date, COUNT(*) as count
FROM crash_reports
WHERE created_at >= NOW() - INTERVAL '%s days'
GROUP BY DATE(created_at)
ORDER BY date DESC
""", days)
by_day = [{'date': str(row['date']), 'count': row['count']} for row in day_rows]
return {
'total': total or 0,
'by_type': by_type,
'by_day': by_day
}

View File

@@ -1,391 +0,0 @@
"""
Date conversion utilities for Google Maps reviews.
"""
import logging
import re
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
# Logger
log = logging.getLogger("scraper")
def relative_to_datetime(date_str: str, lang: str = "en") -> Optional[datetime]:
    """
    Convert a relative date string to a datetime object.

    The string is tried with the given language first and then with the
    other supported languages. Unlike parse_relative_date(), which falls
    back to a random date when nothing matches, this function reports a
    failed conversion as None.

    Args:
        date_str: The relative date string (e.g., "2 years ago")
        lang: Language code ("en", "he" or "th")

    Returns:
        datetime object or None if conversion fails
    """
    if not date_str:
        return None

    try:
        now = datetime.utcnow()
        # Requested language first, then the remaining supported ones.
        candidates = [lang] + [alt for alt in ("en", "he", "th") if alt != lang.lower()]
        for candidate in candidates:
            iso_date = try_parse_date(date_str, candidate, now)
            # try_parse_date hands the input back unchanged when it does
            # not match the candidate language.
            if iso_date != date_str:
                return datetime.fromisoformat(iso_date)
        return None
    except Exception as e:
        log.debug(f"Failed to convert relative date '{date_str}': {e}")
        return None
class DateConverter:
    """Converts string dates in review documents into datetime objects.

    The original notes describe the documents as MongoDB documents.
    """

    @staticmethod
    def _description_language(doc: Dict[str, Any]) -> str:
        """Return the first key of doc['description'], or "en" when there is none."""
        return next(iter(doc.get("description", {}).keys()), "en")

    @staticmethod
    def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert the string dates of one document to datetime objects.

        The document is modified in place: the raw "date" field is removed
        (after being used to fill a missing review_date), the known date
        fields are parsed, and "date" entries inside owner_responses are
        dropped.

        Args:
            doc: Document with string dates

        Returns:
            The same document, with string dates converted where possible
        """
        if "date" in doc:
            # The raw date string is always removed from the document.
            raw_date = doc.pop("date")

            # Use it to fill review_date only when that field is missing or empty.
            if not doc.get("review_date"):
                parsed = relative_to_datetime(raw_date, DateConverter._description_language(doc))
                if parsed:
                    doc["review_date"] = parsed

        # Fields that should be converted to dates
        for field in ("created_date", "last_modified_date", "review_date"):
            value = doc.get(field)
            if not isinstance(value, str):
                continue

            try:
                # ISO format first; a trailing 'Z' is rewritten as a UTC offset.
                doc[field] = datetime.fromisoformat(value.replace('Z', '+00:00'))
            except (ValueError, TypeError):
                # Not ISO: fall back to relative-date parsing and keep the
                # original string when that yields nothing.
                parsed = relative_to_datetime(doc[field], DateConverter._description_language(doc))
                if parsed:
                    doc[field] = parsed

        # Drop the date string from every dictionary-valued owner response.
        responses = doc.get("owner_responses")
        if isinstance(responses, dict):
            for response in responses.values():
                if isinstance(response, dict):
                    response.pop("date", None)

        return doc

    @staticmethod
    def convert_dates_in_reviews(reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Apply convert_dates_in_document to every review.

        Args:
            reviews: Dictionary of review documents keyed by review ID

        Returns:
            The same dictionary, with each review's dates converted
        """
        log.info("Converting string dates to datetime objects...")

        for review_id in reviews:
            reviews[review_id] = DateConverter.convert_dates_in_document(reviews[review_id])

        return reviews
def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
    """
    Convert a relative review date such as "a week ago" or "לפני 7 שנים"
    into an ISO 8601 formatted datetime string.

    The given language is tried first, followed by the other supported
    languages ("en", "he", "th"); the formats each language accepts are
    defined by try_parse_date().

    Parameters:
    - date_str (str): the relative date string.
    - lang (str): language hint, e.g. "en", "he" or "th".
    - now (Optional[datetime]): reference datetime; if None, the current
      UTC time (datetime.utcnow()) is used.

    Returns:
    A string representing the calculated absolute datetime in ISO 8601 format.
    If parsing fails in all supported languages, returns a random date
    between 1 and 365 days before the reference time.

    NOTE(review): the random fallback fabricates a date, so callers cannot
    tell a parsed result from an invented one — confirm this is intended.
    """
    import random

    reference = datetime.utcnow() if now is None else now  # use UTC for consistency

    # Requested language first, then every other supported language.
    candidates = [lang] + [alt for alt in ("en", "he", "th") if alt != lang.lower()]
    for candidate in candidates:
        parsed = try_parse_date(date_str, candidate, reference)
        # try_parse_date returns its input unchanged when nothing matched.
        if parsed != date_str:
            return parsed

    # Nothing matched in any language: pick a random day within the last year.
    days_back = random.randint(1, 365)
    return (reference - timedelta(days=days_back)).isoformat()
def try_parse_date(date_str: str, lang: str, now: datetime) -> str:
    """
    Attempt to parse a relative date string in one specific language.

    Supported languages and units:
    - "en": "<n|a|an> <minute|hour|day|week|month|year>[s] ago"
    - "he": optional "לפני" prefix, an optional number and a unit
      (minutes, hours, days, weeks, months, years), plus the dual forms
      for two days / weeks / months / years / hours
    - "th": "<n> <unit>ที่แล้ว" with the same units

    Months and years are approximated as 30 and 365 days.

    Args:
        date_str: The relative date string
        lang: Language code (case-insensitive)
        now: Reference datetime the offset is subtracted from

    Returns:
        The ISO formatted date if successful, or the original string if
        the text does not match the language (or the language is unknown).
    """
    # Days used for units that timedelta has no keyword for.
    approximate_days = {"month": 30, "year": 365}

    amount = None
    unit = None
    lang_code = lang.lower()

    if lang_code == "en":
        # Pattern: capture number or "a"/"an", then unit.
        m = re.search(
            r'(?P<num>a|an|\d+)\s+(?P<unit>minute|hour|day|week|month|year)s?\s+ago',
            date_str,
            re.IGNORECASE,
        )
        if m:
            num_str = m.group("num").lower()
            amount = 1 if num_str in ("a", "an") else int(num_str)
            unit = m.group("unit").lower()

    elif lang_code == "he":
        # Remove the "לפני" prefix if present
        text = date_str.strip()
        if text.startswith("לפני"):
            text = text[len("לפני"):].strip()

        # Dual forms, where the number and unit are combined in one word.
        dual_forms = {
            "שנתיים": (2, "year"),
            "חודשיים": (2, "month"),
            "שבועיים": (2, "week"),
            "יומיים": (2, "day"),
            "שעתיים": (2, "hour"),
        }
        # Hebrew unit words (singular and plural) mapped to English names.
        he_units = {
            "שנה": "year", "שנים": "year",
            "חודש": "month", "חודשים": "month",
            "שבוע": "week", "שבועות": "week",
            "יום": "day", "ימים": "day",
            "שעה": "hour", "שעות": "hour",
            "דקה": "minute", "דקות": "minute",
        }

        if text in dual_forms:
            amount, unit = dual_forms[text]
        else:
            # Match optional number (or assume 1) and then a unit.
            m = re.search(
                r'(?P<num>\d+|אחד|אחת)?\s*'
                r'(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות|שעה|שעות|דקה|דקות)',
                text,
                re.IGNORECASE,
            )
            if m:
                num_str = m.group("num")
                try:
                    amount = int(num_str) if num_str else 1
                except ValueError:
                    # The number was spelled out as a word ("one").
                    amount = 1
                unit = he_units[m.group("unit")]

    elif lang_code == "th":
        # Thai patterns like "3 วันที่แล้ว" (3 days ago)
        th_units = {
            "วัน": "day",
            "สัปดาห์": "week",
            "เดือน": "month",
            "ปี": "year",
            "ชั่วโมง": "hour",
            "นาที": "minute",
        }
        m = re.search(
            r'(?P<num>\d+)?\s*(?P<unit>วัน|สัปดาห์|เดือน|ปี|ชั่วโมง|นาที)ที่แล้ว',
            date_str,
            re.IGNORECASE,
        )
        if m:
            num_str = m.group("num")
            amount = int(num_str) if num_str else 1
            unit = th_units[m.group("unit")]

    # Nothing matched: hand the original string back so callers can tell.
    if unit is None:
        return date_str

    if unit in approximate_days:
        delta = timedelta(days=approximate_days[unit] * amount)  # approximate
    else:
        # minute / hour / day / week map directly onto timedelta keywords.
        delta = timedelta(**{unit + "s": amount})

    return (now - delta).isoformat()
# def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
# """
# Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
# into an ISO formatted datetime string (UTC).
#
# For English, supported formats include:
# - "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
# For Hebrew, supported formats include:
# - "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
# "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
#
# Parameters:
# - date_str (str): the relative date string.
# - lang (str): "en" for English or "he" for Hebrew.
# - now (Optional[datetime]): reference datetime; if None, current local time is used.
#
# Returns:
# A string representing the calculated absolute datetime in ISO 8601 format,
# or the original date_str if parsing fails.
# """
# if now is None:
# now = datetime.utcnow() # use UTC for consistency
#
# delta = timedelta(0)
#
# if lang.lower() == "en":
# # Pattern: capture number or "a"/"an", then unit.
# pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
# m = pattern.search(date_str)
# if m:
# num_str = m.group("num").lower()
# num = 1 if num_str in ("a", "an") else int(num_str)
# unit = m.group("unit").lower()
# if unit == "day":
# delta = timedelta(days=num)
# elif unit == "week":
# delta = timedelta(weeks=num)
# elif unit == "month":
# delta = timedelta(days=30 * num) # approximate
# elif unit == "year":
# delta = timedelta(days=365 * num) # approximate
# else:
# return date_str # return original if not matched
# elif lang.lower() == "he":
# # Remove the "לפני" prefix if present
# text = date_str.strip()
# if text.startswith("לפני"):
# text = text[len("לפני"):].strip()
#
# # Handle special cases where the number and unit are combined:
# special = {
# "חודשיים": (2, "month"),
# "שבועיים": (2, "week"),
# "יומיים": (2, "day"),
# }
# if text in special:
# num, unit = special[text]
# else:
# # Match optional number (or assume 1) and then a unit.
# pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
# re.IGNORECASE)
# m = pattern.search(text)
# if m:
# num_str = m.group("num")
# if not num_str:
# num = 1
# else:
# try:
# num = int(num_str)
# except ValueError:
# num = 1
# unit_he = m.group("unit")
# # Map the Hebrew unit (both singular and plural) to English unit names
# if unit_he in ("יום", "ימים"):
# unit = "day"
# elif unit_he in ("שבוע", "שבועות"):
# unit = "week"
# elif unit_he in ("חודש", "חודשים"):
# unit = "month"
# elif unit_he in ("שנה", "שנים"):
# unit = "year"
# else:
# unit = "day" # fallback
# else:
# return date_str # if nothing matches, return original text
#
# if unit == "day":
# delta = timedelta(days=num)
# elif unit == "week":
# delta = timedelta(weeks=num)
# elif unit == "month":
# delta = timedelta(days=30 * num) # approximate
# elif unit == "year":
# delta = timedelta(days=365 * num) # approximate
#
# result = now - delta
# return result.isoformat()
# --- Example usage ---
if __name__ == "__main__":
    # Fixed reference time for reproducibility:
    fixed_now = datetime(2025, 2, 5, 12, 0, 0)

    # (relative date text, language hint) pairs. The first pair combines
    # English text with the "he" hint.
    examples = (
        ("a week ago", "he"),
        ("4 weeks ago", "en"),
        ("לפני 7 שנים", "he"),
        ("לפני חודשיים", "he"),
    )

    for sample_text, sample_lang in examples:
        converted = parse_relative_date(sample_text, sample_lang, now=fixed_now)
        print(f"Original: {sample_text} ({sample_lang}) => ISO: {converted}")

View File

@@ -1,411 +0,0 @@
#!/usr/bin/env python3
"""
Smart health check system with canary testing.
Verifies that scraping actually works, not just that services are up.
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
import os
log = logging.getLogger(__name__)
class CanaryMonitor:
    """
    Background canary test monitor.

    Runs actual scraping tests periodically to verify the scraper works.
    This catches issues like:
    - Google Maps page structure changes
    - Broken CSS selectors
    - GDPR consent handling issues
    - Network/proxy problems
    - Chrome/browser issues

    Failure accounting: ``consecutive_failures`` is incremented exactly once
    per failed run, inside ``run_canary_test``. ``start`` only reads the
    counter, and persistence errors are logged without touching it.
    """

    def __init__(
        self,
        db,
        interval_hours: int = 4,
        test_url: Optional[str] = None
    ):
        """
        Initialize canary monitor.

        Args:
            db: Database manager instance (must provide an awaitable
                ``save_canary_result``)
            interval_hours: How often to run canary tests
            test_url: Optional test URL; falls back to the CANARY_TEST_URL
                environment variable, then to a built-in default place URL
        """
        self.db = db
        self.interval = timedelta(hours=interval_hours)
        self.test_url = test_url or os.getenv(
            'CANARY_TEST_URL',
            'https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/'
        )
        self.running = False
        self.last_run: Optional[datetime] = None
        self.last_success: Optional[datetime] = None
        self.consecutive_failures = 0
        self.last_result: Optional[Dict[str, Any]] = None

    async def start(self):
        """Run canary tests in a loop until ``stop()`` clears ``running``."""
        self.running = True
        log.info(f"Canary monitor started (interval: {self.interval.total_seconds()/3600:.1f}h)")
        while self.running:
            try:
                await self.run_canary_test()
            except Exception as e:
                log.error(f"Canary test failed with exception: {e}")
                # run_canary_test() has already counted this failure before
                # re-raising; incrementing here again would count it twice.
                if self.consecutive_failures >= 3:
                    await self.send_alert(
                        f"🚨 CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row! "
                        f"Last error: {str(e)[:200]}"
                    )
            # Sleep until next run
            await asyncio.sleep(self.interval.total_seconds())

    def stop(self):
        """Stop the background monitoring (takes effect after the current sleep)."""
        self.running = False
        log.info("Canary monitor stopped")

    async def _save_result(self, **fields) -> None:
        """Persist one canary outcome; storage errors are logged, never raised."""
        try:
            await self.db.save_canary_result(**fields)
        except Exception as e:
            # A database problem must not turn a passing canary into a failure
            # or mask the scrape error that is currently being handled.
            log.error(f"Failed to save canary result: {e}")

    async def run_canary_test(self):
        """
        Run a single canary test.

        This performs an actual scrape on a known test URL and validates:
        - Scraping succeeds
        - Reviews are extracted
        - Review count is reasonable
        - Scrape time is reasonable
        - Data structure is valid

        Raises:
            Exception: any non-timeout error from the scrape or from reading
                its result is recorded and then re-raised to the caller.
        """
        log.info(f"Running canary scrape test on {self.test_url[:60]}...")
        self.last_run = datetime.now()

        try:
            # Imported lazily and inside the try so an import failure is
            # recorded like any other canary error.
            from modules.scraper_clean import fast_scrape_reviews

            # Run actual scrape with timeout. The worker thread itself cannot
            # be cancelled; on timeout only the await is abandoned.
            result = await asyncio.wait_for(
                asyncio.to_thread(
                    fast_scrape_reviews,
                    url=self.test_url,
                    headless=True,
                    max_scrolls=10  # Limited for canary
                ),
                timeout=60  # Fail if takes > 60s
            )

            # Validate result
            checks = {
                "scrape_succeeded": result['success'],
                "got_reviews": result['count'] > 0,
                "reasonable_count": 10 <= result['count'] <= 500,
                "reasonable_time": result['time'] < 30,
                "data_structure_valid": self._validate_review_structure(result.get('reviews', []))
            }
        except asyncio.TimeoutError:
            log.error("❌ Canary test TIMEOUT (>60s)")
            self.consecutive_failures += 1
            self.last_result = {
                "status": "timeout",
                "error": "Scrape took longer than 60 seconds"
            }
            await self._save_result(
                success=False,
                error_message="Timeout after 60 seconds"
            )
            if self.consecutive_failures >= 3:
                await self.send_alert(
                    f"🚨 CRITICAL: Canary timeout {self.consecutive_failures} times!"
                )
            return
        except Exception as e:
            log.error(f"❌ Canary test ERROR: {e}")
            self.consecutive_failures += 1
            self.last_result = {
                "status": "error",
                "error": str(e)
            }
            await self._save_result(
                success=False,
                error_message=str(e)
            )
            raise  # Re-raise to trigger alert in main loop

        if all(checks.values()):
            # Success!
            log.info(
                f"✅ Canary test PASSED: {result['count']} reviews in {result['time']:.1f}s"
            )
            self.consecutive_failures = 0
            self.last_success = datetime.now()
            self.last_result = {
                "status": "pass",
                "reviews_count": result['count'],
                "scrape_time": result['time'],
                "checks": checks
            }
            await self._save_result(
                success=True,
                reviews_count=result['count'],
                scrape_time=result['time'],
                metadata={"checks": checks}
            )
            return

        # Validation failed
        failed_checks = [k for k, v in checks.items() if not v]
        log.error(
            f"❌ Canary test FAILED: validation failed on {failed_checks}"
        )
        self.consecutive_failures += 1
        self.last_result = {
            "status": "fail",
            "reviews_count": result['count'],
            "scrape_time": result['time'],
            "checks": checks,
            "failed_checks": failed_checks
        }
        await self._save_result(
            success=False,
            reviews_count=result['count'],
            scrape_time=result['time'],
            error_message=f"Validation failed: {failed_checks}",
            metadata={"checks": checks}
        )
        # Alert on failure
        if self.consecutive_failures >= 3:
            await self.send_alert(
                f"🚨 CRITICAL: Canary validation failed {self.consecutive_failures} times! "
                f"Failed checks: {failed_checks}"
            )

    def _validate_review_structure(self, reviews) -> bool:
        """
        Validate that reviews have expected structure.

        Only the first review is inspected.

        Args:
            reviews: List of review dictionaries

        Returns:
            True if the list is non-empty and its first item contains the
            'author', 'rating' and 'date_text' keys
        """
        if not reviews:
            return False
        first_review = reviews[0]
        required_fields = ['author', 'rating', 'date_text']
        return all(field in first_review for field in required_fields)

    async def send_alert(self, message: str):
        """
        Send alert via configured channels.

        The message is always logged at CRITICAL level; it is additionally
        posted to Slack when SLACK_WEBHOOK_URL is set. Slack delivery errors
        are logged and swallowed.

        Args:
            message: Alert message to send
        """
        log.critical(message)

        # TODO: Integrate with alerting systems
        # Slack
        slack_webhook = os.getenv('SLACK_WEBHOOK_URL')
        if slack_webhook:
            try:
                import httpx
                async with httpx.AsyncClient() as client:
                    await client.post(
                        slack_webhook,
                        json={"text": message},
                        timeout=5.0
                    )
                log.info("Alert sent to Slack")
            except Exception as e:
                log.error(f"Failed to send Slack alert: {e}")

        # Email (example with SMTP)
        # smtp_config = os.getenv('SMTP_CONFIG')
        # if smtp_config:
        #     await send_email(
        #         to=os.getenv('ALERT_EMAIL'),
        #         subject="Scraper Canary Alert",
        #         body=message
        #     )

        # PagerDuty
        # pagerduty_key = os.getenv('PAGERDUTY_KEY')
        # if pagerduty_key:
        #     await trigger_pagerduty(message)

    def get_status(self) -> Dict[str, Any]:
        """
        Get current canary status.

        Returns:
            Status dictionary whose "status" is one of:
            - "unknown": no canary run has finished yet
            - "failing": runs have finished but none has ever succeeded
            - "stale": the last success is more than 6 hours old
            - "healthy": the last success is recent
        """
        last_run_iso = self.last_run.isoformat() if self.last_run else None

        if not self.last_success:
            if self.last_result is None:
                return {
                    "status": "unknown",
                    "message": "No canary tests run yet",
                    "last_run": last_run_iso
                }
            # Runs completed but never passed: report it instead of hiding
            # the failures behind "unknown".
            return {
                "status": "failing",
                "message": "No canary test has succeeded yet",
                "last_run": last_run_iso,
                "consecutive_failures": self.consecutive_failures,
                "last_result": self.last_result
            }

        age = datetime.now() - self.last_success
        max_age = timedelta(hours=6)  # Alert if no success in 6 hours
        if age > max_age:
            return {
                "status": "stale",
                "last_success": self.last_success.isoformat(),
                "age_hours": age.total_seconds() / 3600,
                "consecutive_failures": self.consecutive_failures,
                "message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
            }

        return {
            "status": "healthy",
            "last_success": self.last_success.isoformat(),
            "last_run": last_run_iso,
            "age_minutes": age.total_seconds() / 60,
            "consecutive_failures": self.consecutive_failures,
            "last_result": self.last_result
        }
class HealthCheckSystem:
    """
    Complete health check system for production.

    Provides multiple levels of health checks:
    - Liveness: Is the server alive?
    - Readiness: Can it handle traffic?
    - Canary: Does scraping actually work?
    """

    def __init__(self, db):
        """
        Initialize health check system.

        Args:
            db: Database manager instance
        """
        self.db = db
        self.canary = CanaryMonitor(db, interval_hours=4)
        # Strong reference to the background task: the event loop keeps only
        # weak references, so an unreferenced task may be garbage-collected
        # while it is still running.
        self._canary_task: Optional[asyncio.Task] = None

    async def start(self):
        """Start background health monitoring (no-op if already running)."""
        existing = self._canary_task
        if existing is not None and not existing.done():
            return
        self._canary_task = asyncio.create_task(self.canary.start())

    def stop(self):
        """Stop background health monitoring and cancel the pending canary task."""
        self.canary.stop()
        task, self._canary_task = self._canary_task, None
        if task is not None and not task.done():
            # Without cancellation the loop would only exit after its current
            # multi-hour sleep finishes.
            # NOTE(review): assumes stop() is called from the event-loop thread.
            try:
                task.cancel()
            except RuntimeError:
                # Event loop already closed; nothing left to cancel.
                pass

    async def check_liveness(self) -> Dict[str, Any]:
        """
        Liveness check: Is the server alive?

        This is a simple check that always succeeds if the server is running.
        Used by Kubernetes liveness probe - restart container if fails.

        Returns:
            Liveness status
        """
        return {
            "status": "alive",
            "timestamp": datetime.utcnow().isoformat()
        }

    async def check_readiness(self) -> Dict[str, Any]:
        """
        Readiness check: Can the server handle traffic?

        Checks if dependencies are available (currently: a ``SELECT 1``
        against the database pool).
        Used by Kubernetes readiness probe - remove from load balancer if fails.

        Returns:
            Readiness status
        """
        checks = {}

        # Check database
        try:
            await self.db.pool.fetchval("SELECT 1")
            checks["database"] = {"healthy": True}
        except Exception as e:
            checks["database"] = {"healthy": False, "error": str(e)}

        # Overall readiness
        all_healthy = all(c.get("healthy", False) for c in checks.values())
        return {
            "status": "ready" if all_healthy else "not_ready",
            "checks": checks,
            "timestamp": datetime.utcnow().isoformat()
        }

    async def check_canary(self) -> Dict[str, Any]:
        """
        Canary check: Does scraping actually work?

        Returns the latest canary test result.
        Used by external monitoring (PagerDuty, DataDog) for alerts.

        Returns:
            Canary status
        """
        return self.canary.get_status()

    async def get_detailed_health(self) -> Dict[str, Any]:
        """
        Get detailed health status of all components.

        Returns:
            Complete health status; "healthy" only when liveness is "alive",
            readiness is "ready" and the canary is "healthy" or "unknown".
        """
        liveness = await self.check_liveness()
        readiness = await self.check_readiness()
        canary = await self.check_canary()

        overall_healthy = (
            liveness["status"] == "alive" and
            readiness["status"] == "ready" and
            canary["status"] in ["healthy", "unknown"]  # Unknown is OK (first run)
        )
        return {
            "status": "healthy" if overall_healthy else "degraded",
            "components": {
                "liveness": liveness,
                "readiness": readiness,
                "canary": canary
            },
            "timestamp": datetime.utcnow().isoformat()
        }

View File

@@ -1,93 +0,0 @@
"""
Data models for Google Maps Reviews Scraper.
"""
import re
from dataclasses import dataclass, field
from selenium.webdriver.remote.webelement import WebElement
from modules.utils import (try_find, first_text, first_attr, safe_int, detect_lang, parse_date_to_iso)
@dataclass
class RawReview:
    """
    Data class representing a raw review extracted from Google Maps.

    Instances are normally built with ``RawReview.from_card``, which reads
    every field from one review card element.
    """
    id: str = ""
    author: str = ""
    rating: float = 0.0
    date: str = ""          # date text exactly as shown on the card
    lang: str = "und"
    text: str = ""
    likes: int = 0
    photos: list[str] = field(default_factory=list)
    profile: str = ""
    avatar: str = ""  # URL to profile picture
    owner_date: str = ""
    owner_text: str = ""
    review_date: str = ""  # ISO format date

    # Translation fields
    translations: dict = field(default_factory=dict)  # Store translations by language code

    # CSS Selectors for review elements
    MORE_BTN = "button.kyuRq"
    LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
    PHOTO_BTN = "button.Tya61d"
    OWNER_RESP = "div.CDe7pd"

    # First decimal number in a rating label. Unlike a bare character class
    # of digits and dots, this cannot match a lone "." (which float() rejects).
    _RATING_RE = re.compile(r"\d+\.?\d*|\.\d+")

    @staticmethod
    def _parse_rating(label: str) -> float:
        """Return the first number found in *label* (comma treated as decimal point), or 0.0."""
        if not label:
            return 0.0
        found = RawReview._RATING_RE.search(label.replace(",", "."))
        return float(found.group()) if found else 0.0

    @classmethod
    def from_card(cls, card: "WebElement") -> "RawReview":
        """
        Factory method to create a RawReview from a WebElement.

        Args:
            card: The review card element to read from.

        Returns:
            A populated RawReview; fields that cannot be found keep their
            empty defaults.
        """
        # expand "More" - non-blocking approach: a failed click just leaves
        # the text truncated.
        for b in try_find(card, cls.MORE_BTN, all=True):
            try:
                b.click()
            except Exception:
                pass

        # Try to get data-review-id from the card itself, or from a child element
        rid = card.get_attribute("data-review-id") or ""
        if not rid:
            review_id_elem = try_find(card, "[data-review-id]")
            if review_id_elem:
                rid = review_id_elem[0].get_attribute("data-review-id") or ""

        author = first_text(card, 'div[class*="d4r55"]')
        profile = first_attr(card, 'button[data-review-id]', "data-href")
        avatar = first_attr(card, 'button[data-review-id] img', "src")

        label = first_attr(card, 'span[role="img"]', "aria-label")
        rating = cls._parse_rating(label)

        date = first_text(card, 'span[class*="rsqaWe"]')
        # Parse the date string to ISO format
        review_date = parse_date_to_iso(date)

        # Try the known text containers in order and keep the first non-empty one.
        text = ""
        for sel in ('span[jsname="bN97Pc"]',
                    'span[jsname="fbQN7e"]',
                    'div.MyEned span.wiI7pd'):
            text = first_text(card, sel)
            if text:
                break
        lang = detect_lang(text)

        likes = 0
        if (btn := try_find(card, cls.LIKE_BTN)):
            likes = safe_int(btn[0].text or btn[0].get_attribute("aria-label"))

        # Photo URLs are read from each photo button's inline style url("...").
        photos: list[str] = []
        for btn in try_find(card, cls.PHOTO_BTN, all=True):
            if (m := re.search(r'url\("([^"]+)"', btn.get_attribute("style") or "")):
                photos.append(m.group(1))

        owner_date = owner_text = ""
        if (box := try_find(card, cls.OWNER_RESP)):
            box = box[0]
            owner_date = first_text(box, "span.DZSIDd")
            owner_text = first_text(box, "div.wiI7pd")

        # Keyword arguments keep this call correct if fields are ever reordered.
        return cls(
            id=rid,
            author=author,
            rating=rating,
            date=date,
            lang=lang,
            text=text,
            likes=likes,
            photos=photos,
            profile=profile,
            avatar=avatar,
            owner_date=owner_date,
            owner_text=owner_text,
            review_date=review_date,
        )

File diff suppressed because it is too large Load Diff

View File

@@ -1,250 +0,0 @@
"""
Structured Logger Module
Provides a thread-safe, structured logging system with JSON-serializable output.
Designed to replace the LogCapture class with enhanced categorization and metrics support.
"""
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Dict, List, Literal, Optional
import threading
import time
LogLevel = Literal['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL']
LogCategory = Literal['scraper', 'browser', 'network', 'system']


@dataclass
class LogEntry:
    """One structured log record: when it happened, how severe, which area, and optional extras."""
    timestamp: str  # ISO 8601 with Z suffix
    timestamp_ms: int  # Unix milliseconds
    level: LogLevel
    category: LogCategory
    message: str
    metrics: Optional[Dict] = None  # memory_mb, reviews_count, scroll_position, dom_nodes, etc.
    network: Optional[Dict] = None  # url, method, status, size_bytes, duration_ms
    snapshot_id: Optional[str] = None

    def to_dict(self) -> Dict:
        """Return a JSON-serializable dict; optional fields that are None are left out."""
        payload = {
            'timestamp': self.timestamp,
            'timestamp_ms': self.timestamp_ms,
            'level': self.level,
            'category': self.category,
            'message': self.message,
        }
        # Optional fields are appended in a fixed order, only when set.
        for optional_key in ('metrics', 'network', 'snapshot_id'):
            optional_value = getattr(self, optional_key)
            if optional_value is not None:
                payload[optional_key] = optional_value
        return payload
class StructuredLogger:
    """
    In-memory structured logger guarded by a lock, with per-category entries and size-based pruning.

    Example usage:
        logger = StructuredLogger()
        logger.info('browser', 'Navigating to URL', metrics={'memory_mb': 245})
        logger.warn('network', 'Rate limit detected', network={'status': 429, 'url': '...'})
        logger.error('system', 'Chrome crashed', metrics={'memory_mb': 489, 'dom_nodes': 12000})
    """

    def __init__(self, max_entries: int = 10000):
        """
        Create an empty logger.

        Args:
            max_entries: Upper bound on retained entries (default 10000).
                         Once exceeded, the oldest entries are dropped.
        """
        self._max_entries = max_entries
        self._lock = threading.Lock()
        self._entries: List[LogEntry] = []

    def _create_entry(
        self,
        level: LogLevel,
        category: LogCategory,
        message: str,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> LogEntry:
        """Build a LogEntry stamped with the current UTC time (millisecond precision)."""
        moment = datetime.now(timezone.utc)
        millis = moment.microsecond // 1000
        iso_stamp = f"{moment.strftime('%Y-%m-%dT%H:%M:%S.')}{millis:03d}Z"
        return LogEntry(
            timestamp=iso_stamp,
            timestamp_ms=int(moment.timestamp() * 1000),
            level=level,
            category=category,
            message=message,
            metrics=metrics,
            network=network,
            snapshot_id=snapshot_id,
        )

    def _add_entry(self, entry: LogEntry) -> None:
        """Append *entry* under the lock, pruning the oldest entries when over the limit."""
        with self._lock:
            self._entries.append(entry)
            if len(self._entries) <= self._max_entries:
                return
            # Drop the oldest 10% in one go so pruning does not run on every append.
            drop = max(1, self._max_entries // 10)
            self._entries = self._entries[drop:]

    def _emit(self, level, category, message, metrics=None, network=None, snapshot_id=None) -> None:
        """Create an entry at *level* and store it."""
        self._add_entry(
            self._create_entry(level, category, message, metrics, network, snapshot_id)
        )

    def _select(self, keep=None) -> List[Dict]:
        """Return entries as dicts under the lock, optionally filtered by the *keep* predicate."""
        with self._lock:
            return [item.to_dict() for item in self._entries if keep is None or keep(item)]

    def debug(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at DEBUG level."""
        self._emit('DEBUG', category, message, metrics, network, snapshot_id)

    def info(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at INFO level."""
        self._emit('INFO', category, message, metrics, network, snapshot_id)

    def warn(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at WARN level."""
        self._emit('WARN', category, message, metrics, network, snapshot_id)

    def error(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at ERROR level."""
        self._emit('ERROR', category, message, metrics, network, snapshot_id)

    def fatal(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at FATAL level."""
        self._emit('FATAL', category, message, metrics, network, snapshot_id)

    def log(self, message: str, level: str = 'INFO') -> None:
        """
        Legacy-style entry point kept for older call sites.

        Entries are filed under the 'system' category.

        Args:
            message: The log message
            level: Level name in any case (DEBUG, INFO, WARN, ERROR, FATAL);
                   unrecognized names are recorded as INFO
        """
        known_levels = ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL')
        requested = level.upper()
        effective = requested if requested in known_levels else 'INFO'
        self._emit(effective, 'system', message)

    def get_logs(self) -> List[Dict]:
        """
        Return every retained entry as a JSON-serializable dictionary.

        Returns:
            List of log entry dictionaries.
        """
        return self._select()

    def get_logs_by_category(self, category: LogCategory) -> List[Dict]:
        """
        Return only the entries of one category.

        Args:
            category: The category to filter by ('scraper', 'browser', 'network', 'system')

        Returns:
            List of log entry dictionaries matching the category.
        """
        return self._select(lambda item: item.category == category)

    def get_logs_by_level(self, level: LogLevel) -> List[Dict]:
        """
        Return only the entries of one level.

        Args:
            level: The level to filter by ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL')

        Returns:
            List of log entry dictionaries matching the level.
        """
        return self._select(lambda item: item.level == level)

    def get_logs_since(self, timestamp_ms: int) -> List[Dict]:
        """
        Return the entries recorded at or after a point in time.

        Args:
            timestamp_ms: Unix timestamp in milliseconds

        Returns:
            List of log entry dictionaries with timestamp >= timestamp_ms.
        """
        return self._select(lambda item: item.timestamp_ms >= timestamp_ms)

    def clear(self) -> None:
        """Remove every retained entry."""
        with self._lock:
            self._entries.clear()

    def count(self) -> int:
        """Return how many entries are currently retained."""
        with self._lock:
            return len(self._entries)

    def __len__(self) -> int:
        """Return how many entries are currently retained."""
        return self.count()

View File

@@ -1,307 +0,0 @@
"""
Utility functions for Google Maps Reviews Scraper.
"""
import datetime
import logging
import re
import time
from datetime import timezone
from functools import lru_cache
from typing import List
from selenium.common.exceptions import (NoSuchElementException,
StaleElementReferenceException,
TimeoutException)
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Logger
log = logging.getLogger("scraper")
# Unicode block patterns used for script-based language detection
HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")


@lru_cache(maxsize=1024)
def detect_lang(txt: str) -> str:
    """Return "he" or "th" if *txt* contains Hebrew or Thai characters (Hebrew checked first), else "en"."""
    for code, script_pattern in (("he", HEB_CHARS), ("th", THAI_CHARS)):
        if script_pattern.search(txt):
            return code
    return "en"
@lru_cache(maxsize=128)
def safe_int(s: str | None) -> int:
"""Safely convert string to integer, returning 0 if not possible"""
m = re.search(r"\d+", s or "")
return int(m.group()) if m else 0
def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
    """
    Look up elements under *el* by CSS selector, returning [] instead of raising.

    With ``all=True`` every match is returned; otherwise a list holding at
    most the first match. Missing or stale elements yield an empty list.
    """
    try:
        if not all:
            single = el.find_element(By.CSS_SELECTOR, css)
            return [single] if single else []
        return el.find_elements(By.CSS_SELECTOR, css)
    except (NoSuchElementException, StaleElementReferenceException):
        return []
def first_text(el: WebElement, css: str) -> str:
    """Return the stripped text of the first match for *css* whose text is non-empty, else ""."""
    for candidate in try_find(el, css, all=True):
        try:
            stripped = candidate.text.strip()
        except StaleElementReferenceException:
            # Element vanished between lookup and read; try the next match.
            continue
        if stripped:
            return stripped
    return ""
def parse_date_to_iso(date_str: str) -> str:
    """
    Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.

    Relative "... ago" strings are converted by subtracting the stated amount
    from the current UTC time (a missing number such as "a week ago" counts
    as 1; months are approximated as 30 days and years as 365 days).
    Absolute dates are not actually parsed yet: any string without "ago",
    or with an unrecognized unit, maps to the current UTC time.

    Args:
        date_str: Date text as displayed on the review.

    Returns:
        A best-effort ISO 8601 string (UTC, second precision), or an empty
        string if the input is empty or the resulting date is out of range.
    """
    # Local import: at module level `datetime` is bound to the *module* and
    # `timezone` has no `timedelta` attribute, so the previous
    # `datetime.now(...)` / `timezone.timedelta(...)` calls always raised and
    # the function silently returned "" for every input.
    from datetime import datetime, timedelta, timezone

    if not date_str:
        return ""

    try:
        now = datetime.now(timezone.utc).replace(microsecond=0)
        lowered = date_str.lower()

        if "ago" not in lowered:
            # Handle absolute dates (month year format)
            # This is a simplification - would need more robust parsing for production
            return now.isoformat()

        number = re.search(r'\d+', date_str)
        amount = int(number.group()) if number else 1

        # Checked in this order; the first unit keyword found wins.
        units = (
            ("minute", timedelta(minutes=1)),
            ("hour", timedelta(hours=1)),
            ("day", timedelta(days=1)),
            ("week", timedelta(weeks=1)),
            ("month", timedelta(days=30)),   # approximate
            ("year", timedelta(days=365)),   # approximate
        )
        for keyword, step in units:
            if keyword in lowered:
                return (now - amount * step).isoformat()

        # Default to current time if the unit can't be recognized
        return now.isoformat()
    except (OverflowError, ValueError):
        # Amount too large to represent as a date: treat as unparseable.
        return ""
def first_attr(el: WebElement, css: str, attr: str) -> str:
    """Return the stripped *attr* value of the first match for *css* that has a non-empty value, else ""."""
    for candidate in try_find(el, css, all=True):
        try:
            value = (candidate.get_attribute(attr) or "").strip()
        except StaleElementReferenceException:
            # Element vanished between lookup and read; try the next match.
            continue
        if value:
            return value
    return ""
def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
    """
    Click the first usable element matching *css*, if there is one.

    Matches that are already displayed and enabled are tried first; if none
    of them could be clicked, the function waits up to *timeout* seconds for
    a match to become clickable. Any unexpected error is logged at debug
    level and reported as False.

    Args:
        driver: WebDriver instance
        css: CSS selector for the element to click
        delay: Pause after a successful click (seconds)
        timeout: Maximum wait for a clickable element (seconds)

    Returns:
        True if an element was clicked, False otherwise
    """
    locator = (By.CSS_SELECTOR, css)
    try:
        candidates = driver.find_elements(*locator)
        # Nothing matches at all: skip the (slow) explicit wait entirely.
        if not candidates:
            return False

        # Pass 1: click the first match that is visible and enabled right now.
        for candidate in candidates:
            try:
                if not (candidate.is_displayed() and candidate.is_enabled()):
                    continue
                candidate.click()
                time.sleep(delay)
                return True
            except Exception:
                # This match failed; move on to the next one.
                continue

        # Pass 2: matches exist but none was clickable yet, so wait for one.
        try:
            clickable = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable(locator)
            )
            clickable.click()
            time.sleep(delay)
            return True
        except TimeoutException:
            return False
    except Exception as e:
        log.debug(f"Error in click_if: {str(e)}")
        return False
def get_current_iso_date() -> str:
    """Return the current time as a timezone-aware UTC ISO 8601 string."""
    # Imported locally: the module-level name `datetime` refers to the module.
    from datetime import datetime, timezone
    utc_now = datetime.now(timezone.utc)
    return utc_now.isoformat()
# """
# Utility functions for Google Maps Reviews Scraper.
# """
#
# import re
# import time
# import logging
# from datetime import datetime, timezone
# from functools import lru_cache
# from typing import List, Optional
#
# from selenium.common.exceptions import (NoSuchElementException,
# StaleElementReferenceException,
# TimeoutException)
# from selenium.webdriver import Chrome
# from selenium.webdriver.common.by import By
# from selenium.webdriver.remote.webelement import WebElement
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import WebDriverWait
#
# # Constants for language detection
# HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
# THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
#
# # Logger
# log = logging.getLogger("scraper")
#
#
# @lru_cache(maxsize=1024)
# def detect_lang(txt: str) -> str:
# """Detect language based on character sets"""
# if HEB_CHARS.search(txt): return "he"
# if THAI_CHARS.search(txt): return "th"
# return "en"
#
#
# @lru_cache(maxsize=128)
# def safe_int(s: str | None) -> int:
# """Safely convert string to integer, returning 0 if not possible"""
# m = re.search(r"\d+", s or "")
# return int(m.group()) if m else 0
#
#
# def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
# """Safely find elements by CSS selector without raising exceptions"""
# try:
# if all:
# return el.find_elements(By.CSS_SELECTOR, css)
# obj = el.find_element(By.CSS_SELECTOR, css)
# return [obj] if obj else []
# except (NoSuchElementException, StaleElementReferenceException):
# return []
#
#
# def first_text(el: WebElement, css: str) -> str:
# """Get text from the first matching element that has non-empty text"""
# for e in try_find(el, css, all=True):
# if (t := e.text.strip()):
# return t
# return ""
#
#
# def first_attr(el: WebElement, css: str, attr: str) -> str:
# """Get attribute value from the first matching element that has a non-empty value"""
# for e in try_find(el, css, all=True):
# if (v := (e.get_attribute(attr) or "").strip()):
# return v
# return ""
#
#
# def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
# """Click element if it exists and is clickable, with timeout"""
# try:
# WebDriverWait(driver, timeout).until(
# EC.element_to_be_clickable((By.CSS_SELECTOR, css))
# ).click()
# time.sleep(delay)
# return True
# except TimeoutException:
# return False
#
#
# def parse_date_to_iso(date_str: str) -> str:
# """
# Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.
# Returns a best-effort ISO string, or empty string if parsing fails.
# """
# if not date_str:
# return ""
#
# try:
# now = datetime.now(timezone.utc)
#
# # Handle relative dates
# if "ago" in date_str.lower():
# # For simplicity, map to approximate dates
# if "minute" in date_str.lower():
# minutes = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(minutes=minutes)
# elif "hour" in date_str.lower():
# hours = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(hours=hours)
# elif "day" in date_str.lower():
# days = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(days=days)
# elif "week" in date_str.lower():
# weeks = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(weeks=weeks)
# elif "month" in date_str.lower():
# months = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate months as 30 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=30 * months)
# elif "year" in date_str.lower():
# years = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate years as 365 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=365 * years)
# else:
# # Default to current time if can't parse
# dt = now.replace(microsecond=0)
# else:
# # Handle absolute dates (month year format)
# # This is a simplification - would need more robust parsing for production
# dt = now.replace(microsecond=0)
#
# return dt.isoformat()
# except Exception:
# # If parsing fails, return empty string
# return ""
#
#
# def get_current_iso_date() -> str:
# """Return current UTC time in ISO format."""
# return datetime.now(timezone.utc).isoformat()

View File

@@ -1,373 +0,0 @@
#!/usr/bin/env python3
"""
Webhook delivery system with retry logic and security.
"""
import asyncio
import hmac
import hashlib
import json
import logging
from typing import Dict, Any, Optional
from datetime import datetime
import httpx
from uuid import UUID
log = logging.getLogger(__name__)
class WebhookDeliveryError(Exception):
    """Signals that a webhook could not be delivered even after every retry."""
class WebhookManager:
"""
Manages webhook delivery with retry logic and security.
Features:
- Exponential backoff retry (3 attempts)
- HMAC signature for security
- Timeout handling
- Async delivery
- Logging of all attempts
"""
def __init__(
self,
max_retries: int = 3,
timeout: float = 10.0,
initial_retry_delay: float = 2.0
):
"""
Initialize webhook manager.
Args:
max_retries: Maximum number of delivery attempts
timeout: Request timeout in seconds
initial_retry_delay: Initial delay between retries (exponential backoff)
"""
self.max_retries = max_retries
self.timeout = timeout
self.initial_retry_delay = initial_retry_delay
def generate_signature(self, payload: str, secret: str) -> str:
"""
Generate HMAC-SHA256 signature for webhook payload.
Args:
payload: JSON string payload
secret: Webhook secret
Returns:
Hex-encoded signature
"""
return hmac.new(
secret.encode('utf-8'),
payload.encode('utf-8'),
hashlib.sha256
).hexdigest()
async def send_webhook(
self,
webhook_url: str,
payload: Dict[str, Any],
secret: Optional[str] = None,
job_id: Optional[UUID] = None,
db=None
) -> bool:
"""
Send webhook with retry logic.
Args:
webhook_url: URL to send webhook to
payload: Webhook payload dictionary
secret: Optional webhook secret for HMAC signature
job_id: Optional job ID for logging attempts
db: Optional database manager for logging
Returns:
True if delivery succeeded, False otherwise
"""
payload_json = json.dumps(payload, default=str)
for attempt in range(1, self.max_retries + 1):
try:
start_time = datetime.now()
# Prepare headers
headers = {
"Content-Type": "application/json",
"User-Agent": "GoogleReviewsScraper-Webhook/1.0"
}
# Add signature if secret provided
if secret:
signature = self.generate_signature(payload_json, secret)
headers["X-Webhook-Signature"] = f"sha256={signature}"
headers["X-Webhook-Timestamp"] = str(int(datetime.now().timestamp()))
# Send webhook
async with httpx.AsyncClient() as client:
response = await client.post(
webhook_url,
content=payload_json,
headers=headers,
timeout=self.timeout
)
response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
# Check response
if response.status_code in [200, 201, 202, 204]:
# Success
log.info(
f"Webhook delivered successfully to {webhook_url} "
f"(attempt {attempt}, {response_time_ms:.0f}ms, status {response.status_code})"
)
# Log successful attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=True,
status_code=response.status_code,
response_time_ms=response_time_ms
)
return True
else:
# Non-2xx response
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
log.warning(
f"Webhook delivery failed to {webhook_url} "
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
)
# Log failed attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=False,
status_code=response.status_code,
error_message=error_msg,
response_time_ms=response_time_ms
)
except httpx.TimeoutException as e:
error_msg = f"Timeout after {self.timeout}s"
log.warning(
f"Webhook delivery timeout to {webhook_url} "
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
)
# Log timeout attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=False,
error_message=error_msg
)
except Exception as e:
error_msg = f"{type(e).__name__}: {str(e)}"
log.error(
f"Webhook delivery error to {webhook_url} "
f"(attempt {attempt}/{self.max_retries}): {error_msg}"
)
# Log error attempt
if db and job_id:
await db.log_webhook_attempt(
job_id=job_id,
attempt_number=attempt,
success=False,
error_message=error_msg
)
# Retry with exponential backoff
if attempt < self.max_retries:
retry_delay = self.initial_retry_delay * (2 ** (attempt - 1))
log.info(f"Retrying in {retry_delay:.1f}s...")
await asyncio.sleep(retry_delay)
# All retries failed
log.error(
f"Webhook delivery failed to {webhook_url} after {self.max_retries} attempts"
)
return False
async def send_job_completed_webhook(
    self,
    webhook_url: str,
    job_id: UUID,
    status: str,
    reviews_count: Optional[int] = None,
    scrape_time: Optional[float] = None,
    error_message: Optional[str] = None,
    reviews_url: Optional[str] = None,
    secret: Optional[str] = None,
    db=None
) -> bool:
    """
    Build a ``job.<status>`` event and deliver it through ``send_webhook``.

    Every event carries ``event``, ``job_id``, ``status`` and ``timestamp``.
    A ``completed`` event additionally carries ``reviews_count``,
    ``scrape_time`` and ``reviews_url``; a ``failed`` event additionally
    carries ``error_message``. Any other status is sent with the base
    fields only.

    Args:
        webhook_url: URL to send webhook to
        job_id: Job UUID (stringified in the payload)
        status: Job status ('completed' or 'failed')
        reviews_count: Number of reviews scraped (completed events only)
        scrape_time: Time taken in seconds (completed events only)
        error_message: Error message (failed events only)
        reviews_url: URL to retrieve reviews (completed events only)
        secret: Webhook secret, forwarded to ``send_webhook``
        db: Database manager for logging, forwarded to ``send_webhook``

    Returns:
        True if delivery succeeded
    """
    # Base fields first: insertion order is the order the payload keys
    # are handed to send_webhook in.
    event_body = {
        "event": f"job.{status}",
        "job_id": str(job_id),
        "status": status,
        "timestamp": datetime.utcnow().isoformat() + "Z"
    }

    # Status-specific extras.
    if status == "failed":
        event_body["error_message"] = error_message
    elif status == "completed":
        event_body["reviews_count"] = reviews_count
        event_body["scrape_time"] = scrape_time
        event_body["reviews_url"] = reviews_url

    delivered = await self.send_webhook(
        webhook_url=webhook_url,
        payload=event_body,
        secret=secret,
        job_id=job_id,
        db=db
    )
    return delivered
class WebhookDispatcher:
    """
    Background webhook dispatcher that processes pending webhooks.

    Polls the database on a fixed interval and sends a job-completion
    webhook for every job returned by ``get_pending_jobs_with_webhooks``.
    """

    def __init__(self, db, interval_seconds: int = 30):
        """
        Initialize webhook dispatcher.

        Args:
            db: Database manager instance
            interval_seconds: How often to check for pending webhooks
        """
        self.db = db
        self.interval = interval_seconds
        self.webhook_manager = WebhookManager()
        self.running = False

    async def start(self):
        """
        Start the background webhook dispatcher.

        Loops until ``stop()`` clears ``self.running``, processing pending
        webhooks and then sleeping ``self.interval`` seconds each cycle.
        """
        self.running = True
        log.info("Webhook dispatcher started")

        while self.running:
            try:
                await self.process_pending_webhooks()
            except Exception as e:
                # Top-level boundary: one bad cycle must not kill the loop,
                # but keep the traceback so the failure can be diagnosed.
                log.error(f"Error in webhook dispatcher: {e}", exc_info=True)

            await asyncio.sleep(self.interval)

    def stop(self):
        """
        Stop the background webhook dispatcher.

        Only clears the flag; the loop in ``start()`` notices it the next
        time it evaluates its ``while`` condition.
        """
        self.running = False
        log.info("Webhook dispatcher stopped")

    async def process_pending_webhooks(self):
        """
        Process all pending webhooks.

        Fetches up to 100 jobs with pending webhooks and attempts delivery
        for each one. A failure on one job is logged and does not prevent
        the remaining jobs from being attempted.
        """
        import os  # local import: hoisted out of the per-job loop

        # Get jobs with pending webhooks
        jobs = await self.db.get_pending_jobs_with_webhooks(limit=100)

        if not jobs:
            return

        log.info(f"Processing {len(jobs)} pending webhooks...")

        # Resolve the API base URL once per batch; strip a trailing slash so
        # a configured "https://host/" does not yield "https://host//jobs/...".
        api_base_url = os.getenv('API_BASE_URL', 'http://localhost:8000').rstrip('/')

        delivered = 0
        for job in jobs:
            # Bound before the try so the error handler never has to index
            # into `job` again: re-reading job['job_id'] there would raise a
            # second KeyError and abort the rest of the batch.
            job_id = None
            try:
                job_id = job['job_id']
                webhook_url = job['webhook_url']
                webhook_secret = job.get('webhook_secret')
                status = job['status']

                # The reviews URL is only sent for completed jobs
                reviews_url = f"{api_base_url}/jobs/{job_id}/reviews"

                # Send webhook
                succeeded = await self.webhook_manager.send_job_completed_webhook(
                    webhook_url=webhook_url,
                    job_id=job_id,
                    status=status,
                    reviews_count=job.get('reviews_count'),
                    scrape_time=job.get('scrape_time'),
                    error_message=job.get('error_message'),
                    reviews_url=reviews_url if status == 'completed' else None,
                    secret=webhook_secret,
                    db=self.db
                )
                if succeeded:
                    delivered += 1

            except Exception as e:
                log.error(f"Error processing webhook for job {job_id}: {e}")

        log.info(f"Processed {len(jobs)} webhooks ({delivered} delivered)")
# Webhook verification helper for client implementations
def verify_webhook_signature(payload: str, signature: str, secret: str) -> bool:
    """
    Verify webhook signature (for client-side verification).

    Recomputes the HMAC-SHA256 of ``payload`` keyed with ``secret`` and
    compares it, in constant time, against the hex digest carried in
    ``signature``. Malformed signatures (missing, wrong prefix, or
    containing non-ASCII characters) are rejected rather than raising.

    Args:
        payload: Raw JSON payload string
        signature: Signature from X-Webhook-Signature header (format: "sha256=...")
        secret: Webhook secret

    Returns:
        True if signature is valid

    Example:
        @app.post("/webhook")
        async def handle_webhook(request: Request):
            payload = await request.body()
            signature = request.headers.get("X-Webhook-Signature")

            if not verify_webhook_signature(payload.decode(), signature, WEBHOOK_SECRET):
                raise HTTPException(status_code=401, detail="Invalid signature")

            # Process webhook...
    """
    prefix = "sha256="
    if not signature or not signature.startswith(prefix):
        return False

    provided_digest = signature[len(prefix):]

    computed_digest = hmac.new(
        secret.encode('utf-8'),
        payload.encode('utf-8'),
        hashlib.sha256
    ).hexdigest()

    # Compare as bytes: hmac.compare_digest raises TypeError when handed a
    # str containing non-ASCII characters, and the header value is supplied
    # by the remote party. Encoding first turns that case into a mismatch.
    return hmac.compare_digest(
        provided_digest.encode('utf-8'),
        computed_digest.encode('utf-8')
    )