Phase 0: Project restructure to ReviewIQ platform architecture
New structure: - scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py) - scrapers/base.py (BaseScraper interface) - scrapers/registry.py (ScraperRegistry for version routing) - core/database.py, models.py, config.py, enums.py - utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py - workers/chrome_pool.py - services/webhook_service.py - api/ routes structure (empty, ready for Phase 2) - tests/ structure mirroring source All imports updated in: - api_server_production.py (7 import paths updated) - utils/health_checks.py (scraper import path) Legacy modules moved to modules/_legacy/: - data_storage.py, image_handler.py, s3_handler.py (unused) Syntax verified, frontend build passing. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
0
utils/__init__.py
Normal file
0
utils/__init__.py
Normal file
666
utils/crash_analyzer.py
Normal file
666
utils/crash_analyzer.py
Normal file
@@ -0,0 +1,666 @@
|
||||
"""
|
||||
Crash Pattern Analyzer Module
|
||||
|
||||
Provides deep analysis of scraper crashes with pattern detection,
|
||||
confidence scoring, and auto-fix parameter suggestions.
|
||||
|
||||
Builds on top of the basic classify_crash function in scraper_clean.py
|
||||
with more sophisticated pattern matching and multi-signal analysis.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
import re
|
||||
|
||||
|
||||
@dataclass
class CrashAnalysis:
    """
    Outcome of analysing one crash report.

    Attributes:
        pattern: Name of the crash pattern that was selected
            (for example "memory_exhaustion" or "dom_bloat").
        confidence: Score between 0.0 and 1.0 accumulated from the
            signals that matched.
        description: Human-readable summary of why the pattern was chosen.
        suggested_fix: Recommended action to avoid the same crash again.
        auto_fix_params: Parameters that may be applied automatically to
            prevent recurrence, or None when there are none.
    """

    # Pattern identifier, e.g. "memory_exhaustion", "dom_bloat", "rate_limited"
    pattern: str
    # Confidence in the match, from 0.0 to 1.0
    confidence: float
    # Explanation of the matched signals
    description: str
    # Advice for preventing the crash
    suggested_fix: str
    # Automatically applicable parameters, if any
    auto_fix_params: Optional[Dict[str, Any]]
|
||||
|
||||
|
||||
# Detection thresholds used by the pattern checks below
MEMORY_EXHAUSTION_THRESHOLD_MB = 1_500  # peak memory in MB (1.5GB)
MEMORY_GROWTH_RATE_THRESHOLD_MB_S = 10  # memory growth in MB per second
DOM_BLOAT_THRESHOLD = 50_000  # DOM node count
SCROLL_TIMEOUT_MIN_SCROLLS = 10  # minimum count before scroll_timeout is considered
|
||||
|
||||
|
||||
# Parameters applied automatically for each recognised crash pattern.
# The "target_reviews" value "current - 10%" is a marker string, not a number.
AUTO_FIX_PARAMS = dict(
    memory_exhaustion=dict(
        max_reviews=500,
        restart_browser_after=200,
    ),
    dom_bloat=dict(
        scroll_cleanup=True,
        lazy_load=True,
    ),
    rate_limited=dict(
        delay_multiplier=2.0,
        use_different_proxy=True,
    ),
    consent_loop=dict(
        skip_consent_retries=True,
    ),
    scroll_timeout=dict(
        reduce_target=True,
        target_reviews="current - 10%",
    ),
    element_stale=dict(
        retry_with_fresh_elements=True,
    ),
)
|
||||
|
||||
|
||||
def _calculate_memory_growth_rate(metrics_history: List[Dict]) -> Optional[float]:
|
||||
"""
|
||||
Calculate memory growth rate in MB/s from metrics history.
|
||||
|
||||
Args:
|
||||
metrics_history: List of metric samples with timestamp_ms and memory_mb
|
||||
|
||||
Returns:
|
||||
Growth rate in MB/s, or None if cannot be calculated
|
||||
"""
|
||||
if not metrics_history or len(metrics_history) < 2:
|
||||
return None
|
||||
|
||||
# Filter samples that have valid memory readings
|
||||
valid_samples = [
|
||||
m for m in metrics_history
|
||||
if m.get('memory_mb') is not None and m.get('timestamp_ms') is not None
|
||||
]
|
||||
|
||||
if len(valid_samples) < 2:
|
||||
return None
|
||||
|
||||
# Use first and last valid samples
|
||||
first = valid_samples[0]
|
||||
last = valid_samples[-1]
|
||||
|
||||
time_delta_s = (last['timestamp_ms'] - first['timestamp_ms']) / 1000
|
||||
if time_delta_s <= 0:
|
||||
return None
|
||||
|
||||
memory_delta_mb = last['memory_mb'] - first['memory_mb']
|
||||
return memory_delta_mb / time_delta_s
|
||||
|
||||
|
||||
def _get_max_memory(metrics_history: List[Dict]) -> Optional[int]:
|
||||
"""Get maximum memory usage from metrics history."""
|
||||
if not metrics_history:
|
||||
return None
|
||||
|
||||
memories = [m.get('memory_mb') for m in metrics_history if m.get('memory_mb') is not None]
|
||||
return max(memories) if memories else None
|
||||
|
||||
|
||||
def _get_max_dom_nodes(metrics_history: List[Dict]) -> Optional[int]:
|
||||
"""Get maximum DOM node count from metrics history."""
|
||||
if not metrics_history:
|
||||
return None
|
||||
|
||||
nodes = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes') is not None]
|
||||
return max(nodes) if nodes else None
|
||||
|
||||
|
||||
def _check_memory_exhaustion(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Score how strongly a crash looks like memory exhaustion.

    Signals, in the order they are evaluated:
      * peak memory at/above the threshold (+0.5) or within 80% of it (+0.3)
      * memory growth rate at/above the growth threshold (+0.3)
      * first memory-related keyword found in the error message (+0.2)
      * first log message mentioning memory together with
        "high", "warning" or "exceeded" (+0.1)

    Returns:
        Tuple of (confidence, description); confidence is capped at 1.0 and
        the description joins the matched signals with "; ".
    """
    score = 0.0
    evidence = []

    # Peak memory usage
    peak_mb = _get_max_memory(metrics_history)
    if peak_mb is not None:
        near_limit = MEMORY_EXHAUSTION_THRESHOLD_MB * 0.8
        if peak_mb >= MEMORY_EXHAUSTION_THRESHOLD_MB:
            score += 0.5
            evidence.append(f"Memory reached {peak_mb}MB (threshold: {MEMORY_EXHAUSTION_THRESHOLD_MB}MB)")
        elif peak_mb >= near_limit:
            score += 0.3
            evidence.append(f"Memory at {peak_mb}MB approaching threshold")

    # Speed of memory growth
    rate = _calculate_memory_growth_rate(metrics_history)
    if rate is not None and rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S:
        score += 0.3
        evidence.append(f"Memory growing at {rate:.1f}MB/s (threshold: {MEMORY_GROWTH_RATE_THRESHOLD_MB_S}MB/s)")

    # Memory-related wording in the error; only the first hit counts
    lowered = error_message.lower()
    candidates = ('memory', 'heap', 'out of memory', 'oom', 'aw, snap', 'status_access_violation')
    matched = next((word for word in candidates if word in lowered), None)
    if matched is not None:
        score += 0.2
        evidence.append(f"Error contains '{matched}'")

    # Memory warnings in the logs; one matching entry is enough
    def _is_memory_warning(entry: Dict) -> bool:
        text = entry.get('message', '').lower()
        return 'memory' in text and any(flag in text for flag in ('high', 'warning', 'exceeded'))

    if any(_is_memory_warning(entry) for entry in logs):
        score += 0.1
        evidence.append("Memory warning found in logs")

    if evidence:
        summary = "; ".join(evidence)
    else:
        summary = "No memory exhaustion signals detected"
    return min(score, 1.0), summary
|
||||
|
||||
|
||||
def _check_dom_bloat(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Score how strongly a crash looks like DOM bloat.

    Signals, in the order they are evaluated:
      * peak DOM node count at/above the threshold (+0.6) or within 80% of it (+0.3)
      * first DOM-related keyword found in the error message (+0.2)
      * peak memory of at least 800MB (+0.1)
      * first log message mentioning "dom" together with
        "large", "cleanup" or "remove" (+0.1)

    Returns:
        Tuple of (confidence, description); confidence is capped at 1.0 and
        the description joins the matched signals with "; ".
    """
    score = 0.0
    evidence = []

    # Peak DOM size
    peak_nodes = _get_max_dom_nodes(metrics_history)
    if peak_nodes is not None:
        near_limit = DOM_BLOAT_THRESHOLD * 0.8
        if peak_nodes >= DOM_BLOAT_THRESHOLD:
            score += 0.6
            evidence.append(f"DOM nodes reached {peak_nodes} (threshold: {DOM_BLOAT_THRESHOLD})")
        elif peak_nodes >= near_limit:
            score += 0.3
            evidence.append(f"DOM nodes at {peak_nodes} approaching threshold")

    # DOM-related wording in the error; only the first hit counts
    lowered = error_message.lower()
    candidates = ('dom', 'element', 'node', 'render', 'paint', 'layout')
    matched = next((word for word in candidates if word in lowered), None)
    if matched is not None:
        score += 0.2
        evidence.append(f"Error contains '{matched}'")

    # Elevated memory as a supporting signal (DOM bloat often causes memory issues)
    peak_mb = _get_max_memory(metrics_history)
    if peak_mb is not None and peak_mb >= 800:  # 800MB
        score += 0.1
        evidence.append(f"Memory also elevated ({peak_mb}MB)")

    # DOM-related messages in the logs; one matching entry is enough
    def _is_dom_warning(entry: Dict) -> bool:
        text = entry.get('message', '').lower()
        return 'dom' in text and any(flag in text for flag in ('large', 'cleanup', 'remove'))

    if any(_is_dom_warning(entry) for entry in logs):
        score += 0.1
        evidence.append("DOM warning found in logs")

    if evidence:
        summary = "; ".join(evidence)
    else:
        summary = "No DOM bloat signals detected"
    return min(score, 1.0), summary
|
||||
|
||||
|
||||
def _check_rate_limited(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for rate limiting pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for rate limit indicators
|
||||
error_lower = error_message.lower()
|
||||
if '429' in error_message:
|
||||
confidence += 0.6
|
||||
signals.append("HTTP 429 status code in error")
|
||||
|
||||
rate_keywords = ['rate limit', 'too many requests', 'unusual traffic', 'captcha', 'blocked']
|
||||
for keyword in rate_keywords:
|
||||
if keyword in error_lower:
|
||||
confidence += 0.4
|
||||
signals.append(f"Error contains '{keyword}'")
|
||||
break
|
||||
|
||||
# Check logs for rate limiting signals
|
||||
rate_log_count = 0
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
network = log_entry.get('network', {})
|
||||
status = network.get('status')
|
||||
|
||||
if status == 429:
|
||||
rate_log_count += 1
|
||||
confidence += 0.2
|
||||
|
||||
if 'unusual traffic' in msg or 'rate' in msg or 'blocked' in msg:
|
||||
rate_log_count += 1
|
||||
confidence += 0.1
|
||||
|
||||
if rate_log_count > 0:
|
||||
signals.append(f"Found {rate_log_count} rate-limiting indicators in logs")
|
||||
|
||||
description = "; ".join(signals) if signals else "No rate limiting signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_consent_loop(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for consent popup loop pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for consent keywords
|
||||
error_lower = error_message.lower()
|
||||
if 'consent' in error_lower:
|
||||
confidence += 0.3
|
||||
signals.append("Error mentions consent")
|
||||
|
||||
# Count consent-related log entries
|
||||
consent_count = 0
|
||||
consent_messages = []
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
if 'consent' in msg:
|
||||
consent_count += 1
|
||||
consent_messages.append(msg[:50])
|
||||
|
||||
# Multiple consent messages indicate a loop
|
||||
if consent_count >= 3:
|
||||
confidence += 0.5
|
||||
signals.append(f"Consent popup appeared {consent_count} times in logs")
|
||||
elif consent_count >= 2:
|
||||
confidence += 0.3
|
||||
signals.append(f"Consent popup appeared {consent_count} times")
|
||||
elif consent_count == 1:
|
||||
confidence += 0.1
|
||||
signals.append("Single consent popup detected")
|
||||
|
||||
# Check for timeout after consent handling
|
||||
if 'timeout' in error_lower and consent_count > 0:
|
||||
confidence += 0.2
|
||||
signals.append("Timeout occurred with consent activity")
|
||||
|
||||
description = "; ".join(signals) if signals else "No consent loop signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_scroll_timeout(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict],
    state: Optional[Dict] = None
) -> tuple[float, str]:
    """
    Check for scroll timeout pattern (no new reviews after many scrolls).

    Signals, in the order they are evaluated:
      * "timeout" in the lower-cased error message (+0.2)
      * number of log messages containing "recovery attempt":
        at least SCROLL_TIMEOUT_MIN_SCROLLS (+0.5), at least 5 (+0.3)
      * any log message containing "no new" or "stuck" (+0.2)
      * the non-zero 'reviews_count' readings among the last five metric
        samples are all identical, with at least two such readings (+0.2)

    Args:
        error_message: The exception message.
        metrics_history: Sampled metrics; 'reviews_count' is read from them.
        logs: Log entries recorded before the crash.
        state: Accepted for interface compatibility; no signal is currently
            derived from it.

    Returns:
        Tuple of (confidence, description); confidence is capped at 1.0 and
        the description joins the matched signals with "; ".
    """
    confidence = 0.0
    signals = []

    # Check error for timeout indicators
    error_lower = error_message.lower()
    if 'timeout' in error_lower:
        confidence += 0.2
        signals.append("Timeout in error message")

    # Count recovery attempts in logs (indicate stuck scrolling)
    recovery_count = 0
    no_new_count = 0
    for log_entry in logs:
        msg = log_entry.get('message', '').lower()
        if 'recovery attempt' in msg:
            recovery_count += 1
        if 'no new' in msg or 'stuck' in msg:
            no_new_count += 1

    if recovery_count >= SCROLL_TIMEOUT_MIN_SCROLLS:
        confidence += 0.5
        signals.append(f"Made {recovery_count} recovery attempts")
    elif recovery_count >= 5:
        confidence += 0.3
        signals.append(f"Made {recovery_count} recovery attempts")

    if no_new_count > 0:
        confidence += 0.2
        signals.append(f"Found {no_new_count} 'no new reviews' log entries")

    # Check if reviews stopped growing
    if metrics_history and len(metrics_history) >= 5:
        recent_counts = [m.get('reviews_count', 0) for m in metrics_history[-5:] if m.get('reviews_count')]
        # A plateau needs at least two readings to compare; a single reading
        # is trivially "all equal" and would otherwise be a false positive.
        if len(recent_counts) >= 2 and len(set(recent_counts)) == 1:
            confidence += 0.2
            signals.append(f"Review count stuck at {recent_counts[0]}")

    description = "; ".join(signals) if signals else "No scroll timeout signals detected"
    return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_element_stale(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for stale element reference pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for stale element indicators
|
||||
error_lower = error_message.lower()
|
||||
stale_keywords = [
|
||||
'stale element', 'staleelement', 'stale_element',
|
||||
'element is not attached', 'element reference',
|
||||
'no such element', 'element not found',
|
||||
'element is no longer valid'
|
||||
]
|
||||
|
||||
for keyword in stale_keywords:
|
||||
if keyword in error_lower:
|
||||
confidence += 0.6
|
||||
signals.append(f"Error contains '{keyword}'")
|
||||
break
|
||||
|
||||
# Check logs for stale element patterns
|
||||
stale_log_count = 0
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
for keyword in stale_keywords:
|
||||
if keyword in msg:
|
||||
stale_log_count += 1
|
||||
break
|
||||
|
||||
if stale_log_count > 0:
|
||||
confidence += 0.2
|
||||
signals.append(f"Found {stale_log_count} stale element references in logs")
|
||||
|
||||
# Check if DOM was changing rapidly (indicates dynamic page)
|
||||
if metrics_history and len(metrics_history) >= 3:
|
||||
dom_counts = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes')]
|
||||
if len(dom_counts) >= 3:
|
||||
# Calculate variance
|
||||
avg = sum(dom_counts) / len(dom_counts)
|
||||
variance = sum((x - avg) ** 2 for x in dom_counts) / len(dom_counts)
|
||||
std_dev = variance ** 0.5
|
||||
# High variance indicates rapidly changing DOM
|
||||
if std_dev > 1000:
|
||||
confidence += 0.2
|
||||
signals.append(f"High DOM variability (std dev: {std_dev:.0f})")
|
||||
|
||||
description = "; ".join(signals) if signals else "No stale element signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def analyze_crash(crash_report: Dict) -> CrashAnalysis:
    """
    Analyze a crash report to determine the most likely crash pattern.

    Runs every pattern check against the report and selects the pattern with
    the highest confidence (the earliest check wins ties). When that
    confidence is below 0.2, the report's basic 'crash_type' is mapped onto a
    pattern with a fixed confidence of 0.3, or the result is 'unknown' with
    confidence 0.0 when no mapping exists.

    Args:
        crash_report: Dictionary containing:
            - error_message: str - The exception message
            - metrics_history: List[Dict] - Sampled metrics with timestamp_ms, memory_mb, dom_nodes
            - logs_before_crash: List[Dict] - Recent log entries before the crash
            - state: Optional[Dict] - Scraper state (reviews_extracted, scroll_count, etc.)
            - crash_type: Optional[str] - Basic crash classification from classify_crash()
            Keys that are missing or explicitly set to None are treated as
            empty values.

    Returns:
        CrashAnalysis with the highest-confidence pattern match
    """
    # dict.get(key, default) only applies the default when the key is absent;
    # `or` also covers keys present with a None value, which would otherwise
    # raise in the checks (None.lower(), iterating None).
    error_message = crash_report.get('error_message') or ''
    metrics_history = crash_report.get('metrics_history') or []
    logs = crash_report.get('logs_before_crash') or []
    state = crash_report.get('state') or {}
    basic_type = crash_report.get('crash_type') or 'unknown'

    # Run all pattern checks; insertion order decides ties in max() below
    pattern_results = {
        'memory_exhaustion': _check_memory_exhaustion(error_message, metrics_history, logs),
        'dom_bloat': _check_dom_bloat(error_message, metrics_history, logs),
        'rate_limited': _check_rate_limited(error_message, metrics_history, logs),
        'consent_loop': _check_consent_loop(error_message, metrics_history, logs),
        'scroll_timeout': _check_scroll_timeout(error_message, metrics_history, logs, state),
        'element_stale': _check_element_stale(error_message, metrics_history, logs),
    }

    # Find the pattern with highest confidence
    pattern_name, (confidence, description) = max(
        pattern_results.items(), key=lambda item: item[1][0]
    )

    # If confidence is too low, fall back to basic classification
    if confidence < 0.2:
        # Map basic crash types to our patterns
        basic_to_pattern = {
            'memory_exhaustion': 'memory_exhaustion',
            'tab_crash': 'memory_exhaustion',  # Tab crashes often from memory
            'timeout': 'scroll_timeout',
            'element_not_found': 'element_stale',
            'rate_limited': 'rate_limited',
            'network_failure': 'rate_limited',  # Could be blocking
        }

        if basic_type in basic_to_pattern:
            pattern_name = basic_to_pattern[basic_type]
            confidence = 0.3  # Low confidence fallback
            description = f"Inferred from basic crash type '{basic_type}'"
        else:
            pattern_name = 'unknown'
            confidence = 0.0
            description = f"Unable to determine crash pattern (basic type: {basic_type})"

    # Generate suggested fix based on pattern
    suggested_fixes = {
        'memory_exhaustion': (
            "Reduce batch size and restart browser more frequently. "
            "Consider limiting max_reviews to 500 and restarting browser after every 200 reviews."
        ),
        'dom_bloat': (
            "Enable DOM cleanup during scrolling. "
            "Hide processed review cards and remove separator elements to keep DOM light."
        ),
        'rate_limited': (
            "Increase delays between requests and consider rotating proxies. "
            "Double the delay multiplier and switch to a different proxy if available."
        ),
        'consent_loop': (
            "Skip consent handling after initial attempt to avoid infinite loops. "
            "The consent popup may be appearing due to cookie clearing or navigation issues."
        ),
        'scroll_timeout': (
            "The page may have stopped loading new reviews. "
            "Try reducing the target review count by 10% and accepting partial results."
        ),
        'element_stale': (
            "Page elements are being removed/replaced during scraping. "
            "Retry operations with freshly-located elements and add defensive waits."
        ),
        'unknown': (
            "Unable to determine specific crash cause. "
            "Review logs and consider restarting with fresh browser session."
        )
    }

    suggested_fix = suggested_fixes.get(pattern_name, suggested_fixes['unknown'])
    # 'unknown' has no entry in AUTO_FIX_PARAMS, so this is None in that case
    auto_fix_params = AUTO_FIX_PARAMS.get(pattern_name)

    return CrashAnalysis(
        pattern=pattern_name,
        confidence=confidence,
        description=description,
        suggested_fix=suggested_fix,
        auto_fix_params=auto_fix_params
    )
|
||||
|
||||
|
||||
def get_auto_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """
    Look up the auto-fix parameters registered for a crash pattern.

    Args:
        pattern: The crash pattern name

    Returns:
        The AUTO_FIX_PARAMS entry for the pattern (the stored dictionary
        itself, not a copy), or None if the pattern is not recognized
    """
    try:
        return AUTO_FIX_PARAMS[pattern]
    except KeyError:
        return None
|
||||
|
||||
|
||||
def apply_auto_fix(pattern: str, current_params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Produce a copy of the scraper parameters with a pattern's fixes applied.

    Most fix entries are copied into the result under their own key. Two are
    translated instead:
      * 'target_reviews' with the marker value 'current - 10%' lowers
        'max_reviews' to 90% of its current value (1000 is assumed when unset)
      * 'delay_multiplier' multiplies 'scroll_delay' (1.0 is assumed when unset)

    Args:
        pattern: The crash pattern name
        current_params: Current scraper parameters; left unmodified

    Returns:
        New parameters dictionary with the fixes applied; identical in
        content to current_params when the pattern has no registered fixes
    """
    fixes = AUTO_FIX_PARAMS.get(pattern, {})
    adjusted = current_params.copy()

    for name, setting in fixes.items():
        if name == 'target_reviews' and setting == 'current - 10%':
            # Marker value: shrink the review target by 10%
            adjusted['max_reviews'] = int(adjusted.get('max_reviews', 1000) * 0.9)
            continue

        if name == 'delay_multiplier':
            # Scale the existing scroll delay rather than storing the multiplier
            adjusted['scroll_delay'] = adjusted.get('scroll_delay', 1.0) * setting
            continue

        adjusted[name] = setting

    return adjusted
|
||||
|
||||
|
||||
def summarize_crash_patterns(crash_reports: List[Dict]) -> Dict[str, Any]:
    """
    Aggregate the analysis of several crash reports.

    Each report is run through analyze_crash() and grouped by the pattern it
    was assigned.

    Args:
        crash_reports: List of crash report dictionaries

    Returns:
        Dictionary with:
            - total_crashes: number of reports given
            - patterns: per pattern, its count, percentage of all reports
              and average confidence
            - most_common: the pattern seen most often (None for no reports)
            - recommendations: for patterns seen at least twice that have
              auto-fix parameters, the pattern, its occurrences and those
              parameters, ordered by descending count
    """
    total = len(crash_reports) if crash_reports else 0
    if not crash_reports:
        return {
            'total_crashes': 0,
            'patterns': {},
            'most_common': None,
            'recommendations': []
        }

    # Confidence values observed for each pattern, in first-seen order
    confidences_by_pattern: Dict[str, List[float]] = {}
    for report in crash_reports:
        result = analyze_crash(report)
        confidences_by_pattern.setdefault(result.pattern, []).append(result.confidence)

    # Per-pattern statistics
    patterns_summary = {}
    for name, scores in confidences_by_pattern.items():
        occurrences = len(scores)
        patterns_summary[name] = {
            'count': occurrences,
            'percentage': occurrences / total * 100,
            'avg_confidence': sum(scores) / len(scores)
        }

    # Most frequent pattern; the first one seen wins ties
    if confidences_by_pattern:
        most_common = max(confidences_by_pattern, key=lambda name: len(confidences_by_pattern[name]))
    else:
        most_common = None

    # Recommend fixes only for patterns that recur
    ranked = sorted(patterns_summary.items(), key=lambda item: item[1]['count'], reverse=True)
    recommendations = []
    for name, stats in ranked:
        if stats['count'] < 2:
            continue
        fix_params = AUTO_FIX_PARAMS.get(name)
        if not fix_params:
            continue
        recommendations.append({
            'pattern': name,
            'occurrences': stats['count'],
            'auto_fix_params': fix_params
        })

    return {
        'total_crashes': total,
        'patterns': patterns_summary,
        'most_common': most_common,
        'recommendations': recommendations
    }
|
||||
391
utils/date_converter.py
Normal file
391
utils/date_converter.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""
|
||||
Date conversion utilities for Google Maps reviews.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
def relative_to_datetime(date_str: str, lang: str = "en") -> Optional[datetime]:
    """
    Convert a relative date string to a datetime object.

    The string is parsed with try_parse_date(), first in the requested
    language and then in the other languages that parse_relative_date()
    also tries ("en", "he", "th"). parse_relative_date() itself is not used
    here because it substitutes a random date when nothing matches, which
    would make a failed conversion indistinguishable from a real one.

    Args:
        date_str: The relative date string (e.g., "2 years ago")
        lang: Language code tried first (e.g. "en" or "he")

    Returns:
        datetime object computed relative to the current UTC time, or None
        if the string is empty or cannot be parsed in any language
    """
    if not date_str:
        return None

    try:
        now = datetime.utcnow()  # same reference clock as parse_relative_date()

        candidates = [lang] + [code for code in ("en", "he", "th") if code != lang.lower()]
        for code in candidates:
            iso_date = try_parse_date(date_str, code, now)
            # try_parse_date() hands the input back unchanged when it cannot parse it
            if iso_date != date_str:
                return datetime.fromisoformat(iso_date)

        return None
    except Exception as e:
        # Best-effort conversion: any failure is reported as "no date"
        log.debug(f"Failed to convert relative date '{date_str}': {e}")
        return None
|
||||
|
||||
|
||||
class DateConverter:
    """Helpers that turn string dates in review documents into datetime objects."""

    @staticmethod
    def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert the string dates of one document, modifying it in place.

        Steps:
          * the "date" field is removed; if the document has no usable
            "review_date", the removed value is parsed as a relative date
            and stored as "review_date" when parsing yields a value
          * "created_date", "last_modified_date" and "review_date" holding
            strings are parsed as ISO 8601 (a "Z" suffix is read as +00:00),
            falling back to relative-date parsing; a string that cannot be
            converted is left unchanged
          * the "date" field is removed from every dict inside
            "owner_responses"

        The language passed to the relative-date parser is the first key of
        the document's "description" mapping, or "en" when there is none.

        Args:
            doc: Document with string dates

        Returns:
            The same document object, with dates converted
        """
        # Drop the raw date string, using it to fill a missing review_date
        if "date" in doc:
            raw_date = doc.pop("date")
            if not doc.get("review_date"):
                lang = next(iter(doc.get("description", {}).keys()), "en")
                parsed = relative_to_datetime(raw_date, lang)
                if parsed:
                    doc["review_date"] = parsed

        # Convert the known date fields when they still hold strings
        for name in ("created_date", "last_modified_date", "review_date"):
            value = doc.get(name)
            if not isinstance(value, str):
                continue
            try:
                # ISO format first
                doc[name] = datetime.fromisoformat(value.replace('Z', '+00:00'))
            except (ValueError, TypeError):
                # Not ISO: treat it as a relative date
                lang = next(iter(doc.get("description", {}).keys()), "en")
                parsed = relative_to_datetime(value, lang)
                if parsed:
                    doc[name] = parsed

        # Strip the date strings nested in owner responses
        responses = doc.get("owner_responses")
        if isinstance(responses, dict):
            for reply in responses.values():
                if isinstance(reply, dict):
                    reply.pop("date", None)

        return doc

    @staticmethod
    def convert_dates_in_reviews(reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Convert the string dates of every review in the mapping.

        Args:
            reviews: Dictionary of review documents keyed by review id

        Returns:
            The same mapping, each review passed through
            convert_dates_in_document()
        """
        log.info("Converting string dates to datetime objects...")

        for review_id in reviews:
            reviews[review_id] = DateConverter.convert_dates_in_document(reviews[review_id])

        return reviews
|
||||
|
||||
|
||||
def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
    """
    Convert a relative review date such as "2 years ago" or "לפני 3 שנים"
    into an ISO 8601 formatted datetime string.

    Parsing is delegated to try_parse_date(): first with the given language,
    then with each of "en", "he" and "th" that differs from it. The first
    attempt that returns something other than the input string wins.

    Parameters:
        - date_str (str): the relative date string.
        - lang (str): language code to try first, e.g. "en" or "he".
        - now (Optional[datetime]): reference datetime; if None, the current
          UTC time from datetime.utcnow() is used.

    Returns:
        A string representing the calculated absolute datetime in ISO 8601 format.
        If parsing fails in every language, a random date between 1 and 365
        days before the reference time is returned instead, so the result is
        never the original string.
    """
    import random

    reference = datetime.utcnow() if now is None else now  # use UTC for consistency

    # Requested language first
    primary = try_parse_date(date_str, lang, reference)
    if primary != date_str:
        return primary

    # Then every other supported language
    for fallback_lang in ("en", "he", "th"):
        if fallback_lang == lang.lower():
            continue
        attempt = try_parse_date(date_str, fallback_lang, reference)
        if attempt != date_str:
            return attempt

    # Nothing matched: fabricate a date within the last year
    days_back = random.randint(1, 365)
    return (reference - timedelta(days=days_back)).isoformat()
|
||||
|
||||
|
||||
def try_parse_date(date_str: str, lang: str, now: datetime) -> str:
    """
    Attempt to parse a relative date string in one specific language.

    Supported languages are "en", "he" and "th" (matched case-insensitively).
    Recognised units are minute, hour, day, week, month (approximated as
    30 days) and year (approximated as 365 days).

    Parameters:
    - date_str (str): relative date text, e.g. "3 weeks ago", "לפני שנתיים",
      "3 วันที่แล้ว".
    - lang (str): language code selecting which pattern set to apply.
    - now (datetime): reference datetime the offset is subtracted from.

    Returns:
    The ISO 8601 string of ``now`` minus the parsed offset, or the original
    ``date_str`` unchanged when the text cannot be parsed (unknown language,
    no match, or an offset too large to represent).
    """
    lang_code = lang.lower()
    parsed = None  # becomes (count, unit) once something matches

    if lang_code == "en":
        match = _EN_RELATIVE.search(date_str)
        if match:
            token = match.group("num").lower()
            count = 1 if token in ("a", "an") else int(token)
            parsed = (count, match.group("unit").lower())

    elif lang_code == "he":
        # Remove the "לפני" ("ago") prefix if present.
        text = date_str.strip()
        if text.startswith("לפני"):
            text = text[len("לפני"):].strip()

        if text in _HE_DUAL_FORMS:
            # Dual forms fuse the number 2 into the unit word.
            parsed = _HE_DUAL_FORMS[text]
        else:
            match = _HE_RELATIVE.search(text)
            if match:
                token = match.group("num")
                # A missing number, or the words "אחד"/"אחת" (one), mean 1.
                count = int(token) if token and token.isdecimal() else 1
                parsed = (count, _HE_UNITS[match.group("unit")])

    elif lang_code == "th":
        # Thai patterns such as "3 วันที่แล้ว" (3 days ago); number is optional.
        match = _TH_RELATIVE.search(date_str)
        if match:
            token = match.group("num")
            count = int(token) if token else 1
            parsed = (count, _TH_UNITS[match.group("unit")])

    if parsed is None:
        return date_str

    count, unit = parsed
    try:
        return (now - _UNIT_SPANS[unit] * count).isoformat()
    except OverflowError:
        # Absurdly large counts cannot be represented; treat as unparsed.
        return date_str


# Length of one unit; months and years are deliberately approximate.
_UNIT_SPANS = {
    "minute": timedelta(minutes=1),
    "hour": timedelta(hours=1),
    "day": timedelta(days=1),
    "week": timedelta(weeks=1),
    "month": timedelta(days=30),
    "year": timedelta(days=365),
}

# English: a number or "a"/"an", then a unit, then "ago".
_EN_RELATIVE = re.compile(
    r'(?P<num>a|an|\d+)\s+(?P<unit>minute|hour|day|week|month|year)s?\s+ago',
    re.IGNORECASE,
)

# Hebrew dual forms, where the word itself means "two <unit>".
_HE_DUAL_FORMS = {
    "שעתיים": (2, "hour"),
    "יומיים": (2, "day"),
    "שבועיים": (2, "week"),
    "חודשיים": (2, "month"),
    "שנתיים": (2, "year"),
}

# Hebrew unit words (singular and plural) mapped to English unit names.
_HE_UNITS = {
    "דקה": "minute", "דקות": "minute",
    "שעה": "hour", "שעות": "hour",
    "יום": "day", "ימים": "day",
    "שבוע": "week", "שבועות": "week",
    "חודש": "month", "חודשים": "month",
    "שנה": "year", "שנים": "year",
}

# Hebrew: optional number (digits or the word "one"), then a unit word.
_HE_RELATIVE = re.compile(
    r'(?P<num>\d+|אחד|אחת)?\s*'
    r'(?P<unit>שנים|שנה|חודשים|חודש|שבועות|שבוע|ימים|יום|שעות|שעה|דקות|דקה)'
)

# Thai unit words mapped to English unit names.
_TH_UNITS = {
    "นาที": "minute",
    "ชั่วโมง": "hour",
    "วัน": "day",
    "สัปดาห์": "week",
    "เดือน": "month",
    "ปี": "year",
}

# Thai: optional number, a unit word, then "ที่แล้ว" ("ago").
_TH_RELATIVE = re.compile(
    r'(?P<num>\d+)?\s*(?P<unit>นาที|ชั่วโมง|วัน|สัปดาห์|เดือน|ปี)ที่แล้ว'
)
|
||||
|
||||
|
||||
# def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
|
||||
# """
|
||||
# Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
|
||||
# into an ISO formatted datetime string (UTC).
|
||||
#
|
||||
# For English, supported formats include:
|
||||
# - "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
|
||||
# For Hebrew, supported formats include:
|
||||
# - "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
|
||||
# "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
|
||||
#
|
||||
# Parameters:
|
||||
# - date_str (str): the relative date string.
|
||||
# - lang (str): "en" for English or "he" for Hebrew.
|
||||
# - now (Optional[datetime]): reference datetime; if None, current local time is used.
|
||||
#
|
||||
# Returns:
|
||||
# A string representing the calculated absolute datetime in ISO 8601 format,
|
||||
# or the original date_str if parsing fails.
|
||||
# """
|
||||
# if now is None:
|
||||
# now = datetime.utcnow() # use UTC for consistency
|
||||
#
|
||||
# delta = timedelta(0)
|
||||
#
|
||||
# if lang.lower() == "en":
|
||||
# # Pattern: capture number or "a"/"an", then unit.
|
||||
# pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
|
||||
# m = pattern.search(date_str)
|
||||
# if m:
|
||||
# num_str = m.group("num").lower()
|
||||
# num = 1 if num_str in ("a", "an") else int(num_str)
|
||||
# unit = m.group("unit").lower()
|
||||
# if unit == "day":
|
||||
# delta = timedelta(days=num)
|
||||
# elif unit == "week":
|
||||
# delta = timedelta(weeks=num)
|
||||
# elif unit == "month":
|
||||
# delta = timedelta(days=30 * num) # approximate
|
||||
# elif unit == "year":
|
||||
# delta = timedelta(days=365 * num) # approximate
|
||||
# else:
|
||||
# return date_str # return original if not matched
|
||||
# elif lang.lower() == "he":
|
||||
# # Remove the "לפני" prefix if present
|
||||
# text = date_str.strip()
|
||||
# if text.startswith("לפני"):
|
||||
# text = text[len("לפני"):].strip()
|
||||
#
|
||||
# # Handle special cases where the number and unit are combined:
|
||||
# special = {
|
||||
# "חודשיים": (2, "month"),
|
||||
# "שבועיים": (2, "week"),
|
||||
# "יומיים": (2, "day"),
|
||||
# }
|
||||
# if text in special:
|
||||
# num, unit = special[text]
|
||||
# else:
|
||||
# # Match optional number (or assume 1) and then a unit.
|
||||
# pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
|
||||
# re.IGNORECASE)
|
||||
# m = pattern.search(text)
|
||||
# if m:
|
||||
# num_str = m.group("num")
|
||||
# if not num_str:
|
||||
# num = 1
|
||||
# else:
|
||||
# try:
|
||||
# num = int(num_str)
|
||||
# except ValueError:
|
||||
# num = 1
|
||||
# unit_he = m.group("unit")
|
||||
# # Map the Hebrew unit (both singular and plural) to English unit names
|
||||
# if unit_he in ("יום", "ימים"):
|
||||
# unit = "day"
|
||||
# elif unit_he in ("שבוע", "שבועות"):
|
||||
# unit = "week"
|
||||
# elif unit_he in ("חודש", "חודשים"):
|
||||
# unit = "month"
|
||||
# elif unit_he in ("שנה", "שנים"):
|
||||
# unit = "year"
|
||||
# else:
|
||||
# unit = "day" # fallback
|
||||
# else:
|
||||
# return date_str # if nothing matches, return original text
|
||||
#
|
||||
# if unit == "day":
|
||||
# delta = timedelta(days=num)
|
||||
# elif unit == "week":
|
||||
# delta = timedelta(weeks=num)
|
||||
# elif unit == "month":
|
||||
# delta = timedelta(days=30 * num) # approximate
|
||||
# elif unit == "year":
|
||||
# delta = timedelta(days=365 * num) # approximate
|
||||
#
|
||||
# result = now - delta
|
||||
# return result.isoformat()
|
||||
|
||||
|
||||
# --- Example usage ---
|
||||
if __name__ == "__main__":
    # Demo: a pinned reference time keeps the printed output reproducible.
    reference_time = datetime(2025, 2, 5, 12, 0, 0)

    # (text, language hint) pairs; the first pairs English text with the
    # Hebrew hint, so it is only parsed via the cross-language retry.
    for text, lang in (
        ("a week ago", "he"),
        ("4 weeks ago", "en"),
        ("לפני 7 שנים", "he"),
        ("לפני חודשיים", "he"),
    ):
        iso_date = parse_relative_date(text, lang, now=reference_time)
        print(f"Original: {text} ({lang}) => ISO: {iso_date}")
|
||||
411
utils/health_checks.py
Normal file
411
utils/health_checks.py
Normal file
@@ -0,0 +1,411 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smart health check system with canary testing.
|
||||
Verifies that scraping actually works, not just that services are up.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional
|
||||
import os
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CanaryMonitor:
    """
    Background canary test monitor.

    Runs actual scraping tests periodically to verify the scraper works.
    This catches issues like:
    - Google Maps page structure changes
    - Broken CSS selectors
    - GDPR consent handling issues
    - Network/proxy problems
    - Chrome/browser issues
    """

    def __init__(
        self,
        db,
        interval_hours: int = 4,
        test_url: Optional[str] = None
    ):
        """
        Initialize canary monitor.

        Args:
            db: Database manager instance; must provide an awaitable
                ``save_canary_result(...)``.
            interval_hours: How often to run canary tests
            test_url: Optional test URL; falls back to the CANARY_TEST_URL
                environment variable, then to a built-in default place URL.
        """
        self.db = db
        self.interval = timedelta(hours=interval_hours)
        self.test_url = test_url or os.getenv(
            'CANARY_TEST_URL',
            'https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/'
        )

        self.running = False
        self.last_run: Optional[datetime] = None
        self.last_success: Optional[datetime] = None
        self.consecutive_failures = 0
        self.last_result: Optional[Dict[str, Any]] = None

    async def start(self):
        """Run canary tests in a loop until ``stop()`` clears ``running``."""
        self.running = True
        log.info(f"Canary monitor started (interval: {self.interval.total_seconds()/3600:.1f}h)")

        while self.running:
            try:
                await self.run_canary_test()
            except Exception as e:
                log.error(f"Canary test failed with exception: {e}")
                # run_canary_test() has already counted this failure before
                # re-raising; incrementing here as well would count a single
                # failed run twice and trigger alerts too early.

                # Alert if multiple consecutive failures
                if self.consecutive_failures >= 3:
                    await self.send_alert(
                        f"CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row! "
                        f"Last error: {str(e)[:200]}"
                    )

            # Sleep until next run
            await asyncio.sleep(self.interval.total_seconds())

    def stop(self):
        """Ask the monitoring loop to exit after its current iteration/sleep."""
        self.running = False
        log.info("Canary monitor stopped")

    async def run_canary_test(self):
        """
        Run a single canary test.

        This performs an actual scrape on the configured test URL and validates:
        - Scraping succeeds
        - Reviews are extracted
        - Review count is reasonable
        - Scrape time is reasonable
        - Data structure is valid

        The outcome is stored in ``last_result`` and persisted through
        ``db.save_canary_result``. Every failure path increments
        ``consecutive_failures`` exactly once.

        Raises:
            Exception: any non-timeout error is recorded and then re-raised so
                the caller can log/alert on it.
        """
        log.info(f"Running canary scrape test on {self.test_url[:60]}...")
        self.last_run = datetime.now()

        try:
            # Imported lazily, and inside the try so that a broken scraper
            # import is recorded as a canary failure like any other error.
            from scrapers.google_reviews.v1_0_0 import fast_scrape_reviews

            # Run actual scrape with timeout.
            # NOTE: wait_for stops waiting after the timeout, but the worker
            # thread started by to_thread cannot be interrupted and may keep
            # running in the background.
            result = await asyncio.wait_for(
                asyncio.to_thread(
                    fast_scrape_reviews,
                    url=self.test_url,
                    headless=True,
                    max_scrolls=10  # Limited for canary
                ),
                timeout=60  # Fail if takes > 60s
            )

            # Validate result
            checks = {
                "scrape_succeeded": result['success'],
                "got_reviews": result['count'] > 0,
                "reasonable_count": 10 <= result['count'] <= 500,
                "reasonable_time": result['time'] < 30,
                "data_structure_valid": self._validate_review_structure(result.get('reviews', []))
            }

            all_passed = all(checks.values())

            if all_passed:
                # Success!
                log.info(
                    f"Canary test PASSED: {result['count']} reviews in {result['time']:.1f}s"
                )
                self.consecutive_failures = 0
                self.last_success = datetime.now()
                self.last_result = {
                    "status": "pass",
                    "reviews_count": result['count'],
                    "scrape_time": result['time'],
                    "checks": checks
                }

                # Save to database
                await self.db.save_canary_result(
                    success=True,
                    reviews_count=result['count'],
                    scrape_time=result['time'],
                    metadata={"checks": checks}
                )

            else:
                # Validation failed
                failed_checks = [k for k, v in checks.items() if not v]
                log.error(
                    f"Canary test FAILED: validation failed on {failed_checks}"
                )
                self.consecutive_failures += 1
                self.last_result = {
                    "status": "fail",
                    "reviews_count": result['count'],
                    "scrape_time": result['time'],
                    "checks": checks,
                    "failed_checks": failed_checks
                }

                # Save to database
                await self.db.save_canary_result(
                    success=False,
                    reviews_count=result['count'],
                    scrape_time=result['time'],
                    error_message=f"Validation failed: {failed_checks}",
                    metadata={"checks": checks}
                )

                # Alert on failure
                if self.consecutive_failures >= 3:
                    await self.send_alert(
                        f"CRITICAL: Canary validation failed {self.consecutive_failures} times! "
                        f"Failed checks: {failed_checks}"
                    )

        except asyncio.TimeoutError:
            log.error("Canary test TIMEOUT (>60s)")
            self.consecutive_failures += 1
            self.last_result = {
                "status": "timeout",
                "error": "Scrape took longer than 60 seconds"
            }

            await self.db.save_canary_result(
                success=False,
                error_message="Timeout after 60 seconds"
            )

            if self.consecutive_failures >= 3:
                await self.send_alert(
                    f"CRITICAL: Canary timeout {self.consecutive_failures} times!"
                )

        except Exception as e:
            log.error(f"Canary test ERROR: {e}")
            self.consecutive_failures += 1
            self.last_result = {
                "status": "error",
                "error": str(e)
            }

            await self.db.save_canary_result(
                success=False,
                error_message=str(e)
            )

            raise  # Re-raise so the main loop can log and alert

    def _validate_review_structure(self, reviews) -> bool:
        """
        Validate that reviews have expected structure.

        Only the first review is inspected.

        Args:
            reviews: List of review dictionaries

        Returns:
            True if the list is non-empty and its first review contains the
            'author', 'rating' and 'date_text' keys.
        """
        if not reviews:
            return False

        # Check first review has required fields
        first_review = reviews[0]
        required_fields = ['author', 'rating', 'date_text']

        return all(field in first_review for field in required_fields)

    async def send_alert(self, message: str):
        """
        Send alert via configured channels.

        The message is always logged at CRITICAL level. When the
        SLACK_WEBHOOK_URL environment variable is set, it is also posted
        there; a failed Slack delivery is logged and otherwise ignored.

        Args:
            message: Alert message to send
        """
        log.critical(message)

        # TODO: Integrate with alerting systems
        # Examples:

        # Slack
        slack_webhook = os.getenv('SLACK_WEBHOOK_URL')
        if slack_webhook:
            try:
                import httpx
                async with httpx.AsyncClient() as client:
                    await client.post(
                        slack_webhook,
                        json={"text": message},
                        timeout=5.0
                    )
                log.info("Alert sent to Slack")
            except Exception as e:
                log.error(f"Failed to send Slack alert: {e}")

        # Email (example with SMTP)
        # smtp_config = os.getenv('SMTP_CONFIG')
        # if smtp_config:
        #     await send_email(
        #         to=os.getenv('ALERT_EMAIL'),
        #         subject="Scraper Canary Alert",
        #         body=message
        #     )

        # PagerDuty
        # pagerduty_key = os.getenv('PAGERDUTY_KEY')
        # if pagerduty_key:
        #     await trigger_pagerduty(message)

    def get_status(self) -> Dict[str, Any]:
        """
        Get current canary status.

        Returns:
            Status dictionary whose "status" is one of:
            - "unknown": no run has finished with a failure or a success yet
            - "failing": runs have failed and none has ever succeeded
            - "stale": the last success is more than 6 hours old
            - "healthy": the last success is at most 6 hours old
        """
        if not self.last_success:
            last_run = self.last_run.isoformat() if self.last_run else None

            if self.consecutive_failures > 0:
                # Never succeeded, but has failed: report that explicitly
                # instead of claiming that no test has run.
                return {
                    "status": "failing",
                    "message": (
                        f"No successful canary test yet "
                        f"({self.consecutive_failures} consecutive failures)"
                    ),
                    "last_run": last_run,
                    "consecutive_failures": self.consecutive_failures,
                    "last_result": self.last_result
                }

            return {
                "status": "unknown",
                "message": "No canary tests run yet",
                "last_run": last_run
            }

        age = datetime.now() - self.last_success
        max_age = timedelta(hours=6)  # Alert if no success in 6 hours

        if age > max_age:
            return {
                "status": "stale",
                "last_success": self.last_success.isoformat(),
                "age_hours": age.total_seconds() / 3600,
                "consecutive_failures": self.consecutive_failures,
                "message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
            }

        return {
            "status": "healthy",
            "last_success": self.last_success.isoformat(),
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "age_minutes": age.total_seconds() / 60,
            "consecutive_failures": self.consecutive_failures,
            "last_result": self.last_result
        }
|
||||
|
||||
|
||||
class HealthCheckSystem:
    """
    Complete health check system for production.

    Provides multiple levels of health checks:
    - Liveness: Is the server alive?
    - Readiness: Can it handle traffic?
    - Canary: Does scraping actually work?
    """

    def __init__(self, db):
        """
        Initialize health check system.

        Args:
            db: Database manager instance; ``db.pool.fetchval`` is used by the
                readiness check.
        """
        self.db = db
        self.canary = CanaryMonitor(db, interval_hours=4)
        # Handle of the background monitor task (None until start() is called).
        self._canary_task: Optional[asyncio.Task] = None

    async def start(self):
        """
        Start background health monitoring.

        Must be called from within a running event loop. Calling it again
        while the monitor task is still running is a no-op.
        """
        if self._canary_task is not None and not self._canary_task.done():
            return

        # Keep a strong reference: the event loop only holds weak references
        # to tasks, so a task nobody references can be garbage-collected
        # before it finishes.
        self._canary_task = asyncio.create_task(self.canary.start())

    def stop(self):
        """
        Stop background health monitoring.

        This only asks the monitor loop to exit; the task itself finishes once
        its current sleep or test completes.
        """
        self.canary.stop()

    async def check_liveness(self) -> Dict[str, Any]:
        """
        Liveness check: Is the server alive?

        This is a simple check that always succeeds if the server is running.
        Used by Kubernetes liveness probe - restart container if fails.

        Returns:
            Liveness status
        """
        return {
            "status": "alive",
            "timestamp": datetime.utcnow().isoformat()
        }

    async def check_readiness(self) -> Dict[str, Any]:
        """
        Readiness check: Can the server handle traffic?

        Checks if dependencies are available (currently: a ``SELECT 1``
        against the database pool).
        Used by Kubernetes readiness probe - remove from load balancer if fails.

        Returns:
            Readiness status with per-dependency results under "checks"
        """
        checks = {}

        # Check database; any failure is reported, never raised.
        try:
            await self.db.pool.fetchval("SELECT 1")
            checks["database"] = {"healthy": True}
        except Exception as e:
            checks["database"] = {"healthy": False, "error": str(e)}

        # Overall readiness
        all_healthy = all(c.get("healthy", False) for c in checks.values())

        return {
            "status": "ready" if all_healthy else "not_ready",
            "checks": checks,
            "timestamp": datetime.utcnow().isoformat()
        }

    async def check_canary(self) -> Dict[str, Any]:
        """
        Canary check: Does scraping actually work?

        Returns the latest canary test result.
        Used by external monitoring (PagerDuty, DataDog) for alerts.

        Returns:
            Canary status
        """
        return self.canary.get_status()

    async def get_detailed_health(self) -> Dict[str, Any]:
        """
        Get detailed health status of all components.

        Returns:
            Complete health status: "healthy" only when liveness is "alive",
            readiness is "ready" and the canary is "healthy" or "unknown";
            otherwise "degraded".
        """
        liveness = await self.check_liveness()
        readiness = await self.check_readiness()
        canary = await self.check_canary()

        overall_healthy = (
            liveness["status"] == "alive" and
            readiness["status"] == "ready" and
            canary["status"] in ["healthy", "unknown"]  # Unknown is OK (first run)
        )

        return {
            "status": "healthy" if overall_healthy else "degraded",
            "components": {
                "liveness": liveness,
                "readiness": readiness,
                "canary": canary
            },
            "timestamp": datetime.utcnow().isoformat()
        }
|
||||
307
utils/helpers.py
Normal file
307
utils/helpers.py
Normal file
@@ -0,0 +1,307 @@
|
||||
"""
|
||||
Utility functions for Google Maps Reviews Scraper.
|
||||
"""
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from datetime import timezone
|
||||
from functools import lru_cache
|
||||
from typing import List
|
||||
|
||||
from selenium.common.exceptions import (NoSuchElementException,
|
||||
StaleElementReferenceException,
|
||||
TimeoutException)
|
||||
from selenium.webdriver import Chrome
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
# Logger shared by the helpers in this module (named "scraper", not __name__)
log = logging.getLogger("scraper")

# Constants for language detection
# Matches any character in U+0590-U+05FF (the Unicode Hebrew block)
HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
# Matches any character in U+0E00-U+0E7F (the Unicode Thai block)
THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
def detect_lang(txt: str) -> str:
    """
    Guess the language of ``txt`` from the scripts it contains.

    Returns "he" if any Hebrew character is present, otherwise "th" if any
    Thai character is present, otherwise "en". Results are memoized.
    """
    # Checked in priority order: Hebrew wins over Thai when both appear.
    for code, charset in (("he", HEB_CHARS), ("th", THAI_CHARS)):
        if charset.search(txt):
            return code
    return "en"
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def safe_int(s: str | None) -> int:
|
||||
"""Safely convert string to integer, returning 0 if not possible"""
|
||||
m = re.search(r"\d+", s or "")
|
||||
return int(m.group()) if m else 0
|
||||
|
||||
|
||||
def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
    """
    Look up elements under ``el`` by CSS selector without raising.

    With ``all=True`` every match is returned; otherwise a list holding at
    most the first match. A missing or stale element yields an empty list.
    """
    try:
        if not all:
            single = el.find_element(By.CSS_SELECTOR, css)
            return [single] if single else []
        return el.find_elements(By.CSS_SELECTOR, css)
    except (NoSuchElementException, StaleElementReferenceException):
        return []
|
||||
|
||||
|
||||
def first_text(el: WebElement, css: str) -> str:
    """
    Return the stripped text of the first element matching ``css`` whose text
    is non-empty, or "" if there is none.

    Elements that went stale while being read are skipped.
    """
    for candidate in try_find(el, css, all=True):
        try:
            text = candidate.text.strip()
        except StaleElementReferenceException:
            continue
        if text:
            return text
    return ""
|
||||
|
||||
|
||||
def parse_date_to_iso(date_str: str) -> str:
    """
    Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.

    Relative English dates ("<n> <unit>s ago") are converted to the current
    UTC time minus the offset, truncated to whole seconds. Months are
    approximated as 30 days and years as 365 days; a missing number (e.g.
    "a day ago") counts as 1.

    Anything else, including absolute dates such as "January 2023" and
    "ago" strings with no recognised unit, is NOT parsed: the current UTC
    time is returned as a best-effort placeholder.

    Returns:
        A timezone-aware ISO 8601 string, or "" for empty input or an offset
        too large to represent.
    """
    # Imported locally because this module binds the name `datetime` to the
    # module itself, so `datetime.now(...)` is not reachable under that name
    # at module level.
    from datetime import datetime, timedelta, timezone

    if not date_str:
        return ""

    now = datetime.now(timezone.utc).replace(microsecond=0)
    lowered = date_str.lower()

    if "ago" not in lowered:
        # Absolute dates (month year format) are not parsed yet.
        return now.isoformat()

    # First number in the text, defaulting to 1 ("a week ago").
    number = re.search(r"\d+", date_str)
    count = int(number.group()) if number else 1

    # Checked in this order; the first keyword found in the text wins.
    unit_spans = (
        ("minute", timedelta(minutes=1)),
        ("hour", timedelta(hours=1)),
        ("day", timedelta(days=1)),
        ("week", timedelta(weeks=1)),
        ("month", timedelta(days=30)),   # approximate
        ("year", timedelta(days=365)),   # approximate
    )
    for keyword, span in unit_spans:
        if keyword in lowered:
            try:
                return (now - span * count).isoformat()
            except OverflowError:
                # Offset does not fit in a datetime/timedelta.
                return ""

    # "ago" without a recognisable unit: default to the current time.
    return now.isoformat()
|
||||
|
||||
|
||||
def first_attr(el: WebElement, css: str, attr: str) -> str:
    """
    Return the stripped value of attribute ``attr`` from the first element
    matching ``css`` that has a non-empty value, or "" if there is none.

    Elements that went stale while being read are skipped.
    """
    for candidate in try_find(el, css, all=True):
        try:
            value = (candidate.get_attribute(attr) or "").strip()
        except StaleElementReferenceException:
            continue
        if value:
            return value
    return ""
|
||||
|
||||
|
||||
def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
    """
    Click the first clickable element matching a CSS selector, if any.

    Elements already present are tried first; if none of them could be
    clicked, the function waits up to ``timeout`` seconds for a clickable
    match. When no element matches at all it returns immediately.

    Args:
        driver: WebDriver instance
        css: CSS selector for the element to click
        delay: Time to sleep after a successful click (seconds)
        timeout: Maximum time to wait for a clickable element (seconds)

    Returns:
        True if an element was clicked, False otherwise (errors are logged at
        debug level and never raised)
    """
    locator = (By.CSS_SELECTOR, css)

    try:
        candidates = driver.find_elements(*locator)
        if not candidates:
            # Nothing matches at all: no point in waiting.
            return False

        # Pass 1: click the first candidate that is visible and enabled.
        for candidate in candidates:
            try:
                if not (candidate.is_displayed() and candidate.is_enabled()):
                    continue
                candidate.click()
                time.sleep(delay)
                return True
            except Exception:
                # This candidate could not be clicked; move on to the next.
                continue

        # Pass 2: wait for any match to become clickable.
        try:
            target = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable(locator)
            )
            target.click()
            time.sleep(delay)
        except TimeoutException:
            return False
        return True

    except Exception as e:
        log.debug(f"Error in click_if: {str(e)}")
        return False
|
||||
|
||||
|
||||
def get_current_iso_date() -> str:
    """Return the current time as a timezone-aware (UTC) ISO 8601 string."""
    # Local import: supplies the datetime class regardless of what the
    # module-level name `datetime` is bound to.
    from datetime import datetime, timezone

    utc_now = datetime.now(timezone.utc)
    return utc_now.isoformat()
|
||||
|
||||
# """
|
||||
# Utility functions for Google Maps Reviews Scraper.
|
||||
# """
|
||||
#
|
||||
# import re
|
||||
# import time
|
||||
# import logging
|
||||
# from datetime import datetime, timezone
|
||||
# from functools import lru_cache
|
||||
# from typing import List, Optional
|
||||
#
|
||||
# from selenium.common.exceptions import (NoSuchElementException,
|
||||
# StaleElementReferenceException,
|
||||
# TimeoutException)
|
||||
# from selenium.webdriver import Chrome
|
||||
# from selenium.webdriver.common.by import By
|
||||
# from selenium.webdriver.remote.webelement import WebElement
|
||||
# from selenium.webdriver.support import expected_conditions as EC
|
||||
# from selenium.webdriver.support.ui import WebDriverWait
|
||||
#
|
||||
# # Constants for language detection
|
||||
# HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
|
||||
# THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
|
||||
#
|
||||
# # Logger
|
||||
# log = logging.getLogger("scraper")
|
||||
#
|
||||
#
|
||||
# @lru_cache(maxsize=1024)
|
||||
# def detect_lang(txt: str) -> str:
|
||||
# """Detect language based on character sets"""
|
||||
# if HEB_CHARS.search(txt): return "he"
|
||||
# if THAI_CHARS.search(txt): return "th"
|
||||
# return "en"
|
||||
#
|
||||
#
|
||||
# @lru_cache(maxsize=128)
|
||||
# def safe_int(s: str | None) -> int:
|
||||
# """Safely convert string to integer, returning 0 if not possible"""
|
||||
# m = re.search(r"\d+", s or "")
|
||||
# return int(m.group()) if m else 0
|
||||
#
|
||||
#
|
||||
# def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
|
||||
# """Safely find elements by CSS selector without raising exceptions"""
|
||||
# try:
|
||||
# if all:
|
||||
# return el.find_elements(By.CSS_SELECTOR, css)
|
||||
# obj = el.find_element(By.CSS_SELECTOR, css)
|
||||
# return [obj] if obj else []
|
||||
# except (NoSuchElementException, StaleElementReferenceException):
|
||||
# return []
|
||||
#
|
||||
#
|
||||
# def first_text(el: WebElement, css: str) -> str:
|
||||
# """Get text from the first matching element that has non-empty text"""
|
||||
# for e in try_find(el, css, all=True):
|
||||
# if (t := e.text.strip()):
|
||||
# return t
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def first_attr(el: WebElement, css: str, attr: str) -> str:
|
||||
# """Get attribute value from the first matching element that has a non-empty value"""
|
||||
# for e in try_find(el, css, all=True):
|
||||
# if (v := (e.get_attribute(attr) or "").strip()):
|
||||
# return v
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
|
||||
# """Click element if it exists and is clickable, with timeout"""
|
||||
# try:
|
||||
# WebDriverWait(driver, timeout).until(
|
||||
# EC.element_to_be_clickable((By.CSS_SELECTOR, css))
|
||||
# ).click()
|
||||
# time.sleep(delay)
|
||||
# return True
|
||||
# except TimeoutException:
|
||||
# return False
|
||||
#
|
||||
#
|
||||
# def parse_date_to_iso(date_str: str) -> str:
|
||||
# """
|
||||
# Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.
|
||||
# Returns a best-effort ISO string, or empty string if parsing fails.
|
||||
# """
|
||||
# if not date_str:
|
||||
# return ""
|
||||
#
|
||||
# try:
|
||||
# now = datetime.now(timezone.utc)
|
||||
#
|
||||
# # Handle relative dates
|
||||
# if "ago" in date_str.lower():
|
||||
# # For simplicity, map to approximate dates
|
||||
# if "minute" in date_str.lower():
|
||||
# minutes = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(minutes=minutes)
|
||||
# elif "hour" in date_str.lower():
|
||||
# hours = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(hours=hours)
|
||||
# elif "day" in date_str.lower():
|
||||
# days = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=days)
|
||||
# elif "week" in date_str.lower():
|
||||
# weeks = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(weeks=weeks)
|
||||
# elif "month" in date_str.lower():
|
||||
# months = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# # Approximate months as 30 days
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=30 * months)
|
||||
# elif "year" in date_str.lower():
|
||||
# years = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# # Approximate years as 365 days
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=365 * years)
|
||||
# else:
|
||||
# # Default to current time if can't parse
|
||||
# dt = now.replace(microsecond=0)
|
||||
# else:
|
||||
# # Handle absolute dates (month year format)
|
||||
# # This is a simplification - would need more robust parsing for production
|
||||
# dt = now.replace(microsecond=0)
|
||||
#
|
||||
# return dt.isoformat()
|
||||
# except Exception:
|
||||
# # If parsing fails, return empty string
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def get_current_iso_date() -> str:
|
||||
# """Return current UTC time in ISO format."""
|
||||
# return datetime.now(timezone.utc).isoformat()
|
||||
250
utils/logger.py
Normal file
250
utils/logger.py
Normal file
@@ -0,0 +1,250 @@
|
||||
"""
|
||||
Structured Logger Module
|
||||
|
||||
Provides a thread-safe, structured logging system with JSON-serializable output.
|
||||
Designed to replace the LogCapture class with enhanced categorization and metrics support.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Literal, Optional
|
||||
import threading
|
||||
import time
|
||||
|
||||
|
||||
# Closed vocabularies used to annotate the level and category of a log entry.
LogLevel = Literal['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL']
LogCategory = Literal['scraper', 'browser', 'network', 'system']


@dataclass
class LogEntry:
    """One structured log record: when it happened, how severe, where from, and what.

    The three trailing fields are optional attachments; each defaults to
    ``None`` and is left out of the serialized form while it is ``None``.
    """

    timestamp: str  # ISO 8601 with Z suffix
    timestamp_ms: int  # Unix milliseconds
    level: LogLevel
    category: LogCategory
    message: str
    metrics: Optional[Dict] = None  # memory_mb, reviews_count, scroll_position, dom_nodes, etc.
    network: Optional[Dict] = None  # url, method, status, size_bytes, duration_ms
    snapshot_id: Optional[str] = None

    def to_dict(self) -> Dict:
        """Serialize the entry to a plain dictionary.

        Returns:
            A dictionary that always carries ``timestamp``, ``timestamp_ms``,
            ``level``, ``category`` and ``message`` (in that order), followed
            by ``metrics``, ``network`` and ``snapshot_id`` for whichever of
            those is not ``None``. Attached dictionaries are passed through
            as-is, not copied.
        """
        required_names = ('timestamp', 'timestamp_ms', 'level', 'category', 'message')
        optional_names = ('metrics', 'network', 'snapshot_id')

        payload: Dict = {name: getattr(self, name) for name in required_names}
        for name in optional_names:
            value = getattr(self, name)
            # Only None is skipped; empty dicts and empty strings are kept.
            if value is not None:
                payload[name] = value
        return payload
|
||||
|
||||
|
||||
class StructuredLogger:
    """
    Thread-safe structured logger with categorized log entries and automatic pruning.

    Entries are kept in memory in insertion order. Every read and write of the
    entry list happens while holding an internal ``threading.Lock``.

    Example usage:
        logger = StructuredLogger()
        logger.info('browser', 'Navigating to URL', metrics={'memory_mb': 245})
        logger.warn('network', 'Rate limit detected', network={'status': 429, 'url': '...'})
        logger.error('system', 'Chrome crashed', metrics={'memory_mb': 489, 'dom_nodes': 12000})
    """

    # Level names accepted by the legacy log() method.
    _VALID_LEVELS = ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL')
    # Standard-library ``logging`` level names mapped onto this logger's
    # vocabulary, so legacy callers do not have warnings downgraded to INFO.
    _LEVEL_ALIASES = {'WARNING': 'WARN', 'CRITICAL': 'FATAL'}

    def __init__(self, max_entries: int = 10000):
        """
        Initialize the structured logger.

        Args:
            max_entries: Maximum number of log entries to retain (default 10000).
                Oldest entries are pruned when limit is exceeded.
        """
        self._entries: List[LogEntry] = []
        self._lock = threading.Lock()
        self._max_entries = max_entries

    def _create_entry(
        self,
        level: LogLevel,
        category: LogCategory,
        message: str,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> LogEntry:
        """Create a new log entry stamped with the current UTC time.

        Returns:
            A ``LogEntry`` whose ``timestamp`` is an ISO 8601 string with
            millisecond precision and a ``Z`` suffix, and whose
            ``timestamp_ms`` is the same instant as Unix milliseconds.
        """
        now = datetime.now(timezone.utc)
        millis = now.microsecond // 1000
        timestamp = now.strftime('%Y-%m-%dT%H:%M:%S.') + f'{millis:03d}Z'
        # Integer arithmetic keeps timestamp_ms in step with the string above.
        # int(now.timestamp() * 1000) goes through a float and can truncate one
        # millisecond low when the microsecond value is an exact multiple of 1000.
        whole_seconds = int(now.replace(microsecond=0).timestamp())
        timestamp_ms = whole_seconds * 1000 + millis

        return LogEntry(
            timestamp=timestamp,
            timestamp_ms=timestamp_ms,
            level=level,
            category=category,
            message=message,
            metrics=metrics,
            network=network,
            snapshot_id=snapshot_id,
        )

    def _add_entry(self, entry: LogEntry) -> None:
        """Append an entry under the lock, pruning the oldest ones if over the limit."""
        with self._lock:
            self._entries.append(entry)
            # Prune oldest entries if limit exceeded
            if len(self._entries) > self._max_entries:
                # Remove oldest 10% (at least one) to avoid frequent pruning
                prune_count = max(1, self._max_entries // 10)
                del self._entries[:prune_count]

    def _record(
        self,
        level: LogLevel,
        category: LogCategory,
        message: str,
        metrics: Optional[Dict],
        network: Optional[Dict],
        snapshot_id: Optional[str],
    ) -> None:
        """Build an entry for ``level`` and store it; shared by the level methods."""
        self._add_entry(
            self._create_entry(level, category, message, metrics, network, snapshot_id)
        )

    def debug(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Log a DEBUG level message."""
        self._record('DEBUG', category, message, metrics, network, snapshot_id)

    def info(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Log an INFO level message."""
        self._record('INFO', category, message, metrics, network, snapshot_id)

    def warn(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Log a WARN level message."""
        self._record('WARN', category, message, metrics, network, snapshot_id)

    def error(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Log an ERROR level message."""
        self._record('ERROR', category, message, metrics, network, snapshot_id)

    def fatal(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Log a FATAL level message."""
        self._record('FATAL', category, message, metrics, network, snapshot_id)

    def log(self, message: str, level: str = 'INFO') -> None:
        """
        Backward-compatible log method for legacy code.

        Maps to 'system' category by default.

        Args:
            message: The log message
            level: Log level name, matched case-insensitively and ignoring
                surrounding whitespace (DEBUG, INFO, WARN, ERROR, FATAL).
                'WARNING' is treated as WARN and 'CRITICAL' as FATAL. Any
                other value, including a non-string, is recorded as INFO.
        """
        # str() keeps a None or numeric level from raising AttributeError.
        level_name = str(level).strip().upper()
        level_name = self._LEVEL_ALIASES.get(level_name, level_name)
        if level_name not in self._VALID_LEVELS:
            level_name = 'INFO'

        self._add_entry(self._create_entry(level_name, 'system', message))

    def get_logs(self) -> List[Dict]:
        """
        Get all log entries as JSON-serializable dictionaries.

        Returns:
            List of log entry dictionaries.
        """
        with self._lock:
            return [entry.to_dict() for entry in self._entries]

    def get_logs_by_category(self, category: LogCategory) -> List[Dict]:
        """
        Get log entries filtered by category.

        Args:
            category: The category to filter by ('scraper', 'browser', 'network', 'system')

        Returns:
            List of log entry dictionaries matching the category.
        """
        with self._lock:
            return [entry.to_dict() for entry in self._entries if entry.category == category]

    def get_logs_by_level(self, level: LogLevel) -> List[Dict]:
        """
        Get log entries filtered by level.

        Args:
            level: The level to filter by ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL')

        Returns:
            List of log entry dictionaries matching the level.
        """
        with self._lock:
            return [entry.to_dict() for entry in self._entries if entry.level == level]

    def get_logs_since(self, timestamp_ms: int) -> List[Dict]:
        """
        Get log entries since a specific timestamp.

        Args:
            timestamp_ms: Unix timestamp in milliseconds

        Returns:
            List of log entry dictionaries with timestamp >= timestamp_ms.
        """
        with self._lock:
            return [entry.to_dict() for entry in self._entries if entry.timestamp_ms >= timestamp_ms]

    def clear(self) -> None:
        """Clear all log entries."""
        with self._lock:
            self._entries.clear()

    def count(self) -> int:
        """Get the current number of log entries."""
        with self._lock:
            return len(self._entries)

    def __len__(self) -> int:
        """Get the current number of log entries."""
        return self.count()
|
||||
Reference in New Issue
Block a user