Phase 0: Project restructure to ReviewIQ platform architecture
New structure: - scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py) - scrapers/base.py (BaseScraper interface) - scrapers/registry.py (ScraperRegistry for version routing) - core/database.py, models.py, config.py, enums.py - utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py - workers/chrome_pool.py - services/webhook_service.py - api/ routes structure (empty, ready for Phase 2) - tests/ structure mirroring source All imports updated in: - api_server_production.py (7 import paths updated) - utils/health_checks.py (scraper import path) Legacy modules moved to modules/_legacy/: - data_storage.py, image_handler.py, s3_handler.py (unused) Syntax verified, frontend build passing. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
666
utils/crash_analyzer.py
Normal file
666
utils/crash_analyzer.py
Normal file
@@ -0,0 +1,666 @@
|
||||
"""
|
||||
Crash Pattern Analyzer Module
|
||||
|
||||
Provides deep analysis of scraper crashes with pattern detection,
|
||||
confidence scoring, and auto-fix parameter suggestions.
|
||||
|
||||
Builds on top of the basic classify_crash function from the scraper module (scrapers/google_reviews/v1_0_0.py, formerly modules/scraper_clean.py)
|
||||
with more sophisticated pattern matching and multi-signal analysis.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
import re
|
||||
|
||||
|
||||
@dataclass
class CrashAnalysis:
    """
    Outcome of matching a single crash report against the known crash patterns.

    Attributes:
        pattern: Name of the matched pattern, for example "memory_exhaustion",
            "dom_bloat" or "rate_limited".
        confidence: Score in the range 0.0 to 1.0 accumulated from the signals
            that supported the match.
        description: Human-readable summary of why the pattern was chosen.
        suggested_fix: Recommended action for avoiding the same crash again.
        auto_fix_params: Parameters that may be applied automatically to
            prevent a recurrence, or None when there are none for the pattern.
    """

    # Which crash pattern was identified
    pattern: str
    # How sure the analysis is, from 0.0 (no evidence) to 1.0 (certain)
    confidence: float
    # Explanation of the crash cause
    description: str
    # Advice for a human operator
    suggested_fix: str
    # Machine-applicable remedy, if any
    auto_fix_params: Optional[Dict[str, Any]]
|
||||
|
||||
|
||||
# Detection thresholds used by the pattern checks
MEMORY_EXHAUSTION_THRESHOLD_MB = 1500  # 1.5GB, expressed in MB
MEMORY_GROWTH_RATE_THRESHOLD_MB_S = 10  # MB gained per second
DOM_BLOAT_THRESHOLD = 50000  # number of DOM nodes
SCROLL_TIMEOUT_MIN_SCROLLS = 10  # minimum scrolls before considering scroll_timeout


# Remedy parameters keyed by crash pattern name
AUTO_FIX_PARAMS = {
    "memory_exhaustion": dict(max_reviews=500, restart_browser_after=200),
    "dom_bloat": dict(scroll_cleanup=True, lazy_load=True),
    "rate_limited": dict(delay_multiplier=2.0, use_different_proxy=True),
    "consent_loop": dict(skip_consent_retries=True),
    # "current - 10%" is a placeholder string, not a number
    "scroll_timeout": dict(reduce_target=True, target_reviews="current - 10%"),
    "element_stale": dict(retry_with_fresh_elements=True),
}
|
||||
|
||||
|
||||
def _calculate_memory_growth_rate(metrics_history: List[Dict]) -> Optional[float]:
|
||||
"""
|
||||
Calculate memory growth rate in MB/s from metrics history.
|
||||
|
||||
Args:
|
||||
metrics_history: List of metric samples with timestamp_ms and memory_mb
|
||||
|
||||
Returns:
|
||||
Growth rate in MB/s, or None if cannot be calculated
|
||||
"""
|
||||
if not metrics_history or len(metrics_history) < 2:
|
||||
return None
|
||||
|
||||
# Filter samples that have valid memory readings
|
||||
valid_samples = [
|
||||
m for m in metrics_history
|
||||
if m.get('memory_mb') is not None and m.get('timestamp_ms') is not None
|
||||
]
|
||||
|
||||
if len(valid_samples) < 2:
|
||||
return None
|
||||
|
||||
# Use first and last valid samples
|
||||
first = valid_samples[0]
|
||||
last = valid_samples[-1]
|
||||
|
||||
time_delta_s = (last['timestamp_ms'] - first['timestamp_ms']) / 1000
|
||||
if time_delta_s <= 0:
|
||||
return None
|
||||
|
||||
memory_delta_mb = last['memory_mb'] - first['memory_mb']
|
||||
return memory_delta_mb / time_delta_s
|
||||
|
||||
|
||||
def _get_max_memory(metrics_history: List[Dict]) -> Optional[int]:
|
||||
"""Get maximum memory usage from metrics history."""
|
||||
if not metrics_history:
|
||||
return None
|
||||
|
||||
memories = [m.get('memory_mb') for m in metrics_history if m.get('memory_mb') is not None]
|
||||
return max(memories) if memories else None
|
||||
|
||||
|
||||
def _get_max_dom_nodes(metrics_history: List[Dict]) -> Optional[int]:
|
||||
"""Get maximum DOM node count from metrics history."""
|
||||
if not metrics_history:
|
||||
return None
|
||||
|
||||
nodes = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes') is not None]
|
||||
return max(nodes) if nodes else None
|
||||
|
||||
|
||||
def _check_memory_exhaustion(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Score how strongly the crash evidence points at memory exhaustion.

    Confidence is accumulated from independent signals and capped at 1.0:
        +0.5  peak memory >= MEMORY_EXHAUSTION_THRESHOLD_MB, otherwise
        +0.3  peak memory >= 80% of that threshold
        +0.3  memory growth rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S
        +0.2  the error message contains a memory-related keyword
        +0.1  a log message mentions "memory" with "high", "warning" or "exceeded"

    Args:
        error_message: Exception text; None is treated as an empty string.
        metrics_history: Metric samples, read via _get_max_memory and
            _calculate_memory_growth_rate.
        logs: Log entry dicts with an optional 'message'; None is treated as
            no logs and a None message as an empty message.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check for high memory usage
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None:
        if max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB:
            confidence += 0.5
            signals.append(f"Memory reached {max_memory}MB (threshold: {MEMORY_EXHAUSTION_THRESHOLD_MB}MB)")
        elif max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB * 0.8:
            confidence += 0.3
            signals.append(f"Memory at {max_memory}MB approaching threshold")

    # Check for rapid memory growth
    growth_rate = _calculate_memory_growth_rate(metrics_history)
    if growth_rate is not None and growth_rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S:
        confidence += 0.3
        signals.append(f"Memory growing at {growth_rate:.1f}MB/s (threshold: {MEMORY_GROWTH_RATE_THRESHOLD_MB_S}MB/s)")

    # Check error message for memory-related keywords.
    # `or ''` guards against an explicit None, which a .get() default does not cover.
    error_lower = str(error_message or '').lower()
    memory_keywords = ['memory', 'heap', 'out of memory', 'oom', 'aw, snap', 'status_access_violation']
    matched = next((keyword for keyword in memory_keywords if keyword in error_lower), None)
    if matched is not None:
        # Only the first matching keyword counts.
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Check logs for memory warnings (first hit only)
    for log_entry in logs or []:
        msg = str(log_entry.get('message') or '').lower()
        if 'memory' in msg and ('high' in msg or 'warning' in msg or 'exceeded' in msg):
            confidence += 0.1
            signals.append("Memory warning found in logs")
            break

    description = "; ".join(signals) if signals else "No memory exhaustion signals detected"
    return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_dom_bloat(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Score how strongly the crash evidence points at DOM bloat.

    Confidence is accumulated from independent signals and capped at 1.0:
        +0.6  peak DOM node count >= DOM_BLOAT_THRESHOLD, otherwise
        +0.3  peak DOM node count >= 80% of that threshold
        +0.2  the error message contains a DOM-related keyword
        +0.1  peak memory is at least 800MB
        +0.1  a log message mentions "dom" with "large", "cleanup" or "remove"

    Args:
        error_message: Exception text; None is treated as an empty string.
        metrics_history: Metric samples, read via _get_max_dom_nodes and
            _get_max_memory.
        logs: Log entry dicts with an optional 'message'; None is treated as
            no logs and a None message as an empty message.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check for high DOM node count
    max_nodes = _get_max_dom_nodes(metrics_history)
    if max_nodes is not None:
        if max_nodes >= DOM_BLOAT_THRESHOLD:
            confidence += 0.6
            signals.append(f"DOM nodes reached {max_nodes} (threshold: {DOM_BLOAT_THRESHOLD})")
        elif max_nodes >= DOM_BLOAT_THRESHOLD * 0.8:
            confidence += 0.3
            signals.append(f"DOM nodes at {max_nodes} approaching threshold")

    # Check error message for DOM-related keywords.
    # `or ''` guards against an explicit None, which a .get() default does not cover.
    error_lower = str(error_message or '').lower()
    dom_keywords = ['dom', 'element', 'node', 'render', 'paint', 'layout']
    matched = next((keyword for keyword in dom_keywords if keyword in error_lower), None)
    if matched is not None:
        # Only the first matching keyword counts.
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Check if memory is high too (DOM bloat often causes memory issues)
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None and max_memory >= 800:  # 800MB
        confidence += 0.1
        signals.append(f"Memory also elevated ({max_memory}MB)")

    # Check logs for DOM-related messages (first hit only)
    for log_entry in logs or []:
        msg = str(log_entry.get('message') or '').lower()
        if 'dom' in msg and ('large' in msg or 'cleanup' in msg or 'remove' in msg):
            confidence += 0.1
            signals.append("DOM warning found in logs")
            break

    description = "; ".join(signals) if signals else "No DOM bloat signals detected"
    return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_rate_limited(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for rate limiting pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for rate limit indicators
|
||||
error_lower = error_message.lower()
|
||||
if '429' in error_message:
|
||||
confidence += 0.6
|
||||
signals.append("HTTP 429 status code in error")
|
||||
|
||||
rate_keywords = ['rate limit', 'too many requests', 'unusual traffic', 'captcha', 'blocked']
|
||||
for keyword in rate_keywords:
|
||||
if keyword in error_lower:
|
||||
confidence += 0.4
|
||||
signals.append(f"Error contains '{keyword}'")
|
||||
break
|
||||
|
||||
# Check logs for rate limiting signals
|
||||
rate_log_count = 0
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
network = log_entry.get('network', {})
|
||||
status = network.get('status')
|
||||
|
||||
if status == 429:
|
||||
rate_log_count += 1
|
||||
confidence += 0.2
|
||||
|
||||
if 'unusual traffic' in msg or 'rate' in msg or 'blocked' in msg:
|
||||
rate_log_count += 1
|
||||
confidence += 0.1
|
||||
|
||||
if rate_log_count > 0:
|
||||
signals.append(f"Found {rate_log_count} rate-limiting indicators in logs")
|
||||
|
||||
description = "; ".join(signals) if signals else "No rate limiting signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_consent_loop(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for consent popup loop pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for consent keywords
|
||||
error_lower = error_message.lower()
|
||||
if 'consent' in error_lower:
|
||||
confidence += 0.3
|
||||
signals.append("Error mentions consent")
|
||||
|
||||
# Count consent-related log entries
|
||||
consent_count = 0
|
||||
consent_messages = []
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
if 'consent' in msg:
|
||||
consent_count += 1
|
||||
consent_messages.append(msg[:50])
|
||||
|
||||
# Multiple consent messages indicate a loop
|
||||
if consent_count >= 3:
|
||||
confidence += 0.5
|
||||
signals.append(f"Consent popup appeared {consent_count} times in logs")
|
||||
elif consent_count >= 2:
|
||||
confidence += 0.3
|
||||
signals.append(f"Consent popup appeared {consent_count} times")
|
||||
elif consent_count == 1:
|
||||
confidence += 0.1
|
||||
signals.append("Single consent popup detected")
|
||||
|
||||
# Check for timeout after consent handling
|
||||
if 'timeout' in error_lower and consent_count > 0:
|
||||
confidence += 0.2
|
||||
signals.append("Timeout occurred with consent activity")
|
||||
|
||||
description = "; ".join(signals) if signals else "No consent loop signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_scroll_timeout(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict],
    state: Optional[Dict] = None
) -> tuple[float, str]:
    """
    Score how strongly the crash evidence points at a scroll timeout
    (scrolling continues but no new reviews arrive).

    Confidence is accumulated from independent signals and capped at 1.0:
        +0.2  the error message mentions "timeout"
        +0.5  at least SCROLL_TIMEOUT_MIN_SCROLLS "recovery attempt" log
              messages, otherwise
        +0.3  at least 5 such messages
        +0.2  any log message containing "no new" or "stuck"
        +0.2  the non-zero 'reviews_count' readings among the last five
              metric samples are all identical (needs at least two readings)

    Args:
        error_message: Exception text; None is treated as an empty string.
        metrics_history: Metric samples with an optional 'reviews_count'.
        logs: Log entry dicts with an optional 'message'; None is treated as
            no logs and a None message as an empty message.
        state: Optional scraper state. Accepted for interface compatibility;
            no signal is currently derived from it.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # `or ''` guards against an explicit None, which a .get() default does not cover.
    error_lower = str(error_message or '').lower()
    if 'timeout' in error_lower:
        confidence += 0.2
        signals.append("Timeout in error message")

    # Count recovery attempts in logs (indicate stuck scrolling)
    recovery_count = 0
    no_new_count = 0
    for log_entry in logs or []:
        msg = str(log_entry.get('message') or '').lower()
        if 'recovery attempt' in msg:
            recovery_count += 1
        if 'no new' in msg or 'stuck' in msg:
            no_new_count += 1

    if recovery_count >= SCROLL_TIMEOUT_MIN_SCROLLS:
        confidence += 0.5
        signals.append(f"Made {recovery_count} recovery attempts")
    elif recovery_count >= 5:
        confidence += 0.3
        signals.append(f"Made {recovery_count} recovery attempts")

    if no_new_count > 0:
        confidence += 0.2
        signals.append(f"Found {no_new_count} 'no new reviews' log entries")

    # Check if reviews stopped growing
    if metrics_history and len(metrics_history) >= 5:
        recent_counts = [
            m.get('reviews_count') for m in metrics_history[-5:] if m.get('reviews_count')
        ]
        # A plateau needs at least two observations; a lone reading is
        # trivially "all identical" and must not count as stuck.
        if len(recent_counts) >= 2 and len(set(recent_counts)) == 1:
            confidence += 0.2
            signals.append(f"Review count stuck at {recent_counts[0]}")

    description = "; ".join(signals) if signals else "No scroll timeout signals detected"
    return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def _check_element_stale(
|
||||
error_message: str,
|
||||
metrics_history: List[Dict],
|
||||
logs: List[Dict]
|
||||
) -> tuple[float, str]:
|
||||
"""
|
||||
Check for stale element reference pattern.
|
||||
|
||||
Returns:
|
||||
Tuple of (confidence, description)
|
||||
"""
|
||||
confidence = 0.0
|
||||
signals = []
|
||||
|
||||
# Check error message for stale element indicators
|
||||
error_lower = error_message.lower()
|
||||
stale_keywords = [
|
||||
'stale element', 'staleelement', 'stale_element',
|
||||
'element is not attached', 'element reference',
|
||||
'no such element', 'element not found',
|
||||
'element is no longer valid'
|
||||
]
|
||||
|
||||
for keyword in stale_keywords:
|
||||
if keyword in error_lower:
|
||||
confidence += 0.6
|
||||
signals.append(f"Error contains '{keyword}'")
|
||||
break
|
||||
|
||||
# Check logs for stale element patterns
|
||||
stale_log_count = 0
|
||||
for log_entry in logs:
|
||||
msg = log_entry.get('message', '').lower()
|
||||
for keyword in stale_keywords:
|
||||
if keyword in msg:
|
||||
stale_log_count += 1
|
||||
break
|
||||
|
||||
if stale_log_count > 0:
|
||||
confidence += 0.2
|
||||
signals.append(f"Found {stale_log_count} stale element references in logs")
|
||||
|
||||
# Check if DOM was changing rapidly (indicates dynamic page)
|
||||
if metrics_history and len(metrics_history) >= 3:
|
||||
dom_counts = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes')]
|
||||
if len(dom_counts) >= 3:
|
||||
# Calculate variance
|
||||
avg = sum(dom_counts) / len(dom_counts)
|
||||
variance = sum((x - avg) ** 2 for x in dom_counts) / len(dom_counts)
|
||||
std_dev = variance ** 0.5
|
||||
# High variance indicates rapidly changing DOM
|
||||
if std_dev > 1000:
|
||||
confidence += 0.2
|
||||
signals.append(f"High DOM variability (std dev: {std_dev:.0f})")
|
||||
|
||||
description = "; ".join(signals) if signals else "No stale element signals detected"
|
||||
return min(confidence, 1.0), description
|
||||
|
||||
|
||||
def analyze_crash(crash_report: Dict) -> CrashAnalysis:
    """
    Analyze a crash report to determine the most likely crash pattern.

    Runs every pattern check against the report and keeps the one with the
    highest confidence (on a tie, the check that ran first wins). When the
    best confidence is below 0.2 the basic 'crash_type' is mapped to a
    pattern with a fixed confidence of 0.3, or the result is 'unknown' with
    confidence 0.0 if the type has no mapping.

    Args:
        crash_report: Dictionary containing:
            - error_message: str - The exception message
            - metrics_history: List[Dict] - Sampled metrics with timestamp_ms, memory_mb, dom_nodes
            - logs_before_crash: List[Dict] - Recent log entries before the crash
            - state: Optional[Dict] - Scraper state (reviews_extracted, scroll_count, etc.)
            - crash_type: Optional[str] - Basic crash classification from classify_crash()
            Keys that are missing or explicitly None fall back to empty
            values ('unknown' for crash_type).

    Returns:
        CrashAnalysis with the highest-confidence pattern match. Its
        auto_fix_params is a copy, so mutating it does not alter
        AUTO_FIX_PARAMS.
    """
    # `.get(key, default)` does not apply the default when the key is present
    # with a None value, so normalise with `or` instead.
    error_message = crash_report.get('error_message') or ''
    metrics_history = crash_report.get('metrics_history') or []
    logs = crash_report.get('logs_before_crash') or []
    state = crash_report.get('state') or {}
    basic_type = crash_report.get('crash_type') or 'unknown'

    # Run all pattern checks; insertion order decides ties in max() below.
    pattern_results = {
        'memory_exhaustion': _check_memory_exhaustion(error_message, metrics_history, logs),
        'dom_bloat': _check_dom_bloat(error_message, metrics_history, logs),
        'rate_limited': _check_rate_limited(error_message, metrics_history, logs),
        'consent_loop': _check_consent_loop(error_message, metrics_history, logs),
        'scroll_timeout': _check_scroll_timeout(error_message, metrics_history, logs, state),
        'element_stale': _check_element_stale(error_message, metrics_history, logs),
    }

    # Find the pattern with highest confidence
    pattern_name, (confidence, description) = max(
        pattern_results.items(), key=lambda item: item[1][0]
    )

    # If confidence is too low, fall back to basic classification
    if confidence < 0.2:
        basic_to_pattern = {
            'memory_exhaustion': 'memory_exhaustion',
            'tab_crash': 'memory_exhaustion',  # Tab crashes often from memory
            'timeout': 'scroll_timeout',
            'element_not_found': 'element_stale',
            'rate_limited': 'rate_limited',
            'network_failure': 'rate_limited',  # Could be blocking
        }

        if basic_type in basic_to_pattern:
            pattern_name = basic_to_pattern[basic_type]
            confidence = 0.3  # Low confidence fallback
            description = f"Inferred from basic crash type '{basic_type}'"
        else:
            pattern_name = 'unknown'
            confidence = 0.0
            description = f"Unable to determine crash pattern (basic type: {basic_type})"

    # Generate suggested fix based on pattern
    suggested_fixes = {
        'memory_exhaustion': (
            "Reduce batch size and restart browser more frequently. "
            "Consider limiting max_reviews to 500 and restarting browser after every 200 reviews."
        ),
        'dom_bloat': (
            "Enable DOM cleanup during scrolling. "
            "Hide processed review cards and remove separator elements to keep DOM light."
        ),
        'rate_limited': (
            "Increase delays between requests and consider rotating proxies. "
            "Double the delay multiplier and switch to a different proxy if available."
        ),
        'consent_loop': (
            "Skip consent handling after initial attempt to avoid infinite loops. "
            "The consent popup may be appearing due to cookie clearing or navigation issues."
        ),
        'scroll_timeout': (
            "The page may have stopped loading new reviews. "
            "Try reducing the target review count by 10% and accepting partial results."
        ),
        'element_stale': (
            "Page elements are being removed/replaced during scraping. "
            "Retry operations with freshly-located elements and add defensive waits."
        ),
        'unknown': (
            "Unable to determine specific crash cause. "
            "Review logs and consider restarting with fresh browser session."
        )
    }

    suggested_fix = suggested_fixes.get(pattern_name, suggested_fixes['unknown'])

    # Hand out a copy so callers cannot mutate the shared module-level table.
    fix_params = AUTO_FIX_PARAMS.get(pattern_name)
    auto_fix_params = dict(fix_params) if fix_params is not None else None

    return CrashAnalysis(
        pattern=pattern_name,
        confidence=confidence,
        description=description,
        suggested_fix=suggested_fix,
        auto_fix_params=auto_fix_params
    )
|
||||
|
||||
|
||||
def get_auto_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """
    Get auto-fix parameters for a specific crash pattern.

    Args:
        pattern: The crash pattern name

    Returns:
        A copy of the auto-fix parameters for the pattern, or None if the
        pattern is not recognized. A copy is returned so that callers who
        modify the result do not change AUTO_FIX_PARAMS for everyone else.
    """
    params = AUTO_FIX_PARAMS.get(pattern)
    if params is None:
        return None
    return dict(params)
|
||||
|
||||
|
||||
def apply_auto_fix(pattern: str, current_params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply auto-fix parameters to current scraper parameters.

    The input dictionary is not modified; a shallow copy is updated and
    returned. Special handling:
        - 'target_reviews' == 'current - 10%': 'max_reviews' is reduced to
          90% of its current value (1000 is assumed when unset or None).
        - 'delay_multiplier': 'scroll_delay' is multiplied by the value
          (1.0 is assumed when unset or None).
        - 'max_reviews' / 'restart_browser_after': treated as upper limits,
          so a current positive value that is already at or below the fix
          value is kept rather than raised.
        - every other key is copied over as-is.

    Args:
        pattern: The crash pattern name
        current_params: Current scraper parameters to modify; None is
            treated as an empty dictionary

    Returns:
        Updated parameters dictionary with fixes applied (unchanged copy if
        the pattern is not recognized)
    """
    # Keys whose fix value is a cap; never loosen a stricter existing setting.
    limit_keys = ('max_reviews', 'restart_browser_after')

    fix_params = AUTO_FIX_PARAMS.get(pattern, {})
    updated = dict(current_params) if current_params else {}

    for key, value in fix_params.items():
        if key == 'target_reviews' and value == 'current - 10%':
            # Special case: reduce target by 10%
            current_target = updated.get('max_reviews')
            if current_target is None:
                current_target = 1000
            updated['max_reviews'] = int(current_target * 0.9)
        elif key == 'delay_multiplier':
            # Multiply existing delay
            current_delay = updated.get('scroll_delay')
            if current_delay is None:
                current_delay = 1.0
            updated['scroll_delay'] = current_delay * value
        elif key in limit_keys:
            existing = updated.get(key)
            already_stricter = (
                isinstance(existing, (int, float))
                and not isinstance(existing, bool)
                and 0 < existing <= value
            )
            if not already_stricter:
                updated[key] = value
        else:
            updated[key] = value

    return updated
|
||||
|
||||
|
||||
def summarize_crash_patterns(crash_reports: List[Dict]) -> Dict[str, Any]:
    """
    Aggregate the analyses of several crash reports into a pattern summary.

    Args:
        crash_reports: List of crash report dictionaries

    Returns:
        Dictionary with:
            - total_crashes: number of reports given
            - patterns: per pattern, its count, percentage of all reports
              and average confidence
            - most_common: the pattern seen most often (None without reports)
            - recommendations: auto-fix parameters for every pattern that
              occurred at least twice and has such parameters, most
              frequent first
    """
    if not crash_reports:
        return {
            'total_crashes': 0,
            'patterns': {},
            'most_common': None,
            'recommendations': []
        }

    total = len(crash_reports)

    # Confidence values grouped by detected pattern, in first-seen order.
    confidences_by_pattern: Dict[str, List[float]] = {}
    for report in crash_reports:
        result = analyze_crash(report)
        confidences_by_pattern.setdefault(result.pattern, []).append(result.confidence)

    patterns_summary = {}
    for name, scores in confidences_by_pattern.items():
        occurrences = len(scores)
        patterns_summary[name] = {
            'count': occurrences,
            'percentage': occurrences / total * 100,
            'avg_confidence': sum(scores) / len(scores)
        }

    # The first pattern to reach the highest count wins a tie.
    if confidences_by_pattern:
        most_common = max(
            confidences_by_pattern,
            key=lambda name: len(confidences_by_pattern[name])
        )
    else:
        most_common = None

    # Recommend fixes only for recurring patterns that have auto-fix parameters.
    by_frequency = sorted(
        patterns_summary.items(),
        key=lambda entry: entry[1]['count'],
        reverse=True
    )
    recommendations = []
    for name, stats in by_frequency:
        if stats['count'] < 2:
            continue
        fix_params = AUTO_FIX_PARAMS.get(name)
        if not fix_params:
            continue
        recommendations.append({
            'pattern': name,
            'occurrences': stats['count'],
            'auto_fix_params': fix_params
        })

    return {
        'total_crashes': total,
        'patterns': patterns_summary,
        'most_common': most_common,
        'recommendations': recommendations
    }
|
||||
Reference in New Issue
Block a user