"""
Crash Pattern Analyzer Module

Provides deep analysis of scraper crashes with pattern detection,
confidence scoring, and auto-fix parameter suggestions.

Builds on top of the basic classify_crash function in scraper_clean.py
with more sophisticated pattern matching and multi-signal analysis.
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import re


@dataclass
class CrashAnalysis:
    """
    Result of crash pattern analysis.

    Attributes:
        pattern: The identified crash pattern type
            (e.g., "memory_exhaustion", "dom_bloat")
        confidence: Confidence score from 0.0 to 1.0 based on multiple signals
        description: Human-readable description of the crash cause
        suggested_fix: Recommended action to prevent this crash
        auto_fix_params: Parameters that can be applied automatically to
            prevent recurrence (None when the pattern has no auto-fix)
    """
    pattern: str  # e.g., "memory_exhaustion", "dom_bloat", "rate_limited"
    confidence: float  # 0.0 to 1.0
    description: str
    suggested_fix: str
    auto_fix_params: Optional[Dict[str, Any]]


# Thresholds for pattern detection
MEMORY_EXHAUSTION_THRESHOLD_MB = 1500  # 1.5GB in MB
MEMORY_GROWTH_RATE_THRESHOLD_MB_S = 10  # 10MB/s
DOM_BLOAT_THRESHOLD = 50000  # 50000 nodes
# Compared against the number of "recovery attempt" log entries in
# _check_scroll_timeout.
SCROLL_TIMEOUT_MIN_SCROLLS = 10

# Auto-fix parameters for each crash pattern
AUTO_FIX_PARAMS: Dict[str, Dict[str, Any]] = {
    "memory_exhaustion": {
        "max_reviews": 500,
        "restart_browser_after": 200,
    },
    "dom_bloat": {
        "scroll_cleanup": True,
        "lazy_load": True,
    },
    "rate_limited": {
        "delay_multiplier": 2.0,
        "use_different_proxy": True,
    },
    "consent_loop": {
        "skip_consent_retries": True,
    },
    "scroll_timeout": {
        "reduce_target": True,
        "target_reviews": "current - 10%",
    },
    "element_stale": {
        "retry_with_fresh_elements": True,
    },
}

# Matches "429" only when it is not part of a longer run of digits, so that
# values such as "14290 ms" are not mistaken for an HTTP 429 status.
_HTTP_429_RE = re.compile(r'(?<!\d)429(?!\d)')

# Auto-fix keys whose value is an upper bound: applying the fix must never
# replace a stricter (smaller) value the caller already uses.
_UPPER_BOUND_KEYS = frozenset({'max_reviews', 'restart_browser_after'})

# Fallback mapping from the basic crash classification to our patterns.
_BASIC_TO_PATTERN = {
    'memory_exhaustion': 'memory_exhaustion',
    'tab_crash': 'memory_exhaustion',  # Tab crashes often from memory
    'timeout': 'scroll_timeout',
    'element_not_found': 'element_stale',
    'rate_limited': 'rate_limited',
    'network_failure': 'rate_limited',  # Could be blocking
}

# Human-readable remediation advice per pattern.
_SUGGESTED_FIXES = {
    'memory_exhaustion': (
        "Reduce batch size and restart browser more frequently. "
        "Consider limiting max_reviews to 500 and restarting browser after every 200 reviews."
    ),
    'dom_bloat': (
        "Enable DOM cleanup during scrolling. "
        "Hide processed review cards and remove separator elements to keep DOM light."
    ),
    'rate_limited': (
        "Increase delays between requests and consider rotating proxies. "
        "Double the delay multiplier and switch to a different proxy if available."
    ),
    'consent_loop': (
        "Skip consent handling after initial attempt to avoid infinite loops. "
        "The consent popup may be appearing due to cookie clearing or navigation issues."
    ),
    'scroll_timeout': (
        "The page may have stopped loading new reviews. "
        "Try reducing the target review count by 10% and accepting partial results."
    ),
    'element_stale': (
        "Page elements are being removed/replaced during scraping. "
        "Retry operations with freshly-located elements and add defensive waits."
    ),
    'unknown': (
        "Unable to determine specific crash cause. "
        "Review logs and consider restarting with fresh browser session."
    ),
}


def _log_message(log_entry: Dict) -> str:
    """Return the entry's 'message' lowercased ('' when missing or None)."""
    return str(log_entry.get('message') or '').lower()


def _first_keyword(text: str, keywords) -> Optional[str]:
    """Return the first keyword that occurs in text, or None."""
    return next((kw for kw in keywords if kw in text), None)


def _copy_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """Return a copy of the auto-fix params for pattern, or None if unknown.

    A copy is returned so callers cannot mutate AUTO_FIX_PARAMS by accident.
    """
    params = AUTO_FIX_PARAMS.get(pattern)
    return dict(params) if params is not None else None


def _is_positive_number(value: Any) -> bool:
    """Return True for a real int/float greater than zero (bools excluded)."""
    return (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and value > 0
    )


def _calculate_memory_growth_rate(metrics_history: List[Dict]) -> Optional[float]:
    """
    Calculate memory growth rate in MB/s from metrics history.

    Args:
        metrics_history: List of metric samples with timestamp_ms and memory_mb

    Returns:
        Growth rate in MB/s, or None if cannot be calculated
    """
    if not metrics_history or len(metrics_history) < 2:
        return None

    # Filter samples that have valid memory readings
    valid_samples = [
        m for m in metrics_history
        if m.get('memory_mb') is not None and m.get('timestamp_ms') is not None
    ]
    if len(valid_samples) < 2:
        return None

    # Use first and last valid samples (list order, not sorted by timestamp)
    first, last = valid_samples[0], valid_samples[-1]

    time_delta_s = (last['timestamp_ms'] - first['timestamp_ms']) / 1000
    if time_delta_s <= 0:
        return None

    return (last['memory_mb'] - first['memory_mb']) / time_delta_s


def _get_max_memory(metrics_history: List[Dict]) -> Optional[float]:
    """Get maximum memory usage from metrics history (None if no readings)."""
    if not metrics_history:
        return None
    memories = [
        m['memory_mb'] for m in metrics_history
        if m.get('memory_mb') is not None
    ]
    return max(memories) if memories else None


def _get_max_dom_nodes(metrics_history: List[Dict]) -> Optional[int]:
    """Get maximum DOM node count from metrics history (None if no readings)."""
    if not metrics_history:
        return None
    nodes = [
        m['dom_nodes'] for m in metrics_history
        if m.get('dom_nodes') is not None
    ]
    return max(nodes) if nodes else None


def _check_memory_exhaustion(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for memory exhaustion pattern.

    Signals: peak memory vs. threshold, memory growth rate, memory-related
    keywords in the error message, and memory warnings in the logs.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check for high memory usage
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None:
        if max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB:
            confidence += 0.5
            signals.append(
                f"Memory reached {max_memory}MB (threshold: {MEMORY_EXHAUSTION_THRESHOLD_MB}MB)"
            )
        elif max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB * 0.8:
            confidence += 0.3
            signals.append(f"Memory at {max_memory}MB approaching threshold")

    # Check for rapid memory growth
    growth_rate = _calculate_memory_growth_rate(metrics_history)
    if growth_rate is not None and growth_rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S:
        confidence += 0.3
        signals.append(
            f"Memory growing at {growth_rate:.1f}MB/s (threshold: {MEMORY_GROWTH_RATE_THRESHOLD_MB_S}MB/s)"
        )

    # Check error message for memory-related keywords (first hit only)
    error_lower = (error_message or '').lower()
    memory_keywords = [
        'memory', 'heap', 'out of memory', 'oom', 'aw, snap',
        'status_access_violation',
    ]
    keyword = _first_keyword(error_lower, memory_keywords)
    if keyword is not None:
        confidence += 0.2
        signals.append(f"Error contains '{keyword}'")

    # Check logs for memory warnings (counted once)
    for log_entry in logs or []:
        msg = _log_message(log_entry)
        if 'memory' in msg and ('high' in msg or 'warning' in msg or 'exceeded' in msg):
            confidence += 0.1
            signals.append("Memory warning found in logs")
            break

    description = "; ".join(signals) if signals else "No memory exhaustion signals detected"
    return min(confidence, 1.0), description


def _check_dom_bloat(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for DOM bloat pattern.

    Signals: peak DOM node count vs. threshold, DOM-related keywords in the
    error message, elevated memory, and DOM warnings in the logs.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check for high DOM node count
    max_nodes = _get_max_dom_nodes(metrics_history)
    if max_nodes is not None:
        if max_nodes >= DOM_BLOAT_THRESHOLD:
            confidence += 0.6
            signals.append(f"DOM nodes reached {max_nodes} (threshold: {DOM_BLOAT_THRESHOLD})")
        elif max_nodes >= DOM_BLOAT_THRESHOLD * 0.8:
            confidence += 0.3
            signals.append(f"DOM nodes at {max_nodes} approaching threshold")

    # Check error message for DOM-related keywords (first hit only)
    error_lower = (error_message or '').lower()
    dom_keywords = ['dom', 'element', 'node', 'render', 'paint', 'layout']
    keyword = _first_keyword(error_lower, dom_keywords)
    if keyword is not None:
        confidence += 0.2
        signals.append(f"Error contains '{keyword}'")

    # Check if memory is high too (DOM bloat often causes memory issues)
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None and max_memory >= 800:  # 800MB
        confidence += 0.1
        signals.append(f"Memory also elevated ({max_memory}MB)")

    # Check logs for DOM-related messages (counted once)
    for log_entry in logs or []:
        msg = _log_message(log_entry)
        if 'dom' in msg and ('large' in msg or 'cleanup' in msg or 'remove' in msg):
            confidence += 0.1
            signals.append("DOM warning found in logs")
            break

    description = "; ".join(signals) if signals else "No DOM bloat signals detected"
    return min(confidence, 1.0), description


def _check_rate_limited(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for rate limiting pattern.

    Signals: a standalone "429" in the error message, rate-limit keywords in
    the error message, and per-log-entry indicators (network status 429 or
    rate-related wording). metrics_history is accepted for signature parity
    with the other checks and is not inspected.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error message for rate limit indicators
    error_message = error_message or ''
    error_lower = error_message.lower()

    # Digit-bounded match: "429" inside a longer number is not a status code.
    if _HTTP_429_RE.search(error_message):
        confidence += 0.6
        signals.append("HTTP 429 status code in error")

    rate_keywords = ['rate limit', 'too many requests', 'unusual traffic', 'captcha', 'blocked']
    keyword = _first_keyword(error_lower, rate_keywords)
    if keyword is not None:
        confidence += 0.4
        signals.append(f"Error contains '{keyword}'")

    # Check logs for rate limiting signals; every matching entry adds weight.
    rate_log_count = 0
    for log_entry in logs or []:
        msg = _log_message(log_entry)
        network = log_entry.get('network')
        status = network.get('status') if isinstance(network, dict) else None

        if status == 429:
            rate_log_count += 1
            confidence += 0.2
        if 'unusual traffic' in msg or 'rate' in msg or 'blocked' in msg:
            rate_log_count += 1
            confidence += 0.1

    if rate_log_count > 0:
        signals.append(f"Found {rate_log_count} rate-limiting indicators in logs")

    description = "; ".join(signals) if signals else "No rate limiting signals detected"
    return min(confidence, 1.0), description


def _check_consent_loop(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for consent popup loop pattern.

    Signals: "consent" in the error message, the number of log entries
    mentioning consent, and a timeout occurring alongside consent activity.
    metrics_history is accepted for signature parity and is not inspected.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error message for consent keywords
    error_lower = (error_message or '').lower()
    if 'consent' in error_lower:
        confidence += 0.3
        signals.append("Error mentions consent")

    # Count consent-related log entries
    consent_count = sum(
        1 for log_entry in logs or [] if 'consent' in _log_message(log_entry)
    )

    # Multiple consent messages indicate a loop
    if consent_count >= 3:
        confidence += 0.5
        signals.append(f"Consent popup appeared {consent_count} times in logs")
    elif consent_count >= 2:
        confidence += 0.3
        signals.append(f"Consent popup appeared {consent_count} times")
    elif consent_count == 1:
        confidence += 0.1
        signals.append("Single consent popup detected")

    # Check for timeout after consent handling
    if 'timeout' in error_lower and consent_count > 0:
        confidence += 0.2
        signals.append("Timeout occurred with consent activity")

    description = "; ".join(signals) if signals else "No consent loop signals detected"
    return min(confidence, 1.0), description


def _check_scroll_timeout(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict],
    state: Optional[Dict] = None
) -> tuple[float, str]:
    """
    Check for scroll timeout pattern (no new reviews after many scrolls).

    Signals: "timeout" in the error message, the number of recovery-attempt
    and "no new"/"stuck" log entries, and a plateau in reviews_count across
    the last five metric samples. The state argument is accepted for
    interface compatibility; no signal is currently derived from it.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error for timeout indicators
    error_lower = (error_message or '').lower()
    if 'timeout' in error_lower:
        confidence += 0.2
        signals.append("Timeout in error message")

    # Count recovery attempts in logs (indicate stuck scrolling)
    recovery_count = 0
    no_new_count = 0
    for log_entry in logs or []:
        msg = _log_message(log_entry)
        if 'recovery attempt' in msg:
            recovery_count += 1
        if 'no new' in msg or 'stuck' in msg:
            no_new_count += 1

    if recovery_count >= SCROLL_TIMEOUT_MIN_SCROLLS:
        confidence += 0.5
        signals.append(f"Made {recovery_count} recovery attempts")
    elif recovery_count >= 5:
        confidence += 0.3
        signals.append(f"Made {recovery_count} recovery attempts")

    if no_new_count > 0:
        confidence += 0.2
        signals.append(f"Found {no_new_count} 'no new reviews' log entries")

    # Check if reviews stopped growing
    if metrics_history and len(metrics_history) >= 5:
        recent_counts = [
            m.get('reviews_count') for m in metrics_history[-5:]
            if m.get('reviews_count')
        ]
        # A plateau needs at least two readings; a single reading is
        # trivially "all equal" and says nothing about growth.
        if len(recent_counts) >= 2 and len(set(recent_counts)) == 1:
            confidence += 0.2
            signals.append(f"Review count stuck at {recent_counts[0]}")

    description = "; ".join(signals) if signals else "No scroll timeout signals detected"
    return min(confidence, 1.0), description


def _check_element_stale(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for stale element reference pattern.

    Signals: stale-element keywords in the error message, the same keywords
    in the logs, and a high standard deviation of the DOM node count.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error message for stale element indicators (first hit only)
    error_lower = (error_message or '').lower()
    stale_keywords = [
        'stale element', 'staleelement', 'stale_element',
        'element is not attached', 'element reference',
        'no such element', 'element not found',
        'element is no longer valid',
    ]
    keyword = _first_keyword(error_lower, stale_keywords)
    if keyword is not None:
        confidence += 0.6
        signals.append(f"Error contains '{keyword}'")

    # Check logs for stale element patterns (each entry counted at most once)
    stale_log_count = sum(
        1 for log_entry in logs or []
        if _first_keyword(_log_message(log_entry), stale_keywords) is not None
    )
    if stale_log_count > 0:
        confidence += 0.2
        signals.append(f"Found {stale_log_count} stale element references in logs")

    # Check if DOM was changing rapidly (indicates dynamic page)
    if metrics_history and len(metrics_history) >= 3:
        dom_counts = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes')]
        if len(dom_counts) >= 3:
            # Population standard deviation of the DOM node counts
            avg = sum(dom_counts) / len(dom_counts)
            variance = sum((x - avg) ** 2 for x in dom_counts) / len(dom_counts)
            std_dev = variance ** 0.5

            # High variance indicates rapidly changing DOM
            if std_dev > 1000:
                confidence += 0.2
                signals.append(f"High DOM variability (std dev: {std_dev:.0f})")

    description = "; ".join(signals) if signals else "No stale element signals detected"
    return min(confidence, 1.0), description


def analyze_crash(crash_report: Dict) -> CrashAnalysis:
    """
    Analyze a crash report to determine the most likely crash pattern.

    Examines error_message, metrics_history, and logs_before_crash to
    calculate confidence scores for each crash pattern type. Missing keys
    and keys explicitly set to None are both treated as empty.

    Args:
        crash_report: Dictionary containing:
            - error_message: str - The exception message
            - metrics_history: List[Dict] - Sampled metrics with
              timestamp_ms, memory_mb, dom_nodes
            - logs_before_crash: List[Dict] - Recent log entries before
              the crash
            - state: Optional[Dict] - Scraper state (reviews_extracted,
              scroll_count, etc.)
            - crash_type: Optional[str] - Basic crash classification from
              classify_crash()

    Returns:
        CrashAnalysis with the highest-confidence pattern match. When the
        best score is below 0.2 the basic crash_type is mapped to a pattern
        with confidence 0.3, or the pattern is "unknown" with confidence 0.0.
    """
    # Extract data from crash report, normalizing None to empty values
    error_message = crash_report.get('error_message') or ''
    if not isinstance(error_message, str):
        error_message = str(error_message)
    metrics_history = crash_report.get('metrics_history') or []
    logs = crash_report.get('logs_before_crash') or []
    state = crash_report.get('state') or {}
    basic_type = crash_report.get('crash_type') or 'unknown'

    # Run all pattern checks. Insertion order matters: on a tie, max()
    # returns the first pattern listed here.
    pattern_results = {
        'memory_exhaustion': _check_memory_exhaustion(error_message, metrics_history, logs),
        'dom_bloat': _check_dom_bloat(error_message, metrics_history, logs),
        'rate_limited': _check_rate_limited(error_message, metrics_history, logs),
        'consent_loop': _check_consent_loop(error_message, metrics_history, logs),
        'scroll_timeout': _check_scroll_timeout(error_message, metrics_history, logs, state),
        'element_stale': _check_element_stale(error_message, metrics_history, logs),
    }

    # Find the pattern with highest confidence
    pattern_name, (confidence, description) = max(
        pattern_results.items(), key=lambda item: item[1][0]
    )

    # If confidence is too low, fall back to basic classification
    if confidence < 0.2:
        if basic_type in _BASIC_TO_PATTERN:
            pattern_name = _BASIC_TO_PATTERN[basic_type]
            confidence = 0.3  # Low confidence fallback
            description = f"Inferred from basic crash type '{basic_type}'"
        else:
            pattern_name = 'unknown'
            confidence = 0.0
            description = f"Unable to determine crash pattern (basic type: {basic_type})"

    return CrashAnalysis(
        pattern=pattern_name,
        confidence=confidence,
        description=description,
        suggested_fix=_SUGGESTED_FIXES.get(pattern_name, _SUGGESTED_FIXES['unknown']),
        auto_fix_params=_copy_fix_params(pattern_name),
    )


def get_auto_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """
    Get auto-fix parameters for a specific crash pattern.

    Args:
        pattern: The crash pattern name

    Returns:
        A copy of the auto-fix parameters, or None if pattern not recognized
    """
    return _copy_fix_params(pattern)


def apply_auto_fix(pattern: str, current_params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply auto-fix parameters to current scraper parameters.

    Special handling:
        - target_reviews == "current - 10%" reduces max_reviews by 10%
          (defaulting to 1000 when max_reviews is absent).
        - delay_multiplier multiplies scroll_delay (default 1.0).
        - max_reviews / restart_browser_after act as upper bounds: a
          positive value already at or below the fix value is kept, so the
          fix never loosens a stricter setting.
        - every other key is copied over as-is.

    Args:
        pattern: The crash pattern name
        current_params: Current scraper parameters (not modified)

    Returns:
        Updated parameters dictionary with fixes applied; an unchanged copy
        when the pattern has no auto-fix parameters.
    """
    fix_params = AUTO_FIX_PARAMS.get(pattern, {})
    updated = current_params.copy()

    for key, value in fix_params.items():
        if key == 'target_reviews' and value == 'current - 10%':
            # Special case: reduce target by 10%
            current_target = updated.get('max_reviews', 1000)
            updated['max_reviews'] = int(current_target * 0.9)
        elif key == 'delay_multiplier':
            # Multiply existing delay
            current_delay = updated.get('scroll_delay', 1.0)
            updated['scroll_delay'] = current_delay * value
        elif (
            key in _UPPER_BOUND_KEYS
            and _is_positive_number(updated.get(key))
            and updated[key] <= value
        ):
            # Caller is already at least as strict as the fix; keep it.
            continue
        else:
            updated[key] = value

    return updated


def summarize_crash_patterns(crash_reports: List[Dict]) -> Dict[str, Any]:
    """
    Analyze multiple crash reports to identify recurring patterns.

    Args:
        crash_reports: List of crash report dictionaries

    Returns:
        Summary dictionary with:
            - total_crashes: number of reports analyzed
            - patterns: per-pattern count, percentage and avg_confidence
            - most_common: the most frequent pattern (first seen wins ties),
              or None when there are no reports
            - recommendations: auto-fix params for patterns seen at least
              twice, most frequent first
    """
    if not crash_reports:
        return {
            'total_crashes': 0,
            'patterns': {},
            'most_common': None,
            'recommendations': [],
        }

    # Confidence scores grouped by pattern; the count is the list length.
    confidences_by_pattern: Dict[str, List[float]] = {}
    for report in crash_reports:
        analysis = analyze_crash(report)
        confidences_by_pattern.setdefault(analysis.pattern, []).append(analysis.confidence)

    total = len(crash_reports)
    patterns_summary = {
        pattern: {
            'count': len(scores),
            'percentage': len(scores) / total * 100,
            'avg_confidence': sum(scores) / len(scores),
        }
        for pattern, scores in confidences_by_pattern.items()
    }

    # Find most common pattern
    most_common = max(patterns_summary, key=lambda p: patterns_summary[p]['count'])

    # Generate recommendations (only for recurring patterns with an auto-fix)
    recommendations = []
    ranked = sorted(
        patterns_summary.items(), key=lambda item: item[1]['count'], reverse=True
    )
    for pattern, stats in ranked:
        if stats['count'] < 2:
            continue
        fix_params = _copy_fix_params(pattern)
        if fix_params:
            recommendations.append({
                'pattern': pattern,
                'occurrences': stats['count'],
                'auto_fix_params': fix_params,
            })

    return {
        'total_crashes': total,
        'patterns': patterns_summary,
        'most_common': most_common,
        'recommendations': recommendations,
    }