# whyrating-engine-legacy/utils/crash_analyzer.py

"""
Crash Pattern Analyzer Module

Provides deep analysis of scraper crashes with pattern detection,
confidence scoring, and auto-fix parameter suggestions.

Builds on top of the basic classify_crash function in scraper_clean.py
with more sophisticated pattern matching and multi-signal analysis.
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import re


@dataclass
class CrashAnalysis:
    """
    Outcome of analysing a single crash report.

    Attributes:
        pattern: Name of the identified crash pattern, for example
            "memory_exhaustion", "dom_bloat" or "rate_limited".
        confidence: Score in the range 0.0 to 1.0 built from multiple signals.
        description: Human-readable explanation of the crash cause.
        suggested_fix: Recommended action to keep the crash from recurring.
        auto_fix_params: Parameters that can be applied automatically to
            prevent recurrence; may be None.
    """
    pattern: str  # crash pattern identifier
    confidence: float  # between 0.0 and 1.0
    description: str
    suggested_fix: str
    auto_fix_params: Optional[Dict[str, Any]]


# Detection thresholds shared by the pattern checks
MEMORY_EXHAUSTION_THRESHOLD_MB = 1_500  # memory ceiling: 1.5GB expressed in MB
MEMORY_GROWTH_RATE_THRESHOLD_MB_S = 10  # memory growth ceiling: 10MB per second
DOM_BLOAT_THRESHOLD = 50_000  # DOM size ceiling: 50000 nodes
SCROLL_TIMEOUT_MIN_SCROLLS = 10  # Minimum scrolls before considering scroll_timeout


# Parameters applied automatically for each crash pattern, keyed by pattern name.
# "target_reviews" holds a symbolic value ("current - 10%") rather than a number.
AUTO_FIX_PARAMS: Dict[str, Dict[str, Any]] = {
    "memory_exhaustion": {"max_reviews": 500, "restart_browser_after": 200},
    "dom_bloat": {"scroll_cleanup": True, "lazy_load": True},
    "rate_limited": {"delay_multiplier": 2.0, "use_different_proxy": True},
    "consent_loop": {"skip_consent_retries": True},
    "scroll_timeout": {"reduce_target": True, "target_reviews": "current - 10%"},
    "element_stale": {"retry_with_fresh_elements": True},
}


def _calculate_memory_growth_rate(metrics_history: List[Dict]) -> Optional[float]:
    """
    Calculate memory growth rate in MB/s from metrics history.

    Args:
        metrics_history: List of metric samples with timestamp_ms and memory_mb

    Returns:
        Growth rate in MB/s, or None if cannot be calculated
    """
    if not metrics_history or len(metrics_history) < 2:
        return None

    # Filter samples that have valid memory readings
    valid_samples = [
        m for m in metrics_history
        if m.get('memory_mb') is not None and m.get('timestamp_ms') is not None
    ]

    if len(valid_samples) < 2:
        return None

    # Use first and last valid samples
    first = valid_samples[0]
    last = valid_samples[-1]

    time_delta_s = (last['timestamp_ms'] - first['timestamp_ms']) / 1000
    if time_delta_s <= 0:
        return None

    memory_delta_mb = last['memory_mb'] - first['memory_mb']
    return memory_delta_mb / time_delta_s


def _get_max_memory(metrics_history: List[Dict]) -> Optional[int]:
    """Get maximum memory usage from metrics history."""
    if not metrics_history:
        return None

    memories = [m.get('memory_mb') for m in metrics_history if m.get('memory_mb') is not None]
    return max(memories) if memories else None


def _get_max_dom_nodes(metrics_history: List[Dict]) -> Optional[int]:
    """Get maximum DOM node count from metrics history."""
    if not metrics_history:
        return None

    nodes = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes') is not None]
    return max(nodes) if nodes else None


def _check_memory_exhaustion(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for memory exhaustion pattern.

    Confidence is the sum of the signals below, capped at 1.0:
        - peak memory at or above MEMORY_EXHAUSTION_THRESHOLD_MB (+0.5),
          or at least 80% of it (+0.3)
        - memory growth rate at or above MEMORY_GROWTH_RATE_THRESHOLD_MB_S (+0.3)
        - a memory-related keyword in the error message (+0.2)
        - a memory warning in the logs (+0.1)

    Args:
        error_message: The exception message; None is treated as empty.
        metrics_history: Metric samples handed to the memory helpers.
        logs: Log entries (dicts with an optional 'message'); None is
            treated as no logs.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check for high memory usage
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None:
        if max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB:
            confidence += 0.5
            signals.append(f"Memory reached {max_memory}MB (threshold: {MEMORY_EXHAUSTION_THRESHOLD_MB}MB)")
        elif max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB * 0.8:
            confidence += 0.3
            signals.append(f"Memory at {max_memory}MB approaching threshold")

    # Check for rapid memory growth
    growth_rate = _calculate_memory_growth_rate(metrics_history)
    if growth_rate is not None and growth_rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S:
        confidence += 0.3
        signals.append(f"Memory growing at {growth_rate:.1f}MB/s (threshold: {MEMORY_GROWTH_RATE_THRESHOLD_MB_S}MB/s)")

    # Check error message for memory-related keywords.
    # `or ''` guards against a None message, which has no .lower().
    error_lower = str(error_message or '').lower()
    memory_keywords = ['memory', 'heap', 'out of memory', 'oom', 'aw, snap', 'status_access_violation']
    matched = next((keyword for keyword in memory_keywords if keyword in error_lower), None)
    if matched is not None:
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Check logs for memory warnings (only the first one counts)
    for log_entry in logs or []:
        # A 'message' key holding None must not crash the scan.
        msg = str(log_entry.get('message') or '').lower()
        if 'memory' in msg and ('high' in msg or 'warning' in msg or 'exceeded' in msg):
            confidence += 0.1
            signals.append("Memory warning found in logs")
            break

    description = "; ".join(signals) if signals else "No memory exhaustion signals detected"
    return min(confidence, 1.0), description


def _check_dom_bloat(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for DOM bloat pattern.

    Confidence is the sum of the signals below, capped at 1.0:
        - peak DOM node count at or above DOM_BLOAT_THRESHOLD (+0.6),
          or at least 80% of it (+0.3)
        - a DOM-related keyword in the error message (+0.2)
        - peak memory of 800MB or more (+0.1)
        - a DOM warning in the logs (+0.1)

    Args:
        error_message: The exception message; None is treated as empty.
        metrics_history: Metric samples handed to the DOM/memory helpers.
        logs: Log entries (dicts with an optional 'message'); None is
            treated as no logs.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check for high DOM node count
    max_nodes = _get_max_dom_nodes(metrics_history)
    if max_nodes is not None:
        if max_nodes >= DOM_BLOAT_THRESHOLD:
            confidence += 0.6
            signals.append(f"DOM nodes reached {max_nodes} (threshold: {DOM_BLOAT_THRESHOLD})")
        elif max_nodes >= DOM_BLOAT_THRESHOLD * 0.8:
            confidence += 0.3
            signals.append(f"DOM nodes at {max_nodes} approaching threshold")

    # Check error message for DOM-related keywords.
    # `or ''` guards against a None message, which has no .lower().
    error_lower = str(error_message or '').lower()
    dom_keywords = ['dom', 'element', 'node', 'render', 'paint', 'layout']
    matched = next((keyword for keyword in dom_keywords if keyword in error_lower), None)
    if matched is not None:
        confidence += 0.2
        signals.append(f"Error contains '{matched}'")

    # Check if memory is high too (DOM bloat often causes memory issues)
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None and max_memory >= 800:  # 800MB
        confidence += 0.1
        signals.append(f"Memory also elevated ({max_memory}MB)")

    # Check logs for DOM-related messages (only the first one counts)
    for log_entry in logs or []:
        # A 'message' key holding None must not crash the scan.
        msg = str(log_entry.get('message') or '').lower()
        if 'dom' in msg and ('large' in msg or 'cleanup' in msg or 'remove' in msg):
            confidence += 0.1
            signals.append("DOM warning found in logs")
            break

    description = "; ".join(signals) if signals else "No DOM bloat signals detected"
    return min(confidence, 1.0), description


def _check_rate_limited(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for rate limiting pattern.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error message for rate limit indicators
    error_lower = error_message.lower()
    if '429' in error_message:
        confidence += 0.6
        signals.append("HTTP 429 status code in error")

    rate_keywords = ['rate limit', 'too many requests', 'unusual traffic', 'captcha', 'blocked']
    for keyword in rate_keywords:
        if keyword in error_lower:
            confidence += 0.4
            signals.append(f"Error contains '{keyword}'")
            break

    # Check logs for rate limiting signals
    rate_log_count = 0
    for log_entry in logs:
        msg = log_entry.get('message', '').lower()
        network = log_entry.get('network', {})
        status = network.get('status')

        if status == 429:
            rate_log_count += 1
            confidence += 0.2

        if 'unusual traffic' in msg or 'rate' in msg or 'blocked' in msg:
            rate_log_count += 1
            confidence += 0.1

    if rate_log_count > 0:
        signals.append(f"Found {rate_log_count} rate-limiting indicators in logs")

    description = "; ".join(signals) if signals else "No rate limiting signals detected"
    return min(confidence, 1.0), description


def _check_consent_loop(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for consent popup loop pattern.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error message for consent keywords
    error_lower = error_message.lower()
    if 'consent' in error_lower:
        confidence += 0.3
        signals.append("Error mentions consent")

    # Count consent-related log entries
    consent_count = 0
    consent_messages = []
    for log_entry in logs:
        msg = log_entry.get('message', '').lower()
        if 'consent' in msg:
            consent_count += 1
            consent_messages.append(msg[:50])

    # Multiple consent messages indicate a loop
    if consent_count >= 3:
        confidence += 0.5
        signals.append(f"Consent popup appeared {consent_count} times in logs")
    elif consent_count >= 2:
        confidence += 0.3
        signals.append(f"Consent popup appeared {consent_count} times")
    elif consent_count == 1:
        confidence += 0.1
        signals.append("Single consent popup detected")

    # Check for timeout after consent handling
    if 'timeout' in error_lower and consent_count > 0:
        confidence += 0.2
        signals.append("Timeout occurred with consent activity")

    description = "; ".join(signals) if signals else "No consent loop signals detected"
    return min(confidence, 1.0), description


def _check_scroll_timeout(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict],
    state: Optional[Dict] = None
) -> tuple[float, str]:
    """
    Check for scroll timeout pattern (no new reviews after many scrolls).

    Confidence is the sum of the signals below, capped at 1.0:
        - the error message mentions a timeout (+0.2)
        - 'recovery attempt' log entries: SCROLL_TIMEOUT_MIN_SCROLLS or
          more (+0.5), five or more (+0.3)
        - any 'no new' / 'stuck' log entry (+0.2)
        - the review count is identical across the usable samples among
          the last five metrics (+0.2)

    Args:
        error_message: The exception message; None is treated as empty.
        metrics_history: Metric samples, optionally carrying 'reviews_count'.
        logs: Log entries (dicts with an optional 'message'); None is
            treated as no logs.
        state: Optional scraper state. Accepted for interface
            compatibility; no signal currently reads it.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error for timeout indicators.
    # `or ''` guards against a None message, which has no .lower().
    error_lower = str(error_message or '').lower()
    if 'timeout' in error_lower:
        confidence += 0.2
        signals.append("Timeout in error message")

    # Count recovery attempts in logs (indicate stuck scrolling)
    recovery_count = 0
    no_new_count = 0
    for log_entry in logs or []:
        # A 'message' key holding None must not crash the scan.
        msg = str(log_entry.get('message') or '').lower()
        if 'recovery attempt' in msg:
            recovery_count += 1
        if 'no new' in msg or 'stuck' in msg:
            no_new_count += 1

    if recovery_count >= SCROLL_TIMEOUT_MIN_SCROLLS:
        confidence += 0.5
        signals.append(f"Made {recovery_count} recovery attempts")
    elif recovery_count >= 5:
        confidence += 0.3
        signals.append(f"Made {recovery_count} recovery attempts")

    if no_new_count > 0:
        confidence += 0.2
        signals.append(f"Found {no_new_count} 'no new reviews' log entries")

    # Check if reviews stopped growing
    if metrics_history and len(metrics_history) >= 5:
        # Samples without a (non-zero) review count are ignored.
        recent_counts = [
            m.get('reviews_count') for m in metrics_history[-5:] if m.get('reviews_count')
        ]
        # A plateau needs at least two observations: a single sample is
        # trivially "constant" and says nothing about growth.
        if len(recent_counts) >= 2 and len(set(recent_counts)) == 1:
            confidence += 0.2
            signals.append(f"Review count stuck at {recent_counts[0]}")

    description = "; ".join(signals) if signals else "No scroll timeout signals detected"
    return min(confidence, 1.0), description


def _check_element_stale(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for stale element reference pattern.

    Returns:
        Tuple of (confidence, description)
    """
    confidence = 0.0
    signals = []

    # Check error message for stale element indicators
    error_lower = error_message.lower()
    stale_keywords = [
        'stale element', 'staleelement', 'stale_element',
        'element is not attached', 'element reference',
        'no such element', 'element not found',
        'element is no longer valid'
    ]

    for keyword in stale_keywords:
        if keyword in error_lower:
            confidence += 0.6
            signals.append(f"Error contains '{keyword}'")
            break

    # Check logs for stale element patterns
    stale_log_count = 0
    for log_entry in logs:
        msg = log_entry.get('message', '').lower()
        for keyword in stale_keywords:
            if keyword in msg:
                stale_log_count += 1
                break

    if stale_log_count > 0:
        confidence += 0.2
        signals.append(f"Found {stale_log_count} stale element references in logs")

    # Check if DOM was changing rapidly (indicates dynamic page)
    if metrics_history and len(metrics_history) >= 3:
        dom_counts = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes')]
        if len(dom_counts) >= 3:
            # Calculate variance
            avg = sum(dom_counts) / len(dom_counts)
            variance = sum((x - avg) ** 2 for x in dom_counts) / len(dom_counts)
            std_dev = variance ** 0.5
            # High variance indicates rapidly changing DOM
            if std_dev > 1000:
                confidence += 0.2
                signals.append(f"High DOM variability (std dev: {std_dev:.0f})")

    description = "; ".join(signals) if signals else "No stale element signals detected"
    return min(confidence, 1.0), description


# Maps basic crash types (from classify_crash()) onto the patterns used here.
# Consulted only when no pattern check reaches the minimum confidence.
_BASIC_TYPE_TO_PATTERN = {
    'memory_exhaustion': 'memory_exhaustion',
    'tab_crash': 'memory_exhaustion',  # Tab crashes often from memory
    'timeout': 'scroll_timeout',
    'element_not_found': 'element_stale',
    'rate_limited': 'rate_limited',
    'network_failure': 'rate_limited',  # Could be blocking
}

# Human-readable remediation advice per pattern.
_SUGGESTED_FIXES = {
    'memory_exhaustion': (
        "Reduce batch size and restart browser more frequently. "
        "Consider limiting max_reviews to 500 and restarting browser after every 200 reviews."
    ),
    'dom_bloat': (
        "Enable DOM cleanup during scrolling. "
        "Hide processed review cards and remove separator elements to keep DOM light."
    ),
    'rate_limited': (
        "Increase delays between requests and consider rotating proxies. "
        "Double the delay multiplier and switch to a different proxy if available."
    ),
    'consent_loop': (
        "Skip consent handling after initial attempt to avoid infinite loops. "
        "The consent popup may be appearing due to cookie clearing or navigation issues."
    ),
    'scroll_timeout': (
        "The page may have stopped loading new reviews. "
        "Try reducing the target review count by 10% and accepting partial results."
    ),
    'element_stale': (
        "Page elements are being removed/replaced during scraping. "
        "Retry operations with freshly-located elements and add defensive waits."
    ),
    'unknown': (
        "Unable to determine specific crash cause. "
        "Review logs and consider restarting with fresh browser session."
    ),
}


def analyze_crash(crash_report: Dict) -> CrashAnalysis:
    """
    Analyze a crash report to determine the most likely crash pattern.

    Examines error_message, metrics_history, and logs_before_crash to
    calculate confidence scores for each crash pattern type. The pattern
    with the highest score wins (the first one checked wins a tie). When
    the best score is below 0.2, the basic crash type is mapped to a
    pattern with a fixed confidence of 0.3, or the result is 'unknown'
    with confidence 0.0.

    Args:
        crash_report: Dictionary containing:
            - error_message: str - The exception message
            - metrics_history: List[Dict] - Sampled metrics with timestamp_ms, memory_mb, dom_nodes
            - logs_before_crash: List[Dict] - Recent log entries before the crash
            - state: Optional[Dict] - Scraper state (reviews_extracted, scroll_count, etc.)
            - crash_type: Optional[str] - Basic crash classification from classify_crash()
            A key that is missing or holds None falls back to an empty
            value ('unknown' for crash_type).

    Returns:
        CrashAnalysis with the highest-confidence pattern match. Its
        auto_fix_params is a copy, so callers may modify it freely.
    """
    # `or` instead of a .get() default: a key that is present but None
    # must be normalised too, otherwise the checks crash on it.
    error_message = crash_report.get('error_message') or ''
    if not isinstance(error_message, str):
        error_message = str(error_message)
    metrics_history = crash_report.get('metrics_history') or []
    logs = crash_report.get('logs_before_crash') or []
    state = crash_report.get('state') or {}
    basic_type = crash_report.get('crash_type') or 'unknown'

    # Run all pattern checks; insertion order decides ties in max() below.
    pattern_results = {
        'memory_exhaustion': _check_memory_exhaustion(error_message, metrics_history, logs),
        'dom_bloat': _check_dom_bloat(error_message, metrics_history, logs),
        'rate_limited': _check_rate_limited(error_message, metrics_history, logs),
        'consent_loop': _check_consent_loop(error_message, metrics_history, logs),
        'scroll_timeout': _check_scroll_timeout(error_message, metrics_history, logs, state),
        'element_stale': _check_element_stale(error_message, metrics_history, logs),
    }

    # Find the pattern with highest confidence
    pattern_name, (confidence, description) = max(
        pattern_results.items(), key=lambda item: item[1][0]
    )

    # If confidence is too low, fall back to basic classification
    if confidence < 0.2:
        if basic_type in _BASIC_TYPE_TO_PATTERN:
            pattern_name = _BASIC_TYPE_TO_PATTERN[basic_type]
            confidence = 0.3  # Low confidence fallback
            description = f"Inferred from basic crash type '{basic_type}'"
        else:
            pattern_name = 'unknown'
            confidence = 0.0
            description = f"Unable to determine crash pattern (basic type: {basic_type})"

    suggested_fix = _SUGGESTED_FIXES.get(pattern_name, _SUGGESTED_FIXES['unknown'])

    # Hand out a copy so callers cannot mutate the shared module-level defaults.
    fix_params = AUTO_FIX_PARAMS.get(pattern_name)
    auto_fix_params = dict(fix_params) if fix_params is not None else None

    return CrashAnalysis(
        pattern=pattern_name,
        confidence=confidence,
        description=description,
        suggested_fix=suggested_fix,
        auto_fix_params=auto_fix_params
    )


def get_auto_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """
    Get auto-fix parameters for a specific crash pattern.

    Args:
        pattern: The crash pattern name

    Returns:
        A copy of the auto-fix parameters for the pattern, or None if the
        pattern is not recognized. Modifying the returned dictionary does
        not affect the module-level defaults.
    """
    fix_params = AUTO_FIX_PARAMS.get(pattern)
    if fix_params is None:
        return None
    # Copy so that caller-side mutation cannot corrupt the shared table.
    return dict(fix_params)


def apply_auto_fix(pattern: str, current_params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply auto-fix parameters to current scraper parameters.

    Special handling:
        - 'target_reviews' == 'current - 10%' reduces 'max_reviews' by 10%
          (default 1000 when unset); a positive target never drops below 1.
        - 'delay_multiplier' multiplies 'scroll_delay' (default 1.0 when unset).
        - 'max_reviews' and 'restart_browser_after' act as upper bounds: a
          positive existing value that is already stricter is kept rather
          than being raised to the fix value.
        - every other key is copied over as-is.

    Args:
        pattern: The crash pattern name
        current_params: Current scraper parameters; not modified. None is
            treated as an empty dictionary.

    Returns:
        Updated parameters dictionary with fixes applied. An unrecognized
        pattern yields an unchanged copy.
    """
    fix_params = AUTO_FIX_PARAMS.get(pattern, {})
    updated = current_params.copy() if current_params is not None else {}

    # Fix keys that express a limit; applying them must never loosen it.
    capped_keys = ('max_reviews', 'restart_browser_after')

    for key, value in fix_params.items():
        if key == 'target_reviews' and value == 'current - 10%':
            # Special case: reduce target by 10%
            current_target = updated.get('max_reviews')
            if current_target is None:  # missing or explicitly unset
                current_target = 1000
            reduced = int(current_target * 0.9)
            # int() truncates, so a tiny positive target would otherwise become 0.
            if current_target >= 1:
                reduced = max(1, reduced)
            updated['max_reviews'] = reduced
        elif key == 'delay_multiplier':
            # Multiply existing delay
            current_delay = updated.get('scroll_delay')
            if current_delay is None:  # missing or explicitly unset
                current_delay = 1.0
            updated['scroll_delay'] = current_delay * value
        elif key in capped_keys:
            existing = updated.get(key)
            # Only a positive number below the fix value counts as stricter;
            # None, zero and negative values are overwritten as before.
            already_stricter = (
                isinstance(existing, (int, float))
                and not isinstance(existing, bool)
                and 0 < existing < value
            )
            if not already_stricter:
                updated[key] = value
        else:
            updated[key] = value

    return updated


def summarize_crash_patterns(crash_reports: List[Dict]) -> Dict[str, Any]:
    """
    Analyze multiple crash reports to identify recurring patterns.

    Args:
        crash_reports: List of crash report dictionaries

    Returns:
        Summary dictionary with:
            - total_crashes: number of reports analyzed
            - patterns: per-pattern 'count', 'percentage' and 'avg_confidence'
            - most_common: the most frequent pattern (first seen wins a
              tie), or None when there are no reports
            - recommendations: for patterns seen at least twice that have
              auto-fix parameters, a dict with 'pattern', 'occurrences'
              and a copy of the 'auto_fix_params', most frequent first
    """
    if not crash_reports:
        return {
            'total_crashes': 0,
            'patterns': {},
            'most_common': None,
            'recommendations': []
        }

    total = len(crash_reports)

    # Confidence scores grouped by detected pattern, in first-seen order.
    confidences_by_pattern: Dict[str, List[float]] = {}
    for report in crash_reports:
        analysis = analyze_crash(report)
        confidences_by_pattern.setdefault(analysis.pattern, []).append(analysis.confidence)

    # Count, share of all crashes and average confidence per pattern
    patterns_summary = {
        pattern: {
            'count': len(scores),
            'percentage': len(scores) / total * 100,
            'avg_confidence': sum(scores) / len(scores)
        }
        for pattern, scores in confidences_by_pattern.items()
    }

    # Find most common pattern
    most_common = max(patterns_summary, key=lambda name: patterns_summary[name]['count'])

    # Generate recommendations, most frequent pattern first
    recommendations = []
    ranked = sorted(patterns_summary.items(), key=lambda item: item[1]['count'], reverse=True)
    for pattern, stats in ranked:
        if stats['count'] < 2:  # Only recommend for recurring patterns
            continue
        fix_params = AUTO_FIX_PARAMS.get(pattern)
        if fix_params:
            recommendations.append({
                'pattern': pattern,
                'occurrences': stats['count'],
                # Copy so consumers cannot mutate the shared module-level defaults.
                'auto_fix_params': dict(fix_params)
            })

    return {
        'total_crashes': total,
        'patterns': patterns_summary,
        'most_common': most_common,
        'recommendations': recommendations
    }