Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions

0
utils/__init__.py Normal file
View File

666
utils/crash_analyzer.py Normal file
View File

@@ -0,0 +1,666 @@
"""
Crash Pattern Analyzer Module
Provides deep analysis of scraper crashes with pattern detection,
confidence scoring, and auto-fix parameter suggestions.
Builds on top of the basic classify_crash function in scrapers/google_reviews/v1_0_0.py (formerly modules/scraper_clean.py)
with more sophisticated pattern matching and multi-signal analysis.
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import re
@dataclass
class CrashAnalysis:
    """
    Outcome of running the crash pattern checks over a single crash report.

    Fields:
        pattern: Name of the crash pattern that was selected
            (e.g. "memory_exhaustion", "dom_bloat", "rate_limited").
        confidence: Score between 0.0 and 1.0 accumulated from the signals found.
        description: Human-readable summary of the signals behind the verdict.
        suggested_fix: Recommended action to avoid the crash next time.
        auto_fix_params: Parameters that can be applied automatically, or None
            when the pattern has no automatic fix.
    """

    # Pattern identifier, e.g. "memory_exhaustion", "dom_bloat", "rate_limited"
    pattern: str
    # Score in the range 0.0 to 1.0
    confidence: float
    # Why the pattern was chosen
    description: str
    # What to do about it
    suggested_fix: str
    # Machine-applicable settings; None when nothing can be applied automatically
    auto_fix_params: Optional[Dict[str, Any]]
# Detection thresholds shared by the pattern checks
MEMORY_EXHAUSTION_THRESHOLD_MB = 1500  # 1.5GB, expressed in MB
MEMORY_GROWTH_RATE_THRESHOLD_MB_S = 10  # MB per second
DOM_BLOAT_THRESHOLD = 50000  # DOM node count
SCROLL_TIMEOUT_MIN_SCROLLS = 10  # Minimum scrolls before considering scroll_timeout

# Settings to apply automatically, keyed by crash pattern name.
# "current - 10%" is a sentinel string that apply_auto_fix() interprets as
# "shrink the current review target by ten percent".
AUTO_FIX_PARAMS = dict(
    memory_exhaustion=dict(max_reviews=500, restart_browser_after=200),
    dom_bloat=dict(scroll_cleanup=True, lazy_load=True),
    rate_limited=dict(delay_multiplier=2.0, use_different_proxy=True),
    consent_loop=dict(skip_consent_retries=True),
    scroll_timeout=dict(reduce_target=True, target_reviews="current - 10%"),
    element_stale=dict(retry_with_fresh_elements=True),
)
def _calculate_memory_growth_rate(metrics_history: List[Dict]) -> Optional[float]:
"""
Calculate memory growth rate in MB/s from metrics history.
Args:
metrics_history: List of metric samples with timestamp_ms and memory_mb
Returns:
Growth rate in MB/s, or None if cannot be calculated
"""
if not metrics_history or len(metrics_history) < 2:
return None
# Filter samples that have valid memory readings
valid_samples = [
m for m in metrics_history
if m.get('memory_mb') is not None and m.get('timestamp_ms') is not None
]
if len(valid_samples) < 2:
return None
# Use first and last valid samples
first = valid_samples[0]
last = valid_samples[-1]
time_delta_s = (last['timestamp_ms'] - first['timestamp_ms']) / 1000
if time_delta_s <= 0:
return None
memory_delta_mb = last['memory_mb'] - first['memory_mb']
return memory_delta_mb / time_delta_s
def _get_max_memory(metrics_history: List[Dict]) -> Optional[int]:
"""Get maximum memory usage from metrics history."""
if not metrics_history:
return None
memories = [m.get('memory_mb') for m in metrics_history if m.get('memory_mb') is not None]
return max(memories) if memories else None
def _get_max_dom_nodes(metrics_history: List[Dict]) -> Optional[int]:
"""Get maximum DOM node count from metrics history."""
if not metrics_history:
return None
nodes = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes') is not None]
return max(nodes) if nodes else None
def _check_memory_exhaustion(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for memory exhaustion pattern.

    Signals and their weights:
        - peak memory at/above the threshold (+0.5) or within 80% of it (+0.3)
        - memory growth rate at/above the growth threshold (+0.3)
        - a memory-related keyword in the error message (+0.2, first hit only)
        - a memory warning in the logs (+0.1, first hit only)

    Args:
        error_message: The exception message (None is treated as empty)
        metrics_history: Sampled metrics with memory_mb / timestamp_ms
        logs: Log entries recorded before the crash (None is treated as empty)

    Returns:
        Tuple of (confidence, description); confidence is capped at 1.0
    """
    confidence = 0.0
    signals = []

    # Check for high memory usage
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None:
        if max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB:
            confidence += 0.5
            signals.append(f"Memory reached {max_memory}MB (threshold: {MEMORY_EXHAUSTION_THRESHOLD_MB}MB)")
        elif max_memory >= MEMORY_EXHAUSTION_THRESHOLD_MB * 0.8:
            confidence += 0.3
            signals.append(f"Memory at {max_memory}MB approaching threshold")

    # Check for rapid memory growth
    growth_rate = _calculate_memory_growth_rate(metrics_history)
    if growth_rate is not None and growth_rate >= MEMORY_GROWTH_RATE_THRESHOLD_MB_S:
        confidence += 0.3
        signals.append(f"Memory growing at {growth_rate:.1f}MB/s (threshold: {MEMORY_GROWTH_RATE_THRESHOLD_MB_S}MB/s)")

    # Check error message for memory-related keywords.
    # Each entry is (label reported in the signal, regex searched in the
    # lower-cased error). 'oom' is matched as a standalone token so that
    # unrelated words such as "zoom" or "room" do not count as a memory signal.
    error_lower = str(error_message or '').lower()
    keyword_patterns = [
        ('memory', r'memory'),
        ('heap', r'heap'),
        ('out of memory', r'out of memory'),
        ('oom', r'(?<![a-z])oom(?![a-z])'),
        ('aw, snap', r'aw, snap'),
        ('status_access_violation', r'status_access_violation'),
    ]
    for keyword, pattern in keyword_patterns:
        if re.search(pattern, error_lower):
            confidence += 0.2
            signals.append(f"Error contains '{keyword}'")
            break

    # Check logs for memory warnings (a missing/None message is treated as empty)
    for log_entry in logs or []:
        msg = str(log_entry.get('message') or '').lower()
        if 'memory' in msg and ('high' in msg or 'warning' in msg or 'exceeded' in msg):
            confidence += 0.1
            signals.append("Memory warning found in logs")
            break

    description = "; ".join(signals) if signals else "No memory exhaustion signals detected"
    return min(confidence, 1.0), description
def _check_dom_bloat(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict]
) -> tuple[float, str]:
    """
    Check for DOM bloat pattern.

    Signals and their weights:
        - peak DOM node count at/above the threshold (+0.6) or within 80% (+0.3)
        - a DOM-related keyword in the error message (+0.2, first hit only)
        - peak memory of at least 800MB (+0.1)
        - a DOM size/cleanup message in the logs (+0.1, first hit only)

    Args:
        error_message: The exception message (None is treated as empty)
        metrics_history: Sampled metrics with dom_nodes / memory_mb
        logs: Log entries recorded before the crash (None is treated as empty)

    Returns:
        Tuple of (confidence, description); confidence is capped at 1.0
    """
    confidence = 0.0
    signals = []

    # "dom" must stand alone (not be flanked by letters); otherwise words such
    # as "random" or "domain" would be counted as DOM evidence.
    dom_token = re.compile(r'(?<![a-z])dom(?![a-z])')

    # Check for high DOM node count
    max_nodes = _get_max_dom_nodes(metrics_history)
    if max_nodes is not None:
        if max_nodes >= DOM_BLOAT_THRESHOLD:
            confidence += 0.6
            signals.append(f"DOM nodes reached {max_nodes} (threshold: {DOM_BLOAT_THRESHOLD})")
        elif max_nodes >= DOM_BLOAT_THRESHOLD * 0.8:
            confidence += 0.3
            signals.append(f"DOM nodes at {max_nodes} approaching threshold")

    # Check error message for DOM-related keywords (first match only)
    error_lower = str(error_message or '').lower()
    dom_keywords = ['dom', 'element', 'node', 'render', 'paint', 'layout']
    for keyword in dom_keywords:
        if keyword == 'dom':
            found = dom_token.search(error_lower) is not None
        else:
            found = keyword in error_lower
        if found:
            confidence += 0.2
            signals.append(f"Error contains '{keyword}'")
            break

    # Check if memory is high too (DOM bloat often causes memory issues)
    max_memory = _get_max_memory(metrics_history)
    if max_memory is not None and max_memory >= 800:  # 800MB
        confidence += 0.1
        signals.append(f"Memory also elevated ({max_memory}MB)")

    # Check logs for DOM-related messages (a missing/None message is treated as empty)
    for log_entry in logs or []:
        msg = str(log_entry.get('message') or '').lower()
        if dom_token.search(msg) and ('large' in msg or 'cleanup' in msg or 'remove' in msg):
            confidence += 0.1
            signals.append("DOM warning found in logs")
            break

    description = "; ".join(signals) if signals else "No DOM bloat signals detected"
    return min(confidence, 1.0), description
def _check_rate_limited(
error_message: str,
metrics_history: List[Dict],
logs: List[Dict]
) -> tuple[float, str]:
"""
Check for rate limiting pattern.
Returns:
Tuple of (confidence, description)
"""
confidence = 0.0
signals = []
# Check error message for rate limit indicators
error_lower = error_message.lower()
if '429' in error_message:
confidence += 0.6
signals.append("HTTP 429 status code in error")
rate_keywords = ['rate limit', 'too many requests', 'unusual traffic', 'captcha', 'blocked']
for keyword in rate_keywords:
if keyword in error_lower:
confidence += 0.4
signals.append(f"Error contains '{keyword}'")
break
# Check logs for rate limiting signals
rate_log_count = 0
for log_entry in logs:
msg = log_entry.get('message', '').lower()
network = log_entry.get('network', {})
status = network.get('status')
if status == 429:
rate_log_count += 1
confidence += 0.2
if 'unusual traffic' in msg or 'rate' in msg or 'blocked' in msg:
rate_log_count += 1
confidence += 0.1
if rate_log_count > 0:
signals.append(f"Found {rate_log_count} rate-limiting indicators in logs")
description = "; ".join(signals) if signals else "No rate limiting signals detected"
return min(confidence, 1.0), description
def _check_consent_loop(
error_message: str,
metrics_history: List[Dict],
logs: List[Dict]
) -> tuple[float, str]:
"""
Check for consent popup loop pattern.
Returns:
Tuple of (confidence, description)
"""
confidence = 0.0
signals = []
# Check error message for consent keywords
error_lower = error_message.lower()
if 'consent' in error_lower:
confidence += 0.3
signals.append("Error mentions consent")
# Count consent-related log entries
consent_count = 0
consent_messages = []
for log_entry in logs:
msg = log_entry.get('message', '').lower()
if 'consent' in msg:
consent_count += 1
consent_messages.append(msg[:50])
# Multiple consent messages indicate a loop
if consent_count >= 3:
confidence += 0.5
signals.append(f"Consent popup appeared {consent_count} times in logs")
elif consent_count >= 2:
confidence += 0.3
signals.append(f"Consent popup appeared {consent_count} times")
elif consent_count == 1:
confidence += 0.1
signals.append("Single consent popup detected")
# Check for timeout after consent handling
if 'timeout' in error_lower and consent_count > 0:
confidence += 0.2
signals.append("Timeout occurred with consent activity")
description = "; ".join(signals) if signals else "No consent loop signals detected"
return min(confidence, 1.0), description
def _check_scroll_timeout(
    error_message: str,
    metrics_history: List[Dict],
    logs: List[Dict],
    state: Optional[Dict] = None
) -> tuple[float, str]:
    """
    Check for scroll timeout pattern (no new reviews after many scrolls).

    Signals and their weights:
        - "timeout" in the error message (+0.2)
        - recovery attempts in the logs: at least SCROLL_TIMEOUT_MIN_SCROLLS
          (+0.5) or at least 5 (+0.3)
        - any "no new" / "stuck" log entry (+0.2)
        - the review count being identical across the recent samples (+0.2)

    Args:
        error_message: The exception message (None is treated as empty)
        metrics_history: Sampled metrics that may carry reviews_count
        logs: Log entries recorded before the crash (None is treated as empty)
        state: Scraper state. Accepted for interface compatibility; no signal
            is currently derived from it.

    Returns:
        Tuple of (confidence, description); confidence is capped at 1.0
    """
    confidence = 0.0
    signals = []

    # Check error for timeout indicators
    error_lower = str(error_message or '').lower()
    if 'timeout' in error_lower:
        confidence += 0.2
        signals.append("Timeout in error message")

    # Count recovery attempts in logs (indicate stuck scrolling)
    recovery_count = 0
    no_new_count = 0
    for log_entry in logs or []:
        msg = str(log_entry.get('message') or '').lower()
        if 'recovery attempt' in msg:
            recovery_count += 1
        if 'no new' in msg or 'stuck' in msg:
            no_new_count += 1

    if recovery_count >= SCROLL_TIMEOUT_MIN_SCROLLS:
        confidence += 0.5
        signals.append(f"Made {recovery_count} recovery attempts")
    elif recovery_count >= 5:
        confidence += 0.3
        signals.append(f"Made {recovery_count} recovery attempts")

    if no_new_count > 0:
        confidence += 0.2
        signals.append(f"Found {no_new_count} 'no new reviews' log entries")

    # Check if the review count plateaued over the last five samples
    if metrics_history and len(metrics_history) >= 5:
        # Samples without a (non-zero) reviews_count are ignored
        recent_counts = [m.get('reviews_count') for m in metrics_history[-5:] if m.get('reviews_count')]
        # A plateau needs at least two readings to compare; a lone reading is
        # trivially "all equal" and must not be reported as stuck.
        if len(recent_counts) >= 2 and len(set(recent_counts)) == 1:
            confidence += 0.2
            signals.append(f"Review count stuck at {recent_counts[0]}")

    description = "; ".join(signals) if signals else "No scroll timeout signals detected"
    return min(confidence, 1.0), description
def _check_element_stale(
error_message: str,
metrics_history: List[Dict],
logs: List[Dict]
) -> tuple[float, str]:
"""
Check for stale element reference pattern.
Returns:
Tuple of (confidence, description)
"""
confidence = 0.0
signals = []
# Check error message for stale element indicators
error_lower = error_message.lower()
stale_keywords = [
'stale element', 'staleelement', 'stale_element',
'element is not attached', 'element reference',
'no such element', 'element not found',
'element is no longer valid'
]
for keyword in stale_keywords:
if keyword in error_lower:
confidence += 0.6
signals.append(f"Error contains '{keyword}'")
break
# Check logs for stale element patterns
stale_log_count = 0
for log_entry in logs:
msg = log_entry.get('message', '').lower()
for keyword in stale_keywords:
if keyword in msg:
stale_log_count += 1
break
if stale_log_count > 0:
confidence += 0.2
signals.append(f"Found {stale_log_count} stale element references in logs")
# Check if DOM was changing rapidly (indicates dynamic page)
if metrics_history and len(metrics_history) >= 3:
dom_counts = [m.get('dom_nodes') for m in metrics_history if m.get('dom_nodes')]
if len(dom_counts) >= 3:
# Calculate variance
avg = sum(dom_counts) / len(dom_counts)
variance = sum((x - avg) ** 2 for x in dom_counts) / len(dom_counts)
std_dev = variance ** 0.5
# High variance indicates rapidly changing DOM
if std_dev > 1000:
confidence += 0.2
signals.append(f"High DOM variability (std dev: {std_dev:.0f})")
description = "; ".join(signals) if signals else "No stale element signals detected"
return min(confidence, 1.0), description
def analyze_crash(crash_report: Dict) -> CrashAnalysis:
    """
    Analyze a crash report to determine the most likely crash pattern.

    Runs every pattern check over error_message, metrics_history and
    logs_before_crash, picks the pattern with the highest confidence and, when
    no pattern reaches 0.2, falls back to a mapping from the basic crash type.

    Args:
        crash_report: Dictionary containing:
            - error_message: str - The exception message
            - metrics_history: List[Dict] - Sampled metrics with timestamp_ms, memory_mb, dom_nodes
            - logs_before_crash: List[Dict] - Recent log entries before the crash
            - state: Optional[Dict] - Scraper state (reviews_extracted, scroll_count, etc.)
            - crash_type: Optional[str] - Basic crash classification from classify_crash()
            Missing keys and keys explicitly set to None are both treated as empty.

    Returns:
        CrashAnalysis with the highest-confidence pattern match, or pattern
        'unknown' with confidence 0.0 when nothing could be determined
    """
    # Normalise inputs once. dict.get(key, default) only applies the default
    # when the key is absent, so a report carrying "error_message": None would
    # otherwise reach the checks as None and fail on .lower().
    report = crash_report or {}
    error_message = str(report.get('error_message') or '')
    metrics_history = report.get('metrics_history') or []
    logs = report.get('logs_before_crash') or []
    state = report.get('state') or {}
    basic_type = report.get('crash_type') or 'unknown'

    # Run all pattern checks; each returns (confidence, description)
    pattern_results = {
        'memory_exhaustion': _check_memory_exhaustion(error_message, metrics_history, logs),
        'dom_bloat': _check_dom_bloat(error_message, metrics_history, logs),
        'rate_limited': _check_rate_limited(error_message, metrics_history, logs),
        'consent_loop': _check_consent_loop(error_message, metrics_history, logs),
        'scroll_timeout': _check_scroll_timeout(error_message, metrics_history, logs, state),
        'element_stale': _check_element_stale(error_message, metrics_history, logs),
    }

    # Find the pattern with highest confidence (ties go to the earliest entry)
    pattern_name, (confidence, description) = max(
        pattern_results.items(), key=lambda item: item[1][0]
    )

    # If confidence is too low, fall back to basic classification
    if confidence < 0.2:
        # Map basic crash types to our patterns
        basic_to_pattern = {
            'memory_exhaustion': 'memory_exhaustion',
            'tab_crash': 'memory_exhaustion',  # Tab crashes often from memory
            'timeout': 'scroll_timeout',
            'element_not_found': 'element_stale',
            'rate_limited': 'rate_limited',
            'network_failure': 'rate_limited',  # Could be blocking
        }
        if basic_type in basic_to_pattern:
            pattern_name = basic_to_pattern[basic_type]
            confidence = 0.3  # Low confidence fallback
            description = f"Inferred from basic crash type '{basic_type}'"
        else:
            pattern_name = 'unknown'
            confidence = 0.0
            description = f"Unable to determine crash pattern (basic type: {basic_type})"

    # Generate suggested fix based on pattern
    suggested_fixes = {
        'memory_exhaustion': (
            "Reduce batch size and restart browser more frequently. "
            "Consider limiting max_reviews to 500 and restarting browser after every 200 reviews."
        ),
        'dom_bloat': (
            "Enable DOM cleanup during scrolling. "
            "Hide processed review cards and remove separator elements to keep DOM light."
        ),
        'rate_limited': (
            "Increase delays between requests and consider rotating proxies. "
            "Double the delay multiplier and switch to a different proxy if available."
        ),
        'consent_loop': (
            "Skip consent handling after initial attempt to avoid infinite loops. "
            "The consent popup may be appearing due to cookie clearing or navigation issues."
        ),
        'scroll_timeout': (
            "The page may have stopped loading new reviews. "
            "Try reducing the target review count by 10% and accepting partial results."
        ),
        'element_stale': (
            "Page elements are being removed/replaced during scraping. "
            "Retry operations with freshly-located elements and add defensive waits."
        ),
        'unknown': (
            "Unable to determine specific crash cause. "
            "Review logs and consider restarting with fresh browser session."
        )
    }

    return CrashAnalysis(
        pattern=pattern_name,
        confidence=confidence,
        description=description,
        suggested_fix=suggested_fixes.get(pattern_name, suggested_fixes['unknown']),
        auto_fix_params=AUTO_FIX_PARAMS.get(pattern_name)
    )
def get_auto_fix_params(pattern: str) -> Optional[Dict[str, Any]]:
    """
    Look up the automatic fix settings registered for a crash pattern.

    Args:
        pattern: The crash pattern name

    Returns:
        The entry of AUTO_FIX_PARAMS for that pattern, or None when the
        pattern has no entry
    """
    if pattern in AUTO_FIX_PARAMS:
        return AUTO_FIX_PARAMS[pattern]
    return None
def apply_auto_fix(pattern: str, current_params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply auto-fix parameters to current scraper parameters.

    The input dictionary is not modified; a shallow copy is updated and
    returned. Most fix entries are copied over verbatim, with three exceptions:
        - 'target_reviews' == 'current - 10%' shrinks 'max_reviews' by 10%
          (starting from 1000 when no limit is set)
        - 'delay_multiplier' multiplies 'scroll_delay' (default 1.0)
        - 'max_reviews' acts as an upper bound: an existing lower limit is kept

    Args:
        pattern: The crash pattern name
        current_params: Current scraper parameters to modify (None is treated
            as an empty dictionary)

    Returns:
        Updated parameters dictionary with fixes applied; unchanged copy when
        the pattern has no registered fix
    """
    fix_params = AUTO_FIX_PARAMS.get(pattern, {})
    updated = dict(current_params or {})

    for key, value in fix_params.items():
        if key == 'target_reviews' and value == 'current - 10%':
            # Special case: reduce target by 10%
            current_target = updated.get('max_reviews', 1000)
            updated['max_reviews'] = int(current_target * 0.9)
        elif key == 'delay_multiplier':
            # Multiply existing delay
            current_delay = updated.get('scroll_delay', 1.0)
            updated['scroll_delay'] = current_delay * value
        elif key == 'max_reviews':
            # The fix is meant to limit the workload. Blindly overwriting would
            # raise the limit for callers already running below the fix value.
            current_limit = updated.get('max_reviews')
            updated['max_reviews'] = value if current_limit is None else min(current_limit, value)
        else:
            updated[key] = value

    return updated
def summarize_crash_patterns(crash_reports: List[Dict]) -> Dict[str, Any]:
    """
    Aggregate the analysis of several crash reports into one overview.

    Every report is run through analyze_crash(); the result lists, per pattern,
    how often it occurred, its share of all crashes and its mean confidence.

    Args:
        crash_reports: List of crash report dictionaries

    Returns:
        Dictionary with 'total_crashes', 'patterns', 'most_common' and
        'recommendations' (auto-fix suggestions for patterns seen at least twice)
    """
    if not crash_reports:
        return {
            'total_crashes': 0,
            'patterns': {},
            'most_common': None,
            'recommendations': []
        }

    total = len(crash_reports)

    # Collect the confidence of every analysis, grouped by detected pattern
    confidences_by_pattern: Dict[str, List[float]] = {}
    for report in crash_reports:
        result = analyze_crash(report)
        confidences_by_pattern.setdefault(result.pattern, []).append(result.confidence)

    # Per-pattern statistics
    patterns_summary = {
        name: {
            'count': len(scores),
            'percentage': len(scores) / total * 100,
            'avg_confidence': sum(scores) / len(scores)
        }
        for name, scores in confidences_by_pattern.items()
    }

    # Most frequent pattern; on a tie the pattern encountered first wins
    most_common = max(patterns_summary, key=lambda name: patterns_summary[name]['count'])

    # Recommend fixes only for recurring patterns that have one, most frequent first
    ranked = sorted(patterns_summary.items(), key=lambda item: item[1]['count'], reverse=True)
    recommendations = []
    for name, stats in ranked:
        if stats['count'] < 2:
            continue
        fix_params = AUTO_FIX_PARAMS.get(name)
        if not fix_params:
            continue
        recommendations.append({
            'pattern': name,
            'occurrences': stats['count'],
            'auto_fix_params': fix_params
        })

    return {
        'total_crashes': total,
        'patterns': patterns_summary,
        'most_common': most_common,
        'recommendations': recommendations
    }

391
utils/date_converter.py Normal file
View File

@@ -0,0 +1,391 @@
"""
Date conversion utilities for Google Maps reviews.
"""
import logging
import re
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
# Logger
log = logging.getLogger("scraper")
def relative_to_datetime(date_str: str, lang: str = "en") -> Optional[datetime]:
    """
    Convert a relative date string to a datetime object.

    The string is parsed with try_parse_date(), first in the requested
    language and then in the other supported ones. parse_relative_date() is
    deliberately not used here: it substitutes a random date when nothing
    matches, which would make an unparseable string indistinguishable from a
    real result and defeat the "None on failure" contract of this function.

    Args:
        date_str: The relative date string (e.g., "2 years ago")
        lang: Language code ("en", "he" or "th")

    Returns:
        datetime object (naive, relative to the current UTC time) or None if
        conversion fails
    """
    if not date_str:
        return None

    try:
        now = datetime.utcnow()
        requested = lang.lower()
        candidates = [requested] + [code for code in ("en", "he", "th") if code != requested]
        for candidate in candidates:
            iso_date = try_parse_date(date_str, candidate, now)
            # try_parse_date hands back the original string when nothing matched
            if iso_date != date_str:
                return datetime.fromisoformat(iso_date)
    except Exception as e:
        # Best-effort conversion: never let a malformed date abort the caller
        log.debug(f"Failed to convert relative date '{date_str}': {e}")

    return None
class DateConverter:
    """Handler for converting string dates to datetime objects in MongoDB"""

    # Fields that should hold datetime objects after conversion
    _DATE_FIELDS = ("created_date", "last_modified_date", "review_date")

    @staticmethod
    def _detect_lang(doc: Dict[str, Any]) -> str:
        """
        Guess the document language from the first key of its 'description'.

        Falls back to "en" when 'description' is missing, empty, or not a
        dictionary (e.g. None or a plain string).
        """
        description = doc.get("description")
        if isinstance(description, dict):
            return next(iter(description), "en")
        return "en"

    @staticmethod
    def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert string dates to datetime objects in a document.

        The document is modified in place and also returned:
            - the raw 'date' field is removed; when 'review_date' is missing
              or empty it is filled from that raw value if it can be parsed
            - 'created_date', 'last_modified_date' and 'review_date' strings
              are parsed as ISO 8601 first and as relative dates second; a
              string that cannot be parsed either way is left unchanged
            - the 'date' field of every entry in 'owner_responses' is removed

        Args:
            doc: MongoDB document with string dates

        Returns:
            Document with string dates converted to datetime objects
        """
        # Remove the original date string field if it exists
        if "date" in doc:
            original_date = doc.pop("date")
            # Try to use the original date to fix review_date if needed
            if not doc.get("review_date"):
                date_obj = relative_to_datetime(original_date, DateConverter._detect_lang(doc))
                if date_obj:
                    doc["review_date"] = date_obj

        # Convert date fields to datetime
        for field in DateConverter._DATE_FIELDS:
            value = doc.get(field)
            if not isinstance(value, str):
                continue
            try:
                # Try to parse as ISO format first ('Z' is rewritten because
                # older fromisoformat() versions reject it)
                doc[field] = datetime.fromisoformat(value.replace('Z', '+00:00'))
            except (ValueError, TypeError):
                # If that fails, try parsing as relative date
                date_obj = relative_to_datetime(value, DateConverter._detect_lang(doc))
                if date_obj:
                    doc[field] = date_obj

        # Handle nested date fields in owner_responses
        owner_responses = doc.get("owner_responses")
        if isinstance(owner_responses, dict):
            for response in owner_responses.values():
                if isinstance(response, dict):
                    # Remove the date string field from owner responses
                    response.pop("date", None)

        return doc

    @staticmethod
    def convert_dates_in_reviews(reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Convert string dates to datetime objects for all reviews.

        Args:
            reviews: Dictionary of review documents

        Returns:
            Reviews with dates converted to datetime objects (same dictionary)
        """
        log.info("Converting string dates to datetime objects...")
        for review_id, review in reviews.items():
            # Replacing the value of an existing key is safe while iterating
            reviews[review_id] = DateConverter.convert_dates_in_document(review)
        return reviews
def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
    """
    Turn a relative review date such as "a week ago" or "לפני 7 שנים" into an
    ISO 8601 datetime string.

    Parsing is attempted in the requested language first and then in every
    other supported language ("en", "he", "th"); see try_parse_date() for the
    formats each language accepts.

    Parameters:
    - date_str (str): the relative date string.
    - lang (str): language code tried first ("en", "he" or "th").
    - now (Optional[datetime]): reference datetime; if None, the current UTC
      time (as a naive datetime) is used.

    Returns:
    A string representing the calculated absolute datetime in ISO 8601 format.
    NOTE: if parsing fails in all supported languages, a random date between
    1 and 365 days before the reference time is returned instead, so callers
    cannot tell a failed parse from a successful one.
    """
    import random

    reference = datetime.utcnow() if now is None else now  # use UTC for consistency

    # Requested language first, then the remaining supported languages
    fallback_langs = [code for code in ("en", "he", "th") if code != lang.lower()]
    for candidate in [lang, *fallback_langs]:
        parsed = try_parse_date(date_str, candidate, reference)
        # try_parse_date echoes its input when it could not parse it
        if parsed != date_str:
            return parsed

    # Nothing matched: fabricate a date somewhere within the last year
    days_back = random.randint(1, 365)
    return (reference - timedelta(days=days_back)).isoformat()
# English: a number (or "a"/"an"), a unit, then "ago"
_EN_RELATIVE_RE = re.compile(
    r'(?P<num>a|an|\d+)\s+(?P<unit>minute|hour|day|week|month|year)s?\s+ago',
    re.IGNORECASE,
)

# Hebrew dual forms, where the number two is fused into the unit word
_HE_DUAL_FORMS = {
    "שעתיים": (2, "hour"),
    "יומיים": (2, "day"),
    "שבועיים": (2, "week"),
    "חודשיים": (2, "month"),
    "שנתיים": (2, "year"),
}

# Hebrew unit words (singular and plural) mapped to English unit names
_HE_UNITS = {
    "דקה": "minute", "דקות": "minute",
    "שעה": "hour", "שעות": "hour",
    "יום": "day", "ימים": "day",
    "שבוע": "week", "שבועות": "week",
    "חודש": "month", "חודשים": "month",
    "שנה": "year", "שנים": "year",
}

# Optional number (digits or the word "one"), then a unit. Longer unit words
# are listed first so a plural is never cut short to its singular prefix.
_HE_RELATIVE_RE = re.compile(
    r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>'
    + "|".join(sorted(_HE_UNITS, key=len, reverse=True))
    + r')'
)

# Thai unit words mapped to English unit names
_TH_UNITS = {
    "นาที": "minute",
    "ชั่วโมง": "hour",
    "วัน": "day",
    "สัปดาห์": "week",
    "เดือน": "month",
    "ปี": "year",
}

# Thai: optional number, a unit, then the "ago" suffix
_TH_RELATIVE_RE = re.compile(
    r'(?P<num>\d+)?\s*(?P<unit>'
    + "|".join(sorted(_TH_UNITS, key=len, reverse=True))
    + r')ที่แล้ว'
)


def _delta_for_unit(num: int, unit: str) -> timedelta:
    """Return the timedelta for *num* units; months and years are approximated."""
    if unit == "minute":
        return timedelta(minutes=num)
    if unit == "hour":
        return timedelta(hours=num)
    if unit == "day":
        return timedelta(days=num)
    if unit == "week":
        return timedelta(weeks=num)
    if unit == "month":
        return timedelta(days=30 * num)  # approximate
    if unit == "year":
        return timedelta(days=365 * num)  # approximate
    raise ValueError(f"Unsupported unit: {unit}")


def try_parse_date(date_str: str, lang: str, now: datetime) -> str:
    """
    Helper function that attempts to parse a date string in a specific language.

    Supported languages are "en", "he" and "th" (case-insensitive). Supported
    units are minutes, hours, days, weeks, months (30 days) and years
    (365 days). For Hebrew the optional "לפני" prefix is stripped and the dual
    forms (e.g. "חודשיים", "שנתיים") are recognised as a count of two.

    Args:
        date_str: The relative date string
        lang: Language code selecting the grammar to try
        now: Reference datetime the parsed offset is subtracted from

    Returns:
        The ISO formatted date if successful, or the original string if the
        text does not match the language's pattern or the language is unknown.
    """
    lang_code = lang.lower()
    parsed = None  # becomes (count, english_unit_name) once something matches

    if lang_code == "en":
        m = _EN_RELATIVE_RE.search(date_str)
        if m:
            token = m.group("num").lower()
            count = 1 if token in ("a", "an") else int(token)
            parsed = (count, m.group("unit").lower())

    elif lang_code == "he":
        # Remove the "לפני" prefix if present
        text = date_str.strip()
        if text.startswith("לפני"):
            text = text[len("לפני"):].strip()

        # Dual forms are checked first: "חודשיים" starts with "חודש" and would
        # otherwise be read by the generic pattern as a single month.
        parsed = next(
            (value for form, value in _HE_DUAL_FORMS.items() if form in text),
            None,
        )
        if parsed is None:
            m = _HE_RELATIVE_RE.search(text)
            if m:
                token = m.group("num")
                # No number, or the word "one", means a count of 1
                count = int(token) if token and token.isdecimal() else 1
                parsed = (count, _HE_UNITS[m.group("unit")])

    elif lang_code == "th":
        # Thai language patterns (simplified), e.g. "3 วันที่แล้ว" (3 days ago)
        m = _TH_RELATIVE_RE.search(date_str)
        if m:
            token = m.group("num")
            count = int(token) if token else 1
            parsed = (count, _TH_UNITS[m.group("unit")])

    # Return the calculated date if parsing was successful, otherwise the original string
    if parsed is None:
        return date_str
    count, unit = parsed
    return (now - _delta_for_unit(count, unit)).isoformat()
# def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
# """
# Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
# into an ISO formatted datetime string (UTC).
#
# For English, supported formats include:
# - "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
# For Hebrew, supported formats include:
# - "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
# "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
#
# Parameters:
# - date_str (str): the relative date string.
# - lang (str): "en" for English or "he" for Hebrew.
# - now (Optional[datetime]): reference datetime; if None, current local time is used.
#
# Returns:
# A string representing the calculated absolute datetime in ISO 8601 format,
# or the original date_str if parsing fails.
# """
# if now is None:
# now = datetime.utcnow() # use UTC for consistency
#
# delta = timedelta(0)
#
# if lang.lower() == "en":
# # Pattern: capture number or "a"/"an", then unit.
# pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
# m = pattern.search(date_str)
# if m:
# num_str = m.group("num").lower()
# num = 1 if num_str in ("a", "an") else int(num_str)
# unit = m.group("unit").lower()
# if unit == "day":
# delta = timedelta(days=num)
# elif unit == "week":
# delta = timedelta(weeks=num)
# elif unit == "month":
# delta = timedelta(days=30 * num) # approximate
# elif unit == "year":
# delta = timedelta(days=365 * num) # approximate
# else:
# return date_str # return original if not matched
# elif lang.lower() == "he":
# # Remove the "לפני" prefix if present
# text = date_str.strip()
# if text.startswith("לפני"):
# text = text[len("לפני"):].strip()
#
# # Handle special cases where the number and unit are combined:
# special = {
# "חודשיים": (2, "month"),
# "שבועיים": (2, "week"),
# "יומיים": (2, "day"),
# }
# if text in special:
# num, unit = special[text]
# else:
# # Match optional number (or assume 1) and then a unit.
# pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
# re.IGNORECASE)
# m = pattern.search(text)
# if m:
# num_str = m.group("num")
# if not num_str:
# num = 1
# else:
# try:
# num = int(num_str)
# except ValueError:
# num = 1
# unit_he = m.group("unit")
# # Map the Hebrew unit (both singular and plural) to English unit names
# if unit_he in ("יום", "ימים"):
# unit = "day"
# elif unit_he in ("שבוע", "שבועות"):
# unit = "week"
# elif unit_he in ("חודש", "חודשים"):
# unit = "month"
# elif unit_he in ("שנה", "שנים"):
# unit = "year"
# else:
# unit = "day" # fallback
# else:
# return date_str # if nothing matches, return original text
#
# if unit == "day":
# delta = timedelta(days=num)
# elif unit == "week":
# delta = timedelta(weeks=num)
# elif unit == "month":
# delta = timedelta(days=30 * num) # approximate
# elif unit == "year":
# delta = timedelta(days=365 * num) # approximate
#
# result = now - delta
# return result.isoformat()
# --- Example usage ---
if __name__ == "__main__":
    # Pin the reference time so the printed output is reproducible
    fixed_now = datetime(2025, 2, 5, 12, 0, 0)

    # (text, language hint) pairs; the first one pairs English text with the
    # "he" hint on purpose, so it only parses via the other-language fallback
    examples = (
        ("a week ago", "he"),
        ("4 weeks ago", "en"),
        ("לפני 7 שנים", "he"),
        ("לפני חודשיים", "he"),
    )

    for text, lang in examples:
        iso_date = parse_relative_date(text, lang, now=fixed_now)
        print(f"Original: {text} ({lang}) => ISO: {iso_date}")

411
utils/health_checks.py Normal file
View File

@@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Smart health check system with canary testing.
Verifies that scraping actually works, not just that services are up.
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
import os
log = logging.getLogger(__name__)
class CanaryMonitor:
    """
    Background canary test monitor.

    Runs actual scraping tests periodically to verify the scraper works.
    This catches issues like:
    - Google Maps page structure changes
    - Broken CSS selectors
    - GDPR consent handling issues
    - Network/proxy problems
    - Chrome/browser issues

    Failure accounting: every failed run increments ``consecutive_failures``
    exactly once, and a fully passing run resets it to zero. Three or more
    consecutive failures trigger ``send_alert``.
    """

    def __init__(
        self,
        db,
        interval_hours: int = 4,
        test_url: Optional[str] = None
    ):
        """
        Initialize canary monitor.

        Args:
            db: Database manager instance. Its awaitable
                ``save_canary_result(...)`` method is called after every run.
            interval_hours: How often to run canary tests
            test_url: Optional test URL. When omitted, the CANARY_TEST_URL
                environment variable is used, falling back to the built-in
                Soho Factory (Vilnius) Google Maps URL.
        """
        self.db = db
        self.interval = timedelta(hours=interval_hours)
        self.test_url = test_url or os.getenv(
            'CANARY_TEST_URL',
            'https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/'
        )
        self.running = False
        self.last_run: Optional[datetime] = None
        self.last_success: Optional[datetime] = None
        self.consecutive_failures = 0
        self.last_result: Optional[Dict[str, Any]] = None
        # True once the run currently in progress has been added to
        # ``consecutive_failures``; prevents counting one failure twice.
        self._failure_counted = False

    def _record_failure(self) -> None:
        """Count the current run as failed (at most once per run)."""
        self.consecutive_failures += 1
        self._failure_counted = True

    async def start(self):
        """
        Start the background canary monitoring.

        Loops until ``stop()`` clears ``self.running``, running one canary
        test per iteration and sleeping ``self.interval`` in between. The
        flag is only re-checked after the sleep finishes.
        """
        self.running = True
        log.info(f"Canary monitor started (interval: {self.interval.total_seconds()/3600:.1f}h)")

        while self.running:
            self._failure_counted = False
            try:
                await self.run_canary_test()
            except Exception as e:
                log.error(f"Canary test failed with exception: {e}")
                # run_canary_test() counts the failure itself before it
                # re-raises. Only count here when the error escaped before
                # that happened (e.g. the scraper import failed), so that a
                # single failed run is never counted twice.
                if not self._failure_counted:
                    self._record_failure()

                # Alert if multiple consecutive failures
                if self.consecutive_failures >= 3:
                    await self.send_alert(
                        f"CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row! "
                        f"Last error: {str(e)[:200]}"
                    )

            # Sleep until next run
            await asyncio.sleep(self.interval.total_seconds())

    def stop(self):
        """Stop the background monitoring"""
        self.running = False
        log.info("Canary monitor stopped")

    async def run_canary_test(self):
        """
        Run a single canary test.

        This performs an actual scrape on a known test URL and validates:
        - Scraping succeeds
        - Reviews are extracted
        - Review count is reasonable
        - Scrape time is reasonable
        - Data structure is valid

        The outcome is stored in ``self.last_result`` and persisted through
        ``self.db.save_canary_result``. Validation failures and timeouts are
        handled (and alerted on) here without raising.

        Raises:
            Exception: any other error raised while scraping or validating is
                recorded as a failure and then re-raised so the caller can
                alert on it.
        """
        self._failure_counted = False

        # Resolved at call time rather than when this module is imported.
        from scrapers.google_reviews.v1_0_0 import fast_scrape_reviews

        log.info(f"Running canary scrape test on {self.test_url[:60]}...")
        self.last_run = datetime.now()

        try:
            # Run actual scrape with timeout. The scraper is a blocking call,
            # so it runs in a worker thread.
            result = await asyncio.wait_for(
                asyncio.to_thread(
                    fast_scrape_reviews,
                    url=self.test_url,
                    headless=True,
                    max_scrolls=10  # Limited for canary
                ),
                timeout=60  # Fail if takes > 60s
            )

            # Validate result
            checks = {
                "scrape_succeeded": result['success'],
                "got_reviews": result['count'] > 0,
                "reasonable_count": 10 <= result['count'] <= 500,
                "reasonable_time": result['time'] < 30,
                "data_structure_valid": self._validate_review_structure(result.get('reviews', []))
            }

            if all(checks.values()):
                # Success!
                log.info(
                    f"Canary test PASSED: {result['count']} reviews in {result['time']:.1f}s"
                )
                self.consecutive_failures = 0
                self.last_success = datetime.now()
                self.last_result = {
                    "status": "pass",
                    "reviews_count": result['count'],
                    "scrape_time": result['time'],
                    "checks": checks
                }

                # Save to database
                await self.db.save_canary_result(
                    success=True,
                    reviews_count=result['count'],
                    scrape_time=result['time'],
                    metadata={"checks": checks}
                )
            else:
                # Validation failed
                failed_checks = [k for k, v in checks.items() if not v]
                log.error(
                    f"Canary test FAILED: validation failed on {failed_checks}"
                )
                self._record_failure()
                self.last_result = {
                    "status": "fail",
                    "reviews_count": result['count'],
                    "scrape_time": result['time'],
                    "checks": checks,
                    "failed_checks": failed_checks
                }

                # Save to database
                await self.db.save_canary_result(
                    success=False,
                    reviews_count=result['count'],
                    scrape_time=result['time'],
                    error_message=f"Validation failed: {failed_checks}",
                    metadata={"checks": checks}
                )

                # Alert on failure
                if self.consecutive_failures >= 3:
                    await self.send_alert(
                        f"CRITICAL: Canary validation failed {self.consecutive_failures} times! "
                        f"Failed checks: {failed_checks}"
                    )

        except asyncio.TimeoutError:
            log.error("Canary test TIMEOUT (>60s)")
            self._record_failure()
            self.last_result = {
                "status": "timeout",
                "error": "Scrape took longer than 60 seconds"
            }
            await self.db.save_canary_result(
                success=False,
                error_message="Timeout after 60 seconds"
            )
            if self.consecutive_failures >= 3:
                await self.send_alert(
                    f"CRITICAL: Canary timeout {self.consecutive_failures} times!"
                )

        except Exception as e:
            log.error(f"Canary test ERROR: {e}")
            # Count before touching the database so the failure is recorded
            # even if persisting it raises as well.
            self._record_failure()
            self.last_result = {
                "status": "error",
                "error": str(e)
            }
            await self.db.save_canary_result(
                success=False,
                error_message=str(e)
            )
            raise  # Re-raise to trigger alert in main loop

    def _validate_review_structure(self, reviews) -> bool:
        """
        Validate that reviews have expected structure.

        Only the first review is inspected.

        Args:
            reviews: List of review dictionaries

        Returns:
            True if the list is non-empty and its first review contains the
            'author', 'rating' and 'date_text' fields
        """
        if not reviews:
            return False

        # Check first review has required fields
        first_review = reviews[0]
        required_fields = ['author', 'rating', 'date_text']
        return all(field in first_review for field in required_fields)

    async def send_alert(self, message: str):
        """
        Send alert via configured channels.

        The message is always logged at CRITICAL level. When the
        SLACK_WEBHOOK_URL environment variable is set it is also posted to
        that webhook; a failure to post is logged and never raised.

        Args:
            message: Alert message to send
        """
        log.critical(message)

        # TODO: Integrate with further alerting systems (e.g. email via SMTP,
        # PagerDuty). Only Slack is wired up so far.
        slack_webhook = os.getenv('SLACK_WEBHOOK_URL')
        if slack_webhook:
            try:
                import httpx
                async with httpx.AsyncClient() as client:
                    await client.post(
                        slack_webhook,
                        json={"text": message},
                        timeout=5.0
                    )
                log.info("Alert sent to Slack")
            except Exception as e:
                log.error(f"Failed to send Slack alert: {e}")

    def get_status(self) -> Dict[str, Any]:
        """
        Get current canary status.

        Returns:
            Status dictionary whose "status" key is one of:
            - "unknown": no run has succeeded or failed yet
            - "failing": runs have failed and none has ever succeeded
            - "stale": the last success is more than 6 hours old
            - "healthy": the last success is recent
        """
        if not self.last_success:
            last_run = self.last_run.isoformat() if self.last_run else None
            if self.consecutive_failures > 0:
                # Tests did run but never passed: report that explicitly
                # instead of claiming that nothing has run yet.
                return {
                    "status": "failing",
                    "message": (
                        f"Canary has never succeeded "
                        f"({self.consecutive_failures} consecutive failures)"
                    ),
                    "last_run": last_run,
                    "consecutive_failures": self.consecutive_failures,
                    "last_result": self.last_result
                }
            return {
                "status": "unknown",
                "message": "No canary tests run yet",
                "last_run": last_run
            }

        age = datetime.now() - self.last_success
        max_age = timedelta(hours=6)  # Alert if no success in 6 hours

        if age > max_age:
            return {
                "status": "stale",
                "last_success": self.last_success.isoformat(),
                "age_hours": age.total_seconds() / 3600,
                "consecutive_failures": self.consecutive_failures,
                "message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
            }

        return {
            "status": "healthy",
            "last_success": self.last_success.isoformat(),
            "last_run": self.last_run.isoformat() if self.last_run else None,
            "age_minutes": age.total_seconds() / 60,
            "consecutive_failures": self.consecutive_failures,
            "last_result": self.last_result
        }
class HealthCheckSystem:
    """
    Complete health check system for production.

    Provides multiple levels of health checks:
    - Liveness: Is the server alive?
    - Readiness: Can it handle traffic?
    - Canary: Does scraping actually work?
    """

    def __init__(self, db):
        """
        Initialize health check system.

        Args:
            db: Database manager instance
        """
        self.db = db
        self.canary = CanaryMonitor(db, interval_hours=4)
        # Strong reference to the background canary task. The event loop only
        # keeps weak references to tasks, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._canary_task: Optional[asyncio.Task] = None

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO 8601 string (no offset suffix)."""
        # Same output format as the deprecated datetime.utcnow().isoformat().
        from datetime import timezone
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    async def start(self):
        """
        Start background health monitoring.

        Schedules the canary loop as a background task. Calling this again
        while that task is still running is a no-op.
        """
        if self._canary_task is not None and not self._canary_task.done():
            return
        self._canary_task = asyncio.create_task(self.canary.start())

    def stop(self):
        """
        Stop background health monitoring.

        Clears the canary's running flag and cancels the background task so
        it does not sit in its between-runs sleep until the interval elapses.
        """
        self.canary.stop()
        task, self._canary_task = self._canary_task, None
        if task is not None and not task.done():
            try:
                task.cancel()
            except RuntimeError:
                # The event loop is already closed; nothing left to cancel.
                pass

    async def check_liveness(self) -> Dict[str, Any]:
        """
        Liveness check: Is the server alive?

        This is a simple check that always succeeds if the server is running.
        Used by Kubernetes liveness probe - restart container if fails.

        Returns:
            Liveness status
        """
        return {
            "status": "alive",
            "timestamp": self._utc_timestamp()
        }

    async def check_readiness(self) -> Dict[str, Any]:
        """
        Readiness check: Can the server handle traffic?

        Checks if dependencies are available.
        Used by Kubernetes readiness probe - remove from load balancer if fails.

        Returns:
            Readiness status: "ready" only when every dependency check passed
        """
        checks = {}

        # Check database with a trivial round-trip query
        try:
            await self.db.pool.fetchval("SELECT 1")
            checks["database"] = {"healthy": True}
        except Exception as e:
            checks["database"] = {"healthy": False, "error": str(e)}

        # Overall readiness
        all_healthy = all(c.get("healthy", False) for c in checks.values())

        return {
            "status": "ready" if all_healthy else "not_ready",
            "checks": checks,
            "timestamp": self._utc_timestamp()
        }

    async def check_canary(self) -> Dict[str, Any]:
        """
        Canary check: Does scraping actually work?

        Returns the latest canary test result.
        Used by external monitoring (PagerDuty, DataDog) for alerts.

        Returns:
            Canary status
        """
        return self.canary.get_status()

    async def get_detailed_health(self) -> Dict[str, Any]:
        """
        Get detailed health status of all components.

        Returns:
            Complete health status: "healthy" when the server is alive and
            ready and the canary is healthy (or has not reported yet),
            otherwise "degraded"
        """
        liveness = await self.check_liveness()
        readiness = await self.check_readiness()
        canary = await self.check_canary()

        overall_healthy = (
            liveness["status"] == "alive" and
            readiness["status"] == "ready" and
            canary["status"] in ["healthy", "unknown"]  # Unknown is OK (first run)
        )

        return {
            "status": "healthy" if overall_healthy else "degraded",
            "components": {
                "liveness": liveness,
                "readiness": readiness,
                "canary": canary
            },
            "timestamp": self._utc_timestamp()
        }

307
utils/helpers.py Normal file
View File

@@ -0,0 +1,307 @@
"""
Utility functions for Google Maps Reviews Scraper.
"""
import datetime
import logging
import re
import time
from datetime import timezone
from functools import lru_cache
from typing import List
from selenium.common.exceptions import (NoSuchElementException,
StaleElementReferenceException,
TimeoutException)
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Logger
log = logging.getLogger("scraper")
# Unicode-range matchers used for script-based language detection
HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")


@lru_cache(maxsize=1024)
def detect_lang(txt: str) -> str:
    """
    Guess the language of *txt* from the scripts it contains.

    Returns "he" if any character falls in the HEB_CHARS range, otherwise
    "th" if any falls in the THAI_CHARS range, otherwise "en". Results are
    memoized per input string.
    """
    # Order matters: the Hebrew check takes precedence over the Thai one.
    for matcher, code in ((HEB_CHARS, "he"), (THAI_CHARS, "th")):
        if matcher.search(txt):
            return code
    return "en"
@lru_cache(maxsize=128)
def safe_int(s: str | None) -> int:
"""Safely convert string to integer, returning 0 if not possible"""
m = re.search(r"\d+", s or "")
return int(m.group()) if m else 0
def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
    """
    Look up descendants of *el* by CSS selector without raising.

    With ``all=True`` every match is returned; otherwise the result holds at
    most the first match. A missing or stale element yields an empty list.
    """
    try:
        if not all:
            match = el.find_element(By.CSS_SELECTOR, css)
            return [match] if match else []
        return el.find_elements(By.CSS_SELECTOR, css)
    except (NoSuchElementException, StaleElementReferenceException):
        return []
def first_text(el: WebElement, css: str) -> str:
    """
    Return the stripped text of the first element matching *css* under *el*
    whose text is non-empty, or "" when there is none.

    Elements that have gone stale while being read are skipped.
    """
    for candidate in try_find(el, css, all=True):
        try:
            stripped = candidate.text.strip()
        except StaleElementReferenceException:
            continue
        if stripped:
            return stripped
    return ""
def parse_date_to_iso(date_str: str) -> str:
    """
    Parse relative date strings like "2 weeks ago" into ISO format.

    The result is the current UTC time (truncated to whole seconds) minus the
    stated amount. A missing number ("a week ago") counts as 1; months are
    approximated as 30 days and years as 365 days.

    Absolute dates ("January 2023") and relative strings with an unknown unit
    are not parsed yet: they yield the current UTC time as a best effort.

    Args:
        date_str: Date text to convert

    Returns:
        A timezone-aware ISO 8601 string, or an empty string if *date_str* is
        empty, is not a string, or cannot be converted.
    """
    # Imported locally because the module-level name `datetime` is the
    # datetime *module* (not the class) and `timedelta` is not imported at
    # module level. The previous code called `datetime.now(...)` and
    # `timezone.timedelta(...)`, both of which raise AttributeError, so every
    # non-empty input silently came back as "".
    from datetime import datetime, timedelta, timezone

    if not date_str or not isinstance(date_str, str):
        return ""

    now = datetime.now(timezone.utc).replace(microsecond=0)
    lowered = date_str.lower()

    if "ago" not in lowered:
        # Absolute dates (month/year format) are not handled yet.
        return now.isoformat()

    # Checked in this order; the first unit name found in the text wins.
    units = (
        ("minute", timedelta(minutes=1)),
        ("hour", timedelta(hours=1)),
        ("day", timedelta(days=1)),
        ("week", timedelta(weeks=1)),
        ("month", timedelta(days=30)),   # approximate
        ("year", timedelta(days=365)),   # approximate
    )

    number = re.search(r'\d+', date_str)
    amount = int(number.group()) if number else 1

    try:
        for keyword, step in units:
            if keyword in lowered:
                return (now - amount * step).isoformat()
    except (OverflowError, ValueError):
        # Amount too large to represent as a date
        return ""

    # "ago" without a recognised unit: default to the current time
    return now.isoformat()
def first_attr(el: WebElement, css: str, attr: str) -> str:
    """
    Return the stripped value of attribute *attr* from the first element
    matching *css* under *el* that has a non-empty value, or "" otherwise.

    Elements that have gone stale while being read are skipped.
    """
    for candidate in try_find(el, css, all=True):
        try:
            raw = candidate.get_attribute(attr)
        except StaleElementReferenceException:
            continue
        value = (raw or "").strip()
        if value:
            return value
    return ""
def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
    """
    Click the first clickable element matching a CSS selector, if any.

    Matching elements already on the page are tried in order; if none of
    them can be clicked, the function waits up to *timeout* seconds for one
    to become clickable. Errors are never raised to the caller.

    Args:
        driver: WebDriver instance
        css: CSS selector for the element to click
        delay: Time to wait after clicking (seconds)
        timeout: Maximum time to wait for element (seconds)

    Returns:
        True if element was found and clicked, False otherwise
    """
    try:
        locator = (By.CSS_SELECTOR, css)

        # Nothing matches at all: give up immediately, without waiting.
        candidates = driver.find_elements(*locator)
        if not candidates:
            return False

        # Direct attempt on each match that is visible and enabled
        for candidate in candidates:
            try:
                if not (candidate.is_displayed() and candidate.is_enabled()):
                    continue
                candidate.click()
                time.sleep(delay)
                return True
            except Exception:
                # This one could not be clicked; move on to the next match
                continue

        # Fallback: wait for any match to become clickable
        try:
            clickable = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable(locator)
            )
            clickable.click()
            time.sleep(delay)
        except TimeoutException:
            return False
        return True
    except Exception as exc:
        log.debug(f"Error in click_if: {str(exc)}")
        return False
def get_current_iso_date() -> str:
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    # Local import: the module-level `datetime` name refers to the module.
    from datetime import datetime, timezone

    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
# """
# Utility functions for Google Maps Reviews Scraper.
# """
#
# import re
# import time
# import logging
# from datetime import datetime, timezone
# from functools import lru_cache
# from typing import List, Optional
#
# from selenium.common.exceptions import (NoSuchElementException,
# StaleElementReferenceException,
# TimeoutException)
# from selenium.webdriver import Chrome
# from selenium.webdriver.common.by import By
# from selenium.webdriver.remote.webelement import WebElement
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import WebDriverWait
#
# # Constants for language detection
# HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
# THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
#
# # Logger
# log = logging.getLogger("scraper")
#
#
# @lru_cache(maxsize=1024)
# def detect_lang(txt: str) -> str:
# """Detect language based on character sets"""
# if HEB_CHARS.search(txt): return "he"
# if THAI_CHARS.search(txt): return "th"
# return "en"
#
#
# @lru_cache(maxsize=128)
# def safe_int(s: str | None) -> int:
# """Safely convert string to integer, returning 0 if not possible"""
# m = re.search(r"\d+", s or "")
# return int(m.group()) if m else 0
#
#
# def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
# """Safely find elements by CSS selector without raising exceptions"""
# try:
# if all:
# return el.find_elements(By.CSS_SELECTOR, css)
# obj = el.find_element(By.CSS_SELECTOR, css)
# return [obj] if obj else []
# except (NoSuchElementException, StaleElementReferenceException):
# return []
#
#
# def first_text(el: WebElement, css: str) -> str:
# """Get text from the first matching element that has non-empty text"""
# for e in try_find(el, css, all=True):
# if (t := e.text.strip()):
# return t
# return ""
#
#
# def first_attr(el: WebElement, css: str, attr: str) -> str:
# """Get attribute value from the first matching element that has a non-empty value"""
# for e in try_find(el, css, all=True):
# if (v := (e.get_attribute(attr) or "").strip()):
# return v
# return ""
#
#
# def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
# """Click element if it exists and is clickable, with timeout"""
# try:
# WebDriverWait(driver, timeout).until(
# EC.element_to_be_clickable((By.CSS_SELECTOR, css))
# ).click()
# time.sleep(delay)
# return True
# except TimeoutException:
# return False
#
#
# def parse_date_to_iso(date_str: str) -> str:
# """
# Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.
# Returns a best-effort ISO string, or empty string if parsing fails.
# """
# if not date_str:
# return ""
#
# try:
# now = datetime.now(timezone.utc)
#
# # Handle relative dates
# if "ago" in date_str.lower():
# # For simplicity, map to approximate dates
# if "minute" in date_str.lower():
# minutes = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(minutes=minutes)
# elif "hour" in date_str.lower():
# hours = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(hours=hours)
# elif "day" in date_str.lower():
# days = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(days=days)
# elif "week" in date_str.lower():
# weeks = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(weeks=weeks)
# elif "month" in date_str.lower():
# months = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate months as 30 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=30 * months)
# elif "year" in date_str.lower():
# years = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate years as 365 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=365 * years)
# else:
# # Default to current time if can't parse
# dt = now.replace(microsecond=0)
# else:
# # Handle absolute dates (month year format)
# # This is a simplification - would need more robust parsing for production
# dt = now.replace(microsecond=0)
#
# return dt.isoformat()
# except Exception:
# # If parsing fails, return empty string
# return ""
#
#
# def get_current_iso_date() -> str:
# """Return current UTC time in ISO format."""
# return datetime.now(timezone.utc).isoformat()

250
utils/logger.py Normal file
View File

@@ -0,0 +1,250 @@
"""
Structured Logger Module
Provides a thread-safe, structured logging system with JSON-serializable output.
Designed to replace the LogCapture class with enhanced categorization and metrics support.
"""
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Dict, List, Literal, Optional
import threading
import time
LogLevel = Literal['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL']
LogCategory = Literal['scraper', 'browser', 'network', 'system']


@dataclass
class LogEntry:
    """
    A single structured log record.

    The five leading fields are always present; ``metrics``, ``network`` and
    ``snapshot_id`` are optional extras that default to None.
    """

    timestamp: str  # ISO 8601 with Z suffix
    timestamp_ms: int  # Unix milliseconds
    level: LogLevel
    category: LogCategory
    message: str
    metrics: Optional[Dict] = None  # memory_mb, reviews_count, scroll_position, dom_nodes, etc.
    network: Optional[Dict] = None  # url, method, status, size_bytes, duration_ms
    snapshot_id: Optional[str] = None

    def to_dict(self) -> Dict:
        """
        Build a JSON-serializable dictionary for this entry.

        Optional fields appear only when they are not None (an empty dict is
        kept).
        """
        payload: Dict = {
            name: getattr(self, name)
            for name in ('timestamp', 'timestamp_ms', 'level', 'category', 'message')
        }
        for optional in ('metrics', 'network', 'snapshot_id'):
            value = getattr(self, optional)
            if value is not None:
                payload[optional] = value
        return payload
class StructuredLogger:
    """
    In-memory structured logger with categorized entries and automatic pruning.

    All access to the entry list is guarded by a lock.

    Example usage:
        logger = StructuredLogger()
        logger.info('browser', 'Navigating to URL', metrics={'memory_mb': 245})
        logger.warn('network', 'Rate limit detected', network={'status': 429, 'url': '...'})
        logger.error('system', 'Chrome crashed', metrics={'memory_mb': 489, 'dom_nodes': 12000})
    """

    def __init__(self, max_entries: int = 10000):
        """
        Create an empty logger.

        Args:
            max_entries: Maximum number of log entries to retain (default 10000).
                Oldest entries are pruned when limit is exceeded.
        """
        self._max_entries = max_entries
        self._lock = threading.Lock()
        self._entries: List[LogEntry] = []

    def _create_entry(
        self,
        level: LogLevel,
        category: LogCategory,
        message: str,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> LogEntry:
        """Build a LogEntry stamped with the current UTC time."""
        moment = datetime.now(timezone.utc)
        # Millisecond precision with a literal "Z" suffix
        whole_seconds = moment.strftime('%Y-%m-%dT%H:%M:%S.')
        millis = moment.microsecond // 1000
        return LogEntry(
            timestamp=whole_seconds + f'{millis:03d}Z',
            timestamp_ms=int(moment.timestamp() * 1000),
            level=level,
            category=category,
            message=message,
            metrics=metrics,
            network=network,
            snapshot_id=snapshot_id,
        )

    def _add_entry(self, entry: LogEntry) -> None:
        """Append *entry* under the lock, pruning the oldest entries if over the limit."""
        with self._lock:
            self._entries.append(entry)
            if len(self._entries) <= self._max_entries:
                return
            # Drop the oldest 10% in one go so pruning does not run on every add
            drop = max(1, self._max_entries // 10)
            self._entries = self._entries[drop:]

    def _emit(
        self,
        level: LogLevel,
        category: LogCategory,
        message: str,
        metrics: Optional[Dict],
        network: Optional[Dict],
        snapshot_id: Optional[str],
    ) -> None:
        """Create an entry at *level* and store it."""
        self._add_entry(
            self._create_entry(level, category, message, metrics, network, snapshot_id)
        )

    def _select(self, keep) -> List[Dict]:
        """Return dictionaries for the stored entries accepted by *keep*, read under the lock."""
        with self._lock:
            return [item.to_dict() for item in self._entries if keep(item)]

    def debug(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at DEBUG level."""
        self._emit('DEBUG', category, message, metrics, network, snapshot_id)

    def info(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at INFO level."""
        self._emit('INFO', category, message, metrics, network, snapshot_id)

    def warn(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at WARN level."""
        self._emit('WARN', category, message, metrics, network, snapshot_id)

    def error(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at ERROR level."""
        self._emit('ERROR', category, message, metrics, network, snapshot_id)

    def fatal(
        self,
        category: LogCategory,
        message: str,
        *,
        metrics: Optional[Dict] = None,
        network: Optional[Dict] = None,
        snapshot_id: Optional[str] = None,
    ) -> None:
        """Record a message at FATAL level."""
        self._emit('FATAL', category, message, metrics, network, snapshot_id)

    def log(self, message: str, level: str = 'INFO') -> None:
        """
        Backward-compatible log method for legacy code.

        The entry is always filed under the 'system' category.

        Args:
            message: The log message
            level: Log level as string (DEBUG, INFO, WARN, ERROR, FATAL),
                matched case-insensitively; anything else is stored as INFO.
        """
        normalized = level.upper()
        if normalized not in ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'):
            normalized = 'INFO'
        self._add_entry(self._create_entry(normalized, 'system', message))

    def get_logs(self) -> List[Dict]:
        """
        Get all log entries as JSON-serializable dictionaries.

        Returns:
            List of log entry dictionaries, oldest first.
        """
        return self._select(lambda item: True)

    def get_logs_by_category(self, category: LogCategory) -> List[Dict]:
        """
        Get log entries filtered by category.

        Args:
            category: The category to filter by ('scraper', 'browser', 'network', 'system')

        Returns:
            List of log entry dictionaries matching the category.
        """
        return self._select(lambda item: item.category == category)

    def get_logs_by_level(self, level: LogLevel) -> List[Dict]:
        """
        Get log entries filtered by level.

        Args:
            level: The level to filter by ('DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL')

        Returns:
            List of log entry dictionaries matching the level.
        """
        return self._select(lambda item: item.level == level)

    def get_logs_since(self, timestamp_ms: int) -> List[Dict]:
        """
        Get log entries since a specific timestamp.

        Args:
            timestamp_ms: Unix timestamp in milliseconds

        Returns:
            List of log entry dictionaries with timestamp >= timestamp_ms.
        """
        return self._select(lambda item: item.timestamp_ms >= timestamp_ms)

    def clear(self) -> None:
        """Remove every stored log entry."""
        with self._lock:
            self._entries.clear()

    def count(self) -> int:
        """Return how many log entries are currently stored."""
        with self._lock:
            return len(self._entries)

    def __len__(self) -> int:
        """Return how many log entries are currently stored."""
        return self.count()