Wave 3: SSE structured logs, crash analyzer, session fingerprint
- Task #3: Update SSE stream to emit structured log events (type: "log" for entries, type: "metrics" every 5s; ?format=legacy for backward compatibility)
- Task #10: Create crash pattern analyzer module with 6 patterns (memory_exhaustion, dom_bloat, rate_limited, consent_loop, scroll_timeout, element_stale), confidence scoring, auto-fix params, and summarize_crash_patterns for recurring issues
- Task #13: Capture session fingerprint in backend (user_agent, platform, timezone, webgl, canvas, bot_detection_tests), saved on success and failure for debugging

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -35,6 +35,214 @@ def get_dom_node_count(driver) -> Optional[int]:
|
||||
return None
|
||||
|
||||
|
||||
# JavaScript probe: unmasked WebGL vendor/renderer strings. Returns an object
# with null fields when WebGL or the debug extension is unavailable.
_WEBGL_INFO_SCRIPT = """
    try {
        var canvas = document.createElement('canvas');
        var gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
        if (gl) {
            var debugInfo = gl.getExtension('WEBGL_debug_renderer_info');
            if (debugInfo) {
                return {
                    vendor: gl.getParameter(debugInfo.UNMASKED_VENDOR_WEBGL),
                    renderer: gl.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL)
                };
            }
        }
    } catch(e) {}
    return {vendor: null, renderer: null};
"""

# JavaScript probe: draws fixed text/shapes on a canvas and returns a simple
# 32-bit rolling hash (hex string) of the resulting data URL, or null on error.
_CANVAS_HASH_SCRIPT = """
    try {
        var canvas = document.createElement('canvas');
        canvas.width = 200;
        canvas.height = 50;
        var ctx = canvas.getContext('2d');
        ctx.textBaseline = 'top';
        ctx.font = '14px Arial';
        ctx.fillStyle = '#f60';
        ctx.fillRect(125, 1, 62, 20);
        ctx.fillStyle = '#069';
        ctx.fillText('Fingerprint', 2, 15);
        ctx.fillStyle = 'rgba(102, 204, 0, 0.7)';
        ctx.fillText('Fingerprint', 4, 17);
        var dataUrl = canvas.toDataURL();
        // Simple hash
        var hash = 0;
        for (var i = 0; i < dataUrl.length; i++) {
            var char = dataUrl.charCodeAt(i);
            hash = ((hash << 5) - hash) + char;
            hash = hash & hash;
        }
        return hash.toString(16);
    } catch(e) {
        return null;
    }
"""

# JavaScript probe: reports whether navigator.permissions.query is present.
_PERMISSIONS_QUERY_SCRIPT = """
    try {
        if (navigator.permissions && navigator.permissions.query) {
            return true;
        }
        return false;
    } catch(e) {
        return false;
    }
"""


def _run_fingerprint_script(driver, script: str):
    """Execute *script* via ``driver.execute_script`` and return its result.

    Fingerprinting is best-effort: any ordinary error from a single probe
    yields ``None`` so the remaining probes still run. Only ``Exception`` is
    caught, so ``KeyboardInterrupt`` / ``SystemExit`` still propagate.
    """
    try:
        return driver.execute_script(script)
    except Exception:
        return None


def capture_session_fingerprint(driver) -> dict:
    """
    Capture browser session fingerprint for bot detection analysis.

    This captures various browser attributes that can be used to:
    1. Verify bot detection evasion is working
    2. Debug issues when scraping fails
    3. Track session characteristics for analysis

    Each attribute is probed independently; a probe that fails leaves its
    field as ``None``. If the initial URL check / navigation fails, no probes
    are run and the error text is stored under ``"capture_error"``.

    Args:
        driver: Selenium WebDriver instance (must be initialized)

    Returns:
        Dictionary containing session fingerprint data
    """
    fingerprint = {
        "user_agent": None,
        "platform": None,
        "language": None,
        "languages": None,
        "timezone": None,
        "screen": {
            "width": None,
            "height": None,
            "colorDepth": None,
        },
        "viewport": {
            "width": None,
            "height": None,
        },
        "webgl_vendor": None,
        "webgl_renderer": None,
        "canvas_fingerprint": None,
        "hardware_concurrency": None,
        "device_memory": None,
        "bot_detection_tests": {
            "webdriver_hidden": None,
            "chrome_runtime": None,
            "permissions_query": None,
        },
        "captured_at": None,
    }

    try:
        # Navigate to about:blank first to ensure we can execute JS
        # (in case driver was just created and hasn't navigated yet)
        current_url = driver.current_url
        if not current_url or current_url == "data:,":
            driver.get("about:blank")
    except Exception as e:
        # Without a usable page there is nothing to probe; report and bail out.
        fingerprint["capture_error"] = str(e)
        return fingerprint

    fingerprint["captured_at"] = datetime.now().isoformat()

    # Basic navigator properties and hardware info (one JS expression each).
    top_level_probes = (
        ("user_agent", "return navigator.userAgent"),
        ("platform", "return navigator.platform"),
        ("language", "return navigator.language"),
        ("languages", "return navigator.languages"),
        ("timezone", "return Intl.DateTimeFormat().resolvedOptions().timeZone"),
        ("hardware_concurrency", "return navigator.hardwareConcurrency"),
        ("device_memory", "return navigator.deviceMemory"),
    )
    for key, script in top_level_probes:
        fingerprint[key] = _run_fingerprint_script(driver, script)

    # Screen and viewport properties live in nested dicts.
    nested_probes = (
        ("screen", "width", "return screen.width"),
        ("screen", "height", "return screen.height"),
        ("screen", "colorDepth", "return screen.colorDepth"),
        ("viewport", "width", "return window.innerWidth"),
        ("viewport", "height", "return window.innerHeight"),
        # Test 1: webdriver property should be hidden/false for undetected Chrome
        (
            "bot_detection_tests",
            "webdriver_hidden",
            "return navigator.webdriver === undefined || navigator.webdriver === false",
        ),
        # Test 2: chrome runtime should exist in real Chrome
        (
            "bot_detection_tests",
            "chrome_runtime",
            "return typeof window.chrome !== 'undefined'",
        ),
        # Test 3: permissions.query should work in real Chrome
        ("bot_detection_tests", "permissions_query", _PERMISSIONS_QUERY_SCRIPT),
    )
    for section, key, script in nested_probes:
        fingerprint[section][key] = _run_fingerprint_script(driver, script)

    # WebGL vendor and renderer (important for fingerprinting). The script
    # normally returns a dict; anything else is treated as "unavailable".
    webgl_info = _run_fingerprint_script(driver, _WEBGL_INFO_SCRIPT)
    if isinstance(webgl_info, dict):
        fingerprint["webgl_vendor"] = webgl_info.get("vendor")
        fingerprint["webgl_renderer"] = webgl_info.get("renderer")

    # Canvas fingerprint (hash of canvas drawing)
    fingerprint["canvas_fingerprint"] = _run_fingerprint_script(
        driver, _CANVAS_HASH_SCRIPT
    )

    return fingerprint
|
||||
|
||||
|
||||
def classify_crash(exception: Exception, metrics_history: list) -> str:
|
||||
"""Classify crash type based on exception and metrics."""
|
||||
error_str = str(exception).lower()
|
||||
@@ -519,6 +727,16 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# Use provided log_capture or create a dummy that just prints
|
||||
log = log_capture or LogCapture()
|
||||
|
||||
# Capture session fingerprint early (before navigation) for bot detection analysis
|
||||
session_fingerprint = capture_session_fingerprint(driver)
|
||||
log.info('browser', "Session fingerprint captured", metrics={
|
||||
'user_agent': session_fingerprint.get('user_agent', 'unknown')[:50] + '...' if session_fingerprint.get('user_agent') else 'unknown',
|
||||
'platform': session_fingerprint.get('platform'),
|
||||
'timezone': session_fingerprint.get('timezone'),
|
||||
'webdriver_hidden': session_fingerprint.get('bot_detection_tests', {}).get('webdriver_hidden'),
|
||||
'chrome_runtime': session_fingerprint.get('bot_detection_tests', {}).get('chrome_runtime')
|
||||
})
|
||||
|
||||
# Storage - use review ID as key
|
||||
reviews = {} # review_id -> review
|
||||
seen_ids = set() # Track all IDs we've seen (persists after flush)
|
||||
@@ -946,11 +1164,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
"category": business_info.get("category"),
|
||||
"address": business_info.get("address"),
|
||||
"total_reviews": total_reviews[0]
|
||||
}
|
||||
},
|
||||
"session_fingerprint": session_fingerprint # Browser fingerprint for bot detection analysis
|
||||
}
|
||||
|
||||
if not scroll_container:
|
||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found", "session_fingerprint": session_fingerprint}
|
||||
|
||||
# Extract review topics after reviews tab is loaded (before scrolling begins)
|
||||
time.sleep(0.5) # Brief wait for topic filters to render
|
||||
@@ -1408,7 +1627,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
"logs": log.get_logs(),
|
||||
"review_topics": review_topics, # Topic filters with mention counts
|
||||
"metrics_history": metrics_history, # For crash detection
|
||||
"start_time": start_time # For crash report elapsed time
|
||||
"start_time": start_time, # For crash report elapsed time
|
||||
"session_fingerprint": session_fingerprint # Browser fingerprint for bot detection analysis
|
||||
}
|
||||
|
||||
|
||||
@@ -1544,7 +1764,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
"success": True,
|
||||
"error": None,
|
||||
"logs": result.get("logs", []),
|
||||
"review_topics": result.get("review_topics", []) # Topic filters with mention counts
|
||||
"review_topics": result.get("review_topics", []), # Topic filters with mention counts
|
||||
"session_fingerprint": result.get("session_fingerprint") # Browser fingerprint for bot detection
|
||||
}
|
||||
|
||||
# Include validation_info if in validation_only mode
|
||||
|
||||
Reference in New Issue
Block a user