Optimize scraper performance and add fallback selectors for robustness
Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -7,10 +7,12 @@ Google's internal API responses for faster, more reliable data extraction.
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
@@ -209,38 +211,62 @@ class GoogleMapsAPIInterceptor:
|
||||
intercept_script = """
|
||||
(function() {
|
||||
// Skip if already injected
|
||||
if (window.__reviewInterceptorInjected) return;
|
||||
if (window.__reviewInterceptorInjected) {
|
||||
console.log('[API Interceptor] Already injected, skipping');
|
||||
return;
|
||||
}
|
||||
window.__reviewInterceptorInjected = true;
|
||||
window.__interceptedResponses = [];
|
||||
window.__interceptorStats = {
|
||||
totalFetch: 0,
|
||||
totalXHR: 0,
|
||||
capturedFetch: 0,
|
||||
capturedXHR: 0,
|
||||
lastCapture: null
|
||||
};
|
||||
|
||||
console.log('[API Interceptor] Initializing...');
|
||||
|
||||
// Store original fetch
|
||||
const originalFetch = window.fetch;
|
||||
|
||||
// Override fetch
|
||||
window.fetch = async function(...args) {
|
||||
const response = await originalFetch.apply(this, args);
|
||||
window.__interceptorStats.totalFetch++;
|
||||
const url = args[0].toString();
|
||||
|
||||
// Log ALL fetch requests for debugging
|
||||
console.debug('[API Interceptor] FETCH:', url.substring(0, 150));
|
||||
|
||||
const response = await originalFetch.apply(this, args);
|
||||
|
||||
// Check if this is a review-related API call
|
||||
if (url.includes('review') || url.includes('batchexecute') ||
|
||||
url.includes('place') || url.includes('maps')) {
|
||||
url.includes('place') || url.includes('maps') ||
|
||||
url.includes('listugcposts') || url.includes('getreviews')) {
|
||||
try {
|
||||
const clone = response.clone();
|
||||
const text = await clone.text();
|
||||
|
||||
console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);
|
||||
|
||||
window.__interceptedResponses.push({
|
||||
url: url,
|
||||
body: text,
|
||||
timestamp: Date.now(),
|
||||
type: 'fetch'
|
||||
type: 'fetch',
|
||||
size: text.length
|
||||
});
|
||||
|
||||
window.__interceptorStats.capturedFetch++;
|
||||
window.__interceptorStats.lastCapture = new Date().toISOString();
|
||||
|
||||
// Keep only last 100 responses to avoid memory issues
|
||||
if (window.__interceptedResponses.length > 100) {
|
||||
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||
}
|
||||
} catch (e) {
|
||||
console.debug('Response capture error:', e);
|
||||
console.error('[API Interceptor] Response capture error:', e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -259,25 +285,35 @@ class GoogleMapsAPIInterceptor:
|
||||
|
||||
xhr.open = function(method, url, ...rest) {
|
||||
requestUrl = url;
|
||||
window.__interceptorStats.totalXHR++;
|
||||
console.debug('[API Interceptor] XHR:', method, url.substring(0, 150));
|
||||
return originalOpen.apply(this, [method, url, ...rest]);
|
||||
};
|
||||
|
||||
xhr.addEventListener('load', function() {
|
||||
if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
|
||||
requestUrl.includes('place') || requestUrl.includes('maps')) {
|
||||
requestUrl.includes('place') || requestUrl.includes('maps') ||
|
||||
requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
|
||||
try {
|
||||
console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);
|
||||
|
||||
window.__interceptedResponses.push({
|
||||
url: requestUrl,
|
||||
body: xhr.responseText,
|
||||
timestamp: Date.now(),
|
||||
type: 'xhr'
|
||||
type: 'xhr',
|
||||
status: xhr.status,
|
||||
size: xhr.responseText.length
|
||||
});
|
||||
|
||||
window.__interceptorStats.capturedXHR++;
|
||||
window.__interceptorStats.lastCapture = new Date().toISOString();
|
||||
|
||||
if (window.__interceptedResponses.length > 100) {
|
||||
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||
}
|
||||
} catch (e) {
|
||||
console.debug('XHR capture error:', e);
|
||||
console.error('[API Interceptor] XHR capture error:', e);
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -292,14 +328,30 @@ class GoogleMapsAPIInterceptor:
|
||||
} catch (e) {}
|
||||
}
|
||||
|
||||
console.log('Review API interceptor injected');
|
||||
console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');
|
||||
|
||||
// Log stats every 10 seconds
|
||||
setInterval(() => {
|
||||
if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
|
||||
console.log('[API Interceptor] Stats:',
|
||||
'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
|
||||
'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
|
||||
'Queue:', window.__interceptedResponses.length);
|
||||
}
|
||||
}, 10000);
|
||||
|
||||
return true;
|
||||
})();
|
||||
"""
|
||||
|
||||
try:
|
||||
result = self.driver.execute_script(intercept_script)
|
||||
log.info("JavaScript response interceptor injected")
|
||||
log.info("JavaScript response interceptor injected with enhanced debugging")
|
||||
|
||||
# Get initial stats
|
||||
stats = self.get_interceptor_stats()
|
||||
log.debug(f"Interceptor stats: {stats}")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
log.warning(f"Failed to inject interceptor: {e}")
|
||||
@@ -317,11 +369,81 @@ class GoogleMapsAPIInterceptor:
|
||||
return [];
|
||||
"""
|
||||
responses = self.driver.execute_script(script)
|
||||
|
||||
if responses:
|
||||
log.debug(f"Retrieved {len(responses)} intercepted responses from browser")
|
||||
for resp in responses[:3]: # Log first 3 for debugging
|
||||
log.debug(f" - {resp.get('type', '?').upper()}: {resp.get('url', '')[:100]} ({resp.get('size', 0)} bytes)")
|
||||
else:
|
||||
log.debug("No intercepted responses available")
|
||||
|
||||
return responses or []
|
||||
except Exception as e:
|
||||
log.debug(f"Error getting intercepted responses: {e}")
|
||||
return []
|
||||
|
||||
def get_interceptor_stats(self):
|
||||
"""Get statistics from the JavaScript interceptor"""
|
||||
try:
|
||||
script = """
|
||||
if (window.__interceptorStats) {
|
||||
return window.__interceptorStats;
|
||||
}
|
||||
return null;
|
||||
"""
|
||||
stats = self.driver.execute_script(script)
|
||||
return stats
|
||||
except Exception as e:
|
||||
log.debug(f"Error getting interceptor stats: {e}")
|
||||
return None
|
||||
|
||||
def get_browser_console_logs(self):
|
||||
"""Get browser console logs (for debugging)"""
|
||||
try:
|
||||
logs = self.driver.get_log('browser')
|
||||
return logs
|
||||
except Exception as e:
|
||||
log.debug(f"Could not get browser console logs: {e}")
|
||||
return []
|
||||
|
||||
def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
|
||||
"""
|
||||
Dump captured responses to files for debugging.
|
||||
Creates one file per response with metadata and body.
|
||||
"""
|
||||
try:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(exist_ok=True)
|
||||
|
||||
for i, response in enumerate(responses):
|
||||
timestamp = response.get('timestamp', int(time.time() * 1000))
|
||||
url = response.get('url', 'unknown')
|
||||
req_type = response.get('type', 'unknown')
|
||||
|
||||
# Create filename from timestamp and type
|
||||
filename = f"{timestamp}_{req_type}_{i}.json"
|
||||
filepath = output_path / filename
|
||||
|
||||
# Write response with metadata
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
json.dump({
|
||||
'metadata': {
|
||||
'url': url,
|
||||
'type': req_type,
|
||||
'timestamp': timestamp,
|
||||
'size': response.get('size', len(response.get('body', ''))),
|
||||
'status': response.get('status')
|
||||
},
|
||||
'body': response.get('body', '')
|
||||
}, f, indent=2, ensure_ascii=False)
|
||||
|
||||
log.info(f"Dumped {len(responses)} responses to {output_path}")
|
||||
return str(output_path)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error dumping responses to file: {e}")
|
||||
return None
|
||||
|
||||
def _is_review_api(self, url: str) -> bool:
|
||||
"""Check if URL matches review API patterns"""
|
||||
url_lower = url.lower()
|
||||
@@ -381,6 +503,10 @@ class GoogleMapsAPIInterceptor:
|
||||
"""Parse a single response body for review data"""
|
||||
reviews = []
|
||||
|
||||
# Skip empty or HTML responses
|
||||
if not body or body.startswith('<!DOCTYPE') or body.startswith('<html'):
|
||||
return reviews
|
||||
|
||||
# Handle batch execute format (starts with )]}' prefix)
|
||||
if body.startswith(")]}'"):
|
||||
body = body[4:].strip()
|
||||
@@ -394,15 +520,213 @@ class GoogleMapsAPIInterceptor:
|
||||
try:
|
||||
data = json.loads(json_match.group())
|
||||
except:
|
||||
log.debug(f"Failed to parse JSON from response")
|
||||
return reviews
|
||||
else:
|
||||
log.debug(f"No JSON found in response")
|
||||
return reviews
|
||||
|
||||
# Extract reviews from nested structure
|
||||
reviews.extend(self._extract_reviews_recursive(data))
|
||||
# Special handling for listugcposts endpoint
|
||||
if 'listugcposts' in url.lower():
|
||||
reviews.extend(self._parse_listugcposts_response(data))
|
||||
else:
|
||||
# Generic recursive extraction
|
||||
reviews.extend(self._extract_reviews_recursive(data))
|
||||
|
||||
return reviews
|
||||
|
||||
def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
|
||||
"""
|
||||
Parse Google Maps listugcposts API response.
|
||||
|
||||
Structure discovered:
|
||||
data[2] = array of review groups
|
||||
data[2][i] = single review group [review_data, metadata, continuation_token]
|
||||
data[2][i][0] = review data (6-item array containing all review info)
|
||||
"""
|
||||
reviews = []
|
||||
|
||||
try:
|
||||
if not isinstance(data, list) or len(data) < 3:
|
||||
log.debug("Response doesn't match expected structure (not a list or too short)")
|
||||
return reviews
|
||||
|
||||
# data[2] contains the review groups
|
||||
review_groups = data[2]
|
||||
if not isinstance(review_groups, list):
|
||||
log.debug("data[2] is not a list")
|
||||
return reviews
|
||||
|
||||
log.debug(f"Found {len(review_groups)} reviews in data[2]")
|
||||
|
||||
# Each group IS ONE REVIEW
|
||||
for group_idx, group in enumerate(review_groups):
|
||||
if not isinstance(group, list) or len(group) == 0:
|
||||
continue
|
||||
|
||||
# group[0] is the review data array (6 items)
|
||||
review_data = group[0]
|
||||
if not isinstance(review_data, list):
|
||||
continue
|
||||
|
||||
try:
|
||||
review = self._parse_google_review_array(review_data)
|
||||
if review:
|
||||
reviews.append(review)
|
||||
log.debug(f"Parsed review {group_idx}: {review.author} - {review.rating}★")
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing review at group[{group_idx}]: {e}")
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error in _parse_listugcposts_response: {e}")
|
||||
|
||||
return reviews
|
||||
|
||||
def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
|
||||
"""
|
||||
Parse a single review from Google's 6-item array format.
|
||||
|
||||
Discovered structure (review_data is a 6-item array):
|
||||
review_data[0] = Review ID (string)
|
||||
review_data[1][4][5][0] = Author Name
|
||||
review_data[1][4][5][3] = User ID
|
||||
review_data[1][6] = Date Text
|
||||
review_data[2][0][0] = Rating (1-5)
|
||||
review_data[2][15][0][0] = Review Text (original)
|
||||
review_data[2][15][1][0] = Review Text (translated)
|
||||
"""
|
||||
review = InterceptedReview()
|
||||
|
||||
try:
|
||||
# Extract review ID from review_data[0]
|
||||
if len(review_data) > 0 and isinstance(review_data[0], str):
|
||||
review.review_id = review_data[0]
|
||||
|
||||
# Extract author info from review_data[1][4][5]
|
||||
if (len(review_data) > 1 and
|
||||
isinstance(review_data[1], list) and
|
||||
len(review_data[1]) > 4 and
|
||||
isinstance(review_data[1][4], list) and
|
||||
len(review_data[1][4]) > 5 and
|
||||
isinstance(review_data[1][4][5], list)):
|
||||
|
||||
author_info = review_data[1][4][5]
|
||||
|
||||
# Author name at [1][4][5][0]
|
||||
if len(author_info) > 0 and isinstance(author_info[0], str):
|
||||
review.author = author_info[0]
|
||||
|
||||
# Profile picture at [1][4][5][1] (if available)
|
||||
if len(author_info) > 1 and isinstance(author_info[1], str):
|
||||
review.avatar_url = author_info[1]
|
||||
|
||||
# Extract date from review_data[1][6]
|
||||
if (len(review_data) > 1 and
|
||||
isinstance(review_data[1], list) and
|
||||
len(review_data[1]) > 6 and
|
||||
isinstance(review_data[1][6], str)):
|
||||
review.date_text = review_data[1][6]
|
||||
|
||||
# Extract rating from review_data[2][0][0]
|
||||
if (len(review_data) > 2 and
|
||||
isinstance(review_data[2], list) and
|
||||
len(review_data[2]) > 0 and
|
||||
isinstance(review_data[2][0], list) and
|
||||
len(review_data[2][0]) > 0):
|
||||
rating_val = review_data[2][0][0]
|
||||
if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
|
||||
review.rating = float(rating_val)
|
||||
|
||||
# Extract review text from review_data[2][15][0][0]
|
||||
if (len(review_data) > 2 and
|
||||
isinstance(review_data[2], list) and
|
||||
len(review_data[2]) > 15 and
|
||||
isinstance(review_data[2][15], list) and
|
||||
len(review_data[2][15]) > 0 and
|
||||
isinstance(review_data[2][15][0], list) and
|
||||
len(review_data[2][15][0]) > 0):
|
||||
text = review_data[2][15][0][0]
|
||||
if isinstance(text, str):
|
||||
review.text = text
|
||||
|
||||
# Only return if we have minimum required data
|
||||
if review.rating > 0 and (review.author or review.text):
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing Google review array: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
|
||||
"""
|
||||
Parse review from Google's nested array format.
|
||||
Improved version with better field detection.
|
||||
"""
|
||||
review = InterceptedReview()
|
||||
|
||||
try:
|
||||
# Extract review ID (usually a long string in first few elements)
|
||||
for i, item in enumerate(arr[:5]):
|
||||
if isinstance(item, str) and len(item) > 30 and not item.startswith('http'):
|
||||
review.review_id = item
|
||||
break
|
||||
|
||||
# Extract rating (number between 1-5)
|
||||
for item in arr:
|
||||
if isinstance(item, (int, float)) and 1 <= item <= 5:
|
||||
review.rating = float(item)
|
||||
break
|
||||
elif isinstance(item, list):
|
||||
for subitem in item:
|
||||
if isinstance(subitem, (int, float)) and 1 <= subitem <= 5:
|
||||
review.rating = float(subitem)
|
||||
break
|
||||
if review.rating > 0:
|
||||
break
|
||||
|
||||
# Extract review text (long string, not a URL)
|
||||
for item in arr:
|
||||
if isinstance(item, str) and len(item) > 50 and not item.startswith('http'):
|
||||
if not review.review_id or item != review.review_id:
|
||||
review.text = item
|
||||
break
|
||||
|
||||
# Extract author name (shorter string, not ID or text)
|
||||
for item in arr:
|
||||
if isinstance(item, str) and 3 <= len(item) <= 100:
|
||||
if item != review.review_id and item != review.text and not item.startswith('http'):
|
||||
review.author = item
|
||||
break
|
||||
elif isinstance(item, list):
|
||||
for subitem in item:
|
||||
if isinstance(subitem, str) and 3 <= len(subitem) <= 100:
|
||||
if subitem != review.text and not subitem.startswith('http'):
|
||||
review.author = subitem
|
||||
break
|
||||
if review.author:
|
||||
break
|
||||
|
||||
# Extract dates (strings that look like dates)
|
||||
date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
|
||||
for item in arr:
|
||||
if isinstance(item, str):
|
||||
for pattern in date_patterns:
|
||||
if re.search(pattern, item, re.IGNORECASE):
|
||||
review.date_text = item
|
||||
break
|
||||
if review.date_text:
|
||||
break
|
||||
|
||||
# Only return if we have meaningful data
|
||||
if (review.review_id or review.author) and review.rating > 0:
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error in _parse_review_array_v2: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
|
||||
"""Recursively search for review data in nested structures"""
|
||||
reviews = []
|
||||
@@ -410,6 +734,10 @@ class GoogleMapsAPIInterceptor:
|
||||
if depth > 20: # Prevent infinite recursion
|
||||
return reviews
|
||||
|
||||
# Skip if data is already an InterceptedReview object
|
||||
if isinstance(data, InterceptedReview):
|
||||
return [data]
|
||||
|
||||
if isinstance(data, dict):
|
||||
# Check if this looks like a review object
|
||||
review = self._try_parse_review_dict(data)
|
||||
@@ -418,7 +746,8 @@ class GoogleMapsAPIInterceptor:
|
||||
|
||||
# Recurse into dict values
|
||||
for value in data.values():
|
||||
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
|
||||
if not isinstance(value, InterceptedReview):
|
||||
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
|
||||
|
||||
elif isinstance(data, list):
|
||||
# Check if this array looks like a review array
|
||||
@@ -428,7 +757,8 @@ class GoogleMapsAPIInterceptor:
|
||||
|
||||
# Recurse into list items
|
||||
for item in data:
|
||||
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
|
||||
if not isinstance(item, InterceptedReview):
|
||||
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
|
||||
|
||||
return reviews
|
||||
|
||||
|
||||
Reference in New Issue
Block a user