Remove old scraper files - consolidate to scraper_clean
Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,923 +0,0 @@
|
||||
"""
|
||||
API Interceptor for Google Maps Reviews.
|
||||
Uses Chrome DevTools Protocol (CDP) to intercept network requests and capture
|
||||
Google's internal API responses for faster, more reliable data extraction.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
log = logging.getLogger("api_interceptor")
|
||||
|
||||
|
||||
@dataclass
class InterceptedReview:
    """One review recovered from an intercepted API response.

    Every field has a neutral default so a parser can create an empty
    instance and fill in only what it manages to locate. Field order is
    part of the positional constructor signature and must not change.
    """

    # Identity and authorship.
    review_id: str = ""
    author: str = ""

    # Star rating; 0.0 means "no rating found".
    rating: float = 0.0

    # Review body and the date string exactly as it appeared in the payload.
    text: str = ""
    date_text: str = ""

    # NOTE(review): no parser in this module assigns `timestamp`; it stays 0.
    timestamp: int = 0
    likes: int = 0

    # A fresh list per instance (never a shared mutable default).
    photos: List[str] = field(default_factory=list)

    # Reviewer profile page and avatar image.
    profile_url: str = ""
    avatar_url: str = ""

    # Business owner's reply, when the payload carries one.
    owner_response: str = ""
    owner_response_date: str = ""

    # NOTE(review): no parser in this module assigns `lang`; it stays "".
    lang: str = ""
|
||||
|
||||
|
||||
class GoogleMapsAPIInterceptor:
|
||||
"""
|
||||
Intercepts Google Maps internal API calls to capture review data directly.
|
||||
|
||||
Google Maps uses several internal endpoints for reviews:
|
||||
- /maps/preview/review/listentitiesreviews - Main reviews endpoint
|
||||
- /maps/rpc/placereview - Alternative review endpoint
|
||||
- /maps/preview/reviewsdata - Review data endpoint
|
||||
|
||||
The responses are often in a custom protobuf-like JSON format that needs parsing.
|
||||
"""
|
||||
|
||||
# Patterns for review-related API endpoints
|
||||
REVIEW_API_PATTERNS = [
|
||||
r'maps/preview/review',
|
||||
r'maps/rpc/placereview',
|
||||
r'maps/preview/reviewsdata',
|
||||
r'maps/preview/place',
|
||||
r'maps/api/place',
|
||||
r'/locationhistory/preview',
|
||||
r'batchexecute.*review',
|
||||
]
|
||||
|
||||
def __init__(self, driver):
|
||||
"""Initialize the interceptor with a Selenium driver"""
|
||||
self.driver = driver
|
||||
self.captured_responses: List[Dict[str, Any]] = []
|
||||
self.captured_reviews: List[InterceptedReview] = []
|
||||
self.request_map: Dict[str, Dict] = {} # Map request IDs to URLs
|
||||
self._lock = threading.Lock()
|
||||
self._listening = False
|
||||
self._response_callback: Optional[Callable] = None
|
||||
|
||||
def setup_interception(self):
|
||||
"""Enable network interception via CDP"""
|
||||
try:
|
||||
# Enable network domain
|
||||
self.driver.execute_cdp_cmd('Network.enable', {})
|
||||
|
||||
# Set up request interception patterns
|
||||
self.driver.execute_cdp_cmd('Network.setRequestInterception', {
|
||||
'patterns': [
|
||||
{'urlPattern': '*maps*review*', 'resourceType': 'XHR'},
|
||||
{'urlPattern': '*maps*review*', 'resourceType': 'Fetch'},
|
||||
{'urlPattern': '*batchexecute*', 'resourceType': 'XHR'},
|
||||
{'urlPattern': '*batchexecute*', 'resourceType': 'Fetch'},
|
||||
]
|
||||
})
|
||||
|
||||
self._listening = True
|
||||
log.info("API interception enabled via CDP")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"Could not enable CDP interception: {e}")
|
||||
# Try alternative approach
|
||||
return self._setup_performance_logging()
|
||||
|
||||
def _setup_performance_logging(self):
|
||||
"""Alternative approach using Performance logging"""
|
||||
try:
|
||||
self.driver.execute_cdp_cmd('Network.enable', {
|
||||
'maxTotalBufferSize': 10000000,
|
||||
'maxResourceBufferSize': 5000000
|
||||
})
|
||||
self._listening = True
|
||||
log.info("API interception enabled via performance logging")
|
||||
return True
|
||||
except Exception as e:
|
||||
log.error(f"Failed to setup performance logging: {e}")
|
||||
return False
|
||||
|
||||
def capture_network_responses(self, duration: float = 5.0):
|
||||
"""
|
||||
Capture network responses for a specified duration.
|
||||
Call this while scrolling/loading more reviews.
|
||||
"""
|
||||
if not self._listening:
|
||||
log.warning("Interception not set up, call setup_interception() first")
|
||||
return []
|
||||
|
||||
captured = []
|
||||
start_time = time.time()
|
||||
|
||||
while time.time() - start_time < duration:
|
||||
try:
|
||||
# Get performance logs which contain network events
|
||||
logs = self.driver.get_log('performance')
|
||||
|
||||
for entry in logs:
|
||||
try:
|
||||
log_data = json.loads(entry['message'])
|
||||
message = log_data.get('message', {})
|
||||
method = message.get('method', '')
|
||||
params = message.get('params', {})
|
||||
|
||||
# Capture response received events
|
||||
if method == 'Network.responseReceived':
|
||||
response = params.get('response', {})
|
||||
url = response.get('url', '')
|
||||
|
||||
if self._is_review_api(url):
|
||||
request_id = params.get('requestId')
|
||||
self.request_map[request_id] = {
|
||||
'url': url,
|
||||
'status': response.get('status'),
|
||||
'headers': response.get('headers', {})
|
||||
}
|
||||
|
||||
# Capture response body when loading is finished
|
||||
elif method == 'Network.loadingFinished':
|
||||
request_id = params.get('requestId')
|
||||
if request_id in self.request_map:
|
||||
body = self._get_response_body(request_id)
|
||||
if body:
|
||||
captured.append({
|
||||
'url': self.request_map[request_id]['url'],
|
||||
'body': body,
|
||||
'timestamp': time.time()
|
||||
})
|
||||
|
||||
except Exception as parse_error:
|
||||
log.debug(f"Error parsing log entry: {parse_error}")
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
# Performance logs might not be available
|
||||
log.debug(f"Could not get performance logs: {e}")
|
||||
break
|
||||
|
||||
time.sleep(0.1)
|
||||
|
||||
with self._lock:
|
||||
self.captured_responses.extend(captured)
|
||||
|
||||
return captured
|
||||
|
||||
def get_response_bodies_cdp(self):
|
||||
"""Get response bodies using CDP directly (more reliable method)"""
|
||||
responses = []
|
||||
|
||||
try:
|
||||
# Use CDP to get all responses
|
||||
result = self.driver.execute_cdp_cmd('Network.getAllCookies', {})
|
||||
|
||||
# Execute JavaScript to intercept fetch/XHR responses
|
||||
intercept_script = """
|
||||
(function() {
|
||||
if (window.__interceptedResponses) {
|
||||
var responses = window.__interceptedResponses;
|
||||
window.__interceptedResponses = [];
|
||||
return responses;
|
||||
}
|
||||
return [];
|
||||
})();
|
||||
"""
|
||||
|
||||
captured = self.driver.execute_script(intercept_script)
|
||||
if captured:
|
||||
responses.extend(captured)
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"CDP response capture error: {e}")
|
||||
|
||||
return responses
|
||||
|
||||
def inject_response_interceptor(self):
|
||||
"""
|
||||
Inject JavaScript to intercept XHR/Fetch responses at the browser level.
|
||||
This is the most reliable method for capturing API responses.
|
||||
"""
|
||||
intercept_script = """
|
||||
(function() {
|
||||
// Skip if already injected
|
||||
if (window.__reviewInterceptorInjected) {
|
||||
console.log('[API Interceptor] Already injected, skipping');
|
||||
return;
|
||||
}
|
||||
window.__reviewInterceptorInjected = true;
|
||||
window.__interceptedResponses = [];
|
||||
window.__interceptorStats = {
|
||||
totalFetch: 0,
|
||||
totalXHR: 0,
|
||||
capturedFetch: 0,
|
||||
capturedXHR: 0,
|
||||
lastCapture: null
|
||||
};
|
||||
|
||||
console.log('[API Interceptor] Initializing...');
|
||||
|
||||
// Store original fetch
|
||||
const originalFetch = window.fetch;
|
||||
|
||||
// Override fetch
|
||||
window.fetch = async function(...args) {
|
||||
window.__interceptorStats.totalFetch++;
|
||||
const url = args[0].toString();
|
||||
|
||||
// Log ALL fetch requests for debugging
|
||||
console.debug('[API Interceptor] FETCH:', url.substring(0, 150));
|
||||
|
||||
const response = await originalFetch.apply(this, args);
|
||||
|
||||
// Check if this is a review-related API call
|
||||
if (url.includes('review') || url.includes('batchexecute') ||
|
||||
url.includes('place') || url.includes('maps') ||
|
||||
url.includes('listugcposts') || url.includes('getreviews')) {
|
||||
try {
|
||||
const clone = response.clone();
|
||||
const text = await clone.text();
|
||||
|
||||
console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);
|
||||
|
||||
window.__interceptedResponses.push({
|
||||
url: url,
|
||||
body: text,
|
||||
timestamp: Date.now(),
|
||||
type: 'fetch',
|
||||
size: text.length
|
||||
});
|
||||
|
||||
window.__interceptorStats.capturedFetch++;
|
||||
window.__interceptorStats.lastCapture = new Date().toISOString();
|
||||
|
||||
// Keep only last 100 responses to avoid memory issues
|
||||
if (window.__interceptedResponses.length > 100) {
|
||||
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('[API Interceptor] Response capture error:', e);
|
||||
}
|
||||
}
|
||||
|
||||
return response;
|
||||
};
|
||||
|
||||
// Store original XMLHttpRequest
|
||||
const originalXHR = window.XMLHttpRequest;
|
||||
|
||||
// Create intercepting XHR
|
||||
window.XMLHttpRequest = function() {
|
||||
const xhr = new originalXHR();
|
||||
const originalOpen = xhr.open;
|
||||
const originalSend = xhr.send;
|
||||
let requestUrl = '';
|
||||
|
||||
xhr.open = function(method, url, ...rest) {
|
||||
requestUrl = url;
|
||||
window.__interceptorStats.totalXHR++;
|
||||
console.debug('[API Interceptor] XHR:', method, url.substring(0, 150));
|
||||
return originalOpen.apply(this, [method, url, ...rest]);
|
||||
};
|
||||
|
||||
xhr.addEventListener('load', function() {
|
||||
if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
|
||||
requestUrl.includes('place') || requestUrl.includes('maps') ||
|
||||
requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
|
||||
try {
|
||||
console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);
|
||||
|
||||
window.__interceptedResponses.push({
|
||||
url: requestUrl,
|
||||
body: xhr.responseText,
|
||||
timestamp: Date.now(),
|
||||
type: 'xhr',
|
||||
status: xhr.status,
|
||||
size: xhr.responseText.length
|
||||
});
|
||||
|
||||
window.__interceptorStats.capturedXHR++;
|
||||
window.__interceptorStats.lastCapture = new Date().toISOString();
|
||||
|
||||
if (window.__interceptedResponses.length > 100) {
|
||||
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('[API Interceptor] XHR capture error:', e);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return xhr;
|
||||
};
|
||||
|
||||
// Copy static properties
|
||||
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
|
||||
try {
|
||||
window.XMLHttpRequest[prop] = originalXHR[prop];
|
||||
} catch (e) {}
|
||||
}
|
||||
|
||||
console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');
|
||||
|
||||
// Log stats every 10 seconds
|
||||
setInterval(() => {
|
||||
if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
|
||||
console.log('[API Interceptor] Stats:',
|
||||
'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
|
||||
'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
|
||||
'Queue:', window.__interceptedResponses.length);
|
||||
}
|
||||
}, 10000);
|
||||
|
||||
return true;
|
||||
})();
|
||||
"""
|
||||
|
||||
try:
|
||||
result = self.driver.execute_script(intercept_script)
|
||||
log.info("JavaScript response interceptor injected with enhanced debugging")
|
||||
|
||||
# Get initial stats
|
||||
stats = self.get_interceptor_stats()
|
||||
log.debug(f"Interceptor stats: {stats}")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
log.warning(f"Failed to inject interceptor: {e}")
|
||||
return False
|
||||
|
||||
def get_intercepted_responses(self):
|
||||
"""Retrieve intercepted responses from the browser"""
|
||||
try:
|
||||
script = """
|
||||
if (window.__interceptedResponses) {
|
||||
var responses = window.__interceptedResponses.slice();
|
||||
window.__interceptedResponses = [];
|
||||
return responses;
|
||||
}
|
||||
return [];
|
||||
"""
|
||||
responses = self.driver.execute_script(script)
|
||||
|
||||
if responses:
|
||||
log.debug(f"Retrieved {len(responses)} intercepted responses from browser")
|
||||
for resp in responses[:3]: # Log first 3 for debugging
|
||||
log.debug(f" - {resp.get('type', '?').upper()}: {resp.get('url', '')[:100]} ({resp.get('size', 0)} bytes)")
|
||||
else:
|
||||
log.debug("No intercepted responses available")
|
||||
|
||||
return responses or []
|
||||
except Exception as e:
|
||||
log.debug(f"Error getting intercepted responses: {e}")
|
||||
return []
|
||||
|
||||
def get_interceptor_stats(self):
|
||||
"""Get statistics from the JavaScript interceptor"""
|
||||
try:
|
||||
script = """
|
||||
if (window.__interceptorStats) {
|
||||
return window.__interceptorStats;
|
||||
}
|
||||
return null;
|
||||
"""
|
||||
stats = self.driver.execute_script(script)
|
||||
return stats
|
||||
except Exception as e:
|
||||
log.debug(f"Error getting interceptor stats: {e}")
|
||||
return None
|
||||
|
||||
def get_browser_console_logs(self):
|
||||
"""Get browser console logs (for debugging)"""
|
||||
try:
|
||||
logs = self.driver.get_log('browser')
|
||||
return logs
|
||||
except Exception as e:
|
||||
log.debug(f"Could not get browser console logs: {e}")
|
||||
return []
|
||||
|
||||
def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
|
||||
"""
|
||||
Dump captured responses to files for debugging.
|
||||
Creates one file per response with metadata and body.
|
||||
"""
|
||||
try:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(exist_ok=True)
|
||||
|
||||
for i, response in enumerate(responses):
|
||||
timestamp = response.get('timestamp', int(time.time() * 1000))
|
||||
url = response.get('url', 'unknown')
|
||||
req_type = response.get('type', 'unknown')
|
||||
|
||||
# Create filename from timestamp and type
|
||||
filename = f"{timestamp}_{req_type}_{i}.json"
|
||||
filepath = output_path / filename
|
||||
|
||||
# Write response with metadata
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
json.dump({
|
||||
'metadata': {
|
||||
'url': url,
|
||||
'type': req_type,
|
||||
'timestamp': timestamp,
|
||||
'size': response.get('size', len(response.get('body', ''))),
|
||||
'status': response.get('status')
|
||||
},
|
||||
'body': response.get('body', '')
|
||||
}, f, indent=2, ensure_ascii=False)
|
||||
|
||||
log.info(f"Dumped {len(responses)} responses to {output_path}")
|
||||
return str(output_path)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error dumping responses to file: {e}")
|
||||
return None
|
||||
|
||||
def _is_review_api(self, url: str) -> bool:
|
||||
"""Check if URL matches review API patterns"""
|
||||
url_lower = url.lower()
|
||||
return any(re.search(pattern, url_lower) for pattern in self.REVIEW_API_PATTERNS)
|
||||
|
||||
def _get_response_body(self, request_id: str) -> Optional[str]:
|
||||
"""Get response body for a request ID using CDP"""
|
||||
try:
|
||||
result = self.driver.execute_cdp_cmd('Network.getResponseBody', {
|
||||
'requestId': request_id
|
||||
})
|
||||
|
||||
body = result.get('body', '')
|
||||
if result.get('base64Encoded'):
|
||||
body = base64.b64decode(body).decode('utf-8', errors='ignore')
|
||||
|
||||
return body
|
||||
except Exception as e:
|
||||
log.debug(f"Could not get response body for {request_id}: {e}")
|
||||
return None
|
||||
|
||||
def parse_reviews_from_responses(self, responses: List[Dict]) -> List[InterceptedReview]:
|
||||
"""
|
||||
Parse review data from captured API responses.
|
||||
Google's API responses use a custom nested array format.
|
||||
"""
|
||||
reviews = []
|
||||
|
||||
for response in responses:
|
||||
try:
|
||||
body = response.get('body', '')
|
||||
url = response.get('url', '')
|
||||
|
||||
# Skip non-JSON responses
|
||||
if not body or body.startswith('<!DOCTYPE'):
|
||||
continue
|
||||
|
||||
# Try to parse as JSON
|
||||
parsed_reviews = self._parse_response_body(body, url)
|
||||
reviews.extend(parsed_reviews)
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing response: {e}")
|
||||
continue
|
||||
|
||||
# Deduplicate by review ID
|
||||
seen_ids = set()
|
||||
unique_reviews = []
|
||||
for review in reviews:
|
||||
if review.review_id and review.review_id not in seen_ids:
|
||||
seen_ids.add(review.review_id)
|
||||
unique_reviews.append(review)
|
||||
|
||||
return unique_reviews
|
||||
|
||||
def _parse_response_body(self, body: str, url: str) -> List[InterceptedReview]:
|
||||
"""Parse a single response body for review data"""
|
||||
reviews = []
|
||||
|
||||
# Skip empty or HTML responses
|
||||
if not body or body.startswith('<!DOCTYPE') or body.startswith('<html'):
|
||||
return reviews
|
||||
|
||||
# Handle batch execute format (starts with )]}' prefix)
|
||||
if body.startswith(")]}'"):
|
||||
body = body[4:].strip()
|
||||
|
||||
try:
|
||||
data = json.loads(body)
|
||||
except json.JSONDecodeError:
|
||||
# Try to extract JSON from the response
|
||||
json_match = re.search(r'\[.*\]', body, re.DOTALL)
|
||||
if json_match:
|
||||
try:
|
||||
data = json.loads(json_match.group())
|
||||
except:
|
||||
log.debug(f"Failed to parse JSON from response")
|
||||
return reviews
|
||||
else:
|
||||
log.debug(f"No JSON found in response")
|
||||
return reviews
|
||||
|
||||
# Special handling for listugcposts endpoint
|
||||
if 'listugcposts' in url.lower():
|
||||
reviews.extend(self._parse_listugcposts_response(data))
|
||||
else:
|
||||
# Generic recursive extraction
|
||||
reviews.extend(self._extract_reviews_recursive(data))
|
||||
|
||||
return reviews
|
||||
|
||||
def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
|
||||
"""
|
||||
Parse Google Maps listugcposts API response.
|
||||
|
||||
Structure discovered:
|
||||
data[2] = array of review groups
|
||||
data[2][i] = single review group [review_data, metadata, continuation_token]
|
||||
data[2][i][0] = review data (6-item array containing all review info)
|
||||
"""
|
||||
reviews = []
|
||||
|
||||
try:
|
||||
if not isinstance(data, list) or len(data) < 3:
|
||||
log.debug("Response doesn't match expected structure (not a list or too short)")
|
||||
return reviews
|
||||
|
||||
# data[2] contains the review groups
|
||||
review_groups = data[2]
|
||||
if not isinstance(review_groups, list):
|
||||
log.debug("data[2] is not a list")
|
||||
return reviews
|
||||
|
||||
log.debug(f"Found {len(review_groups)} reviews in data[2]")
|
||||
|
||||
# Each group IS ONE REVIEW
|
||||
for group_idx, group in enumerate(review_groups):
|
||||
if not isinstance(group, list) or len(group) == 0:
|
||||
continue
|
||||
|
||||
# group[0] is the review data array (6 items)
|
||||
review_data = group[0]
|
||||
if not isinstance(review_data, list):
|
||||
continue
|
||||
|
||||
try:
|
||||
review = self._parse_google_review_array(review_data)
|
||||
if review:
|
||||
reviews.append(review)
|
||||
log.debug(f"Parsed review {group_idx}: {review.author} - {review.rating}★")
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing review at group[{group_idx}]: {e}")
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error in _parse_listugcposts_response: {e}")
|
||||
|
||||
return reviews
|
||||
|
||||
def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
|
||||
"""
|
||||
Parse a single review from Google's 6-item array format.
|
||||
|
||||
Discovered structure (review_data is a 6-item array):
|
||||
review_data[0] = Review ID (string)
|
||||
review_data[1][4][5][0] = Author Name
|
||||
review_data[1][4][5][3] = User ID
|
||||
review_data[1][6] = Date Text
|
||||
review_data[2][0][0] = Rating (1-5)
|
||||
review_data[2][15][0][0] = Review Text (original)
|
||||
review_data[2][15][1][0] = Review Text (translated)
|
||||
"""
|
||||
review = InterceptedReview()
|
||||
|
||||
try:
|
||||
# Extract review ID from review_data[0]
|
||||
if len(review_data) > 0 and isinstance(review_data[0], str):
|
||||
review.review_id = review_data[0]
|
||||
|
||||
# Extract author info from review_data[1][4][5]
|
||||
if (len(review_data) > 1 and
|
||||
isinstance(review_data[1], list) and
|
||||
len(review_data[1]) > 4 and
|
||||
isinstance(review_data[1][4], list) and
|
||||
len(review_data[1][4]) > 5 and
|
||||
isinstance(review_data[1][4][5], list)):
|
||||
|
||||
author_info = review_data[1][4][5]
|
||||
|
||||
# Author name at [1][4][5][0]
|
||||
if len(author_info) > 0 and isinstance(author_info[0], str):
|
||||
review.author = author_info[0]
|
||||
|
||||
# Profile picture at [1][4][5][1] (if available)
|
||||
if len(author_info) > 1 and isinstance(author_info[1], str):
|
||||
review.avatar_url = author_info[1]
|
||||
|
||||
# Extract date from review_data[1][6]
|
||||
if (len(review_data) > 1 and
|
||||
isinstance(review_data[1], list) and
|
||||
len(review_data[1]) > 6 and
|
||||
isinstance(review_data[1][6], str)):
|
||||
review.date_text = review_data[1][6]
|
||||
|
||||
# Extract rating from review_data[2][0][0]
|
||||
if (len(review_data) > 2 and
|
||||
isinstance(review_data[2], list) and
|
||||
len(review_data[2]) > 0 and
|
||||
isinstance(review_data[2][0], list) and
|
||||
len(review_data[2][0]) > 0):
|
||||
rating_val = review_data[2][0][0]
|
||||
if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
|
||||
review.rating = float(rating_val)
|
||||
|
||||
# Extract review text from review_data[2][15][0][0]
|
||||
if (len(review_data) > 2 and
|
||||
isinstance(review_data[2], list) and
|
||||
len(review_data[2]) > 15 and
|
||||
isinstance(review_data[2][15], list) and
|
||||
len(review_data[2][15]) > 0 and
|
||||
isinstance(review_data[2][15][0], list) and
|
||||
len(review_data[2][15][0]) > 0):
|
||||
text = review_data[2][15][0][0]
|
||||
if isinstance(text, str):
|
||||
review.text = text
|
||||
|
||||
# Only return if we have minimum required data
|
||||
if review.rating > 0 and (review.author or review.text):
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing Google review array: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
|
||||
"""
|
||||
Parse review from Google's nested array format.
|
||||
Improved version with better field detection.
|
||||
"""
|
||||
review = InterceptedReview()
|
||||
|
||||
try:
|
||||
# Extract review ID (usually a long string in first few elements)
|
||||
for i, item in enumerate(arr[:5]):
|
||||
if isinstance(item, str) and len(item) > 30 and not item.startswith('http'):
|
||||
review.review_id = item
|
||||
break
|
||||
|
||||
# Extract rating (number between 1-5)
|
||||
for item in arr:
|
||||
if isinstance(item, (int, float)) and 1 <= item <= 5:
|
||||
review.rating = float(item)
|
||||
break
|
||||
elif isinstance(item, list):
|
||||
for subitem in item:
|
||||
if isinstance(subitem, (int, float)) and 1 <= subitem <= 5:
|
||||
review.rating = float(subitem)
|
||||
break
|
||||
if review.rating > 0:
|
||||
break
|
||||
|
||||
# Extract review text (long string, not a URL)
|
||||
for item in arr:
|
||||
if isinstance(item, str) and len(item) > 50 and not item.startswith('http'):
|
||||
if not review.review_id or item != review.review_id:
|
||||
review.text = item
|
||||
break
|
||||
|
||||
# Extract author name (shorter string, not ID or text)
|
||||
for item in arr:
|
||||
if isinstance(item, str) and 3 <= len(item) <= 100:
|
||||
if item != review.review_id and item != review.text and not item.startswith('http'):
|
||||
review.author = item
|
||||
break
|
||||
elif isinstance(item, list):
|
||||
for subitem in item:
|
||||
if isinstance(subitem, str) and 3 <= len(subitem) <= 100:
|
||||
if subitem != review.text and not subitem.startswith('http'):
|
||||
review.author = subitem
|
||||
break
|
||||
if review.author:
|
||||
break
|
||||
|
||||
# Extract dates (strings that look like dates)
|
||||
date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
|
||||
for item in arr:
|
||||
if isinstance(item, str):
|
||||
for pattern in date_patterns:
|
||||
if re.search(pattern, item, re.IGNORECASE):
|
||||
review.date_text = item
|
||||
break
|
||||
if review.date_text:
|
||||
break
|
||||
|
||||
# Only return if we have meaningful data
|
||||
if (review.review_id or review.author) and review.rating > 0:
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error in _parse_review_array_v2: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
|
||||
"""Recursively search for review data in nested structures"""
|
||||
reviews = []
|
||||
|
||||
if depth > 20: # Prevent infinite recursion
|
||||
return reviews
|
||||
|
||||
# Skip if data is already an InterceptedReview object
|
||||
if isinstance(data, InterceptedReview):
|
||||
return [data]
|
||||
|
||||
if isinstance(data, dict):
|
||||
# Check if this looks like a review object
|
||||
review = self._try_parse_review_dict(data)
|
||||
if review:
|
||||
reviews.append(review)
|
||||
|
||||
# Recurse into dict values
|
||||
for value in data.values():
|
||||
if not isinstance(value, InterceptedReview):
|
||||
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
|
||||
|
||||
elif isinstance(data, list):
|
||||
# Check if this array looks like a review array
|
||||
review = self._try_parse_review_array(data)
|
||||
if review:
|
||||
reviews.append(review)
|
||||
|
||||
# Recurse into list items
|
||||
for item in data:
|
||||
if not isinstance(item, InterceptedReview):
|
||||
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
|
||||
|
||||
return reviews
|
||||
|
||||
def _try_parse_review_dict(self, data: Dict) -> Optional[InterceptedReview]:
|
||||
"""Try to parse a dictionary as a review object"""
|
||||
# Common keys in review objects
|
||||
review_keys = {'reviewId', 'review_id', 'author', 'rating', 'text', 'comment'}
|
||||
|
||||
if not any(k in data for k in review_keys):
|
||||
return None
|
||||
|
||||
try:
|
||||
review = InterceptedReview()
|
||||
|
||||
# Try various key names for each field
|
||||
review.review_id = data.get('reviewId') or data.get('review_id') or data.get('id', '')
|
||||
review.author = data.get('author') or data.get('authorName') or data.get('name', '')
|
||||
review.rating = float(data.get('rating') or data.get('starRating') or 0)
|
||||
review.text = data.get('text') or data.get('comment') or data.get('reviewText', '')
|
||||
review.date_text = data.get('publishTime') or data.get('relativePublishTime') or data.get('date', '')
|
||||
review.likes = int(data.get('thumbsUpCount') or data.get('likes') or 0)
|
||||
|
||||
# Photos
|
||||
photos = data.get('photos') or data.get('reviewPhotos') or []
|
||||
if photos:
|
||||
review.photos = [p.get('url') or p for p in photos if p]
|
||||
|
||||
# Profile
|
||||
author_data = data.get('author') if isinstance(data.get('author'), dict) else {}
|
||||
review.profile_url = author_data.get('profileUrl') or data.get('profileUrl', '')
|
||||
review.avatar_url = author_data.get('profilePhotoUrl') or data.get('avatar', '')
|
||||
|
||||
# Owner response
|
||||
owner_resp = data.get('ownerResponse') or data.get('ownerReply') or {}
|
||||
if isinstance(owner_resp, dict):
|
||||
review.owner_response = owner_resp.get('text', '')
|
||||
review.owner_response_date = owner_resp.get('publishTime', '')
|
||||
|
||||
# Only return if we have meaningful data
|
||||
if review.review_id or (review.author and review.text):
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing review dict: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _try_parse_review_array(self, data: List) -> Optional[InterceptedReview]:
|
||||
"""
|
||||
Try to parse a nested array as a review (Google's protobuf-like format).
|
||||
Google often uses positional arrays like: [id, author, [rating], text, ...]
|
||||
"""
|
||||
if not data or len(data) < 3:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Look for patterns that indicate this is a review array
|
||||
# Pattern 1: [review_id, [author_info], rating_array, text, ...]
|
||||
|
||||
review = InterceptedReview()
|
||||
|
||||
# Check if first element looks like a review ID
|
||||
if isinstance(data[0], str) and len(data[0]) > 20:
|
||||
review.review_id = data[0]
|
||||
|
||||
# Search for rating (usually a small number 1-5)
|
||||
for item in data:
|
||||
if isinstance(item, (int, float)) and 1 <= item <= 5:
|
||||
review.rating = float(item)
|
||||
break
|
||||
elif isinstance(item, list) and len(item) >= 1:
|
||||
if isinstance(item[0], (int, float)) and 1 <= item[0] <= 5:
|
||||
review.rating = float(item[0])
|
||||
break
|
||||
|
||||
# Search for text (long string)
|
||||
for item in data:
|
||||
if isinstance(item, str) and len(item) > 30:
|
||||
review.text = item
|
||||
break
|
||||
elif isinstance(item, list):
|
||||
for subitem in item:
|
||||
if isinstance(subitem, str) and len(subitem) > 30:
|
||||
review.text = subitem
|
||||
break
|
||||
|
||||
# Search for author name (shorter string)
|
||||
for item in data:
|
||||
if isinstance(item, list) and len(item) >= 1:
|
||||
for subitem in item:
|
||||
if isinstance(subitem, str) and 2 <= len(subitem) <= 100 and subitem != review.text:
|
||||
review.author = subitem
|
||||
break
|
||||
if review.author:
|
||||
break
|
||||
|
||||
# Search for URLs (photos, profile)
|
||||
for item in data:
|
||||
if isinstance(item, str) and item.startswith('http'):
|
||||
if 'googleusercontent' in item or 'ggpht' in item:
|
||||
if not review.avatar_url:
|
||||
review.avatar_url = item
|
||||
else:
|
||||
review.photos.append(item)
|
||||
elif isinstance(item, list):
|
||||
self._extract_urls_from_array(item, review)
|
||||
|
||||
# Only return if we have meaningful data
|
||||
if review.review_id and review.rating > 0:
|
||||
return review
|
||||
if review.text and review.rating > 0:
|
||||
return review
|
||||
|
||||
except Exception as e:
|
||||
log.debug(f"Error parsing review array: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _extract_urls_from_array(self, arr: List, review: InterceptedReview, depth: int = 0):
|
||||
"""Extract URLs from nested arrays"""
|
||||
if depth > 5:
|
||||
return
|
||||
|
||||
for item in arr:
|
||||
if isinstance(item, str) and item.startswith('http'):
|
||||
if 'googleusercontent' in item or 'ggpht' in item or 'lh3' in item:
|
||||
if 'w72-h72' in item or 'p-rp-mo' in item: # Profile pic pattern
|
||||
review.avatar_url = item
|
||||
else:
|
||||
review.photos.append(item)
|
||||
elif isinstance(item, list):
|
||||
self._extract_urls_from_array(item, depth + 1, review)
|
||||
|
||||
def convert_to_raw_review_format(self, intercepted: InterceptedReview) -> Dict[str, Any]:
    """Map an InterceptedReview onto the dict layout used by RawReview/storage.

    The review text and the owner response are each wrapped under an
    ``'en'`` key; when the corresponding field is empty, an empty dict is
    emitted instead. The result is tagged with ``'_source': 'api_intercept'``.
    """
    # Text-bearing fields become {} when there is nothing to store.
    description: Dict[str, Any] = {}
    if intercepted.text:
        description = {'en': intercepted.text}

    owner_responses: Dict[str, Any] = {}
    if intercepted.owner_response:
        owner_responses = {'en': {'text': intercepted.owner_response}}

    raw: Dict[str, Any] = {}
    raw['review_id'] = intercepted.review_id
    raw['author'] = intercepted.author
    raw['rating'] = intercepted.rating
    raw['description'] = description
    raw['likes'] = intercepted.likes
    raw['user_images'] = intercepted.photos
    raw['author_profile_url'] = intercepted.profile_url
    raw['profile_picture'] = intercepted.avatar_url
    raw['owner_responses'] = owner_responses
    raw['review_date'] = intercepted.date_text
    raw['_source'] = 'api_intercept'
    return raw
|
||||
|
||||
def cleanup(self):
    """Clean up interception resources.

    Sends the CDP ``Network.disable`` command through ``self.driver`` on a
    best-effort basis, then clears the captured responses, captured
    reviews and the request map, and marks the interceptor as no longer
    listening. A failing driver call never prevents the in-memory state
    from being reset.
    """
    try:
        self.driver.execute_cdp_cmd('Network.disable', {})
    except Exception as e:
        # Best effort: the driver call may fail (e.g. the browser is already
        # gone). Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate, and keep a trace
        # of why it failed.
        log.debug(f"Could not disable network interception during cleanup: {e}")

    self.captured_responses.clear()
    self.captured_reviews.clear()
    self.request_map.clear()
    self._listening = False
|
||||
@@ -35,16 +35,45 @@ class ChromeWorker:
|
||||
|
||||
# SeleniumBase Driver automatically includes UC mode anti-detection
|
||||
# Initialize with longer timeouts for large scraping jobs
|
||||
# Chrome arguments for Docker stability
|
||||
chrome_args = [
|
||||
"--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker)
|
||||
"--disable-gpu", # Disable GPU acceleration
|
||||
"--no-sandbox", # Required for Docker
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-extensions",
|
||||
"--disable-background-networking",
|
||||
"--disable-default-apps",
|
||||
"--disable-sync",
|
||||
"--metrics-recording-only",
|
||||
"--mute-audio",
|
||||
"--no-first-run",
|
||||
"--safebrowsing-disable-auto-update",
|
||||
]
|
||||
|
||||
self.driver = Driver(
|
||||
uc=True,
|
||||
headless=self.headless,
|
||||
page_load_strategy="normal"
|
||||
page_load_strategy="normal",
|
||||
chromium_arg=",".join(chrome_args)
|
||||
)
|
||||
|
||||
# Set generous timeouts for large scraping jobs
|
||||
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
|
||||
self.driver.set_script_timeout(60) # 1 minute for complex extraction
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
|
||||
# This prevents location-based variations in search results
|
||||
try:
|
||||
self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
|
||||
except Exception as e:
|
||||
log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")
|
||||
|
||||
self.driver.maximize_window()
|
||||
self.created_at = time.time()
|
||||
self.last_used = time.time()
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
"""
|
||||
Command line interface handling for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from modules.config import DEFAULT_CONFIG_PATH
|
||||
|
||||
|
||||
def parse_arguments():
    """Parse command line arguments.

    Returns:
        argparse.Namespace with the parsed options. ``config`` is always a
        path (the given one wrapped in ``Path``, or ``DEFAULT_CONFIG_PATH``).
        ``custom_params`` is the decoded JSON value, or ``None`` when the
        option was given but is not valid JSON. Options that were not
        supplied keep their ``None`` default (or ``False`` for the
        ``store_true`` flags).
    """

    def parse_bool(value):
        """Convert a command line string such as 'true' or '0' to a bool.

        ``type=bool`` must not be used with argparse: it calls
        ``bool("false")``, which is True, so every non-empty value would
        enable the option.
        """
        normalized = value.strip().lower()
        if normalized in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string stays False, as it was under ``bool("")``.
        if normalized in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value (true/false), got {value!r}"
        )

    ap = argparse.ArgumentParser(description="Google‑Maps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first already‑seen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    ap.add_argument("--use-mongodb", type=parse_bool, default=None,
                    help="whether to use MongoDB for storage")

    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=parse_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=parse_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")

    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=parse_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=parse_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=parse_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")

    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")

    # API interception option
    ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
                    help="enable API response interception for faster data capture (experimental)")

    args = ap.parse_args()

    # Handle config path
    if args.config is not None:
        args.config = Path(args.config)
    else:
        args.config = DEFAULT_CONFIG_PATH

    # Process custom params if provided
    if args.custom_params:
        try:
            args.custom_params = json.loads(args.custom_params)
        except json.JSONDecodeError:
            # Invalid JSON is not fatal: warn and continue without custom params.
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            args.custom_params = None

    return args
|
||||
@@ -77,11 +77,17 @@ class DatabaseManager:
|
||||
|
||||
error_message TEXT,
|
||||
metadata JSONB,
|
||||
scrape_logs JSONB,
|
||||
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
||||
);
|
||||
""")
|
||||
|
||||
# Add scrape_logs column if it doesn't exist (for existing databases)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||
@@ -182,10 +188,12 @@ class DatabaseManager:
|
||||
started_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
reviews_data,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
metadata,
|
||||
scrape_logs
|
||||
FROM jobs
|
||||
WHERE job_id = $1
|
||||
""", job_id)
|
||||
@@ -246,8 +254,13 @@ class DatabaseManager:
|
||||
kwargs['completed_at'] = datetime.now()
|
||||
|
||||
for key, value in kwargs.items():
|
||||
set_clauses.append(f"{key} = ${param_idx}")
|
||||
params.append(value)
|
||||
# Handle JSONB fields specially
|
||||
if key == 'scrape_logs' and value is not None:
|
||||
set_clauses.append(f"{key} = ${param_idx}::jsonb")
|
||||
params.append(json.dumps(value) if not isinstance(value, str) else value)
|
||||
else:
|
||||
set_clauses.append(f"{key} = ${param_idx}")
|
||||
params.append(value)
|
||||
param_idx += 1
|
||||
|
||||
query = f"""
|
||||
@@ -264,7 +277,8 @@ class DatabaseManager:
|
||||
job_id: UUID,
|
||||
reviews: List[Dict[str, Any]],
|
||||
scrape_time: float,
|
||||
total_reviews: Optional[int] = None
|
||||
total_reviews: Optional[int] = None,
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
||||
):
|
||||
"""
|
||||
Save scraping results to database.
|
||||
@@ -274,6 +288,7 @@ class DatabaseManager:
|
||||
reviews: List of review dictionaries
|
||||
scrape_time: Time taken to scrape in seconds
|
||||
total_reviews: Total reviews available (from page counter)
|
||||
scrape_logs: List of log entries from the scraper
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
@@ -284,9 +299,11 @@ class DatabaseManager:
|
||||
reviews_count = $2,
|
||||
total_reviews = $3,
|
||||
reviews_data = $4::jsonb,
|
||||
scrape_time = $5
|
||||
scrape_time = $5,
|
||||
scrape_logs = $6::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
|
||||
json.dumps(scrape_logs) if scrape_logs else None)
|
||||
|
||||
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
@@ -317,8 +334,10 @@ class DatabaseManager:
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message
|
||||
error_message,
|
||||
metadata
|
||||
FROM jobs
|
||||
WHERE status = $1
|
||||
ORDER BY created_at DESC
|
||||
@@ -333,8 +352,10 @@ class DatabaseManager:
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message
|
||||
error_message,
|
||||
metadata
|
||||
FROM jobs
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
|
||||
@@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
else:
|
||||
log.info(f"[PROFILE] Using pooled driver (0.00s)")
|
||||
|
||||
# Force English locale for consistent parsing
|
||||
# Force English locale AND US region for consistent parsing/results
|
||||
# This helps avoid geolocation-based variations in Google Maps results
|
||||
if 'hl=' in url:
|
||||
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
|
||||
else:
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
|
||||
# Add US region parameter if not present
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) using CDP
|
||||
# This ensures Google Maps shows US results regardless of server location
|
||||
try:
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log.info("Set geolocation to US (Boston, MA)")
|
||||
except Exception as e:
|
||||
log.warning(f"Could not set geolocation: {e}")
|
||||
|
||||
log.info(f"Loading Google Maps page...")
|
||||
t0 = timing_module.time()
|
||||
driver.get(url)
|
||||
@@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
||||
for btn in form_btns:
|
||||
btn_text = (btn.text or '').lower()
|
||||
if 'aceptar todo' in btn_text or 'accept all' in btn_text:
|
||||
if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text:
|
||||
log.info(f"Clicking GDPR consent: {btn.text}")
|
||||
btn.click()
|
||||
time.sleep(1) # Reduced from 2s
|
||||
time.sleep(1)
|
||||
break
|
||||
else:
|
||||
if len(form_btns) >= 2:
|
||||
log.info("Using fallback: clicking second form button")
|
||||
form_btns[1].click()
|
||||
time.sleep(1) # Reduced from 2s
|
||||
time.sleep(1)
|
||||
except Exception as e:
|
||||
log.warning(f"GDPR consent handling failed: {e}")
|
||||
|
||||
# After GDPR consent, reload the original URL to ensure proper page state
|
||||
log.info(f"Reloading original URL after GDPR consent...")
|
||||
driver.get(url)
|
||||
time.sleep(1)
|
||||
log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
|
||||
else:
|
||||
log.info(f"[PROFILE] No GDPR consent page (0.00s)")
|
||||
@@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
try:
|
||||
log.info("Waiting for Google Maps content to load...")
|
||||
wait = WebDriverWait(driver, 10)
|
||||
# Wait for basic page structure (h1 or heading)
|
||||
wait.until(
|
||||
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
|
||||
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]')
|
||||
)
|
||||
log.info("Google Maps content loaded successfully")
|
||||
log.info("Basic page structure loaded")
|
||||
|
||||
# Wait for page to settle - search URLs redirect to place URLs
|
||||
# which triggers additional content loading
|
||||
time.sleep(2)
|
||||
|
||||
# Wait specifically for review count element (aria-label ending with "reviews")
|
||||
# This is the most reliable indicator that the business detail is loaded
|
||||
try:
|
||||
WebDriverWait(driver, 5).until(
|
||||
lambda d: d.execute_script("""
|
||||
var elems = document.querySelectorAll('[aria-label]');
|
||||
for (var i = 0; i < elems.length; i++) {
|
||||
var label = elems[i].getAttribute('aria-label') || '';
|
||||
if (/^[0-9]+ reviews?$/.test(label)) return true;
|
||||
}
|
||||
return false;
|
||||
""")
|
||||
)
|
||||
log.info("Review count element loaded")
|
||||
except:
|
||||
# Fallback: Try clicking Reviews tab or rating stars to expose the review count
|
||||
log.info("Review count wait timeout, trying to click Reviews/rating...")
|
||||
try:
|
||||
# Try 1: Click Reviews tab (if exists)
|
||||
clicked = driver.execute_script("""
|
||||
var tabs = document.querySelectorAll('[role="tab"]');
|
||||
for (var i = 0; i < tabs.length; i++) {
|
||||
var txt = (tabs[i].textContent || '').toLowerCase();
|
||||
if (txt.includes('review')) {
|
||||
tabs[i].click();
|
||||
return 'tab';
|
||||
}
|
||||
}
|
||||
// Try 2: Click the rating stars element (often links to reviews)
|
||||
var stars = document.querySelector('[role="img"][aria-label*="star"]');
|
||||
if (stars) {
|
||||
var parent = stars.parentElement;
|
||||
if (parent && parent.tagName.toLowerCase() === 'button') {
|
||||
parent.click();
|
||||
return 'stars_button';
|
||||
}
|
||||
stars.click();
|
||||
return 'stars';
|
||||
}
|
||||
// Try 3: Click "Write a review" or any review-related button
|
||||
var btns = document.querySelectorAll('button[aria-label*="review" i]');
|
||||
for (var b = 0; b < btns.length; b++) {
|
||||
var label = btns[b].getAttribute('aria-label') || '';
|
||||
if (!/write/i.test(label) && /review/i.test(label)) {
|
||||
btns[b].click();
|
||||
return 'review_btn: ' + label;
|
||||
}
|
||||
}
|
||||
return 'none';
|
||||
""")
|
||||
log.info(f"Clicked: {clicked}")
|
||||
time.sleep(2) # Wait for reviews panel to load
|
||||
except Exception as e:
|
||||
log.warning(f"Click attempt failed: {e}")
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"Timeout waiting for Maps content: {e}")
|
||||
time.sleep(0.5) # Minimal fallback wait
|
||||
time.sleep(2) # Fallback wait
|
||||
log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")
|
||||
log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...")
|
||||
log.info(f"DEBUG: Page title: {driver.title}")
|
||||
|
||||
# Extract business card information using JavaScript
|
||||
t0 = timing_module.time()
|
||||
@@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
total_reviews: null
|
||||
};
|
||||
|
||||
// Extract business name
|
||||
const nameSelectors = [
|
||||
'h1.DUwDvf',
|
||||
'[role="main"] h1',
|
||||
'h1.fontHeadlineLarge'
|
||||
];
|
||||
// ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============
|
||||
|
||||
for (const selector of nameSelectors) {
|
||||
const elem = document.querySelector(selector);
|
||||
if (elem && elem.textContent) {
|
||||
info.name = elem.textContent.trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Helper: Parse review count from text, handling multiple formats
|
||||
function parseReviewCount(text) {
|
||||
if (!text) return null;
|
||||
|
||||
// Extract address
|
||||
const addressSelectors = [
|
||||
'button[data-item-id*="address"]',
|
||||
'[data-item-id*="address"]',
|
||||
'div[aria-label*="Address"]'
|
||||
];
|
||||
|
||||
for (const selector of addressSelectors) {
|
||||
const elem = document.querySelector(selector);
|
||||
if (elem && elem.textContent) {
|
||||
info.address = elem.textContent.trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract rating (look for aria-label like "4.2 stars")
|
||||
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
|
||||
if (ratingElem) {
|
||||
const ariaLabel = ratingElem.getAttribute('aria-label');
|
||||
const match = ariaLabel.match(/([0-9.]+)/);
|
||||
// Pattern 1: Exact "N reviews" format (aria-labels, clean text)
|
||||
// Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis"
|
||||
var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i);
|
||||
if (match) {
|
||||
info.rating = parseFloat(match[1]);
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
}
|
||||
|
||||
// Extract total review count
|
||||
const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
|
||||
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
|
||||
// Pattern 2: "(N)" format often used in tabs like "Reviews (27)"
|
||||
match = text.match(/[(]([0-9][0-9,.]*)[)]$/);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
|
||||
// PRIORITY 1: Look for review count in search results sidebar/panel
|
||||
// This is where "152 reviews" appears on search results
|
||||
const searchPanelSelectors = [
|
||||
'a[href*="reviews"]', // Link with "reviews" in href
|
||||
'button[jsaction*="reviews"]', // Button related to reviews
|
||||
'div[role="link"]', // Clickable divs that might contain review info
|
||||
];
|
||||
|
||||
for (const selector of searchPanelSelectors) {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
for (let elem of elements) {
|
||||
const text = elem.textContent || '';
|
||||
const match = text.match(numberPattern);
|
||||
// Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives)
|
||||
if (text.length < 30) {
|
||||
match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i);
|
||||
if (match) {
|
||||
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
if (num > 0 && num < 1000000) {
|
||||
info.total_reviews = num;
|
||||
break;
|
||||
}
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
}
|
||||
if (info.total_reviews) break;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// PRIORITY 2: Look in any span/div that contains the word "review"
|
||||
// ============ EXTRACT BUSINESS NAME ============
|
||||
// Priority: h1 (semantic), then role="heading"
|
||||
const h1 = document.querySelector('h1');
|
||||
if (h1 && h1.textContent) {
|
||||
info.name = h1.textContent.trim();
|
||||
}
|
||||
if (!info.name) {
|
||||
const heading = document.querySelector('[role="heading"][aria-level="1"]');
|
||||
if (heading && heading.textContent) {
|
||||
info.name = heading.textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT ADDRESS ============
|
||||
// Priority: data-item-id (semantic), then aria-label containing "address"
|
||||
const addressElem = document.querySelector('[data-item-id*="address"]');
|
||||
if (addressElem && addressElem.textContent) {
|
||||
info.address = addressElem.textContent.trim();
|
||||
}
|
||||
if (!info.address) {
|
||||
const ariaAddress = document.querySelector('[aria-label*="ddress"]');
|
||||
if (ariaAddress && ariaAddress.textContent) {
|
||||
info.address = ariaAddress.textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT RATING ============
|
||||
// Priority: aria-label containing "star" on role="img" elements
|
||||
info._debug_rating_context = [];
|
||||
const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]');
|
||||
for (let elem of ratingElems) {
|
||||
const ariaLabel = elem.getAttribute('aria-label') || '';
|
||||
// Match "4.9 stars" or "4,9 stars" (European format)
|
||||
const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i);
|
||||
if (match) {
|
||||
info.rating = parseFloat(match[1].replace(',', '.'));
|
||||
// DEBUG: Capture parent/sibling context to find review count
|
||||
var parent = elem.parentElement;
|
||||
if (parent) {
|
||||
info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100));
|
||||
var grandparent = parent.parentElement;
|
||||
if (grandparent) {
|
||||
info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100));
|
||||
// Check all children of grandparent for review count
|
||||
var gpChildren = grandparent.querySelectorAll('*');
|
||||
for (var c = 0; c < Math.min(gpChildren.length, 30); c++) {
|
||||
var childText = (gpChildren[c].textContent || '').trim();
|
||||
if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) {
|
||||
info._debug_rating_context.push('GP_CHILD: ' + childText);
|
||||
}
|
||||
}
|
||||
// Also check great-grandparent
|
||||
var ggp = grandparent.parentElement;
|
||||
if (ggp) {
|
||||
info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150));
|
||||
}
|
||||
}
|
||||
// Check siblings
|
||||
var nextSib = parent.nextElementSibling;
|
||||
if (nextSib) {
|
||||
info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100));
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============
|
||||
|
||||
// PRIORITY 1: aria-label with exact "N reviews" format (most reliable)
|
||||
// Google Maps uses aria-label="27 reviews" for accessibility
|
||||
info._debug_aria = [];
|
||||
info._debug_all_numeric = [];
|
||||
if (!info.total_reviews) {
|
||||
const allElements = document.querySelectorAll('span, div, a');
|
||||
for (let elem of allElements) {
|
||||
const text = elem.textContent || '';
|
||||
if (text.length < 100) { // Skip very long text blocks
|
||||
const match = text.match(numberPattern);
|
||||
var ariaElems = document.querySelectorAll('[aria-label]');
|
||||
for (var i = 0; i < ariaElems.length; i++) {
|
||||
var ariaLabel = ariaElems[i].getAttribute('aria-label') || '';
|
||||
// Collect all labels containing "review"
|
||||
if (ariaLabel.toLowerCase().indexOf('review') >= 0) {
|
||||
info._debug_aria.push(ariaLabel);
|
||||
}
|
||||
// Collect all labels starting with a digit
|
||||
if (/^[0-9]/.test(ariaLabel)) {
|
||||
info._debug_all_numeric.push(ariaLabel);
|
||||
}
|
||||
var count = parseReviewCount(ariaLabel);
|
||||
if (count && count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = ariaLabel;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DEBUG: Find all text with parenthetical numbers like "(27)"
|
||||
info._debug_parens = [];
|
||||
info._debug_short_text = []; // All short text with numbers
|
||||
var allSpans = document.querySelectorAll('span, div, a, button');
|
||||
for (var j = 0; j < Math.min(allSpans.length, 500); j++) {
|
||||
var spanText = allSpans[j].textContent || '';
|
||||
// Capture parenthetical numbers
|
||||
if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) {
|
||||
info._debug_parens.push(spanText.trim());
|
||||
}
|
||||
// Capture ALL short text containing numbers (for debugging)
|
||||
if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) {
|
||||
var cleaned = spanText.trim().replace(/\\s+/g, ' ');
|
||||
if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) {
|
||||
info._debug_short_text.push(cleaned);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page
|
||||
// This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels
|
||||
if (!info.total_reviews) {
|
||||
var allElems = document.querySelectorAll('*');
|
||||
for (var k = 0; k < Math.min(allElems.length, 1000); k++) {
|
||||
var elem = allElems[k];
|
||||
// Skip if has children (we want leaf nodes only)
|
||||
if (elem.children.length > 0) continue;
|
||||
var txt = (elem.textContent || '').trim();
|
||||
// Look for short text with both numbers and "review" word
|
||||
if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) {
|
||||
var match = txt.match(/([0-9][0-9,]*)/);
|
||||
if (match) {
|
||||
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
if (num > 0 && num < 1000000) {
|
||||
info.total_reviews = num;
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'LEAF: ' + txt;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 3: Try tabs (for business detail pages)
|
||||
// DEBUG: Collect all tab names
|
||||
info._debug_tabs = [];
|
||||
const tabs = document.querySelectorAll('[role="tab"]');
|
||||
for (let t = 0; t < tabs.length; t++) {
|
||||
info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30));
|
||||
}
|
||||
|
||||
// DEBUG: Collect all buttons with text (might contain review count)
|
||||
info._debug_buttons = [];
|
||||
const buttons = document.querySelectorAll('button');
|
||||
for (let b = 0; b < Math.min(buttons.length, 20); b++) {
|
||||
var btnText = (buttons[b].textContent || '').trim();
|
||||
if (btnText && btnText.length < 40) {
|
||||
info._debug_buttons.push(btnText.substring(0, 40));
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count)
|
||||
if (!info.total_reviews) {
|
||||
const tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (let tab of tabs) {
|
||||
const text = tab.textContent || '';
|
||||
let match = text.match(reviewPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
break;
|
||||
const text = (tab.textContent || '').trim();
|
||||
// Look for "Reviews" tab with count
|
||||
if (text.toLowerCase().includes('review')) {
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'TAB: ' + text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
match = text.match(numberPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.3: Reviews panel header (after clicking Reviews tab)
|
||||
// Google Maps shows "27 reviews" as heading text in the reviews panel
|
||||
if (!info.total_reviews) {
|
||||
// Look for headings containing review count
|
||||
var headings = document.querySelectorAll('h1, h2, [role="heading"]');
|
||||
for (var h = 0; h < headings.length; h++) {
|
||||
var hText = (headings[h].textContent || '').trim();
|
||||
if (/review/i.test(hText)) {
|
||||
var match = hText.match(/([0-9][0-9,]*)/);
|
||||
if (match) {
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'HEADING: ' + hText;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.4: Look for sort button area which often has total count
|
||||
// The sort dropdown area displays "Sort: Newest" and total reviews
|
||||
if (!info.total_reviews) {
|
||||
var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]');
|
||||
for (var s = 0; s < sortBtns.length; s++) {
|
||||
var parent = sortBtns[s].parentElement;
|
||||
if (parent) {
|
||||
var pText = (parent.textContent || '').trim();
|
||||
if (/review/i.test(pText)) {
|
||||
var match = pText.match(/([0-9][0-9,]*)\\s*review/i);
|
||||
if (match) {
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 3: Elements with semantic review-related attributes
|
||||
if (!info.total_reviews) {
|
||||
const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]');
|
||||
for (let elem of reviewLinks) {
|
||||
const text = (elem.textContent || '').trim();
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0) {
|
||||
info.total_reviews = count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 4: Try aria-labels
|
||||
// PRIORITY 4: Look for standalone review count text near rating
|
||||
// Find elements that contain ONLY "N reviews" pattern (not concatenated with rating)
|
||||
if (!info.total_reviews) {
|
||||
const elements = document.querySelectorAll('[aria-label]');
|
||||
for (let elem of elements) {
|
||||
const ariaLabel = elem.getAttribute('aria-label') || '';
|
||||
let match = ariaLabel.match(reviewPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
const allElements = document.querySelectorAll('span, a');
|
||||
for (let elem of allElements) {
|
||||
// Get direct text content only (not nested children)
|
||||
const text = (elem.textContent || '').trim();
|
||||
// Skip if too long (likely contains other content)
|
||||
if (text.length > 50) continue;
|
||||
// Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews")
|
||||
if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue;
|
||||
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
break;
|
||||
}
|
||||
match = ariaLabel.match(numberPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 5: Parse from visible page text using regex on short text blocks
|
||||
if (!info.total_reviews) {
|
||||
const walker = document.createTreeWalker(
|
||||
document.body,
|
||||
NodeFilter.SHOW_TEXT,
|
||||
null,
|
||||
false
|
||||
);
|
||||
while (walker.nextNode()) {
|
||||
const text = walker.currentNode.textContent.trim();
|
||||
if (text.length >= 5 && text.length <= 30) {
|
||||
// Match "27 reviews" but not "4.927 reviews"
|
||||
const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i);
|
||||
if (match) {
|
||||
const count = parseInt(match[1].replace(/[,]/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'WALKER: ' + text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts)
|
||||
if (!info.total_reviews) {
|
||||
var scripts = document.querySelectorAll('script');
|
||||
for (var sc = 0; sc < scripts.length; sc++) {
|
||||
var scriptText = scripts[sc].textContent || '';
|
||||
// Look for patterns like "user_reviews":{"count":27} or reviews_count":27
|
||||
var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i);
|
||||
if (jsonMatch) {
|
||||
var count = parseInt(jsonMatch[1]);
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'JSON_SCRIPT';
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Also look for review count in Google's data format like [\"27 reviews\"]
|
||||
if (!info.total_reviews) {
|
||||
var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i);
|
||||
if (dataMatch) {
|
||||
var count = parseInt(dataMatch[1]);
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'JSON_DATA: ' + dataMatch[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
|
||||
log.info(f"Business card extracted: name={business_info.get('name')}, "
|
||||
f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
|
||||
# Debug: log what aria-labels were found
|
||||
if business_info.get('_debug_aria'):
|
||||
log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}")
|
||||
if business_info.get('_debug_matched'):
|
||||
log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}")
|
||||
# Also log all numeric aria-labels (potential review counts)
|
||||
if business_info.get('_debug_all_numeric'):
|
||||
log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}")
|
||||
# Log any text with parenthetical numbers like "(27)"
|
||||
if business_info.get('_debug_parens'):
|
||||
log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}")
|
||||
# Log all short text containing numbers (for debugging review count detection)
|
||||
if business_info.get('_debug_short_text'):
|
||||
log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}")
|
||||
# Log the context around the rating element
|
||||
if business_info.get('_debug_rating_context'):
|
||||
for ctx in business_info.get('_debug_rating_context', []):
|
||||
log.info(f"DEBUG: Rating context: {ctx}")
|
||||
# Log what tabs exist on the page
|
||||
if business_info.get('_debug_tabs'):
|
||||
log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}")
|
||||
else:
|
||||
log.info(f"DEBUG: No tabs found on page")
|
||||
# Log buttons (might contain review count)
|
||||
if business_info.get('_debug_buttons'):
|
||||
log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}")
|
||||
|
||||
result = {
|
||||
"name": business_info.get('name'),
|
||||
|
||||
@@ -1,407 +0,0 @@
|
||||
"""
|
||||
Background job manager for Google Reviews Scraper.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Dict, Any, Optional, List
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
from modules.config import load_config
|
||||
from modules.scraper import GoogleReviewsScraper
|
||||
from modules.scraper_clean import fast_scrape_reviews # Updated to use clean scraper with hard refresh recovery
|
||||
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
|
||||
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
    """Lifecycle states of a scraping job.

    The ``str`` mixin makes every member compare equal to its plain string
    value, so members can be emitted directly in JSON payloads.
    """

    # Job has been created and is waiting to be started.
    PENDING = "pending"
    # Job has been handed to the worker thread pool.
    RUNNING = "running"
    # Scraper reported success.
    COMPLETED = "completed"
    # Scraper reported failure or raised an exception.
    FAILED = "failed"
    # Job was cancelled through the manager.
    CANCELLED = "cancelled"
|
||||
|
||||
|
||||
@dataclass
class ScrapingJob:
    """State record for one background scraping job."""

    job_id: str
    status: JobStatus
    url: str
    config: Dict[str, Any]
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None  # Last update time (for progress tracking)
    error_message: Optional[str] = None
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available (from page counter)
    images_count: Optional[int] = None
    progress: Optional[Dict[str, Any]] = None  # {"stage": ..., "message": ...}
    reviews_data: Optional[List[Dict[str, Any]]] = None  # Store actual review data
    scrape_time: Optional[float] = None  # Time taken to scrape

    def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
        """Convert the job to a dictionary suitable for JSON serialization.

        Args:
            include_reviews: Whether to include the full reviews data
                (default: False, because the list can be large).

        Returns:
            A dict of all fields with every datetime field rendered as an
            ISO-8601 string (unset datetimes stay ``None``).
        """
        data = asdict(self)

        # Every datetime field must be converted, including ``updated_at``;
        # a raw datetime left in the dict is not JSON serializable.
        for key in ('created_at', 'started_at', 'completed_at', 'updated_at'):
            if data[key]:
                data[key] = data[key].isoformat()

        # Exclude reviews_data by default (can be large)
        if not include_reviews:
            data.pop('reviews_data', None)

        return data
|
||||
|
||||
|
||||
class JobManager:
    """Manager for background scraping jobs.

    Jobs live in an in-memory dict keyed by job id and are executed on a
    thread pool. ``self.lock`` guards the job table and job state updates.
    """

    def __init__(self, max_concurrent_jobs: int = 3):
        """Initialize job manager.

        Args:
            max_concurrent_jobs: Maximum number of jobs allowed in the
                RUNNING state at once; also the thread pool size.
        """
        self.max_concurrent_jobs = max_concurrent_jobs
        self.jobs: Dict[str, ScrapingJob] = {}
        self.executor = ThreadPoolExecutor(max_workers=max_concurrent_jobs)
        self.lock = threading.Lock()

    def create_job(self, url: str, config_overrides: Optional[Dict[str, Any]] = None) -> str:
        """
        Create a new scraping job.

        Args:
            url: Google Maps URL to scrape
            config_overrides: Optional config overrides

        Returns:
            Job ID
        """
        job_id = str(uuid.uuid4())

        # Load base config
        config = load_config()

        # Apply URL
        config["url"] = url

        # Apply any overrides
        if config_overrides:
            config.update(config_overrides)

        job = ScrapingJob(
            job_id=job_id,
            status=JobStatus.PENDING,
            url=url,
            config=config,
            created_at=datetime.now(),
            progress={"stage": "created", "message": "Job created and queued"}
        )

        with self.lock:
            self.jobs[job_id] = job

        log.info(f"Created scraping job {job_id} for URL: {url}")
        return job_id

    def start_job(self, job_id: str) -> bool:
        """
        Start a pending job.

        Args:
            job_id: Job ID to start

        Returns:
            True if job was started, False otherwise (unknown id, job not
            pending, or the concurrency limit is already reached)
        """
        with self.lock:
            job = self.jobs.get(job_id)
            if job is None or job.status != JobStatus.PENDING:
                return False

            # Check if we can start more jobs
            running_count = sum(1 for j in self.jobs.values() if j.status == JobStatus.RUNNING)
            if running_count >= self.max_concurrent_jobs:
                return False

            job.status = JobStatus.RUNNING
            job.started_at = datetime.now()
            job.updated_at = datetime.now()
            job.progress = {"stage": "starting", "message": "Initializing scraper"}

        # Submit job to thread pool; the outcome is recorded on the job
        # itself, so the returned future is not kept.
        self.executor.submit(self._run_scraping_job, job_id)

        log.info(f"Started scraping job {job_id}")
        return True

    def _run_scraping_job(self, job_id: str):
        """
        Run the actual scraping job in background thread.

        Acquires a Chrome worker from the pool, runs the scraper with a
        progress callback, records the outcome on the job and always hands
        the worker back (recycled on error). A job that was cancelled in
        the meantime keeps its CANCELLED status.

        Args:
            job_id: Job ID to run
        """
        def progress_callback(current_count: int, total_count: int):
            """Update job progress during scraping"""
            with self.lock:
                job = self.jobs.get(job_id)
                # Ignore late callbacks for deleted or cancelled jobs so a
                # cancelled job is not flipped back to the "scraping" stage.
                if job is None or job.status != JobStatus.RUNNING:
                    return
                job.reviews_count = current_count
                job.total_reviews = total_count
                job.updated_at = datetime.now()  # Update last update time
                # Calculate percentage for better UX; an unknown (None) or
                # zero total yields 0 instead of raising.
                percentage = int(current_count / total_count * 100) if total_count and total_count > 0 else 0
                job.progress = {
                    "stage": "scraping",
                    "message": f"Collecting reviews: {current_count} / {total_count} ({percentage}%)",
                    "percentage": percentage
                }

        worker = None
        try:
            with self.lock:
                job = self.jobs.get(job_id)
                # The job may have been cancelled or deleted between
                # start_job() and this thread actually running.
                if job is None or job.status != JobStatus.RUNNING:
                    log.info(f"Job {job_id}: cancelled or deleted before start; skipping")
                    return
                job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}

            # Get a worker from the scraping pool
            worker = get_scraping_worker(timeout=30)

            if not worker:
                raise Exception("No Chrome workers available. Pool may be at capacity.")

            log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")

            # Get config
            url = job.config.get('url')
            headless = job.config.get('headless', True)  # Default to headless
            max_scrolls = job.config.get('max_scrolls', 999999)  # Effectively unlimited - relies on idle detection

            with self.lock:
                if job.status == JobStatus.RUNNING:
                    job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}

            # Run the FAST scraping with progress callback using pooled worker
            result = fast_scrape_reviews(
                url=url,
                headless=headless,
                max_scrolls=max_scrolls,
                progress_callback=progress_callback,
                driver=worker.driver,  # Use worker's driver
                return_driver=True  # Don't close the driver
            )

            # Pop the driver from result before storing
            result.pop('driver', None)

            # Mark job as completed or failed
            with self.lock:
                if job.status == JobStatus.CANCELLED:
                    # cancel_job() already finalized the job; keep that state.
                    log.info(f"Job {job_id}: cancelled while running; discarding result")
                elif result['success']:
                    job.status = JobStatus.COMPLETED
                    job.completed_at = datetime.now()
                    job.updated_at = datetime.now()
                    job.reviews_count = result['count']
                    job.total_reviews = result.get('total_reviews')  # Store total review count from page
                    job.reviews_data = result['reviews']  # Store the actual reviews
                    job.scrape_time = result['time']
                    job.progress = {
                        "stage": "completed",
                        "message": f"Scraping completed successfully in {result['time']:.1f}s",
                        "scroll_time": result.get('scroll_time'),
                        "extract_time": result.get('extract_time')
                    }
                    log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
                else:
                    job.status = JobStatus.FAILED
                    job.completed_at = datetime.now()
                    job.updated_at = datetime.now()
                    job.error_message = result.get('error', 'Unknown error')
                    job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
                    log.error(f"Failed scraping job {job_id}: {result.get('error')}")

        except Exception as e:
            # log.exception records the traceback through the logger.
            log.exception(f"Error in scraping job {job_id}: {e}")

            with self.lock:
                # .get(): the job may have been deleted while it was running.
                job = self.jobs.get(job_id)
                if job is not None and job.status != JobStatus.CANCELLED:
                    job.status = JobStatus.FAILED
                    job.completed_at = datetime.now()
                    job.updated_at = datetime.now()
                    job.error_message = str(e)
                    job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}

            # Recycle worker on error
            if worker:
                log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
                release_scraping_worker(worker, recycle=True)
                worker = None  # Mark as released

        finally:
            # Release worker back to pool if not already released
            if worker:
                log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
                release_scraping_worker(worker, recycle=False)

    def get_job(self, job_id: str) -> Optional[ScrapingJob]:
        """
        Get job by ID.

        Args:
            job_id: Job ID

        Returns:
            Job object or None if not found
        """
        with self.lock:
            return self.jobs.get(job_id)

    def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
        """
        Get reviews data for a specific job.

        Args:
            job_id: Job ID

        Returns:
            List of reviews or None if not found/not completed
        """
        with self.lock:
            job = self.jobs.get(job_id)
            if job and job.status == JobStatus.COMPLETED:
                return job.reviews_data
            return None

    def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
        """
        List jobs, optionally filtered by status.

        Args:
            status: Optional status filter
            limit: Maximum number of jobs to return

        Returns:
            List of jobs, newest first
        """
        with self.lock:
            jobs = list(self.jobs.values())

        if status:
            jobs = [job for job in jobs if job.status == status]

        # Sort by creation time (newest first)
        jobs.sort(key=lambda x: x.created_at, reverse=True)

        return jobs[:limit]

    def cancel_job(self, job_id: str) -> bool:
        """
        Cancel a pending or running job.

        This only marks the job as cancelled; a scrape that is already in
        progress is not interrupted here.

        Args:
            job_id: Job ID to cancel

        Returns:
            True if job was cancelled, False otherwise
        """
        with self.lock:
            job = self.jobs.get(job_id)
            if job is None:
                return False

            if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
                return False

            job.status = JobStatus.CANCELLED
            job.completed_at = datetime.now()
            job.updated_at = datetime.now()
            job.progress = {"stage": "cancelled", "message": "Job was cancelled"}

        log.info(f"Cancelled scraping job {job_id}")
        return True

    def delete_job(self, job_id: str) -> bool:
        """
        Delete a job from the manager.

        Args:
            job_id: Job ID to delete

        Returns:
            True if job was deleted, False otherwise
        """
        with self.lock:
            if job_id not in self.jobs:
                return False
            del self.jobs[job_id]

        log.info(f"Deleted scraping job {job_id}")
        return True

    def get_stats(self) -> Dict[str, Any]:
        """
        Get job manager statistics.

        Returns:
            Statistics dictionary with total job count, per-status counts,
            running job count and the configured concurrency limit
        """
        with self.lock:
            jobs = list(self.jobs.values())

        stats = {
            "total_jobs": len(jobs),
            "by_status": {},
            "running_jobs": 0,
            "max_concurrent_jobs": self.max_concurrent_jobs
        }

        for status in JobStatus:
            count = sum(1 for job in jobs if job.status == status)
            stats["by_status"][status.value] = count

        stats["running_jobs"] = stats["by_status"].get(JobStatus.RUNNING.value, 0)

        return stats

    def cleanup_old_jobs(self, max_age_hours: int = 24):
        """
        Clean up old completed/failed/cancelled jobs.

        Args:
            max_age_hours: Maximum age in hours before cleanup
        """
        cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)
        finished = [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]

        with self.lock:
            # Collect ids first; the dict must not change size while iterating.
            to_delete = [
                job_id
                for job_id, job in self.jobs.items()
                if job.status in finished
                and job.completed_at
                and job.completed_at.timestamp() < cutoff_time
            ]

            for job_id in to_delete:
                del self.jobs[job_id]

        if to_delete:
            log.info(f"Cleaned up {len(to_delete)} old jobs")

    def shutdown(self):
        """Shutdown the job manager, waiting for submitted jobs to finish"""
        log.info("Shutting down job manager")
        self.executor.shutdown(wait=True)
|
||||
2335
modules/scraper.py
2335
modules/scraper.py
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user