Remove old scraper files - consolidate to scraper_clean

Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:25:00 +00:00
parent 80e7771c00
commit 8ccf72a489
37 changed files with 859 additions and 11116 deletions

View File

@@ -1,923 +0,0 @@
"""
API Interceptor for Google Maps Reviews.
Uses Chrome DevTools Protocol (CDP) to intercept network requests and capture
Google's internal API responses for faster, more reliable data extraction.
"""
import base64
import json
import logging
import os
import re
import threading
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import parse_qs, urlparse
log = logging.getLogger("api_interceptor")


@dataclass
class InterceptedReview:
    """One review recovered from an intercepted API response.

    Every field has a neutral default so the parsers can fill in only what
    they manage to find in a payload.
    """

    review_id: str = ""
    author: str = ""
    rating: float = 0.0
    text: str = ""
    date_text: str = ""
    timestamp: int = 0
    likes: int = 0
    photos: List[str] = field(default_factory=list)
    profile_url: str = ""
    avatar_url: str = ""
    owner_response: str = ""
    owner_response_date: str = ""
    lang: str = ""


class GoogleMapsAPIInterceptor:
    """
    Intercepts Google Maps internal API calls to capture review data directly.

    Google Maps uses several internal endpoints for reviews:
    - /maps/preview/review/listentitiesreviews - Main reviews endpoint
    - /maps/rpc/placereview - Alternative review endpoint
    - /maps/preview/reviewsdata - Review data endpoint

    The responses are often in a custom protobuf-like JSON format that needs parsing.
    """

    # Regexes matched (case-insensitively) against response URLs to decide
    # whether a network event is review-related.
    REVIEW_API_PATTERNS = [
        r'maps/preview/review',
        r'maps/rpc/placereview',
        r'maps/preview/reviewsdata',
        r'maps/preview/place',
        r'maps/api/place',
        r'/locationhistory/preview',
        r'batchexecute.*review',
    ]

    def __init__(self, driver):
        """Initialize the interceptor with a Selenium driver."""
        self.driver = driver
        self.captured_responses: List[Dict[str, Any]] = []
        self.captured_reviews: List[InterceptedReview] = []
        self.request_map: Dict[str, Dict] = {}  # Map request IDs to URLs
        self._lock = threading.Lock()
        self._listening = False
        self._response_callback: Optional[Callable] = None

    # ------------------------------------------------------------------
    # CDP / performance-log based capture
    # ------------------------------------------------------------------

    def setup_interception(self):
        """Enable network interception via CDP.

        Returns:
            True when interception (or the performance-logging fallback) was
            enabled, False otherwise.
        """
        try:
            # Enable network domain
            self.driver.execute_cdp_cmd('Network.enable', {})
            # NOTE(review): per the CDP documentation, requests paused by
            # Network.setRequestInterception must be continued by the client;
            # nothing in this class handles Network.requestIntercepted, so
            # confirm matching requests are not left stalled.
            self.driver.execute_cdp_cmd('Network.setRequestInterception', {
                'patterns': [
                    {'urlPattern': '*maps*review*', 'resourceType': 'XHR'},
                    {'urlPattern': '*maps*review*', 'resourceType': 'Fetch'},
                    {'urlPattern': '*batchexecute*', 'resourceType': 'XHR'},
                    {'urlPattern': '*batchexecute*', 'resourceType': 'Fetch'},
                ]
            })
            self._listening = True
            log.info("API interception enabled via CDP")
            return True
        except Exception as e:
            log.warning(f"Could not enable CDP interception: {e}")
            # Try alternative approach
            return self._setup_performance_logging()

    def _setup_performance_logging(self):
        """Fallback: enable the Network domain with larger buffers only."""
        try:
            self.driver.execute_cdp_cmd('Network.enable', {
                'maxTotalBufferSize': 10000000,
                'maxResourceBufferSize': 5000000
            })
            self._listening = True
            log.info("API interception enabled via performance logging")
            return True
        except Exception as e:
            log.error(f"Failed to setup performance logging: {e}")
            return False

    def capture_network_responses(self, duration: float = 5.0):
        """
        Poll the driver's performance log for ``duration`` seconds and collect
        the bodies of review-related responses.

        Call this while scrolling/loading more reviews.

        Returns:
            The list of ``{'url', 'body', 'timestamp'}`` dicts captured during
            this call (also appended to ``self.captured_responses``).
        """
        if not self._listening:
            log.warning("Interception not set up, call setup_interception() first")
            return []

        captured = []
        start_time = time.time()
        while time.time() - start_time < duration:
            try:
                # Get performance logs which contain network events
                logs = self.driver.get_log('performance')
                for entry in logs:
                    try:
                        log_data = json.loads(entry['message'])
                        message = log_data.get('message', {})
                        method = message.get('method', '')
                        params = message.get('params', {})

                        if method == 'Network.responseReceived':
                            # Remember review-related requests so their body
                            # can be fetched once loading has finished.
                            response = params.get('response', {})
                            url = response.get('url', '')
                            if self._is_review_api(url):
                                request_id = params.get('requestId')
                                self.request_map[request_id] = {
                                    'url': url,
                                    'status': response.get('status'),
                                    'headers': response.get('headers', {})
                                }
                        elif method == 'Network.loadingFinished':
                            request_id = params.get('requestId')
                            if request_id in self.request_map:
                                body = self._get_response_body(request_id)
                                if body:
                                    captured.append({
                                        'url': self.request_map[request_id]['url'],
                                        'body': body,
                                        'timestamp': time.time()
                                    })
                    except Exception as parse_error:
                        log.debug(f"Error parsing log entry: {parse_error}")
                        continue
            except Exception as e:
                # Performance logs might not be available
                log.debug(f"Could not get performance logs: {e}")
                break
            time.sleep(0.1)

        with self._lock:
            self.captured_responses.extend(captured)
        return captured

    # ------------------------------------------------------------------
    # In-page JavaScript capture
    # ------------------------------------------------------------------

    def get_response_bodies_cdp(self):
        """Drain and return the responses buffered by the injected JS hook.

        Returns an empty list when the hook was never injected or the script
        fails.
        """
        responses = []
        try:
            # The explicit top-level ``return`` is required: execute_script
            # runs the text as a function body, so without it the result of
            # the IIFE is discarded and the call always yields None.
            drain_script = """
            return (function() {
                if (window.__interceptedResponses) {
                    var responses = window.__interceptedResponses;
                    window.__interceptedResponses = [];
                    return responses;
                }
                return [];
            })();
            """
            captured = self.driver.execute_script(drain_script)
            if captured:
                responses.extend(captured)
        except Exception as e:
            log.debug(f"CDP response capture error: {e}")
        return responses

    def inject_response_interceptor(self):
        """
        Inject JavaScript that wraps ``fetch`` and ``XMLHttpRequest`` in the
        page and buffers matching responses in ``window.__interceptedResponses``.

        Returns:
            True if the script was executed, False if injection failed.
        """
        intercept_script = """
        (function() {
            // Skip if already injected
            if (window.__reviewInterceptorInjected) {
                console.log('[API Interceptor] Already injected, skipping');
                return;
            }
            window.__reviewInterceptorInjected = true;
            window.__interceptedResponses = [];
            window.__interceptorStats = {
                totalFetch: 0,
                totalXHR: 0,
                capturedFetch: 0,
                capturedXHR: 0,
                lastCapture: null
            };
            console.log('[API Interceptor] Initializing...');

            // Store original fetch
            const originalFetch = window.fetch;

            // Override fetch
            window.fetch = async function(...args) {
                window.__interceptorStats.totalFetch++;
                // fetch() accepts a string, a URL or a Request; resolve all
                // three to the actual URL text.
                const input = args[0];
                const url = (typeof input === 'string')
                    ? input
                    : ((input && input.url) || String(input));
                // Log ALL fetch requests for debugging
                console.debug('[API Interceptor] FETCH:', url.substring(0, 150));
                const response = await originalFetch.apply(this, args);
                // Check if this is a review-related API call
                if (url.includes('review') || url.includes('batchexecute') ||
                    url.includes('place') || url.includes('maps') ||
                    url.includes('listugcposts') || url.includes('getreviews')) {
                    try {
                        const clone = response.clone();
                        const text = await clone.text();
                        console.log('[API Interceptor] ✅ CAPTURED FETCH:', url.substring(0, 100), 'Size:', text.length);
                        window.__interceptedResponses.push({
                            url: url,
                            body: text,
                            timestamp: Date.now(),
                            type: 'fetch',
                            size: text.length
                        });
                        window.__interceptorStats.capturedFetch++;
                        window.__interceptorStats.lastCapture = new Date().toISOString();
                        // Keep only last 100 responses to avoid memory issues
                        if (window.__interceptedResponses.length > 100) {
                            window.__interceptedResponses = window.__interceptedResponses.slice(-50);
                        }
                    } catch (e) {
                        console.error('[API Interceptor] Response capture error:', e);
                    }
                }
                return response;
            };

            // Store original XMLHttpRequest
            const originalXHR = window.XMLHttpRequest;

            // Create intercepting XHR
            window.XMLHttpRequest = function() {
                const xhr = new originalXHR();
                const originalOpen = xhr.open;
                let requestUrl = '';
                xhr.open = function(method, url, ...rest) {
                    // url may be a URL object; coerce so substring/includes
                    // never throw inside the page's own request.
                    requestUrl = String(url);
                    window.__interceptorStats.totalXHR++;
                    console.debug('[API Interceptor] XHR:', method, requestUrl.substring(0, 150));
                    return originalOpen.apply(this, [method, url, ...rest]);
                };
                xhr.addEventListener('load', function() {
                    if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
                        requestUrl.includes('place') || requestUrl.includes('maps') ||
                        requestUrl.includes('listugcposts') || requestUrl.includes('getreviews')) {
                        try {
                            console.log('[API Interceptor] ✅ CAPTURED XHR:', requestUrl.substring(0, 100), 'Size:', xhr.responseText.length);
                            window.__interceptedResponses.push({
                                url: requestUrl,
                                body: xhr.responseText,
                                timestamp: Date.now(),
                                type: 'xhr',
                                status: xhr.status,
                                size: xhr.responseText.length
                            });
                            window.__interceptorStats.capturedXHR++;
                            window.__interceptorStats.lastCapture = new Date().toISOString();
                            if (window.__interceptedResponses.length > 100) {
                                window.__interceptedResponses = window.__interceptedResponses.slice(-50);
                            }
                        } catch (e) {
                            console.error('[API Interceptor] XHR capture error:', e);
                        }
                    }
                });
                return xhr;
            };

            // Copy static properties
            for (let prop of Object.getOwnPropertyNames(originalXHR)) {
                try {
                    window.XMLHttpRequest[prop] = originalXHR[prop];
                } catch (e) {}
            }
            console.log('[API Interceptor] ✅ Injected successfully! Monitoring network requests...');

            // Log stats every 10 seconds
            setInterval(() => {
                if (window.__interceptorStats.totalFetch > 0 || window.__interceptorStats.totalXHR > 0) {
                    console.log('[API Interceptor] Stats:',
                        'Fetch:', window.__interceptorStats.totalFetch, '/', window.__interceptorStats.capturedFetch,
                        'XHR:', window.__interceptorStats.totalXHR, '/', window.__interceptorStats.capturedXHR,
                        'Queue:', window.__interceptedResponses.length);
                }
            }, 10000);
            return true;
        })();
        """
        try:
            self.driver.execute_script(intercept_script)
            log.info("JavaScript response interceptor injected with enhanced debugging")
            # Get initial stats
            stats = self.get_interceptor_stats()
            log.debug(f"Interceptor stats: {stats}")
            return True
        except Exception as e:
            log.warning(f"Failed to inject interceptor: {e}")
            return False

    def get_intercepted_responses(self):
        """Retrieve (and clear) the responses buffered in the browser."""
        try:
            script = """
            if (window.__interceptedResponses) {
                var responses = window.__interceptedResponses.slice();
                window.__interceptedResponses = [];
                return responses;
            }
            return [];
            """
            responses = self.driver.execute_script(script)
            if responses:
                log.debug(f"Retrieved {len(responses)} intercepted responses from browser")
                for resp in responses[:3]:  # Log first 3 for debugging
                    log.debug(f" - {resp.get('type', '?').upper()}: {resp.get('url', '')[:100]} ({resp.get('size', 0)} bytes)")
            else:
                log.debug("No intercepted responses available")
            return responses or []
        except Exception as e:
            log.debug(f"Error getting intercepted responses: {e}")
            return []

    def get_interceptor_stats(self):
        """Return ``window.__interceptorStats`` from the page, or None."""
        try:
            script = """
            if (window.__interceptorStats) {
                return window.__interceptorStats;
            }
            return null;
            """
            return self.driver.execute_script(script)
        except Exception as e:
            log.debug(f"Error getting interceptor stats: {e}")
            return None

    def get_browser_console_logs(self):
        """Get browser console logs (for debugging); [] when unavailable."""
        try:
            return self.driver.get_log('browser')
        except Exception as e:
            log.debug(f"Could not get browser console logs: {e}")
            return []

    def dump_responses_to_file(self, responses: List[Dict], output_dir: str = "debug_api_responses"):
        """
        Dump captured responses to files for debugging.

        Creates one JSON file per response containing its metadata and body.

        Returns:
            The output directory as a string, or None if writing failed.
        """
        try:
            output_path = Path(output_dir)
            # parents=True so a nested output_dir does not fail on first use.
            output_path.mkdir(parents=True, exist_ok=True)
            for i, response in enumerate(responses):
                timestamp = response.get('timestamp', int(time.time() * 1000))
                url = response.get('url', 'unknown')
                req_type = response.get('type', 'unknown')
                # Create filename from timestamp and type
                filepath = output_path / f"{timestamp}_{req_type}_{i}.json"
                # Write response with metadata
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump({
                        'metadata': {
                            'url': url,
                            'type': req_type,
                            'timestamp': timestamp,
                            'size': response.get('size', len(response.get('body', ''))),
                            'status': response.get('status')
                        },
                        'body': response.get('body', '')
                    }, f, indent=2, ensure_ascii=False)
            log.info(f"Dumped {len(responses)} responses to {output_path}")
            return str(output_path)
        except Exception as e:
            log.error(f"Error dumping responses to file: {e}")
            return None

    def _is_review_api(self, url: str) -> bool:
        """Check if URL matches any of REVIEW_API_PATTERNS."""
        url_lower = url.lower()
        return any(re.search(pattern, url_lower) for pattern in self.REVIEW_API_PATTERNS)

    def _get_response_body(self, request_id: str) -> Optional[str]:
        """Get response body for a request ID using CDP (None on failure)."""
        try:
            result = self.driver.execute_cdp_cmd('Network.getResponseBody', {
                'requestId': request_id
            })
            body = result.get('body', '')
            if result.get('base64Encoded'):
                body = base64.b64decode(body).decode('utf-8', errors='ignore')
            return body
        except Exception as e:
            log.debug(f"Could not get response body for {request_id}: {e}")
            return None

    # ------------------------------------------------------------------
    # Response parsing
    # ------------------------------------------------------------------

    def parse_reviews_from_responses(self, responses: List[Dict]) -> List[InterceptedReview]:
        """
        Parse review data from captured API responses.

        Google's API responses use a custom nested array format. Results are
        de-duplicated by review ID; reviews without an ID are not returned.
        """
        reviews = []
        for response in responses:
            try:
                body = response.get('body', '')
                url = response.get('url', '')
                # Skip non-JSON responses
                if not body or body.startswith('<!DOCTYPE'):
                    continue
                reviews.extend(self._parse_response_body(body, url))
            except Exception as e:
                log.debug(f"Error parsing response: {e}")
                continue

        # Deduplicate by review ID
        seen_ids = set()
        unique_reviews = []
        for review in reviews:
            if review.review_id and review.review_id not in seen_ids:
                seen_ids.add(review.review_id)
                unique_reviews.append(review)
        return unique_reviews

    def _parse_response_body(self, body: str, url: str) -> List[InterceptedReview]:
        """Parse a single response body for review data."""
        reviews = []
        # Skip empty or HTML responses
        if not body or body.startswith('<!DOCTYPE') or body.startswith('<html'):
            return reviews
        # Handle batch execute format (starts with )]}' prefix)
        if body.startswith(")]}'"):
            body = body[4:].strip()
        try:
            data = json.loads(body)
        except json.JSONDecodeError:
            # Try to extract JSON from the response
            json_match = re.search(r'\[.*\]', body, re.DOTALL)
            if not json_match:
                log.debug("No JSON found in response")
                return reviews
            try:
                data = json.loads(json_match.group())
            except ValueError:
                log.debug("Failed to parse JSON from response")
                return reviews

        # Special handling for listugcposts endpoint
        if 'listugcposts' in url.lower():
            reviews.extend(self._parse_listugcposts_response(data))
        else:
            # Generic recursive extraction
            reviews.extend(self._extract_reviews_recursive(data))
        return reviews

    def _parse_listugcposts_response(self, data: Any) -> List[InterceptedReview]:
        """
        Parse Google Maps listugcposts API response.

        Structure discovered:
            data[2] = array of review groups
            data[2][i] = single review group [review_data, metadata, continuation_token]
            data[2][i][0] = review data (6-item array containing all review info)
        """
        reviews = []
        try:
            if not isinstance(data, list) or len(data) < 3:
                log.debug("Response doesn't match expected structure (not a list or too short)")
                return reviews
            # data[2] contains the review groups
            review_groups = data[2]
            if not isinstance(review_groups, list):
                log.debug("data[2] is not a list")
                return reviews
            log.debug(f"Found {len(review_groups)} reviews in data[2]")
            # Each group IS ONE REVIEW
            for group_idx, group in enumerate(review_groups):
                if not isinstance(group, list) or len(group) == 0:
                    continue
                # group[0] is the review data array (6 items)
                review_data = group[0]
                if not isinstance(review_data, list):
                    continue
                try:
                    review = self._parse_google_review_array(review_data)
                    if review:
                        reviews.append(review)
                        log.debug(f"Parsed review {group_idx}: {review.author} - {review.rating}")
                except Exception as e:
                    log.debug(f"Error parsing review at group[{group_idx}]: {e}")
        except Exception as e:
            log.debug(f"Error in _parse_listugcposts_response: {e}")
        return reviews

    @staticmethod
    def _dig(data: Any, *path: int) -> Any:
        """Follow list indices; return None if a step is missing or not a list."""
        current = data
        for index in path:
            if not isinstance(current, list) or len(current) <= index:
                return None
            current = current[index]
        return current

    def _parse_google_review_array(self, review_data: List) -> Optional[InterceptedReview]:
        """
        Parse a single review from Google's 6-item array format.

        Discovered structure (review_data is a 6-item array):
            review_data[0] = Review ID (string)
            review_data[1][4][5][0] = Author Name
            review_data[1][4][5][3] = User ID
            review_data[1][6] = Date Text
            review_data[2][0][0] = Rating (1-5)
            review_data[2][15][0][0] = Review Text (original)
            review_data[2][15][1][0] = Review Text (translated)

        Returns:
            The review when it has a rating plus an author or text, else None.
        """
        review = InterceptedReview()

        review_id = self._dig(review_data, 0)
        if isinstance(review_id, str):
            review.review_id = review_id

        author_info = self._dig(review_data, 1, 4, 5)
        if isinstance(author_info, list):
            name = self._dig(author_info, 0)
            if isinstance(name, str):
                review.author = name
            # Profile picture at [1][4][5][1] (if available)
            avatar = self._dig(author_info, 1)
            if isinstance(avatar, str):
                review.avatar_url = avatar

        date_text = self._dig(review_data, 1, 6)
        if isinstance(date_text, str):
            review.date_text = date_text

        rating_val = self._dig(review_data, 2, 0, 0)
        if isinstance(rating_val, (int, float)) and 1 <= rating_val <= 5:
            review.rating = float(rating_val)

        text = self._dig(review_data, 2, 15, 0, 0)
        if isinstance(text, str):
            review.text = text

        # Only return if we have minimum required data
        if review.rating > 0 and (review.author or review.text):
            return review
        return None

    def _parse_review_array_v2(self, arr: List) -> Optional[InterceptedReview]:
        """
        Parse review from Google's nested array format.

        Heuristic variant: fields are recognised by type and length rather
        than by fixed position.
        """
        review = InterceptedReview()
        try:
            # Extract review ID (usually a long string in first few elements)
            for item in arr[:5]:
                if isinstance(item, str) and len(item) > 30 and not item.startswith('http'):
                    review.review_id = item
                    break

            # Extract rating (number between 1-5)
            for item in arr:
                if isinstance(item, (int, float)) and 1 <= item <= 5:
                    review.rating = float(item)
                    break
                elif isinstance(item, list):
                    for subitem in item:
                        if isinstance(subitem, (int, float)) and 1 <= subitem <= 5:
                            review.rating = float(subitem)
                            break
                    if review.rating > 0:
                        break

            # Extract review text (long string, not a URL)
            for item in arr:
                if isinstance(item, str) and len(item) > 50 and not item.startswith('http'):
                    if not review.review_id or item != review.review_id:
                        review.text = item
                        break

            # Extract author name (shorter string, not ID or text)
            for item in arr:
                if isinstance(item, str) and 3 <= len(item) <= 100:
                    if item != review.review_id and item != review.text and not item.startswith('http'):
                        review.author = item
                        break
                elif isinstance(item, list):
                    for subitem in item:
                        if isinstance(subitem, str) and 3 <= len(subitem) <= 100:
                            if subitem != review.text and not subitem.startswith('http'):
                                review.author = subitem
                                break
                    if review.author:
                        break

            # Extract dates (strings that look like dates)
            date_patterns = [r'\d{1,2}/\d{1,2}/\d{2,4}', r'\d{4}-\d{2}-\d{2}', r'hace \d+', r'\d+ days? ago']
            for item in arr:
                if isinstance(item, str):
                    for pattern in date_patterns:
                        if re.search(pattern, item, re.IGNORECASE):
                            review.date_text = item
                            break
                    if review.date_text:
                        break

            # Only return if we have meaningful data
            if (review.review_id or review.author) and review.rating > 0:
                return review
        except Exception as e:
            log.debug(f"Error in _parse_review_array_v2: {e}")
        return None

    def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
        """Recursively search nested dicts/lists for review-shaped data."""
        reviews = []
        if depth > 20:  # Prevent runaway recursion on deeply nested payloads
            return reviews
        # Skip if data is already an InterceptedReview object
        if isinstance(data, InterceptedReview):
            return [data]

        if isinstance(data, dict):
            # Check if this looks like a review object
            review = self._try_parse_review_dict(data)
            if review:
                reviews.append(review)
            # Recurse into dict values
            for value in data.values():
                if not isinstance(value, InterceptedReview):
                    reviews.extend(self._extract_reviews_recursive(value, depth + 1))
        elif isinstance(data, list):
            # Check if this array looks like a review array
            review = self._try_parse_review_array(data)
            if review:
                reviews.append(review)
            # Recurse into list items
            for item in data:
                if not isinstance(item, InterceptedReview):
                    reviews.extend(self._extract_reviews_recursive(item, depth + 1))
        return reviews

    def _try_parse_review_dict(self, data: Dict) -> Optional[InterceptedReview]:
        """Try to parse a dictionary as a review object.

        Returns None when the dict has none of the typical review keys, lacks
        meaningful data, or a field cannot be converted.
        """
        # Common keys in review objects
        review_keys = {'reviewId', 'review_id', 'author', 'rating', 'text', 'comment'}
        if not any(k in data for k in review_keys):
            return None
        try:
            review = InterceptedReview()

            # 'author' may be a plain name or a nested profile object.
            raw_author = data.get('author')
            author_data = raw_author if isinstance(raw_author, dict) else {}
            if author_data:
                author_name = author_data.get('name') or author_data.get('displayName') or ''
            else:
                author_name = raw_author or ''

            # Try various key names for each field
            review.review_id = data.get('reviewId') or data.get('review_id') or data.get('id', '')
            review.author = author_name or data.get('authorName') or data.get('name', '')
            review.rating = float(data.get('rating') or data.get('starRating') or 0)
            review.text = data.get('text') or data.get('comment') or data.get('reviewText', '')
            review.date_text = data.get('publishTime') or data.get('relativePublishTime') or data.get('date', '')
            review.likes = int(data.get('thumbsUpCount') or data.get('likes') or 0)

            # Photos: accept both bare URL strings and {'url': ...} objects.
            photos = data.get('photos') or data.get('reviewPhotos') or []
            if isinstance(photos, (list, tuple)):
                for photo in photos:
                    if isinstance(photo, dict):
                        photo_url = photo.get('url')
                        if photo_url:
                            review.photos.append(photo_url)
                    elif isinstance(photo, str) and photo:
                        review.photos.append(photo)

            # Profile
            review.profile_url = author_data.get('profileUrl') or data.get('profileUrl', '')
            review.avatar_url = author_data.get('profilePhotoUrl') or data.get('avatar', '')

            # Owner response
            owner_resp = data.get('ownerResponse') or data.get('ownerReply') or {}
            if isinstance(owner_resp, dict):
                review.owner_response = owner_resp.get('text', '')
                review.owner_response_date = owner_resp.get('publishTime', '')

            # Only return if we have meaningful data
            if review.review_id or (review.author and review.text):
                return review
        except Exception as e:
            log.debug(f"Error parsing review dict: {e}")
        return None

    def _try_parse_review_array(self, data: List) -> Optional[InterceptedReview]:
        """
        Try to parse a nested array as a review (Google's protobuf-like format).

        Google often uses positional arrays like: [id, author, [rating], text, ...]
        """
        if not data or len(data) < 3:
            return None
        try:
            # Pattern: [review_id, [author_info], rating_array, text, ...]
            review = InterceptedReview()

            # Check if first element looks like a review ID
            if isinstance(data[0], str) and len(data[0]) > 20:
                review.review_id = data[0]

            # Search for rating (usually a small number 1-5)
            for item in data:
                if isinstance(item, (int, float)) and 1 <= item <= 5:
                    review.rating = float(item)
                    break
                elif isinstance(item, list) and len(item) >= 1:
                    if isinstance(item[0], (int, float)) and 1 <= item[0] <= 5:
                        review.rating = float(item[0])
                        break

            # Search for text: the first long string that is not the ID.
            for item in data:
                if isinstance(item, str) and len(item) > 30 and item != review.review_id:
                    review.text = item
                    break
                elif isinstance(item, list):
                    for subitem in item:
                        if isinstance(subitem, str) and len(subitem) > 30 and subitem != review.review_id:
                            review.text = subitem
                            break
                    # Stop at the first hit so later items cannot overwrite it.
                    if review.text:
                        break

            # Search for author name (shorter string)
            for item in data:
                if isinstance(item, list) and len(item) >= 1:
                    for subitem in item:
                        if isinstance(subitem, str) and 2 <= len(subitem) <= 100 and subitem != review.text:
                            review.author = subitem
                            break
                    if review.author:
                        break

            # Search for URLs (photos, profile)
            for item in data:
                if isinstance(item, str) and item.startswith('http'):
                    if 'googleusercontent' in item or 'ggpht' in item:
                        if not review.avatar_url:
                            review.avatar_url = item
                        else:
                            review.photos.append(item)
                elif isinstance(item, list):
                    self._extract_urls_from_array(item, review)

            # Only return if we have meaningful data
            if review.review_id and review.rating > 0:
                return review
            if review.text and review.rating > 0:
                return review
        except Exception as e:
            log.debug(f"Error parsing review array: {e}")
        return None

    def _extract_urls_from_array(self, arr: List, review: InterceptedReview, depth: int = 0):
        """Collect image URLs from nested arrays into ``review`` (max depth 5)."""
        if depth > 5:
            return
        for item in arr:
            if isinstance(item, str) and item.startswith('http'):
                if 'googleusercontent' in item or 'ggpht' in item or 'lh3' in item:
                    if 'w72-h72' in item or 'p-rp-mo' in item:  # Profile pic pattern
                        review.avatar_url = item
                    else:
                        review.photos.append(item)
            elif isinstance(item, list):
                # Argument order must match the signature: (arr, review, depth).
                self._extract_urls_from_array(item, review, depth + 1)

    def convert_to_raw_review_format(self, intercepted: InterceptedReview) -> Dict[str, Any]:
        """Convert an InterceptedReview to the format used by RawReview/storage."""
        return {
            'review_id': intercepted.review_id,
            'author': intercepted.author,
            'rating': intercepted.rating,
            'description': {'en': intercepted.text} if intercepted.text else {},
            'likes': intercepted.likes,
            'user_images': intercepted.photos,
            'author_profile_url': intercepted.profile_url,
            'profile_picture': intercepted.avatar_url,
            'owner_responses': {
                'en': {'text': intercepted.owner_response}
            } if intercepted.owner_response else {},
            'review_date': intercepted.date_text,
            '_source': 'api_intercept'
        }

    def cleanup(self):
        """Disable the Network domain (best effort) and drop captured state."""
        try:
            self.driver.execute_cdp_cmd('Network.disable', {})
        except Exception as e:
            # Best effort: the browser may already be gone at cleanup time.
            log.debug(f"Could not disable CDP network domain: {e}")
        self.captured_responses.clear()
        self.captured_reviews.clear()
        self.request_map.clear()
        self._listening = False

View File

@@ -35,16 +35,45 @@ class ChromeWorker:
# SeleniumBase Driver automatically includes UC mode anti-detection
# Initialize with longer timeouts for large scraping jobs
# Chrome arguments for Docker stability
chrome_args = [
"--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker)
"--disable-gpu", # Disable GPU acceleration
"--no-sandbox", # Required for Docker
"--disable-software-rasterizer",
"--disable-extensions",
"--disable-background-networking",
"--disable-default-apps",
"--disable-sync",
"--metrics-recording-only",
"--mute-audio",
"--no-first-run",
"--safebrowsing-disable-auto-update",
]
self.driver = Driver(
uc=True,
headless=self.headless,
page_load_strategy="normal"
page_load_strategy="normal",
chromium_arg=",".join(chrome_args)
)
# Set generous timeouts for large scraping jobs
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
self.driver.set_script_timeout(60) # 1 minute for complex extraction
# Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
# This prevents location-based variations in search results
try:
self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
except Exception as e:
log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")
self.driver.maximize_window()
self.created_at = time.time()
self.last_used = time.time()

View File

@@ -1,80 +0,0 @@
"""
Command line interface handling for Google Maps Reviews Scraper.
"""
import argparse
import json
from pathlib import Path
from modules.config import DEFAULT_CONFIG_PATH
def _parse_bool(value):
    """Convert a command-line token such as "true", "no" or "1" to a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") becomes True. This converter reads
    the text instead. The empty string stays False, matching ``bool("")``.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1", "on"):
        return True
    if token in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_arguments():
    """Parse command line arguments.

    Returns:
        The argparse namespace, with ``config`` converted to a ``Path``
        (``DEFAULT_CONFIG_PATH`` when not given) and ``custom_params``
        decoded from JSON (``None`` when the JSON is invalid).
    """
    ap = argparse.ArgumentParser(description="GoogleMaps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first alreadyseen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    ap.add_argument("--use-mongodb", type=_parse_bool, default=None,
                    help="whether to use MongoDB for storage")

    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=_parse_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=_parse_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")

    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=_parse_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=_parse_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=_parse_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")

    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")

    # API interception option
    ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
                    help="enable API response interception for faster data capture (experimental)")

    args = ap.parse_args()

    # Handle config path
    if args.config is not None:
        args.config = Path(args.config)
    else:
        args.config = DEFAULT_CONFIG_PATH

    # Process custom params if provided
    if args.custom_params:
        try:
            args.custom_params = json.loads(args.custom_params)
        except json.JSONDecodeError:
            # Keep running with no custom params rather than aborting the scrape.
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            args.custom_params = None
    return args

View File

@@ -77,11 +77,17 @@ class DatabaseManager:
error_message TEXT,
metadata JSONB,
scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
""")
# Add scrape_logs column if it doesn't exist (for existing databases)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""")
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -182,10 +188,12 @@ class DatabaseManager:
started_at,
completed_at,
reviews_count,
total_reviews,
reviews_data,
scrape_time,
error_message,
metadata
metadata,
scrape_logs
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -246,8 +254,13 @@ class DatabaseManager:
kwargs['completed_at'] = datetime.now()
for key, value in kwargs.items():
set_clauses.append(f"{key} = ${param_idx}")
params.append(value)
# Handle JSONB fields specially
if key == 'scrape_logs' and value is not None:
set_clauses.append(f"{key} = ${param_idx}::jsonb")
params.append(json.dumps(value) if not isinstance(value, str) else value)
else:
set_clauses.append(f"{key} = ${param_idx}")
params.append(value)
param_idx += 1
query = f"""
@@ -264,7 +277,8 @@ class DatabaseManager:
job_id: UUID,
reviews: List[Dict[str, Any]],
scrape_time: float,
total_reviews: Optional[int] = None
total_reviews: Optional[int] = None,
scrape_logs: Optional[List[Dict[str, Any]]] = None
):
"""
Save scraping results to database.
@@ -274,6 +288,7 @@ class DatabaseManager:
reviews: List of review dictionaries
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper
"""
async with self.pool.acquire() as conn:
await conn.execute("""
@@ -284,9 +299,11 @@ class DatabaseManager:
reviews_count = $2,
total_reviews = $3,
reviews_data = $4::jsonb,
scrape_time = $5
scrape_time = $5,
scrape_logs = $6::jsonb
WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
json.dumps(scrape_logs) if scrape_logs else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
@@ -317,8 +334,10 @@ class DatabaseManager:
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message
error_message,
metadata
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
@@ -333,8 +352,10 @@ class DatabaseManager:
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message
error_message,
metadata
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2

View File

@@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
else:
log.info(f"[PROFILE] Using pooled driver (0.00s)")
# Force English locale for consistent parsing
# Force English locale AND US region for consistent parsing/results
# This helps avoid geolocation-based variations in Google Maps results
if 'hl=' in url:
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
else:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
# Add US region parameter if not present
if 'gl=' not in url:
url = f"{url}&gl=us"
# Set Chrome geolocation to US (Boston, MA) using CDP
# This ensures Google Maps shows US results regardless of server location
try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log.info("Set geolocation to US (Boston, MA)")
except Exception as e:
log.warning(f"Could not set geolocation: {e}")
log.info(f"Loading Google Maps page...")
t0 = timing_module.time()
driver.get(url)
@@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
for btn in form_btns:
btn_text = (btn.text or '').lower()
if 'aceptar todo' in btn_text or 'accept all' in btn_text:
if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text:
log.info(f"Clicking GDPR consent: {btn.text}")
btn.click()
time.sleep(1) # Reduced from 2s
time.sleep(1)
break
else:
if len(form_btns) >= 2:
log.info("Using fallback: clicking second form button")
form_btns[1].click()
time.sleep(1) # Reduced from 2s
time.sleep(1)
except Exception as e:
log.warning(f"GDPR consent handling failed: {e}")
# After GDPR consent, reload the original URL to ensure proper page state
log.info(f"Reloading original URL after GDPR consent...")
driver.get(url)
time.sleep(1)
log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
else:
log.info(f"[PROFILE] No GDPR consent page (0.00s)")
@@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
try:
log.info("Waiting for Google Maps content to load...")
wait = WebDriverWait(driver, 10)
# Wait for basic page structure (h1 or heading)
wait.until(
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]')
)
log.info("Google Maps content loaded successfully")
log.info("Basic page structure loaded")
# Wait for page to settle - search URLs redirect to place URLs
# which triggers additional content loading
time.sleep(2)
# Wait specifically for review count element (aria-label ending with "reviews")
# This is the most reliable indicator that the business detail is loaded
try:
WebDriverWait(driver, 5).until(
lambda d: d.execute_script("""
var elems = document.querySelectorAll('[aria-label]');
for (var i = 0; i < elems.length; i++) {
var label = elems[i].getAttribute('aria-label') || '';
if (/^[0-9]+ reviews?$/.test(label)) return true;
}
return false;
""")
)
log.info("Review count element loaded")
except:
# Fallback: Try clicking Reviews tab or rating stars to expose the review count
log.info("Review count wait timeout, trying to click Reviews/rating...")
try:
# Try 1: Click Reviews tab (if exists)
clicked = driver.execute_script("""
var tabs = document.querySelectorAll('[role="tab"]');
for (var i = 0; i < tabs.length; i++) {
var txt = (tabs[i].textContent || '').toLowerCase();
if (txt.includes('review')) {
tabs[i].click();
return 'tab';
}
}
// Try 2: Click the rating stars element (often links to reviews)
var stars = document.querySelector('[role="img"][aria-label*="star"]');
if (stars) {
var parent = stars.parentElement;
if (parent && parent.tagName.toLowerCase() === 'button') {
parent.click();
return 'stars_button';
}
stars.click();
return 'stars';
}
// Try 3: Click "Write a review" or any review-related button
var btns = document.querySelectorAll('button[aria-label*="review" i]');
for (var b = 0; b < btns.length; b++) {
var label = btns[b].getAttribute('aria-label') || '';
if (!/write/i.test(label) && /review/i.test(label)) {
btns[b].click();
return 'review_btn: ' + label;
}
}
return 'none';
""")
log.info(f"Clicked: {clicked}")
time.sleep(2) # Wait for reviews panel to load
except Exception as e:
log.warning(f"Click attempt failed: {e}")
except Exception as e:
log.warning(f"Timeout waiting for Maps content: {e}")
time.sleep(0.5) # Minimal fallback wait
time.sleep(2) # Fallback wait
log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")
log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...")
log.info(f"DEBUG: Page title: {driver.title}")
# Extract business card information using JavaScript
t0 = timing_module.time()
@@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
total_reviews: null
};
// Extract business name
const nameSelectors = [
'h1.DUwDvf',
'[role="main"] h1',
'h1.fontHeadlineLarge'
];
// ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============
for (const selector of nameSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.name = elem.textContent.trim();
break;
}
}
// Helper: Parse review count from text, handling multiple formats
function parseReviewCount(text) {
if (!text) return null;
// Extract address
const addressSelectors = [
'button[data-item-id*="address"]',
'[data-item-id*="address"]',
'div[aria-label*="Address"]'
];
for (const selector of addressSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.address = elem.textContent.trim();
break;
}
}
// Extract rating (look for aria-label like "4.2 stars")
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
if (ratingElem) {
const ariaLabel = ratingElem.getAttribute('aria-label');
const match = ariaLabel.match(/([0-9.]+)/);
// Pattern 1: Exact "N reviews" format (aria-labels, clean text)
// Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis"
var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i);
if (match) {
info.rating = parseFloat(match[1]);
return parseInt(match[1].replace(/[,. ]/g, ''));
}
}
// Extract total review count
const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
// Pattern 2: "(N)" format often used in tabs like "Reviews (27)"
match = text.match(/[(]([0-9][0-9,.]*)[)]$/);
if (match) {
return parseInt(match[1].replace(/[,. ]/g, ''));
}
// PRIORITY 1: Look for review count in search results sidebar/panel
// This is where "152 reviews" appears on search results
const searchPanelSelectors = [
'a[href*="reviews"]', // Link with "reviews" in href
'button[jsaction*="reviews"]', // Button related to reviews
'div[role="link"]', // Clickable divs that might contain review info
];
for (const selector of searchPanelSelectors) {
const elements = document.querySelectorAll(selector);
for (let elem of elements) {
const text = elem.textContent || '';
const match = text.match(numberPattern);
// Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives)
if (text.length < 30) {
match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
break;
}
return parseInt(match[1].replace(/[,. ]/g, ''));
}
}
if (info.total_reviews) break;
return null;
}
// PRIORITY 2: Look in any span/div that contains the word "review"
// ============ EXTRACT BUSINESS NAME ============
// Priority: h1 (semantic), then role="heading"
const h1 = document.querySelector('h1');
if (h1 && h1.textContent) {
info.name = h1.textContent.trim();
}
if (!info.name) {
const heading = document.querySelector('[role="heading"][aria-level="1"]');
if (heading && heading.textContent) {
info.name = heading.textContent.trim();
}
}
// ============ EXTRACT ADDRESS ============
// Priority: data-item-id (semantic), then aria-label containing "address"
const addressElem = document.querySelector('[data-item-id*="address"]');
if (addressElem && addressElem.textContent) {
info.address = addressElem.textContent.trim();
}
if (!info.address) {
const ariaAddress = document.querySelector('[aria-label*="ddress"]');
if (ariaAddress && ariaAddress.textContent) {
info.address = ariaAddress.textContent.trim();
}
}
// ============ EXTRACT RATING ============
// Priority: aria-label containing "star" on role="img" elements
info._debug_rating_context = [];
const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]');
for (let elem of ratingElems) {
const ariaLabel = elem.getAttribute('aria-label') || '';
// Match "4.9 stars" or "4,9 stars" (European format)
const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i);
if (match) {
info.rating = parseFloat(match[1].replace(',', '.'));
// DEBUG: Capture parent/sibling context to find review count
var parent = elem.parentElement;
if (parent) {
info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100));
var grandparent = parent.parentElement;
if (grandparent) {
info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100));
// Check all children of grandparent for review count
var gpChildren = grandparent.querySelectorAll('*');
for (var c = 0; c < Math.min(gpChildren.length, 30); c++) {
var childText = (gpChildren[c].textContent || '').trim();
if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) {
info._debug_rating_context.push('GP_CHILD: ' + childText);
}
}
// Also check great-grandparent
var ggp = grandparent.parentElement;
if (ggp) {
info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150));
}
}
// Check siblings
var nextSib = parent.nextElementSibling;
if (nextSib) {
info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100));
}
}
break;
}
}
// ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============
// PRIORITY 1: aria-label with exact "N reviews" format (most reliable)
// Google Maps uses aria-label="27 reviews" for accessibility
info._debug_aria = [];
info._debug_all_numeric = [];
if (!info.total_reviews) {
const allElements = document.querySelectorAll('span, div, a');
for (let elem of allElements) {
const text = elem.textContent || '';
if (text.length < 100) { // Skip very long text blocks
const match = text.match(numberPattern);
var ariaElems = document.querySelectorAll('[aria-label]');
for (var i = 0; i < ariaElems.length; i++) {
var ariaLabel = ariaElems[i].getAttribute('aria-label') || '';
// Collect all labels containing "review"
if (ariaLabel.toLowerCase().indexOf('review') >= 0) {
info._debug_aria.push(ariaLabel);
}
// Collect all labels starting with a digit
if (/^[0-9]/.test(ariaLabel)) {
info._debug_all_numeric.push(ariaLabel);
}
var count = parseReviewCount(ariaLabel);
if (count && count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = ariaLabel;
break;
}
}
}
// DEBUG: Find all text with parenthetical numbers like "(27)"
info._debug_parens = [];
info._debug_short_text = []; // All short text with numbers
var allSpans = document.querySelectorAll('span, div, a, button');
for (var j = 0; j < Math.min(allSpans.length, 500); j++) {
var spanText = allSpans[j].textContent || '';
// Capture parenthetical numbers
if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) {
info._debug_parens.push(spanText.trim());
}
// Capture ALL short text containing numbers (for debugging)
if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) {
var cleaned = spanText.trim().replace(/\\s+/g, ' ');
if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) {
info._debug_short_text.push(cleaned);
}
}
}
// PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page
// This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels
if (!info.total_reviews) {
var allElems = document.querySelectorAll('*');
for (var k = 0; k < Math.min(allElems.length, 1000); k++) {
var elem = allElems[k];
// Skip if has children (we want leaf nodes only)
if (elem.children.length > 0) continue;
var txt = (elem.textContent || '').trim();
// Look for short text with both numbers and "review" word
if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) {
var match = txt.match(/([0-9][0-9,]*)/);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'LEAF: ' + txt;
break;
}
}
@@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
}
}
// PRIORITY 3: Try tabs (for business detail pages)
// DEBUG: Collect all tab names
info._debug_tabs = [];
const tabs = document.querySelectorAll('[role="tab"]');
for (let t = 0; t < tabs.length; t++) {
info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30));
}
// DEBUG: Collect all buttons with text (might contain review count)
info._debug_buttons = [];
const buttons = document.querySelectorAll('button');
for (let b = 0; b < Math.min(buttons.length, 20); b++) {
var btnText = (buttons[b].textContent || '').trim();
if (btnText && btnText.length < 40) {
info._debug_buttons.push(btnText.substring(0, 40));
}
}
// PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count)
if (!info.total_reviews) {
const tabs = document.querySelectorAll('button[role="tab"]');
for (let tab of tabs) {
const text = tab.textContent || '';
let match = text.match(reviewPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
const text = (tab.textContent || '').trim();
// Look for "Reviews" tab with count
if (text.toLowerCase().includes('review')) {
const count = parseReviewCount(text);
if (count && count > 0) {
info.total_reviews = count;
info._debug_matched = 'TAB: ' + text;
break;
}
}
match = text.match(numberPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
}
}
// PRIORITY 2.3: Reviews panel header (after clicking Reviews tab)
// Google Maps shows "27 reviews" as heading text in the reviews panel
if (!info.total_reviews) {
// Look for headings containing review count
var headings = document.querySelectorAll('h1, h2, [role="heading"]');
for (var h = 0; h < headings.length; h++) {
var hText = (headings[h].textContent || '').trim();
if (/review/i.test(hText)) {
var match = hText.match(/([0-9][0-9,]*)/);
if (match) {
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'HEADING: ' + hText;
break;
}
}
}
}
}
// PRIORITY 2.4: Look for sort button area which often has total count
// The sort dropdown area displays "Sort: Newest" and total reviews
if (!info.total_reviews) {
var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]');
for (var s = 0; s < sortBtns.length; s++) {
var parent = sortBtns[s].parentElement;
if (parent) {
var pText = (parent.textContent || '').trim();
if (/review/i.test(pText)) {
var match = pText.match(/([0-9][0-9,]*)\\s*review/i);
if (match) {
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50);
break;
}
}
}
}
}
}
// PRIORITY 3: Elements with semantic review-related attributes
if (!info.total_reviews) {
const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]');
for (let elem of reviewLinks) {
const text = (elem.textContent || '').trim();
const count = parseReviewCount(text);
if (count && count > 0) {
info.total_reviews = count;
break;
}
}
}
// PRIORITY 4: Try aria-labels
// PRIORITY 4: Look for standalone review count text near rating
// Find elements that contain ONLY "N reviews" pattern (not concatenated with rating)
if (!info.total_reviews) {
const elements = document.querySelectorAll('[aria-label]');
for (let elem of elements) {
const ariaLabel = elem.getAttribute('aria-label') || '';
let match = ariaLabel.match(reviewPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
const allElements = document.querySelectorAll('span, a');
for (let elem of allElements) {
// Get direct text content only (not nested children)
const text = (elem.textContent || '').trim();
// Skip if too long (likely contains other content)
if (text.length > 50) continue;
// Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews")
if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue;
const count = parseReviewCount(text);
if (count && count > 0 && count < 100000) {
info.total_reviews = count;
break;
}
match = ariaLabel.match(numberPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
}
}
// PRIORITY 5: Parse from visible page text using regex on short text blocks
if (!info.total_reviews) {
const walker = document.createTreeWalker(
document.body,
NodeFilter.SHOW_TEXT,
null,
false
);
while (walker.nextNode()) {
const text = walker.currentNode.textContent.trim();
if (text.length >= 5 && text.length <= 30) {
// Match "27 reviews" but not "4.927 reviews"
const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i);
if (match) {
const count = parseInt(match[1].replace(/[,]/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'WALKER: ' + text;
break;
}
}
}
}
}
// PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts)
if (!info.total_reviews) {
var scripts = document.querySelectorAll('script');
for (var sc = 0; sc < scripts.length; sc++) {
var scriptText = scripts[sc].textContent || '';
// Look for patterns like "user_reviews":{"count":27} or reviews_count":27
var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i);
if (jsonMatch) {
var count = parseInt(jsonMatch[1]);
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'JSON_SCRIPT';
break;
}
}
// Also look for review count in Google's data format like [\"27 reviews\"]
if (!info.total_reviews) {
var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i);
if (dataMatch) {
var count = parseInt(dataMatch[1]);
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'JSON_DATA: ' + dataMatch[0];
break;
}
}
}
}
}
@@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
log.info(f"Business card extracted: name={business_info.get('name')}, "
f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
# Debug: log what aria-labels were found
if business_info.get('_debug_aria'):
log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}")
if business_info.get('_debug_matched'):
log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}")
# Also log all numeric aria-labels (potential review counts)
if business_info.get('_debug_all_numeric'):
log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}")
# Log any text with parenthetical numbers like "(27)"
if business_info.get('_debug_parens'):
log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}")
# Log all short text containing numbers (for debugging review count detection)
if business_info.get('_debug_short_text'):
log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}")
# Log the context around the rating element
if business_info.get('_debug_rating_context'):
for ctx in business_info.get('_debug_rating_context', []):
log.info(f"DEBUG: Rating context: {ctx}")
# Log what tabs exist on the page
if business_info.get('_debug_tabs'):
log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}")
else:
log.info(f"DEBUG: No tabs found on page")
# Log buttons (might contain review count)
if business_info.get('_debug_buttons'):
log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}")
result = {
"name": business_info.get('name'),

View File

@@ -1,407 +0,0 @@
"""
Background job manager for Google Reviews Scraper.
"""
import asyncio
import logging
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from enum import Enum
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, asdict
from modules.config import load_config
from modules.scraper import GoogleReviewsScraper
from modules.scraper_clean import fast_scrape_reviews # Updated to use clean scraper with hard refresh recovery
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
log = logging.getLogger("scraper")
class JobStatus(str, Enum):
    """Lifecycle states of a scraping job.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (``JobStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
@dataclass
class ScrapingJob:
    """State and results of a single scraping job."""

    job_id: str
    status: JobStatus
    url: str
    config: Dict[str, Any]
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None  # Last update time (for progress tracking)
    error_message: Optional[str] = None
    reviews_count: Optional[int] = None
    total_reviews: Optional[int] = None  # Total reviews available (from page counter)
    images_count: Optional[int] = None
    progress: Optional[Dict[str, Any]] = None
    reviews_data: Optional[List[Dict[str, Any]]] = None  # Store actual review data
    scrape_time: Optional[float] = None  # Time taken to scrape

    def to_dict(self, include_reviews: bool = False) -> Dict[str, Any]:
        """
        Convert job to dictionary for JSON serialization

        Args:
            include_reviews: Whether to include the full reviews data (default: False)

        Returns:
            A dict copy of the job in which every timestamp that is set has
            been replaced by its ISO-8601 string.
        """
        data = asdict(self)

        # Convert datetime objects to ISO strings. ``updated_at`` must be
        # included here as well: it is a datetime like the other three and
        # would otherwise make the result non-serializable once it is set.
        for name in ('created_at', 'started_at', 'completed_at', 'updated_at'):
            if data[name]:
                data[name] = data[name].isoformat()

        # Exclude reviews_data by default (can be large)
        if not include_reviews:
            data.pop('reviews_data', None)

        return data
class JobManager:
    """Manager for background scraping jobs.

    Jobs live in the in-memory ``self.jobs`` dict (keyed by job ID) and are
    executed on a ``ThreadPoolExecutor``. ``self.lock`` guards the registry
    and the mutation of the job objects stored in it.
    """

    def __init__(self, max_concurrent_jobs: int = 3):
        """Initialize job manager

        Args:
            max_concurrent_jobs: Maximum number of jobs allowed in the RUNNING
                state; also used as the executor's worker-thread count.
        """
        self.max_concurrent_jobs = max_concurrent_jobs
        self.jobs: Dict[str, ScrapingJob] = {}
        self.executor = ThreadPoolExecutor(max_workers=max_concurrent_jobs)
        self.lock = threading.Lock()

    @staticmethod
    def _percentage(current_count: int, total_count: Optional[int]) -> int:
        """Return completion as a whole percent, or 0 when the total is unknown or zero."""
        if not total_count or total_count <= 0:
            return 0
        return int(current_count / total_count * 100)

    def create_job(self, url: str, config_overrides: Optional[Dict[str, Any]] = None) -> str:
        """
        Create a new scraping job.

        Args:
            url: Google Maps URL to scrape
            config_overrides: Optional config overrides

        Returns:
            Job ID
        """
        job_id = str(uuid.uuid4())

        # Load base config
        config = load_config()

        # Apply URL
        config["url"] = url

        # Apply any overrides
        if config_overrides:
            config.update(config_overrides)

        job = ScrapingJob(
            job_id=job_id,
            status=JobStatus.PENDING,
            url=url,
            config=config,
            created_at=datetime.now(),
            progress={"stage": "created", "message": "Job created and queued"}
        )

        with self.lock:
            self.jobs[job_id] = job

        log.info(f"Created scraping job {job_id} for URL: {url}")
        return job_id

    def start_job(self, job_id: str) -> bool:
        """
        Start a pending job.

        Args:
            job_id: Job ID to start

        Returns:
            True if job was started, False otherwise
        """
        with self.lock:
            if job_id not in self.jobs:
                return False

            job = self.jobs[job_id]
            if job.status != JobStatus.PENDING:
                return False

            # Check if we can start more jobs
            running_count = sum(1 for j in self.jobs.values() if j.status == JobStatus.RUNNING)
            if running_count >= self.max_concurrent_jobs:
                return False

            job.status = JobStatus.RUNNING
            job.started_at = datetime.now()
            job.updated_at = datetime.now()
            job.progress = {"stage": "starting", "message": "Initializing scraper"}

        # Submit job to thread pool
        self.executor.submit(self._run_scraping_job, job_id)
        log.info(f"Started scraping job {job_id}")
        return True

    def _run_scraping_job(self, job_id: str):
        """
        Run the actual scraping job in background thread.

        The job's state is only written while it is still RUNNING, so a job
        that was cancelled (or deleted) while the scrape was in flight is not
        flipped back to COMPLETED/FAILED when the scrape returns.

        Args:
            job_id: Job ID to run
        """
        def progress_callback(current_count: int, total_count: int):
            """Update job progress during scraping"""
            with self.lock:
                job = self.jobs.get(job_id)
                # Drop late updates for jobs that were deleted or cancelled.
                if job is None or job.status != JobStatus.RUNNING:
                    return

                job.reviews_count = current_count
                job.total_reviews = total_count
                job.updated_at = datetime.now()  # Update last update time

                # Calculate percentage for better UX (0 when total is unknown)
                percentage = self._percentage(current_count, total_count)
                if total_count:
                    message = f"Collecting reviews: {current_count} / {total_count} ({percentage}%)"
                else:
                    message = f"Collecting reviews: {current_count}"
                job.progress = {
                    "stage": "scraping",
                    "message": message,
                    "percentage": percentage
                }

        worker = None
        try:
            with self.lock:
                job = self.jobs.get(job_id)
                # The job may have been cancelled or deleted between
                # start_job() and this thread actually running.
                if job is None or job.status != JobStatus.RUNNING:
                    log.info(f"Job {job_id}: no longer running, skipping scrape")
                    return

                job.progress = {"stage": "initializing", "message": "Acquiring Chrome worker from pool"}

                # Get config (read under the lock together with the job state)
                url = job.config.get('url')
                headless = job.config.get('headless', True)  # Default to headless
                max_scrolls = job.config.get('max_scrolls', 999999)  # Effectively unlimited - relies on idle detection

            # Get a worker from the scraping pool
            worker = get_scraping_worker(timeout=30)
            if not worker:
                raise Exception("No Chrome workers available. Pool may be at capacity.")

            log.info(f"Job {job_id}: Acquired worker {worker.worker_id} from pool")

            with self.lock:
                if job.status == JobStatus.RUNNING:
                    job.progress = {"stage": "scraping", "message": f"Scraping reviews with {worker.worker_id} (fast mode)"}

            # Run the FAST scraping with progress callback using pooled worker
            result = fast_scrape_reviews(
                url=url,
                headless=headless,
                max_scrolls=max_scrolls,
                progress_callback=progress_callback,
                driver=worker.driver,  # Use worker's driver
                return_driver=True  # Don't close the driver
            )

            # Pop the driver from result before storing
            result.pop('driver', None)

            # Mark job as completed or failed
            with self.lock:
                if job.status != JobStatus.RUNNING:
                    # Cancelled while scraping: keep the terminal state as is.
                    log.info(f"Job {job_id}: status is {job.status.value}, discarding scrape result")
                elif result['success']:
                    job.status = JobStatus.COMPLETED
                    job.completed_at = datetime.now()
                    job.updated_at = datetime.now()
                    job.reviews_count = result['count']
                    job.total_reviews = result.get('total_reviews')  # Store total review count from page
                    job.reviews_data = result['reviews']  # Store the actual reviews
                    job.scrape_time = result['time']
                    job.progress = {
                        "stage": "completed",
                        "message": f"Scraping completed successfully in {result['time']:.1f}s",
                        "scroll_time": result.get('scroll_time'),
                        "extract_time": result.get('extract_time')
                    }
                    log.info(f"Completed scraping job {job_id}: {result['count']} reviews in {result['time']:.1f}s")
                else:
                    job.status = JobStatus.FAILED
                    job.completed_at = datetime.now()
                    job.updated_at = datetime.now()
                    job.error_message = result.get('error', 'Unknown error')
                    job.progress = {"stage": "failed", "message": f"Job failed: {result.get('error')}"}
                    log.error(f"Failed scraping job {job_id}: {result.get('error')}")

        except Exception as e:
            # log.exception records the traceback through the logger.
            log.exception(f"Error in scraping job {job_id}: {e}")

            with self.lock:
                # .get(): the job may have been deleted while it was running.
                job = self.jobs.get(job_id)
                if job is not None and job.status == JobStatus.RUNNING:
                    job.status = JobStatus.FAILED
                    job.completed_at = datetime.now()
                    job.updated_at = datetime.now()
                    job.error_message = str(e)
                    job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"}

            # Recycle worker on error
            if worker:
                log.info(f"Job {job_id}: Recycling worker {worker.worker_id} due to error")
                release_scraping_worker(worker, recycle=True)
                worker = None  # Mark as released

        finally:
            # Release worker back to pool if not already released
            if worker:
                log.info(f"Job {job_id}: Releasing worker {worker.worker_id} back to pool")
                release_scraping_worker(worker, recycle=False)

    def get_job(self, job_id: str) -> Optional[ScrapingJob]:
        """
        Get job by ID.

        Args:
            job_id: Job ID

        Returns:
            Job object or None if not found
        """
        with self.lock:
            return self.jobs.get(job_id)

    def get_job_reviews(self, job_id: str) -> Optional[List[Dict[str, Any]]]:
        """
        Get reviews data for a specific job.

        Args:
            job_id: Job ID

        Returns:
            List of reviews or None if not found/not completed
        """
        with self.lock:
            job = self.jobs.get(job_id)
            if job and job.status == JobStatus.COMPLETED:
                return job.reviews_data
            return None

    def list_jobs(self, status: Optional[JobStatus] = None, limit: int = 100) -> List[ScrapingJob]:
        """
        List jobs, optionally filtered by status.

        Args:
            status: Optional status filter
            limit: Maximum number of jobs to return

        Returns:
            List of jobs
        """
        with self.lock:
            jobs = list(self.jobs.values())

        if status:
            jobs = [job for job in jobs if job.status == status]

        # Sort by creation time (newest first)
        jobs.sort(key=lambda x: x.created_at, reverse=True)

        return jobs[:limit]

    def cancel_job(self, job_id: str) -> bool:
        """
        Cancel a pending or running job.

        This only marks the job as cancelled; a scrape that is already in
        flight is not interrupted here.

        Args:
            job_id: Job ID to cancel

        Returns:
            True if job was cancelled, False otherwise
        """
        with self.lock:
            if job_id not in self.jobs:
                return False

            job = self.jobs[job_id]
            if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
                return False

            job.status = JobStatus.CANCELLED
            job.completed_at = datetime.now()
            job.updated_at = datetime.now()
            job.progress = {"stage": "cancelled", "message": "Job was cancelled"}

        log.info(f"Cancelled scraping job {job_id}")
        return True

    def delete_job(self, job_id: str) -> bool:
        """
        Delete a job from the manager.

        Args:
            job_id: Job ID to delete

        Returns:
            True if job was deleted, False otherwise
        """
        with self.lock:
            if job_id not in self.jobs:
                return False

            del self.jobs[job_id]

        log.info(f"Deleted scraping job {job_id}")
        return True

    def get_stats(self) -> Dict[str, Any]:
        """
        Get job manager statistics.

        Returns:
            Statistics dictionary
        """
        with self.lock:
            jobs = list(self.jobs.values())

        stats = {
            "total_jobs": len(jobs),
            "by_status": {},
            "running_jobs": 0,
            "max_concurrent_jobs": self.max_concurrent_jobs
        }

        for status in JobStatus:
            count = sum(1 for job in jobs if job.status == status)
            stats["by_status"][status.value] = count

        stats["running_jobs"] = stats["by_status"].get(JobStatus.RUNNING.value, 0)

        return stats

    def cleanup_old_jobs(self, max_age_hours: int = 24):
        """
        Clean up old completed/failed jobs.

        Args:
            max_age_hours: Maximum age in hours before cleanup
        """
        cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)

        with self.lock:
            to_delete = []
            for job_id, job in self.jobs.items():
                if job.status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
                    if job.completed_at and job.completed_at.timestamp() < cutoff_time:
                        to_delete.append(job_id)

            for job_id in to_delete:
                del self.jobs[job_id]

        if to_delete:
            log.info(f"Cleaned up {len(to_delete)} old jobs")

    def shutdown(self):
        """Shutdown the job manager"""
        log.info("Shutting down job manager")
        self.executor.shutdown(wait=True)

File diff suppressed because it is too large Load Diff