Add API interception for hybrid scraping and update selectors
- Add new api_interceptor.py module for CDP network interception - Capture Google Maps internal API responses during scrolling - Parse protobuf-like JSON responses to extract review data - Merge API-captured reviews with DOM-scraped data - Update CSS selectors for January 2026 Google Maps structure - Add cookie consent dismissal for multiple languages - Add --api-intercept CLI flag and config option - Fix review card and pane selectors (.jftiEf, .XiKgde) - Improve review ID extraction from card elements Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
593
modules/api_interceptor.py
Normal file
593
modules/api_interceptor.py
Normal file
@@ -0,0 +1,593 @@
|
|||||||
|
"""
|
||||||
|
API Interceptor for Google Maps Reviews.
|
||||||
|
Uses Chrome DevTools Protocol (CDP) to intercept network requests and capture
|
||||||
|
Google's internal API responses for faster, more reliable data extraction.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Callable, Dict, List, Optional
|
||||||
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
|
||||||
|
log = logging.getLogger("api_interceptor")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class InterceptedReview:
    """One review recovered from an intercepted API response.

    Every field defaults to an "empty" value, so an instance can be created
    bare and filled in attribute by attribute while a payload is inspected.
    """

    # Identity and authorship
    review_id: str = ""
    author: str = ""
    profile_url: str = ""
    avatar_url: str = ""

    # Review content
    rating: float = 0.0
    text: str = ""
    lang: str = ""
    likes: int = 0
    # default_factory gives each instance its own list
    photos: List[str] = field(default_factory=list)

    # Publication time, as display text and as a numeric timestamp
    date_text: str = ""
    timestamp: int = 0

    # Reply from the business owner, if any
    owner_response: str = ""
    owner_response_date: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleMapsAPIInterceptor:
|
||||||
|
"""
|
||||||
|
Intercepts Google Maps internal API calls to capture review data directly.
|
||||||
|
|
||||||
|
Google Maps uses several internal endpoints for reviews:
|
||||||
|
- /maps/preview/review/listentitiesreviews - Main reviews endpoint
|
||||||
|
- /maps/rpc/placereview - Alternative review endpoint
|
||||||
|
- /maps/preview/reviewsdata - Review data endpoint
|
||||||
|
|
||||||
|
The responses are often in a custom protobuf-like JSON format that needs parsing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Patterns for review-related API endpoints
|
||||||
|
REVIEW_API_PATTERNS = [
|
||||||
|
r'maps/preview/review',
|
||||||
|
r'maps/rpc/placereview',
|
||||||
|
r'maps/preview/reviewsdata',
|
||||||
|
r'maps/preview/place',
|
||||||
|
r'maps/api/place',
|
||||||
|
r'/locationhistory/preview',
|
||||||
|
r'batchexecute.*review',
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(self, driver):
|
||||||
|
"""Initialize the interceptor with a Selenium driver"""
|
||||||
|
self.driver = driver
|
||||||
|
self.captured_responses: List[Dict[str, Any]] = []
|
||||||
|
self.captured_reviews: List[InterceptedReview] = []
|
||||||
|
self.request_map: Dict[str, Dict] = {} # Map request IDs to URLs
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
self._listening = False
|
||||||
|
self._response_callback: Optional[Callable] = None
|
||||||
|
|
||||||
|
def setup_interception(self):
    """Enable network capture through the Chrome DevTools Protocol.

    Only the ``Network`` domain is enabled. ``Network.setRequestInterception``
    is deliberately not used: it is deprecated, and it pauses every matching
    request until a ``Network.continueInterceptedRequest`` command is sent in
    reply to a ``Network.requestIntercepted`` event. Nothing in this class
    handles that event, so the review requests themselves would stall.

    Returns:
        True when capture is active (directly or through the
        ``_setup_performance_logging`` fallback), False otherwise.
    """
    try:
        # Enable network domain
        self.driver.execute_cdp_cmd('Network.enable', {})
    except Exception as e:
        log.warning(f"Could not enable CDP interception: {e}")
        # Try alternative approach
        return self._setup_performance_logging()

    self._listening = True
    log.info("API interception enabled via CDP")
    return True
|
||||||
|
|
||||||
|
def _setup_performance_logging(self):
|
||||||
|
"""Alternative approach using Performance logging"""
|
||||||
|
try:
|
||||||
|
self.driver.execute_cdp_cmd('Network.enable', {
|
||||||
|
'maxTotalBufferSize': 10000000,
|
||||||
|
'maxResourceBufferSize': 5000000
|
||||||
|
})
|
||||||
|
self._listening = True
|
||||||
|
log.info("API interception enabled via performance logging")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
log.error(f"Failed to setup performance logging: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def capture_network_responses(self, duration: float = 5.0):
|
||||||
|
"""
|
||||||
|
Capture network responses for a specified duration.
|
||||||
|
Call this while scrolling/loading more reviews.
|
||||||
|
"""
|
||||||
|
if not self._listening:
|
||||||
|
log.warning("Interception not set up, call setup_interception() first")
|
||||||
|
return []
|
||||||
|
|
||||||
|
captured = []
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
while time.time() - start_time < duration:
|
||||||
|
try:
|
||||||
|
# Get performance logs which contain network events
|
||||||
|
logs = self.driver.get_log('performance')
|
||||||
|
|
||||||
|
for entry in logs:
|
||||||
|
try:
|
||||||
|
log_data = json.loads(entry['message'])
|
||||||
|
message = log_data.get('message', {})
|
||||||
|
method = message.get('method', '')
|
||||||
|
params = message.get('params', {})
|
||||||
|
|
||||||
|
# Capture response received events
|
||||||
|
if method == 'Network.responseReceived':
|
||||||
|
response = params.get('response', {})
|
||||||
|
url = response.get('url', '')
|
||||||
|
|
||||||
|
if self._is_review_api(url):
|
||||||
|
request_id = params.get('requestId')
|
||||||
|
self.request_map[request_id] = {
|
||||||
|
'url': url,
|
||||||
|
'status': response.get('status'),
|
||||||
|
'headers': response.get('headers', {})
|
||||||
|
}
|
||||||
|
|
||||||
|
# Capture response body when loading is finished
|
||||||
|
elif method == 'Network.loadingFinished':
|
||||||
|
request_id = params.get('requestId')
|
||||||
|
if request_id in self.request_map:
|
||||||
|
body = self._get_response_body(request_id)
|
||||||
|
if body:
|
||||||
|
captured.append({
|
||||||
|
'url': self.request_map[request_id]['url'],
|
||||||
|
'body': body,
|
||||||
|
'timestamp': time.time()
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as parse_error:
|
||||||
|
log.debug(f"Error parsing log entry: {parse_error}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# Performance logs might not be available
|
||||||
|
log.debug(f"Could not get performance logs: {e}")
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
with self._lock:
|
||||||
|
self.captured_responses.extend(captured)
|
||||||
|
|
||||||
|
return captured
|
||||||
|
|
||||||
|
def get_response_bodies_cdp(self):
    """Drain the responses buffered in the page by the injected JS interceptor.

    Despite the name, no CDP command is involved: the method reads and resets
    ``window.__interceptedResponses`` through ``execute_script``.

    Returns:
        A list with the entries that were buffered in the page; an empty list
        when nothing was buffered or when the script call fails.
    """
    responses = []

    # The leading `return` matters: execute_script runs the snippet as a
    # function body, so without it the value of the IIFE is discarded and the
    # call always yields None.
    drain_script = """
        return (function() {
            if (window.__interceptedResponses) {
                var responses = window.__interceptedResponses;
                window.__interceptedResponses = [];
                return responses;
            }
            return [];
        })();
    """

    try:
        captured = self.driver.execute_script(drain_script)
        if captured:
            responses.extend(captured)
    except Exception as e:
        log.debug(f"CDP response capture error: {e}")

    return responses
|
||||||
|
|
||||||
|
def inject_response_interceptor(self):
|
||||||
|
"""
|
||||||
|
Inject JavaScript to intercept XHR/Fetch responses at the browser level.
|
||||||
|
This is the most reliable method for capturing API responses.
|
||||||
|
"""
|
||||||
|
intercept_script = """
|
||||||
|
(function() {
|
||||||
|
// Skip if already injected
|
||||||
|
if (window.__reviewInterceptorInjected) return;
|
||||||
|
window.__reviewInterceptorInjected = true;
|
||||||
|
window.__interceptedResponses = [];
|
||||||
|
|
||||||
|
// Store original fetch
|
||||||
|
const originalFetch = window.fetch;
|
||||||
|
|
||||||
|
// Override fetch
|
||||||
|
window.fetch = async function(...args) {
|
||||||
|
const response = await originalFetch.apply(this, args);
|
||||||
|
const url = args[0].toString();
|
||||||
|
|
||||||
|
// Check if this is a review-related API call
|
||||||
|
if (url.includes('review') || url.includes('batchexecute') ||
|
||||||
|
url.includes('place') || url.includes('maps')) {
|
||||||
|
try {
|
||||||
|
const clone = response.clone();
|
||||||
|
const text = await clone.text();
|
||||||
|
|
||||||
|
window.__interceptedResponses.push({
|
||||||
|
url: url,
|
||||||
|
body: text,
|
||||||
|
timestamp: Date.now(),
|
||||||
|
type: 'fetch'
|
||||||
|
});
|
||||||
|
|
||||||
|
// Keep only last 100 responses to avoid memory issues
|
||||||
|
if (window.__interceptedResponses.length > 100) {
|
||||||
|
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.debug('Response capture error:', e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return response;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Store original XMLHttpRequest
|
||||||
|
const originalXHR = window.XMLHttpRequest;
|
||||||
|
|
||||||
|
// Create intercepting XHR
|
||||||
|
window.XMLHttpRequest = function() {
|
||||||
|
const xhr = new originalXHR();
|
||||||
|
const originalOpen = xhr.open;
|
||||||
|
const originalSend = xhr.send;
|
||||||
|
let requestUrl = '';
|
||||||
|
|
||||||
|
xhr.open = function(method, url, ...rest) {
|
||||||
|
requestUrl = url;
|
||||||
|
return originalOpen.apply(this, [method, url, ...rest]);
|
||||||
|
};
|
||||||
|
|
||||||
|
xhr.addEventListener('load', function() {
|
||||||
|
if (requestUrl.includes('review') || requestUrl.includes('batchexecute') ||
|
||||||
|
requestUrl.includes('place') || requestUrl.includes('maps')) {
|
||||||
|
try {
|
||||||
|
window.__interceptedResponses.push({
|
||||||
|
url: requestUrl,
|
||||||
|
body: xhr.responseText,
|
||||||
|
timestamp: Date.now(),
|
||||||
|
type: 'xhr'
|
||||||
|
});
|
||||||
|
|
||||||
|
if (window.__interceptedResponses.length > 100) {
|
||||||
|
window.__interceptedResponses = window.__interceptedResponses.slice(-50);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.debug('XHR capture error:', e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return xhr;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Copy static properties
|
||||||
|
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
|
||||||
|
try {
|
||||||
|
window.XMLHttpRequest[prop] = originalXHR[prop];
|
||||||
|
} catch (e) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('Review API interceptor injected');
|
||||||
|
return true;
|
||||||
|
})();
|
||||||
|
"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = self.driver.execute_script(intercept_script)
|
||||||
|
log.info("JavaScript response interceptor injected")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
log.warning(f"Failed to inject interceptor: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def get_intercepted_responses(self):
|
||||||
|
"""Retrieve intercepted responses from the browser"""
|
||||||
|
try:
|
||||||
|
script = """
|
||||||
|
if (window.__interceptedResponses) {
|
||||||
|
var responses = window.__interceptedResponses.slice();
|
||||||
|
window.__interceptedResponses = [];
|
||||||
|
return responses;
|
||||||
|
}
|
||||||
|
return [];
|
||||||
|
"""
|
||||||
|
responses = self.driver.execute_script(script)
|
||||||
|
return responses or []
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"Error getting intercepted responses: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _is_review_api(self, url: str) -> bool:
|
||||||
|
"""Check if URL matches review API patterns"""
|
||||||
|
url_lower = url.lower()
|
||||||
|
return any(re.search(pattern, url_lower) for pattern in self.REVIEW_API_PATTERNS)
|
||||||
|
|
||||||
|
def _get_response_body(self, request_id: str) -> Optional[str]:
|
||||||
|
"""Get response body for a request ID using CDP"""
|
||||||
|
try:
|
||||||
|
result = self.driver.execute_cdp_cmd('Network.getResponseBody', {
|
||||||
|
'requestId': request_id
|
||||||
|
})
|
||||||
|
|
||||||
|
body = result.get('body', '')
|
||||||
|
if result.get('base64Encoded'):
|
||||||
|
body = base64.b64decode(body).decode('utf-8', errors='ignore')
|
||||||
|
|
||||||
|
return body
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"Could not get response body for {request_id}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def parse_reviews_from_responses(self, responses: List[Dict]) -> List[InterceptedReview]:
    """Turn captured API responses into a de-duplicated list of reviews.

    Each response is a dict with a ``body`` and (optionally) a ``url``.
    Empty bodies and HTML documents are skipped; a response whose parsing
    raises is logged at debug level and ignored.

    Returns:
        Reviews in first-seen order, one per distinct ``review_id``.
        Reviews without a ``review_id`` are not included.
    """
    collected = []
    for entry in responses:
        try:
            payload = entry.get('body', '')
            source_url = entry.get('url', '')

            # Skip non-JSON responses
            if not payload or payload.startswith('<!DOCTYPE'):
                continue

            collected += self._parse_response_body(payload, source_url)
        except Exception as e:
            log.debug(f"Error parsing response: {e}")
            continue

    # Deduplicate by review ID; dict insertion order keeps first-seen order
    unique_by_id = {}
    for item in collected:
        rid = item.review_id
        if rid and rid not in unique_by_id:
            unique_by_id[rid] = item

    return list(unique_by_id.values())
|
||||||
|
|
||||||
|
def _parse_response_body(self, body: str, url: str) -> List[InterceptedReview]:
|
||||||
|
"""Parse a single response body for review data"""
|
||||||
|
reviews = []
|
||||||
|
|
||||||
|
# Handle batch execute format (starts with )]}' prefix)
|
||||||
|
if body.startswith(")]}'"):
|
||||||
|
body = body[4:].strip()
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(body)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
# Try to extract JSON from the response
|
||||||
|
json_match = re.search(r'\[.*\]', body, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
try:
|
||||||
|
data = json.loads(json_match.group())
|
||||||
|
except:
|
||||||
|
return reviews
|
||||||
|
else:
|
||||||
|
return reviews
|
||||||
|
|
||||||
|
# Extract reviews from nested structure
|
||||||
|
reviews.extend(self._extract_reviews_recursive(data))
|
||||||
|
|
||||||
|
return reviews
|
||||||
|
|
||||||
|
def _extract_reviews_recursive(self, data: Any, depth: int = 0) -> List[InterceptedReview]:
|
||||||
|
"""Recursively search for review data in nested structures"""
|
||||||
|
reviews = []
|
||||||
|
|
||||||
|
if depth > 20: # Prevent infinite recursion
|
||||||
|
return reviews
|
||||||
|
|
||||||
|
if isinstance(data, dict):
|
||||||
|
# Check if this looks like a review object
|
||||||
|
review = self._try_parse_review_dict(data)
|
||||||
|
if review:
|
||||||
|
reviews.append(review)
|
||||||
|
|
||||||
|
# Recurse into dict values
|
||||||
|
for value in data.values():
|
||||||
|
reviews.extend(self._extract_reviews_recursive(value, depth + 1))
|
||||||
|
|
||||||
|
elif isinstance(data, list):
|
||||||
|
# Check if this array looks like a review array
|
||||||
|
review = self._try_parse_review_array(data)
|
||||||
|
if review:
|
||||||
|
reviews.append(review)
|
||||||
|
|
||||||
|
# Recurse into list items
|
||||||
|
for item in data:
|
||||||
|
reviews.extend(self._extract_reviews_recursive(item, depth + 1))
|
||||||
|
|
||||||
|
return reviews
|
||||||
|
|
||||||
|
def _try_parse_review_dict(self, data: Dict) -> Optional[InterceptedReview]:
    """Try to interpret a dictionary as a review object.

    Args:
        data: A decoded JSON object.

    Returns:
        An ``InterceptedReview`` when the dict carries a review ID, or both
        an author and a text; otherwise None. Any error while reading the
        fields is logged at debug level and also results in None.
    """
    # Common keys in review objects
    review_keys = {'reviewId', 'review_id', 'author', 'rating', 'text', 'comment'}

    if not any(k in data for k in review_keys):
        return None

    try:
        review = InterceptedReview()

        # 'author' is either a plain name or a nested object; only a string
        # may end up in review.author.
        raw_author = data.get('author')
        author_data = raw_author if isinstance(raw_author, dict) else {}
        if isinstance(raw_author, str):
            author_name = raw_author
        else:
            # NOTE(review): nested key names are assumed - confirm against real payloads
            author_name = author_data.get('name') or author_data.get('displayName')

        # Try various key names for each field
        review.review_id = data.get('reviewId') or data.get('review_id') or data.get('id', '')
        review.author = author_name or data.get('authorName') or data.get('name', '')
        review.rating = float(data.get('rating') or data.get('starRating') or 0)
        review.text = data.get('text') or data.get('comment') or data.get('reviewText', '')
        review.date_text = data.get('publishTime') or data.get('relativePublishTime') or data.get('date', '')
        review.likes = int(data.get('thumbsUpCount') or data.get('likes') or 0)

        # Photos: entries may be URL strings or objects with a 'url' key.
        # Calling .get() on a string used to raise and drop the whole review.
        photos = data.get('photos') or data.get('reviewPhotos') or []
        for photo in photos:
            photo_url = photo.get('url') if isinstance(photo, dict) else photo
            if photo_url:
                review.photos.append(photo_url)

        # Profile
        review.profile_url = author_data.get('profileUrl') or data.get('profileUrl', '')
        review.avatar_url = author_data.get('profilePhotoUrl') or data.get('avatar', '')

        # Owner response
        owner_resp = data.get('ownerResponse') or data.get('ownerReply') or {}
        if isinstance(owner_resp, dict):
            review.owner_response = owner_resp.get('text', '')
            review.owner_response_date = owner_resp.get('publishTime', '')

        # Only return if we have meaningful data
        if review.review_id or (review.author and review.text):
            return review

    except Exception as e:
        log.debug(f"Error parsing review dict: {e}")

    return None
|
||||||
|
|
||||||
|
def _try_parse_review_array(self, data: List) -> Optional[InterceptedReview]:
    """Try to interpret a positional array as a review.

    The array is scanned heuristically: a long first string is taken as the
    review ID, the first number in 1..5 (top level, or first element of a
    nested list) as the rating, the first string longer than 30 characters
    as the text, and a short string inside a nested list as the author.

    Returns:
        An ``InterceptedReview`` when a rating was found together with a
        review ID or a text; otherwise None. Errors are logged at debug
        level and result in None.
    """
    if not data or len(data) < 3:
        return None

    def is_rating(value):
        # bool is a subclass of int; JSON `true` must not count as 1 star
        return (isinstance(value, (int, float))
                and not isinstance(value, bool)
                and 1 <= value <= 5)

    try:
        review = InterceptedReview()

        # Check if first element looks like a review ID
        if isinstance(data[0], str) and len(data[0]) > 20:
            review.review_id = data[0]

        def is_review_text(value):
            # A long ID or a URL is not review text
            return (isinstance(value, str)
                    and len(value) > 30
                    and value != review.review_id
                    and not value.startswith('http'))

        # Search for rating (usually a small number 1-5)
        for item in data:
            if is_rating(item):
                review.rating = float(item)
                break
            if isinstance(item, list) and item and is_rating(item[0]):
                review.rating = float(item[0])
                break

        # Search for text (long string); stop at the first hit so a later
        # nested string cannot overwrite it
        for item in data:
            if is_review_text(item):
                review.text = item
            elif isinstance(item, list):
                review.text = next((sub for sub in item if is_review_text(sub)), "")
            if review.text:
                break

        # Search for author name (shorter string inside a nested list)
        for item in data:
            if isinstance(item, list) and len(item) >= 1:
                for subitem in item:
                    if (isinstance(subitem, str)
                            and 2 <= len(subitem) <= 100
                            and subitem != review.text
                            and not subitem.startswith('http')):
                        review.author = subitem
                        break
                if review.author:
                    break

        # Search for URLs (photos, profile)
        for item in data:
            if isinstance(item, str) and item.startswith('http'):
                if 'googleusercontent' in item or 'ggpht' in item:
                    if not review.avatar_url:
                        review.avatar_url = item
                    else:
                        review.photos.append(item)
            elif isinstance(item, list):
                self._extract_urls_from_array(item, review)

        # Only return if we have meaningful data
        if review.rating > 0 and (review.review_id or review.text):
            return review

    except Exception as e:
        log.debug(f"Error parsing review array: {e}")

    return None
|
||||||
|
|
||||||
|
def _extract_urls_from_array(self, arr: List, review: InterceptedReview, depth: int = 0):
|
||||||
|
"""Extract URLs from nested arrays"""
|
||||||
|
if depth > 5:
|
||||||
|
return
|
||||||
|
|
||||||
|
for item in arr:
|
||||||
|
if isinstance(item, str) and item.startswith('http'):
|
||||||
|
if 'googleusercontent' in item or 'ggpht' in item or 'lh3' in item:
|
||||||
|
if 'w72-h72' in item or 'p-rp-mo' in item: # Profile pic pattern
|
||||||
|
review.avatar_url = item
|
||||||
|
else:
|
||||||
|
review.photos.append(item)
|
||||||
|
elif isinstance(item, list):
|
||||||
|
self._extract_urls_from_array(item, depth + 1, review)
|
||||||
|
|
||||||
|
def convert_to_raw_review_format(self, intercepted: InterceptedReview) -> Dict[str, Any]:
    """Map an ``InterceptedReview`` onto the dict layout used for storage.

    Review text and owner response are nested under the ``'en'`` key; when
    either is empty the corresponding entry is an empty dict. The result is
    tagged with ``'_source': 'api_intercept'``.
    """
    description = {}
    if intercepted.text:
        description = {'en': intercepted.text}

    owner_responses = {}
    if intercepted.owner_response:
        owner_responses = {'en': {'text': intercepted.owner_response}}

    raw = {
        'review_id': intercepted.review_id,
        'author': intercepted.author,
        'rating': intercepted.rating,
        'description': description,
        'likes': intercepted.likes,
        'user_images': intercepted.photos,
        'author_profile_url': intercepted.profile_url,
        'profile_picture': intercepted.avatar_url,
        'owner_responses': owner_responses,
        'review_date': intercepted.date_text,
        '_source': 'api_intercept',
    }
    return raw
|
||||||
|
|
||||||
|
def cleanup(self):
    """Disable CDP network capture and drop everything captured so far.

    Disabling the Network domain is best-effort: a failure is logged at
    debug level and the in-memory state is cleared regardless.
    """
    try:
        self.driver.execute_cdp_cmd('Network.disable', {})
    except Exception as e:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate.
        log.debug(f"Could not disable CDP network domain: {e}")

    self.captured_responses.clear()
    self.captured_reviews.clear()
    self.request_map.clear()
    self._listening = False
|
||||||
@@ -57,6 +57,10 @@ def parse_arguments():
|
|||||||
ap.add_argument("--custom-params", type=str, default=None,
|
ap.add_argument("--custom-params", type=str, default=None,
|
||||||
help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")
|
help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")
|
||||||
|
|
||||||
|
# API interception option
|
||||||
|
ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
|
||||||
|
help="enable API response interception for faster data capture (experimental)")
|
||||||
|
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
# Handle config path
|
# Handle config path
|
||||||
|
|||||||
@@ -47,7 +47,13 @@ class RawReview:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Try to get data-review-id from the card itself, or from a child element
|
||||||
rid = card.get_attribute("data-review-id") or ""
|
rid = card.get_attribute("data-review-id") or ""
|
||||||
|
if not rid:
|
||||||
|
# Try to find it in a child element
|
||||||
|
review_id_elem = try_find(card, "[data-review-id]")
|
||||||
|
if review_id_elem:
|
||||||
|
rid = review_id_elem[0].get_attribute("data-review-id") or ""
|
||||||
author = first_text(card, 'div[class*="d4r55"]')
|
author = first_text(card, 'div[class*="d4r55"]')
|
||||||
profile = first_attr(card, 'button[data-review-id]', "data-href")
|
profile = first_attr(card, 'button[data-review-id]', "data-href")
|
||||||
avatar = first_attr(card, 'button[data-review-id] img', "src")
|
avatar = first_attr(card, 'button[data-review-id] img', "src")
|
||||||
|
|||||||
@@ -24,16 +24,25 @@ from tqdm import tqdm
|
|||||||
|
|
||||||
from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
||||||
from modules.models import RawReview
|
from modules.models import RawReview
|
||||||
|
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||||
|
|
||||||
# Logger
|
# Logger
|
||||||
log = logging.getLogger("scraper")
|
log = logging.getLogger("scraper")
|
||||||
|
|
||||||
# CSS Selectors
|
# CSS Selectors (Updated January 2026 for current Google Maps structure)
|
||||||
PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf'
|
PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
|
||||||
CARD_SEL = "div[data-review-id]"
|
CARD_SEL = "div.jftiEf" # Review card container
|
||||||
|
# Cookie/consent dialog selectors (Updated January 2026)
|
||||||
COOKIE_BTN = ('button[aria-label*="Accept" i],'
|
COOKIE_BTN = ('button[aria-label*="Accept" i],'
|
||||||
|
'button[aria-label*="Aceptar" i],'
|
||||||
|
'button[aria-label*="Akzeptieren" i],'
|
||||||
|
'button[aria-label*="Aceitar" i],'
|
||||||
|
'button[jsname="higCR"],' # Google's "Accept all" button
|
||||||
'button[jsname="hZCF7e"],'
|
'button[jsname="hZCF7e"],'
|
||||||
'button[data-mdc-dialog-action="accept"]')
|
'button[data-mdc-dialog-action="accept"],'
|
||||||
|
'form[action*="consent"] button,'
|
||||||
|
'div[role="dialog"] button[jsname],'
|
||||||
|
'.VfPpkd-LgbsSe[data-mdc-dialog-action="accept"]')
|
||||||
SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
|
SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
|
||||||
MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
|
MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
|
||||||
|
|
||||||
@@ -169,6 +178,8 @@ class GoogleReviewsScraper:
|
|||||||
self.json_storage = JSONStorage(config)
|
self.json_storage = JSONStorage(config)
|
||||||
self.backup_to_json = config.get("backup_to_json", True)
|
self.backup_to_json = config.get("backup_to_json", True)
|
||||||
self.overwrite_existing = config.get("overwrite_existing", False)
|
self.overwrite_existing = config.get("overwrite_existing", False)
|
||||||
|
self.enable_api_intercept = config.get("enable_api_intercept", False)
|
||||||
|
self.api_interceptor = None # Will be initialized when driver is ready
|
||||||
|
|
||||||
def setup_driver(self, headless: bool):
|
def setup_driver(self, headless: bool):
|
||||||
"""
|
"""
|
||||||
@@ -257,32 +268,61 @@ class GoogleReviewsScraper:
|
|||||||
"""
|
"""
|
||||||
Dismiss cookie consent dialogs if present.
|
Dismiss cookie consent dialogs if present.
|
||||||
Handles stale element references by re-finding elements if needed.
|
Handles stale element references by re-finding elements if needed.
|
||||||
|
Updated January 2026 to handle current Google consent dialogs.
|
||||||
"""
|
"""
|
||||||
try:
|
dismissed = False
|
||||||
# Use WebDriverWait with expected_conditions to handle stale elements
|
|
||||||
WebDriverWait(driver, 3).until(
|
|
||||||
EC.presence_of_element_located((By.CSS_SELECTOR, COOKIE_BTN))
|
|
||||||
)
|
|
||||||
log.info("Cookie consent dialog found, attempting to dismiss")
|
|
||||||
|
|
||||||
# Get elements again after waiting to avoid stale references
|
# Try multiple approaches to dismiss consent dialogs
|
||||||
elements = driver.find_elements(By.CSS_SELECTOR, COOKIE_BTN)
|
consent_selectors = [
|
||||||
for elem in elements:
|
COOKIE_BTN,
|
||||||
try:
|
# Additional Google consent selectors
|
||||||
if elem.is_displayed():
|
'button[aria-label*="Accept all" i]',
|
||||||
elem.click()
|
'button[aria-label*="Aceptar todo" i]',
|
||||||
log.info("Cookie dialog dismissed")
|
'button[aria-label*="Reject all" i]', # Sometimes we need to reject
|
||||||
return True
|
'button:has-text("Accept")',
|
||||||
except Exception as e:
|
'button:has-text("Aceptar")',
|
||||||
log.debug(f"Error clicking cookie button: {e}")
|
'[role="dialog"] button:first-of-type',
|
||||||
continue
|
'form[action*="consent"] button:first-of-type',
|
||||||
except TimeoutException:
|
]
|
||||||
# This is expected if no cookie dialog is present
|
|
||||||
log.debug("No cookie consent dialog detected")
|
|
||||||
except Exception as e:
|
|
||||||
log.debug(f"Error handling cookie dialog: {e}")
|
|
||||||
|
|
||||||
return False
|
for selector in consent_selectors:
|
||||||
|
try:
|
||||||
|
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||||||
|
for elem in elements:
|
||||||
|
try:
|
||||||
|
if elem.is_displayed() and elem.is_enabled():
|
||||||
|
# Try JavaScript click first (more reliable)
|
||||||
|
driver.execute_script("arguments[0].click();", elem)
|
||||||
|
log.info(f"Cookie/consent dialog dismissed with selector: {selector}")
|
||||||
|
time.sleep(1) # Wait for dialog to close
|
||||||
|
dismissed = True
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"Error clicking consent button: {e}")
|
||||||
|
continue
|
||||||
|
if dismissed:
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"Error finding consent elements with {selector}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Also try to find and click any visible modal close buttons
|
||||||
|
if not dismissed:
|
||||||
|
try:
|
||||||
|
close_btns = driver.find_elements(By.CSS_SELECTOR,
|
||||||
|
'[role="dialog"] button[aria-label*="close" i], '
|
||||||
|
'[role="dialog"] button[aria-label*="cerrar" i], '
|
||||||
|
'.modal-close, .dialog-close')
|
||||||
|
for btn in close_btns:
|
||||||
|
if btn.is_displayed():
|
||||||
|
driver.execute_script("arguments[0].click();", btn)
|
||||||
|
log.info("Closed modal dialog")
|
||||||
|
dismissed = True
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return dismissed
|
||||||
|
|
||||||
def is_reviews_tab(self, tab: WebElement) -> bool:
|
def is_reviews_tab(self, tab: WebElement) -> bool:
|
||||||
"""
|
"""
|
||||||
@@ -364,6 +404,10 @@ class GoogleReviewsScraper:
|
|||||||
|
|
||||||
# Define different selectors to try in order of reliability
|
# Define different selectors to try in order of reliability
|
||||||
tab_selectors = [
|
tab_selectors = [
|
||||||
|
# Current Google Maps tab selectors (January 2026)
|
||||||
|
'.LRkQ2', # Main tab button class in current Google Maps
|
||||||
|
'.hh2c6', # Alternative tab button class
|
||||||
|
|
||||||
# Direct tab selectors
|
# Direct tab selectors
|
||||||
'[data-tab-index="1"]', # Most common tab index
|
'[data-tab-index="1"]', # Most common tab index
|
||||||
'[role="tab"][data-tab-index]', # Any tab with index
|
'[role="tab"][data-tab-index]', # Any tab with index
|
||||||
@@ -373,7 +417,6 @@ class GoogleReviewsScraper:
|
|||||||
|
|
||||||
# Common Google Maps review tab selectors
|
# Common Google Maps review tab selectors
|
||||||
'.fontTitleSmall[role="tab"]', # Google Maps title font tabs
|
'.fontTitleSmall[role="tab"]', # Google Maps title font tabs
|
||||||
'.hh2c6[role="tab"]', # Common Google Maps class
|
|
||||||
'.m6QErb [role="tab"]', # Maps container tabs
|
'.m6QErb [role="tab"]', # Maps container tabs
|
||||||
|
|
||||||
# Text-based selectors for various languages
|
# Text-based selectors for various languages
|
||||||
@@ -517,12 +560,14 @@ class GoogleReviewsScraper:
|
|||||||
characteristic elements that appear on the reviews page.
|
characteristic elements that appear on the reviews page.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Common elements that appear when reviews tab is active
|
# Common elements that appear when reviews tab is active (Updated January 2026)
|
||||||
verification_selectors = [
|
verification_selectors = [
|
||||||
# Reviews container
|
# Reviews container (current)
|
||||||
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf',
|
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
|
||||||
|
'div.m6QErb.WNBkOb.XiKgde',
|
||||||
|
|
||||||
# Review cards
|
# Review cards (current)
|
||||||
|
'div.jftiEf',
|
||||||
'div[data-review-id]',
|
'div[data-review-id]',
|
||||||
|
|
||||||
# Sort button (usually appears with reviews)
|
# Sort button (usually appears with reviews)
|
||||||
@@ -1122,6 +1167,7 @@ class GoogleReviewsScraper:
|
|||||||
seen = self.json_storage.load_seen()
|
seen = self.json_storage.load_seen()
|
||||||
|
|
||||||
driver = None
|
driver = None
|
||||||
|
api_reviews = {} # Store reviews captured from API
|
||||||
try:
|
try:
|
||||||
driver = self.setup_driver(headless)
|
driver = self.setup_driver(headless)
|
||||||
wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
|
wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
|
||||||
@@ -1129,7 +1175,15 @@ class GoogleReviewsScraper:
|
|||||||
driver.get(url)
|
driver.get(url)
|
||||||
wait.until(lambda d: "google.com/maps" in d.current_url)
|
wait.until(lambda d: "google.com/maps" in d.current_url)
|
||||||
|
|
||||||
self.dismiss_cookies(driver)
|
# Wait for page to load and consent dialogs to appear
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
# Try to dismiss any consent/cookie dialogs
|
||||||
|
if not self.dismiss_cookies(driver):
|
||||||
|
# Wait a bit more and try again
|
||||||
|
time.sleep(2)
|
||||||
|
self.dismiss_cookies(driver)
|
||||||
|
|
||||||
self.click_reviews_tab(driver)
|
self.click_reviews_tab(driver)
|
||||||
|
|
||||||
# Extra wait after clicking reviews tab to ensure page loads
|
# Extra wait after clicking reviews tab to ensure page loads
|
||||||
@@ -1158,10 +1212,14 @@ class GoogleReviewsScraper:
|
|||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
# Use try-except to handle cases where the pane is not found
|
# Use try-except to handle cases where the pane is not found
|
||||||
# Try multiple selectors for the reviews pane
|
# Try multiple selectors for the reviews pane (Updated January 2026)
|
||||||
pane = None
|
pane = None
|
||||||
pane_selectors = [
|
pane_selectors = [
|
||||||
PANE_SEL, # Primary selector
|
PANE_SEL, # Primary selector with XiKgde
|
||||||
|
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Without role="main" prefix
|
||||||
|
'div.m6QErb.WNBkOb.XiKgde', # Alternative class combination
|
||||||
|
'div[role="main"] div.m6QErb.XiKgde', # Simplified with XiKgde
|
||||||
|
'div.m6QErb.DxyBCb.XiKgde', # Another variant
|
||||||
'div[role="main"] div.m6QErb', # Simplified version
|
'div[role="main"] div.m6QErb', # Simplified version
|
||||||
'div.m6QErb.DxyBCb', # Even more simplified
|
'div.m6QErb.DxyBCb', # Even more simplified
|
||||||
'div[role="main"]' # Most generic
|
'div[role="main"]' # Most generic
|
||||||
@@ -1182,6 +1240,15 @@ class GoogleReviewsScraper:
|
|||||||
log.warning("Could not find reviews pane with any selector. Page structure might have changed.")
|
log.warning("Could not find reviews pane with any selector. Page structure might have changed.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Initialize API interceptor AFTER reviews page is loaded (if enabled)
|
||||||
|
# This prevents CDP interception from affecting initial page load and tab detection
|
||||||
|
if self.enable_api_intercept:
|
||||||
|
log.info("Setting up API interception for reviews capture")
|
||||||
|
self.api_interceptor = GoogleMapsAPIInterceptor(driver)
|
||||||
|
self.api_interceptor.setup_interception()
|
||||||
|
self.api_interceptor.inject_response_interceptor()
|
||||||
|
log.info("API interceptor ready - capturing network responses")
|
||||||
|
|
||||||
pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
||||||
idle = 0
|
idle = 0
|
||||||
processed_ids = set() # Track processed IDs in current session
|
processed_ids = set() # Track processed IDs in current session
|
||||||
@@ -1201,9 +1268,35 @@ class GoogleReviewsScraper:
|
|||||||
last_scroll_position = 0
|
last_scroll_position = 0
|
||||||
scroll_stuck_count = 0
|
scroll_stuck_count = 0
|
||||||
|
|
||||||
|
# Card selectors to try (Updated January 2026)
|
||||||
|
card_selectors = [
|
||||||
|
CARD_SEL, # Primary: div.jftiEf
|
||||||
|
"div[data-review-id]", # Alternative: direct data-review-id
|
||||||
|
".jftiEf", # Without div prefix
|
||||||
|
"div.WMbnJf", # Another common review card class
|
||||||
|
"[data-review-id]", # Any element with review ID
|
||||||
|
]
|
||||||
|
|
||||||
while attempts < max_attempts:
|
while attempts < max_attempts:
|
||||||
try:
|
try:
|
||||||
cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
|
# Try multiple card selectors within the pane
|
||||||
|
cards = []
|
||||||
|
for card_sel in card_selectors:
|
||||||
|
cards = pane.find_elements(By.CSS_SELECTOR, card_sel)
|
||||||
|
if cards:
|
||||||
|
if attempts == 0: # Only log once
|
||||||
|
log.info(f"Found {len(cards)} cards with selector: {card_sel}")
|
||||||
|
break
|
||||||
|
|
||||||
|
# If no cards found in pane, try searching the entire document
|
||||||
|
if not cards:
|
||||||
|
for card_sel in card_selectors:
|
||||||
|
cards = driver.find_elements(By.CSS_SELECTOR, card_sel)
|
||||||
|
if cards:
|
||||||
|
if attempts == 0:
|
||||||
|
log.info(f"Found {len(cards)} cards in document with selector: {card_sel}")
|
||||||
|
break
|
||||||
|
|
||||||
fresh_cards: List[WebElement] = []
|
fresh_cards: List[WebElement] = []
|
||||||
|
|
||||||
# Check for valid cards
|
# Check for valid cards
|
||||||
@@ -1228,7 +1321,15 @@ class GoogleReviewsScraper:
|
|||||||
|
|
||||||
for c in cards:
|
for c in cards:
|
||||||
try:
|
try:
|
||||||
|
# Try to get data-review-id from the card itself
|
||||||
cid = c.get_attribute("data-review-id")
|
cid = c.get_attribute("data-review-id")
|
||||||
|
# If not found on card, try to find it in a child element
|
||||||
|
if not cid:
|
||||||
|
try:
|
||||||
|
review_id_elem = c.find_element(By.CSS_SELECTOR, "[data-review-id]")
|
||||||
|
cid = review_id_elem.get_attribute("data-review-id")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if not cid or cid in seen or cid in processed_ids:
|
if not cid or cid in seen or cid in processed_ids:
|
||||||
if stop_on_match and cid and (cid in seen or cid in processed_ids):
|
if stop_on_match and cid and (cid in seen or cid in processed_ids):
|
||||||
idle = 999
|
idle = 999
|
||||||
@@ -1314,6 +1415,20 @@ class GoogleReviewsScraper:
|
|||||||
# Try a simpler scroll method
|
# Try a simpler scroll method
|
||||||
driver.execute_script("window.scrollBy(0, 300);")
|
driver.execute_script("window.scrollBy(0, 300);")
|
||||||
|
|
||||||
|
# Collect API responses if interception is enabled
|
||||||
|
if self.enable_api_intercept and self.api_interceptor:
|
||||||
|
try:
|
||||||
|
responses = self.api_interceptor.get_intercepted_responses()
|
||||||
|
if responses:
|
||||||
|
parsed = self.api_interceptor.parse_reviews_from_responses(responses)
|
||||||
|
for intercepted in parsed:
|
||||||
|
if intercepted.review_id and intercepted.review_id not in api_reviews:
|
||||||
|
api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted)
|
||||||
|
if parsed:
|
||||||
|
log.debug(f"API interceptor captured {len(parsed)} reviews (total unique: {len(api_reviews)})")
|
||||||
|
except Exception as api_err:
|
||||||
|
log.debug(f"API interception error: {api_err}")
|
||||||
|
|
||||||
# Dynamic sleep: sleep less when processing many reviews, more when finding none
|
# Dynamic sleep: sleep less when processing many reviews, more when finding none
|
||||||
if len(fresh_cards) > 5:
|
if len(fresh_cards) > 5:
|
||||||
sleep_time = 0.7
|
sleep_time = 0.7
|
||||||
@@ -1339,6 +1454,23 @@ class GoogleReviewsScraper:
|
|||||||
|
|
||||||
pbar.close()
|
pbar.close()
|
||||||
|
|
||||||
|
# Merge API-captured reviews if any
|
||||||
|
if self.enable_api_intercept and api_reviews:
|
||||||
|
log.info(f"Merging {len(api_reviews)} reviews captured via API interception")
|
||||||
|
for review_id, api_review in api_reviews.items():
|
||||||
|
if review_id not in docs:
|
||||||
|
# New review from API only
|
||||||
|
docs[review_id] = api_review
|
||||||
|
seen.add(review_id)
|
||||||
|
else:
|
||||||
|
# Merge API data with existing DOM data (API might have more details)
|
||||||
|
existing = docs[review_id]
|
||||||
|
# Only update fields that are missing or empty
|
||||||
|
for key, value in api_review.items():
|
||||||
|
if key not in existing or not existing.get(key):
|
||||||
|
existing[key] = value
|
||||||
|
log.info(f"After merge: {len(docs)} total reviews")
|
||||||
|
|
||||||
# Save to MongoDB if enabled
|
# Save to MongoDB if enabled
|
||||||
if self.use_mongodb and self.mongodb:
|
if self.use_mongodb and self.mongodb:
|
||||||
log.info("Saving reviews to MongoDB...")
|
log.info("Saving reviews to MongoDB...")
|
||||||
@@ -1364,6 +1496,13 @@ class GoogleReviewsScraper:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
# Cleanup API interceptor
|
||||||
|
if self.api_interceptor:
|
||||||
|
try:
|
||||||
|
self.api_interceptor.cleanup()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
if driver is not None:
|
if driver is not None:
|
||||||
try:
|
try:
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|||||||
4
start.py
4
start.py
@@ -64,6 +64,10 @@ def main():
|
|||||||
# Update config with the provided custom parameters
|
# Update config with the provided custom parameters
|
||||||
config["custom_params"].update(args.custom_params)
|
config["custom_params"].update(args.custom_params)
|
||||||
|
|
||||||
|
# Handle API interception option
|
||||||
|
if args.enable_api_intercept:
|
||||||
|
config["enable_api_intercept"] = True
|
||||||
|
|
||||||
# Initialize and run scraper
|
# Initialize and run scraper
|
||||||
scraper = GoogleReviewsScraper(config)
|
scraper = GoogleReviewsScraper(config)
|
||||||
scraper.scrape()
|
scraper.scrape()
|
||||||
|
|||||||
Reference in New Issue
Block a user