v1.0.0 improvements: - Add captcha detection (reCAPTCHA, unusual traffic, challenges) - Block fonts, analytics, maps tiles for faster scrolling - Add 95% close-enough threshold to skip unnecessary retries - Stop immediately if captcha detected instead of retrying v1.1.0 new features: - Multi-sort strategy to bypass ~1000 review limit - Cycles through newest/lowest/highest/relevant sorts - Auto mode: enables multi-sort when total > 1000 - Diminishing returns detection (stops if <5% new per pass) - Configurable sort order and thresholds Also adds test_scraper_v110.py CLI tool for testing multi-sort. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2365 lines
98 KiB
Python
2365 lines
98 KiB
Python
"""
|
|
Google Reviews Scraper v1.0.0
|
|
|
|
This module provides the core Google Maps reviews scraping functionality.
|
|
- Simple down scrolling
|
|
- DOM scraping + API interception
|
|
|
|
Version: 1.0.0
|
|
Migrated from: modules/scraper_clean.py
|
|
"""
|
|
|
|
import re
|
|
import json
|
|
import time
|
|
import threading
|
|
from datetime import datetime
|
|
from typing import List, Optional
|
|
from selenium.webdriver.common.by import By
|
|
|
|
from utils.logger import StructuredLogger
|
|
|
|
def get_chrome_memory(driver) -> Optional[int]:
    """Return the page's used JavaScript heap size in whole megabytes.

    Issues the ``Performance.getMetrics`` command through
    ``driver.execute_cdp_cmd`` and reads the ``JSHeapUsedSize`` metric from
    the reply. Note that this is the JS heap only, not the total memory
    footprint of the browser process.

    Args:
        driver: Object exposing ``execute_cdp_cmd(command, params)``
            (a Chromium-based Selenium WebDriver).

    Returns:
        The heap size in MB (truncated towards zero), or ``None`` when the
        metric is absent or the call fails for any ordinary reason.
    """
    try:
        result = driver.execute_cdp_cmd('Performance.getMetrics', {})
        for metric in result.get('metrics', []):
            if metric['name'] == 'JSHeapUsedSize':
                return int(metric['value'] / 1024 / 1024)
    except Exception:
        # Best-effort probe: a dead session or malformed reply just means
        # "no measurement". Unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        pass
    return None
|
|
|
|
|
|
def get_dom_node_count(driver) -> Optional[int]:
    """Return the number of elements in the current document.

    Runs ``document.getElementsByTagName('*').length`` in the page through
    ``driver.execute_script``.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        The value produced by the script, or ``None`` if the script could
        not be executed.
    """
    try:
        return driver.execute_script("return document.getElementsByTagName('*').length")
    except Exception:
        # Best-effort metric; only ordinary errors are suppressed so that
        # KeyboardInterrupt / SystemExit still reach the caller.
        return None
|
|
|
|
|
|
def capture_session_fingerprint(driver) -> dict:
    """Capture a browser session fingerprint for bot-detection analysis.

    Each attribute is read with its own small JavaScript probe executed via
    ``driver.execute_script``. Probes are independent: one that fails leaves
    its field as ``None`` and does not prevent the remaining probes from
    running (this includes the individual screen and viewport dimensions).

    If the driver reports an empty URL or ``data:,`` the function first
    navigates to ``about:blank`` so that scripts can be executed.

    Args:
        driver: Selenium WebDriver instance (must be initialized). Only
            ``current_url``, ``get`` and ``execute_script`` are used.

    Returns:
        Dictionary with the keys ``user_agent``, ``platform``, ``language``,
        ``languages``, ``timezone``, ``screen`` (width/height/colorDepth),
        ``viewport`` (width/height), ``webgl_vendor``, ``webgl_renderer``,
        ``canvas_fingerprint``, ``hardware_concurrency``, ``device_memory``,
        ``bot_detection_tests`` (webdriver_hidden/chrome_runtime/
        permissions_query) and ``captured_at`` (local ISO timestamp).
        A ``capture_error`` key holding the error text is added only when
        the capture as a whole fails (e.g. the session is unusable).
    """
    fingerprint = {
        "user_agent": None,
        "platform": None,
        "language": None,
        "languages": None,
        "timezone": None,
        "screen": {
            "width": None,
            "height": None,
            "colorDepth": None,
        },
        "viewport": {
            "width": None,
            "height": None,
        },
        "webgl_vendor": None,
        "webgl_renderer": None,
        "canvas_fingerprint": None,
        "hardware_concurrency": None,
        "device_memory": None,
        "bot_detection_tests": {
            "webdriver_hidden": None,
            "chrome_runtime": None,
            "permissions_query": None,
        },
        "captured_at": None,
    }

    # WebGL vendor and renderer (important for fingerprinting).
    webgl_script = """
        try {
            var canvas = document.createElement('canvas');
            var gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
            if (gl) {
                var debugInfo = gl.getExtension('WEBGL_debug_renderer_info');
                if (debugInfo) {
                    return {
                        vendor: gl.getParameter(debugInfo.UNMASKED_VENDOR_WEBGL),
                        renderer: gl.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL)
                    };
                }
            }
        } catch(e) {}
        return {vendor: null, renderer: null};
    """

    # Canvas fingerprint: draws fixed content and folds the data URL into a
    # simple 32-bit rolling hash, returned as a hex string.
    canvas_script = """
        try {
            var canvas = document.createElement('canvas');
            canvas.width = 200;
            canvas.height = 50;
            var ctx = canvas.getContext('2d');
            ctx.textBaseline = 'top';
            ctx.font = '14px Arial';
            ctx.fillStyle = '#f60';
            ctx.fillRect(125, 1, 62, 20);
            ctx.fillStyle = '#069';
            ctx.fillText('Fingerprint', 2, 15);
            ctx.fillStyle = 'rgba(102, 204, 0, 0.7)';
            ctx.fillText('Fingerprint', 4, 17);
            var dataUrl = canvas.toDataURL();
            // Simple hash
            var hash = 0;
            for (var i = 0; i < dataUrl.length; i++) {
                var char = dataUrl.charCodeAt(i);
                hash = ((hash << 5) - hash) + char;
                hash = hash & hash;
            }
            return hash.toString(16);
        } catch(e) {
            return null;
        }
    """

    # Only checks that the Permissions API entry point exists.
    permissions_script = """
        try {
            if (navigator.permissions && navigator.permissions.query) {
                return true;
            }
            return false;
        } catch(e) {
            return false;
        }
    """

    def run_js(script):
        """Run one probe; an ordinary failure yields None instead of aborting."""
        try:
            return driver.execute_script(script)
        except Exception:
            return None

    try:
        # Navigate to about:blank first to ensure we can execute JS
        # (in case the driver was just created and hasn't navigated yet).
        current_url = driver.current_url
        if not current_url or current_url == "data:,":
            driver.get("about:blank")

        fingerprint["captured_at"] = datetime.now().isoformat()

        # Basic navigator properties.
        fingerprint["user_agent"] = run_js("return navigator.userAgent")
        fingerprint["platform"] = run_js("return navigator.platform")
        fingerprint["language"] = run_js("return navigator.language")
        fingerprint["languages"] = run_js("return navigator.languages")
        fingerprint["timezone"] = run_js(
            "return Intl.DateTimeFormat().resolvedOptions().timeZone"
        )

        # Screen and viewport properties.
        screen = fingerprint["screen"]
        screen["width"] = run_js("return screen.width")
        screen["height"] = run_js("return screen.height")
        screen["colorDepth"] = run_js("return screen.colorDepth")

        viewport = fingerprint["viewport"]
        viewport["width"] = run_js("return window.innerWidth")
        viewport["height"] = run_js("return window.innerHeight")

        webgl_info = run_js(webgl_script)
        if isinstance(webgl_info, dict):
            fingerprint["webgl_vendor"] = webgl_info.get("vendor")
            fingerprint["webgl_renderer"] = webgl_info.get("renderer")

        fingerprint["canvas_fingerprint"] = run_js(canvas_script)

        # Hardware info.
        fingerprint["hardware_concurrency"] = run_js("return navigator.hardwareConcurrency")
        fingerprint["device_memory"] = run_js("return navigator.deviceMemory")

        # Bot detection tests.
        tests = fingerprint["bot_detection_tests"]
        # Test 1: webdriver property should be hidden/false for undetected Chrome.
        tests["webdriver_hidden"] = run_js(
            "return navigator.webdriver === undefined || navigator.webdriver === false"
        )
        # Test 2: window.chrome should exist in real Chrome.
        tests["chrome_runtime"] = run_js("return typeof window.chrome !== 'undefined'")
        # Test 3: permissions.query should be available in real Chrome.
        tests["permissions_query"] = run_js(permissions_script)

    except Exception as e:
        # Whole-capture failure (e.g. current_url / navigation raised):
        # report it in-band so the caller still receives a dictionary.
        fingerprint["capture_error"] = str(e)

    return fingerprint
|
|
|
|
|
|
def classify_crash(exception: Exception, metrics_history: list) -> str:
    """Classify a crash from the exception text and the latest metrics sample.

    Checks are applied in a fixed order and the first hit wins:

    1. ``'tab_crash'``: message contains "aw, snap" or
       "status_access_violation".
    2. ``'timeout'``: message contains "timeout".
    3. ``'memory_exhaustion'``: the last entry of ``metrics_history`` has
       a numeric ``memory_mb`` above 400.
    4. ``'element_not_found'``: message contains "no such element".
    5. ``'rate_limited'``: message contains "429" or "rate".
    6. ``'network_failure'``: message contains "network" or "connection".
    7. ``'unknown'`` otherwise.

    Matching is a case-insensitive substring test.

    Args:
        exception: The exception that ended the scrape.
        metrics_history: Sequence of metric dicts, newest last; may be empty
            or ``None``. A last sample that is not a dict, or whose
            ``memory_mb`` is missing or ``None``, is treated as "no memory
            data" rather than raising.

    Returns:
        One of the category strings listed above.
    """
    error_str = str(exception).lower()

    if 'aw, snap' in error_str or 'status_access_violation' in error_str:
        return 'tab_crash'
    if 'timeout' in error_str:
        return 'timeout'

    # The memory probe can legitimately record None, so the latest sample
    # must be validated before it is compared against the threshold.
    last_sample = metrics_history[-1] if metrics_history else None
    memory_mb = last_sample.get('memory_mb') if isinstance(last_sample, dict) else None
    if isinstance(memory_mb, (int, float)) and memory_mb > 400:
        return 'memory_exhaustion'

    if 'no such element' in error_str:
        return 'element_not_found'
    if '429' in error_str or 'rate' in error_str:
        return 'rate_limited'
    if 'network' in error_str or 'connection' in error_str:
        return 'network_failure'
    return 'unknown'
|
|
|
|
|
|
class ScraperCrashException(Exception):
    """Scraper failure bundled with the crash report gathered for it.

    The exception message is the string form of the wrapped exception.
    """

    def __init__(self, original_exception, crash_report):
        # Message mirrors the underlying error so logs stay readable.
        super().__init__(str(original_exception))
        # original_exception: the error that was caught.
        # crash_report: diagnostic data attached by the code that raises this.
        self.original_exception, self.crash_report = original_exception, crash_report
|
|
|
|
|
|
def get_topic_variants(topic: str) -> List[str]:
    """
    Generate common variants of a topic word for matching.

    This is a purely suffix-based heuristic (no dictionary), so some
    generated variants are not real words (e.g. "cutt" from "cutting").

    Handles:
    - Singular/plural forms ("services" -> "service")
    - Verb forms (-ing, -ed, -s), including doubled consonants
      ("cutting" -> "cut") and a dropped trailing "e"
      ("service" -> "servicing")

    Args:
        topic: The topic word/phrase to generate variants for. It is
            lower-cased and stripped before processing.

    Returns:
        List of unique variant strings. The normalized topic is always
        first; the remaining variants follow in sorted order. Empty or
        whitespace-only input returns an empty list.

    Example:
        >>> get_topic_variants("cutting")
        ['cutting', 'cut', 'cuts', 'cutt', 'cutts']
        >>> get_topic_variants("services")
        ['services', 'servic', 'service', 'servicing']
    """
    if not topic:
        return []

    topic = topic.lower().strip()
    if not topic:
        # Whitespace-only input must not produce bare suffixes like "s"/"ing".
        return []

    variants = {topic}  # Set avoids duplicates

    # -ing forms (cutting -> cutt, cutts and, via the doubled consonant, cut, cuts)
    if topic.endswith("ing"):
        base = topic[:-3]
        if base:
            variants.update((base, base + "s"))
            if len(base) >= 2 and base[-1] == base[-2]:
                single_consonant = base[:-1]
                variants.update((single_consonant, single_consonant + "s"))

    # -es / -s plural forms
    if topic.endswith("es") and len(topic) > 2:
        variants.add(topic[:-2])            # boxes -> box
        variants.add(topic[:-2] + "ing")    # services -> servicing
        variants.add(topic[:-1])            # services -> service
    elif topic.endswith("s") and len(topic) > 1 and not topic.endswith("ss"):
        variants.add(topic[:-1])
        variants.add(topic[:-1] + "ing")

    # -ed forms (colored -> color, colors, coloring)
    if topic.endswith("ed") and len(topic) > 2:
        base = topic[:-2]
        if base:
            variants.update((base, base + "s", base + "ing"))
            # Doubled consonant (stopped -> stop)
            if len(base) >= 2 and base[-1] == base[-2]:
                variants.add(base[:-1])

    # No recognised suffix: treat as a base word and add inflected forms
    if not topic.endswith(("ing", "ed", "s")):
        variants.add(topic + "s")
        variants.add(topic + "ing")
        # Consonant doubling for -ing (cut -> cutting)
        if len(topic) >= 2 and topic[-1] not in "aeiouwy":
            variants.add(topic + topic[-1] + "ing")
        # Trailing "e" is dropped before -ing (service -> servicing)
        if len(topic) > 1 and topic.endswith("e"):
            variants.add(topic[:-1] + "ing")

    # Deterministic order: original first, then the rest sorted.
    variants.discard(topic)
    return [topic] + sorted(variants)
|
|
|
|
|
|
def infer_review_topics(review_text: str, topics: List[dict]) -> List[str]:
    """
    Match review text against extracted topic keywords.

    For every topic, all variants produced by ``get_topic_variants`` are
    searched in the lower-cased review text as whole words/phrases. A topic
    is reported once, using its original spelling, as soon as any of its
    variants is found.

    Args:
        review_text: The review text to analyze.
        topics: List of topic dicts, e.g., [{"topic": "cutting", "count": 3}].
            Entries that are not dicts, or whose "topic" value is missing,
            empty or not a string, are skipped.

    Returns:
        List of matched topic names, in the order they appear in ``topics``.

    Example:
        >>> topics = [{"topic": "hair salon", "count": 4}, {"topic": "cutting", "count": 3}]
        >>> text = "Great haircut! The cutting was professional."
        >>> infer_review_topics(text, topics)
        ['cutting']
    """
    # Handle empty/None inputs gracefully
    if not review_text or not topics:
        return []

    haystack = review_text.lower()
    matched_topics = []

    for entry in topics:
        # One malformed entry must not abort matching for the others.
        if not isinstance(entry, dict):
            continue
        topic = entry.get("topic", "")
        if not topic or not isinstance(topic, str):
            continue

        variants = [v for v in get_topic_variants(topic.lower().strip()) if v]
        if not variants:
            continue

        # Whole-word match: "cut" must not match inside "execute".
        # Lookarounds are used instead of \b because \b can never match next
        # to a variant that starts or ends with punctuation (e.g. "c++");
        # for ordinary word variants the two forms are equivalent.
        alternatives = "|".join(re.escape(v) for v in variants)
        pattern = r'(?<!\w)(?:' + alternatives + r')(?!\w)'

        if re.search(pattern, haystack):
            matched_topics.append(topic)  # Use original topic name

    return matched_topics
|
|
|
|
|
|
class LogCapture:
    """
    Thin adapter that exposes the legacy LogCapture interface on top of a
    StructuredLogger.

    Every logging method forwards the entry to the wrapped logger and then
    echoes the message text to stdout (flushed immediately).
    """

    def __init__(self):
        self._logger = StructuredLogger()

    @staticmethod
    def _resolve(category_or_msg, message):
        """Return (category, text) for the dual-signature logging methods."""
        if message is None:
            # Single-argument form: the argument is the message itself.
            return 'scraper', category_or_msg
        return category_or_msg, message

    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
        """Record ``message`` at ``level`` (legacy entry point).

        ``source`` is translated to a logger category; levels other than
        ERROR, WARNING/WARN and DEBUG (case-insensitive) are logged as INFO.
        """
        category = self._source_to_category(source)
        normalized = level.upper()

        if normalized in ("WARNING", "WARN"):
            sink = self._logger.warn
        elif normalized == "ERROR":
            sink = self._logger.error
        elif normalized == "DEBUG":
            sink = self._logger.debug
        else:
            sink = self._logger.info
        sink(category, message)

        # Console echo for visibility
        print(message, flush=True)

    def info(self, category_or_msg, message: str = None, *, metrics: dict = None):
        """
        Log an INFO message.

        ``info(message)`` logs under the 'scraper' category;
        ``info(category, message, metrics={...})`` logs under ``category``.
        A call with two positional arguments is always read as
        (category, message).
        """
        category, text = self._resolve(category_or_msg, message)
        self._logger.info(category, text, metrics=metrics)
        print(text, flush=True)

    def warning(self, category_or_msg, message: str = None, *, metrics: dict = None):
        """Log a WARNING message; accepts the same two call shapes as ``info``."""
        category, text = self._resolve(category_or_msg, message)
        self._logger.warn(category, text, metrics=metrics)
        print(text, flush=True)

    def warn(self, category, message: str, *, metrics: dict = None):
        """Log a WARN message under an explicit category."""
        self._logger.warn(category, message, metrics=metrics)
        print(message, flush=True)

    def error(self, category_or_msg, message: str = None, *, metrics: dict = None):
        """Log an ERROR message; accepts the same two call shapes as ``info``."""
        category, text = self._resolve(category_or_msg, message)
        self._logger.error(category, text, metrics=metrics)
        print(text, flush=True)

    def debug(self, category, message: str, *, metrics: dict = None):
        """Log a DEBUG message under an explicit category."""
        self._logger.debug(category, message, metrics=metrics)
        print(message, flush=True)

    def get_logs(self):
        """Return whatever the wrapped logger's ``get_logs()`` returns."""
        return self._logger.get_logs()

    def _source_to_category(self, source: str) -> str:
        """Translate a legacy source name into a logger category.

        Unknown or empty sources fall back to 'scraper'.
        """
        key = source.lower() if source else 'scraper'
        groups = (
            ('browser', ('browser', 'navigation', 'page')),
            ('network', ('network', 'api')),
            ('system', ('system', 'memory', 'chrome')),
        )
        for category, aliases in groups:
            if key in aliases:
                return category
        return 'scraper'
|
|
|
|
|
|
def parse_api_review(raw: list) -> Optional[dict]:
    """Parse a review from a flat API response array.

    Fields are read by position: ``raw[0]`` author, ``raw[1]`` timestamp,
    ``raw[3]`` text, ``raw[4]`` rating; an owner response is looked for at
    index 9 and then 18 as a ``[timestamp, text, ...]`` list.

    Args:
        raw: Candidate review array; anything that is not a list with at
            least five items is rejected.

    Returns:
        A dict with ``author``, ``text``, ``rating``, ``timestamp``,
        ``owner_response`` (dict with ``text``/``timestamp`` or ``None``)
        and ``source`` ("api"), or ``None`` when the array does not look
        like a real review:

        - rating is not an integer from 1 to 5 (booleans are rejected),
        - author is not a string longer than 3 characters, or is one of a
          few known non-author tokens,
        - timestamp is truthy but contains "http" or is 3 characters or
          shorter.
    """
    if not isinstance(raw, list) or len(raw) < 5:
        return None

    try:
        # Length is already guaranteed >= 5, so indexes 0-4 are safe.
        author = raw[0] if isinstance(raw[0], str) else ""
        timestamp = raw[1]
        text = raw[3] if isinstance(raw[3], str) else ""
        rating = raw[4]

        # bool is a subclass of int; True must not count as a 1-star rating.
        if isinstance(rating, bool) or not isinstance(rating, int):
            return None
        if not (1 <= rating <= 5):
            return None

        # Filter out garbage data (language codes, metadata, etc.).
        # NOTE(review): this heuristic also drops genuine names of three
        # characters or fewer - confirm that is acceptable.
        if len(author) <= 3:
            return None
        if author.lower() in ('google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt'):
            return None

        # Timestamp should look like a date, not a URL or language code.
        if timestamp and ('http' in str(timestamp) or len(str(timestamp)) <= 3):
            return None

        # Owner response: first candidate slot holding a list of 2+ items.
        owner_response = None
        for idx in (9, 18):
            if len(raw) > idx and raw[idx] and isinstance(raw[idx], list):
                resp = raw[idx]
                if len(resp) > 1:
                    owner_response = {"text": resp[1], "timestamp": resp[0] if resp[0] else ""}
                    break

        return {
            "author": author,
            "text": text,
            "rating": rating,
            "timestamp": timestamp,
            "owner_response": owner_response,
            "source": "api",
        }
    except Exception:
        # Unexpected element types (e.g. objects whose str() fails) mean
        # "not a review"; KeyboardInterrupt/SystemExit still propagate.
        return None
|
|
|
|
|
|
def _dig_path(node, *path, default=None):
    """Follow ``path`` through nested sequences; return ``default`` if any step fails."""
    try:
        for key in path:
            node = node[key]
    except (IndexError, KeyError, TypeError):
        return default
    return node


def extract_reviews_from_api_body(body: str) -> list:
    """Extract reviews from a Google Maps review-listing API response body.

    The body is JSON, optionally preceded by the ``)]}'`` guard prefix.
    ``data[2]`` holds the review entries; for each entry ``item`` the review
    record is ``item[0]`` and fields are read from these paths:

    - Review ID: ``[0]`` (same format as the DOM's ``data-review-id``)
    - Author:    ``[1][4][5][0]``
    - Timestamp: ``[1][6]``
    - Rating:    ``[2][0][0]``
    - Text:      ``[2][15][0][0]``

    A missing path leaves that field at its default; an entry is kept only
    when it has a non-empty string author and an integer rating from 1 to 5
    (booleans are rejected).

    Args:
        body: Raw response text. Non-string or unparsable input yields an
            empty list rather than an exception.

    Returns:
        List of dicts with ``review_id``, ``author``, ``text``, ``rating``,
        ``timestamp`` and ``source`` ("api").
    """
    reviews = []
    if not isinstance(body, str):
        return reviews

    # Remove the )]}' anti-hijacking prefix before parsing.
    if body.startswith(")]}'"):
        body = body[4:].strip()

    try:
        data = json.loads(body)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        return reviews

    if not isinstance(data, list) or len(data) < 3:
        return reviews
    reviews_area = data[2]
    if not isinstance(reviews_area, list):
        return reviews

    for item in reviews_area:
        if not isinstance(item, list) or not item:
            continue
        review_data = item[0]
        if not isinstance(review_data, list) or len(review_data) < 3:
            continue

        author = _dig_path(review_data, 1, 4, 5, 0, default="")
        rating = _dig_path(review_data, 2, 0, 0, default=0)

        if not (isinstance(author, str) and author):
            continue
        # bool is a subclass of int; a JSON true/false is not a rating.
        if isinstance(rating, bool) or not isinstance(rating, int):
            continue
        if not 1 <= rating <= 5:
            continue

        text = _dig_path(review_data, 2, 15, 0, 0, default="")
        timestamp = _dig_path(review_data, 1, 6, default="")

        # review_id is included so callers can deduplicate.
        reviews.append({
            "review_id": review_data[0],
            "author": author,
            "text": text or "",
            "rating": rating,
            "timestamp": timestamp or "",
            "source": "api",
        })

    return reviews
|
|
|
|
def _first_nonempty_text(root, selectors):
    """Return stripped text of the first selector under ``root`` that yields any, else ""."""
    for selector in selectors:
        try:
            found = root.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Element missing or stale: try the next selector.
            continue
        if found:
            return found
    return ""


def parse_dom_review(card) -> Optional[dict]:
    """Parse a review from a DOM review-card element.

    Every field is extracted best-effort: a lookup that fails leaves the
    field at its default instead of discarding the whole review. Review
    text is taken as rendered; "More" buttons are not expanded here.

    Args:
        card: Element exposing ``get_attribute`` and ``find_element``
            (a Selenium WebElement for one review card).

    Returns:
        A dict with ``id``, ``author``, ``text``, ``rating`` (int, 0 when
        unknown), ``timestamp``, ``owner_response`` (dict with
        ``text``/``timestamp`` or ``None``) and ``source`` ("dom").
        Returns ``None`` when neither a review id nor an author could be
        found, or when the card itself is unusable.
    """
    try:
        # Review ID: on the card itself, else on a descendant.
        review_id = card.get_attribute("data-review-id") or ""
        if not review_id:
            try:
                id_el = card.find_element(By.CSS_SELECTOR, "[data-review-id]")
                review_id = id_el.get_attribute("data-review-id") or ""
            except Exception:
                pass

        # Author: several selectors are tried in order.
        author = _first_nonempty_text(
            card, ('div[class*="d4r55"]', '.d4r55', 'button[data-review-id] + div')
        )

        # Rating from the aria-label on span[role="img"].
        rating = 0
        try:
            stars_el = card.find_element(By.CSS_SELECTOR, 'span[role="img"]')
            aria = stars_el.get_attribute("aria-label") or ""
            # First number in the label (handles "5 stars", "5 estrellas",
            # and a decimal comma); the fractional part is truncated.
            num = re.search(r'[\d\.]+', aria.replace(',', '.'))
            if num:
                rating = int(float(num.group()))
        except Exception:
            pass

        # Review text: several selectors are tried in order.
        text = _first_nonempty_text(
            card,
            ('span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', 'div.MyEned span.wiI7pd', '.wiI7pd'),
        )

        timestamp = _first_nonempty_text(card, ('span[class*="rsqaWe"]',))

        # Owner response: only reported when its text is non-empty.
        owner_response = None
        try:
            resp_box = card.find_element(By.CSS_SELECTOR, "div.CDe7pd")
            if resp_box:
                resp_text = _first_nonempty_text(resp_box, ("div.wiI7pd",))
                resp_date = _first_nonempty_text(resp_box, ("span.DZSIDd",))
                if resp_text:
                    owner_response = {"text": resp_text, "timestamp": resp_date}
        except Exception:
            pass

        if not review_id and not author:
            return None

        return {
            "id": review_id,
            "author": author,
            "text": text,
            "rating": rating,
            "timestamp": timestamp,
            "owner_response": owner_response,
            "source": "dom",
        }
    except Exception:
        return None
|
|
|
|
|
|
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
|
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
|
|
progress_callback=None, validation_only: bool = False) -> dict:
|
|
"""
|
|
Scrape Google Maps reviews.
|
|
|
|
Args:
|
|
driver: Selenium WebDriver instance
|
|
url: Google Maps place URL
|
|
max_reviews: Maximum reviews to collect
|
|
timeout_no_new: Seconds to wait with no new reviews before stopping
|
|
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
|
|
This allows streaming data to disk and freeing memory
|
|
flush_batch_size: Number of reviews to collect before flushing (default 500)
|
|
log_capture: Optional LogCapture instance for storing logs
|
|
progress_callback: Optional callback(current_count, total_count) called every iteration
|
|
|
|
Returns:
|
|
dict with reviews list and metadata
|
|
"""
|
|
# Use provided log_capture or create a dummy that just prints
|
|
log = log_capture or LogCapture()
|
|
|
|
# Capture session fingerprint early (before navigation) for bot detection analysis
|
|
session_fingerprint = capture_session_fingerprint(driver)
|
|
log.info('browser', "Session fingerprint captured", metrics={
|
|
'user_agent': session_fingerprint.get('user_agent', 'unknown')[:50] + '...' if session_fingerprint.get('user_agent') else 'unknown',
|
|
'platform': session_fingerprint.get('platform'),
|
|
'timezone': session_fingerprint.get('timezone'),
|
|
'webdriver_hidden': session_fingerprint.get('bot_detection_tests', {}).get('webdriver_hidden'),
|
|
'chrome_runtime': session_fingerprint.get('bot_detection_tests', {}).get('chrome_runtime')
|
|
})
|
|
|
|
# Storage - use review ID as key
|
|
reviews = {} # review_id -> review
|
|
seen_ids = set() # Track all IDs we've seen (persists after flush)
|
|
total_flushed = [0] # Use list for closure mutation
|
|
review_order = {} # review_id -> position (DOM visual order for sorting)
|
|
order_counter = [0] # Current order position
|
|
|
|
# Track total reviews (persists across refreshes)
|
|
total_reviews = [None] # Use list for closure mutation
|
|
|
|
# Store business info extracted from overview (before clicking reviews tab)
|
|
business_info_cache = [None]
|
|
|
|
# Hard refresh counter
|
|
hard_refresh_count = [0]
|
|
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
|
|
|
|
# Find scrollable reviews container helper
|
|
def find_scroll_container():
|
|
selectors = [
|
|
"div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
|
|
"div.m6QErb.DxyBCb.kA9KIf",
|
|
"div.m6QErb.DxyBCb",
|
|
"div.m6QErb[aria-label]",
|
|
"div.DxyBCb.kA9KIf.dS8AEf",
|
|
"div[role='main'] div.m6QErb",
|
|
]
|
|
for sel in selectors:
|
|
try:
|
|
els = driver.find_elements(By.CSS_SELECTOR, sel)
|
|
for el in els:
|
|
if el.is_displayed() and el.size['height'] > 100:
|
|
return el
|
|
except:
|
|
pass
|
|
return None
|
|
|
|
def setup_reviews_page(is_refresh=False, validation_only_mode=False):
|
|
"""
|
|
Setup the reviews page for scraping.
|
|
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
|
|
Can be called after initial load or after a hard refresh.
|
|
|
|
If validation_only_mode=True, returns early after extracting business info
|
|
without clicking reviews tab or finding scroll container.
|
|
"""
|
|
nonlocal total_reviews
|
|
|
|
refresh_label = " (after refresh)" if is_refresh else ""
|
|
|
|
# Navigate to URL (only on initial load or refresh)
|
|
if not is_refresh:
|
|
# Reset browser state by navigating to blank page first
|
|
# This clears any stale state from pooled browser sessions
|
|
try:
|
|
driver.get("about:blank")
|
|
time.sleep(0.1)
|
|
except:
|
|
pass
|
|
|
|
log.info('browser', f"Loading: {url[:80]}...")
|
|
else:
|
|
log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
|
driver.get(url)
|
|
|
|
# Handle consent popup if redirected (poll with tiny sleep)
|
|
start = time.time()
|
|
while time.time() - start < 5: # Max 5s for consent
|
|
if "consent.google" in driver.current_url:
|
|
log.info('browser', "Handling consent popup...")
|
|
try:
|
|
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
|
txt = btn.text.lower()
|
|
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
|
btn.click()
|
|
# Reload original URL after consent
|
|
log.info('browser', "Reloading after consent...")
|
|
driver.get(url)
|
|
# Wait for page to settle after consent reload
|
|
time.sleep(1)
|
|
break
|
|
except:
|
|
pass
|
|
break
|
|
# Check if we're already on the target page
|
|
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
|
|
break
|
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
|
|
|
# Extract business info and total review count BEFORE clicking reviews tab (on Overview)
|
|
# This captures name, rating, category, address while they're visible
|
|
# Only on first load (don't overwrite if we already have it)
|
|
if total_reviews[0] is None or business_info_cache[0] is None:
|
|
start = time.time()
|
|
while time.time() - start < 5:
|
|
try:
|
|
info = driver.execute_script("""
|
|
var result = {
|
|
total_reviews: null,
|
|
name: null,
|
|
rating: null,
|
|
category: null,
|
|
address: null
|
|
};
|
|
|
|
// Business name from h1
|
|
var h1 = document.querySelector('h1');
|
|
if (h1) result.name = h1.textContent.trim();
|
|
|
|
// Category - use jsaction attribute (robust selector)
|
|
var catBtn = document.querySelector('button[jsaction*="category"]');
|
|
if (catBtn) result.category = catBtn.textContent.trim();
|
|
|
|
// Rating and review count from span[role="img"] aria-labels
|
|
var spans = document.querySelectorAll('span[role="img"]');
|
|
for (var i = 0; i < spans.length; i++) {
|
|
var label = spans[i].getAttribute('aria-label') || '';
|
|
|
|
// Rating: "4.8 stars"
|
|
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
|
|
if (rMatch && !result.rating) {
|
|
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
|
}
|
|
|
|
// Reviews: "79 reviews"
|
|
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
|
|
if (revMatch && !result.total_reviews) {
|
|
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
|
|
}
|
|
}
|
|
|
|
// Address from button
|
|
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
|
if (addrBtn) {
|
|
var label = addrBtn.getAttribute('aria-label');
|
|
if (label) result.address = label.replace(/^Address:\\s*/i, '');
|
|
}
|
|
|
|
return result;
|
|
""")
|
|
|
|
if info:
|
|
if info.get('total_reviews') and total_reviews[0] is None:
|
|
total_reviews[0] = info['total_reviews']
|
|
log.info('scraper', f"Total reviews on page: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
|
|
if info.get('name') and business_info_cache[0] is None:
|
|
business_info_cache[0] = info
|
|
log.info('scraper', f"Business: {info.get('name')}")
|
|
if total_reviews[0] and business_info_cache[0]:
|
|
break
|
|
except:
|
|
pass
|
|
time.sleep(0.1)
|
|
|
|
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
|
|
if validation_only_mode:
|
|
log.info('scraper', "Validation mode: returning early (skipping reviews tab)")
|
|
return ("validation_done", None)
|
|
|
|
# Click reviews tab - poll until found
|
|
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
|
start = time.time()
|
|
tab_clicked = False
|
|
tabs_logged = False
|
|
while time.time() - start < 5: # Max 5s for tabs
|
|
try:
|
|
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
|
# Log available tabs once for debugging
|
|
if not tabs_logged and tabs:
|
|
tabs_logged = True
|
|
tab_texts = [t.text for t in tabs]
|
|
log.info('browser', f"Available tabs: {tab_texts}")
|
|
for tab in tabs:
|
|
tab_text = tab.text.lower()
|
|
if any(kw in tab_text for kw in review_keywords):
|
|
if not is_refresh:
|
|
log.info('browser', f"Clicking reviews tab: '{tab.text}'")
|
|
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
|
|
if total_reviews[0] is None:
|
|
import re
|
|
# Try pattern with parentheses: "Reviews (79)"
|
|
match = re.search(r'\((\d+)\)', tab.text)
|
|
if match:
|
|
total_reviews[0] = int(match.group(1))
|
|
log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
|
|
else:
|
|
# Try pattern with newline: "Reviews\n79"
|
|
match = re.search(r'(\d+)', tab.text)
|
|
if match:
|
|
total_reviews[0] = int(match.group(1))
|
|
log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
|
|
tab.click()
|
|
tab_clicked = True
|
|
break
|
|
if tab_clicked:
|
|
break
|
|
time.sleep(0.01) # 10ms between polls
|
|
except:
|
|
time.sleep(0.01)
|
|
|
|
# Poll for scroll container (10ms intervals - fast but low CPU)
|
|
scroll_container = None
|
|
start = time.time()
|
|
last_print = 0
|
|
while time.time() - start < 10: # Max 10s
|
|
scroll_container = find_scroll_container()
|
|
if scroll_container:
|
|
break
|
|
elapsed = int(time.time() - start)
|
|
if elapsed > last_print:
|
|
log.info('browser', f"Waiting for reviews panel...{refresh_label} ({elapsed}s)")
|
|
last_print = elapsed
|
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
|
|
|
if not scroll_container:
|
|
log.error('browser', f"Could not find reviews scroll container{refresh_label}")
|
|
try:
|
|
log.error('browser', f"Page title: {driver.title}")
|
|
log.error('browser', f"Current URL: {driver.current_url[:100]}")
|
|
except:
|
|
pass
|
|
return None, None
|
|
|
|
log.info('browser', f"Found scroll container{refresh_label}")
|
|
|
|
# Inject API interceptor (needs to be re-injected after refresh)
|
|
if not is_refresh:
|
|
log.info('network', "Injecting API interceptor...")
|
|
driver.execute_script("""
|
|
// Always re-setup on refresh
|
|
window.__reviewInterceptorInjected = true;
|
|
window.__interceptedResponses = window.__interceptedResponses || [];
|
|
|
|
// Intercept fetch (only if not already patched)
|
|
if (!window.__fetchPatched) {
|
|
window.__fetchPatched = true;
|
|
const originalFetch = window.fetch;
|
|
window.fetch = async function(...args) {
|
|
const url = args[0].toString();
|
|
const response = await originalFetch.apply(this, args);
|
|
if (url.includes('listugcposts') || url.includes('review')) {
|
|
try {
|
|
const clone = response.clone();
|
|
const text = await clone.text();
|
|
window.__interceptedResponses.push({url: url, body: text});
|
|
} catch(e) {}
|
|
}
|
|
return response;
|
|
};
|
|
}
|
|
|
|
// Intercept XHR (only if not already patched)
|
|
if (!window.__xhrPatched) {
|
|
window.__xhrPatched = true;
|
|
const originalXHR = window.XMLHttpRequest;
|
|
window.XMLHttpRequest = function() {
|
|
const xhr = new originalXHR();
|
|
const originalOpen = xhr.open;
|
|
let reqUrl = '';
|
|
xhr.open = function(method, url, ...rest) {
|
|
reqUrl = url;
|
|
return originalOpen.apply(this, [method, url, ...rest]);
|
|
};
|
|
xhr.addEventListener('load', function() {
|
|
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
|
|
try {
|
|
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
|
|
} catch(e) {}
|
|
}
|
|
});
|
|
return xhr;
|
|
};
|
|
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
|
|
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
|
|
}
|
|
}
|
|
""")
|
|
|
|
# Sort by newest first
|
|
try:
|
|
sort_btn = driver.execute_script("""
|
|
var btns = document.querySelectorAll('button[data-value="sort"]');
|
|
if (btns.length) return btns[0];
|
|
var all = document.querySelectorAll('button[aria-label*="Sort"]');
|
|
if (all.length) return all[0];
|
|
return null;
|
|
""")
|
|
if sort_btn:
|
|
sort_btn.click()
|
|
time.sleep(0.3)
|
|
driver.execute_script("""
|
|
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
|
|
for (var i = 0; i < items.length; i++) {
|
|
var txt = items[i].textContent.toLowerCase();
|
|
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
|
|
items[i].click();
|
|
break;
|
|
}
|
|
}
|
|
""")
|
|
time.sleep(0.5)
|
|
log.info('browser', "Sorted by newest")
|
|
# Re-find scroll container after sorting (DOM may be recreated)
|
|
new_container = find_scroll_container()
|
|
if new_container:
|
|
scroll_container = new_container
|
|
log.info('browser', "Refreshed scroll container reference")
|
|
except:
|
|
pass
|
|
|
|
# Expand "More" buttons for full text
|
|
try:
|
|
expanded = driver.execute_script("""
|
|
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
|
var count = 0;
|
|
for (var i = 0; i < buttons.length; i++) {
|
|
if (buttons[i].textContent.trim() === 'More') {
|
|
buttons[i].click();
|
|
count++;
|
|
}
|
|
}
|
|
return count;
|
|
""")
|
|
if expanded > 0:
|
|
log.info('browser', f"Expanded {expanded} truncated reviews", metrics={'expanded_count': expanded})
|
|
except:
|
|
pass
|
|
|
|
# Block heavy resources to speed up scrolling (use CDP)
|
|
try:
|
|
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
|
'urls': [
|
|
# Images
|
|
'*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg',
|
|
'*googleusercontent.com/*',
|
|
# Fonts
|
|
'*.woff', '*.woff2', '*.ttf', '*.otf',
|
|
# Analytics/tracking
|
|
'*google-analytics.com/*', '*googletagmanager.com/*',
|
|
'*doubleclick.net/*', '*googlesyndication.com/*',
|
|
# Maps tiles (not needed for reviews)
|
|
'*khms*.google.com/*', '*maps.googleapis.com/maps/vt*'
|
|
]
|
|
})
|
|
driver.execute_cdp_cmd('Network.enable', {})
|
|
if not is_refresh:
|
|
log.info('browser', "Blocking heavy resources for faster scrolling")
|
|
except:
|
|
pass
|
|
|
|
# Setup scrollable pane reference
|
|
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
|
|
|
|
# Create scroll worker
|
|
stop_scrolling = threading.Event()
|
|
|
|
def scroll_worker():
    """Keep the reviews pane pinned to its bottom until asked to stop.

    Runs in a background thread. Roughly every 100 ms it sets the pane
    stored in ``window.scrollablePane`` to its full scroll height, and
    exits once the ``stop_scrolling`` event is set.
    """
    while not stop_scrolling.is_set():
        try:
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = p.scrollHeight;
            """)
        except Exception:
            # Best effort: a failed scroll is simply retried on the next tick.
            # Not logged here because this runs ~10x per second.
            pass
        # Event.wait() paces the loop like sleep(0.1) but returns immediately
        # when the stop event is set, so shutdown is not delayed by a sleep.
        stop_scrolling.wait(0.1)
|
|
|
|
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
|
|
scroll_thread.start()
|
|
|
|
return scroll_container, stop_scrolling
|
|
|
|
# Helper to extract review topics from the reviews tab
|
|
def extract_review_topics():
    """Extract review topic filters from the reviews radiogroup.

    Runs a script in the page that looks for a ``div[role="radiogroup"]``
    (preferring one whose aria-label contains "Refine"/"refine") and reads
    each ``button[role="radio"]`` inside it. Labels of the form
    "<topic>, mentioned in <N> reviews" are parsed directly; other labels
    (except ones containing "all review") fall back to the button's child
    spans.

    Returns:
        A list of ``{"topic": str, "count": int}`` dicts. Empty when no
        radiogroup is found or the script fails.
    """
    try:
        topics = driver.execute_script("""
            var topics = [];

            // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
            var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');

            if (!container) {
                // Fallback: any radiogroup in the reviews area
                container = document.querySelector('div[role="radiogroup"]');
            }

            if (container) {
                var buttons = container.querySelectorAll('button[role="radio"]');
                for (var btn of buttons) {
                    var label = btn.getAttribute('aria-label') || '';
                    // Parse "hair salon, mentioned in 4 reviews" format
                    var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
                    if (match) {
                        topics.push({
                            topic: match[1].trim(),
                            count: parseInt(match[2])
                        });
                    } else if (label && !label.toLowerCase().includes('all review')) {
                        // Fallback: try to extract from child spans
                        var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
                        var nameSpan = btn.querySelector('.uEubGf, span:first-child');
                        if (nameSpan) {
                            var name = nameSpan.textContent.trim();
                            var count = countSpan ? parseInt(countSpan.textContent) : 0;
                            if (name && name.toLowerCase() !== 'all') {
                                topics.push({topic: name, count: count || 0});
                            }
                        }
                    }
                }
            }

            return topics;
        """)
    except Exception as exc:
        # Topics are optional metadata: report the failure and carry on
        # without them. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        log.warn('scraper', f"Could not extract review topics: {exc}")
        return []
    return topics or []
|
|
|
|
# Initial page setup (pass validation_only to skip unnecessary steps)
|
|
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
|
|
|
|
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
|
|
# setup_reviews_page returns ("validation_done", None) in this case
|
|
if validation_only or scroll_container == "validation_done":
|
|
# Use the business info captured from Overview (before clicking reviews tab)
|
|
business_info = business_info_cache[0] or {}
|
|
|
|
return {
|
|
"reviews": [],
|
|
"total": total_reviews[0] or 0,
|
|
"scrolls": 0,
|
|
"error": None,
|
|
"validation_info": {
|
|
"name": business_info.get("name"),
|
|
"rating": business_info.get("rating"),
|
|
"category": business_info.get("category"),
|
|
"address": business_info.get("address"),
|
|
"total_reviews": total_reviews[0]
|
|
},
|
|
"session_fingerprint": session_fingerprint # Browser fingerprint for bot detection analysis
|
|
}
|
|
|
|
if not scroll_container:
|
|
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found", "session_fingerprint": session_fingerprint}
|
|
|
|
# Extract review topics after reviews tab is loaded (before scrolling begins)
|
|
time.sleep(0.5) # Brief wait for topic filters to render
|
|
review_topics = extract_review_topics()
|
|
if review_topics:
|
|
log.info('scraper', f"Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...", metrics={'topic_count': len(review_topics)})
|
|
|
|
def get_api_reviews():
    """Drain the intercepted API responses and parse reviews out of them.

    The in-page buffer ``window.__interceptedResponses`` is read and reset
    in a single script call, so each response is handed over exactly once.
    Every response body is passed to ``extract_reviews_from_api_body``.

    Returns:
        A list with the reviews parsed from all drained responses. Empty
        when nothing was intercepted or the buffer could not be read.
    """
    try:
        responses = driver.execute_script("""
            var r = window.__interceptedResponses || [];
            window.__interceptedResponses = [];
            return r;
        """)
    except Exception as exc:
        log.warn('network', f"Could not read intercepted API responses: {exc}")
        return []

    api_revs = []
    for resp in responses or []:
        # Handle each response on its own: the buffer has already been
        # cleared in the browser, so aborting the loop on one bad body would
        # permanently lose the reviews in every response after it.
        try:
            body = resp.get("body", "")
            api_revs.extend(extract_reviews_from_api_body(body))
        except Exception as exc:
            log.warn('network', f"Skipping unparseable API response: {exc}")
    return api_revs
|
|
|
|
# Captcha detection helper
|
|
def detect_captcha():
    """Check whether a captcha or challenge is blocking the page.

    Returns:
        ``'recaptcha'`` if a reCAPTCHA iframe is present,
        ``'unusual_traffic'`` if the page text contains "unusual traffic"
        or "not a robot", ``'challenge'`` if an iframe whose src contains
        "challenge" is present, otherwise ``None``. ``None`` is also
        returned when the check itself fails.
    """
    try:
        return driver.execute_script("""
            // Check for reCAPTCHA iframe or checkbox
            var recaptcha = document.querySelector('iframe[src*="recaptcha"], iframe[title*="reCAPTCHA"]');
            if (recaptcha) return 'recaptcha';

            // Check for "unusual traffic" message
            var body = document.body ? document.body.innerText : '';
            if (body.includes('unusual traffic') || body.includes('not a robot')) return 'unusual_traffic';

            // Check for challenge frame
            var challenge = document.querySelector('iframe[src*="challenge"]');
            if (challenge) return 'challenge';

            return null;
        """)
    except Exception as exc:
        # A failed probe is reported as "no captcha" so the caller can go on
        # with its recovery path, but the failure is logged, not hidden.
        log.warn('browser', f"Captcha check failed: {exc}")
        return None
|
|
|
|
# Recovery function - use real mouse actions when stuck
|
|
from selenium.webdriver.common.action_chains import ActionChains
|
|
from selenium.webdriver.common.keys import Keys
|
|
recovery_count = [0]
|
|
|
|
def unstick_scroll():
    """Try one soft recovery technique when scrolling appears stuck.

    Each call increments ``recovery_count[0]`` and picks a technique from
    ``recovery_count[0] % 4``, so successive calls rotate through:

    1. click the pane and send two Page Down key presses;
    2. a real mouse-wheel scroll of 800 px over the pane;
    3. scroll up by 2000 px, pause, then jump back to the bottom;
    0. scroll the last review card into view, pause, then jump to the bottom
       (no click, to avoid opening a reviewer profile).

    Failures are logged and otherwise ignored; the caller simply tries again
    on a later cycle.
    """
    recovery_count[0] += 1
    method = recovery_count[0] % 4

    # Shared by the last two techniques: pin the pane to its bottom.
    scroll_to_bottom_js = """
        var p = window.scrollablePane;
        if (p) p.scrollTop = p.scrollHeight;
    """

    try:
        if method == 1:
            # Method 1: Click pane and send Page Down keys
            scroll_container.click()
            for _ in range(2):
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        elif method == 2:
            # Method 2: Real mouse wheel scroll
            wheel = ActionChains(driver).move_to_element(scroll_container)
            wheel.scroll_by_amount(0, 800).perform()
        elif method == 3:
            # Method 3: Scroll up significantly then back down (force reload)
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000);
            """)
            time.sleep(0.3)
            driver.execute_script(scroll_to_bottom_js)
        else:
            # Method 4: Scroll last card into view, then scroll pane
            # (no click to avoid opening profile)
            driver.execute_script("""
                var cards = document.querySelectorAll('[data-review-id]');
                if (cards.length > 0) {
                    cards[cards.length - 1].scrollIntoView({block: 'end', behavior: 'smooth'});
                }
            """)
            time.sleep(0.3)
            driver.execute_script(scroll_to_bottom_js)
    except Exception as exc:
        # Recovery is best effort, but keep a trace of what went wrong.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        log.warn('browser', f"Recovery method {method} failed: {exc}")
|
|
|
|
def do_hard_refresh():
|
|
"""Hard refresh the page and re-setup everything. Returns True on success."""
|
|
nonlocal scroll_container, stop_scrolling
|
|
hard_refresh_count[0] += 1
|
|
|
|
if hard_refresh_count[0] > max_hard_refreshes:
|
|
log.warn('system', f"Max hard refreshes ({max_hard_refreshes}) reached, giving up", metrics={'hard_refresh_count': hard_refresh_count[0]})
|
|
return False
|
|
|
|
# Stop current scroll worker
|
|
stop_scrolling.set()
|
|
time.sleep(0.2)
|
|
|
|
# Re-setup page
|
|
new_container, new_stop = setup_reviews_page(is_refresh=True)
|
|
if new_container:
|
|
scroll_container = new_container
|
|
stop_scrolling = new_stop
|
|
recovery_count[0] = 0 # Reset recovery count after successful refresh
|
|
log.info('browser', f"Hard refresh successful, resuming with {len(seen_ids)} reviews already collected", metrics={'reviews_collected': len(seen_ids)})
|
|
return True
|
|
else:
|
|
log.error('browser', "Hard refresh failed to find scroll container")
|
|
return False
|
|
|
|
# Main collection loop
|
|
last_new_time = time.time()
|
|
last_count = len(reviews)
|
|
check_num = 0
|
|
start_time = time.time()
|
|
|
|
# Crash detection: metrics sampling
|
|
metrics_history = []
|
|
last_sample_time = time.time()
|
|
scroll_count = [0] # Track scroll operations for crash reports
|
|
|
|
log.info('browser', f"Scrolling... (timeout: {timeout_no_new}s with no new)", metrics={'timeout_seconds': timeout_no_new})
|
|
|
|
cycle_start = time.time()
|
|
while True:
|
|
check_num += 1
|
|
time.sleep(1.0) # Check every second
|
|
|
|
# TIMING: Track cycle performance
|
|
t0 = time.time()
|
|
cycle_delta = t0 - cycle_start
|
|
cycle_start = t0
|
|
|
|
# CRASH DETECTION: Sample metrics every 5 seconds
|
|
if time.time() - last_sample_time >= 5:
|
|
current_count_for_metrics = total_flushed[0] + len(reviews)
|
|
metrics_history.append({
|
|
'timestamp_ms': int(time.time() * 1000),
|
|
'memory_mb': get_chrome_memory(driver),
|
|
'dom_nodes': get_dom_node_count(driver),
|
|
'reviews_count': current_count_for_metrics
|
|
})
|
|
# Keep only last 100 samples
|
|
metrics_history = metrics_history[-100:]
|
|
last_sample_time = time.time()
|
|
|
|
# Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
|
|
# Use review_id as key to avoid duplicates with DOM
|
|
t1 = time.time()
|
|
for rev in get_api_reviews():
|
|
rid = rev.get('review_id', '')
|
|
if rid and rid not in seen_ids:
|
|
reviews[rid] = rev
|
|
seen_ids.add(rid)
|
|
api_time = time.time() - t1
|
|
|
|
# Expand any new "More" buttons for full text (batch click, fast)
|
|
try:
|
|
driver.execute_script("""
|
|
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
|
for (var i = 0; i < buttons.length; i++) {
|
|
if (buttons[i].textContent.trim() === 'More') {
|
|
buttons[i].click();
|
|
}
|
|
}
|
|
""")
|
|
except:
|
|
pass
|
|
|
|
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
|
# This survives Google's CSS class name changes
|
|
# MEMORY FIX: Actually remove processed cards from DOM (not just hide)
|
|
# Keep last N cards for scroll continuity
|
|
t2 = time.time()
|
|
dom_cards = 0
|
|
try:
|
|
seen_list = list(seen_ids)
|
|
parsed_reviews = driver.execute_script("""
|
|
var seenSet = new Set(arguments[0]);
|
|
var results = [];
|
|
var processedIds = new Set();
|
|
var sepsRemoved = 0;
|
|
var cardsRemoved = 0;
|
|
var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference
|
|
|
|
// ROBUST: Find cards by data attribute only (not class names)
|
|
var cards = document.querySelectorAll('[data-review-id]');
|
|
var cardsArray = Array.from(cards);
|
|
var totalCards = cardsArray.length;
|
|
|
|
for (var i = 0; i < cardsArray.length; i++) {
|
|
var card = cardsArray[i];
|
|
var rid = card.getAttribute('data-review-id');
|
|
var isHidden = card.style.display === 'none';
|
|
var isNearEnd = i >= totalCards - KEEP_LAST_N;
|
|
|
|
// AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end
|
|
// This prevents memory buildup that causes tab crashes
|
|
if (isHidden && !isNearEnd) {
|
|
// Remove separators first
|
|
var sibling = card.nextElementSibling;
|
|
while (sibling) {
|
|
var nextSib = sibling.nextElementSibling;
|
|
var classes = sibling.className || '';
|
|
if (classes.includes('AyRUI') || classes.includes('TFQHme')) {
|
|
sibling.remove();
|
|
sepsRemoved++;
|
|
sibling = nextSib;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
// Remove the card itself from DOM
|
|
card.remove();
|
|
cardsRemoved++;
|
|
continue;
|
|
}
|
|
|
|
// Skip already hidden cards near end (keep for scroll reference)
|
|
if (isHidden) continue;
|
|
|
|
// Skip if no ID or already processed this cycle
|
|
if (!rid || processedIds.has(rid)) continue;
|
|
|
|
// Only process top-level review cards (have aria-label with author name)
|
|
if (!card.getAttribute('aria-label')) continue;
|
|
processedIds.add(rid);
|
|
|
|
// Already seen from API - just track order, skip content
|
|
// BUT still hide the card to keep DOM light!
|
|
if (seenSet.has(rid)) {
|
|
results.push({id: rid, orderOnly: true});
|
|
// Hide this card since we already have its data from API
|
|
card.style.display = 'none';
|
|
card.innerHTML = '';
|
|
continue;
|
|
}
|
|
|
|
var author = '', text = '', rating = 0, timestamp = '';
|
|
|
|
// AUTHOR: Extract from "Photo of {Name}" button aria-label
|
|
var photoBtn = card.querySelector('button[aria-label^="Photo of"]');
|
|
if (photoBtn) {
|
|
author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim();
|
|
}
|
|
// Fallback: card's own aria-label is the author name
|
|
if (!author) {
|
|
author = card.getAttribute('aria-label') || '';
|
|
}
|
|
|
|
// RATING: span with role="img" and aria-label containing "star"
|
|
var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]');
|
|
if (ratingEl) {
|
|
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
|
|
if (match) rating = parseInt(match[1]);
|
|
}
|
|
|
|
// TIMESTAMP: Find span with "X time ago" pattern
|
|
var spans = card.querySelectorAll('span');
|
|
for (var j = 0; j < spans.length; j++) {
|
|
var spanText = spans[j].textContent.trim();
|
|
if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) {
|
|
timestamp = spanText;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// TEXT: Find longest text span (not timestamp/UI elements)
|
|
var longestText = '';
|
|
for (var j = 0; j < spans.length; j++) {
|
|
var spanText = spans[j].textContent.trim();
|
|
if (spanText === timestamp) continue;
|
|
if (spanText.match(/^\\d+ stars?$/i)) continue;
|
|
if (spanText === 'More' || spanText === 'Less') continue;
|
|
if (spanText.match(/^(Like\\d*|Share)$/)) continue;
|
|
if (spanText.length > longestText.length && spanText.length > 10) {
|
|
longestText = spanText;
|
|
}
|
|
}
|
|
text = longestText;
|
|
|
|
if (author && rating >= 1 && rating <= 5) {
|
|
results.push({
|
|
id: rid,
|
|
orderOnly: false,
|
|
author: author,
|
|
text: text,
|
|
rating: rating,
|
|
timestamp: timestamp,
|
|
source: 'dom'
|
|
});
|
|
}
|
|
|
|
// Mark card as processed (hide + clear) - will be removed on next cycle
|
|
// Keep near-end cards visible for scroll reference
|
|
if (!isNearEnd) {
|
|
card.style.display = 'none';
|
|
card.innerHTML = '';
|
|
}
|
|
}
|
|
return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved};
|
|
""", seen_list)
|
|
|
|
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
|
|
cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0
|
|
if cards_removed > 0:
|
|
log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed})
|
|
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
|
|
for rev in new_reviews:
|
|
rid = rev.pop('id')
|
|
order_only = rev.pop('orderOnly', False)
|
|
# Track DOM order for ALL reviews (for sorting output)
|
|
if rid not in review_order:
|
|
review_order[rid] = order_counter[0]
|
|
order_counter[0] += 1
|
|
# Only add content for new reviews (not already from API)
|
|
if not order_only:
|
|
reviews[rid] = rev
|
|
seen_ids.add(rid)
|
|
except Exception as e:
|
|
log.error('scraper', f"DOM parse error: {e}")
|
|
dom_time = time.time() - t2
|
|
|
|
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
|
# Sort by DOM order before flushing
|
|
t3 = time.time()
|
|
if flush_callback and len(reviews) >= flush_batch_size:
|
|
log.info('scraper', f"Flushing {len(reviews)} reviews to disk...", metrics={'batch_size': len(reviews), 'source': 'flush'})
|
|
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
|
flush_callback([r for _, r in sorted_reviews])
|
|
total_flushed[0] += len(reviews)
|
|
reviews.clear() # Free memory, but keep seen_ids and review_order
|
|
flush_time = time.time() - t3
|
|
|
|
current_count = total_flushed[0] + len(reviews)
|
|
|
|
# TIMING: Print if cycle is slow (>2s)
|
|
if cycle_delta > 2.0:
|
|
log.warn('system', f"SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})", metrics={'cycle_time_s': cycle_delta, 'api_time_s': api_time, 'dom_time_s': dom_time, 'dom_cards': dom_cards, 'seen_count': len(seen_ids)})
|
|
|
|
# Check for new reviews
|
|
if current_count > last_count:
|
|
last_new_time = time.time()
|
|
last_count = current_count
|
|
|
|
# Check if loading (spinner visible OR network activity)
|
|
try:
|
|
loading_status = driver.execute_script("""
|
|
var status = {spinner: false, network: false};
|
|
// Check for Google's loading indicators
|
|
var spinner = document.querySelector('div[role="progressbar"]');
|
|
if (spinner && spinner.offsetParent !== null) status.spinner = true;
|
|
var loading = document.querySelector('.qjESne, .loading');
|
|
if (loading && loading.offsetParent !== null) status.spinner = true;
|
|
// Check for recent network activity (API interceptor)
|
|
var responses = window.__interceptedResponses || [];
|
|
var lastCount = window.__lastResponseCount || 0;
|
|
if (responses.length > lastCount) {
|
|
status.network = true;
|
|
window.__lastResponseCount = responses.length;
|
|
}
|
|
return status;
|
|
""")
|
|
is_loading = loading_status.get('spinner') or loading_status.get('network')
|
|
if is_loading:
|
|
last_new_time = time.time() # Reset timer while loading
|
|
except:
|
|
is_loading = False
|
|
|
|
# Progress update
|
|
elapsed = time.time() - last_new_time
|
|
if total_reviews[0]:
|
|
pct = (current_count / total_reviews[0]) * 100
|
|
log.info('scraper', f"{current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", metrics={'reviews_count': current_count, 'total_reviews': total_reviews[0], 'progress_pct': pct, 'idle_seconds': elapsed})
|
|
else:
|
|
log.info('scraper', f"{current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", metrics={'reviews_count': current_count, 'idle_seconds': elapsed})
|
|
|
|
# Call progress callback on every iteration (for real-time log updates)
|
|
if progress_callback:
|
|
progress_callback(current_count, total_reviews[0])
|
|
|
|
# Stop conditions - check BEFORE recovery attempts
|
|
if current_count >= max_reviews:
|
|
log.info('scraper', f"Reached max: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
|
|
stop_scrolling.set()
|
|
break
|
|
|
|
# Also stop if we have all reviews from the page
|
|
if total_reviews[0] and current_count >= total_reviews[0]:
|
|
log.info('scraper', f"All {current_count} reviews collected", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
|
|
stop_scrolling.set()
|
|
break
|
|
|
|
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
|
|
# Only if we haven't collected all reviews yet
|
|
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
|
# After 8+ failed recovery attempts, try hard refresh
|
|
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
|
|
# Check for captcha before hard refresh - no point refreshing if blocked
|
|
captcha_type = detect_captcha()
|
|
if captcha_type:
|
|
log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
|
|
stop_scrolling.set()
|
|
return {
|
|
"reviews": [],
|
|
"total": current_count,
|
|
"error": f"Captcha detected: {captcha_type}. Please solve manually and retry.",
|
|
"captcha_detected": True
|
|
}
|
|
|
|
log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]})
|
|
if do_hard_refresh():
|
|
last_new_time = time.time() # Reset timer after refresh
|
|
continue # Skip to next iteration
|
|
else:
|
|
log.info('browser', f"Recovery attempt #{recovery_count[0] + 1}...", metrics={'recovery_attempt': recovery_count[0] + 1})
|
|
unstick_scroll()
|
|
|
|
# Check scroll state - track if content is still being added
|
|
try:
|
|
scroll_state = driver.execute_script("""
|
|
var p = window.scrollablePane;
|
|
if (!p) return {atBottom: true, height: 0};
|
|
var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50);
|
|
var height = p.scrollHeight;
|
|
var lastHeight = window.__lastScrollHeight || 0;
|
|
var growing = height > lastHeight;
|
|
window.__lastScrollHeight = height;
|
|
return {atBottom: atBottom, height: height, growing: growing};
|
|
""")
|
|
at_bottom = scroll_state.get('atBottom', True)
|
|
content_growing = scroll_state.get('growing', False)
|
|
except:
|
|
at_bottom = True
|
|
content_growing = False
|
|
|
|
# Reset timer if content is growing (new reviews loading)
|
|
if content_growing:
|
|
last_new_time = time.time()
|
|
|
|
# Dynamic timeout based on state and recovery attempts
|
|
# - Try hard refresh before giving up if we still have refreshes left
|
|
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
|
|
# - 15s max otherwise (keep trying)
|
|
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
|
|
truly_done = at_bottom and not content_growing and recovery_failed
|
|
timeout_hit = elapsed >= timeout_no_new
|
|
|
|
if truly_done or timeout_hit:
|
|
# Check if we're close enough to total (95%+ threshold)
|
|
# If we have 95%+ of reviews, don't waste time with hard refreshes
|
|
close_enough = False
|
|
if total_reviews[0] and current_count > 0:
|
|
pct_complete = (current_count / total_reviews[0]) * 100
|
|
close_enough = pct_complete >= 95
|
|
if close_enough:
|
|
log.info('scraper', f"Close enough ({pct_complete:.1f}% complete), skipping further retries", metrics={'pct_complete': pct_complete})
|
|
|
|
# Last chance: try hard refresh before giving up (only if not close enough)
|
|
if not close_enough and hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
|
# Check for captcha first
|
|
captcha_type = detect_captcha()
|
|
if captcha_type:
|
|
log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
|
|
stop_scrolling.set()
|
|
break
|
|
|
|
log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed})
|
|
if do_hard_refresh():
|
|
last_new_time = time.time()
|
|
continue # Keep trying
|
|
log.info('scraper', f"All reviews loaded: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
|
|
stop_scrolling.set()
|
|
break
|
|
|
|
# Flush any remaining reviews (sorted by DOM order)
|
|
if flush_callback and reviews:
|
|
log.info('scraper', f"Final flush: {len(reviews)} reviews...", metrics={'batch_size': len(reviews), 'source': 'final_flush'})
|
|
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
|
flush_callback([r for _, r in sorted_reviews])
|
|
total_flushed[0] += len(reviews)
|
|
reviews.clear()
|
|
|
|
# Reviews already parsed during scrolling (real-time parsing)
|
|
log.info('scraper', "Finalizing review data...")
|
|
|
|
# Final results (sorted by DOM order)
|
|
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
|
review_list = [r for _, r in sorted_items]
|
|
grand_total = total_flushed[0] + len(review_list)
|
|
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
|
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
|
|
|
if total_flushed[0] > 0:
|
|
log.info('scraper', f"Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})", metrics={'total_reviews': grand_total, 'flushed_count': total_flushed[0], 'in_memory_count': len(review_list), 'elapsed_seconds': time.time() - start_time})
|
|
else:
|
|
log.info('scraper', f"Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})", metrics={'total_reviews': len(review_list), 'dom_count': dom_count, 'api_count': api_count, 'elapsed_seconds': time.time() - start_time})
|
|
|
|
# Infer topics for each review if review_topics is available
|
|
if review_topics:
|
|
log.info('scraper', f"Inferring topics for {len(review_list)} reviews...", metrics={'reviews_count': len(review_list)})
|
|
topics_inferred_count = 0
|
|
for review in review_list:
|
|
review_text = review.get("text", "")
|
|
matched = infer_review_topics(review_text, review_topics)
|
|
review["topics"] = matched
|
|
if matched:
|
|
topics_inferred_count += 1
|
|
log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
|
|
|
|
return {
|
|
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
|
"total": grand_total,
|
|
"total_flushed": total_flushed[0],
|
|
"checks": check_num,
|
|
"url": url,
|
|
"logs": log.get_logs(),
|
|
"review_topics": review_topics, # Topic filters with mention counts
|
|
"metrics_history": metrics_history, # For crash detection
|
|
"start_time": start_time, # For crash report elapsed time
|
|
"session_fingerprint": session_fingerprint # Browser fingerprint for bot detection analysis
|
|
}
|
|
|
|
|
|
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
|
progress_callback=None, driver=None, return_driver: bool = False,
|
|
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
|
|
browser_fingerprint: dict = None):
|
|
"""
|
|
Production-compatible wrapper for scrape_reviews.
|
|
Matches the API expected by job_manager.py.
|
|
|
|
Args:
|
|
url: Google Maps URL to scrape
|
|
headless: Run Chrome in headless mode
|
|
max_scrolls: Not used (kept for API compatibility)
|
|
progress_callback: Optional callback(current_count, total_count) for progress
|
|
driver: Existing driver instance to reuse
|
|
return_driver: If True, return driver in result
|
|
log_capture: Optional LogCapture instance for real-time log access
|
|
browser_fingerprint: Optional dict with user's browser fingerprint:
|
|
- geolocation: {lat, lng}
|
|
- userAgent: string
|
|
- viewport: {width, height}
|
|
- timezone: string (e.g., "Europe/Madrid")
|
|
- language: string (e.g., "en-US")
|
|
- platform: string (e.g., "MacIntel")
|
|
|
|
Returns:
|
|
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
|
"""
|
|
from seleniumbase import Driver
|
|
|
|
start_time = time.time()
|
|
driver_provided = driver is not None
|
|
should_close_driver = not return_driver and not driver_provided
|
|
|
|
# Use provided log_capture or create new one
|
|
log_capture = log_capture or LogCapture()
|
|
|
|
try:
|
|
# Extract fingerprint settings
|
|
fp = browser_fingerprint or {}
|
|
user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
|
|
geolocation = fp.get('geolocation')
|
|
timezone = fp.get('timezone')
|
|
language = fp.get('language', 'en-US')
|
|
|
|
# Create driver if not provided
|
|
if not driver:
|
|
driver = Driver(
|
|
uc=True,
|
|
headless=headless,
|
|
page_load_strategy="normal",
|
|
agent=user_agent # Use user's actual user agent
|
|
)
|
|
# Set viewport to match user's screen
|
|
driver.set_window_size(viewport['width'], viewport['height'])
|
|
|
|
# Apply browser fingerprint settings via CDP
|
|
try:
|
|
# Set timezone if provided
|
|
if timezone:
|
|
driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
|
|
log_capture.info('browser', f"Set timezone to {timezone}")
|
|
|
|
# Set locale/language
|
|
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
|
|
|
|
# Set geolocation
|
|
if geolocation and 'lat' in geolocation and 'lng' in geolocation:
|
|
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
|
'latitude': geolocation['lat'],
|
|
'longitude': geolocation['lng'],
|
|
'accuracy': 1000 # ~1km accuracy for IP-based location
|
|
})
|
|
log_capture.info('browser', f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})", metrics={'lat': geolocation['lat'], 'lng': geolocation['lng']})
|
|
else:
|
|
# Default to US (Boston, MA) if no geolocation provided
|
|
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
|
'latitude': 42.3601,
|
|
'longitude': -71.0589,
|
|
'accuracy': 100
|
|
})
|
|
log_capture.info('browser', "Set geolocation to US (Boston, MA) [default]", metrics={'lat': 42.3601, 'lng': -71.0589})
|
|
|
|
if fp:
|
|
log_capture.info('browser', f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}", metrics={'viewport_width': viewport['width'], 'viewport_height': viewport['height']})
|
|
except Exception as e:
|
|
log_capture.warn('system', f"Could not apply fingerprint settings: {e}")
|
|
|
|
# Add URL parameters for consistent results
|
|
if 'hl=' not in url:
|
|
separator = '&' if '?' in url else '?'
|
|
url = f"{url}{separator}hl=en"
|
|
if 'gl=' not in url:
|
|
url = f"{url}&gl=us"
|
|
|
|
# Create combined flush callback for progress + external handler
|
|
external_flush = flush_callback # Save external callback
|
|
internal_flush = None
|
|
if progress_callback or external_flush:
|
|
collected = [0]
|
|
def combined_flush(reviews_batch):
|
|
collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
|
|
if progress_callback:
|
|
progress_callback(collected[0], None)
|
|
if external_flush:
|
|
external_flush(reviews_batch) # Pass reviews to external handler
|
|
internal_flush = combined_flush
|
|
|
|
# Run the scraper with progress callback for real-time updates
|
|
result = scrape_reviews(
|
|
driver=driver,
|
|
url=url,
|
|
max_reviews=999999, # Effectively unlimited
|
|
timeout_no_new=15,
|
|
flush_callback=internal_flush,
|
|
flush_batch_size=100, # Smaller batches for more frequent progress
|
|
log_capture=log_capture,
|
|
progress_callback=progress_callback, # Pass through for real-time log updates
|
|
validation_only=validation_only # Return early if just validating
|
|
)
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
# Return in expected format
|
|
response = {
|
|
"reviews": result.get("reviews", []),
|
|
"count": result.get("total", 0),
|
|
"total_reviews": result.get("total", 0),
|
|
"time": elapsed,
|
|
"success": True,
|
|
"error": None,
|
|
"logs": result.get("logs", []),
|
|
"review_topics": result.get("review_topics", []), # Topic filters with mention counts
|
|
"session_fingerprint": result.get("session_fingerprint") # Browser fingerprint for bot detection
|
|
}
|
|
|
|
# Include validation_info if in validation_only mode
|
|
if validation_only and "validation_info" in result:
|
|
response["validation_info"] = result["validation_info"]
|
|
|
|
if return_driver:
|
|
response["driver"] = driver
|
|
elif should_close_driver:
|
|
try:
|
|
driver.quit()
|
|
except:
|
|
pass
|
|
|
|
return response
|
|
|
|
except Exception as e:
|
|
elapsed = time.time() - start_time
|
|
|
|
# CRASH DETECTION: Build crash report before closing driver
|
|
crash_report = None
|
|
try:
|
|
if driver:
|
|
# Try to sample final metrics from the browser
|
|
final_metrics = {
|
|
'timestamp_ms': int(time.time() * 1000),
|
|
'memory_mb': get_chrome_memory(driver),
|
|
'dom_nodes': get_dom_node_count(driver)
|
|
}
|
|
# Build crash report with available information
|
|
crash_report = {
|
|
'crash_type': classify_crash(e, [final_metrics]),
|
|
'error_message': str(e),
|
|
'state': {
|
|
'reviews_extracted': 0, # Unknown at crash time
|
|
'total_expected': None,
|
|
'scroll_count': 0,
|
|
'elapsed_seconds': elapsed
|
|
},
|
|
'metrics_history': [final_metrics],
|
|
'logs_before_crash': log_capture.get_logs()[-20:] if log_capture else [],
|
|
'last_successful_review_id': None
|
|
}
|
|
log_capture.error('system', f"Crash detected: {crash_report['crash_type']}",
|
|
metrics={'error': str(e), 'elapsed_seconds': elapsed})
|
|
except:
|
|
# If we can't build crash report, continue with basic error handling
|
|
pass
|
|
|
|
if should_close_driver and driver:
|
|
try:
|
|
driver.quit()
|
|
except:
|
|
pass
|
|
|
|
# Log error to the existing log_capture
|
|
log_capture.error('system', f"Scraper failed: {str(e)}")
|
|
|
|
result = {
|
|
"reviews": [],
|
|
"count": 0,
|
|
"total_reviews": 0,
|
|
"time": elapsed,
|
|
"success": False,
|
|
"error": str(e),
|
|
"driver": driver if return_driver else None,
|
|
"logs": log_capture.get_logs()
|
|
}
|
|
|
|
# Include crash report if available
|
|
if crash_report:
|
|
result['crash_report'] = crash_report
|
|
|
|
return result
|
|
|
|
|
|
def extract_about_info(driver, url: str = None) -> dict:
    """
    Read the "About" tab of a Google Maps place (Accessibility, Amenities, ...).

    Opening the About tab moves the page away from whatever tab was active,
    so run this only after any review scraping on the same driver is done.

    Args:
        driver: Selenium WebDriver instance. When ``url`` is omitted it is
            expected to already be showing the business page.
        url: Optional place URL to load first. ``hl=en`` / ``gl=us`` are
            appended when missing because the selectors below match
            English labels.

    Returns:
        ``{section_name: [{"feature": str, "available": bool}, ...]}``.
        An empty dict when no About tab could be clicked or nothing was
        extracted, and ``{"error": message}`` when any step raised.
    """
    # Script 1: locate and click the About tab; evaluates to true on success.
    open_about_tab_js = """
        // Try multiple selectors for about tab
        var selectors = [
            'button[aria-label*="About"]',
            'button[data-tab-index="2"]',
            'div[role="tablist"] button:nth-child(3)',
            'button[jsaction*="about"]'
        ];

        for (var sel of selectors) {
            var btn = document.querySelector(sel);
            if (btn && btn.textContent.toLowerCase().includes('about')) {
                btn.click();
                return true;
            }
        }

        // Fallback: find by text content
        var buttons = document.querySelectorAll('button');
        for (var btn of buttons) {
            if (btn.textContent.trim().toLowerCase() === 'about') {
                btn.click();
                return true;
            }
        }
        return false;
    """

    # Script 2: walk every <h2> section and collect its aria-labelled items.
    collect_sections_js = """
        var about = {};

        // Find the about region by aria-label or role
        var container = document.querySelector('div[role="region"][aria-label*="About"]');

        if (!container) {
            // Fallback: look for the scrollable area with sections
            container = document.querySelector('.m6QErb[aria-label*="About"]');
        }

        if (!container) {
            // Last resort: find sections by h2 headers
            container = document;
        }

        // Find all section headers (h2 elements)
        var sections = container.querySelectorAll('h2');

        for (var h2 of sections) {
            var sectionName = h2.textContent.trim();
            var items = [];

            // Find the ul list following this h2
            var parent = h2.closest('.iP2t7d, div');
            if (parent) {
                var listItems = parent.querySelectorAll('li span[aria-label]');
                for (var li of listItems) {
                    var label = li.getAttribute('aria-label');
                    if (label) {
                        // Parse "Has toilet" or "No wheelchair-accessible car park"
                        var hasFeature = !label.toLowerCase().startsWith('no ');
                        var featureName = label.replace(/^(Has |No )/i, '');
                        items.push({
                            feature: featureName,
                            available: hasFeature
                        });
                    }
                }
            }

            if (sectionName && items.length > 0) {
                about[sectionName] = items;
            }
        }

        return about;
    """

    try:
        if url:
            # Force the English UI so the label matching above applies.
            if 'hl=' not in url:
                joiner = '?' if '?' not in url else '&'
                url = url + joiner + "hl=en"
            if 'gl=' not in url:
                url = url + "&gl=us"
            driver.get(url)
            time.sleep(1)

        tab_opened = driver.execute_script(open_about_tab_js)
        if tab_opened:
            time.sleep(1.5)  # Give the About panel time to render
            sections = driver.execute_script(collect_sections_js)
            return sections if sections else {}
        return {}

    except Exception as exc:
        return {"error": str(exc)}
|
|
|
|
|
|
# Test function
|
|
if __name__ == "__main__":
    # Manual smoke test: opens a visible browser, scrapes one small listing
    # and prints a summary plus the first review. Only runs when this module
    # is executed directly.
    from seleniumbase import Driver

    # Test URL - 79 reviews
    TEST_URL = "https://www.google.com/maps/place/R.+Fleitas+Peluqueros/@28.1302986,-15.4448111,821m/data=!3m1!1e3!4m6!3m5!1s0xc40951a43c21f19:0x85f89601b9909c72!8m2!3d28.1299805!4d-15.4436854!16s%2Fg%2F11gbwtk8c8"

    print("🚀 Starting clean scraper test...")

    # Set up driver
    driver = Driver(uc=True, headless=False)
    driver.set_window_size(1200, 900)

    try:
        outcome = scrape_reviews(driver, TEST_URL, max_reviews=100, timeout_no_new=15)
        print(f"\n✅ Got {outcome['total']} reviews in {outcome['checks']} checks")

        # Show sample
        scraped = outcome["reviews"]
        if scraped:
            first = scraped[0]
            print("\n📝 Sample review:")
            print(f" Author: {first['author']}")
            print(f" Rating: {first['rating']}⭐")
            if first['text']:
                print(f" Text: {first['text'][:100]}...")
            else:
                print(" Text: (none)")

    finally:
        # Always release the browser, even if scraping raised.
        driver.quit()
        print("\n🏁 Done")
|
|
|
|
|
|
# In-page extractor executed by get_business_card_info(). It returns an object
# with name / rating / total_reviews / address / category plus a `debug` list
# of the aria-labels, tab texts, title and URL it saw. The patterns assume the
# English UI (the caller forces hl=en).
_BUSINESS_CARD_INFO_JS = """
    var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};

    // Business name from h1
    var h1 = document.querySelector('h1');
    if (h1) result.name = h1.textContent.trim();

    // Category - use jsaction attribute (robust, survives class changes)
    var catBtn = document.querySelector('button[jsaction*="category"]');
    if (catBtn) result.category = catBtn.textContent.trim();

    // Fallback: look for button after rating that's not a link
    if (!result.category) {
        var buttons = document.querySelectorAll('button');
        for (var btn of buttons) {
            var text = btn.textContent.trim();
            // Categories are short words, no numbers, not navigation
            if (text && text.length < 50 && !text.match(/^[0-9]/) &&
                !text.match(/review|star|direction|save|share|photo/i)) {
                // Check if it's near the rating area
                var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
                if (parent) {
                    result.category = text;
                    break;
                }
            }
        }
    }

    // Rating from span[role="img"] aria-labels
    var spans = document.querySelectorAll('span[role="img"]');
    for (var i = 0; i < spans.length; i++) {
        var label = spans[i].getAttribute('aria-label') || '';

        // Collect debug info for all aria-labels
        if (label) {
            result.debug.push('img-aria: ' + label);
        }

        // Rating: "4.8 stars" (English forced via hl=en)
        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
        if (rMatch && !result.rating) {
            result.rating = parseFloat(rMatch[1].replace(',', '.'));
        }

        // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
        // Try direct format first: "79 reviews"
        var revMatch = label.match(/^([\\d,]+)\\s*review/i);
        if (revMatch && !result.total_reviews) {
            result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
        }

        // Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
        if (!result.total_reviews) {
            var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
            if (combinedMatch) {
                var countStr = combinedMatch[1].replace(/,/g, '');
                if (countStr.includes('k')) {
                    // Handle "9k+" format
                    result.total_reviews = parseInt(countStr) * 1000;
                } else {
                    result.total_reviews = parseInt(countStr);
                }
            }
        }
    }

    // Also collect tab button texts for debugging (include full text including numbers)
    var tabs = document.querySelectorAll('button[role="tab"]');
    for (var j = 0; j < tabs.length; j++) {
        var tabText = tabs[j].textContent.trim();
        result.debug.push('tab: ' + tabText);
        // Also try to extract review count from tab text like "Reviews (79)"
        if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
            var tabMatch = tabText.match(/\\((\\d+)\\)/);
            if (tabMatch) {
                result.total_reviews = parseInt(tabMatch[1]);
                result.debug.push('Found reviews in tab: ' + tabText);
            }
        }
    }

    // Also check ALL buttons for reviews count
    var allButtons = document.querySelectorAll('button');
    for (var b = 0; b < allButtons.length; b++) {
        var btnText = allButtons[b].textContent || '';
        if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
            var numMatch = btnText.match(/\\((\\d+)\\)/);
            if (numMatch && !result.total_reviews) {
                result.total_reviews = parseInt(numMatch[1]);
                result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
            }
        }
    }

    // Check if we're on search results vs place page
    result.debug.push('title: ' + document.title);
    result.debug.push('url: ' + window.location.href.substring(0, 80));

    // Check for search results list
    var searchResults = document.querySelectorAll('div[role="feed"] > div');
    result.debug.push('search_results_count: ' + searchResults.length);

    // Fallback: Get review count from Reviews tab button "Reviews (79)"
    // Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
    if (!result.total_reviews) {
        var tabs = document.querySelectorAll('button[role="tab"]');
        for (var tab of tabs) {
            var text = tab.textContent.toLowerCase();
            if (text.includes('review')) {
                var match = tab.textContent.match(/\\((\\d+)\\)/);
                if (match) {
                    result.total_reviews = parseInt(match[1]);
                    break;
                }
            }
        }
    }

    // Fallback 2: Look for any button with "Reviews" and a number
    if (!result.total_reviews) {
        var buttons = document.querySelectorAll('button');
        for (var btn of buttons) {
            var text = btn.textContent;
            if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
                var numMatch = text.match(/\\((\\d+)\\)/);
                if (numMatch) {
                    result.total_reviews = parseInt(numMatch[1]);
                    break;
                }
            }
        }
    }

    // Address from button
    var addrBtn = document.querySelector('button[data-item-id="address"]');
    if (addrBtn) {
        var label = addrBtn.getAttribute('aria-label');
        if (label) result.address = label.replace(/^Address:\\s*/i, '');
    }

    return result;
"""


def _accept_consent_if_shown(driver, url: str, timeout: float = 5.0) -> None:
    """Wait up to ``timeout`` seconds for a Maps page, accepting a consent page if one appears.

    Polls ``driver.current_url`` every 10 ms. If the URL points at
    ``consent.google`` an accept button is clicked (best effort) and ``url``
    is loaded again; the wait ends after that single attempt either way.
    """
    started = time.time()
    while time.time() - started < timeout:
        if "consent.google" in driver.current_url:
            try:
                clicked = False

                # Method 1: Find by aria-label (most reliable for Google consent)
                for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
                    btn.click()
                    clicked = True
                    break

                # Method 2: Find by text content ("accept" also covers "accept all")
                if not clicked:
                    for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                        txt = btn.text.lower()
                        if any(word in txt for word in ("accept", "aceptar", "alle akzeptieren")):
                            btn.click()
                            clicked = True
                            break

                if clicked:
                    time.sleep(0.5)  # Brief wait for consent to process
                    driver.get(url)  # Reload the target URL
                    time.sleep(0.5)  # Wait for reload
            except Exception:
                # Best effort: a failed click must not abort the lookup.
                pass
            return

        current = driver.current_url
        if "maps/place" in current or ("maps" in current and "consent" not in current):
            return
        time.sleep(0.01)  # 10ms - responsive but low CPU


def _poll_business_card(driver, url: str, log, timeout: float = 10.0) -> dict:
    """Run the in-page extractor every 100 ms until name and review count are found.

    Returns the most recent dict produced by the extractor, or a dict of
    ``None`` fields if it never produced one within ``timeout`` seconds.
    """
    info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
    started = time.time()
    debug_logged = False
    while time.time() - started < timeout:
        try:
            polled = driver.execute_script(_BUSINESS_CARD_INFO_JS)
            # Keep the previous snapshot if the script yields nothing usable,
            # so a single bad poll cannot wipe out data or break .get() below.
            if isinstance(polled, dict):
                info = polled

            # Exit early if we have the essentials (name found AND reviews count > 0)
            total = info.get("total_reviews")
            if info.get("name") and total and total > 0:
                break

            # Log debug info once after 3 seconds
            if not debug_logged and time.time() - started > 3:
                debug_logged = True
                debug_info = info.get("debug", [])
                if debug_info:
                    log.info(f"🔍 Validation debug - URL: {url[:50]}...")
                    log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
                    for d in debug_info[:10]:  # First 10 debug items
                        log.info(f" {d}")
        except Exception:
            # Transient script/driver errors are expected while the page loads.
            pass
        time.sleep(0.1)  # 100ms between polls
    return info


def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> dict:
    """
    Extract business card info from Google Maps.

    Loads ``url`` (with ``hl=en`` / ``gl=us`` appended when missing), accepts
    a Google consent page if one appears, waits a fixed 2 seconds for the page
    to render and then polls the DOM for up to 10 seconds.

    Args:
        url: Google Maps place URL.
        headless: Passed to ``seleniumbase.Driver`` when a driver has to be
            created here; ignored when ``driver`` is supplied.
        driver: Existing WebDriver to reuse. It is never closed by this function.
        return_driver: When True the driver is left open and returned under
            the ``"driver"`` key so the caller can keep using (and later
            quit) it.

    Returns:
        dict with: name, address, rating, total_reviews, category, success,
        error, time - plus ``driver`` when ``return_driver`` is True.
        ``success`` is True only when a business name was found. Exceptions
        are reported through ``error`` instead of being raised.
    """
    import logging
    log = logging.getLogger(__name__)

    start_time = time.time()
    driver_provided = driver is not None
    should_close_driver = not return_driver and not driver_provided

    try:
        # Create driver if not provided
        if not driver:
            # Imported lazily so callers that supply a driver do not need
            # seleniumbase, and so an import failure is reported via "error".
            from seleniumbase import Driver
            driver = Driver(uc=True, headless=headless)

        # Set geolocation to US (best effort; not every driver supports CDP)
        try:
            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
            })
        except Exception:
            pass

        # Don't clear state - Google may serve different content based on session history
        # The scraper doesn't reset state, so validation shouldn't either

        # Force English interface for consistent parsing
        if 'hl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}hl=en"
        if 'gl=' not in url:
            # Recompute the separator: the URL may still lack a query string.
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}gl=us"

        # Navigate to URL
        driver.get(url)

        # Handle consent popup - poll with 10ms sleep (same as scrape_reviews)
        _accept_consent_if_shown(driver, url)

        # Log current URL after consent handling
        try:
            current_url = driver.current_url
            log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
        except Exception:
            pass

        # Wait for page to fully render before polling (tabs may load dynamically)
        time.sleep(2)

        # Poll for business info (same pattern as total_reviews extraction)
        # Timeout is 10s because Reviews tab can take 6+ seconds to appear after consent
        info = _poll_business_card(driver, url, log)

        # Final debug log if still no reviews
        if not info.get("total_reviews"):
            debug_info = info.get("debug", [])
            log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
            if debug_info:
                log.warning(f" Debug items: {debug_info[:10]}")

        card = {
            "name": info.get("name"),
            "address": info.get("address"),
            "rating": info.get("rating"),
            "total_reviews": info.get("total_reviews"),
            "category": info.get("category"),
            "success": bool(info.get("name")),
            "error": None,
            "time": time.time() - start_time
        }
        if return_driver:
            # The driver is deliberately left open, so hand it back.
            card["driver"] = driver
        return card

    except Exception as e:
        failure = {
            "name": None,
            "address": None,
            "rating": None,
            "total_reviews": None,
            "category": None,
            "success": False,
            "error": str(e),
            "time": time.time() - start_time
        }
        if return_driver:
            # May be None if driver creation itself failed.
            failure["driver"] = driver
        return failure

    finally:
        # Only close a driver created here that the caller did not ask to keep.
        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception:
                pass
|