Files
whyrating-engine-legacy/scrapers/google_reviews/v1_0_0.py
Alejandro Gutiérrez 6b3f055760 fix: Prevent Chrome tab crash by removing processed DOM cards
Root cause: Cards were hidden but not removed from DOM, causing
memory buildup (400+ nodes) that crashed Chrome tabs.

Changes:
- Actually remove processed cards from DOM (not just hide them)
- Keep last 50 cards for scroll reference/continuity
- Remove adjacent separator elements along with cards
- Add logging when DOM cleanup removes cards
- Cards near scroll end stay visible for reference

This should prevent "tab crashed" errors during long scraping
sessions with 500+ reviews.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 17:17:21 +00:00

2303 lines
95 KiB
Python

"""
Google Reviews Scraper v1.0.0
This module provides the core Google Maps reviews scraping functionality.
- Simple down scrolling
- DOM scraping + API interception
Version: 1.0.0
Migrated from: modules/scraper_clean.py
"""
import re
import json
import time
import threading
from datetime import datetime
from typing import List, Optional
from selenium.webdriver.common.by import By
from utils.logger import StructuredLogger
def get_chrome_memory(driver) -> Optional[int]:
    """Return Chrome's used JS heap size in whole megabytes, or None.

    Issues the CDP command ``Performance.getMetrics`` through
    ``driver.execute_cdp_cmd`` and reads the ``JSHeapUsedSize`` metric.

    Args:
        driver: WebDriver-like object exposing ``execute_cdp_cmd``.

    Returns:
        The metric value divided by 1024 * 1024 and truncated to int, or
        None when the metric is missing or the CDP call fails.
    """
    try:
        result = driver.execute_cdp_cmd('Performance.getMetrics', {})
        for metric in result.get('metrics', []):
            if metric.get('name') == 'JSHeapUsedSize':
                return int(metric['value'] / 1024 / 1024)
    except Exception:
        # Best-effort telemetry: a failed CDP call must not break scraping.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        pass
    return None
def get_dom_node_count(driver) -> Optional[int]:
    """Return the number of elements in the current document, or None.

    Args:
        driver: WebDriver-like object exposing ``execute_script``.

    Returns:
        The value of ``document.getElementsByTagName('*').length`` as
        reported by the browser, or None if the script call raises.
    """
    try:
        return driver.execute_script("return document.getElementsByTagName('*').length")
    except Exception:
        # Best-effort metric; `Exception` keeps Ctrl-C / SystemExit working.
        return None
def _probe_js(driver, script: str):
    """Run one fingerprint probe; return its result, or None if it raises."""
    try:
        return driver.execute_script(script)
    except Exception:
        # One unsupported probe must not abort the rest of the capture.
        return None


def capture_session_fingerprint(driver) -> dict:
    """
    Capture browser session fingerprint for bot detection analysis.

    Each attribute is read with its own JavaScript probe. A probe that
    fails leaves its field as None and does not affect the other probes
    (including the individual screen and viewport dimensions).

    Args:
        driver: Selenium WebDriver instance (must be initialized)

    Returns:
        Dictionary containing session fingerprint data. If reading the
        current URL or the initial navigation fails, the dictionary also
        carries a "capture_error" key holding the exception text and the
        remaining fields stay None.
    """
    fingerprint = {
        "user_agent": None,
        "platform": None,
        "language": None,
        "languages": None,
        "timezone": None,
        "screen": {
            "width": None,
            "height": None,
            "colorDepth": None
        },
        "viewport": {
            "width": None,
            "height": None
        },
        "webgl_vendor": None,
        "webgl_renderer": None,
        "canvas_fingerprint": None,
        "hardware_concurrency": None,
        "device_memory": None,
        "bot_detection_tests": {
            "webdriver_hidden": None,
            "chrome_runtime": None,
            "permissions_query": None
        },
        "captured_at": None
    }
    try:
        # Navigate to about:blank first to ensure we can execute JS
        # (in case driver was just created and hasn't navigated yet)
        current_url = driver.current_url
        if not current_url or current_url == "data:,":
            driver.get("about:blank")

        fingerprint["captured_at"] = datetime.now().isoformat()

        # Basic navigator properties
        for key, script in (
            ("user_agent", "return navigator.userAgent"),
            ("platform", "return navigator.platform"),
            ("language", "return navigator.language"),
            ("languages", "return navigator.languages"),
            ("timezone", "return Intl.DateTimeFormat().resolvedOptions().timeZone"),
        ):
            fingerprint[key] = _probe_js(driver, script)

        # Screen and viewport: one probe per dimension so a single failure
        # no longer discards the dimensions that follow it.
        for key, script in (
            ("width", "return screen.width"),
            ("height", "return screen.height"),
            ("colorDepth", "return screen.colorDepth"),
        ):
            fingerprint["screen"][key] = _probe_js(driver, script)
        for key, script in (
            ("width", "return window.innerWidth"),
            ("height", "return window.innerHeight"),
        ):
            fingerprint["viewport"][key] = _probe_js(driver, script)

        # WebGL vendor and renderer (important for fingerprinting)
        webgl_info = _probe_js(driver, """
            try {
                var canvas = document.createElement('canvas');
                var gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
                if (gl) {
                    var debugInfo = gl.getExtension('WEBGL_debug_renderer_info');
                    if (debugInfo) {
                        return {
                            vendor: gl.getParameter(debugInfo.UNMASKED_VENDOR_WEBGL),
                            renderer: gl.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL)
                        };
                    }
                }
            } catch(e) {}
            return {vendor: null, renderer: null};
        """)
        if isinstance(webgl_info, dict):
            fingerprint["webgl_vendor"] = webgl_info.get("vendor")
            fingerprint["webgl_renderer"] = webgl_info.get("renderer")

        # Canvas fingerprint (hash of canvas drawing)
        fingerprint["canvas_fingerprint"] = _probe_js(driver, """
            try {
                var canvas = document.createElement('canvas');
                canvas.width = 200;
                canvas.height = 50;
                var ctx = canvas.getContext('2d');
                ctx.textBaseline = 'top';
                ctx.font = '14px Arial';
                ctx.fillStyle = '#f60';
                ctx.fillRect(125, 1, 62, 20);
                ctx.fillStyle = '#069';
                ctx.fillText('Fingerprint', 2, 15);
                ctx.fillStyle = 'rgba(102, 204, 0, 0.7)';
                ctx.fillText('Fingerprint', 4, 17);
                var dataUrl = canvas.toDataURL();
                // Simple hash
                var hash = 0;
                for (var i = 0; i < dataUrl.length; i++) {
                    var char = dataUrl.charCodeAt(i);
                    hash = ((hash << 5) - hash) + char;
                    hash = hash & hash;
                }
                return hash.toString(16);
            } catch(e) {
                return null;
            }
        """)

        # Hardware info
        fingerprint["hardware_concurrency"] = _probe_js(
            driver, "return navigator.hardwareConcurrency"
        )
        fingerprint["device_memory"] = _probe_js(
            driver, "return navigator.deviceMemory"
        )

        # Bot detection tests
        tests = fingerprint["bot_detection_tests"]
        # Test 1: webdriver property should be hidden/false for undetected Chrome
        tests["webdriver_hidden"] = _probe_js(
            driver,
            "return navigator.webdriver === undefined || navigator.webdriver === false"
        )
        # Test 2: chrome runtime should exist in real Chrome
        tests["chrome_runtime"] = _probe_js(
            driver, "return typeof window.chrome !== 'undefined'"
        )
        # Test 3: permissions.query should be available in real Chrome
        tests["permissions_query"] = _probe_js(driver, """
            try {
                if (navigator.permissions && navigator.permissions.query) {
                    return true;
                }
                return false;
            } catch(e) {
                return false;
            }
        """)
    except Exception as e:
        # Only reachable from the URL read / initial navigation above;
        # individual probes handle their own failures.
        fingerprint["capture_error"] = str(e)
    return fingerprint
def classify_crash(exception: Exception, metrics_history: list) -> str:
    """Classify crash type based on exception and metrics.

    Checks run in a fixed priority order and the first hit wins:
    tab crash text, timeout text, last recorded memory above 400 MB,
    missing element, rate limiting, network failure.

    Args:
        exception: The exception that ended the scrape; only its string
            form (lower-cased) is inspected.
        metrics_history: Sequence of metric dicts, newest last; may be
            empty or None. Only the last entry's ``memory_mb`` is read.

    Returns:
        One of 'tab_crash', 'timeout', 'memory_exhaustion',
        'element_not_found', 'rate_limited', 'network_failure', 'unknown'.
    """
    error_str = str(exception).lower()
    if 'aw, snap' in error_str or 'status_access_violation' in error_str:
        return 'tab_crash'
    if 'timeout' in error_str:
        return 'timeout'
    if metrics_history:
        last_sample = metrics_history[-1]
        memory_mb = last_sample.get('memory_mb') if isinstance(last_sample, dict) else None
        # memory_mb can be present but None when the memory probe failed;
        # comparing None with an int would raise TypeError and mask the
        # real crash, so only numeric readings are compared.
        if isinstance(memory_mb, (int, float)) and memory_mb > 400:
            return 'memory_exhaustion'
    if 'no such element' in error_str:
        return 'element_not_found'
    if '429' in error_str or 'rate' in error_str:
        return 'rate_limited'
    if 'network' in error_str or 'connection' in error_str:
        return 'network_failure'
    return 'unknown'
class ScraperCrashException(Exception):
    """Scraper failure bundled with the crash report collected for it.

    The exception message is the string form of the wrapped exception.
    """

    def __init__(self, original_exception, crash_report):
        # Build the message first so the base class carries the original text.
        message = str(original_exception)
        super().__init__(message)
        # Both values are stored untouched for later analysis by the caller.
        self.crash_report = crash_report
        self.original_exception = original_exception
def get_topic_variants(topic: str) -> List[str]:
    """
    Generate common variants of a topic word for matching.

    Handles:
    - Singular/plural forms (including "-es" plurals of words ending in "e")
    - Verb forms (-ing, -ed, -s), with silent-e dropping and consonant doubling
    - Common stemming patterns

    The rules are purely suffix-based, so some generated variants are not
    real words (e.g. "cutt"); they are harmless for whole-word matching.

    Args:
        topic: The topic word/phrase to generate variants for

    Returns:
        List of unique variant strings. The normalized (lower-cased,
        stripped) topic comes first, followed by the other variants in
        sorted order. Empty or whitespace-only input gives an empty list.

    Example:
        >>> get_topic_variants("cutting")
        ['cutting', 'cut', 'cuts', 'cutt', 'cutts']
        >>> get_topic_variants("service")
        ['service', 'serviceing', 'services', 'servicing']
        >>> get_topic_variants("services")
        ['services', 'servic', 'service', 'servicing']
    """
    if not topic:
        return []
    topic = topic.lower().strip()
    if not topic:
        # Whitespace-only input would otherwise produce "s" and "ing".
        return []
    variants = {topic}  # Use set to avoid duplicates

    # Handle -ing forms (cutting -> cut, cuts)
    if topic.endswith("ing"):
        base = topic[:-3]
        if base:
            variants.update((base, base + "s"))
            # Handle doubled consonants (cutting -> cut)
            if len(base) >= 2 and base[-1] == base[-2]:
                single_consonant = base[:-1]
                variants.update((single_consonant, single_consonant + "s"))

    # Handle -s/-es plural forms
    if topic.endswith("es") and len(topic) > 2:
        variants.add(topic[:-2])           # boxes -> box
        variants.add(topic[:-2] + "ing")   # services -> servicing
        variants.add(topic[:-1])           # services -> service
    elif topic.endswith("s") and len(topic) > 1 and not topic.endswith("ss"):
        variants.add(topic[:-1])
        variants.add(topic[:-1] + "ing")

    # Handle -ed forms (colored -> color)
    if topic.endswith("ed") and len(topic) > 2:
        base = topic[:-2]
        variants.update((base, base + "s", base + "ing"))
        # Handle doubled consonants (stopped -> stop)
        if len(base) >= 2 and base[-1] == base[-2]:
            variants.add(base[:-1])

    # Add common forms if base word (no suffix detected)
    if not topic.endswith(("ing", "ed", "s")):
        variants.add(topic + "s")
        variants.add(topic + "ing")
        # Silent-e dropping (service -> servicing); "ee" endings keep the e.
        if len(topic) > 1 and topic.endswith("e") and not topic.endswith("ee"):
            variants.add(topic[:-1] + "ing")
        # Handle consonant doubling for -ing (cut -> cutting)
        if len(topic) >= 2 and topic[-1] not in "aeiouwy":
            variants.add(topic + topic[-1] + "ing")

    # Deterministic order: original topic first, remaining variants sorted.
    variants.discard(topic)
    return [topic] + sorted(variants)
def infer_review_topics(review_text: str, topics: List[dict]) -> List[str]:
    """
    Return the topics whose keyword (or a variant of it) occurs in a review.

    Matching is case-insensitive and whole-word: each variant produced by
    get_topic_variants() is searched with regex word boundaries, so "cut"
    does not match inside "execute" or "cutlery".

    Args:
        review_text: The review text to analyze
        topics: List of topic dicts, e.g., [{"topic": "cutting", "count": 3}]

    Returns:
        Matched topic names exactly as given in ``topics``, in input order.
        Empty list when either argument is empty or None.

    Example:
        >>> topics = [{"topic": "hair salon", "count": 4}, {"topic": "cutting", "count": 3}]
        >>> text = "Great haircut! The cutting was professional."
        >>> infer_review_topics(text, topics)
        ["cutting"]
    """
    if not (review_text and topics):
        return []

    haystack = review_text.lower()
    hits = []
    for entry in topics:
        name = entry.get("topic", "")
        if not name:
            continue
        candidates = get_topic_variants(name.lower().strip())
        # any() stops at the first variant that matches, so each topic is
        # reported at most once per entry.
        found = any(
            re.search(r'\b' + re.escape(form) + r'\b', haystack)
            for form in candidates
            if form
        )
        if found:
            hits.append(name)  # report the caller's original spelling
    return hits
class LogCapture:
    """
    Backward-compatible facade over StructuredLogger.

    Keeps the original LogCapture call signatures while delegating storage
    to a StructuredLogger instance. Every logging method also prints the
    message to stdout (flushed) for console visibility.
    """

    # Legacy ``source`` label -> StructuredLogger category.
    _CATEGORY_BY_SOURCE = {
        'browser': 'browser',
        'navigation': 'browser',
        'page': 'browser',
        'network': 'network',
        'api': 'network',
        'system': 'system',
        'memory': 'system',
        'chrome': 'system',
    }

    # Upper-cased level name -> logger method name; anything else is "info".
    _METHOD_BY_LEVEL = {
        'ERROR': 'error',
        'WARNING': 'warn',
        'WARN': 'warn',
        'DEBUG': 'debug',
    }

    def __init__(self):
        self._logger = StructuredLogger()

    def _write(self, method_name: str, category, text, metrics):
        """Forward one entry to the named logger method, then echo it."""
        getattr(self._logger, method_name)(category, text, metrics=metrics)
        print(text, flush=True)

    def _write_dual(self, method_name: str, category_or_msg, message, metrics):
        """Resolve the one-argument vs two-argument call form, then write."""
        if message is None:
            # Single positional argument: it is the message itself.
            self._write(method_name, 'scraper', category_or_msg, metrics)
        else:
            self._write(method_name, category_or_msg, message, metrics)

    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
        """Add a log entry (legacy API): message, level name, source label."""
        category = self._source_to_category(source)
        method_name = self._METHOD_BY_LEVEL.get(level.upper(), 'info')
        getattr(self._logger, method_name)(category, message)
        # Also print for console visibility
        print(message, flush=True)

    def info(self, category_or_msg, message: str = None, *, metrics: dict = None):
        """
        Log an INFO message.

        info(message) logs under the 'scraper' category.
        info(category, message, metrics={...}) logs under the given category.
        A second positional argument is always taken as the message.
        """
        self._write_dual('info', category_or_msg, message, metrics)

    def warning(self, category_or_msg, message: str = None, *, metrics: dict = None):
        """Log a WARNING message; same call forms as info()."""
        self._write_dual('warn', category_or_msg, message, metrics)

    def warn(self, category, message: str, *, metrics: dict = None):
        """Log a WARN message under an explicit category."""
        self._write('warn', category, message, metrics)

    def error(self, category_or_msg, message: str = None, *, metrics: dict = None):
        """Log an ERROR message; same call forms as info()."""
        self._write_dual('error', category_or_msg, message, metrics)

    def debug(self, category, message: str, *, metrics: dict = None):
        """Log a DEBUG message under an explicit category."""
        self._write('debug', category, message, metrics)

    def get_logs(self):
        """Return whatever the underlying logger's get_logs() returns."""
        return self._logger.get_logs()

    def _source_to_category(self, source: str) -> str:
        """Map a legacy source label (case-insensitive) to a category.

        Empty/None and unrecognized labels map to 'scraper'.
        """
        key = source.lower() if source else 'scraper'
        return self._CATEGORY_BY_SOURCE.get(key, 'scraper')
def parse_api_review(raw: list) -> Optional[dict]:
    """Parse a review from a flat API response array.

    Positions read from ``raw``: [0] author, [1] timestamp, [3] text,
    [4] rating, and [9] or [18] an owner response ``[timestamp, text, ...]``.

    Args:
        raw: Candidate review array.

    Returns:
        A dict with keys author, text, rating, timestamp, owner_response
        and source="api", or None when ``raw`` does not look like a review:
        not a list, fewer than 5 items, rating not an int in 1..5 (booleans
        are rejected), author of 3 characters or fewer or a known
        metadata word, or a timestamp that is a URL or 3 characters or fewer.
    """
    if not isinstance(raw, list) or len(raw) < 5:
        return None
    try:
        author = raw[0] if isinstance(raw[0], str) else ""
        timestamp = raw[1]
        text = raw[3] if isinstance(raw[3], str) else ""
        rating = raw[4]

        # bool is a subclass of int; a JSON `true` must not count as 1 star.
        if isinstance(rating, bool) or not isinstance(rating, int):
            return None
        if not (1 <= rating <= 5):
            return None

        # Filter out garbage data (language codes, metadata, etc.)
        if len(author) <= 3:  # Real names are longer than 3 chars
            return None
        if author.lower() in ('google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt'):
            return None

        # Timestamp should look like a date, not a URL or language code
        if timestamp:
            timestamp_str = str(timestamp)
            if 'http' in timestamp_str or len(timestamp_str) <= 3:
                return None

        # Owner response: first candidate slot holding a list with 2+ items
        owner_response = None
        for idx in (9, 18):
            if len(raw) > idx and raw[idx] and isinstance(raw[idx], list):
                resp = raw[idx]
                if len(resp) > 1:
                    owner_response = {"text": resp[1], "timestamp": resp[0] if resp[0] else ""}
                    break

        return {
            "author": author,
            "text": text,
            "rating": rating,
            "timestamp": timestamp,
            "owner_response": owner_response,
            "source": "api"
        }
    except (IndexError, TypeError, AttributeError):
        # Malformed nested data: treat as "not a review" rather than crash.
        return None
def _dig_path(node, path, default):
    """Follow a sequence of subscripts into nested data; default if any step fails."""
    try:
        for key in path:
            node = node[key]
    except (IndexError, KeyError, TypeError):
        return default
    return node


def extract_reviews_from_api_body(body: str) -> list:
    """Extract reviews from a Google Maps reviews API response body.

    The body is JSON, optionally prefixed with ``)]}'``. Reviews live in
    ``data[2]``; each entry's first element is the review record, read as:
        Review ID: [0]  (same format as DOM's data-review-id)
        Author:    [1][4][5][0]
        Rating:    [2][0][0]
        Text:      [2][15][0][0]
        Time:      [1][6]

    Args:
        body: Raw response text.

    Returns:
        List of dicts with keys review_id, author, text, rating, timestamp
        and source="api". Entries without an author or without an integer
        rating in 1..5 (booleans rejected) are skipped. Any unparseable or
        unexpectedly shaped body yields an empty list.
    """
    reviews = []
    if not isinstance(body, str):
        return reviews

    # Remove )]}' prefix
    if body.startswith(")]}'"):
        body = body[4:].strip()

    try:
        data = json.loads(body)
    except (ValueError, RecursionError):
        # Not JSON (json.JSONDecodeError is a ValueError) or absurdly nested.
        return reviews

    if not isinstance(data, list) or len(data) < 3:
        return reviews
    reviews_area = data[2]
    if not isinstance(reviews_area, list):
        return reviews

    for item in reviews_area:
        if not isinstance(item, list) or not item:
            continue
        review_data = item[0]
        if not isinstance(review_data, list) or len(review_data) < 3:
            continue

        author = _dig_path(review_data, (1, 4, 5, 0), "")
        rating = _dig_path(review_data, (2, 0, 0), 0)

        # Validate: bool is an int subclass, so exclude it explicitly.
        if not author or isinstance(rating, bool) or not isinstance(rating, int):
            continue
        if not (1 <= rating <= 5):
            continue

        reviews.append({
            # review_id is kept for deduplication
            "review_id": review_data[0],
            "author": author,
            "text": _dig_path(review_data, (2, 15, 0, 0), "") or "",
            "rating": rating,
            "timestamp": _dig_path(review_data, (1, 6), "") or "",
            "source": "api"
        })
    return reviews
def _first_text(root, selectors) -> str:
    """Return the stripped text of the first selector that yields non-empty text, else ""."""
    for selector in selectors:
        try:
            value = root.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Selector absent or element stale: try the next candidate.
            continue
        if value:
            return value
    return ""


def parse_dom_review(card) -> Optional[dict]:
    """Parse a review from a DOM review card element.

    Every field is looked up independently; a field whose lookup fails
    keeps its default ("" / 0 / None) instead of failing the whole card.

    Args:
        card: Element exposing ``get_attribute`` and ``find_element``
            (a Selenium WebElement for one review card).

    Returns:
        A dict with keys id, author, text, rating, timestamp,
        owner_response and source="dom". Returns None when neither a
        review id nor an author could be read, or when reading the
        card's own attributes raises.
    """
    try:
        # Review ID: on the card itself, else on the first descendant carrying it
        review_id = card.get_attribute("data-review-id") or ""
        if not review_id:
            try:
                id_el = card.find_element(By.CSS_SELECTOR, "[data-review-id]")
                review_id = id_el.get_attribute("data-review-id") or ""
            except Exception:
                pass

        # Author - multiple selectors
        author = _first_text(
            card, ['div[class*="d4r55"]', '.d4r55', 'button[data-review-id] + div']
        )

        # Rating from aria-label on span[role="img"]
        rating = 0
        try:
            stars_el = card.find_element(By.CSS_SELECTOR, 'span[role="img"]')
            aria = stars_el.get_attribute("aria-label") or ""
            # First number in the label (handles "5 stars", "5 estrellas",
            # and decimal commas). The pattern requires a leading digit so
            # a stray "." in the label cannot be picked up as the number.
            num = re.search(r'\d+(?:\.\d+)?', aria.replace(',', '.'))
            if num:
                rating = int(float(num.group()))
        except Exception:
            pass

        # Review text - try multiple selectors.
        # Note: "More" button clicking removed for speed;
        # full text can be expanded later if needed.
        text = _first_text(
            card,
            ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', 'div.MyEned span.wiI7pd', '.wiI7pd'],
        )

        # Timestamp
        timestamp = _first_text(card, ['span[class*="rsqaWe"]'])

        # Owner response (only recorded when it has text)
        owner_response = None
        try:
            resp_box = card.find_element(By.CSS_SELECTOR, "div.CDe7pd")
        except Exception:
            resp_box = None
        if resp_box:
            resp_text = _first_text(resp_box, ["div.wiI7pd"])
            if resp_text:
                owner_response = {
                    "text": resp_text,
                    "timestamp": _first_text(resp_box, ["span.DZSIDd"]),
                }

        if not review_id and not author:
            return None

        return {
            "id": review_id,
            "author": author,
            "text": text,
            "rating": rating,
            "timestamp": timestamp,
            "owner_response": owner_response,
            "source": "dom"
        }
    except Exception:
        return None
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
progress_callback=None, validation_only: bool = False) -> dict:
"""
Scrape Google Maps reviews.
Args:
driver: Selenium WebDriver instance
url: Google Maps place URL
max_reviews: Maximum reviews to collect
timeout_no_new: Seconds to wait with no new reviews before stopping
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
This allows streaming data to disk and freeing memory
flush_batch_size: Number of reviews to collect before flushing (default 500)
log_capture: Optional LogCapture instance for storing logs
progress_callback: Optional callback(current_count, total_count) called every iteration
Returns:
dict with reviews list and metadata
"""
# Use provided log_capture or create a dummy that just prints
log = log_capture or LogCapture()
# Capture session fingerprint early (before navigation) for bot detection analysis
session_fingerprint = capture_session_fingerprint(driver)
log.info('browser', "Session fingerprint captured", metrics={
'user_agent': session_fingerprint.get('user_agent', 'unknown')[:50] + '...' if session_fingerprint.get('user_agent') else 'unknown',
'platform': session_fingerprint.get('platform'),
'timezone': session_fingerprint.get('timezone'),
'webdriver_hidden': session_fingerprint.get('bot_detection_tests', {}).get('webdriver_hidden'),
'chrome_runtime': session_fingerprint.get('bot_detection_tests', {}).get('chrome_runtime')
})
# Storage - use review ID as key
reviews = {} # review_id -> review
seen_ids = set() # Track all IDs we've seen (persists after flush)
total_flushed = [0] # Use list for closure mutation
review_order = {} # review_id -> position (DOM visual order for sorting)
order_counter = [0] # Current order position
# Track total reviews (persists across refreshes)
total_reviews = [None] # Use list for closure mutation
# Store business info extracted from overview (before clicking reviews tab)
business_info_cache = [None]
# Hard refresh counter
hard_refresh_count = [0]
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
# Find scrollable reviews container helper
def find_scroll_container():
selectors = [
"div.m6QErb.DxyBCb.kA9KIf.dS8AEf",
"div.m6QErb.DxyBCb.kA9KIf",
"div.m6QErb.DxyBCb",
"div.m6QErb[aria-label]",
"div.DxyBCb.kA9KIf.dS8AEf",
"div[role='main'] div.m6QErb",
]
for sel in selectors:
try:
els = driver.find_elements(By.CSS_SELECTOR, sel)
for el in els:
if el.is_displayed() and el.size['height'] > 100:
return el
except:
pass
return None
def setup_reviews_page(is_refresh=False, validation_only_mode=False):
"""
Setup the reviews page for scraping.
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
Can be called after initial load or after a hard refresh.
If validation_only_mode=True, returns early after extracting business info
without clicking reviews tab or finding scroll container.
"""
nonlocal total_reviews
refresh_label = " (after refresh)" if is_refresh else ""
# Navigate to URL (only on initial load or refresh)
if not is_refresh:
# Reset browser state by navigating to blank page first
# This clears any stale state from pooled browser sessions
try:
driver.get("about:blank")
time.sleep(0.1)
except:
pass
log.info('browser', f"Loading: {url[:80]}...")
else:
log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...")
driver.get(url)
# Handle consent popup if redirected (poll with tiny sleep)
start = time.time()
while time.time() - start < 5: # Max 5s for consent
if "consent.google" in driver.current_url:
log.info('browser', "Handling consent popup...")
try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
# Reload original URL after consent
log.info('browser', "Reloading after consent...")
driver.get(url)
# Wait for page to settle after consent reload
time.sleep(1)
break
except:
pass
break
# Check if we're already on the target page
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Extract business info and total review count BEFORE clicking reviews tab (on Overview)
# This captures name, rating, category, address while they're visible
# Only on first load (don't overwrite if we already have it)
if total_reviews[0] is None or business_info_cache[0] is None:
start = time.time()
while time.time() - start < 5:
try:
info = driver.execute_script("""
var result = {
total_reviews: null,
name: null,
rating: null,
category: null,
address: null
};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
// Category - use jsaction attribute (robust selector)
var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Rating and review count from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
// Rating: "4.8 stars"
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
// Reviews: "79 reviews"
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
}
}
// Address from button
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""")
if info:
if info.get('total_reviews') and total_reviews[0] is None:
total_reviews[0] = info['total_reviews']
log.info('scraper', f"Total reviews on page: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
if info.get('name') and business_info_cache[0] is None:
business_info_cache[0] = info
log.info('scraper', f"Business: {info.get('name')}")
if total_reviews[0] and business_info_cache[0]:
break
except:
pass
time.sleep(0.1)
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
if validation_only_mode:
log.info('scraper', "Validation mode: returning early (skipping reviews tab)")
return ("validation_done", None)
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time()
tab_clicked = False
tabs_logged = False
while time.time() - start < 5: # Max 5s for tabs
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
# Log available tabs once for debugging
if not tabs_logged and tabs:
tabs_logged = True
tab_texts = [t.text for t in tabs]
log.info('browser', f"Available tabs: {tab_texts}")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
if not is_refresh:
log.info('browser', f"Clicking reviews tab: '{tab.text}'")
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
if total_reviews[0] is None:
import re
# Try pattern with parentheses: "Reviews (79)"
match = re.search(r'\((\d+)\)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
else:
# Try pattern with newline: "Reviews\n79"
match = re.search(r'(\d+)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
tab.click()
tab_clicked = True
break
if tab_clicked:
break
time.sleep(0.01) # 10ms between polls
except:
time.sleep(0.01)
# Poll for scroll container (10ms intervals - fast but low CPU)
scroll_container = None
start = time.time()
last_print = 0
while time.time() - start < 10: # Max 10s
scroll_container = find_scroll_container()
if scroll_container:
break
elapsed = int(time.time() - start)
if elapsed > last_print:
log.info('browser', f"Waiting for reviews panel...{refresh_label} ({elapsed}s)")
last_print = elapsed
time.sleep(0.01) # 10ms - responsive but low CPU
if not scroll_container:
log.error('browser', f"Could not find reviews scroll container{refresh_label}")
try:
log.error('browser', f"Page title: {driver.title}")
log.error('browser', f"Current URL: {driver.current_url[:100]}")
except:
pass
return None, None
log.info('browser', f"Found scroll container{refresh_label}")
# Inject API interceptor (needs to be re-injected after refresh)
if not is_refresh:
log.info('network', "Injecting API interceptor...")
driver.execute_script("""
// Always re-setup on refresh
window.__reviewInterceptorInjected = true;
window.__interceptedResponses = window.__interceptedResponses || [];
// Intercept fetch (only if not already patched)
if (!window.__fetchPatched) {
window.__fetchPatched = true;
const originalFetch = window.fetch;
window.fetch = async function(...args) {
const url = args[0].toString();
const response = await originalFetch.apply(this, args);
if (url.includes('listugcposts') || url.includes('review')) {
try {
const clone = response.clone();
const text = await clone.text();
window.__interceptedResponses.push({url: url, body: text});
} catch(e) {}
}
return response;
};
}
// Intercept XHR (only if not already patched)
if (!window.__xhrPatched) {
window.__xhrPatched = true;
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
const originalOpen = xhr.open;
let reqUrl = '';
xhr.open = function(method, url, ...rest) {
reqUrl = url;
return originalOpen.apply(this, [method, url, ...rest]);
};
xhr.addEventListener('load', function() {
if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
try {
window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
} catch(e) {}
}
});
return xhr;
};
for (let prop of Object.getOwnPropertyNames(originalXHR)) {
try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
}
}
""")
# Sort by newest first
try:
sort_btn = driver.execute_script("""
var btns = document.querySelectorAll('button[data-value="sort"]');
if (btns.length) return btns[0];
var all = document.querySelectorAll('button[aria-label*="Sort"]');
if (all.length) return all[0];
return null;
""")
if sort_btn:
sort_btn.click()
time.sleep(0.3)
driver.execute_script("""
var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
for (var i = 0; i < items.length; i++) {
var txt = items[i].textContent.toLowerCase();
if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
items[i].click();
break;
}
}
""")
time.sleep(0.5)
log.info('browser', "Sorted by newest")
# Re-find scroll container after sorting (DOM may be recreated)
new_container = find_scroll_container()
if new_container:
scroll_container = new_container
log.info('browser', "Refreshed scroll container reference")
except:
pass
# Expand "More" buttons for full text
try:
expanded = driver.execute_script("""
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
var count = 0;
for (var i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === 'More') {
buttons[i].click();
count++;
}
}
return count;
""")
if expanded > 0:
log.info('browser', f"Expanded {expanded} truncated reviews", metrics={'expanded_count': expanded})
except:
pass
# Block images to speed up scrolling (use CDP)
try:
driver.execute_cdp_cmd('Network.setBlockedURLs', {
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
})
driver.execute_cdp_cmd('Network.enable', {})
if not is_refresh:
log.info('browser', "Blocking images for faster scrolling")
except:
pass
# Setup scrollable pane reference
driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)
# Create scroll worker
stop_scrolling = threading.Event()
def scroll_worker():
while not stop_scrolling.is_set():
try:
driver.execute_script("""
var p = window.scrollablePane;
if (p) p.scrollTop = p.scrollHeight;
""")
except:
pass
time.sleep(0.1)
scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
scroll_thread.start()
return scroll_container, stop_scrolling
# Helper to extract review topics from the reviews tab
def extract_review_topics():
    """Extract review topic filters from radiogroup (robust selectors).

    Runs a script in the page that locates the topic ``radiogroup`` (preferring
    one whose aria-label contains "Refine"/"refine") and reads every
    ``button[role="radio"]`` inside it. Labels of the form
    ``"<topic>, mentioned in <N> reviews"`` are parsed directly; other labels
    fall back to the button's child spans, skipping the "all reviews" entry.

    Returns:
        List of ``{"topic": str, "count": int}`` dicts; an empty list when
        nothing is found or the script fails.
    """
    try:
        topics = driver.execute_script("""
var topics = [];
// Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
if (!container) {
// Fallback: any radiogroup in the reviews area
container = document.querySelector('div[role="radiogroup"]');
}
if (container) {
var buttons = container.querySelectorAll('button[role="radio"]');
for (var btn of buttons) {
var label = btn.getAttribute('aria-label') || '';
// Parse "hair salon, mentioned in 4 reviews" format
var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
if (match) {
topics.push({
topic: match[1].trim(),
count: parseInt(match[2])
});
} else if (label && !label.toLowerCase().includes('all review')) {
// Fallback: try to extract from child spans
var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
var nameSpan = btn.querySelector('.uEubGf, span:first-child');
if (nameSpan) {
var name = nameSpan.textContent.trim();
var count = countSpan ? parseInt(countSpan.textContent) : 0;
if (name && name.toLowerCase() !== 'all') {
topics.push({topic: name, count: count || 0});
}
}
}
}
}
return topics;
""")
        return topics or []
    except Exception:
        # Topics are optional metadata: a script failure must not abort the
        # scrape, but KeyboardInterrupt/SystemExit are allowed to propagate.
        return []
# Initial page setup (pass validation_only to skip unnecessary steps)
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
# setup_reviews_page returns ("validation_done", None) in this case
if validation_only or scroll_container == "validation_done":
# Use the business info captured from Overview (before clicking reviews tab)
business_info = business_info_cache[0] or {}
return {
"reviews": [],
"total": total_reviews[0] or 0,
"scrolls": 0,
"error": None,
"validation_info": {
"name": business_info.get("name"),
"rating": business_info.get("rating"),
"category": business_info.get("category"),
"address": business_info.get("address"),
"total_reviews": total_reviews[0]
},
"session_fingerprint": session_fingerprint # Browser fingerprint for bot detection analysis
}
if not scroll_container:
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found", "session_fingerprint": session_fingerprint}
# Extract review topics after reviews tab is loaded (before scrolling begins)
time.sleep(0.5) # Brief wait for topic filters to render
review_topics = extract_review_topics()
if review_topics:
log.info('scraper', f"Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...", metrics={'topic_count': len(review_topics)})
def get_api_reviews():
    """Get reviews from intercepted API responses.

    Reads and clears ``window.__interceptedResponses`` in the page, then passes
    each response body to ``extract_reviews_from_api_body``.

    Returns:
        List of review dicts parsed from the drained responses; empty when the
        buffer could not be read or nothing parsed.
    """
    try:
        responses = driver.execute_script("""
var r = window.__interceptedResponses || [];
window.__interceptedResponses = [];
return r;
""")
    except Exception:
        # Nothing was read; the next cycle will try again.
        return []

    api_revs = []
    for resp in (responses or []):
        # The page-side buffer has already been cleared, so each body is parsed
        # on its own: one malformed payload must not discard the responses
        # that follow it.
        try:
            body = resp.get("body", "")
            api_revs.extend(extract_reviews_from_api_body(body))
        except Exception as e:
            log.warn('scraper', f"Skipping unparseable API response: {e}")
    return api_revs
# Recovery function - use real mouse actions when stuck
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
recovery_count = [0]
def unstick_scroll():
    """Try one soft-recovery technique to get the reviews pane loading again.

    Each call increments ``recovery_count[0]`` and picks a technique from
    ``recovery_count[0] % 4``:

    * 1 - click the pane and send two PAGE_DOWN key presses
    * 2 - move the mouse over the pane and wheel-scroll by 800 px
    * 3 - scroll up by 2000 px, pause, then jump back to the bottom
    * 0 - scroll the last review card into view, pause, then jump to the bottom

    Failures are ignored: this is a best-effort nudge and the caller retries.
    """
    # scroll_container is only read here, so no ``nonlocal`` declaration is needed.
    recovery_count[0] += 1
    method = recovery_count[0] % 4
    try:
        if method == 1:
            # Method 1: Click pane and send Page Down keys
            scroll_container.click()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        elif method == 2:
            # Method 2: Real mouse wheel scroll
            (ActionChains(driver)
             .move_to_element(scroll_container)
             .scroll_by_amount(0, 800)
             .perform())
        elif method == 3:
            # Method 3: Scroll up significantly then back down (force reload)
            driver.execute_script("""
var p = window.scrollablePane;
if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000);
""")
            time.sleep(0.3)
            driver.execute_script("""
var p = window.scrollablePane;
if (p) p.scrollTop = p.scrollHeight;
""")
        else:
            # Method 4: Scroll last card into view, then scroll pane
            # (no click to avoid opening profile)
            driver.execute_script("""
var cards = document.querySelectorAll('[data-review-id]');
if (cards.length > 0) {
cards[cards.length - 1].scrollIntoView({block: 'end', behavior: 'smooth'});
}
""")
            time.sleep(0.3)
            driver.execute_script("""
var p = window.scrollablePane;
if (p) p.scrollTop = p.scrollHeight;
""")
    except Exception:
        # Recovery is opportunistic; swallow browser errors but let
        # KeyboardInterrupt/SystemExit through.
        pass
def do_hard_refresh():
"""Hard refresh the page and re-setup everything. Returns True on success."""
nonlocal scroll_container, stop_scrolling
hard_refresh_count[0] += 1
if hard_refresh_count[0] > max_hard_refreshes:
log.warn('system', f"Max hard refreshes ({max_hard_refreshes}) reached, giving up", metrics={'hard_refresh_count': hard_refresh_count[0]})
return False
# Stop current scroll worker
stop_scrolling.set()
time.sleep(0.2)
# Re-setup page
new_container, new_stop = setup_reviews_page(is_refresh=True)
if new_container:
scroll_container = new_container
stop_scrolling = new_stop
recovery_count[0] = 0 # Reset recovery count after successful refresh
log.info('browser', f"Hard refresh successful, resuming with {len(seen_ids)} reviews already collected", metrics={'reviews_collected': len(seen_ids)})
return True
else:
log.error('browser', "Hard refresh failed to find scroll container")
return False
# Main collection loop
last_new_time = time.time()
last_count = len(reviews)
check_num = 0
start_time = time.time()
# Crash detection: metrics sampling
metrics_history = []
last_sample_time = time.time()
scroll_count = [0] # Track scroll operations for crash reports
log.info('browser', f"Scrolling... (timeout: {timeout_no_new}s with no new)", metrics={'timeout_seconds': timeout_no_new})
cycle_start = time.time()
while True:
check_num += 1
time.sleep(1.0) # Check every second
# TIMING: Track cycle performance
t0 = time.time()
cycle_delta = t0 - cycle_start
cycle_start = t0
# CRASH DETECTION: Sample metrics every 5 seconds
if time.time() - last_sample_time >= 5:
current_count_for_metrics = total_flushed[0] + len(reviews)
metrics_history.append({
'timestamp_ms': int(time.time() * 1000),
'memory_mb': get_chrome_memory(driver),
'dom_nodes': get_dom_node_count(driver),
'reviews_count': current_count_for_metrics
})
# Keep only last 100 samples
metrics_history = metrics_history[-100:]
last_sample_time = time.time()
# Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
# Use review_id as key to avoid duplicates with DOM
t1 = time.time()
for rev in get_api_reviews():
rid = rev.get('review_id', '')
if rid and rid not in seen_ids:
reviews[rid] = rev
seen_ids.add(rid)
api_time = time.time() - t1
# Expand any new "More" buttons for full text (batch click, fast)
try:
driver.execute_script("""
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
for (var i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === 'More') {
buttons[i].click();
}
}
""")
except:
pass
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
# This survives Google's CSS class name changes
# MEMORY FIX: Actually remove processed cards from DOM (not just hide)
# Keep last N cards for scroll continuity
t2 = time.time()
dom_cards = 0
try:
seen_list = list(seen_ids)
parsed_reviews = driver.execute_script("""
var seenSet = new Set(arguments[0]);
var results = [];
var processedIds = new Set();
var sepsRemoved = 0;
var cardsRemoved = 0;
var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference
// ROBUST: Find cards by data attribute only (not class names)
var cards = document.querySelectorAll('[data-review-id]');
var cardsArray = Array.from(cards);
var totalCards = cardsArray.length;
for (var i = 0; i < cardsArray.length; i++) {
var card = cardsArray[i];
var rid = card.getAttribute('data-review-id');
var isHidden = card.style.display === 'none';
var isNearEnd = i >= totalCards - KEEP_LAST_N;
// AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end
// This prevents memory buildup that causes tab crashes
if (isHidden && !isNearEnd) {
// Remove separators first
var sibling = card.nextElementSibling;
while (sibling) {
var nextSib = sibling.nextElementSibling;
var classes = sibling.className || '';
if (classes.includes('AyRUI') || classes.includes('TFQHme')) {
sibling.remove();
sepsRemoved++;
sibling = nextSib;
} else {
break;
}
}
// Remove the card itself from DOM
card.remove();
cardsRemoved++;
continue;
}
// Skip already hidden cards near end (keep for scroll reference)
if (isHidden) continue;
// Skip if no ID or already processed this cycle
if (!rid || processedIds.has(rid)) continue;
// Only process top-level review cards (have aria-label with author name)
if (!card.getAttribute('aria-label')) continue;
processedIds.add(rid);
// Already seen from API - just track order, skip content
// BUT still hide the card to keep DOM light!
if (seenSet.has(rid)) {
results.push({id: rid, orderOnly: true});
// Hide this card since we already have its data from API
card.style.display = 'none';
card.innerHTML = '';
continue;
}
var author = '', text = '', rating = 0, timestamp = '';
// AUTHOR: Extract from "Photo of {Name}" button aria-label
var photoBtn = card.querySelector('button[aria-label^="Photo of"]');
if (photoBtn) {
author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim();
}
// Fallback: card's own aria-label is the author name
if (!author) {
author = card.getAttribute('aria-label') || '';
}
// RATING: span with role="img" and aria-label containing "star"
var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]');
if (ratingEl) {
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
if (match) rating = parseInt(match[1]);
}
// TIMESTAMP: Find span with "X time ago" pattern
var spans = card.querySelectorAll('span');
for (var j = 0; j < spans.length; j++) {
var spanText = spans[j].textContent.trim();
if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) {
timestamp = spanText;
break;
}
}
// TEXT: Find longest text span (not timestamp/UI elements)
var longestText = '';
for (var j = 0; j < spans.length; j++) {
var spanText = spans[j].textContent.trim();
if (spanText === timestamp) continue;
if (spanText.match(/^\\d+ stars?$/i)) continue;
if (spanText === 'More' || spanText === 'Less') continue;
if (spanText.match(/^(Like\\d*|Share)$/)) continue;
if (spanText.length > longestText.length && spanText.length > 10) {
longestText = spanText;
}
}
text = longestText;
if (author && rating >= 1 && rating <= 5) {
results.push({
id: rid,
orderOnly: false,
author: author,
text: text,
rating: rating,
timestamp: timestamp,
source: 'dom'
});
}
// Mark card as processed (hide + clear) - will be removed on next cycle
// Keep near-end cards visible for scroll reference
if (!isNearEnd) {
card.style.display = 'none';
card.innerHTML = '';
}
}
return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved};
""", seen_list)
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0
if cards_removed > 0:
log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed})
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
for rev in new_reviews:
rid = rev.pop('id')
order_only = rev.pop('orderOnly', False)
# Track DOM order for ALL reviews (for sorting output)
if rid not in review_order:
review_order[rid] = order_counter[0]
order_counter[0] += 1
# Only add content for new reviews (not already from API)
if not order_only:
reviews[rid] = rev
seen_ids.add(rid)
except Exception as e:
log.error('scraper', f"DOM parse error: {e}")
dom_time = time.time() - t2
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
# Sort by DOM order before flushing
t3 = time.time()
if flush_callback and len(reviews) >= flush_batch_size:
log.info('scraper', f"Flushing {len(reviews)} reviews to disk...", metrics={'batch_size': len(reviews), 'source': 'flush'})
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews)
reviews.clear() # Free memory, but keep seen_ids and review_order
flush_time = time.time() - t3
current_count = total_flushed[0] + len(reviews)
# TIMING: Print if cycle is slow (>2s)
if cycle_delta > 2.0:
log.warn('system', f"SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})", metrics={'cycle_time_s': cycle_delta, 'api_time_s': api_time, 'dom_time_s': dom_time, 'dom_cards': dom_cards, 'seen_count': len(seen_ids)})
# Check for new reviews
if current_count > last_count:
last_new_time = time.time()
last_count = current_count
# Check if loading (spinner visible OR network activity)
try:
loading_status = driver.execute_script("""
var status = {spinner: false, network: false};
// Check for Google's loading indicators
var spinner = document.querySelector('div[role="progressbar"]');
if (spinner && spinner.offsetParent !== null) status.spinner = true;
var loading = document.querySelector('.qjESne, .loading');
if (loading && loading.offsetParent !== null) status.spinner = true;
// Check for recent network activity (API interceptor)
var responses = window.__interceptedResponses || [];
var lastCount = window.__lastResponseCount || 0;
if (responses.length > lastCount) {
status.network = true;
window.__lastResponseCount = responses.length;
}
return status;
""")
is_loading = loading_status.get('spinner') or loading_status.get('network')
if is_loading:
last_new_time = time.time() # Reset timer while loading
except:
is_loading = False
# Progress update
elapsed = time.time() - last_new_time
if total_reviews[0]:
pct = (current_count / total_reviews[0]) * 100
log.info('scraper', f"{current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", metrics={'reviews_count': current_count, 'total_reviews': total_reviews[0], 'progress_pct': pct, 'idle_seconds': elapsed})
else:
log.info('scraper', f"{current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", metrics={'reviews_count': current_count, 'idle_seconds': elapsed})
# Call progress callback on every iteration (for real-time log updates)
if progress_callback:
progress_callback(current_count, total_reviews[0])
# Stop conditions - check BEFORE recovery attempts
if current_count >= max_reviews:
log.info('scraper', f"Reached max: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
stop_scrolling.set()
break
# Also stop if we have all reviews from the page
if total_reviews[0] and current_count >= total_reviews[0]:
log.info('scraper', f"All {current_count} reviews collected", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
stop_scrolling.set()
break
# STUCK DETECTION: If no new reviews for 3s+, try to unstick
# Only if we haven't collected all reviews yet
if elapsed >= 3 and int(elapsed) % 3 == 0:
# After 8+ failed recovery attempts, try hard refresh
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]})
if do_hard_refresh():
last_new_time = time.time() # Reset timer after refresh
continue # Skip to next iteration
else:
log.info('browser', f"Recovery attempt #{recovery_count[0] + 1}...", metrics={'recovery_attempt': recovery_count[0] + 1})
unstick_scroll()
# Check scroll state - track if content is still being added
try:
scroll_state = driver.execute_script("""
var p = window.scrollablePane;
if (!p) return {atBottom: true, height: 0};
var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50);
var height = p.scrollHeight;
var lastHeight = window.__lastScrollHeight || 0;
var growing = height > lastHeight;
window.__lastScrollHeight = height;
return {atBottom: atBottom, height: height, growing: growing};
""")
at_bottom = scroll_state.get('atBottom', True)
content_growing = scroll_state.get('growing', False)
except:
at_bottom = True
content_growing = False
# Reset timer if content is growing (new reviews loading)
if content_growing:
last_new_time = time.time()
# Dynamic timeout based on state and recovery attempts
# - Try hard refresh before giving up if we still have refreshes left
# - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed
# - 15s max otherwise (keep trying)
recovery_failed = recovery_count[0] >= 5 and elapsed >= 5
truly_done = at_bottom and not content_growing and recovery_failed
timeout_hit = elapsed >= timeout_no_new
if truly_done or timeout_hit:
# Last chance: try hard refresh before giving up
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed})
if do_hard_refresh():
last_new_time = time.time()
continue # Keep trying
log.info('scraper', f"All reviews loaded: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
stop_scrolling.set()
break
# Flush any remaining reviews (sorted by DOM order)
if flush_callback and reviews:
log.info('scraper', f"Final flush: {len(reviews)} reviews...", metrics={'batch_size': len(reviews), 'source': 'final_flush'})
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews)
reviews.clear()
# Reviews already parsed during scrolling (real-time parsing)
log.info('scraper', "Finalizing review data...")
# Final results (sorted by DOM order)
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
review_list = [r for _, r in sorted_items]
grand_total = total_flushed[0] + len(review_list)
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
api_count = sum(1 for r in review_list if r.get("source") == "api")
if total_flushed[0] > 0:
log.info('scraper', f"Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})", metrics={'total_reviews': grand_total, 'flushed_count': total_flushed[0], 'in_memory_count': len(review_list), 'elapsed_seconds': time.time() - start_time})
else:
log.info('scraper', f"Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})", metrics={'total_reviews': len(review_list), 'dom_count': dom_count, 'api_count': api_count, 'elapsed_seconds': time.time() - start_time})
# Infer topics for each review if review_topics is available
if review_topics:
log.info('scraper', f"Inferring topics for {len(review_list)} reviews...", metrics={'reviews_count': len(review_list)})
topics_inferred_count = 0
for review in review_list:
review_text = review.get("text", "")
matched = infer_review_topics(review_text, review_topics)
review["topics"] = matched
if matched:
topics_inferred_count += 1
log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
return {
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
"total": grand_total,
"total_flushed": total_flushed[0],
"checks": check_num,
"url": url,
"logs": log.get_logs(),
"review_topics": review_topics, # Topic filters with mention counts
"metrics_history": metrics_history, # For crash detection
"start_time": start_time, # For crash report elapsed time
"session_fingerprint": session_fingerprint # Browser fingerprint for bot detection analysis
}
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False,
                        log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
                        browser_fingerprint: dict = None):
    """
    Production-compatible wrapper for scrape_reviews.

    Matches the API expected by job_manager.py.

    Args:
        url: Google Maps URL to scrape
        headless: Run Chrome in headless mode
        max_scrolls: Not used (kept for API compatibility)
        progress_callback: Optional callback(current_count, total_count) for progress
        driver: Existing driver instance to reuse
        return_driver: If True, return driver in result
        log_capture: Optional LogCapture instance for real-time log access
        flush_callback: Optional callback(reviews_batch) invoked with every
            batch that scrape_reviews flushes
        validation_only: Forwarded to scrape_reviews; when set, the result's
            ``validation_info`` is copied into the response
        browser_fingerprint: Optional dict with user's browser fingerprint:
            - geolocation: {lat, lng}
            - userAgent: string
            - viewport: {width, height}
            - timezone: string (e.g., "Europe/Madrid")
            - language: string (e.g., "en-US")
            - platform: string (e.g., "MacIntel")

    Returns:
        Dictionary with: reviews, count, total_reviews, time, success, error,
        logs, review_topics, session_fingerprint; plus ``validation_info``
        (validation mode), ``driver`` (when ``return_driver``) and
        ``crash_report`` (on failure, when one could be built).
    """
    from seleniumbase import Driver

    start_time = time.time()
    driver_provided = driver is not None
    # Only close a driver this call created and the caller did not ask to keep
    should_close_driver = not return_driver and not driver_provided

    # Use provided log_capture or create new one
    log_capture = log_capture or LogCapture()

    try:
        # Extract fingerprint settings (``or`` also covers keys present but null)
        fp = browser_fingerprint or {}
        user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
        geolocation = fp.get('geolocation')
        timezone = fp.get('timezone')
        language = fp.get('language') or 'en-US'

        # Create driver if not provided
        if not driver:
            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
                agent=user_agent  # Use user's actual user agent
            )

        # NOTE(review): viewport and CDP overrides are applied to reused
        # drivers as well — confirm this matches the intended nesting.
        # Set viewport to match user's screen
        driver.set_window_size(viewport['width'], viewport['height'])

        # Apply browser fingerprint settings via CDP
        try:
            # Set timezone if provided
            if timezone:
                driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
                log_capture.info('browser', f"Set timezone to {timezone}")

            # Set locale/language
            driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})

            # Set geolocation
            if geolocation and 'lat' in geolocation and 'lng' in geolocation:
                driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': geolocation['lat'],
                    'longitude': geolocation['lng'],
                    'accuracy': 1000  # ~1km accuracy for IP-based location
                })
                log_capture.info('browser', f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})", metrics={'lat': geolocation['lat'], 'lng': geolocation['lng']})
            else:
                # Default to US (Boston, MA) if no geolocation provided
                driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': 42.3601,
                    'longitude': -71.0589,
                    'accuracy': 100
                })
                log_capture.info('browser', "Set geolocation to US (Boston, MA) [default]", metrics={'lat': 42.3601, 'lng': -71.0589})

            if fp:
                log_capture.info('browser', f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}", metrics={'viewport_width': viewport['width'], 'viewport_height': viewport['height']})
        except Exception as e:
            log_capture.warn('system', f"Could not apply fingerprint settings: {e}")

        # Add URL parameters for consistent results. The separator is chosen
        # per parameter so a URL without a query string never gets a bare "&".
        if 'hl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}hl=en"
        if 'gl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}gl=us"

        # Create combined flush callback for progress + external handler
        external_flush = flush_callback  # Save external callback
        internal_flush = None
        if progress_callback or external_flush:
            flushed_total = 0

            def combined_flush(reviews_batch):
                # scrape_reviews hands over only the batch being flushed and
                # then clears it, so the running total is accumulated here.
                nonlocal flushed_total
                flushed_total += len(reviews_batch)
                if progress_callback:
                    progress_callback(flushed_total, None)
                if external_flush:
                    external_flush(reviews_batch)  # Pass reviews to external handler

            internal_flush = combined_flush

        # Run the scraper with progress callback for real-time updates
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
            flush_callback=internal_flush,
            flush_batch_size=100,  # Smaller batches for more frequent progress
            log_capture=log_capture,
            progress_callback=progress_callback,  # Pass through for real-time log updates
            validation_only=validation_only  # Return early if just validating
        )

        elapsed = time.time() - start_time

        # Surface an error reported by scrape_reviews (e.g. no scroll
        # container) instead of masking it as a success.
        scrape_error = result.get("error")

        # Return in expected format
        response = {
            "reviews": result.get("reviews", []),
            "count": result.get("total", 0),
            "total_reviews": result.get("total", 0),
            "time": elapsed,
            "success": not scrape_error,
            "error": scrape_error,
            "logs": result.get("logs", []),
            "review_topics": result.get("review_topics", []),  # Topic filters with mention counts
            "session_fingerprint": result.get("session_fingerprint")  # Browser fingerprint for bot detection
        }

        # Include validation_info if in validation_only mode
        if validation_only and "validation_info" in result:
            response["validation_info"] = result["validation_info"]

        if return_driver:
            response["driver"] = driver
        elif should_close_driver:
            try:
                driver.quit()
            except Exception:
                pass

        return response

    except Exception as e:
        elapsed = time.time() - start_time

        # CRASH DETECTION: Build crash report before closing driver
        crash_report = None
        try:
            if driver:
                # Try to sample final metrics from the browser
                final_metrics = {
                    'timestamp_ms': int(time.time() * 1000),
                    'memory_mb': get_chrome_memory(driver),
                    'dom_nodes': get_dom_node_count(driver)
                }

                # Build crash report with available information
                crash_report = {
                    'crash_type': classify_crash(e, [final_metrics]),
                    'error_message': str(e),
                    'state': {
                        'reviews_extracted': 0,  # Unknown at crash time
                        'total_expected': None,
                        'scroll_count': 0,
                        'elapsed_seconds': elapsed
                    },
                    'metrics_history': [final_metrics],
                    'logs_before_crash': log_capture.get_logs()[-20:] if log_capture else [],
                    'last_successful_review_id': None
                }

                log_capture.error('system', f"Crash detected: {crash_report['crash_type']}",
                                  metrics={'error': str(e), 'elapsed_seconds': elapsed})
        except Exception:
            # If we can't build crash report, continue with basic error handling
            pass

        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception:
                pass

        # Log error to the existing log_capture
        log_capture.error('system', f"Scraper failed: {str(e)}")

        result = {
            "reviews": [],
            "count": 0,
            "total_reviews": 0,
            "time": elapsed,
            "success": False,
            "error": str(e),
            "driver": driver if return_driver else None,
            "logs": log_capture.get_logs()
        }

        # Include crash report if available
        if crash_report:
            result['crash_report'] = crash_report

        return result
def extract_about_info(driver, url: str = None) -> dict:
    """
    Extract About section info from Google Maps (Accessibility, Amenities, etc.).

    This function should be called AFTER reviews are scraped if about info is needed,
    as it navigates to a different tab.

    Args:
        driver: Selenium WebDriver instance (already on the business page)
        url: Optional URL to navigate to first (if not already on the page).
            ``hl=en`` / ``gl=us`` are appended when missing.

    Returns:
        dict with section names as keys, each containing a list of
        ``{"feature": str, "available": bool}`` items. Returns ``{}`` when the
        About tab could not be clicked, and ``{"error": message}`` when an
        exception was raised.
    """
    try:
        # Navigate if URL provided
        if url:
            # Force English. The separator is chosen per parameter so a URL
            # without a query string never gets a bare "&".
            if 'hl=' not in url:
                url += ('&' if '?' in url else '?') + 'hl=en'
            if 'gl=' not in url:
                url += ('&' if '?' in url else '?') + 'gl=us'
            driver.get(url)
            time.sleep(1)

        # Click About tab using robust selectors
        clicked = driver.execute_script("""
// Try multiple selectors for about tab
var selectors = [
'button[aria-label*="About"]',
'button[data-tab-index="2"]',
'div[role="tablist"] button:nth-child(3)',
'button[jsaction*="about"]'
];
for (var sel of selectors) {
var btn = document.querySelector(sel);
if (btn && btn.textContent.toLowerCase().includes('about')) {
btn.click();
return true;
}
}
// Fallback: find by text content
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
if (btn.textContent.trim().toLowerCase() === 'about') {
btn.click();
return true;
}
}
return false;
""")
        if not clicked:
            return {}

        time.sleep(1.5)  # Wait for about tab to load

        # Extract about sections using aria-labels (robust)
        about = driver.execute_script("""
var about = {};
// Find the about region by aria-label or role
var container = document.querySelector('div[role="region"][aria-label*="About"]');
if (!container) {
// Fallback: look for the scrollable area with sections
container = document.querySelector('.m6QErb[aria-label*="About"]');
}
if (!container) {
// Last resort: find sections by h2 headers
container = document;
}
// Find all section headers (h2 elements)
var sections = container.querySelectorAll('h2');
for (var h2 of sections) {
var sectionName = h2.textContent.trim();
var items = [];
// Find the ul list following this h2
var parent = h2.closest('.iP2t7d, div');
if (parent) {
var listItems = parent.querySelectorAll('li span[aria-label]');
for (var li of listItems) {
var label = li.getAttribute('aria-label');
if (label) {
// Parse "Has toilet" or "No wheelchair-accessible car park"
var hasFeature = !label.toLowerCase().startsWith('no ');
var featureName = label.replace(/^(Has |No )/i, '');
items.push({
feature: featureName,
available: hasFeature
});
}
}
}
if (sectionName && items.length > 0) {
about[sectionName] = items;
}
}
return about;
""")
        return about or {}
    except Exception as e:
        # Callers receive the failure as data rather than an exception
        return {"error": str(e)}
# Test function
if __name__ == "__main__":
    # Manual smoke test: scrape a known small listing and print one review.
    from seleniumbase import Driver

    # Test URL - 79 reviews
    TEST_URL = "https://www.google.com/maps/place/R.+Fleitas+Peluqueros/@28.1302986,-15.4448111,821m/data=!3m1!1e3!4m6!3m5!1s0xc40951a43c21f19:0x85f89601b9909c72!8m2!3d28.1299805!4d-15.4436854!16s%2Fg%2F11gbwtk8c8"

    print("🚀 Starting clean scraper test...")

    # Set up driver
    driver = Driver(uc=True, headless=False)
    driver.set_window_size(1200, 900)

    try:
        result = scrape_reviews(driver, TEST_URL, max_reviews=100, timeout_no_new=15)
        print(f"\n✅ Got {result['total']} reviews in {result['checks']} checks")

        # Show the first review, if any came back
        if result["reviews"]:
            print("\n📝 Sample review:")
            sample = result["reviews"][0]
            print(f" Author: {sample['author']}")
            print(f" Rating: {sample['rating']}")
            if sample['text']:
                print(f" Text: {sample['text'][:100]}...")
            else:
                print(" Text: (none)")
    finally:
        # Always release the browser, even when the scrape raised
        driver.quit()
        print("\n🏁 Done")
def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> dict:
"""
Extract business card info from Google Maps.
Uses the same efficient polling navigation as scrape_reviews (no fixed waits).
Returns:
dict with: name, address, rating, total_reviews, success, error, time
"""
from seleniumbase import Driver
import logging
log = logging.getLogger(__name__)
start_time = time.time()
driver_provided = driver is not None
should_close_driver = not return_driver and not driver_provided
try:
# Create driver if not provided
if not driver:
driver = Driver(uc=True, headless=headless)
# Set geolocation to US
try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
})
except:
pass
# Don't clear state - Google may serve different content based on session history
# The scraper doesn't reset state, so validation shouldn't either
# Force English interface for consistent parsing
if 'hl=' not in url:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
if 'gl=' not in url:
url = f"{url}&gl=us"
# Navigate to URL
driver.get(url)
# Handle consent popup - poll with 10ms sleep (same as scrape_reviews)
start = time.time()
while time.time() - start < 5:
if "consent.google" in driver.current_url:
try:
# Try multiple approaches to find and click accept button
clicked = False
# Method 1: Find by aria-label (most reliable for Google consent)
for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
btn.click()
clicked = True
break
# Method 2: Find by text content
if not clicked:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
clicked = True
break
if clicked:
time.sleep(0.5) # Brief wait for consent to process
driver.get(url) # Reload the target URL
time.sleep(0.5) # Wait for reload
except Exception as e:
pass
break
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Log current URL after consent handling
try:
current_url = driver.current_url
log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
except:
pass
# Wait for page to fully render before polling (tabs may load dynamically)
time.sleep(2)
# Poll for business info (same pattern as total_reviews extraction)
# Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
start = time.time()
debug_logged = False
while time.time() - start < 10:
try:
info = driver.execute_script("""
var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
// Category - use jsaction attribute (robust, survives class changes)
var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Fallback: look for button after rating that's not a link
if (!result.category) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent.trim();
// Categories are short words, no numbers, not navigation
if (text && text.length < 50 && !text.match(/^[0-9]/) &&
!text.match(/review|star|direction|save|share|photo/i)) {
// Check if it's near the rating area
var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
if (parent) {
result.category = text;
break;
}
}
}
}
// Rating from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
// Collect debug info for all aria-labels
if (label) {
result.debug.push('img-aria: ' + label);
}
// Rating: "4.8 stars" (English forced via hl=en)
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
// Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
// Try direct format first: "79 reviews"
var revMatch = label.match(/^([\\d,]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
}
// Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
if (!result.total_reviews) {
var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
if (combinedMatch) {
var countStr = combinedMatch[1].replace(/,/g, '');
if (countStr.includes('k')) {
// Handle "9k+" format
result.total_reviews = parseInt(countStr) * 1000;
} else {
result.total_reviews = parseInt(countStr);
}
}
}
}
// Also collect tab button texts for debugging (include full text including numbers)
var tabs = document.querySelectorAll('button[role="tab"]');
for (var j = 0; j < tabs.length; j++) {
var tabText = tabs[j].textContent.trim();
result.debug.push('tab: ' + tabText);
// Also try to extract review count from tab text like "Reviews (79)"
if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
var tabMatch = tabText.match(/\\((\\d+)\\)/);
if (tabMatch) {
result.total_reviews = parseInt(tabMatch[1]);
result.debug.push('Found reviews in tab: ' + tabText);
}
}
}
// Also check ALL buttons for reviews count
var allButtons = document.querySelectorAll('button');
for (var b = 0; b < allButtons.length; b++) {
var btnText = allButtons[b].textContent || '';
if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
var numMatch = btnText.match(/\\((\\d+)\\)/);
if (numMatch && !result.total_reviews) {
result.total_reviews = parseInt(numMatch[1]);
result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
}
}
}
// Check if we're on search results vs place page
result.debug.push('title: ' + document.title);
result.debug.push('url: ' + window.location.href.substring(0, 80));
// Check for search results list
var searchResults = document.querySelectorAll('div[role="feed"] > div');
result.debug.push('search_results_count: ' + searchResults.length);
// Fallback: Get review count from Reviews tab button "Reviews (79)"
// Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
if (!result.total_reviews) {
var tabs = document.querySelectorAll('button[role="tab"]');
for (var tab of tabs) {
var text = tab.textContent.toLowerCase();
if (text.includes('review')) {
var match = tab.textContent.match(/\\((\\d+)\\)/);
if (match) {
result.total_reviews = parseInt(match[1]);
break;
}
}
}
}
// Fallback 2: Look for any button with "Reviews" and a number
if (!result.total_reviews) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent;
if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
var numMatch = text.match(/\\((\\d+)\\)/);
if (numMatch) {
result.total_reviews = parseInt(numMatch[1]);
break;
}
}
}
}
// Address from button
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""")
# Exit early if we have the essentials (name found AND reviews count > 0)
if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
break
# Log debug info once after 3 seconds
if not debug_logged and time.time() - start > 3:
debug_logged = True
debug_info = info.get("debug", [])
if debug_info:
log.info(f"🔍 Validation debug - URL: {url[:50]}...")
log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
for d in debug_info[:10]: # First 10 debug items
log.info(f" {d}")
except:
pass
time.sleep(0.1) # 100ms between polls
# Final debug log if still no reviews
if not info.get("total_reviews"):
debug_info = info.get("debug", [])
log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
if debug_info:
log.warning(f" Debug items: {debug_info[:10]}")
return {
"name": info.get("name"),
"address": info.get("address"),
"rating": info.get("rating"),
"total_reviews": info.get("total_reviews"),
"category": info.get("category"),
"success": bool(info.get("name")),
"error": None,
"time": time.time() - start_time
}
except Exception as e:
return {
"name": None,
"address": None,
"rating": None,
"total_reviews": None,
"category": None,
"success": False,
"error": str(e),
"time": time.time() - start_time
}
finally:
if should_close_driver and driver:
try:
driver.quit()
except:
pass