""" Google Reviews Scraper v1.2.0 This module provides the core Google Maps reviews scraping functionality. - Simple down scrolling - DOM scraping + API interception - Multi-sort strategy to bypass ~1000 review limit - Session handoff: reuse browser between validation and scraping Version: 1.2.0 Based on: v1.1.0 New features: - Session handoff support for efficiency - Resume scraping from validated browser session - Skip navigation/consent when resuming from validation - Session manager integration """ # Sort strategy constants SORT_NEWEST = "newest" SORT_LOWEST = "lowest" SORT_HIGHEST = "highest" SORT_RELEVANT = "relevant" SORT_AUTO = "auto" DEFAULT_SORT_ORDER = [SORT_NEWEST, SORT_LOWEST, SORT_HIGHEST, SORT_RELEVANT] MULTI_SORT_THRESHOLD = 1000 # Auto-enable multi-sort if total > this DIMINISHING_RETURNS_PCT = 5 # Stop if pass yields < 5% new reviews import re import json import time import threading from datetime import datetime from typing import List, Optional from selenium.webdriver.common.by import By from utils.logger import StructuredLogger def get_chrome_memory(driver) -> Optional[int]: """Get Chrome memory usage in MB using CDP.""" try: # Use CDP Performance.getMetrics result = driver.execute_cdp_cmd('Performance.getMetrics', {}) for metric in result.get('metrics', []): if metric['name'] == 'JSHeapUsedSize': return int(metric['value'] / 1024 / 1024) except: pass return None def get_dom_node_count(driver) -> Optional[int]: """Get DOM node count.""" try: return driver.execute_script("return document.getElementsByTagName('*').length") except: return None def capture_session_fingerprint(driver) -> dict: """ Capture browser session fingerprint for bot detection analysis. This captures various browser attributes that can be used to: 1. Verify bot detection evasion is working 2. Debug issues when scraping fails 3. Track session characteristics for analysis Args: driver: Selenium WebDriver instance (must be initialized) Returns: Dictionary containing session fingerprint data """ fingerprint = { "user_agent": None, "platform": None, "language": None, "languages": None, "timezone": None, "screen": { "width": None, "height": None, "colorDepth": None }, "viewport": { "width": None, "height": None }, "webgl_vendor": None, "webgl_renderer": None, "canvas_fingerprint": None, "hardware_concurrency": None, "device_memory": None, "bot_detection_tests": { "webdriver_hidden": None, "chrome_runtime": None, "permissions_query": None }, "captured_at": None } try: # Navigate to about:blank first to ensure we can execute JS # (in case driver was just created and hasn't navigated yet) current_url = driver.current_url if not current_url or current_url == "data:,": driver.get("about:blank") # Capture timestamp fingerprint["captured_at"] = datetime.now().isoformat() # Basic navigator properties try: fingerprint["user_agent"] = driver.execute_script("return navigator.userAgent") except: pass try: fingerprint["platform"] = driver.execute_script("return navigator.platform") except: pass try: fingerprint["language"] = driver.execute_script("return navigator.language") except: pass try: fingerprint["languages"] = driver.execute_script("return navigator.languages") except: pass try: fingerprint["timezone"] = driver.execute_script( "return Intl.DateTimeFormat().resolvedOptions().timeZone" ) except: pass # Screen properties try: fingerprint["screen"]["width"] = driver.execute_script("return screen.width") fingerprint["screen"]["height"] = driver.execute_script("return screen.height") fingerprint["screen"]["colorDepth"] = driver.execute_script("return screen.colorDepth") except: pass # Viewport properties try: fingerprint["viewport"]["width"] = driver.execute_script("return window.innerWidth") fingerprint["viewport"]["height"] = driver.execute_script("return window.innerHeight") except: pass # WebGL vendor and renderer (important for fingerprinting) try: webgl_info = driver.execute_script(""" try { var canvas = document.createElement('canvas'); var gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl'); if (gl) { var debugInfo = gl.getExtension('WEBGL_debug_renderer_info'); if (debugInfo) { return { vendor: gl.getParameter(debugInfo.UNMASKED_VENDOR_WEBGL), renderer: gl.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL) }; } } } catch(e) {} return {vendor: null, renderer: null}; """) fingerprint["webgl_vendor"] = webgl_info.get("vendor") fingerprint["webgl_renderer"] = webgl_info.get("renderer") except: pass # Canvas fingerprint (hash of canvas drawing) try: canvas_hash = driver.execute_script(""" try { var canvas = document.createElement('canvas'); canvas.width = 200; canvas.height = 50; var ctx = canvas.getContext('2d'); ctx.textBaseline = 'top'; ctx.font = '14px Arial'; ctx.fillStyle = '#f60'; ctx.fillRect(125, 1, 62, 20); ctx.fillStyle = '#069'; ctx.fillText('Fingerprint', 2, 15); ctx.fillStyle = 'rgba(102, 204, 0, 0.7)'; ctx.fillText('Fingerprint', 4, 17); var dataUrl = canvas.toDataURL(); // Simple hash var hash = 0; for (var i = 0; i < dataUrl.length; i++) { var char = dataUrl.charCodeAt(i); hash = ((hash << 5) - hash) + char; hash = hash & hash; } return hash.toString(16); } catch(e) { return null; } """) fingerprint["canvas_fingerprint"] = canvas_hash except: pass # Hardware info try: fingerprint["hardware_concurrency"] = driver.execute_script( "return navigator.hardwareConcurrency" ) except: pass try: fingerprint["device_memory"] = driver.execute_script( "return navigator.deviceMemory" ) except: pass # Bot detection tests try: # Test 1: webdriver property should be hidden/false for undetected Chrome webdriver_hidden = driver.execute_script( "return navigator.webdriver === undefined || navigator.webdriver === false" ) fingerprint["bot_detection_tests"]["webdriver_hidden"] = webdriver_hidden except: pass try: # Test 2: chrome runtime should exist in real Chrome chrome_runtime = driver.execute_script( "return typeof window.chrome !== 'undefined'" ) fingerprint["bot_detection_tests"]["chrome_runtime"] = chrome_runtime except: pass try: # Test 3: permissions.query should work in real Chrome permissions_query = driver.execute_script(""" try { if (navigator.permissions && navigator.permissions.query) { return true; } return false; } catch(e) { return false; } """) fingerprint["bot_detection_tests"]["permissions_query"] = permissions_query except: pass except Exception as e: fingerprint["capture_error"] = str(e) return fingerprint def classify_crash(exception: Exception, metrics_history: list) -> str: """Classify crash type based on exception and metrics.""" error_str = str(exception).lower() if 'aw, snap' in error_str or 'status_access_violation' in error_str: return 'tab_crash' if 'timeout' in error_str: return 'timeout' if metrics_history and metrics_history[-1].get('memory_mb', 0) > 400: return 'memory_exhaustion' if 'no such element' in error_str: return 'element_not_found' if '429' in error_str or 'rate' in error_str: return 'rate_limited' if 'network' in error_str or 'connection' in error_str: return 'network_failure' return 'unknown' class ScraperCrashException(Exception): """Exception that carries crash report data for analysis.""" def __init__(self, original_exception, crash_report): self.original_exception = original_exception self.crash_report = crash_report super().__init__(str(original_exception)) def get_topic_variants(topic: str) -> List[str]: """ Generate common variants of a topic word for matching. Handles: - Singular/plural forms - Verb forms (-ing, -ed, -s) - Common stemming patterns Args: topic: The topic word/phrase to generate variants for Returns: List of variant strings including the original Example: >>> get_topic_variants("cutting") ["cutting", "cut", "cuts"] >>> get_topic_variants("service") ["service", "services", "servicing"] """ if not topic: return [] topic = topic.lower().strip() variants = {topic} # Use set to avoid duplicates # Handle -ing forms (cutting -> cut, cuts) if topic.endswith("ing"): base = topic[:-3] # Remove -ing if base: variants.add(base) variants.add(base + "s") # Handle doubled consonants (cutting -> cut) if len(base) >= 2 and base[-1] == base[-2]: single_consonant = base[:-1] variants.add(single_consonant) variants.add(single_consonant + "s") # Handle -s/-es plural forms (services -> service) if topic.endswith("es") and len(topic) > 2: variants.add(topic[:-2]) # Remove -es variants.add(topic[:-2] + "ing") elif topic.endswith("s") and len(topic) > 1 and not topic.endswith("ss"): variants.add(topic[:-1]) # Remove -s variants.add(topic[:-1] + "ing") # Handle -ed forms (colored -> color) if topic.endswith("ed") and len(topic) > 2: base = topic[:-2] if base: variants.add(base) variants.add(base + "s") variants.add(base + "ing") # Handle doubled consonants (colored -> color from coloured) if len(base) >= 2 and base[-1] == base[-2]: single_consonant = base[:-1] variants.add(single_consonant) # Add common forms if base word (no suffix detected) if not (topic.endswith("ing") or topic.endswith("ed") or topic.endswith("s")): variants.add(topic + "s") variants.add(topic + "ing") # Handle consonant doubling for -ing (cut -> cutting) if len(topic) >= 2 and topic[-1] not in "aeiouwy": variants.add(topic + topic[-1] + "ing") return list(variants) def infer_review_topics(review_text: str, topics: List[dict]) -> List[str]: """ Match review text against extracted topic keywords. Args: review_text: The review text to analyze topics: List of topic dicts, e.g., [{"topic": "cutting", "count": 3}] Returns: List of matched topic names Example: >>> topics = [{"topic": "hair salon", "count": 4}, {"topic": "cutting", "count": 3}] >>> text = "Great haircut! The cutting was professional." >>> infer_review_topics(text, topics) ["cutting"] """ # Handle empty/None inputs gracefully if not review_text or not topics: return [] review_text_lower = review_text.lower() matched_topics = [] for topic_dict in topics: topic = topic_dict.get("topic", "") if not topic: continue topic_lower = topic.lower().strip() # Get all variants of the topic variants = get_topic_variants(topic_lower) # Check each variant for word boundary match for variant in variants: if not variant: continue # Use word boundary regex to avoid partial matches # \b ensures we match whole words only # E.g., "cut" won't match "execute" or "cutlery" partially pattern = r'\b' + re.escape(variant) + r'\b' if re.search(pattern, review_text_lower): matched_topics.append(topic) # Use original topic name break # Found a match, no need to check other variants return matched_topics class LogCapture: """ Backward-compatible wrapper around StructuredLogger. Maintains the original LogCapture API while using StructuredLogger internally. This allows existing code to continue working while gaining structured logging benefits. """ def __init__(self): self._logger = StructuredLogger() def log(self, message: str, level: str = "INFO", source: str = "scraper"): """Add a log entry with timestamp (backward compatible).""" # Map source to category category = self._source_to_category(source) level_upper = level.upper() if level_upper == "ERROR": self._logger.error(category, message) elif level_upper == "WARNING" or level_upper == "WARN": self._logger.warn(category, message) elif level_upper == "DEBUG": self._logger.debug(category, message) else: self._logger.info(category, message) # Also print for console visibility print(message, flush=True) def info(self, category_or_msg, message: str = None, *, metrics: dict = None): """ Log an INFO message. Supports both old API: info(message, source) And new API: info(category, message, metrics={...}) """ if message is None: # Old API: info(message) or info(message, source) self._logger.info('scraper', category_or_msg, metrics=metrics) print(category_or_msg, flush=True) else: # New API: info(category, message, metrics={...}) self._logger.info(category_or_msg, message, metrics=metrics) print(message, flush=True) def warning(self, category_or_msg, message: str = None, *, metrics: dict = None): """Log a WARNING message (supports both old and new API).""" if message is None: self._logger.warn('scraper', category_or_msg, metrics=metrics) print(category_or_msg, flush=True) else: self._logger.warn(category_or_msg, message, metrics=metrics) print(message, flush=True) def warn(self, category, message: str, *, metrics: dict = None): """Log a WARN message with category (new API).""" self._logger.warn(category, message, metrics=metrics) print(message, flush=True) def error(self, category_or_msg, message: str = None, *, metrics: dict = None): """Log an ERROR message (supports both old and new API).""" if message is None: self._logger.error('scraper', category_or_msg, metrics=metrics) print(category_or_msg, flush=True) else: self._logger.error(category_or_msg, message, metrics=metrics) print(message, flush=True) def debug(self, category, message: str, *, metrics: dict = None): """Log a DEBUG message with category (new API).""" self._logger.debug(category, message, metrics=metrics) print(message, flush=True) def get_logs(self): """Get all log entries as JSON-serializable dictionaries.""" return self._logger.get_logs() def _source_to_category(self, source: str) -> str: """Map legacy source names to StructuredLogger categories.""" source_lower = source.lower() if source else 'scraper' if source_lower in ('browser', 'navigation', 'page'): return 'browser' elif source_lower in ('network', 'api'): return 'network' elif source_lower in ('system', 'memory', 'chrome'): return 'system' else: return 'scraper' def parse_api_review(raw: list) -> dict: """Parse a review from API response array.""" try: if not isinstance(raw, list) or len(raw) < 5: return None author = raw[0] if len(raw) > 0 and isinstance(raw[0], str) else "" timestamp = raw[1] if len(raw) > 1 else "" text = raw[3] if len(raw) > 3 and isinstance(raw[3], str) else "" rating = raw[4] if len(raw) > 4 and isinstance(raw[4], int) else 0 if not (1 <= rating <= 5): return None # Filter out garbage data (language codes, metadata, etc.) if len(author) <= 3: # Real names are longer than 3 chars return None if author.lower() in ['google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt']: return None # Timestamp should look like a date, not a URL or language code if timestamp and ('http' in str(timestamp) or len(str(timestamp)) <= 3): return None # Owner response owner_response = None for idx in [9, 18]: if len(raw) > idx and raw[idx] and isinstance(raw[idx], list): resp = raw[idx] if len(resp) > 1: owner_response = {"text": resp[1], "timestamp": resp[0] if resp[0] else ""} break return { "author": author, "text": text, "rating": rating, "timestamp": timestamp, "owner_response": owner_response, "source": "api" } except: return None def extract_reviews_from_api_body(body: str) -> list: """Extract reviews from API response body using correct Google Maps structure.""" reviews = [] try: # Remove )]}' prefix if body.startswith(")]}'"): body = body[4:].strip() data = json.loads(body) # Google Maps API structure: data[2] contains review arrays # Each review: data[2][X][0] where: # Author: [1][4][5][0] # Rating: [2][0][0] # Text: [2][15][0][0] # Time: [1][6] if not isinstance(data, list) or len(data) < 3: return reviews reviews_area = data[2] if not isinstance(reviews_area, list): return reviews for item in reviews_area: try: if not isinstance(item, list) or len(item) < 1: continue review_data = item[0] if not isinstance(review_data, list) or len(review_data) < 3: continue # Extract fields using correct paths review_id = "" author = "" rating = 0 text = "" timestamp = "" # Review ID: [0] - same format as DOM's data-review-id try: review_id = review_data[0] except (IndexError, TypeError): pass # Author: [1][4][5][0] try: author = review_data[1][4][5][0] except (IndexError, TypeError): pass # Rating: [2][0][0] try: rating = review_data[2][0][0] except (IndexError, TypeError): pass # Text: [2][15][0][0] try: text = review_data[2][15][0][0] except (IndexError, TypeError): pass # Timestamp: [1][6] try: timestamp = review_data[1][6] except (IndexError, TypeError): pass # Validate and add (include review_id for deduplication) if author and isinstance(rating, int) and 1 <= rating <= 5: reviews.append({ "review_id": review_id, "author": author, "text": text or "", "rating": rating, "timestamp": timestamp or "", "source": "api" }) except: continue except: pass return reviews def parse_dom_review(card) -> dict: """Parse a review from DOM element.""" try: # Get review ID review_id = card.get_attribute("data-review-id") or "" if not review_id: try: id_el = card.find_element(By.CSS_SELECTOR, "[data-review-id]") review_id = id_el.get_attribute("data-review-id") or "" except: pass # Author - multiple selectors author = "" for sel in ['div[class*="d4r55"]', '.d4r55', 'button[data-review-id] + div']: try: author_el = card.find_element(By.CSS_SELECTOR, sel) author = author_el.text.strip() if author: break except: pass # Rating from aria-label on span[role="img"] rating = 0 try: stars_el = card.find_element(By.CSS_SELECTOR, 'span[role="img"]') aria = stars_el.get_attribute("aria-label") or "" # Extract number from label (handles "5 stars", "5 estrellas", etc.) num = re.search(r'[\d\.]+', aria.replace(',', '.')) if num: rating = int(float(num.group())) except: pass # Review text - try multiple selectors text = "" for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', 'div.MyEned span.wiI7pd', '.wiI7pd']: try: text_el = card.find_element(By.CSS_SELECTOR, sel) text = text_el.text.strip() if text: break except: pass # Note: "More" button clicking removed for speed # Full text can be expanded later if needed # Timestamp timestamp = "" try: time_el = card.find_element(By.CSS_SELECTOR, 'span[class*="rsqaWe"]') timestamp = time_el.text.strip() except: pass # Owner response owner_response = None try: resp_box = card.find_element(By.CSS_SELECTOR, "div.CDe7pd") if resp_box: resp_text = "" resp_date = "" try: resp_text_el = resp_box.find_element(By.CSS_SELECTOR, "div.wiI7pd") resp_text = resp_text_el.text.strip() except: pass try: resp_date_el = resp_box.find_element(By.CSS_SELECTOR, "span.DZSIDd") resp_date = resp_date_el.text.strip() except: pass if resp_text: owner_response = {"text": resp_text, "timestamp": resp_date} except: pass if not review_id and not author: return None return { "id": review_id, "author": author, "text": text, "rating": rating, "timestamp": timestamp, "owner_response": owner_response, "source": "dom" } except Exception: return None def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None, progress_callback=None, validation_only: bool = False, sort_strategy: str = SORT_AUTO, sort_order: List[str] = None, multi_sort_threshold: int = MULTI_SORT_THRESHOLD, close_enough_pct: float = 95.0, initial_sort: str = None, resume_from_validation: bool = False, validated_business_info: dict = None, validated_total_reviews: int = None) -> dict: """ Scrape Google Maps reviews with optional multi-sort strategy. Args: driver: Selenium WebDriver instance url: Google Maps place URL max_reviews: Maximum reviews to collect timeout_no_new: Seconds to wait with no new reviews before stopping flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews This allows streaming data to disk and freeing memory flush_batch_size: Number of reviews to collect before flushing (default 500) log_capture: Optional LogCapture instance for storing logs progress_callback: Optional callback(current_count, total_count) called every iteration validation_only: If True, only validate the business exists (no scraping) sort_strategy: Sort strategy - "auto", "newest", "lowest", "highest", "relevant", or "multi" - "auto": Use multi-sort if total > threshold or first pass incomplete - "multi": Force multi-sort through all sort orders - Others: Single sort mode sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant) multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000) close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0) initial_sort: Initial sort order to use (default: newest). Used for retry with different sort resume_from_validation: If True, skip navigation/consent (browser already on page from validation) validated_business_info: Pre-extracted business info from validation phase validated_total_reviews: Pre-extracted total review count from validation phase Returns: dict with reviews list and metadata """ # Use provided log_capture or create a dummy that just prints log = log_capture or LogCapture() # Capture session fingerprint early (before navigation) for bot detection analysis session_fingerprint = capture_session_fingerprint(driver) log.info('browser', "Session fingerprint captured", metrics={ 'user_agent': session_fingerprint.get('user_agent', 'unknown')[:50] + '...' if session_fingerprint.get('user_agent') else 'unknown', 'platform': session_fingerprint.get('platform'), 'timezone': session_fingerprint.get('timezone'), 'webdriver_hidden': session_fingerprint.get('bot_detection_tests', {}).get('webdriver_hidden'), 'chrome_runtime': session_fingerprint.get('bot_detection_tests', {}).get('chrome_runtime') }) # Storage - use review ID as key reviews = {} # review_id -> review seen_ids = set() # Track all IDs we've seen (persists after flush) total_flushed = [0] # Use list for closure mutation review_order = {} # review_id -> position (DOM visual order for sorting) order_counter = [0] # Current order position # Track total reviews (persists across refreshes) total_reviews = [None] # Use list for closure mutation # Store business info extracted from overview (before clicking reviews tab) business_info_cache = [None] # If resuming from validation, populate with pre-extracted data if resume_from_validation: if validated_total_reviews: total_reviews[0] = validated_total_reviews log.info('scraper', f"Resuming from validation: {validated_total_reviews} total reviews", metrics={'total_reviews': validated_total_reviews}) if validated_business_info: business_info_cache[0] = validated_business_info log.info('scraper', f"Resuming from validation: {validated_business_info.get('name', 'unknown')}") # Hard refresh counter hard_refresh_count = [0] max_hard_refreshes = 3 # Max number of hard refreshes before giving up # Find scrollable reviews container helper def find_scroll_container(): selectors = [ "div.m6QErb.DxyBCb.kA9KIf.dS8AEf", "div.m6QErb.DxyBCb.kA9KIf", "div.m6QErb.DxyBCb", "div.m6QErb[aria-label]", "div.DxyBCb.kA9KIf.dS8AEf", "div[role='main'] div.m6QErb", ] for sel in selectors: try: els = driver.find_elements(By.CSS_SELECTOR, sel) for el in els: if el.is_displayed() and el.size['height'] > 100: return el except: pass return None def change_sort_order(target_sort: str) -> bool: """ Change the review sort order in the UI. Args: target_sort: One of "newest", "lowest", "highest", "relevant" Returns: True if sort was changed successfully, False otherwise """ # Mapping of our sort names to UI text patterns sort_patterns = { SORT_NEWEST: ['newest', 'recent', 'más reciente', 'neueste', 'plus récent'], SORT_LOWEST: ['lowest', 'rating: low', 'puntuación: baja', 'niedrigste'], SORT_HIGHEST: ['highest', 'rating: high', 'puntuación: alta', 'höchste'], SORT_RELEVANT: ['relevant', 'relevance', 'más relevante', 'relevanteste'], } patterns = sort_patterns.get(target_sort, [target_sort]) try: # Click sort button to open menu # Note: The button shows current sort value (e.g., "Most relevant", "Newest") # IMPORTANT: There are two HQzyZ buttons - "All reviews" (filter) and sort button # We need the one with sort values, NOT "All reviews" # The button textContent includes nested elements like "SortMost relevant" so we use includes() sort_btn = driver.execute_script(""" // Sort option values (what the button displays) - use lowercase for matching var sortValues = ['most relevant', 'newest', 'highest rating', 'lowest rating', 'más relevantes', 'más recientes', 'puntuación más alta', 'puntuación más baja', 'relevantes', 'recientes']; // Exclusion patterns - buttons we should NOT click var excludePatterns = ['all reviews', 'todas las reseñas', 'alle bewertungen']; function normalizeText(text) { return text.toLowerCase().replace(/\\s+/g, ' ').trim(); } function isExcluded(text) { var normalized = normalizeText(text); for (var i = 0; i < excludePatterns.length; i++) { if (normalized.includes(excludePatterns[i])) return true; } return false; } function matchesSortValue(text) { var normalized = normalizeText(text); for (var i = 0; i < sortValues.length; i++) { if (normalized.includes(sortValues[i])) return true; } return false; } // Method 1: Find button with aria-haspopup containing sort value (not excluded) var btns = document.querySelectorAll('button[aria-haspopup="true"]'); for (var i = 0; i < btns.length; i++) { var text = btns[i].textContent; if (!isExcluded(text) && matchesSortValue(text)) { return btns[i]; } } // Method 2: Button with S9kvJb class (Google's sort button class) btns = document.querySelectorAll('button.S9kvJb'); for (var i = 0; i < btns.length; i++) { var text = btns[i].textContent; if (!isExcluded(text) && matchesSortValue(text)) { return btns[i]; } } // Method 3: Any button containing sort value text btns = document.querySelectorAll('button'); for (var i = 0; i < btns.length; i++) { var text = btns[i].textContent; if (!isExcluded(text) && matchesSortValue(text)) { return btns[i]; } } // Method 4: Look for button near a "Sort" label var spans = document.querySelectorAll('span'); for (var i = 0; i < spans.length; i++) { if (normalizeText(spans[i].textContent) === 'sort' || normalizeText(spans[i].textContent) === 'ordenar') { var parent = spans[i].closest('button'); if (parent && !isExcluded(parent.textContent)) { return parent; } } } return null; """) if not sort_btn: # Debug: check what buttons exist - focus on aria-haspopup buttons btn_debug = driver.execute_script(""" var result = []; // Show all buttons with aria-haspopup (dropdown buttons) var dropdowns = document.querySelectorAll('button[aria-haspopup="true"]'); result.push('Dropdown buttons (' + dropdowns.length + '):'); for (var i = 0; i < dropdowns.length && i < 5; i++) { var text = dropdowns[i].textContent.replace(/\\s+/g, ' ').trim().substring(0, 50); result.push(' [' + i + ']: "' + text + '"'); } // Look for any element with "Sort" or "relevant" text var sortKeywords = ['sort', 'relevant', 'newest', 'highest', 'lowest']; var btns = document.querySelectorAll('button'); var sortBtns = []; for (var i = 0; i < btns.length; i++) { var text = btns[i].textContent.toLowerCase(); for (var k = 0; k < sortKeywords.length; k++) { if (text.includes(sortKeywords[k])) { sortBtns.push(btns[i].textContent.replace(/\\s+/g, ' ').trim().substring(0, 40)); break; } } } if (sortBtns.length > 0) { result.push('Sort-related buttons: ' + JSON.stringify(sortBtns.slice(0, 5))); } return result; """) log.warn('browser', f"Could not find sort button for {target_sort}. Debug: {btn_debug}") return False # Use JavaScript click (more reliable than Selenium click) driver.execute_script("arguments[0].click();", sort_btn) time.sleep(0.5) # Wait for menu to appear with retries menu_found = False for attempt in range(3): menu_items = driver.execute_script(""" var items = []; // Check for menuitemradio (Google's standard) var radios = document.querySelectorAll('[role="menuitemradio"]'); for (var i = 0; i < radios.length; i++) { items.push(radios[i].textContent.trim()); } // Also check for menuitem if (items.length === 0) { var menuItems = document.querySelectorAll('[role="menuitem"]'); for (var i = 0; i < menuItems.length; i++) { items.push(menuItems[i].textContent.trim()); } } // Check menu containers if (items.length === 0) { var menus = document.querySelectorAll('[role="menu"], [role="listbox"]'); for (var m = 0; m < menus.length; m++) { var divs = menus[m].querySelectorAll('div'); for (var i = 0; i < divs.length; i++) { var text = divs[i].textContent.trim(); if (text.length > 2 && text.length < 30) { items.push(text); } } } } return items; """) if menu_items and len(menu_items) > 0: menu_found = True log.info('browser', f"Menu opened, items: {menu_items[:4]}") break time.sleep(0.5) if not menu_found: log.warn('browser', f"Menu did not appear after clicking sort button") # Click the target sort option - try multiple selector strategies # Use more comprehensive patterns including exact menu text extended_patterns = { SORT_NEWEST: ['newest', 'recent', 'más reciente', 'neueste', 'plus récent', 'más recientes'], SORT_LOWEST: ['lowest', 'rating: low', 'puntuación: baja', 'niedrigste', 'puntuación más baja'], SORT_HIGHEST: ['highest', 'rating: high', 'puntuación: alta', 'höchste', 'puntuación más alta'], SORT_RELEVANT: ['most relevant', 'relevant', 'relevance', 'más relevante', 'relevanteste', 'más relevantes'], } search_patterns = extended_patterns.get(target_sort, patterns) patterns_js = json.dumps(search_patterns) clicked = driver.execute_script(f""" var patterns = {patterns_js}; function textMatches(txt, patterns) {{ txt = txt.toLowerCase().trim(); for (var p = 0; p < patterns.length; p++) {{ if (txt.includes(patterns[p])) return true; }} return false; }} // Strategy 1: menuitemradio elements (Google's standard) var items = document.querySelectorAll('[role="menuitemradio"]'); for (var i = 0; i < items.length; i++) {{ if (textMatches(items[i].textContent, patterns)) {{ items[i].click(); return 'menuitemradio'; }} }} // Strategy 2: menuitem elements items = document.querySelectorAll('[role="menuitem"]'); for (var i = 0; i < items.length; i++) {{ if (textMatches(items[i].textContent, patterns)) {{ items[i].click(); return 'menuitem'; }} }} // Strategy 3: menu items with data-index items = document.querySelectorAll('[data-index]'); for (var i = 0; i < items.length; i++) {{ if (textMatches(items[i].textContent, patterns)) {{ items[i].click(); return 'data-index'; }} }} // Strategy 4: Any element in a menu/listbox container var menus = document.querySelectorAll('[role="menu"], [role="listbox"]'); for (var m = 0; m < menus.length; m++) {{ var children = menus[m].querySelectorAll('*'); for (var i = 0; i < children.length; i++) {{ var el = children[i]; // Only click leaf elements with matching text if (el.children.length === 0 || el.tagName === 'SPAN') {{ if (textMatches(el.textContent, patterns) && el.textContent.length < 50) {{ el.click(); return 'menu-child'; }} }} }} }} // Strategy 5: Any visible div with jsaction that matches var allDivs = document.querySelectorAll('div[jsaction]'); for (var i = 0; i < allDivs.length; i++) {{ var txt = allDivs[i].textContent; var visible = allDivs[i].offsetParent !== null; if (visible && textMatches(txt, patterns) && txt.length < 50) {{ allDivs[i].click(); return 'jsaction-div'; }} }} return false; """) if clicked: time.sleep(0.5) log.info('browser', f"Sorted by {target_sort} (via {clicked})") return True else: # Debug: log what menu items we can see and any visible elements with sort text menu_items = driver.execute_script(""" var items = []; // Check standard menu elements var all = document.querySelectorAll('[role="menuitemradio"], [role="menuitem"], [data-index]'); for (var i = 0; i < all.length && i < 10; i++) { items.push(all[i].textContent.trim().substring(0, 30)); } // If nothing found, look for any visible element with sort-related text if (items.length === 0) { var allVisible = document.querySelectorAll('div, span'); var sortTexts = ['newest', 'relevant', 'highest', 'lowest']; for (var i = 0; i < allVisible.length && items.length < 10; i++) { var txt = allVisible[i].textContent.toLowerCase(); for (var s = 0; s < sortTexts.length; s++) { if (txt.includes(sortTexts[s]) && txt.length < 30 && allVisible[i].offsetParent !== null) { items.push(allVisible[i].textContent.trim()); break; } } } } return items; """) log.warn('browser', f"Could not find sort option for {target_sort}. Menu items: {menu_items}") return False except Exception as e: log.warn('browser', f"Failed to change sort to {target_sort}: {e}") return False def setup_reviews_page(is_refresh=False, validation_only_mode=False, initial_sort: str = None, skip_navigation: bool = False): """ Setup the reviews page for scraping. Returns (scroll_container, stop_scrolling_event) or (None, None) on failure. Can be called after initial load or after a hard refresh. If validation_only_mode=True, returns early after extracting business info without clicking reviews tab or finding scroll container. If skip_navigation=True (resume from validation), skips navigation/consent/business info since browser is already on the page with data extracted. """ nonlocal total_reviews refresh_label = " (after refresh)" if is_refresh else "" # RESUME FROM VALIDATION: Skip navigation, consent, and business info extraction # Browser is already on the Google Maps page with data already extracted if skip_navigation and not is_refresh: log.info('browser', "Resuming from validation: skipping navigation (already on page)") # Skip directly to clicking reviews tab (below) else: # Navigate to URL (only on initial load or refresh) if not is_refresh: # Reset browser state by navigating to blank page first # This clears any stale state from pooled browser sessions try: driver.get("about:blank") time.sleep(0.1) except: pass log.info('browser', f"Loading: {url[:80]}...") else: log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...") driver.get(url) # Handle consent popup if redirected (poll with tiny sleep) start = time.time() while time.time() - start < 5: # Max 5s for consent if "consent.google" in driver.current_url: log.info('browser', "Handling consent popup...") try: for btn in driver.find_elements(By.CSS_SELECTOR, "button"): txt = btn.text.lower() if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: btn.click() # Reload original URL after consent log.info('browser', "Reloading after consent...") driver.get(url) # Wait for page to settle after consent reload time.sleep(1) break except: pass break # Check if we're already on the target page if "maps/place" in driver.current_url and "consent" not in driver.current_url: break time.sleep(0.01) # 10ms - responsive but low CPU # Extract business info and total review count BEFORE clicking reviews tab (on Overview) # This captures name, rating, category, address while they're visible # Only on first load (don't overwrite if we already have it) if total_reviews[0] is None or business_info_cache[0] is None: start = time.time() while time.time() - start < 5: try: info = driver.execute_script(""" var result = { total_reviews: null, name: null, rating: null, category: null, address: null }; // Business name from h1 var h1 = document.querySelector('h1'); if (h1) result.name = h1.textContent.trim(); // Category - use jsaction attribute (robust selector) var catBtn = document.querySelector('button[jsaction*="category"]'); if (catBtn) result.category = catBtn.textContent.trim(); // Rating and review count from span[role="img"] aria-labels var spans = document.querySelectorAll('span[role="img"]'); for (var i = 0; i < spans.length; i++) { var label = spans[i].getAttribute('aria-label') || ''; // Rating: "4.8 stars" var rMatch = label.match(/^([\\d,.]+)\\s*star/i); if (rMatch && !result.rating) { result.rating = parseFloat(rMatch[1].replace(',', '.')); } // Reviews: "79 reviews" var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i); if (revMatch && !result.total_reviews) { result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, '')); } } // Address from button var addrBtn = document.querySelector('button[data-item-id="address"]'); if (addrBtn) { var label = addrBtn.getAttribute('aria-label'); if (label) result.address = label.replace(/^Address:\\s*/i, ''); } return result; """) if info: if info.get('total_reviews') and total_reviews[0] is None: total_reviews[0] = info['total_reviews'] log.info('scraper', f"Total reviews on page: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]}) if info.get('name') and business_info_cache[0] is None: business_info_cache[0] = info log.info('scraper', f"Business: {info.get('name')}") if total_reviews[0] and business_info_cache[0]: break except: pass time.sleep(0.1) # VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc. if validation_only_mode: log.info('scraper', "Validation mode: returning early (skipping reviews tab)") return ("validation_done", None) # Click reviews tab - poll until found review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] start = time.time() tab_clicked = False tabs_logged = False while time.time() - start < 5: # Max 5s for tabs try: tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']") # Log available tabs once for debugging if not tabs_logged and tabs: tabs_logged = True tab_texts = [t.text for t in tabs] log.info('browser', f"Available tabs: {tab_texts}") for tab in tabs: tab_text = tab.text.lower() if any(kw in tab_text for kw in review_keywords): if not is_refresh: log.info('browser', f"Clicking reviews tab: '{tab.text}'") # Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79" if total_reviews[0] is None: import re # Try pattern with parentheses: "Reviews (79)" match = re.search(r'\((\d+)\)', tab.text) if match: total_reviews[0] = int(match.group(1)) log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]}) else: # Try pattern with newline: "Reviews\n79" match = re.search(r'(\d+)', tab.text) if match: total_reviews[0] = int(match.group(1)) log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]}) tab.click() tab_clicked = True break if tab_clicked: break time.sleep(0.01) # 10ms between polls except: time.sleep(0.01) # Poll for scroll container (10ms intervals - fast but low CPU) scroll_container = None start = time.time() last_print = 0 while time.time() - start < 10: # Max 10s scroll_container = find_scroll_container() if scroll_container: break elapsed = int(time.time() - start) if elapsed > last_print: log.info('browser', f"Waiting for reviews panel...{refresh_label} ({elapsed}s)") last_print = elapsed time.sleep(0.01) # 10ms - responsive but low CPU if not scroll_container: log.error('browser', f"Could not find reviews scroll container{refresh_label}") try: log.error('browser', f"Page title: {driver.title}") log.error('browser', f"Current URL: {driver.current_url[:100]}") except: pass return None, None log.info('browser', f"Found scroll container{refresh_label}") # Inject API interceptor (needs to be re-injected after refresh) if not is_refresh: log.info('network', "Injecting API interceptor...") driver.execute_script(""" // Always re-setup on refresh window.__reviewInterceptorInjected = true; window.__interceptedResponses = window.__interceptedResponses || []; // Intercept fetch (only if not already patched) if (!window.__fetchPatched) { window.__fetchPatched = true; const originalFetch = window.fetch; window.fetch = async function(...args) { const url = args[0].toString(); const response = await originalFetch.apply(this, args); if (url.includes('listugcposts') || url.includes('review')) { try { const clone = response.clone(); const text = await clone.text(); window.__interceptedResponses.push({url: url, body: text}); } catch(e) {} } return response; }; } // Intercept XHR (only if not already patched) if (!window.__xhrPatched) { window.__xhrPatched = true; const originalXHR = window.XMLHttpRequest; window.XMLHttpRequest = function() { const xhr = new originalXHR(); const originalOpen = xhr.open; let reqUrl = ''; xhr.open = function(method, url, ...rest) { reqUrl = url; return originalOpen.apply(this, [method, url, ...rest]); }; xhr.addEventListener('load', function() { if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) { try { window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText}); } catch(e) {} } }); return xhr; }; for (let prop of Object.getOwnPropertyNames(originalXHR)) { try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {} } } """) # Wait for sort button to appear (it loads after reviews panel) sort_found = False for wait_attempt in range(5): time.sleep(0.5) has_sort = driver.execute_script(""" // Sort option values (use lowercase for matching with includes) var sortValues = ['most relevant', 'newest', 'highest rating', 'lowest rating', 'más relevantes', 'más recientes', 'puntuación más alta', 'puntuación más baja']; // Exclusion patterns - buttons we should NOT match var excludePatterns = ['all reviews', 'todas las reseñas']; function normalizeText(text) { return text.toLowerCase().replace(/\\s+/g, ' ').trim(); } function isExcluded(text) { var normalized = normalizeText(text); for (var i = 0; i < excludePatterns.length; i++) { if (normalized.includes(excludePatterns[i])) return true; } return false; } // Check for buttons containing sort value text (not excluded) var btns = document.querySelectorAll('button[aria-haspopup="true"], button.HQzyZ, button.S9kvJb'); for (var i = 0; i < btns.length; i++) { var text = normalizeText(btns[i].textContent); if (isExcluded(text)) continue; for (var j = 0; j < sortValues.length; j++) { if (text.includes(sortValues[j])) return true; } } return false; """) if has_sort: sort_found = True log.info('browser', "Sort button found") break # Track bot detection - if sort button hidden, Google likely detected bot bot_detected = not sort_found if not sort_found: log.warn('browser', "Sort button not found after waiting, continuing without sorting (bot detection likely)") # Sort by specified order (default: newest) target_sort = initial_sort or SORT_NEWEST if sort_found and change_sort_order(target_sort): # Re-find scroll container after sorting (DOM may be recreated) new_container = find_scroll_container() if new_container: scroll_container = new_container log.info('browser', "Refreshed scroll container reference") # Expand "More" buttons for full text try: expanded = driver.execute_script(""" var buttons = document.querySelectorAll('button.w8nwRe.kyuRq'); var count = 0; for (var i = 0; i < buttons.length; i++) { if (buttons[i].textContent.trim() === 'More') { buttons[i].click(); count++; } } return count; """) if expanded > 0: log.info('browser', f"Expanded {expanded} truncated reviews", metrics={'expanded_count': expanded}) except: pass # Block heavy resources to speed up scrolling (use CDP) try: driver.execute_cdp_cmd('Network.setBlockedURLs', { 'urls': [ # Images '*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*', # Fonts '*.woff', '*.woff2', '*.ttf', '*.otf', # Analytics/tracking '*google-analytics.com/*', '*googletagmanager.com/*', '*doubleclick.net/*', '*googlesyndication.com/*', # Maps tiles (not needed for reviews) '*khms*.google.com/*', '*maps.googleapis.com/maps/vt*' ] }) driver.execute_cdp_cmd('Network.enable', {}) if not is_refresh: log.info('browser', "Blocking heavy resources for faster scrolling") except: pass # Setup scrollable pane reference driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) # Create scroll worker stop_scrolling = threading.Event() def scroll_worker(): while not stop_scrolling.is_set(): try: driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; """) except: pass time.sleep(0.1) scroll_thread = threading.Thread(target=scroll_worker, daemon=True) scroll_thread.start() return scroll_container, stop_scrolling # Helper to extract review topics from the reviews tab def extract_review_topics(): """Extract review topic filters from radiogroup (robust selectors).""" try: topics = driver.execute_script(""" var topics = []; // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust) var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]'); if (!container) { // Fallback: any radiogroup in the reviews area container = document.querySelector('div[role="radiogroup"]'); } if (container) { var buttons = container.querySelectorAll('button[role="radio"]'); for (var btn of buttons) { var label = btn.getAttribute('aria-label') || ''; // Parse "hair salon, mentioned in 4 reviews" format var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i); if (match) { topics.push({ topic: match[1].trim(), count: parseInt(match[2]) }); } else if (label && !label.toLowerCase().includes('all review')) { // Fallback: try to extract from child spans var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall'); var nameSpan = btn.querySelector('.uEubGf, span:first-child'); if (nameSpan) { var name = nameSpan.textContent.trim(); var count = countSpan ? parseInt(countSpan.textContent) : 0; if (name && name.toLowerCase() !== 'all') { topics.push({topic: name, count: count || 0}); } } } } } return topics; """) return topics or [] except: return [] # Initial page setup (pass validation_only to skip unnecessary steps) # If resuming from validation, skip navigation since browser is already on page scroll_container, stop_scrolling = setup_reviews_page( is_refresh=False, validation_only_mode=validation_only, skip_navigation=resume_from_validation ) # VALIDATION_ONLY MODE: Return early with just total_reviews and business info # setup_reviews_page returns ("validation_done", None) in this case if validation_only or scroll_container == "validation_done": # Use the business info captured from Overview (before clicking reviews tab) business_info = business_info_cache[0] or {} return { "reviews": [], "total": total_reviews[0] or 0, "scrolls": 0, "error": None, "validation_info": { "name": business_info.get("name"), "rating": business_info.get("rating"), "category": business_info.get("category"), "address": business_info.get("address"), "total_reviews": total_reviews[0] }, "session_fingerprint": session_fingerprint # Browser fingerprint for bot detection analysis } if not scroll_container: return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found", "session_fingerprint": session_fingerprint} # Extract review topics after reviews tab is loaded (before scrolling begins) time.sleep(0.5) # Brief wait for topic filters to render review_topics = extract_review_topics() if review_topics: log.info('scraper', f"Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...", metrics={'topic_count': len(review_topics)}) def get_api_reviews(): """Get reviews from intercepted API responses.""" api_revs = [] try: responses = driver.execute_script(""" var r = window.__interceptedResponses || []; window.__interceptedResponses = []; return r; """) for resp in (responses or []): body = resp.get("body", "") api_revs.extend(extract_reviews_from_api_body(body)) except: pass return api_revs # Captcha detection helper def detect_captcha(): """Check if a captcha or challenge is blocking the page. Returns captcha type or None.""" try: return driver.execute_script(""" // Check for reCAPTCHA iframe or checkbox var recaptcha = document.querySelector('iframe[src*="recaptcha"], iframe[title*="reCAPTCHA"]'); if (recaptcha) return 'recaptcha'; // Check for "unusual traffic" message var body = document.body ? document.body.innerText : ''; if (body.includes('unusual traffic') || body.includes('not a robot')) return 'unusual_traffic'; // Check for challenge frame var challenge = document.querySelector('iframe[src*="challenge"]'); if (challenge) return 'challenge'; return null; """) except: return None # Recovery function - use real mouse actions when stuck from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.keys import Keys recovery_count = [0] def unstick_scroll(): nonlocal scroll_container recovery_count[0] += 1 method = recovery_count[0] % 4 try: if method == 1: # Method 1: Click pane and send Page Down keys scroll_container.click() ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform() ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform() elif method == 2: # Method 2: Real mouse wheel scroll ActionChains(driver).move_to_element(scroll_container)\ .scroll_by_amount(0, 800).perform() elif method == 3: # Method 3: Scroll up significantly then back down (force reload) driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000); """) time.sleep(0.3) driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; """) else: # Method 4: Scroll last card into view, then scroll pane (no click to avoid opening profile) driver.execute_script(""" var cards = document.querySelectorAll('[data-review-id]'); if (cards.length > 0) { cards[cards.length - 1].scrollIntoView({block: 'end', behavior: 'smooth'}); } """) time.sleep(0.3) driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; """) except: pass def do_hard_refresh(): """Hard refresh the page and re-setup everything. Returns True on success.""" nonlocal scroll_container, stop_scrolling hard_refresh_count[0] += 1 if hard_refresh_count[0] > max_hard_refreshes: log.warn('system', f"Max hard refreshes ({max_hard_refreshes}) reached, giving up", metrics={'hard_refresh_count': hard_refresh_count[0]}) return False # Stop current scroll worker stop_scrolling.set() time.sleep(0.2) # Re-setup page new_container, new_stop = setup_reviews_page(is_refresh=True) if new_container: scroll_container = new_container stop_scrolling = new_stop recovery_count[0] = 0 # Reset recovery count after successful refresh log.info('browser', f"Hard refresh successful, resuming with {len(seen_ids)} reviews already collected", metrics={'reviews_collected': len(seen_ids)}) return True else: log.error('browser', "Hard refresh failed to find scroll container") return False # Main collection loop last_new_time = time.time() last_count = len(reviews) check_num = 0 start_time = time.time() # Crash detection: metrics sampling metrics_history = [] last_sample_time = time.time() scroll_count = [0] # Track scroll operations for crash reports log.info('browser', f"Scrolling... (timeout: {timeout_no_new}s with no new)", metrics={'timeout_seconds': timeout_no_new}) cycle_start = time.time() while True: check_num += 1 time.sleep(1.0) # Check every second # TIMING: Track cycle performance t0 = time.time() cycle_delta = t0 - cycle_start cycle_start = t0 # CRASH DETECTION: Sample metrics every 5 seconds if time.time() - last_sample_time >= 5: current_count_for_metrics = total_flushed[0] + len(reviews) metrics_history.append({ 'timestamp_ms': int(time.time() * 1000), 'memory_mb': get_chrome_memory(driver), 'dom_nodes': get_dom_node_count(driver), 'reviews_count': current_count_for_metrics }) # Keep only last 100 samples metrics_history = metrics_history[-100:] last_sample_time = time.time() # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language # Use review_id as key to avoid duplicates with DOM t1 = time.time() for rev in get_api_reviews(): rid = rev.get('review_id', '') if rid and rid not in seen_ids: reviews[rid] = rev seen_ids.add(rid) api_time = time.time() - t1 # Expand any new "More" buttons for full text (batch click, fast) try: driver.execute_script(""" var buttons = document.querySelectorAll('button.w8nwRe.kyuRq'); for (var i = 0; i < buttons.length; i++) { if (buttons[i].textContent.trim() === 'More') { buttons[i].click(); } } """) except: pass # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) # This survives Google's CSS class name changes # MEMORY FIX: Actually remove processed cards from DOM (not just hide) # Keep last N cards for scroll continuity t2 = time.time() dom_cards = 0 try: seen_list = list(seen_ids) parsed_reviews = driver.execute_script(""" var seenSet = new Set(arguments[0]); var results = []; var processedIds = new Set(); var sepsRemoved = 0; var cardsRemoved = 0; var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference // ROBUST: Find cards by data attribute only (not class names) var cards = document.querySelectorAll('[data-review-id]'); var cardsArray = Array.from(cards); var totalCards = cardsArray.length; for (var i = 0; i < cardsArray.length; i++) { var card = cardsArray[i]; var rid = card.getAttribute('data-review-id'); var isHidden = card.style.display === 'none'; var isNearEnd = i >= totalCards - KEEP_LAST_N; // AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end // This prevents memory buildup that causes tab crashes if (isHidden && !isNearEnd) { // Remove separators first var sibling = card.nextElementSibling; while (sibling) { var nextSib = sibling.nextElementSibling; var classes = sibling.className || ''; if (classes.includes('AyRUI') || classes.includes('TFQHme')) { sibling.remove(); sepsRemoved++; sibling = nextSib; } else { break; } } // Remove the card itself from DOM card.remove(); cardsRemoved++; continue; } // Skip already hidden cards near end (keep for scroll reference) if (isHidden) continue; // Skip if no ID or already processed this cycle if (!rid || processedIds.has(rid)) continue; // Only process top-level review cards (have aria-label with author name) if (!card.getAttribute('aria-label')) continue; processedIds.add(rid); // Already seen from API - just track order, skip content // BUT still hide the card to keep DOM light! if (seenSet.has(rid)) { results.push({id: rid, orderOnly: true}); // Hide this card since we already have its data from API card.style.display = 'none'; card.innerHTML = ''; continue; } var author = '', text = '', rating = 0, timestamp = ''; // AUTHOR: Extract from "Photo of {Name}" button aria-label var photoBtn = card.querySelector('button[aria-label^="Photo of"]'); if (photoBtn) { author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim(); } // Fallback: card's own aria-label is the author name if (!author) { author = card.getAttribute('aria-label') || ''; } // RATING: span with role="img" and aria-label containing "star" var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]'); if (ratingEl) { var match = ratingEl.getAttribute('aria-label').match(/(\\d)/); if (match) rating = parseInt(match[1]); } // TIMESTAMP: Find span with "X time ago" pattern var spans = card.querySelectorAll('span'); for (var j = 0; j < spans.length; j++) { var spanText = spans[j].textContent.trim(); if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) { timestamp = spanText; break; } } // TEXT: Find longest text span (not timestamp/UI elements) var longestText = ''; for (var j = 0; j < spans.length; j++) { var spanText = spans[j].textContent.trim(); if (spanText === timestamp) continue; if (spanText.match(/^\\d+ stars?$/i)) continue; if (spanText === 'More' || spanText === 'Less') continue; if (spanText.match(/^(Like\\d*|Share)$/)) continue; if (spanText.length > longestText.length && spanText.length > 10) { longestText = spanText; } } text = longestText; // OWNER RESPONSE: Find by "Response from the owner" text anchor var ownerResponse = null; var ownerSpan = null; var cardSpans = card.querySelectorAll('span'); for (var k = 0; k < cardSpans.length; k++) { if (cardSpans[k].textContent.trim() === 'Response from the owner') { ownerSpan = cardSpans[k]; break; } } if (ownerSpan) { // Navigate: span -> header div -> container div var headerDiv = ownerSpan.closest('div'); var respContainer = headerDiv ? headerDiv.parentElement : null; if (respContainer) { // Click expand button if exists and not expanded var expandBtn = respContainer.querySelector('button[aria-label="See more"]'); if (expandBtn && expandBtn.getAttribute('aria-expanded') !== 'true') { expandBtn.click(); } // Get timestamp from header spans var respTimestamp = ''; var headerSpans = headerDiv.querySelectorAll('span'); for (var m = 0; m < headerSpans.length; m++) { var spanTxt = headerSpans[m].textContent.trim(); if (spanTxt.match(/ago$/i)) { respTimestamp = spanTxt; break; } } // Get response text from direct child div[lang] var respText = ''; var langDivs = respContainer.children; for (var m = 0; m < langDivs.length; m++) { if (langDivs[m].tagName === 'DIV' && langDivs[m].hasAttribute('lang')) { respText = langDivs[m].textContent.trim(); respText = respText.replace(/(More|Less)$/, '').trim(); break; } } // Fallback: find longest text div that's not the header if (!respText) { for (var m = 0; m < langDivs.length; m++) { if (langDivs[m].tagName === 'DIV') { var divTxt = langDivs[m].textContent.trim(); if (divTxt.includes('Response from the owner')) continue; divTxt = divTxt.replace(/(More|Less)$/, '').trim(); if (divTxt.length > respText.length) { respText = divTxt; } } } } if (respText) { ownerResponse = {text: respText, timestamp: respTimestamp}; } } } if (author && rating >= 1 && rating <= 5) { results.push({ id: rid, orderOnly: false, author: author, text: text, rating: rating, timestamp: timestamp, owner_response: ownerResponse, source: 'dom' }); } // Mark card as processed (hide + clear) - will be removed on next cycle // Keep near-end cards visible for scroll reference if (!isNearEnd) { card.style.display = 'none'; card.innerHTML = ''; } } return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved}; """, seen_list) dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0 cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0 if cards_removed > 0: log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed}) new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else [] for rev in new_reviews: rid = rev.pop('id') order_only = rev.pop('orderOnly', False) # Track DOM order for ALL reviews (for sorting output) if rid not in review_order: review_order[rid] = order_counter[0] order_counter[0] += 1 # Only add content for new reviews (not already from API) if not order_only: reviews[rid] = rev seen_ids.add(rid) except Exception as e: log.error('scraper', f"DOM parse error: {e}") dom_time = time.time() - t2 # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory # Sort by DOM order before flushing t3 = time.time() if flush_callback and len(reviews) >= flush_batch_size: log.info('scraper', f"Flushing {len(reviews)} reviews to disk...", metrics={'batch_size': len(reviews), 'source': 'flush'}) sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) reviews.clear() # Free memory, but keep seen_ids and review_order flush_time = time.time() - t3 current_count = total_flushed[0] + len(reviews) # TIMING: Print if cycle is slow (>2s) if cycle_delta > 2.0: log.warn('system', f"SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})", metrics={'cycle_time_s': cycle_delta, 'api_time_s': api_time, 'dom_time_s': dom_time, 'dom_cards': dom_cards, 'seen_count': len(seen_ids)}) # Check for new reviews if current_count > last_count: last_new_time = time.time() last_count = current_count # Check if loading (spinner visible OR network activity) try: loading_status = driver.execute_script(""" var status = {spinner: false, network: false}; // Check for Google's loading indicators var spinner = document.querySelector('div[role="progressbar"]'); if (spinner && spinner.offsetParent !== null) status.spinner = true; var loading = document.querySelector('.qjESne, .loading'); if (loading && loading.offsetParent !== null) status.spinner = true; // Check for recent network activity (API interceptor) var responses = window.__interceptedResponses || []; var lastCount = window.__lastResponseCount || 0; if (responses.length > lastCount) { status.network = true; window.__lastResponseCount = responses.length; } return status; """) is_loading = loading_status.get('spinner') or loading_status.get('network') if is_loading: last_new_time = time.time() # Reset timer while loading except: is_loading = False # Progress update elapsed = time.time() - last_new_time if total_reviews[0]: pct = (current_count / total_reviews[0]) * 100 log.info('scraper', f"{current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", metrics={'reviews_count': current_count, 'total_reviews': total_reviews[0], 'progress_pct': pct, 'idle_seconds': elapsed}) else: log.info('scraper', f"{current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", metrics={'reviews_count': current_count, 'idle_seconds': elapsed}) # Call progress callback on every iteration (for real-time log updates) if progress_callback: progress_callback(current_count, total_reviews[0]) # Stop conditions - check BEFORE recovery attempts if current_count >= max_reviews: log.info('scraper', f"Reached max: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time}) stop_scrolling.set() break # Also stop if we have all reviews from the page if total_reviews[0] and current_count >= total_reviews[0]: log.info('scraper', f"All {current_count} reviews collected", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time}) stop_scrolling.set() break # STUCK DETECTION: If no new reviews for 3s+, try to unstick # Only if we haven't collected all reviews yet if elapsed >= 3 and int(elapsed) % 3 == 0: # After 8+ failed recovery attempts, try hard refresh if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: # Check for captcha before hard refresh - no point refreshing if blocked captcha_type = detect_captcha() if captcha_type: log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type}) stop_scrolling.set() return { "reviews": [], "total": current_count, "error": f"Captcha detected: {captcha_type}. Please solve manually and retry.", "captcha_detected": True } log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]}) if do_hard_refresh(): last_new_time = time.time() # Reset timer after refresh continue # Skip to next iteration else: log.info('browser', f"Recovery attempt #{recovery_count[0] + 1}...", metrics={'recovery_attempt': recovery_count[0] + 1}) unstick_scroll() # Check scroll state - track if content is still being added try: scroll_state = driver.execute_script(""" var p = window.scrollablePane; if (!p) return {atBottom: true, height: 0}; var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50); var height = p.scrollHeight; var lastHeight = window.__lastScrollHeight || 0; var growing = height > lastHeight; window.__lastScrollHeight = height; return {atBottom: atBottom, height: height, growing: growing}; """) at_bottom = scroll_state.get('atBottom', True) content_growing = scroll_state.get('growing', False) except: at_bottom = True content_growing = False # Reset timer if content is growing (new reviews loading) if content_growing: last_new_time = time.time() # Dynamic timeout based on state and recovery attempts # - Try hard refresh before giving up if we still have refreshes left # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed # - 15s max otherwise (keep trying) recovery_failed = recovery_count[0] >= 5 and elapsed >= 5 truly_done = at_bottom and not content_growing and recovery_failed timeout_hit = elapsed >= timeout_no_new if truly_done or timeout_hit: # Check if we're close enough to total (configurable threshold) # If we have close_enough_pct+ of reviews, don't waste time with hard refreshes close_enough = False hit_limit_with_multisort = False if total_reviews[0] and current_count > 0: pct_complete = (current_count / total_reviews[0]) * 100 close_enough = pct_complete >= close_enough_pct if close_enough: log.info('scraper', f"Close enough ({pct_complete:.1f}% >= {close_enough_pct}%), skipping further retries", metrics={'pct_complete': pct_complete}) # Special case: if multi-sort mode and we hit a limit (~1000 reviews), # exit first pass to try other sorts instead of endless hard refreshes if (sort_strategy in ["multi", SORT_AUTO]) and current_count >= 1000 and hard_refresh_count[0] >= 1: hit_limit_with_multisort = True log.info('scraper', f"Hit ~1000 limit with multi-sort available, proceeding to additional sorts", metrics={'current_count': current_count, 'pct_complete': pct_complete}) # Last chance: try hard refresh before giving up (only if not close enough and not hitting multi-sort limit) if not close_enough and not hit_limit_with_multisort and hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): # Check for captcha first captcha_type = detect_captcha() if captcha_type: log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type}) stop_scrolling.set() break log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed}) if do_hard_refresh(): last_new_time = time.time() continue # Keep trying log.info('scraper', f"All reviews loaded: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time}) stop_scrolling.set() break # ===== MULTI-SORT ADDITIONAL PASSES ===== # After first pass, check if we should do additional passes with different sort orders first_pass_count = total_flushed[0] + len(reviews) actual_sort_order = sort_order or DEFAULT_SORT_ORDER completed_sorts = [SORT_NEWEST] # First pass always uses newest (or initial_sort) # Determine if we should do multi-sort should_multi_sort = False if sort_strategy == "multi": should_multi_sort = True log.info('scraper', "Multi-sort enabled (forced)", metrics={'sort_strategy': 'multi'}) elif sort_strategy == SORT_AUTO: # Auto mode: enable if total > threshold OR first pass got < 90% of total if total_reviews[0] and total_reviews[0] > multi_sort_threshold: should_multi_sort = True log.info('scraper', f"Multi-sort auto-enabled (total {total_reviews[0]} > {multi_sort_threshold})", metrics={'total_reviews': total_reviews[0], 'threshold': multi_sort_threshold}) elif total_reviews[0] and first_pass_count < (total_reviews[0] * 0.9): should_multi_sort = True pct = (first_pass_count / total_reviews[0]) * 100 log.info('scraper', f"Multi-sort auto-enabled (first pass got {pct:.1f}% < 90%)", metrics={'first_pass_count': first_pass_count, 'total_reviews': total_reviews[0]}) if should_multi_sort and first_pass_count < max_reviews: remaining_sorts = [s for s in actual_sort_order if s not in completed_sorts] for pass_num, next_sort in enumerate(remaining_sorts, start=2): # Check if we already have enough reviews current_total = total_flushed[0] + len(reviews) if current_total >= max_reviews: log.info('scraper', f"Reached max_reviews ({max_reviews}), skipping remaining sorts") break pass_start_count = current_total log.info('scraper', f"Pass {pass_num}/{len(actual_sort_order)} ({next_sort}): starting with {current_total} reviews", metrics={'pass': pass_num, 'sort': next_sort, 'current_total': current_total}) # Change sort order if not change_sort_order(next_sort): log.warn('scraper', f"Failed to change sort to {next_sort}, skipping") continue time.sleep(0.5) # Re-find scroll container (DOM may have changed) scroll_container = find_scroll_container() if not scroll_container: log.warn('scraper', f"Lost scroll container after sort change, skipping {next_sort}") continue # Scroll to top to start fresh try: driver.execute_script(""" var p = arguments[0]; if (p) p.scrollTop = 0; """, scroll_container) time.sleep(0.3) except: pass # Start new scroll worker for this pass stop_scrolling = threading.Event() def scroll_worker_pass(): while not stop_scrolling.is_set(): try: driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; """) except: pass time.sleep(0.1) driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) scroll_thread = threading.Thread(target=scroll_worker_pass, daemon=True) scroll_thread.start() # Mini scraping loop for this sort pass pass_last_new_time = time.time() pass_timeout = timeout_no_new while True: time.sleep(1.0) # Get reviews from API interception (sufficient for multi-sort) api_reviews = get_api_reviews() all_new = api_reviews # API interception captures all reviews # Add new reviews (seen_ids persists across passes!) new_count = 0 for rev in all_new: rid = rev.get("id") or f"{rev.get('author', '')}_{rev.get('timestamp', '')}" if rid and rid not in seen_ids: seen_ids.add(rid) reviews[rid] = rev if rid not in review_order: review_order[rid] = order_counter[0] order_counter[0] += 1 new_count += 1 if new_count > 0: pass_last_new_time = time.time() current_total = total_flushed[0] + len(reviews) # Progress update if progress_callback and total_reviews[0]: progress_callback(current_total, total_reviews[0]) # Flush if batch size reached if flush_callback and len(reviews) >= flush_batch_size: sorted_revs = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) flush_callback([r for _, r in sorted_revs]) total_flushed[0] += len(reviews) reviews.clear() # Check timeout elapsed = time.time() - pass_last_new_time if elapsed >= pass_timeout: stop_scrolling.set() break # Check max reviews if current_total >= max_reviews: stop_scrolling.set() break stop_scrolling.set() completed_sorts.append(next_sort) # Calculate yield for this pass pass_end_count = total_flushed[0] + len(reviews) pass_yield = pass_end_count - pass_start_count yield_pct = (pass_yield / max(1, pass_start_count)) * 100 log.info('scraper', f"Pass {pass_num} ({next_sort}) complete: +{pass_yield} new reviews ({yield_pct:.1f}% yield)", metrics={'pass': pass_num, 'sort': next_sort, 'new_reviews': pass_yield, 'yield_pct': yield_pct}) # Diminishing returns check if pass_yield > 0 and yield_pct < DIMINISHING_RETURNS_PCT: log.info('scraper', f"Low yield ({yield_pct:.1f}% < {DIMINISHING_RETURNS_PCT}%), stopping multi-sort", metrics={'yield_pct': yield_pct, 'threshold': DIMINISHING_RETURNS_PCT}) break # ===== END MULTI-SORT ===== # Flush any remaining reviews (sorted by DOM order) if flush_callback and reviews: log.info('scraper', f"Final flush: {len(reviews)} reviews...", metrics={'batch_size': len(reviews), 'source': 'final_flush'}) sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) reviews.clear() # Reviews already parsed during scrolling (real-time parsing) log.info('scraper', "Finalizing review data...") # Final results (sorted by DOM order) sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) review_list = [r for _, r in sorted_items] grand_total = total_flushed[0] + len(review_list) dom_count = sum(1 for r in review_list if r.get("source") == "dom") api_count = sum(1 for r in review_list if r.get("source") == "api") if total_flushed[0] > 0: log.info('scraper', f"Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})", metrics={'total_reviews': grand_total, 'flushed_count': total_flushed[0], 'in_memory_count': len(review_list), 'elapsed_seconds': time.time() - start_time}) else: log.info('scraper', f"Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})", metrics={'total_reviews': len(review_list), 'dom_count': dom_count, 'api_count': api_count, 'elapsed_seconds': time.time() - start_time}) # Infer topics for each review if review_topics is available if review_topics: log.info('scraper', f"Inferring topics for {len(review_list)} reviews...", metrics={'reviews_count': len(review_list)}) topics_inferred_count = 0 for review in review_list: review_text = review.get("text", "") matched = infer_review_topics(review_text, review_topics) review["topics"] = matched if matched: topics_inferred_count += 1 log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)}) # Include business info captured from Overview page business_info = business_info_cache[0] or {} return { "reviews": review_list, # Only unflushed reviews (flushed already sent to callback) "total": grand_total, "total_flushed": total_flushed[0], "checks": check_num, "url": url, "logs": log.get_logs(), "review_topics": review_topics, # Topic filters with mention counts "metrics_history": metrics_history, # For crash detection "start_time": start_time, # For crash report elapsed time "session_fingerprint": session_fingerprint, # Browser fingerprint for bot detection analysis "bot_detected": bot_detected if 'bot_detected' in dir() else False, # True if sort button was hidden "initial_sort_used": target_sort if 'target_sort' in dir() else SORT_NEWEST, # Sort order used for first pass "multi_sort": { "enabled": should_multi_sort if 'should_multi_sort' in dir() else False, "completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST], "first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total }, # Business info captured from Google Maps page "business_info": { "name": business_info.get("name"), "category": business_info.get("category"), "address": business_info.get("address"), "rating": business_info.get("rating") } } def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, progress_callback=None, driver=None, return_driver: bool = False, log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False, browser_fingerprint: dict = None, initial_sort: str = None, sort_strategy: str = SORT_AUTO, max_reviews: int = None, resume_from_validation: bool = False, validated_business_info: dict = None, validated_total_reviews: int = None): """ Production-compatible wrapper for scrape_reviews. Matches the API expected by job_manager.py. Args: url: Google Maps URL to scrape headless: Run Chrome in headless mode max_scrolls: Not used (kept for API compatibility) progress_callback: Optional callback(current_count, total_count) for progress driver: Existing driver instance to reuse return_driver: If True, return driver in result log_capture: Optional LogCapture instance for real-time log access browser_fingerprint: Optional dict with user's browser fingerprint: - geolocation: {lat, lng} - userAgent: string - viewport: {width, height} - timezone: string (e.g., "Europe/Madrid") - language: string (e.g., "en-US") - platform: string (e.g., "MacIntel") initial_sort: Initial sort order to use ("newest", "lowest", "highest", "relevant") Used for retry with different sort strategy sort_strategy: Sort strategy ("auto", "multi", "single", or specific sort) max_reviews: Maximum reviews to collect (for testing). None = unlimited (default: 5000) resume_from_validation: If True, skip navigation (browser already on page from validation) validated_business_info: Pre-extracted business info from validation phase validated_total_reviews: Pre-extracted total review count from validation phase Returns: Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs """ from seleniumbase import Driver start_time = time.time() driver_provided = driver is not None should_close_driver = not return_driver and not driver_provided # Use provided log_capture or create new one log_capture = log_capture or LogCapture() try: # Extract fingerprint settings fp = browser_fingerprint or {} user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" viewport = fp.get('viewport') or {'width': 1200, 'height': 900} geolocation = fp.get('geolocation') timezone = fp.get('timezone') language = fp.get('language', 'en-US') # Create driver if not provided if not driver: driver = Driver( uc=True, headless=headless, page_load_strategy="normal", agent=user_agent # Use user's actual user agent ) # Set viewport to match user's screen driver.set_window_size(viewport['width'], viewport['height']) # Apply browser fingerprint settings via CDP try: # Set timezone if provided if timezone: driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone}) log_capture.info('browser', f"Set timezone to {timezone}") # Set locale/language driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language}) # Set geolocation if geolocation and 'lat' in geolocation and 'lng' in geolocation: driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { 'latitude': geolocation['lat'], 'longitude': geolocation['lng'], 'accuracy': 1000 # ~1km accuracy for IP-based location }) log_capture.info('browser', f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})", metrics={'lat': geolocation['lat'], 'lng': geolocation['lng']}) else: # Default to US (Boston, MA) if no geolocation provided driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { 'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100 }) log_capture.info('browser', "Set geolocation to US (Boston, MA) [default]", metrics={'lat': 42.3601, 'lng': -71.0589}) if fp: log_capture.info('browser', f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}", metrics={'viewport_width': viewport['width'], 'viewport_height': viewport['height']}) except Exception as e: log_capture.warn('system', f"Could not apply fingerprint settings: {e}") # Add URL parameters for consistent results if 'hl=' not in url: separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" if 'gl=' not in url: url = f"{url}&gl=us" # Create combined flush callback for progress + external handler external_flush = flush_callback # Save external callback internal_flush = None if progress_callback or external_flush: collected = [0] def combined_flush(reviews_batch): collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far if progress_callback: progress_callback(collected[0], None) if external_flush: external_flush(reviews_batch) # Pass reviews to external handler internal_flush = combined_flush # Run the scraper with progress callback for real-time updates result = scrape_reviews( driver=driver, url=url, max_reviews=max_reviews if max_reviews else 999999, # Unlimited by default, or custom limit for testing timeout_no_new=15, flush_callback=internal_flush, flush_batch_size=100, # Smaller batches for more frequent progress log_capture=log_capture, progress_callback=progress_callback, # Pass through for real-time log updates validation_only=validation_only, # Return early if just validating sort_strategy=sort_strategy, # Sort strategy (auto, multi, single) initial_sort=initial_sort, # Initial sort order for retry with different sort # Session resume parameters (v1.2.0) resume_from_validation=resume_from_validation, validated_business_info=validated_business_info, validated_total_reviews=validated_total_reviews ) elapsed = time.time() - start_time # Return in expected format response = { "reviews": result.get("reviews", []), "count": result.get("total", 0), "total_reviews": result.get("total", 0), "time": elapsed, "success": True, "error": None, "logs": result.get("logs", []), "review_topics": result.get("review_topics", []), # Topic filters with mention counts "session_fingerprint": result.get("session_fingerprint"), # Browser fingerprint for bot detection # Tracking info for retry strategy "bot_detected": result.get("bot_detected", False), # True if sort button was hidden by Google "initial_sort_used": result.get("initial_sort_used", "newest"), # Sort order used "multi_sort": result.get("multi_sort", {}), # Multi-sort completion info # Business info captured from Google Maps page "business_info": result.get("business_info", {}) } # Include validation_info if in validation_only mode if validation_only and "validation_info" in result: response["validation_info"] = result["validation_info"] if return_driver: response["driver"] = driver elif should_close_driver: try: driver.quit() except: pass return response except Exception as e: elapsed = time.time() - start_time # CRASH DETECTION: Build crash report before closing driver crash_report = None try: if driver: # Try to sample final metrics from the browser final_metrics = { 'timestamp_ms': int(time.time() * 1000), 'memory_mb': get_chrome_memory(driver), 'dom_nodes': get_dom_node_count(driver) } # Build crash report with available information crash_report = { 'crash_type': classify_crash(e, [final_metrics]), 'error_message': str(e), 'state': { 'reviews_extracted': 0, # Unknown at crash time 'total_expected': None, 'scroll_count': 0, 'elapsed_seconds': elapsed }, 'metrics_history': [final_metrics], 'logs_before_crash': log_capture.get_logs()[-20:] if log_capture else [], 'last_successful_review_id': None } log_capture.error('system', f"Crash detected: {crash_report['crash_type']}", metrics={'error': str(e), 'elapsed_seconds': elapsed}) except: # If we can't build crash report, continue with basic error handling pass if should_close_driver and driver: try: driver.quit() except: pass # Log error to the existing log_capture log_capture.error('system', f"Scraper failed: {str(e)}") result = { "reviews": [], "count": 0, "total_reviews": 0, "time": elapsed, "success": False, "error": str(e), "driver": driver if return_driver else None, "logs": log_capture.get_logs() } # Include crash report if available if crash_report: result['crash_report'] = crash_report return result def extract_about_info(driver, url: str = None) -> dict: """ Extract About section info from Google Maps (Accessibility, Amenities, etc.). This function should be called AFTER reviews are scraped if about info is needed, as it navigates to a different tab. Args: driver: Selenium WebDriver instance (already on the business page) url: Optional URL to navigate to first (if not already on the page) Returns: dict with section names as keys, each containing list of features """ try: # Navigate if URL provided if url: # Force English if 'hl=' not in url: separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" if 'gl=' not in url: url = f"{url}&gl=us" driver.get(url) time.sleep(1) # Click About tab using robust selectors clicked = driver.execute_script(""" // Try multiple selectors for about tab var selectors = [ 'button[aria-label*="About"]', 'button[data-tab-index="2"]', 'div[role="tablist"] button:nth-child(3)', 'button[jsaction*="about"]' ]; for (var sel of selectors) { var btn = document.querySelector(sel); if (btn && btn.textContent.toLowerCase().includes('about')) { btn.click(); return true; } } // Fallback: find by text content var buttons = document.querySelectorAll('button'); for (var btn of buttons) { if (btn.textContent.trim().toLowerCase() === 'about') { btn.click(); return true; } } return false; """) if not clicked: return {} time.sleep(1.5) # Wait for about tab to load # Extract about sections using aria-labels (robust) about = driver.execute_script(""" var about = {}; // Find the about region by aria-label or role var container = document.querySelector('div[role="region"][aria-label*="About"]'); if (!container) { // Fallback: look for the scrollable area with sections container = document.querySelector('.m6QErb[aria-label*="About"]'); } if (!container) { // Last resort: find sections by h2 headers container = document; } // Find all section headers (h2 elements) var sections = container.querySelectorAll('h2'); for (var h2 of sections) { var sectionName = h2.textContent.trim(); var items = []; // Find the ul list following this h2 var parent = h2.closest('.iP2t7d, div'); if (parent) { var listItems = parent.querySelectorAll('li span[aria-label]'); for (var li of listItems) { var label = li.getAttribute('aria-label'); if (label) { // Parse "Has toilet" or "No wheelchair-accessible car park" var hasFeature = !label.toLowerCase().startsWith('no '); var featureName = label.replace(/^(Has |No )/i, ''); items.push({ feature: featureName, available: hasFeature }); } } } if (sectionName && items.length > 0) { about[sectionName] = items; } } return about; """) return about or {} except Exception as e: return {"error": str(e)} # Test function if __name__ == "__main__": from seleniumbase import Driver # Test URL - 79 reviews TEST_URL = "https://www.google.com/maps/place/R.+Fleitas+Peluqueros/@28.1302986,-15.4448111,821m/data=!3m1!1e3!4m6!3m5!1s0xc40951a43c21f19:0x85f89601b9909c72!8m2!3d28.1299805!4d-15.4436854!16s%2Fg%2F11gbwtk8c8" print("🚀 Starting clean scraper test...") # Set up driver driver = Driver(uc=True, headless=False) driver.set_window_size(1200, 900) try: result = scrape_reviews(driver, TEST_URL, max_reviews=100, timeout_no_new=15) print(f"\n✅ Got {result['total']} reviews in {result['checks']} checks") # Show sample if result["reviews"]: print("\n📝 Sample review:") sample = result["reviews"][0] print(f" Author: {sample['author']}") print(f" Rating: {sample['rating']}⭐") print(f" Text: {sample['text'][:100]}..." if sample['text'] else " Text: (none)") finally: driver.quit() print("\n🏁 Done") def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> dict: """ Extract business card info from Google Maps. Uses the same efficient polling navigation as scrape_reviews (no fixed waits). Returns: dict with: name, address, rating, total_reviews, success, error, time """ from seleniumbase import Driver import logging log = logging.getLogger(__name__) start_time = time.time() driver_provided = driver is not None should_close_driver = not return_driver and not driver_provided try: # Create driver if not provided if not driver: driver = Driver(uc=True, headless=headless) # Set geolocation to US try: driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { 'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100 }) except: pass # Don't clear state - Google may serve different content based on session history # The scraper doesn't reset state, so validation shouldn't either # Force English interface for consistent parsing if 'hl=' not in url: separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" if 'gl=' not in url: url = f"{url}&gl=us" # Navigate to URL driver.get(url) # Handle consent popup - poll with 10ms sleep (same as scrape_reviews) start = time.time() while time.time() - start < 5: if "consent.google" in driver.current_url: try: # Try multiple approaches to find and click accept button clicked = False # Method 1: Find by aria-label (most reliable for Google consent) for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"): btn.click() clicked = True break # Method 2: Find by text content if not clicked: for btn in driver.find_elements(By.CSS_SELECTOR, "button"): txt = btn.text.lower() if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: btn.click() clicked = True break if clicked: time.sleep(0.5) # Brief wait for consent to process driver.get(url) # Reload the target URL time.sleep(0.5) # Wait for reload except Exception as e: pass break if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url): break time.sleep(0.01) # 10ms - responsive but low CPU # Log current URL after consent handling try: current_url = driver.current_url log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...") except: pass # Wait for page to fully render before polling (tabs may load dynamically) time.sleep(2) # Poll for business info (same pattern as total_reviews extraction) # Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None} start = time.time() debug_logged = False while time.time() - start < 10: try: info = driver.execute_script(""" var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []}; // Business name from h1 var h1 = document.querySelector('h1'); if (h1) result.name = h1.textContent.trim(); // Category - use jsaction attribute (robust, survives class changes) var catBtn = document.querySelector('button[jsaction*="category"]'); if (catBtn) result.category = catBtn.textContent.trim(); // Fallback: look for button after rating that's not a link if (!result.category) { var buttons = document.querySelectorAll('button'); for (var btn of buttons) { var text = btn.textContent.trim(); // Categories are short words, no numbers, not navigation if (text && text.length < 50 && !text.match(/^[0-9]/) && !text.match(/review|star|direction|save|share|photo/i)) { // Check if it's near the rating area var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium'); if (parent) { result.category = text; break; } } } } // Rating from span[role="img"] aria-labels var spans = document.querySelectorAll('span[role="img"]'); for (var i = 0; i < spans.length; i++) { var label = spans[i].getAttribute('aria-label') || ''; // Collect debug info for all aria-labels if (label) { result.debug.push('img-aria: ' + label); } // Rating: "4.8 stars" (English forced via hl=en) var rMatch = label.match(/^([\\d,.]+)\\s*star/i); if (rMatch && !result.rating) { result.rating = parseFloat(rMatch[1].replace(',', '.')); } // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en) // Try direct format first: "79 reviews" var revMatch = label.match(/^([\\d,]+)\\s*review/i); if (revMatch && !result.total_reviews) { result.total_reviews = parseInt(revMatch[1].replace(/,/g, '')); } // Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews" if (!result.total_reviews) { var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i); if (combinedMatch) { var countStr = combinedMatch[1].replace(/,/g, ''); if (countStr.includes('k')) { // Handle "9k+" format result.total_reviews = parseInt(countStr) * 1000; } else { result.total_reviews = parseInt(countStr); } } } } // Also collect tab button texts for debugging (include full text including numbers) var tabs = document.querySelectorAll('button[role="tab"]'); for (var j = 0; j < tabs.length; j++) { var tabText = tabs[j].textContent.trim(); result.debug.push('tab: ' + tabText); // Also try to extract review count from tab text like "Reviews (79)" if (tabText.toLowerCase().includes('review') && !result.total_reviews) { var tabMatch = tabText.match(/\\((\\d+)\\)/); if (tabMatch) { result.total_reviews = parseInt(tabMatch[1]); result.debug.push('Found reviews in tab: ' + tabText); } } } // Also check ALL buttons for reviews count var allButtons = document.querySelectorAll('button'); for (var b = 0; b < allButtons.length; b++) { var btnText = allButtons[b].textContent || ''; if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) { var numMatch = btnText.match(/\\((\\d+)\\)/); if (numMatch && !result.total_reviews) { result.total_reviews = parseInt(numMatch[1]); result.debug.push('Found reviews in button: ' + btnText.substring(0, 50)); } } } // Check if we're on search results vs place page result.debug.push('title: ' + document.title); result.debug.push('url: ' + window.location.href.substring(0, 80)); // Check for search results list var searchResults = document.querySelectorAll('div[role="feed"] > div'); result.debug.push('search_results_count: ' + searchResults.length); // Fallback: Get review count from Reviews tab button "Reviews (79)" // Search ALL tab buttons for one containing "review" text (same as scrape_reviews) if (!result.total_reviews) { var tabs = document.querySelectorAll('button[role="tab"]'); for (var tab of tabs) { var text = tab.textContent.toLowerCase(); if (text.includes('review')) { var match = tab.textContent.match(/\\((\\d+)\\)/); if (match) { result.total_reviews = parseInt(match[1]); break; } } } } // Fallback 2: Look for any button with "Reviews" and a number if (!result.total_reviews) { var buttons = document.querySelectorAll('button'); for (var btn of buttons) { var text = btn.textContent; if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) { var numMatch = text.match(/\\((\\d+)\\)/); if (numMatch) { result.total_reviews = parseInt(numMatch[1]); break; } } } } // Address from button var addrBtn = document.querySelector('button[data-item-id="address"]'); if (addrBtn) { var label = addrBtn.getAttribute('aria-label'); if (label) result.address = label.replace(/^Address:\\s*/i, ''); } return result; """) # Exit early if we have the essentials (name found AND reviews count > 0) if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0: break # Log debug info once after 3 seconds if not debug_logged and time.time() - start > 3: debug_logged = True debug_info = info.get("debug", []) if debug_info: log.info(f"🔍 Validation debug - URL: {url[:50]}...") log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}") for d in debug_info[:10]: # First 10 debug items log.info(f" {d}") except: pass time.sleep(0.1) # 100ms between polls # Final debug log if still no reviews if not info.get("total_reviews"): debug_info = info.get("debug", []) log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling") if debug_info: log.warning(f" Debug items: {debug_info[:10]}") return { "name": info.get("name"), "address": info.get("address"), "rating": info.get("rating"), "total_reviews": info.get("total_reviews"), "category": info.get("category"), "success": bool(info.get("name")), "error": None, "time": time.time() - start_time } except Exception as e: return { "name": None, "address": None, "rating": None, "total_reviews": None, "category": None, "success": False, "error": str(e), "time": time.time() - start_time } finally: if should_close_driver and driver: try: driver.quit() except: pass # ============================================================================ # SESSION WORKFLOW FUNCTIONS (v1.2.0) # ============================================================================ # These functions enable efficient session handoff between validation and scraping. # Instead of creating a new browser for scraping, we reuse the one from validation. def validate_with_session(url: str, headless: bool = False, browser_fingerprint: dict = None, log_capture: LogCapture = None, session_ttl: int = 300) -> dict: """ Validate a Google Maps URL and keep the browser session alive for scraping. This is the first step in the session handoff workflow: 1. validate_with_session() - validates URL, returns session_id (browser stays open) 2. User reviews business info (man in the loop) 3. scrape_with_session() - reuses browser, skips navigation Args: url: Google Maps URL to validate headless: Run Chrome in headless mode (requires Xvfb in Docker) browser_fingerprint: Optional browser fingerprint settings log_capture: Optional LogCapture for logging session_ttl: Session time-to-live in seconds (default: 5 minutes) Returns: dict with: - session_id: ID to use for scrape_with_session() (None if validation failed) - business_info: Extracted business information - total_reviews: Total review count - success: Whether validation succeeded - error: Error message if failed """ from seleniumbase import Driver from .session_manager import get_session_manager start_time = time.time() log = log_capture or LogCapture() driver = None try: # Extract fingerprint settings fp = browser_fingerprint or {} user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" viewport = fp.get('viewport') or {'width': 1200, 'height': 900} geolocation = fp.get('geolocation') timezone = fp.get('timezone') language = fp.get('language', 'en-US') # Create driver driver = Driver( uc=True, headless=headless, page_load_strategy="normal", agent=user_agent ) driver.set_window_size(viewport['width'], viewport['height']) # Apply browser fingerprint via CDP try: if timezone: driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone}) driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language}) if geolocation and 'lat' in geolocation and 'lng' in geolocation: driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { 'latitude': geolocation['lat'], 'longitude': geolocation['lng'], 'accuracy': 1000 }) else: driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { 'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100 }) except Exception as e: log.warn('system', f"Could not apply fingerprint: {e}") # Add URL parameters for consistent results if 'hl=' not in url: separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" if 'gl=' not in url: url = f"{url}&gl=us" # Run validation (but DON'T close driver) result = scrape_reviews( driver=driver, url=url, max_reviews=0, # Not needed for validation validation_only=True, log_capture=log ) validation_info = result.get("validation_info", {}) # validation_info contains business fields directly (name, rating, category, address, total_reviews) business_info = { "name": validation_info.get("name"), "rating": validation_info.get("rating"), "category": validation_info.get("category"), "address": validation_info.get("address"), "total_reviews": validation_info.get("total_reviews") } total_reviews = validation_info.get("total_reviews") # Check if validation succeeded if not business_info.get("name"): # Validation failed - close driver try: driver.quit() except: pass return { "session_id": None, "business_info": business_info, "total_reviews": total_reviews, "success": False, "error": "Could not find business information", "time": time.time() - start_time } # Validation succeeded - create session (keep driver alive) session_manager = get_session_manager() session_id = session_manager.create_session( driver=driver, url=url, business_info=business_info, total_reviews=total_reviews or 0, browser_fingerprint=browser_fingerprint, log_capture=log, ttl_seconds=session_ttl ) log.info('session', f"Created session {session_id} for {business_info.get('name')}", metrics={ 'session_id': session_id, 'total_reviews': total_reviews, 'ttl_seconds': session_ttl }) return { "session_id": session_id, "business_info": business_info, "total_reviews": total_reviews, "success": True, "error": None, "time": time.time() - start_time, "expires_in": session_ttl } except Exception as e: # Close driver on error if driver: try: driver.quit() except: pass log.error('session', f"Validation failed: {e}") return { "session_id": None, "business_info": {}, "total_reviews": None, "success": False, "error": str(e), "time": time.time() - start_time } def scrape_with_session(session_id: str, max_reviews: int = None, progress_callback=None, flush_callback=None, sort_strategy: str = SORT_AUTO, initial_sort: str = None) -> dict: """ Scrape reviews using an existing validated session. This is the second step in the session handoff workflow. The browser is already on the Google Maps page from validation. Args: session_id: Session ID from validate_with_session() max_reviews: Maximum reviews to collect (None = unlimited) progress_callback: Optional callback(current_count, total_count) flush_callback: Optional callback for streaming reviews sort_strategy: Sort strategy ("auto", "multi", etc.) initial_sort: Initial sort order Returns: dict with reviews, count, success, error, etc. """ from .session_manager import get_session_manager start_time = time.time() session_manager = get_session_manager() # Claim the session (marks it as in-use) session = session_manager.claim_session(session_id) if not session: return { "reviews": [], "count": 0, "total_reviews": 0, "success": False, "error": f"Session {session_id} not found or expired", "time": time.time() - start_time } log = session.log_capture or LogCapture() log.info('session', f"Resuming session {session_id} for scraping", metrics={ 'business': session.business_info.get('name'), 'total_reviews': session.total_reviews }) try: # Run scraper with resume_from_validation=True # This skips navigation since browser is already on page result = scrape_reviews( driver=session.driver, url=session.url, max_reviews=max_reviews if max_reviews else 999999, timeout_no_new=15, flush_callback=flush_callback, flush_batch_size=100, log_capture=log, progress_callback=progress_callback, validation_only=False, sort_strategy=sort_strategy, initial_sort=initial_sort, # Session resume parameters resume_from_validation=True, validated_business_info=session.business_info, validated_total_reviews=session.total_reviews ) elapsed = time.time() - start_time response = { "reviews": result.get("reviews", []), "count": result.get("total", 0), "total_reviews": result.get("total", 0), "time": elapsed, "success": True, "error": None, "logs": result.get("logs", []), "review_topics": result.get("review_topics", []), "session_fingerprint": result.get("session_fingerprint"), "bot_detected": result.get("bot_detected", False), "initial_sort_used": result.get("initial_sort_used", "newest"), "multi_sort": result.get("multi_sort", {}), "business_info": result.get("business_info", session.business_info), "session_id": session_id, "session_reused": True # Flag indicating session was reused } return response except Exception as e: log.error('session', f"Scraping failed for session {session_id}: {e}") return { "reviews": [], "count": 0, "total_reviews": 0, "success": False, "error": str(e), "time": time.time() - start_time, "logs": log.get_logs(), "session_id": session_id } finally: # Always release session (closes browser) session_manager.release_session(session_id, reason="scraping_completed")