diff --git a/scrapers/google_reviews/v1_0_0.py b/scrapers/google_reviews/v1_0_0.py index 791bd80..a52e8d9 100644 --- a/scrapers/google_reviews/v1_0_0.py +++ b/scrapers/google_reviews/v1_0_0.py @@ -801,6 +801,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in time.sleep(0.1) except: pass + log.info('browser', f"Loading: {url[:80]}...") else: log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...") @@ -1069,14 +1070,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in except: pass - # Block images to speed up scrolling (use CDP) + # Block heavy resources to speed up scrolling (use CDP) try: driver.execute_cdp_cmd('Network.setBlockedURLs', { - 'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*'] + 'urls': [ + # Images + '*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', + '*googleusercontent.com/*', + # Fonts + '*.woff', '*.woff2', '*.ttf', '*.otf', + # Analytics/tracking + '*google-analytics.com/*', '*googletagmanager.com/*', + '*doubleclick.net/*', '*googlesyndication.com/*', + # Maps tiles (not needed for reviews) + '*khms*.google.com/*', '*maps.googleapis.com/maps/vt*' + ] }) driver.execute_cdp_cmd('Network.enable', {}) if not is_refresh: - log.info('browser', "Blocking images for faster scrolling") + log.info('browser', "Blocking heavy resources for faster scrolling") except: pass @@ -1198,6 +1210,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in pass return api_revs + # Captcha detection helper + def detect_captcha(): + """Check if a captcha or challenge is blocking the page. 
Returns captcha type or None.""" + try: + return driver.execute_script(""" + // Check for reCAPTCHA iframe or checkbox + var recaptcha = document.querySelector('iframe[src*="recaptcha"], iframe[title*="reCAPTCHA"]'); + if (recaptcha) return 'recaptcha'; + + // Check for "unusual traffic" message + var body = document.body ? document.body.innerText : ''; + if (body.includes('unusual traffic') || body.includes('not a robot')) return 'unusual_traffic'; + + // Check for challenge frame + var challenge = document.querySelector('iframe[src*="challenge"]'); + if (challenge) return 'challenge'; + + return null; + """) + except: + return None + # Recovery function - use real mouse actions when stuck from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.keys import Keys @@ -1557,6 +1591,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in if elapsed >= 3 and int(elapsed) % 3 == 0: # After 8+ failed recovery attempts, try hard refresh if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: + # Check for captcha before hard refresh - no point refreshing if blocked + captcha_type = detect_captcha() + if captcha_type: + log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type}) + stop_scrolling.set() + return { + "reviews": [], + "total": current_count, + "error": f"Captcha detected: {captcha_type}. 
Please solve manually and retry.", + "captcha_detected": True + } + log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]}) if do_hard_refresh(): last_new_time = time.time() # Reset timer after refresh @@ -1596,8 +1642,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in timeout_hit = elapsed >= timeout_no_new if truly_done or timeout_hit: - # Last chance: try hard refresh before giving up - if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): + # Check if we're close enough to total (95%+ threshold) + # If we have 95%+ of reviews, don't waste time with hard refreshes + close_enough = False + if total_reviews[0] and current_count > 0: + pct_complete = (current_count / total_reviews[0]) * 100 + close_enough = pct_complete >= 95 + if close_enough: + log.info('scraper', f"Close enough ({pct_complete:.1f}% complete), skipping further retries", metrics={'pct_complete': pct_complete}) + + # Last chance: try hard refresh before giving up (only if not close enough) + if not close_enough and hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): + # Check for captcha first + captcha_type = detect_captcha() + if captcha_type: + log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type}) + stop_scrolling.set() + break + log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed}) if do_hard_refresh(): last_new_time = time.time() diff --git a/scrapers/google_reviews/v1_1_0.py b/scrapers/google_reviews/v1_1_0.py new file mode 100644 index 0000000..a99dc16 --- /dev/null +++ b/scrapers/google_reviews/v1_1_0.py @@ -0,0 +1,2865 @@ +""" +Google Reviews Scraper v1.1.0 + +This module provides the core Google Maps reviews scraping functionality. 
def get_chrome_memory(driver) -> Optional[int]:
    """Return Chrome's used JS heap size in whole megabytes, or None.

    Issues the ``Performance.getMetrics`` command through
    ``driver.execute_cdp_cmd`` and reads the ``JSHeapUsedSize`` metric.

    Args:
        driver: Object exposing ``execute_cdp_cmd(command, params)``.

    Returns:
        The metric value divided by 1024 twice and truncated to ``int``, or
        ``None`` when the command fails or the metric is not reported.
    """
    try:
        result = driver.execute_cdp_cmd('Performance.getMetrics', {})
        for metric in result.get('metrics', []):
            if metric['name'] == 'JSHeapUsedSize':
                return int(metric['value'] / 1024 / 1024)
    except Exception:
        # Best-effort probe: any driver/protocol failure means "unknown".
        # Deliberately not a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        pass
    return None


def get_dom_node_count(driver) -> Optional[int]:
    """Return the number of elements in the current document, or None.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        Whatever the in-page ``getElementsByTagName('*').length`` expression
        evaluates to, or ``None`` if the script could not be executed.
    """
    try:
        return driver.execute_script("return document.getElementsByTagName('*').length")
    except Exception:
        # Best-effort probe; interrupts are intentionally not swallowed.
        return None
Track session characteristics for analysis + + Args: + driver: Selenium WebDriver instance (must be initialized) + + Returns: + Dictionary containing session fingerprint data + """ + fingerprint = { + "user_agent": None, + "platform": None, + "language": None, + "languages": None, + "timezone": None, + "screen": { + "width": None, + "height": None, + "colorDepth": None + }, + "viewport": { + "width": None, + "height": None + }, + "webgl_vendor": None, + "webgl_renderer": None, + "canvas_fingerprint": None, + "hardware_concurrency": None, + "device_memory": None, + "bot_detection_tests": { + "webdriver_hidden": None, + "chrome_runtime": None, + "permissions_query": None + }, + "captured_at": None + } + + try: + # Navigate to about:blank first to ensure we can execute JS + # (in case driver was just created and hasn't navigated yet) + current_url = driver.current_url + if not current_url or current_url == "data:,": + driver.get("about:blank") + + # Capture timestamp + fingerprint["captured_at"] = datetime.now().isoformat() + + # Basic navigator properties + try: + fingerprint["user_agent"] = driver.execute_script("return navigator.userAgent") + except: + pass + + try: + fingerprint["platform"] = driver.execute_script("return navigator.platform") + except: + pass + + try: + fingerprint["language"] = driver.execute_script("return navigator.language") + except: + pass + + try: + fingerprint["languages"] = driver.execute_script("return navigator.languages") + except: + pass + + try: + fingerprint["timezone"] = driver.execute_script( + "return Intl.DateTimeFormat().resolvedOptions().timeZone" + ) + except: + pass + + # Screen properties + try: + fingerprint["screen"]["width"] = driver.execute_script("return screen.width") + fingerprint["screen"]["height"] = driver.execute_script("return screen.height") + fingerprint["screen"]["colorDepth"] = driver.execute_script("return screen.colorDepth") + except: + pass + + # Viewport properties + try: + 
fingerprint["viewport"]["width"] = driver.execute_script("return window.innerWidth") + fingerprint["viewport"]["height"] = driver.execute_script("return window.innerHeight") + except: + pass + + # WebGL vendor and renderer (important for fingerprinting) + try: + webgl_info = driver.execute_script(""" + try { + var canvas = document.createElement('canvas'); + var gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl'); + if (gl) { + var debugInfo = gl.getExtension('WEBGL_debug_renderer_info'); + if (debugInfo) { + return { + vendor: gl.getParameter(debugInfo.UNMASKED_VENDOR_WEBGL), + renderer: gl.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL) + }; + } + } + } catch(e) {} + return {vendor: null, renderer: null}; + """) + fingerprint["webgl_vendor"] = webgl_info.get("vendor") + fingerprint["webgl_renderer"] = webgl_info.get("renderer") + except: + pass + + # Canvas fingerprint (hash of canvas drawing) + try: + canvas_hash = driver.execute_script(""" + try { + var canvas = document.createElement('canvas'); + canvas.width = 200; + canvas.height = 50; + var ctx = canvas.getContext('2d'); + ctx.textBaseline = 'top'; + ctx.font = '14px Arial'; + ctx.fillStyle = '#f60'; + ctx.fillRect(125, 1, 62, 20); + ctx.fillStyle = '#069'; + ctx.fillText('Fingerprint', 2, 15); + ctx.fillStyle = 'rgba(102, 204, 0, 0.7)'; + ctx.fillText('Fingerprint', 4, 17); + var dataUrl = canvas.toDataURL(); + // Simple hash + var hash = 0; + for (var i = 0; i < dataUrl.length; i++) { + var char = dataUrl.charCodeAt(i); + hash = ((hash << 5) - hash) + char; + hash = hash & hash; + } + return hash.toString(16); + } catch(e) { + return null; + } + """) + fingerprint["canvas_fingerprint"] = canvas_hash + except: + pass + + # Hardware info + try: + fingerprint["hardware_concurrency"] = driver.execute_script( + "return navigator.hardwareConcurrency" + ) + except: + pass + + try: + fingerprint["device_memory"] = driver.execute_script( + "return navigator.deviceMemory" + ) + except: + pass 
def _last_memory_mb(metrics_history) -> float:
    """Return the numeric ``memory_mb`` of the newest metrics entry, else 0."""
    if not metrics_history:
        return 0
    latest = metrics_history[-1]
    value = latest.get('memory_mb') if isinstance(latest, dict) else None
    # A sample can legitimately hold None (memory probe failed); treat any
    # non-numeric value as "no reading" instead of comparing it to a number.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0
    return value


def classify_crash(exception: Exception, metrics_history: list) -> str:
    """Classify a crash from the exception message and the latest metrics.

    Checks are applied in a fixed order and the first hit wins: tab crash,
    timeout, memory exhaustion (latest ``memory_mb`` above 400), missing
    element, rate limiting, network failure.

    Args:
        exception: The exception that ended the run; only ``str(exception)``
            is inspected, case-insensitively.
        metrics_history: Sequence of metric dicts (may be empty or None);
            only the last entry's ``memory_mb`` is read.

    Returns:
        One of ``'tab_crash'``, ``'timeout'``, ``'memory_exhaustion'``,
        ``'element_not_found'``, ``'rate_limited'``, ``'network_failure'``
        or ``'unknown'``.
    """
    error_str = str(exception).lower()

    if 'aw, snap' in error_str or 'status_access_violation' in error_str:
        return 'tab_crash'
    if 'timeout' in error_str:
        return 'timeout'
    if _last_memory_mb(metrics_history) > 400:
        return 'memory_exhaustion'
    if 'no such element' in error_str:
        return 'element_not_found'
    if '429' in error_str or 'rate' in error_str:
        return 'rate_limited'
    if 'network' in error_str or 'connection' in error_str:
        return 'network_failure'
    return 'unknown'


class ScraperCrashException(Exception):
    """Exception that carries crash report data for analysis.

    Attributes are set before ``Exception.__init__`` is called, and the
    exception message is ``str(original_exception)``.
    """

    def __init__(self, original_exception, crash_report):
        # The wrapped exception and the report are exposed as attributes so
        # handlers can inspect them without parsing the message.
        self.original_exception = original_exception
        self.crash_report = crash_report
        super().__init__(str(original_exception))


def get_topic_variants(topic: str) -> List[str]:
    """Generate simple inflection variants of a topic for text matching.

    The heuristics are purely suffix based (no dictionary), so some
    generated strings are not real words; they are harmless for matching.

    Handles:
    - ``-ing`` forms, including doubled consonants (cutting -> cut, cuts)
    - ``-es`` / ``-s`` plurals (services -> service, cuts -> cut)
    - ``-ed`` forms (colored -> color, colors, coloring)
    - bare words (cut -> cuts, cutting; service -> services, servicing)

    Args:
        topic: The topic word/phrase; it is lower-cased and stripped first.

    Returns:
        A list whose first element is the normalized topic, followed by the
        remaining variants in sorted order. Empty or whitespace-only input
        yields ``[]``.

    Example:
        >>> get_topic_variants("cutting")
        ['cutting', 'cut', 'cuts', 'cutt', 'cutts']
        >>> "service" in get_topic_variants("services")
        True
    """
    if not topic:
        return []

    topic = topic.lower().strip()
    if not topic:
        # Whitespace-only input would otherwise produce "", "s" and "ing".
        return []

    variants = {topic}
    ends_ing = topic.endswith("ing")
    ends_ed = topic.endswith("ed")
    ends_s = topic.endswith("s")

    # -ing forms (cutting -> cutt/cutts, then undoubled cut/cuts)
    if ends_ing:
        base = topic[:-3]
        if base:
            variants.update((base, base + "s"))
            if len(base) >= 2 and base[-1] == base[-2]:
                single = base[:-1]
                variants.update((single, single + "s"))

    # Plural forms
    if topic.endswith("es") and len(topic) > 2:
        stem = topic[:-2]
        variants.update((stem, stem + "ing"))
        # When the stem does not end in a sibilant, "-es" is really "-s" on
        # a word ending in "e" (services -> service), so keep that singular.
        if not stem.endswith(("s", "x", "z", "ch", "sh")):
            variants.add(topic[:-1])
    elif ends_s and len(topic) > 1 and not topic.endswith("ss"):
        variants.update((topic[:-1], topic[:-1] + "ing"))

    # -ed forms (colored -> color, colors, coloring)
    if ends_ed and len(topic) > 2:
        base = topic[:-2]
        variants.update((base, base + "s", base + "ing"))
        if len(base) >= 2 and base[-1] == base[-2]:
            variants.add(base[:-1])

    # Bare word: add the common inflections
    if not (ends_ing or ends_ed or ends_s):
        variants.update((topic + "s", topic + "ing"))
        if topic.endswith("e") and len(topic) > 1:
            # Silent-e drop before -ing (service -> servicing).
            variants.add(topic[:-1] + "ing")
        if len(topic) >= 2 and topic[-1] not in "aeiouwy":
            # Consonant doubling before -ing (cut -> cutting).
            variants.add(topic + topic[-1] + "ing")

    variants.discard(topic)
    # Deterministic order: exact topic first, then the rest alphabetically.
    return [topic] + sorted(variants)


def infer_review_topics(review_text: str, topics: List[dict]) -> List[str]:
    """Return the topics whose name (or a variant of it) occurs in the text.

    Matching is case-insensitive and anchored on word boundaries, so "cut"
    does not match inside "execute" or "cutlery".

    Args:
        review_text: The review text to analyze.
        topics: List of topic dicts, e.g. ``[{"topic": "cutting", "count": 3}]``.
            Entries that are not dicts, or whose ``"topic"`` is missing,
            empty or not a string, are skipped.

    Returns:
        The original ``"topic"`` values that matched, in input order.

    Example:
        >>> topics = [{"topic": "hair salon", "count": 4}, {"topic": "cutting", "count": 3}]
        >>> infer_review_topics("Great haircut! The cutting was professional.", topics)
        ['cutting']
    """
    if not review_text or not topics:
        return []

    haystack = review_text.lower()
    matched_topics = []

    for entry in topics:
        if not isinstance(entry, dict):
            continue
        topic = entry.get("topic", "")
        if not isinstance(topic, str) or not topic:
            continue

        variants = [variant for variant in get_topic_variants(topic) if variant]
        if not variants:
            continue

        # One alternation per topic; \b on both sides keeps whole-word
        # semantics for every alternative.
        pattern = r'\b(?:' + '|'.join(map(re.escape, variants)) + r')\b'
        if re.search(pattern, haystack):
            matched_topics.append(topic)  # report the caller's spelling

    return matched_topics
+ """ + + def __init__(self): + self._logger = StructuredLogger() + + def log(self, message: str, level: str = "INFO", source: str = "scraper"): + """Add a log entry with timestamp (backward compatible).""" + # Map source to category + category = self._source_to_category(source) + level_upper = level.upper() + + if level_upper == "ERROR": + self._logger.error(category, message) + elif level_upper == "WARNING" or level_upper == "WARN": + self._logger.warn(category, message) + elif level_upper == "DEBUG": + self._logger.debug(category, message) + else: + self._logger.info(category, message) + + # Also print for console visibility + print(message, flush=True) + + def info(self, category_or_msg, message: str = None, *, metrics: dict = None): + """ + Log an INFO message. + + Supports both old API: info(message, source) + And new API: info(category, message, metrics={...}) + """ + if message is None: + # Old API: info(message) or info(message, source) + self._logger.info('scraper', category_or_msg, metrics=metrics) + print(category_or_msg, flush=True) + else: + # New API: info(category, message, metrics={...}) + self._logger.info(category_or_msg, message, metrics=metrics) + print(message, flush=True) + + def warning(self, category_or_msg, message: str = None, *, metrics: dict = None): + """Log a WARNING message (supports both old and new API).""" + if message is None: + self._logger.warn('scraper', category_or_msg, metrics=metrics) + print(category_or_msg, flush=True) + else: + self._logger.warn(category_or_msg, message, metrics=metrics) + print(message, flush=True) + + def warn(self, category, message: str, *, metrics: dict = None): + """Log a WARN message with category (new API).""" + self._logger.warn(category, message, metrics=metrics) + print(message, flush=True) + + def error(self, category_or_msg, message: str = None, *, metrics: dict = None): + """Log an ERROR message (supports both old and new API).""" + if message is None: + self._logger.error('scraper', 
# Strings that show up in the author slot but are metadata, not names.
_NON_AUTHOR_TOKENS = frozenset(
    ['google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt']
)


def _is_star_rating(value) -> bool:
    """Return True for an int in 1..5; bools are rejected explicitly."""
    # bool is a subclass of int, so True would otherwise pass as 1 star.
    return isinstance(value, int) and not isinstance(value, bool) and 1 <= value <= 5


def _dig(node, *path, default=None):
    """Follow *path* through nested containers; return *default* on a miss."""
    try:
        for key in path:
            node = node[key]
    except (LookupError, TypeError):
        return default
    return node


def parse_api_review(raw: list) -> Optional[dict]:
    """Parse a review from a flat API response array.

    Reads author from index 0, timestamp from 1, text from 3 and rating
    from 4; an owner response is looked up at index 9, then 18.

    Args:
        raw: Candidate review array.

    Returns:
        A review dict with keys ``author``, ``text``, ``rating``,
        ``timestamp``, ``owner_response`` and ``source`` (always ``"api"``),
        or ``None`` when *raw* is not a list of at least 5 items, the rating
        is not an int in 1..5, or the author/timestamp look like metadata.
    """
    if not isinstance(raw, list) or len(raw) < 5:
        return None

    try:
        author = raw[0] if isinstance(raw[0], str) else ""
        timestamp = raw[1]
        text = raw[3] if isinstance(raw[3], str) else ""
        rating = raw[4]

        if not _is_star_rating(rating):
            return None

        # Filter out garbage data (language codes, metadata, etc.)
        if len(author) <= 3 or author.lower() in _NON_AUTHOR_TOKENS:
            return None
        # Timestamp should look like a date, not a URL or language code
        if timestamp:
            stamp = str(timestamp)
            if 'http' in stamp or len(stamp) <= 3:
                return None

        # Owner response: first usable [timestamp, text, ...] list wins
        owner_response = None
        for idx in (9, 18):
            if len(raw) > idx and raw[idx] and isinstance(raw[idx], list):
                resp = raw[idx]
                if len(resp) > 1:
                    owner_response = {"text": resp[1], "timestamp": resp[0] or ""}
                    break

        return {
            "author": author,
            "text": text,
            "rating": rating,
            "timestamp": timestamp,
            "owner_response": owner_response,
            "source": "api"
        }
    except (TypeError, ValueError, IndexError):
        # Malformed entry: treat as "not a review" rather than failing.
        return None


def extract_reviews_from_api_body(body: str) -> list:
    """Extract reviews from an API response body.

    The body may start with the ``)]}'`` prefix, which is removed before
    JSON decoding. Reviews are read from ``data[2]``; for each entry
    ``item[0]`` is the review record with these paths:

        review id: [0]
        author:    [1][4][5][0]
        timestamp: [1][6]
        rating:    [2][0][0]
        text:      [2][15][0][0]

    Args:
        body: Raw response text.

    Returns:
        A list of dicts with keys ``review_id``, ``author``, ``text``,
        ``rating``, ``timestamp`` and ``source`` (always ``"api"``). Entries
        without a non-empty string author or an int rating in 1..5 are
        dropped. Non-string, undecodable or unexpectedly shaped bodies
        yield ``[]``.
    """
    reviews = []
    if not isinstance(body, str):
        return reviews

    # Remove )]}' prefix
    if body.startswith(")]}'"):
        body = body[4:].strip()

    try:
        data = json.loads(body)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        return reviews

    if not isinstance(data, list) or len(data) < 3:
        return reviews
    reviews_area = data[2]
    if not isinstance(reviews_area, list):
        return reviews

    for item in reviews_area:
        if not isinstance(item, list) or not item:
            continue
        review_data = item[0]
        if not isinstance(review_data, list) or len(review_data) < 3:
            continue

        author = _dig(review_data, 1, 4, 5, 0, default="")
        rating = _dig(review_data, 2, 0, 0, default=0)
        if not (isinstance(author, str) and author) or not _is_star_rating(rating):
            continue

        # review_id is kept so callers can deduplicate against DOM reviews.
        reviews.append({
            "review_id": review_data[0] or "",
            "author": author,
            "text": _dig(review_data, 2, 15, 0, 0, default="") or "",
            "rating": rating,
            "timestamp": _dig(review_data, 1, 6, default="") or "",
            "source": "api"
        })

    return reviews
+ num = re.search(r'[\d\.]+', aria.replace(',', '.')) + if num: + rating = int(float(num.group())) + except: + pass + + # Review text - try multiple selectors + text = "" + for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', 'div.MyEned span.wiI7pd', '.wiI7pd']: + try: + text_el = card.find_element(By.CSS_SELECTOR, sel) + text = text_el.text.strip() + if text: + break + except: + pass + + # Note: "More" button clicking removed for speed + # Full text can be expanded later if needed + + # Timestamp + timestamp = "" + try: + time_el = card.find_element(By.CSS_SELECTOR, 'span[class*="rsqaWe"]') + timestamp = time_el.text.strip() + except: + pass + + # Owner response + owner_response = None + try: + resp_box = card.find_element(By.CSS_SELECTOR, "div.CDe7pd") + if resp_box: + resp_text = "" + resp_date = "" + try: + resp_text_el = resp_box.find_element(By.CSS_SELECTOR, "div.wiI7pd") + resp_text = resp_text_el.text.strip() + except: + pass + try: + resp_date_el = resp_box.find_element(By.CSS_SELECTOR, "span.DZSIDd") + resp_date = resp_date_el.text.strip() + except: + pass + if resp_text: + owner_response = {"text": resp_text, "timestamp": resp_date} + except: + pass + + if not review_id and not author: + return None + + return { + "id": review_id, + "author": author, + "text": text, + "rating": rating, + "timestamp": timestamp, + "owner_response": owner_response, + "source": "dom" + } + except Exception: + return None + + +def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, + flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None, + progress_callback=None, validation_only: bool = False, + sort_strategy: str = SORT_AUTO, sort_order: List[str] = None, + multi_sort_threshold: int = MULTI_SORT_THRESHOLD, + close_enough_pct: float = 95.0) -> dict: + """ + Scrape Google Maps reviews with optional multi-sort strategy. 
+ + Args: + driver: Selenium WebDriver instance + url: Google Maps place URL + max_reviews: Maximum reviews to collect + timeout_no_new: Seconds to wait with no new reviews before stopping + flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews + This allows streaming data to disk and freeing memory + flush_batch_size: Number of reviews to collect before flushing (default 500) + log_capture: Optional LogCapture instance for storing logs + progress_callback: Optional callback(current_count, total_count) called every iteration + validation_only: If True, only validate the business exists (no scraping) + sort_strategy: Sort strategy - "auto", "newest", "lowest", "highest", "relevant", or "multi" + - "auto": Use multi-sort if total > threshold or first pass incomplete + - "multi": Force multi-sort through all sort orders + - Others: Single sort mode + sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant) + multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000) + close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0) + + Returns: + dict with reviews list and metadata + """ + # Use provided log_capture or create a dummy that just prints + log = log_capture or LogCapture() + + # Capture session fingerprint early (before navigation) for bot detection analysis + session_fingerprint = capture_session_fingerprint(driver) + log.info('browser', "Session fingerprint captured", metrics={ + 'user_agent': session_fingerprint.get('user_agent', 'unknown')[:50] + '...' 
if session_fingerprint.get('user_agent') else 'unknown', + 'platform': session_fingerprint.get('platform'), + 'timezone': session_fingerprint.get('timezone'), + 'webdriver_hidden': session_fingerprint.get('bot_detection_tests', {}).get('webdriver_hidden'), + 'chrome_runtime': session_fingerprint.get('bot_detection_tests', {}).get('chrome_runtime') + }) + + # Storage - use review ID as key + reviews = {} # review_id -> review + seen_ids = set() # Track all IDs we've seen (persists after flush) + total_flushed = [0] # Use list for closure mutation + review_order = {} # review_id -> position (DOM visual order for sorting) + order_counter = [0] # Current order position + + # Track total reviews (persists across refreshes) + total_reviews = [None] # Use list for closure mutation + + # Store business info extracted from overview (before clicking reviews tab) + business_info_cache = [None] + + # Hard refresh counter + hard_refresh_count = [0] + max_hard_refreshes = 3 # Max number of hard refreshes before giving up + + # Find scrollable reviews container helper + def find_scroll_container(): + selectors = [ + "div.m6QErb.DxyBCb.kA9KIf.dS8AEf", + "div.m6QErb.DxyBCb.kA9KIf", + "div.m6QErb.DxyBCb", + "div.m6QErb[aria-label]", + "div.DxyBCb.kA9KIf.dS8AEf", + "div[role='main'] div.m6QErb", + ] + for sel in selectors: + try: + els = driver.find_elements(By.CSS_SELECTOR, sel) + for el in els: + if el.is_displayed() and el.size['height'] > 100: + return el + except: + pass + return None + + def change_sort_order(target_sort: str) -> bool: + """ + Change the review sort order in the UI. 
+ + Args: + target_sort: One of "newest", "lowest", "highest", "relevant" + + Returns: + True if sort was changed successfully, False otherwise + """ + # Mapping of our sort names to UI text patterns + sort_patterns = { + SORT_NEWEST: ['newest', 'recent', 'más reciente', 'neueste', 'plus récent'], + SORT_LOWEST: ['lowest', 'rating: low', 'puntuación: baja', 'niedrigste'], + SORT_HIGHEST: ['highest', 'rating: high', 'puntuación: alta', 'höchste'], + SORT_RELEVANT: ['relevant', 'relevance', 'más relevante', 'relevanteste'], + } + + patterns = sort_patterns.get(target_sort, [target_sort]) + + try: + # Click sort button to open menu + # Note: The button shows current sort value (e.g., "Most relevant", "Newest") + # IMPORTANT: There are two HQzyZ buttons - "All reviews" (filter) and sort button + # We need the one with sort values, NOT "All reviews" + # The button textContent includes nested elements like "SortMost relevant" so we use includes() + sort_btn = driver.execute_script(""" + // Sort option values (what the button displays) - use lowercase for matching + var sortValues = ['most relevant', 'newest', 'highest rating', 'lowest rating', + 'más relevantes', 'más recientes', 'puntuación más alta', 'puntuación más baja', + 'relevantes', 'recientes']; + + // Exclusion patterns - buttons we should NOT click + var excludePatterns = ['all reviews', 'todas las reseñas', 'alle bewertungen']; + + function normalizeText(text) { + return text.toLowerCase().replace(/\\s+/g, ' ').trim(); + } + + function isExcluded(text) { + var normalized = normalizeText(text); + for (var i = 0; i < excludePatterns.length; i++) { + if (normalized.includes(excludePatterns[i])) return true; + } + return false; + } + + function matchesSortValue(text) { + var normalized = normalizeText(text); + for (var i = 0; i < sortValues.length; i++) { + if (normalized.includes(sortValues[i])) return true; + } + return false; + } + + // Method 1: Find button with aria-haspopup containing sort value (not 
excluded) + var btns = document.querySelectorAll('button[aria-haspopup="true"]'); + for (var i = 0; i < btns.length; i++) { + var text = btns[i].textContent; + if (!isExcluded(text) && matchesSortValue(text)) { + return btns[i]; + } + } + + // Method 2: Button with S9kvJb class (Google's sort button class) + btns = document.querySelectorAll('button.S9kvJb'); + for (var i = 0; i < btns.length; i++) { + var text = btns[i].textContent; + if (!isExcluded(text) && matchesSortValue(text)) { + return btns[i]; + } + } + + // Method 3: Any button containing sort value text + btns = document.querySelectorAll('button'); + for (var i = 0; i < btns.length; i++) { + var text = btns[i].textContent; + if (!isExcluded(text) && matchesSortValue(text)) { + return btns[i]; + } + } + + // Method 4: Look for button near a "Sort" label + var spans = document.querySelectorAll('span'); + for (var i = 0; i < spans.length; i++) { + if (normalizeText(spans[i].textContent) === 'sort' || + normalizeText(spans[i].textContent) === 'ordenar') { + var parent = spans[i].closest('button'); + if (parent && !isExcluded(parent.textContent)) { + return parent; + } + } + } + + return null; + """) + if not sort_btn: + # Debug: check what buttons exist - focus on aria-haspopup buttons + btn_debug = driver.execute_script(""" + var result = []; + + // Show all buttons with aria-haspopup (dropdown buttons) + var dropdowns = document.querySelectorAll('button[aria-haspopup="true"]'); + result.push('Dropdown buttons (' + dropdowns.length + '):'); + for (var i = 0; i < dropdowns.length && i < 5; i++) { + var text = dropdowns[i].textContent.replace(/\\s+/g, ' ').trim().substring(0, 50); + result.push(' [' + i + ']: "' + text + '"'); + } + + // Look for any element with "Sort" or "relevant" text + var sortKeywords = ['sort', 'relevant', 'newest', 'highest', 'lowest']; + var btns = document.querySelectorAll('button'); + var sortBtns = []; + for (var i = 0; i < btns.length; i++) { + var text = 
btns[i].textContent.toLowerCase(); + for (var k = 0; k < sortKeywords.length; k++) { + if (text.includes(sortKeywords[k])) { + sortBtns.push(btns[i].textContent.replace(/\\s+/g, ' ').trim().substring(0, 40)); + break; + } + } + } + if (sortBtns.length > 0) { + result.push('Sort-related buttons: ' + JSON.stringify(sortBtns.slice(0, 5))); + } + + return result; + """) + log.warn('browser', f"Could not find sort button for {target_sort}. Debug: {btn_debug}") + return False + + # Use JavaScript click (more reliable than Selenium click) + driver.execute_script("arguments[0].click();", sort_btn) + time.sleep(0.5) + + # Wait for menu to appear with retries + menu_found = False + for attempt in range(3): + menu_items = driver.execute_script(""" + var items = []; + // Check for menuitemradio (Google's standard) + var radios = document.querySelectorAll('[role="menuitemradio"]'); + for (var i = 0; i < radios.length; i++) { + items.push(radios[i].textContent.trim()); + } + // Also check for menuitem + if (items.length === 0) { + var menuItems = document.querySelectorAll('[role="menuitem"]'); + for (var i = 0; i < menuItems.length; i++) { + items.push(menuItems[i].textContent.trim()); + } + } + // Check menu containers + if (items.length === 0) { + var menus = document.querySelectorAll('[role="menu"], [role="listbox"]'); + for (var m = 0; m < menus.length; m++) { + var divs = menus[m].querySelectorAll('div'); + for (var i = 0; i < divs.length; i++) { + var text = divs[i].textContent.trim(); + if (text.length > 2 && text.length < 30) { + items.push(text); + } + } + } + } + return items; + """) + if menu_items and len(menu_items) > 0: + menu_found = True + log.info('browser', f"Menu opened, items: {menu_items[:4]}") + break + time.sleep(0.5) + + if not menu_found: + log.warn('browser', f"Menu did not appear after clicking sort button") + + # Click the target sort option - try multiple selector strategies + # Use more comprehensive patterns including exact menu text + 
extended_patterns = { + SORT_NEWEST: ['newest', 'recent', 'más reciente', 'neueste', 'plus récent', 'más recientes'], + SORT_LOWEST: ['lowest', 'rating: low', 'puntuación: baja', 'niedrigste', 'puntuación más baja'], + SORT_HIGHEST: ['highest', 'rating: high', 'puntuación: alta', 'höchste', 'puntuación más alta'], + SORT_RELEVANT: ['most relevant', 'relevant', 'relevance', 'más relevante', 'relevanteste', 'más relevantes'], + } + search_patterns = extended_patterns.get(target_sort, patterns) + patterns_js = json.dumps(search_patterns) + + clicked = driver.execute_script(f""" + var patterns = {patterns_js}; + + function textMatches(txt, patterns) {{ + txt = txt.toLowerCase().trim(); + for (var p = 0; p < patterns.length; p++) {{ + if (txt.includes(patterns[p])) return true; + }} + return false; + }} + + // Strategy 1: menuitemradio elements (Google's standard) + var items = document.querySelectorAll('[role="menuitemradio"]'); + for (var i = 0; i < items.length; i++) {{ + if (textMatches(items[i].textContent, patterns)) {{ + items[i].click(); + return 'menuitemradio'; + }} + }} + + // Strategy 2: menuitem elements + items = document.querySelectorAll('[role="menuitem"]'); + for (var i = 0; i < items.length; i++) {{ + if (textMatches(items[i].textContent, patterns)) {{ + items[i].click(); + return 'menuitem'; + }} + }} + + // Strategy 3: menu items with data-index + items = document.querySelectorAll('[data-index]'); + for (var i = 0; i < items.length; i++) {{ + if (textMatches(items[i].textContent, patterns)) {{ + items[i].click(); + return 'data-index'; + }} + }} + + // Strategy 4: Any element in a menu/listbox container + var menus = document.querySelectorAll('[role="menu"], [role="listbox"]'); + for (var m = 0; m < menus.length; m++) {{ + var children = menus[m].querySelectorAll('*'); + for (var i = 0; i < children.length; i++) {{ + var el = children[i]; + // Only click leaf elements with matching text + if (el.children.length === 0 || el.tagName === 'SPAN') {{ + if 
def setup_reviews_page(is_refresh=False, validation_only_mode=False, initial_sort: str = None):
    """
    Load the place page and prepare its reviews panel for scraping.

    Steps, in order: navigate to ``url`` (resetting through ``about:blank``
    on the initial load), accept the consent interstitial if redirected,
    read business info and the review total from the Overview, click the
    reviews tab, wait for the scroll container, install the fetch/XHR
    interceptor, apply the sort order, expand truncated reviews, block heavy
    resources through CDP, and start a daemon thread that keeps scrolling
    the pane to the bottom.

    The enclosing scope's single-element lists ``total_reviews`` and
    ``business_info_cache`` are mutated in place (never rebound), so no
    ``nonlocal`` declaration is needed.

    Args:
        is_refresh: True when called after a hard refresh. Skips the
            ``about:blank`` reset and suppresses some first-load log lines.
        validation_only_mode: When True, return right after the Overview
            extraction without touching the reviews tab.
        initial_sort: Sort order to apply; ``SORT_NEWEST`` when falsy.

    Returns:
        ``(scroll_container, stop_scrolling_event)`` on success,
        ``(None, None)`` if the reviews scroll container never appears,
        ``("validation_done", None)`` when ``validation_only_mode`` is set.
    """
    # Function-scope import (the original imported ``re`` inside the polling loop).
    import re

    refresh_label = " (after refresh)" if is_refresh else ""

    # Navigate to URL (only on initial load or refresh)
    if not is_refresh:
        # Reset browser state by navigating to a blank page first.
        # This clears any stale state from pooled browser sessions.
        try:
            driver.get("about:blank")
            time.sleep(0.1)
        except Exception:
            pass

        log.info('browser', f"Loading: {url[:80]}...")
    else:
        log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...")
    driver.get(url)

    # Handle consent popup if redirected (poll with tiny sleep)
    start = time.time()
    while time.time() - start < 5:  # Max 5s for consent
        if "consent.google" in driver.current_url:
            log.info('browser', "Handling consent popup...")
            try:
                for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                    txt = btn.text.lower()
                    if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
                        btn.click()
                        # Reload original URL after consent
                        log.info('browser', "Reloading after consent...")
                        driver.get(url)
                        # Wait for page to settle after consent reload
                        time.sleep(1)
                        break
            except Exception:
                pass
            break
        # Check if we're already on the target page
        if "maps/place" in driver.current_url and "consent" not in driver.current_url:
            break
        time.sleep(0.01)  # 10ms - responsive but low CPU

    # Extract business info and total review count BEFORE clicking the reviews
    # tab, while name, rating, category and address are visible on the Overview.
    # Only fills values that are still missing (never overwrites).
    if total_reviews[0] is None or business_info_cache[0] is None:
        start = time.time()
        while time.time() - start < 5:
            try:
                info = driver.execute_script("""
                    var result = {
                        total_reviews: null,
                        name: null,
                        rating: null,
                        category: null,
                        address: null
                    };

                    // Business name from h1
                    var h1 = document.querySelector('h1');
                    if (h1) result.name = h1.textContent.trim();

                    // Category - use jsaction attribute (robust selector)
                    var catBtn = document.querySelector('button[jsaction*="category"]');
                    if (catBtn) result.category = catBtn.textContent.trim();

                    // Rating and review count from span[role="img"] aria-labels
                    var spans = document.querySelectorAll('span[role="img"]');
                    for (var i = 0; i < spans.length; i++) {
                        var label = spans[i].getAttribute('aria-label') || '';

                        // Rating: "4.8 stars"
                        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
                        if (rMatch && !result.rating) {
                            result.rating = parseFloat(rMatch[1].replace(',', '.'));
                        }

                        // Reviews: "79 reviews"
                        var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
                        if (revMatch && !result.total_reviews) {
                            result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''), 10);
                        }
                    }

                    // Address from button
                    var addrBtn = document.querySelector('button[data-item-id="address"]');
                    if (addrBtn) {
                        var label = addrBtn.getAttribute('aria-label');
                        if (label) result.address = label.replace(/^Address:\\s*/i, '');
                    }

                    return result;
                """)

                if info:
                    if info.get('total_reviews') and total_reviews[0] is None:
                        total_reviews[0] = info['total_reviews']
                        log.info('scraper', f"Total reviews on page: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
                    if info.get('name') and business_info_cache[0] is None:
                        business_info_cache[0] = info
                        log.info('scraper', f"Business: {info.get('name')}")
                    if total_reviews[0] and business_info_cache[0]:
                        break
            except Exception:
                pass
            time.sleep(0.1)

    # VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
    if validation_only_mode:
        log.info('scraper', "Validation mode: returning early (skipping reviews tab)")
        return ("validation_done", None)

    # Click reviews tab - poll until found
    review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
    # Tab label shapes: "Reviews (79)", "Reviews (1,234)", "Reviews\n79".
    # Grouping separators are allowed inside the number so that "1,234" is
    # not truncated to 1 (which would end the scrape after a single review).
    count_patterns = (r'\((\d[\d.,\s]*)\)', r'(\d[\d.,]*)')
    start = time.time()
    tab_clicked = False
    tabs_logged = False
    while time.time() - start < 5:  # Max 5s for tabs
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
            # Log available tabs once for debugging
            if not tabs_logged and tabs:
                tabs_logged = True
                tab_texts = [t.text for t in tabs]
                log.info('browser', f"Available tabs: {tab_texts}")
            for tab in tabs:
                label = tab.text
                if any(kw in label.lower() for kw in review_keywords):
                    if not is_refresh:
                        log.info('browser', f"Clicking reviews tab: '{label}'")
                    if total_reviews[0] is None:
                        for pattern in count_patterns:
                            match = re.search(pattern, label)
                            if match:
                                total_reviews[0] = int(re.sub(r'\D', '', match.group(1)))
                                log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
                                break
                    tab.click()
                    tab_clicked = True
                    break
            if tab_clicked:
                break
            time.sleep(0.01)  # 10ms between polls
        except Exception:
            # Stale elements etc. while the page is still rendering: retry.
            time.sleep(0.01)

    # Poll for scroll container (10ms intervals - fast but low CPU)
    scroll_container = None
    start = time.time()
    last_print = 0
    while time.time() - start < 10:  # Max 10s
        scroll_container = find_scroll_container()
        if scroll_container:
            break
        elapsed = int(time.time() - start)
        if elapsed > last_print:
            log.info('browser', f"Waiting for reviews panel...{refresh_label} ({elapsed}s)")
            last_print = elapsed
        time.sleep(0.01)  # 10ms - responsive but low CPU

    if not scroll_container:
        log.error('browser', f"Could not find reviews scroll container{refresh_label}")
        try:
            log.error('browser', f"Page title: {driver.title}")
            log.error('browser', f"Current URL: {driver.current_url[:100]}")
        except Exception:
            pass
        return None, None

    log.info('browser', f"Found scroll container{refresh_label}")

    # Inject API interceptor. A page reload discards the patched
    # window.fetch / XMLHttpRequest, so the script runs on every call;
    # only the log line is limited to the first load. The __fetchPatched /
    # __xhrPatched guards make a repeated injection harmless.
    if not is_refresh:
        log.info('network', "Injecting API interceptor...")
    driver.execute_script("""
        // Always re-setup on refresh
        window.__reviewInterceptorInjected = true;
        window.__interceptedResponses = window.__interceptedResponses || [];

        // Intercept fetch (only if not already patched)
        if (!window.__fetchPatched) {
            window.__fetchPatched = true;
            const originalFetch = window.fetch;
            window.fetch = async function(...args) {
                // fetch() accepts a string, a URL or a Request; a Request
                // stringifies to "[object Request]", so read its .url.
                const target = args[0];
                const url = (target && typeof target === 'object' && 'url' in target)
                    ? String(target.url) : String(target);
                const response = await originalFetch.apply(this, args);
                if (url.includes('listugcposts') || url.includes('review')) {
                    try {
                        const clone = response.clone();
                        const text = await clone.text();
                        window.__interceptedResponses.push({url: url, body: text});
                    } catch(e) {}
                }
                return response;
            };
        }

        // Intercept XHR (only if not already patched)
        if (!window.__xhrPatched) {
            window.__xhrPatched = true;
            const originalXHR = window.XMLHttpRequest;
            window.XMLHttpRequest = function() {
                const xhr = new originalXHR();
                const originalOpen = xhr.open;
                let reqUrl = '';
                xhr.open = function(method, url, ...rest) {
                    reqUrl = String(url);
                    return originalOpen.apply(this, [method, url, ...rest]);
                };
                xhr.addEventListener('load', function() {
                    if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
                        try {
                            window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
                        } catch(e) {}
                    }
                });
                return xhr;
            };
            for (let prop of Object.getOwnPropertyNames(originalXHR)) {
                try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
            }
        }
    """)

    # Wait for sort button to appear (it loads after reviews panel)
    sort_found = False
    for wait_attempt in range(5):
        time.sleep(0.5)
        has_sort = driver.execute_script("""
            // Sort option values (use lowercase for matching with includes)
            var sortValues = ['most relevant', 'newest', 'highest rating', 'lowest rating',
                              'más relevantes', 'más recientes', 'puntuación más alta', 'puntuación más baja'];

            // Exclusion patterns - buttons we should NOT match
            var excludePatterns = ['all reviews', 'todas las reseñas'];

            function normalizeText(text) {
                return text.toLowerCase().replace(/\\s+/g, ' ').trim();
            }

            function isExcluded(text) {
                var normalized = normalizeText(text);
                for (var i = 0; i < excludePatterns.length; i++) {
                    if (normalized.includes(excludePatterns[i])) return true;
                }
                return false;
            }

            // Check for buttons containing sort value text (not excluded)
            var btns = document.querySelectorAll('button[aria-haspopup="true"], button.HQzyZ, button.S9kvJb');
            for (var i = 0; i < btns.length; i++) {
                var text = normalizeText(btns[i].textContent);
                if (isExcluded(text)) continue;
                for (var j = 0; j < sortValues.length; j++) {
                    if (text.includes(sortValues[j])) return true;
                }
            }
            return false;
        """)
        if has_sort:
            sort_found = True
            log.info('browser', "Sort button found")
            break

    if not sort_found:
        log.warn('browser', "Sort button not found after waiting, continuing without sorting")

    # Sort by specified order (default: newest)
    target_sort = initial_sort or SORT_NEWEST
    if sort_found and change_sort_order(target_sort):
        # Re-find scroll container after sorting (DOM may be recreated)
        new_container = find_scroll_container()
        if new_container:
            scroll_container = new_container
            log.info('browser', "Refreshed scroll container reference")

    # Expand "More" buttons for full text
    try:
        expanded = driver.execute_script("""
            var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
            var count = 0;
            for (var i = 0; i < buttons.length; i++) {
                if (buttons[i].textContent.trim() === 'More') {
                    buttons[i].click();
                    count++;
                }
            }
            return count;
        """)
        if expanded:
            log.info('browser', f"Expanded {expanded} truncated reviews", metrics={'expanded_count': expanded})
    except Exception:
        pass

    # Block heavy resources to speed up scrolling (use CDP)
    try:
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
            'urls': [
                # Images
                '*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg',
                '*googleusercontent.com/*',
                # Fonts
                '*.woff', '*.woff2', '*.ttf', '*.otf',
                # Analytics/tracking
                '*google-analytics.com/*', '*googletagmanager.com/*',
                '*doubleclick.net/*', '*googlesyndication.com/*',
                # Maps tiles (not needed for reviews)
                '*khms*.google.com/*', '*maps.googleapis.com/maps/vt*'
            ]
        })
        driver.execute_cdp_cmd('Network.enable', {})
        if not is_refresh:
            log.info('browser', "Blocking heavy resources for faster scrolling")
    except Exception:
        # Best effort: blocking is only a speed-up.
        pass

    # Setup scrollable pane reference
    driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)

    # Create scroll worker
    stop_scrolling = threading.Event()

    def scroll_worker():
        # Pin the pane to its bottom every 100ms until asked to stop.
        while not stop_scrolling.is_set():
            try:
                driver.execute_script("""
                    var p = window.scrollablePane;
                    if (p) p.scrollTop = p.scrollHeight;
                """)
            except Exception:
                pass
            time.sleep(0.1)

    scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
    scroll_thread.start()

    return scroll_container, stop_scrolling
document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]'); + + if (!container) { + // Fallback: any radiogroup in the reviews area + container = document.querySelector('div[role="radiogroup"]'); + } + + if (container) { + var buttons = container.querySelectorAll('button[role="radio"]'); + for (var btn of buttons) { + var label = btn.getAttribute('aria-label') || ''; + // Parse "hair salon, mentioned in 4 reviews" format + var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i); + if (match) { + topics.push({ + topic: match[1].trim(), + count: parseInt(match[2]) + }); + } else if (label && !label.toLowerCase().includes('all review')) { + // Fallback: try to extract from child spans + var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall'); + var nameSpan = btn.querySelector('.uEubGf, span:first-child'); + if (nameSpan) { + var name = nameSpan.textContent.trim(); + var count = countSpan ? parseInt(countSpan.textContent) : 0; + if (name && name.toLowerCase() !== 'all') { + topics.push({topic: name, count: count || 0}); + } + } + } + } + } + + return topics; + """) + return topics or [] + except: + return [] + + # Initial page setup (pass validation_only to skip unnecessary steps) + scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only) + + # VALIDATION_ONLY MODE: Return early with just total_reviews and business info + # setup_reviews_page returns ("validation_done", None) in this case + if validation_only or scroll_container == "validation_done": + # Use the business info captured from Overview (before clicking reviews tab) + business_info = business_info_cache[0] or {} + + return { + "reviews": [], + "total": total_reviews[0] or 0, + "scrolls": 0, + "error": None, + "validation_info": { + "name": business_info.get("name"), + "rating": business_info.get("rating"), + "category": business_info.get("category"), + "address": 
def get_api_reviews():
    """Drain the in-page interceptor buffer and return the reviews parsed from it.

    The script hands back everything in ``window.__interceptedResponses``
    and resets that buffer, so each response is seen exactly once. Because
    the buffer is already cleared when parsing starts, each body is parsed
    in its own ``try``: one body that makes ``extract_reviews_from_api_body``
    raise must not discard the remaining responses of the batch.

    Returns:
        A list with the items yielded by ``extract_reviews_from_api_body``
        for every parseable response; empty if the script call fails or
        nothing was intercepted.
    """
    try:
        responses = driver.execute_script("""
            var r = window.__interceptedResponses || [];
            window.__interceptedResponses = [];
            return r;
        """)
    except Exception:
        # Best effort: the page may be mid-navigation; retry next cycle.
        return []

    api_revs = []
    for resp in (responses or []):
        try:
            body = resp.get("body", "")
            api_revs.extend(extract_reviews_from_api_body(body))
        except Exception:
            # Skip only this response and keep the rest of the drained batch.
            continue
    return api_revs
def unstick_scroll():
    """Run one soft-recovery technique to get a stalled reviews pane loading again.

    Every call increments ``recovery_count[0]`` and selects a technique
    from the new count modulo 4, so successive calls rotate through:

    1. click the pane and send two Page Down key presses,
    2. real mouse-wheel scroll over the pane,
    3. jump 2000px up, pause, then jump back to the bottom,
    0. scroll the last review card into view, pause, then jump to the bottom
       (no click, to avoid opening a reviewer profile).

    Failures are swallowed on purpose (the main loop simply tries the next
    technique), but only ``Exception`` is caught so Ctrl-C still works.
    ``scroll_container`` is only read here, so no ``nonlocal`` is required.
    """
    recovery_count[0] += 1
    method = recovery_count[0] % 4

    jump_to_bottom = """
        var p = window.scrollablePane;
        if (p) p.scrollTop = p.scrollHeight;
    """

    try:
        if method == 1:
            # Method 1: Click pane and send Page Down keys
            scroll_container.click()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        elif method == 2:
            # Method 2: Real mouse wheel scroll
            ActionChains(driver).move_to_element(scroll_container)\
                .scroll_by_amount(0, 800).perform()
        elif method == 3:
            # Method 3: Scroll up significantly then back down (force reload)
            driver.execute_script("""
                var p = window.scrollablePane;
                if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000);
            """)
            time.sleep(0.3)
            driver.execute_script(jump_to_bottom)
        else:
            # Method 4: Scroll last card into view, then scroll pane
            driver.execute_script("""
                var cards = document.querySelectorAll('[data-review-id]');
                if (cards.length > 0) {
                    cards[cards.length - 1].scrollIntoView({block: 'end', behavior: 'smooth'});
                }
            """)
            time.sleep(0.3)
            driver.execute_script(jump_to_bottom)
    except Exception:
        pass
Returns True on success.""" + nonlocal scroll_container, stop_scrolling + hard_refresh_count[0] += 1 + + if hard_refresh_count[0] > max_hard_refreshes: + log.warn('system', f"Max hard refreshes ({max_hard_refreshes}) reached, giving up", metrics={'hard_refresh_count': hard_refresh_count[0]}) + return False + + # Stop current scroll worker + stop_scrolling.set() + time.sleep(0.2) + + # Re-setup page + new_container, new_stop = setup_reviews_page(is_refresh=True) + if new_container: + scroll_container = new_container + stop_scrolling = new_stop + recovery_count[0] = 0 # Reset recovery count after successful refresh + log.info('browser', f"Hard refresh successful, resuming with {len(seen_ids)} reviews already collected", metrics={'reviews_collected': len(seen_ids)}) + return True + else: + log.error('browser', "Hard refresh failed to find scroll container") + return False + + # Main collection loop + last_new_time = time.time() + last_count = len(reviews) + check_num = 0 + start_time = time.time() + + # Crash detection: metrics sampling + metrics_history = [] + last_sample_time = time.time() + scroll_count = [0] # Track scroll operations for crash reports + + log.info('browser', f"Scrolling... 
(timeout: {timeout_no_new}s with no new)", metrics={'timeout_seconds': timeout_no_new}) + + cycle_start = time.time() + while True: + check_num += 1 + time.sleep(1.0) # Check every second + + # TIMING: Track cycle performance + t0 = time.time() + cycle_delta = t0 - cycle_start + cycle_start = t0 + + # CRASH DETECTION: Sample metrics every 5 seconds + if time.time() - last_sample_time >= 5: + current_count_for_metrics = total_flushed[0] + len(reviews) + metrics_history.append({ + 'timestamp_ms': int(time.time() * 1000), + 'memory_mb': get_chrome_memory(driver), + 'dom_nodes': get_dom_node_count(driver), + 'reviews_count': current_count_for_metrics + }) + # Keep only last 100 samples + metrics_history = metrics_history[-100:] + last_sample_time = time.time() + + # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language + # Use review_id as key to avoid duplicates with DOM + t1 = time.time() + for rev in get_api_reviews(): + rid = rev.get('review_id', '') + if rid and rid not in seen_ids: + reviews[rid] = rev + seen_ids.add(rid) + api_time = time.time() - t1 + + # Expand any new "More" buttons for full text (batch click, fast) + try: + driver.execute_script(""" + var buttons = document.querySelectorAll('button.w8nwRe.kyuRq'); + for (var i = 0; i < buttons.length; i++) { + if (buttons[i].textContent.trim() === 'More') { + buttons[i].click(); + } + } + """) + except: + pass + + # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) + # This survives Google's CSS class name changes + # MEMORY FIX: Actually remove processed cards from DOM (not just hide) + # Keep last N cards for scroll continuity + t2 = time.time() + dom_cards = 0 + try: + seen_list = list(seen_ids) + parsed_reviews = driver.execute_script(""" + var seenSet = new Set(arguments[0]); + var results = []; + var processedIds = new Set(); + var sepsRemoved = 0; + var cardsRemoved = 0; + var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference + + 
// ROBUST: Find cards by data attribute only (not class names) + var cards = document.querySelectorAll('[data-review-id]'); + var cardsArray = Array.from(cards); + var totalCards = cardsArray.length; + + for (var i = 0; i < cardsArray.length; i++) { + var card = cardsArray[i]; + var rid = card.getAttribute('data-review-id'); + var isHidden = card.style.display === 'none'; + var isNearEnd = i >= totalCards - KEEP_LAST_N; + + // AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end + // This prevents memory buildup that causes tab crashes + if (isHidden && !isNearEnd) { + // Remove separators first + var sibling = card.nextElementSibling; + while (sibling) { + var nextSib = sibling.nextElementSibling; + var classes = sibling.className || ''; + if (classes.includes('AyRUI') || classes.includes('TFQHme')) { + sibling.remove(); + sepsRemoved++; + sibling = nextSib; + } else { + break; + } + } + // Remove the card itself from DOM + card.remove(); + cardsRemoved++; + continue; + } + + // Skip already hidden cards near end (keep for scroll reference) + if (isHidden) continue; + + // Skip if no ID or already processed this cycle + if (!rid || processedIds.has(rid)) continue; + + // Only process top-level review cards (have aria-label with author name) + if (!card.getAttribute('aria-label')) continue; + processedIds.add(rid); + + // Already seen from API - just track order, skip content + // BUT still hide the card to keep DOM light! 
+ if (seenSet.has(rid)) { + results.push({id: rid, orderOnly: true}); + // Hide this card since we already have its data from API + card.style.display = 'none'; + card.innerHTML = ''; + continue; + } + + var author = '', text = '', rating = 0, timestamp = ''; + + // AUTHOR: Extract from "Photo of {Name}" button aria-label + var photoBtn = card.querySelector('button[aria-label^="Photo of"]'); + if (photoBtn) { + author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim(); + } + // Fallback: card's own aria-label is the author name + if (!author) { + author = card.getAttribute('aria-label') || ''; + } + + // RATING: span with role="img" and aria-label containing "star" + var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]'); + if (ratingEl) { + var match = ratingEl.getAttribute('aria-label').match(/(\\d)/); + if (match) rating = parseInt(match[1]); + } + + // TIMESTAMP: Find span with "X time ago" pattern + var spans = card.querySelectorAll('span'); + for (var j = 0; j < spans.length; j++) { + var spanText = spans[j].textContent.trim(); + if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) { + timestamp = spanText; + break; + } + } + + // TEXT: Find longest text span (not timestamp/UI elements) + var longestText = ''; + for (var j = 0; j < spans.length; j++) { + var spanText = spans[j].textContent.trim(); + if (spanText === timestamp) continue; + if (spanText.match(/^\\d+ stars?$/i)) continue; + if (spanText === 'More' || spanText === 'Less') continue; + if (spanText.match(/^(Like\\d*|Share)$/)) continue; + if (spanText.length > longestText.length && spanText.length > 10) { + longestText = spanText; + } + } + text = longestText; + + if (author && rating >= 1 && rating <= 5) { + results.push({ + id: rid, + orderOnly: false, + author: author, + text: text, + rating: rating, + timestamp: timestamp, + source: 'dom' + }); + } + + // Mark card as processed (hide + clear) - will be removed on next 
cycle + // Keep near-end cards visible for scroll reference + if (!isNearEnd) { + card.style.display = 'none'; + card.innerHTML = ''; + } + } + return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved}; + """, seen_list) + + dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0 + cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0 + if cards_removed > 0: + log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed}) + new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else [] + for rev in new_reviews: + rid = rev.pop('id') + order_only = rev.pop('orderOnly', False) + # Track DOM order for ALL reviews (for sorting output) + if rid not in review_order: + review_order[rid] = order_counter[0] + order_counter[0] += 1 + # Only add content for new reviews (not already from API) + if not order_only: + reviews[rid] = rev + seen_ids.add(rid) + except Exception as e: + log.error('scraper', f"DOM parse error: {e}") + dom_time = time.time() - t2 + + # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory + # Sort by DOM order before flushing + t3 = time.time() + if flush_callback and len(reviews) >= flush_batch_size: + log.info('scraper', f"Flushing {len(reviews)} reviews to disk...", metrics={'batch_size': len(reviews), 'source': 'flush'}) + sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) + flush_callback([r for _, r in sorted_reviews]) + total_flushed[0] += len(reviews) + reviews.clear() # Free memory, but keep seen_ids and review_order + flush_time = time.time() - t3 + + current_count = total_flushed[0] + len(reviews) + + # TIMING: Print if cycle is slow (>2s) + if cycle_delta > 2.0: + log.warn('system', f"SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s 
dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})", metrics={'cycle_time_s': cycle_delta, 'api_time_s': api_time, 'dom_time_s': dom_time, 'dom_cards': dom_cards, 'seen_count': len(seen_ids)}) + + # Check for new reviews + if current_count > last_count: + last_new_time = time.time() + last_count = current_count + + # Check if loading (spinner visible OR network activity) + try: + loading_status = driver.execute_script(""" + var status = {spinner: false, network: false}; + // Check for Google's loading indicators + var spinner = document.querySelector('div[role="progressbar"]'); + if (spinner && spinner.offsetParent !== null) status.spinner = true; + var loading = document.querySelector('.qjESne, .loading'); + if (loading && loading.offsetParent !== null) status.spinner = true; + // Check for recent network activity (API interceptor) + var responses = window.__interceptedResponses || []; + var lastCount = window.__lastResponseCount || 0; + if (responses.length > lastCount) { + status.network = true; + window.__lastResponseCount = responses.length; + } + return status; + """) + is_loading = loading_status.get('spinner') or loading_status.get('network') + if is_loading: + last_new_time = time.time() # Reset timer while loading + except: + is_loading = False + + # Progress update + elapsed = time.time() - last_new_time + if total_reviews[0]: + pct = (current_count / total_reviews[0]) * 100 + log.info('scraper', f"{current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", metrics={'reviews_count': current_count, 'total_reviews': total_reviews[0], 'progress_pct': pct, 'idle_seconds': elapsed}) + else: + log.info('scraper', f"{current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", metrics={'reviews_count': current_count, 'idle_seconds': elapsed}) + + # Call progress callback on every iteration (for real-time log updates) + if progress_callback: + progress_callback(current_count, total_reviews[0]) + + # Stop 
conditions - check BEFORE recovery attempts + if current_count >= max_reviews: + log.info('scraper', f"Reached max: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time}) + stop_scrolling.set() + break + + # Also stop if we have all reviews from the page + if total_reviews[0] and current_count >= total_reviews[0]: + log.info('scraper', f"All {current_count} reviews collected", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time}) + stop_scrolling.set() + break + + # STUCK DETECTION: If no new reviews for 3s+, try to unstick + # Only if we haven't collected all reviews yet + if elapsed >= 3 and int(elapsed) % 3 == 0: + # After 8+ failed recovery attempts, try hard refresh + if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: + # Check for captcha before hard refresh - no point refreshing if blocked + captcha_type = detect_captcha() + if captcha_type: + log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type}) + stop_scrolling.set() + return { + "reviews": [], + "total": current_count, + "error": f"Captcha detected: {captcha_type}. 
Please solve manually and retry.", + "captcha_detected": True + } + + log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]}) + if do_hard_refresh(): + last_new_time = time.time() # Reset timer after refresh + continue # Skip to next iteration + else: + log.info('browser', f"Recovery attempt #{recovery_count[0] + 1}...", metrics={'recovery_attempt': recovery_count[0] + 1}) + unstick_scroll() + + # Check scroll state - track if content is still being added + try: + scroll_state = driver.execute_script(""" + var p = window.scrollablePane; + if (!p) return {atBottom: true, height: 0}; + var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50); + var height = p.scrollHeight; + var lastHeight = window.__lastScrollHeight || 0; + var growing = height > lastHeight; + window.__lastScrollHeight = height; + return {atBottom: atBottom, height: height, growing: growing}; + """) + at_bottom = scroll_state.get('atBottom', True) + content_growing = scroll_state.get('growing', False) + except: + at_bottom = True + content_growing = False + + # Reset timer if content is growing (new reviews loading) + if content_growing: + last_new_time = time.time() + + # Dynamic timeout based on state and recovery attempts + # - Try hard refresh before giving up if we still have refreshes left + # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed + # - 15s max otherwise (keep trying) + recovery_failed = recovery_count[0] >= 5 and elapsed >= 5 + truly_done = at_bottom and not content_growing and recovery_failed + timeout_hit = elapsed >= timeout_no_new + + if truly_done or timeout_hit: + # Check if we're close enough to total (configurable threshold) + # If we have close_enough_pct+ of reviews, don't waste time with hard refreshes + close_enough = False + hit_limit_with_multisort = False + if total_reviews[0] and current_count > 0: + pct_complete = (current_count / 
total_reviews[0]) * 100 + close_enough = pct_complete >= close_enough_pct + if close_enough: + log.info('scraper', f"Close enough ({pct_complete:.1f}% >= {close_enough_pct}%), skipping further retries", metrics={'pct_complete': pct_complete}) + + # Special case: if multi-sort mode and we hit a limit (~1000 reviews), + # exit first pass to try other sorts instead of endless hard refreshes + if (sort_strategy in ["multi", SORT_AUTO]) and current_count >= 1000 and hard_refresh_count[0] >= 1: + hit_limit_with_multisort = True + log.info('scraper', f"Hit ~1000 limit with multi-sort available, proceeding to additional sorts", + metrics={'current_count': current_count, 'pct_complete': pct_complete}) + + # Last chance: try hard refresh before giving up (only if not close enough and not hitting multi-sort limit) + if not close_enough and not hit_limit_with_multisort and hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): + # Check for captcha first + captcha_type = detect_captcha() + if captcha_type: + log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type}) + stop_scrolling.set() + break + + log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed}) + if do_hard_refresh(): + last_new_time = time.time() + continue # Keep trying + log.info('scraper', f"All reviews loaded: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time}) + stop_scrolling.set() + break + + # ===== MULTI-SORT ADDITIONAL PASSES ===== + # After first pass, check if we should do additional passes with different sort orders + first_pass_count = total_flushed[0] + len(reviews) + actual_sort_order = sort_order or DEFAULT_SORT_ORDER + completed_sorts = [SORT_NEWEST] # First pass always uses newest (or initial_sort) + + # Determine if we should do multi-sort + should_multi_sort = False 
+ if sort_strategy == "multi": + should_multi_sort = True + log.info('scraper', "Multi-sort enabled (forced)", metrics={'sort_strategy': 'multi'}) + elif sort_strategy == SORT_AUTO: + # Auto mode: enable if total > threshold OR first pass got < 90% of total + if total_reviews[0] and total_reviews[0] > multi_sort_threshold: + should_multi_sort = True + log.info('scraper', f"Multi-sort auto-enabled (total {total_reviews[0]} > {multi_sort_threshold})", + metrics={'total_reviews': total_reviews[0], 'threshold': multi_sort_threshold}) + elif total_reviews[0] and first_pass_count < (total_reviews[0] * 0.9): + should_multi_sort = True + pct = (first_pass_count / total_reviews[0]) * 100 + log.info('scraper', f"Multi-sort auto-enabled (first pass got {pct:.1f}% < 90%)", + metrics={'first_pass_count': first_pass_count, 'total_reviews': total_reviews[0]}) + + if should_multi_sort and first_pass_count < max_reviews: + remaining_sorts = [s for s in actual_sort_order if s not in completed_sorts] + + for pass_num, next_sort in enumerate(remaining_sorts, start=2): + # Check if we already have enough reviews + current_total = total_flushed[0] + len(reviews) + if current_total >= max_reviews: + log.info('scraper', f"Reached max_reviews ({max_reviews}), skipping remaining sorts") + break + + pass_start_count = current_total + log.info('scraper', f"Pass {pass_num}/{len(actual_sort_order)} ({next_sort}): starting with {current_total} reviews", + metrics={'pass': pass_num, 'sort': next_sort, 'current_total': current_total}) + + # Change sort order + if not change_sort_order(next_sort): + log.warn('scraper', f"Failed to change sort to {next_sort}, skipping") + continue + + time.sleep(0.5) + + # Re-find scroll container (DOM may have changed) + scroll_container = find_scroll_container() + if not scroll_container: + log.warn('scraper', f"Lost scroll container after sort change, skipping {next_sort}") + continue + + # Scroll to top to start fresh + try: + driver.execute_script(""" + var p = 
arguments[0]; + if (p) p.scrollTop = 0; + """, scroll_container) + time.sleep(0.3) + except: + pass + + # Start new scroll worker for this pass + stop_scrolling = threading.Event() + def scroll_worker_pass(): + while not stop_scrolling.is_set(): + try: + driver.execute_script(""" + var p = window.scrollablePane; + if (p) p.scrollTop = p.scrollHeight; + """) + except: + pass + time.sleep(0.1) + + driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) + scroll_thread = threading.Thread(target=scroll_worker_pass, daemon=True) + scroll_thread.start() + + # Mini scraping loop for this sort pass + pass_last_new_time = time.time() + pass_timeout = timeout_no_new + + while True: + time.sleep(1.0) + + # Get reviews + api_reviews = get_api_reviews() + dom_reviews = get_dom_reviews(scroll_container) + all_new = api_reviews + dom_reviews + + # Add new reviews (seen_ids persists across passes!) + new_count = 0 + for rev in all_new: + rid = rev.get("id") or f"{rev.get('author', '')}_{rev.get('timestamp', '')}" + if rid and rid not in seen_ids: + seen_ids.add(rid) + reviews[rid] = rev + if rid not in review_order: + review_order[rid] = order_counter[0] + order_counter[0] += 1 + new_count += 1 + + if new_count > 0: + pass_last_new_time = time.time() + + current_total = total_flushed[0] + len(reviews) + + # Progress update + if progress_callback and total_reviews[0]: + progress_callback(current_total, total_reviews[0]) + + # Flush if batch size reached + if flush_callback and len(reviews) >= flush_batch_size: + sorted_revs = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) + flush_callback([r for _, r in sorted_revs]) + total_flushed[0] += len(reviews) + reviews.clear() + + # Check timeout + elapsed = time.time() - pass_last_new_time + if elapsed >= pass_timeout: + stop_scrolling.set() + break + + # Check max reviews + if current_total >= max_reviews: + stop_scrolling.set() + break + + stop_scrolling.set() + 
completed_sorts.append(next_sort) + + # Calculate yield for this pass + pass_end_count = total_flushed[0] + len(reviews) + pass_yield = pass_end_count - pass_start_count + yield_pct = (pass_yield / max(1, pass_start_count)) * 100 + + log.info('scraper', f"Pass {pass_num} ({next_sort}) complete: +{pass_yield} new reviews ({yield_pct:.1f}% yield)", + metrics={'pass': pass_num, 'sort': next_sort, 'new_reviews': pass_yield, 'yield_pct': yield_pct}) + + # Diminishing returns check + if pass_yield > 0 and yield_pct < DIMINISHING_RETURNS_PCT: + log.info('scraper', f"Low yield ({yield_pct:.1f}% < {DIMINISHING_RETURNS_PCT}%), stopping multi-sort", + metrics={'yield_pct': yield_pct, 'threshold': DIMINISHING_RETURNS_PCT}) + break + + # ===== END MULTI-SORT ===== + + # Flush any remaining reviews (sorted by DOM order) + if flush_callback and reviews: + log.info('scraper', f"Final flush: {len(reviews)} reviews...", metrics={'batch_size': len(reviews), 'source': 'final_flush'}) + sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) + flush_callback([r for _, r in sorted_reviews]) + total_flushed[0] += len(reviews) + reviews.clear() + + # Reviews already parsed during scrolling (real-time parsing) + log.info('scraper', "Finalizing review data...") + + # Final results (sorted by DOM order) + sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) + review_list = [r for _, r in sorted_items] + grand_total = total_flushed[0] + len(review_list) + dom_count = sum(1 for r in review_list if r.get("source") == "dom") + api_count = sum(1 for r in review_list if r.get("source") == "api") + + if total_flushed[0] > 0: + log.info('scraper', f"Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})", metrics={'total_reviews': grand_total, 'flushed_count': total_flushed[0], 'in_memory_count': len(review_list), 'elapsed_seconds': time.time() - start_time}) + else: + 
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False,
                        log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
                        browser_fingerprint: dict = None):
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.

    Args:
        url: Google Maps URL to scrape
        headless: Run Chrome in headless mode
        max_scrolls: Not used (kept for API compatibility)
        progress_callback: Optional callback(current_count, total_count) for progress
        driver: Existing driver instance to reuse
        return_driver: If True, return driver in result
        log_capture: Optional LogCapture instance for real-time log access
        flush_callback: Optional callback(reviews_batch) that receives each flushed
            batch of reviews (each batch holds only reviews not flushed before)
        validation_only: Passed through to scrape_reviews; when it returns a
            "validation_info" entry, that entry is copied into the response
        browser_fingerprint: Optional dict with user's browser fingerprint:
            - geolocation: {lat, lng}
            - userAgent: string
            - viewport: {width, height}
            - timezone: string (e.g., "Europe/Madrid")
            - language: string (e.g., "en-US")
            - platform: string (e.g., "MacIntel")

    Returns:
        Dictionary with: reviews, count, total_reviews, time, success, error, logs,
        review_topics, session_fingerprint, plus driver (when return_driver is set),
        validation_info / captcha_detected (when scrape_reviews reports them) and
        crash_report (when an exception was raised and a report could be built).
        Exceptions are never propagated; they are reported via success/error.
    """
    from seleniumbase import Driver

    start_time = time.time()
    driver_provided = driver is not None
    should_close_driver = not return_driver and not driver_provided

    # Use provided log_capture or create new one
    log_capture = log_capture or LogCapture()

    try:
        # Extract fingerprint settings
        fp = browser_fingerprint or {}
        user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
        geolocation = fp.get('geolocation')
        timezone = fp.get('timezone')
        # `or` (not a .get default) so an explicit null language still falls back
        language = fp.get('language') or 'en-US'

        # Create driver if not provided
        if not driver:
            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
                agent=user_agent  # Use user's actual user agent
            )
            # Set viewport to match user's screen
            driver.set_window_size(viewport['width'], viewport['height'])

            # Apply browser fingerprint settings via CDP
            # NOTE(review): assumed to apply only to a freshly created driver
            # (same scope as the viewport setup) - confirm against the original nesting.
            try:
                # Set timezone if provided
                if timezone:
                    driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
                    log_capture.info('browser', f"Set timezone to {timezone}")

                # Set locale/language
                driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})

                # Set geolocation
                if geolocation and 'lat' in geolocation and 'lng' in geolocation:
                    driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                        'latitude': geolocation['lat'],
                        'longitude': geolocation['lng'],
                        'accuracy': 1000  # ~1km accuracy for IP-based location
                    })
                    log_capture.info('browser', f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})", metrics={'lat': geolocation['lat'], 'lng': geolocation['lng']})
                else:
                    # Default to US (Boston, MA) if no geolocation provided
                    driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                        'latitude': 42.3601,
                        'longitude': -71.0589,
                        'accuracy': 100
                    })
                    log_capture.info('browser', "Set geolocation to US (Boston, MA) [default]", metrics={'lat': 42.3601, 'lng': -71.0589})

                if fp:
                    log_capture.info('browser', f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}", metrics={'viewport_width': viewport['width'], 'viewport_height': viewport['height']})
            except Exception as e:
                # Fingerprinting is best-effort; scraping continues without it
                log_capture.warn('system', f"Could not apply fingerprint settings: {e}")

        # Add URL parameters for consistent results.
        # The separator is recomputed for each parameter so a URL that already
        # carries "hl=" but has no query string does not get a dangling "&gl=us".
        if 'hl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}hl=en"
        if 'gl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}gl=us"

        # Create combined flush callback for progress + external handler
        external_flush = flush_callback  # Save external callback
        internal_flush = None
        if progress_callback or external_flush:
            collected = [0]

            def combined_flush(reviews_batch):
                # scrape_reviews clears its buffer after every flush, so each batch
                # holds only new reviews: accumulate to report a running total.
                collected[0] += len(reviews_batch)
                if progress_callback:
                    progress_callback(collected[0], None)
                if external_flush:
                    external_flush(reviews_batch)  # Pass reviews to external handler

            internal_flush = combined_flush

        # Run the scraper with progress callback for real-time updates
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
            flush_callback=internal_flush,
            flush_batch_size=100,  # Smaller batches for more frequent progress
            log_capture=log_capture,
            progress_callback=progress_callback,  # Pass through for real-time log updates
            validation_only=validation_only  # Return early if just validating
        )

        elapsed = time.time() - start_time

        # scrape_reviews signals a blocked run (e.g. captcha) through an "error"
        # entry instead of raising; surface it rather than reporting success.
        scrape_error = result.get("error")

        # Return in expected format
        response = {
            "reviews": result.get("reviews", []),
            "count": result.get("total", 0),
            "total_reviews": result.get("total", 0),
            "time": elapsed,
            "success": not scrape_error,
            "error": scrape_error or None,
            "logs": result.get("logs", []),
            "review_topics": result.get("review_topics", []),  # Topic filters with mention counts
            "session_fingerprint": result.get("session_fingerprint")  # Browser fingerprint for bot detection
        }

        if "captcha_detected" in result:
            response["captcha_detected"] = result["captcha_detected"]

        # Include validation_info if in validation_only mode
        if validation_only and "validation_info" in result:
            response["validation_info"] = result["validation_info"]

        if return_driver:
            response["driver"] = driver
        elif should_close_driver:
            try:
                driver.quit()
            except Exception:
                # Browser may already be gone; nothing more to clean up
                pass

        return response

    except Exception as e:
        elapsed = time.time() - start_time

        # CRASH DETECTION: Build crash report before closing driver
        crash_report = None
        try:
            if driver:
                # Try to sample final metrics from the browser
                final_metrics = {
                    'timestamp_ms': int(time.time() * 1000),
                    'memory_mb': get_chrome_memory(driver),
                    'dom_nodes': get_dom_node_count(driver)
                }
                # Build crash report with available information
                crash_report = {
                    'crash_type': classify_crash(e, [final_metrics]),
                    'error_message': str(e),
                    'state': {
                        'reviews_extracted': 0,  # Unknown at crash time
                        'total_expected': None,
                        'scroll_count': 0,
                        'elapsed_seconds': elapsed
                    },
                    'metrics_history': [final_metrics],
                    'logs_before_crash': log_capture.get_logs()[-20:] if log_capture else [],
                    'last_successful_review_id': None
                }
                log_capture.error('system', f"Crash detected: {crash_report['crash_type']}",
                                  metrics={'error': str(e), 'elapsed_seconds': elapsed})
        except Exception:
            # If we can't build crash report, continue with basic error handling
            pass

        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception:
                pass

        # Log error to the existing log_capture
        log_capture.error('system', f"Scraper failed: {str(e)}")

        result = {
            "reviews": [],
            "count": 0,
            "total_reviews": 0,
            "time": elapsed,
            "success": False,
            "error": str(e),
            "driver": driver if return_driver else None,
            "logs": log_capture.get_logs()
        }

        # Include crash report if available
        if crash_report:
            result['crash_report'] = crash_report

        return result
+ + Args: + driver: Selenium WebDriver instance (already on the business page) + url: Optional URL to navigate to first (if not already on the page) + + Returns: + dict with section names as keys, each containing list of features + """ + try: + # Navigate if URL provided + if url: + # Force English + if 'hl=' not in url: + separator = '&' if '?' in url else '?' + url = f"{url}{separator}hl=en" + if 'gl=' not in url: + url = f"{url}&gl=us" + driver.get(url) + time.sleep(1) + + # Click About tab using robust selectors + clicked = driver.execute_script(""" + // Try multiple selectors for about tab + var selectors = [ + 'button[aria-label*="About"]', + 'button[data-tab-index="2"]', + 'div[role="tablist"] button:nth-child(3)', + 'button[jsaction*="about"]' + ]; + + for (var sel of selectors) { + var btn = document.querySelector(sel); + if (btn && btn.textContent.toLowerCase().includes('about')) { + btn.click(); + return true; + } + } + + // Fallback: find by text content + var buttons = document.querySelectorAll('button'); + for (var btn of buttons) { + if (btn.textContent.trim().toLowerCase() === 'about') { + btn.click(); + return true; + } + } + return false; + """) + + if not clicked: + return {} + + time.sleep(1.5) # Wait for about tab to load + + # Extract about sections using aria-labels (robust) + about = driver.execute_script(""" + var about = {}; + + // Find the about region by aria-label or role + var container = document.querySelector('div[role="region"][aria-label*="About"]'); + + if (!container) { + // Fallback: look for the scrollable area with sections + container = document.querySelector('.m6QErb[aria-label*="About"]'); + } + + if (!container) { + // Last resort: find sections by h2 headers + container = document; + } + + // Find all section headers (h2 elements) + var sections = container.querySelectorAll('h2'); + + for (var h2 of sections) { + var sectionName = h2.textContent.trim(); + var items = []; + + // Find the ul list following this h2 + var 
def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> dict:
    """
    Extract business card info from Google Maps.

    Navigates to the URL (forcing hl=en / gl=us), dismisses the Google consent
    page if it appears, waits briefly for the page to render, then polls the
    DOM for up to 10 seconds until a name and a positive review count are found.

    Args:
        url: Google Maps URL of the business
        headless: Run Chrome in headless mode (only used when a driver is created here)
        driver: Existing driver instance to reuse; it is never closed by this function
        return_driver: If True, keep the driver open and return it under "driver"

    Returns:
        dict with: name, address, rating, total_reviews, category, success, error, time
        (plus "driver" when return_driver is True). Exceptions are not propagated;
        they are reported through success=False and error.
    """
    from seleniumbase import Driver
    import logging
    log = logging.getLogger(__name__)

    start_time = time.time()
    driver_provided = driver is not None
    should_close_driver = not return_driver and not driver_provided

    try:
        # Create driver if not provided
        if not driver:
            driver = Driver(uc=True, headless=headless)

            # Set geolocation to US
            # NOTE(review): assumed to apply only to a freshly created driver -
            # confirm against the original nesting.
            try:
                driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                    'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
                })
            except Exception:
                # Geolocation override is best-effort
                pass

        # Don't clear state - Google may serve different content based on session history
        # The scraper doesn't reset state, so validation shouldn't either

        # Force English interface for consistent parsing.
        # The separator is recomputed per parameter so a URL without a query
        # string never ends up with a dangling "&gl=us".
        if 'hl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}hl=en"
        if 'gl=' not in url:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}gl=us"

        # Navigate to URL
        driver.get(url)

        # Handle consent popup - poll with 10ms sleep (same as scrape_reviews)
        start = time.time()
        while time.time() - start < 5:
            if "consent.google" in driver.current_url:
                try:
                    # Try multiple approaches to find and click accept button
                    clicked = False

                    # Method 1: Find by aria-label (most reliable for Google consent)
                    for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
                        btn.click()
                        clicked = True
                        break

                    # Method 2: Find by text content
                    if not clicked:
                        for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                            txt = btn.text.lower()
                            if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
                                btn.click()
                                clicked = True
                                break

                    if clicked:
                        time.sleep(0.5)  # Brief wait for consent to process
                        driver.get(url)  # Reload the target URL
                        time.sleep(0.5)  # Wait for reload
                except Exception:
                    # Consent handling is best-effort; fall through to polling
                    pass
                break
            if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
                break
            time.sleep(0.01)  # 10ms - responsive but low CPU

        # Log current URL after consent handling
        try:
            current_url = driver.current_url
            log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
        except Exception:
            pass

        # Wait for page to fully render before polling (tabs may load dynamically)
        time.sleep(2)

        # Poll for business info (same pattern as total_reviews extraction)
        # Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
        info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
        start = time.time()
        debug_logged = False
        while time.time() - start < 10:
            try:
                polled = driver.execute_script("""
                    var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};

                    // Business name from h1
                    var h1 = document.querySelector('h1');
                    if (h1) result.name = h1.textContent.trim();

                    // Category - use jsaction attribute (robust, survives class changes)
                    var catBtn = document.querySelector('button[jsaction*="category"]');
                    if (catBtn) result.category = catBtn.textContent.trim();

                    // Fallback: look for button after rating that's not a link
                    if (!result.category) {
                        var buttons = document.querySelectorAll('button');
                        for (var btn of buttons) {
                            var text = btn.textContent.trim();
                            // Categories are short words, no numbers, not navigation
                            if (text && text.length < 50 && !text.match(/^[0-9]/) &&
                                !text.match(/review|star|direction|save|share|photo/i)) {
                                // Check if it's near the rating area
                                var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
                                if (parent) {
                                    result.category = text;
                                    break;
                                }
                            }
                        }
                    }

                    // Rating from span[role="img"] aria-labels
                    var spans = document.querySelectorAll('span[role="img"]');
                    for (var i = 0; i < spans.length; i++) {
                        var label = spans[i].getAttribute('aria-label') || '';

                        // Collect debug info for all aria-labels
                        if (label) {
                            result.debug.push('img-aria: ' + label);
                        }

                        // Rating: "4.8 stars" (English forced via hl=en)
                        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
                        if (rMatch && !result.rating) {
                            result.rating = parseFloat(rMatch[1].replace(',', '.'));
                        }

                        // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
                        // Try direct format first: "79 reviews"
                        var revMatch = label.match(/^([\\d,]+)\\s*review/i);
                        if (revMatch && !result.total_reviews) {
                            result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
                        }

                        // Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
                        if (!result.total_reviews) {
                            var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
                            if (combinedMatch) {
                                var countStr = combinedMatch[1].replace(/,/g, '');
                                if (countStr.includes('k')) {
                                    // Handle "9k+" format
                                    result.total_reviews = parseInt(countStr) * 1000;
                                } else {
                                    result.total_reviews = parseInt(countStr);
                                }
                            }
                        }
                    }

                    // Also collect tab button texts for debugging (include full text including numbers)
                    var tabs = document.querySelectorAll('button[role="tab"]');
                    for (var j = 0; j < tabs.length; j++) {
                        var tabText = tabs[j].textContent.trim();
                        result.debug.push('tab: ' + tabText);
                        // Also try to extract review count from tab text like "Reviews (79)"
                        if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
                            var tabMatch = tabText.match(/\\((\\d+)\\)/);
                            if (tabMatch) {
                                result.total_reviews = parseInt(tabMatch[1]);
                                result.debug.push('Found reviews in tab: ' + tabText);
                            }
                        }
                    }

                    // Also check ALL buttons for reviews count
                    var allButtons = document.querySelectorAll('button');
                    for (var b = 0; b < allButtons.length; b++) {
                        var btnText = allButtons[b].textContent || '';
                        if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
                            var numMatch = btnText.match(/\\((\\d+)\\)/);
                            if (numMatch && !result.total_reviews) {
                                result.total_reviews = parseInt(numMatch[1]);
                                result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
                            }
                        }
                    }

                    // Check if we're on search results vs place page
                    result.debug.push('title: ' + document.title);
                    result.debug.push('url: ' + window.location.href.substring(0, 80));

                    // Check for search results list
                    var searchResults = document.querySelectorAll('div[role="feed"] > div');
                    result.debug.push('search_results_count: ' + searchResults.length);

                    // Fallback: Get review count from Reviews tab button "Reviews (79)"
                    // Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
                    if (!result.total_reviews) {
                        var tabs = document.querySelectorAll('button[role="tab"]');
                        for (var tab of tabs) {
                            var text = tab.textContent.toLowerCase();
                            if (text.includes('review')) {
                                var match = tab.textContent.match(/\\((\\d+)\\)/);
                                if (match) {
                                    result.total_reviews = parseInt(match[1]);
                                    break;
                                }
                            }
                        }
                    }

                    // Fallback 2: Look for any button with "Reviews" and a number
                    if (!result.total_reviews) {
                        var buttons = document.querySelectorAll('button');
                        for (var btn of buttons) {
                            var text = btn.textContent;
                            if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
                                var numMatch = text.match(/\\((\\d+)\\)/);
                                if (numMatch) {
                                    result.total_reviews = parseInt(numMatch[1]);
                                    break;
                                }
                            }
                        }
                    }

                    // Address from button
                    var addrBtn = document.querySelector('button[data-item-id="address"]');
                    if (addrBtn) {
                        var label = addrBtn.getAttribute('aria-label');
                        if (label) result.address = label.replace(/^Address:\\s*/i, '');
                    }

                    return result;
                """)
                # Keep the last good snapshot: a non-dict result (e.g. None while
                # the page is navigating) must not replace `info`, otherwise the
                # .get() calls after the loop would raise.
                if isinstance(polled, dict):
                    info = polled

                # Exit early if we have the essentials (name found AND reviews count > 0)
                if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
                    break

                # Log debug info once after 3 seconds
                if not debug_logged and time.time() - start > 3:
                    debug_logged = True
                    debug_info = info.get("debug", [])
                    if debug_info:
                        log.info(f"🔍 Validation debug - URL: {url[:50]}...")
                        log.info(f"   Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
                        for d in debug_info[:10]:  # First 10 debug items
                            log.info(f"   {d}")
            except Exception:
                # Transient script/driver errors: retry on the next poll
                pass
            time.sleep(0.1)  # 100ms between polls

        # Final debug log if still no reviews
        if not info.get("total_reviews"):
            debug_info = info.get("debug", [])
            log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
            if debug_info:
                log.warning(f"   Debug items: {debug_info[:10]}")

        response = {
            "name": info.get("name"),
            "address": info.get("address"),
            "rating": info.get("rating"),
            "total_reviews": info.get("total_reviews"),
            "category": info.get("category"),
            "success": bool(info.get("name")),
            "error": None,
            "time": time.time() - start_time
        }
        # Hand the open browser back to the caller; without this a driver created
        # here with return_driver=True would be neither returned nor closed.
        if return_driver:
            response["driver"] = driver
        return response

    except Exception as e:
        response = {
            "name": None,
            "address": None,
            "rating": None,
            "total_reviews": None,
            "category": None,
            "success": False,
            "error": str(e),
            "time": time.time() - start_time
        }
        if return_driver:
            response["driver"] = driver
        return response

    finally:
        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception:
                # Browser may already be gone; nothing more to clean up
                pass
"""
Quick CLI tool to test v1.1.0 scraper with multi-sort support.

Usage:
    # Basic test (auto mode - enables multi-sort if needed)
    python tools/test_scraper_v110.py "ClickRent Gran Canaria"

    # Force multi-sort through all sort orders
    python tools/test_scraper_v110.py "White Hart Hotel Boston UK" --multi-sort

    # Custom sort order
    python tools/test_scraper_v110.py "Business" --multi-sort --sort-order "newest,lowest,highest"

    # Single sort mode
    python tools/test_scraper_v110.py "Business" --sort newest

    # Set close-enough threshold
    python tools/test_scraper_v110.py "Business" --close-enough 90
"""

import sys
import os
import argparse
import time
import json
from datetime import datetime
from urllib.parse import quote

# Add project root to path so `scrapers.*` resolves when run as a script
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def _build_parser():
    """Create the argument parser for the scraper test CLI."""
    parser = argparse.ArgumentParser(
        description='Test Google Reviews scraper v1.1.0 with multi-sort',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('query', nargs='?', help='Business name to search')
    parser.add_argument('--url', '-u', help='Direct Google Maps URL (overrides query)')
    parser.add_argument('--max', '-m', type=int, default=2000, help='Max reviews to scrape (default: 2000)')
    parser.add_argument('--timeout', '-t', type=int, default=15, help='Timeout for no new reviews (default: 15s)')
    parser.add_argument('--headless', action='store_true', help='Run in headless mode')
    parser.add_argument('--output', '-o', help='Output JSON file')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose logging')

    # Multi-sort options
    parser.add_argument('--sort', choices=['auto', 'newest', 'lowest', 'highest', 'relevant', 'multi'],
                        default='auto', help='Sort strategy (default: auto)')
    parser.add_argument('--multi-sort', action='store_true', help='Force multi-sort mode')
    parser.add_argument('--sort-order', help='Custom sort order, comma-separated (e.g., "newest,lowest,highest")')
    parser.add_argument('--close-enough', type=float, default=95.0, help='Stop retrying at this %% (default: 95)')
    return parser


def _parse_sort_order(raw):
    """Split a comma-separated sort list, trimming blanks; None when nothing usable remains."""
    if not raw:
        return None
    names = [part.strip() for part in raw.split(',')]
    # "newest, lowest" or a trailing comma must not yield ' lowest' / '' entries
    return [name for name in names if name] or None


def _star_string(rating):
    """Render a rating as repeated star emoji, tolerating None, floats and junk values."""
    try:
        return '⭐' * int(rating or 0)
    except (TypeError, ValueError):
        return ''


def main(argv=None):
    """Run one scrape from the command line and print a summary.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) makes argparse read ``sys.argv``,
            so existing ``main()`` callers behave exactly as before.

    Returns:
        Process exit code: 0 when the scrape result carries no ``error``,
        1 when it does or when an exception was raised while scraping.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments,
            including when neither a query nor ``--url`` is supplied.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if not args.query and not args.url:
        parser.error('Either query or --url is required')

    # Build URL (an explicit --url wins over the search query)
    if args.url:
        url = args.url
    else:
        url = f"https://www.google.com/maps/search/?api=1&query={quote(args.query)}&hl=en"

    # Determine sort strategy (--multi-sort overrides --sort)
    sort_strategy = 'multi' if args.multi_sort else args.sort
    sort_order = _parse_sort_order(args.sort_order)

    rule = '=' * 60
    print(f"\n{rule}")
    print("🔍 SCRAPER TEST v1.1.0 (Multi-Sort)")
    print(rule)
    print(f"URL: {url}")
    print(f"Max reviews: {args.max}")
    print(f"Sort strategy: {sort_strategy}")
    if sort_order:
        print(f"Sort order: {sort_order}")
    print(f"Close enough: {args.close_enough}%")
    print(f"Timeout: {args.timeout}s")
    print(f"Headless: {args.headless}")
    print(f"{rule}\n")

    # Import the v1.1.0 scraper only after argument validation, so --help and
    # usage errors work without the browser stack installed.
    from seleniumbase import Driver
    from scrapers.google_reviews.v1_1_0 import scrape_reviews, LogCapture

    # Set up log capture
    log_capture = LogCapture()

    # Reviews handed over through flush_callback while the scrape is running
    reviews_collected = []

    def progress_callback(current, total):
        if args.verbose:
            print(f" Progress: {current}/{total or '?'}")

    def flush_callback(reviews):
        reviews_collected.extend(reviews)
        print(f" 📥 Flushed {len(reviews)} reviews (total: {len(reviews_collected)})")

    # Set up driver
    print("🚀 Starting browser...")
    driver = Driver(uc=True, headless=args.headless)

    try:
        # Inside the try so a failure here still reaches driver.quit()
        driver.set_window_size(1200, 900)

        start_time = time.time()
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=args.max,
            timeout_no_new=args.timeout,
            log_capture=log_capture,
            flush_callback=flush_callback,
            progress_callback=progress_callback,
            flush_batch_size=100,
            sort_strategy=sort_strategy,
            sort_order=sort_order,
            close_enough_pct=args.close_enough
        )
        elapsed = time.time() - start_time

        # Combine flushed + remaining reviews ('reviews' may be absent or None)
        all_reviews = reviews_collected + (result.get('reviews') or [])

        print(f"\n{rule}")
        print("✅ SCRAPE COMPLETE")
        print(rule)
        print(f"Total reviews: {len(all_reviews)}")
        print(f"Time: {elapsed:.1f}s")
        if all_reviews and elapsed > 0:
            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")

        # Multi-sort info
        multi_sort_info = result.get('multi_sort') or {}
        if multi_sort_info.get('enabled'):
            print("\n🔄 Multi-Sort:")
            print(f" Sorts used: {multi_sort_info.get('completed_sorts', [])}")
            print(f" First pass: {multi_sort_info.get('first_pass_count', 0)} reviews")

        if result.get('error'):
            print(f"⚠️ Error: {result['error']}")

        # Show sample review
        if all_reviews:
            print("\n📝 Sample review:")
            sample = all_reviews[0]
            print(f" Author: {sample.get('author', 'N/A')}")
            print(f" Rating: {_star_string(sample.get('rating'))}")
            text = sample.get('text')
            if text:
                snippet = text[:100] + '...' if len(text) > 100 else text
                print(f" Text: {snippet}")

        # Save output if requested
        if args.output:
            output_data = {
                'timestamp': datetime.now().isoformat(),
                'url': url,
                'query': args.query,
                'total_reviews': len(all_reviews),
                'elapsed_seconds': elapsed,
                'speed': len(all_reviews)/elapsed if elapsed > 0 else 0,
                'multi_sort': multi_sort_info,
                'error': result.get('error'),
                'reviews': all_reviews
            }
            # Explicit UTF-8 + ensure_ascii=False keeps non-English review
            # text readable and independent of the platform default encoding.
            with open(args.output, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, indent=2, ensure_ascii=False)
            print(f"\n💾 Saved to: {args.output}")

        print(f"{rule}\n")

        return 1 if result.get('error') else 0

    except Exception as e:
        # Top-level boundary of a CLI tool: report, show the traceback, exit non-zero
        print(f"\n❌ SCRAPE FAILED: {e}")
        import traceback
        traceback.print_exc()
        return 1

    finally:
        print("🛑 Closing browser...")
        try:
            driver.quit()
        except Exception as quit_error:
            # A failing quit must not replace the exit code computed above
            print(f"⚠️ Browser did not close cleanly: {quit_error}")


if __name__ == '__main__':
    sys.exit(main())