From e3136281b847b676859f5518206601159c88b987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:59:09 +0000 Subject: [PATCH] Remove fast_scraper.py - consolidated into scraper_clean All functionality now in scraper_clean.py: - fast_scrape_reviews (main scraper) - get_business_card_info (validation) Updated health_checks.py to import from scraper_clean. Removes 1,935 lines of duplicate/obsolete code. Co-Authored-By: Claude Opus 4.5 --- modules/fast_scraper.py | 1935 -------------------------------------- modules/health_checks.py | 2 +- 2 files changed, 1 insertion(+), 1936 deletions(-) delete mode 100644 modules/fast_scraper.py diff --git a/modules/fast_scraper.py b/modules/fast_scraper.py deleted file mode 100644 index cb84533..0000000 --- a/modules/fast_scraper.py +++ /dev/null @@ -1,1935 +0,0 @@ -#!/usr/bin/env python3 -""" -Fast DOM-only scraper module for API integration. -Based on start_dom_only_fast.py - achieves ~18.9s for all reviews. - -This module provides a reusable function for the API server. -""" -import logging -import time -from typing import List, Dict, Any, Optional -from seleniumbase import Driver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import TimeoutException - -log = logging.getLogger(__name__) - - -def check_no_reviews_early(driver) -> tuple[bool, str]: - """ - Early detection for 'no reviews available' scenarios. - Returns (has_no_reviews, reason) tuple. - - Uses structural patterns instead of fragile CSS classes for robustness. - """ - try: - # Check for common "no reviews" messages in multiple languages - no_review_patterns = [ - 'no reviews yet', - 'be the first to review', - "there aren't any reviews", - 'no hay reseñas', - 'sin reseñas', - "pas encore d'avis", - 'noch keine bewertungen', - 'nessuna recensione', - 'まだレビューがありません', - 'sem avaliações', - 'belum ada ulasan' - ] - - # Get page text - page_text = driver.execute_script("return document.body.innerText.toLowerCase();") - - # Check for "no reviews" messages - for pattern in no_review_patterns: - if pattern in page_text: - return True, f"Found 'no reviews' message: '{pattern}'" - - # Check if review count is explicitly 0 - # IMPORTANT: Be very specific to avoid false positives! - review_count_check = driver.execute_script(""" - // Only check for EXACT "0 reviews" patterns, not loose matches - const patterns = [ - /^0\\s+reviews?$/im, // Exactly "0 reviews" on its own line - /\\(0\\s+reviews?\\)/i, // "(0 reviews)" - /\\b0\\s+reviews?\\b/i // "0 reviews" as a complete phrase - ]; - - const text = document.body.innerText; - - // Split into lines and check each line independently to avoid false positives - const lines = text.split('\\n'); - for (let line of lines) { - const trimmed = line.trim(); - for (let pattern of patterns) { - if (pattern.test(trimmed)) { - // Double-check: line should be short (not a review text itself) - if (trimmed.length < 50) { - return 'Found explicit "0 reviews" text: ' + trimmed; - } - } - } - } - - return null; - """) - - if review_count_check: - return True, review_count_check - - # Check if reviews tab is disabled or not clickable - reviews_disabled = driver.execute_script(""" - const tabs = document.querySelectorAll('button[role="tab"]'); - for (let tab of tabs) { - const text = (tab.textContent || '').toLowerCase(); - const aria = (tab.getAttribute('aria-label') || '').toLowerCase(); - - if (text.includes('review') || aria.includes('review')) { - if (tab.disabled || tab.getAttribute('aria-disabled') === 'true') { - return 'Reviews tab is disabled'; - } - } - } - return null; - """) - - if reviews_disabled: - return True, reviews_disabled - - return False, "" - - except Exception as e: - log.warning(f"Error in early no-reviews detection: {e}") - return False, "" - - -def extract_total_review_count(driver) -> Optional[int]: - """ - Extract the total number of reviews from the Google Maps page. - Looks for text patterns like "500 reviews" in various elements. - Works on both search results pages and business detail pages. - - Returns: - Total review count or None if not found - """ - extract_script = """ - // Optimized review count extraction - removed verbose logging for speed - let total = null; - - const parenthesesPattern = /\\((\\d[\\d,\\.\\s]*)\\)/; - const numberPattern = /(\\d[\\d,\\.\\s]*)\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i; - - // PRIORITY 1: Search results page - const searchResultsSelectors = [ - 'a[href*="reviews"]', - '[role="article"] span', - '[role="article"] a', - 'div.fontBodyMedium', - 'span.UY7F9', - ]; - - for (const selector of searchResultsSelectors) { - const elements = document.querySelectorAll(selector); - for (let i = 0; i < Math.min(elements.length, 20); i++) { - const elem = elements[i]; - const text = elem.textContent || ''; - const href = elem.getAttribute('href') || ''; - - let match = text.match(numberPattern); - if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - if (num > 0 && num < 1000000) { - total = num; - break; - } - } - - if (href.includes('reviews')) { - match = text.match(/(\\d[\\d,\\.\\s]*)/); - if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - if (num > 0 && num < 1000000) { - total = num; - break; - } - } - } - } - if (total) break; - } - - // PRIORITY 2: Tab buttons (business detail page) - if (!total) { - const buttons = document.querySelectorAll('button[role="tab"]'); - for (let i = 0; i < buttons.length; i++) { - const text = buttons[i].textContent || ''; - let match = text.match(parenthesesPattern); - if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - total = num; - break; - } - match = text.match(numberPattern); - if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - total = num; - break; - } - } - } - - // PRIORITY 3: Aria-labels - if (!total) { - const elements = document.querySelectorAll('[aria-label]'); - for (let elem of elements) { - const ariaLabel = elem.getAttribute('aria-label') || ''; - let match = ariaLabel.match(parenthesesPattern); - if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - total = num; - break; - } - match = ariaLabel.match(numberPattern); - if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - total = num; - break; - } - } - } - - // PRIORITY 4: Fallback - entire page text - if (!total) { - const match = document.body.innerText.match(parenthesesPattern); - if (match) { - const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); - if (num > 0 && num < 1000000) { - total = num; - } - } - } - - return total; - """ - - try: - total = driver.execute_script(extract_script) - - # Get debug info from JavaScript - debug_script = """ - const info = { - search_results_count: document.querySelectorAll('[role="article"]').length, - links_with_reviews: document.querySelectorAll('a[href*="reviews"]').length, - page_url: window.location.href, - page_title: document.title, - sample_texts: [] - }; - - // Get sample text from links that might contain reviews - const reviewLinks = document.querySelectorAll('a[href*="reviews"]'); - for (let i = 0; i < Math.min(5, reviewLinks.length); i++) { - info.sample_texts.push(reviewLinks[i].textContent.substring(0, 100)); - } - - // Also check for text containing "review" keyword - const allText = document.body.innerText.substring(0, 2000); - const reviewMatches = allText.match(/\\d+[\\s,\\.]*(?:review|reseña|avis)/gi); - if (reviewMatches) { - info.review_patterns_found = reviewMatches.slice(0, 5); - } - - return info; - """ - debug_info = driver.execute_script(debug_script) - log.info(f"Page debug: URL={debug_info.get('page_url')}") - log.info(f"Page debug: Found {debug_info.get('search_results_count')} search result articles") - log.info(f"Page debug: Found {debug_info.get('links_with_reviews')} links containing 'reviews'") - if debug_info.get('review_patterns_found'): - log.info(f"Page debug: Review patterns in text: {debug_info.get('review_patterns_found')}") - if debug_info.get('sample_texts'): - log.info(f"Page debug: Sample link texts: {debug_info.get('sample_texts')}") - - if total and total > 0: - log.info(f"Extracted total review count: {total}") - return total - else: - log.warning(f"Could not extract total review count from page. Debug: {debug_info}") - return None - except Exception as e: - log.error(f"Error extracting total review count: {e}") - return None - - -def extract_all_reviews_js(driver) -> List[Dict[str, Any]]: - """Extract ALL reviews using JavaScript - single fast operation.""" - - extract_script = """ - const reviews = []; - - // ROBUST SELECTOR STRATEGY: Try known selectors first, then fall back to structural matching - let elements = null; - - // STRATEGY 1: Try known CSS selectors (fast path) - const knownSelectors = [ - 'div.jftiEf.fontBodyMedium', - 'div.jftiEf', - 'div[data-review-id]', - 'div[jsaction*="review"]' - ]; - - for (let selector of knownSelectors) { - const found = document.querySelectorAll(selector); - if (found.length > 0) { - elements = found; - console.log('Found', found.length, 'reviews using known selector:', selector); - break; - } - } - - // STRATEGY 2: Structural matching for unknown page layouts - // IMPORTANT: Search only within the reviews pane, not the entire page! - if (!elements || elements.length === 0) { - console.log('Known selectors failed, trying structural matching...'); - - // Find the reviews pane first - const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || - document.querySelector('div.m6QErb') || - document.querySelector('div[role="main"]'); - - if (!pane) { - console.warn('No reviews pane found'); - return []; - } - - // Find all divs that LOOK like reviews (have review structure) WITHIN the pane - const allDivs = pane.querySelectorAll('div'); - const reviewElements = []; - - for (let div of allDivs) { - // Skip if too small - if (div.children.length < 2) continue; - - // Check for review indicators - const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); - const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); - const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); - const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|dienų|savaitės)/i); - - // Must have at least author, rating, and text to be a review - const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; - if (indicators >= 3) { - reviewElements.push(div); - } - } - - if (reviewElements.length > 0) { - elements = reviewElements; - console.log('Found', reviewElements.length, 'reviews using structural matching'); - } - } - - // STRATEGY 3: Try role="article" as last resort (within pane) - if (!elements || elements.length === 0) { - const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || - document.querySelector('div.m6QErb') || - document.querySelector('div[role="main"]'); - - if (pane) { - const articles = pane.querySelectorAll('[role="article"]'); - const validArticles = []; - - for (let article of articles) { - const hasRating = article.querySelector('[aria-label*="star" i]'); - const hasText = article.textContent.length > 30; - if (hasRating && hasText) { - validArticles.push(article); - } - } - - if (validArticles.length > 0) { - elements = validArticles; - console.log('Found', validArticles.length, 'reviews using role=article'); - } - } - } - - if (!elements || elements.length === 0) { - console.warn('No review elements found with any strategy'); - return []; - } - - for (let i = 0; i < elements.length; i++) { - const elem = elements[i]; - const review = {}; - - try { - // Author - const authorElem = elem.querySelector('div.d4r55'); - review.author = authorElem ? authorElem.textContent.trim() : null; - - // Rating - const ratingElem = elem.querySelector('span.kvMYJc'); - if (ratingElem) { - const ariaLabel = ratingElem.getAttribute('aria-label'); - if (ariaLabel) { - const match = ariaLabel.match(/\\d+/); - review.rating = match ? parseFloat(match[0]) : null; - } - } - - // Text - const textElem = elem.querySelector('span.wiI7pd'); - review.text = textElem ? textElem.textContent.trim() : null; - - // Date - const dateElem = elem.querySelector('span.rsqaWe'); - review.date_text = dateElem ? dateElem.textContent.trim() : null; - - // DEEP DIVE: Find where Google stores the actual timestamp - review.timestamp = null; - review.debug_date_info = {}; - - if (dateElem) { - // 1. Check all attributes on date element - const allAttrs = {}; - for (let attr of dateElem.attributes) { - allAttrs[attr.name] = attr.value; - } - review.debug_date_info.date_elem_attrs = allAttrs; - - // 2. Check parent elements for data - let parent = dateElem.parentElement; - let parentLevel = 0; - while (parent && parentLevel < 3) { - const parentAttrs = {}; - for (let attr of parent.attributes) { - if (attr.name.includes('data') || attr.name.includes('time') || attr.name.includes('date')) { - parentAttrs[attr.name] = attr.value; - } - } - if (Object.keys(parentAttrs).length > 0) { - review.debug_date_info[`parent_${parentLevel}_attrs`] = parentAttrs; - } - parent = parent.parentElement; - parentLevel++; - } - - // 3. Check the entire review container for hidden data - const reviewContainer = elem; - const containerAttrs = {}; - for (let attr of reviewContainer.attributes) { - containerAttrs[attr.name] = attr.value; - } - review.debug_date_info.container_attrs = containerAttrs; - - // 4. Look for script tags or JSON data near the date - const nearbyScripts = elem.querySelectorAll('script'); - if (nearbyScripts.length > 0) { - review.debug_date_info.has_nearby_scripts = nearbyScripts.length; - } - - // 5. Check for any element with 'time' in class or data - const timeElements = elem.querySelectorAll('[class*="time"], [data-timestamp], [datetime]'); - if (timeElements.length > 0) { - const timeData = []; - timeElements.forEach(el => { - timeData.push({ - tag: el.tagName, - classes: el.className, - datetime: el.getAttribute('datetime'), - timestamp: el.getAttribute('data-timestamp'), - text: el.textContent.substring(0, 50) - }); - }); - review.debug_date_info.time_elements = timeData; - } - } - - // Avatar - const avatarElem = elem.querySelector('img.NBa7we'); - review.avatar_url = avatarElem ? avatarElem.src : null; - - // Profile URL - const profileElem = elem.querySelector('button.WEBjve'); - review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null; - - if (review.author && review.date_text) { - reviews.push(review); - } - } catch (e) { - // Skip this review - } - } - - return reviews; - """ - - # ADDITIONAL: Check for Google's internal state/data objects - check_state_script = """ - // Look for Google Maps' internal data stores - const debugInfo = { - global_keys: [], - app_data: null, - window_data: null - }; - - // Check window object for Google Maps data - for (let key in window) { - if (key.includes('google') || key.includes('maps') || key.includes('APP') || key.includes('_')) { - debugInfo.global_keys.push(key); - } - } - - // Check for common React/Angular state keys - const stateKeys = ['__INITIAL_STATE__', '__NEXT_DATA__', '__APP_STATE__', 'APP_INITIALIZATION_STATE']; - for (let key of stateKeys) { - if (window[key]) { - debugInfo.app_data = key; - } - } - - // Check for embedded JSON in script tags - const scriptTags = document.querySelectorAll('script[type="application/json"], script[type="application/ld+json"]'); - debugInfo.json_scripts_count = scriptTags.length; - if (scriptTags.length > 0) { - debugInfo.json_scripts_sample = Array.from(scriptTags).slice(0, 2).map(s => s.textContent.substring(0, 200)); - } - - return debugInfo; - """ - - try: - reviews_data = driver.execute_script(extract_script) - state_debug = driver.execute_script(check_state_script) - - # Log the global state debug info - log.info(f"Google Maps state debug: {state_debug}") - - # Add review IDs - reviews = [] - for i, review_data in enumerate(reviews_data): - review_id = f"review_{hash(review_data['author'] + review_data['date_text'])}" - review_data['review_id'] = review_id - - # Add global state debug to first review only - if i == 0: - review_data['_google_state_debug'] = state_debug - - reviews.append(review_data) - - return reviews - - except Exception as e: - log.error(f"Error in JavaScript extraction: {e}") - return [] - - -def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, progress_callback=None, driver=None, return_driver: bool = False) -> Dict[str, Any]: - """ - Ultra-fast DOM-only scraping with JavaScript extraction. - - Args: - url: Google Maps URL to scrape - headless: Run Chrome in headless mode (default: True) - max_scrolls: Maximum scrolls safety limit (default: 999999 - effectively unlimited) - The scraper stops automatically via idle detection when no new reviews load. - progress_callback: Optional callback function(current_count, total_count) for progress updates - driver: Existing driver instance to reuse (from worker pool) - return_driver: If True, don't close driver and return it in result - - Returns: - Dictionary with: - - reviews: List of review dictionaries - - count: Total number of reviews scraped - - total_reviews: Total reviews available (from page counter) - - time: Time taken in seconds - - success: True if successful, False otherwise - - error: Error message if failed - - driver: Driver instance (if return_driver=True) - """ - start_time = time.time() - - log.info(f"Starting fast scrape for URL: {url[:80]}...") - - # Force English locale for consistent date parsing - # English gives cleaner date formats: "3 months ago" vs "Hace 3 meses" - # Store original URL in case we need to retry without locale override - original_url = url - locale_override_applied = False - - if 'hl=' in url: - # Replace existing locale - url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') - locale_override_applied = True - else: - # Add English locale parameter - separator = '&' if '?' in url else '?' - url = f"{url}{separator}hl=en" - locale_override_applied = True - - log.info(f"Using English locale (hl=en) for consistent date parsing") - - # Track if driver was provided or created - driver_provided = driver is not None - should_close_driver = not return_driver and not driver_provided - - # Initialize driver with custom user agent to avoid headless detection - # Even with headless=False + Xvfb, Chromium still reports as HeadlessChrome - if not driver: - driver = Driver( - uc=True, - headless=headless, - page_load_strategy="normal", - agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - ) - - try: - # Navigate - driver.get(url) - time.sleep(1.5) - - # Handle GDPR consent page (CRITICAL FIX for headless mode!) - if 'consent.google.com' in driver.current_url: - try: - # Find all form buttons and click "Accept all" / "Aceptar todo" - form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') - for btn in form_btns: - btn_text = (btn.text or '').lower() - if 'aceptar todo' in btn_text or 'accept all' in btn_text: - log.info(f"Clicking GDPR consent: {btn.text}") - btn.click() - time.sleep(2) - break - else: - # Fallback: click second button (usually "Accept all") - if len(form_btns) >= 2: - log.info("Using fallback: clicking second form button") - form_btns[1].click() - time.sleep(2) - except Exception as e: - log.warning(f"GDPR consent handling failed: {e}") - - # Dismiss cookie banner on Maps page - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.3) - except: - pass - - # Click reviews tab with retry logic (important for containers) - review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] - reviews_tab_clicked = False - - # Try multiple times to find and click reviews tab - for attempt in range(3): - if reviews_tab_clicked: - break - - time.sleep(0.5) # Wait between attempts - - for selector in ['button[role="tab"]', '.LRkQ2', 'button']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - - if any(kw in text or kw in aria for kw in review_keywords): - log.info(f"Clicking reviews tab: {tab.text or aria[:30]}") - driver.execute_script("arguments[0].click();", tab) - time.sleep(1.5) # Wait for tab to load - reviews_tab_clicked = True - break - - if reviews_tab_clicked: - break - except Exception as e: - log.debug(f"Tab search attempt {attempt+1} with {selector}: {e}") - continue - - if not reviews_tab_clicked: - log.warning("Could not find reviews tab with hl=en locale") - - # FALLBACK: If locale override was applied and tab not found, - # retry without locale override (fixes regional pages where hl=en breaks tabs) - if locale_override_applied: - log.info("Retrying without locale override to find reviews tab...") - - # Reload page with original URL (no hl=en) - driver.get(original_url) - time.sleep(1.5) - - # Handle GDPR again if needed - if 'consent.google.com' in driver.current_url: - try: - form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') - for btn in form_btns: - btn_text = (btn.text or '').lower() - if any(keyword in btn_text for keyword in ['accept', 'aceptar', 'priim', 'принять', 'accepter']): - log.info(f"Clicking GDPR consent: {btn.text}") - btn.click() - time.sleep(2) - break - else: - if len(form_btns) >= 2: - log.info("Using fallback: clicking second form button") - form_btns[1].click() - time.sleep(2) - except Exception as e: - log.warning(f"GDPR consent handling failed: {e}") - - # Dismiss cookie banner - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - cookie_btns[0].click() - time.sleep(0.3) - except: - pass - - # Try to find reviews tab with multilingual keywords - multilingual_keywords = [ - 'review', 'reviews', # English - 'reseña', 'reseñas', # Spanish - 'avis', # French - 'bewertung', 'bewertungen', # German - 'recensione', 'recensioni', # Italian - 'レビュー', # Japanese - 'avaliação', 'avaliações', # Portuguese - 'отзыв', 'отзывы', # Russian - 'atsiliepimai', 'atsiliepi', # Lithuanian - 'ulasan', # Indonesian - '리뷰' # Korean - ] - - for attempt in range(3): - if reviews_tab_clicked: - break - - time.sleep(0.5) - - for selector in ['button[role="tab"]', '.LRkQ2', 'button']: - try: - tabs = driver.find_elements(By.CSS_SELECTOR, selector) - for tab in tabs: - text = (tab.text or '').lower() - aria = (tab.get_attribute('aria-label') or '').lower() - - if any(kw in text or kw in aria for kw in multilingual_keywords): - log.info(f"Clicking reviews tab (native locale): {tab.text or aria[:30]}") - driver.execute_script("arguments[0].click();", tab) - time.sleep(1.5) - reviews_tab_clicked = True - break - - if reviews_tab_clicked: - break - except Exception as e: - log.debug(f"Native locale tab search attempt {attempt+1} with {selector}: {e}") - continue - - if not reviews_tab_clicked: - log.warning("Could not find reviews tab even without locale override") - - # Wait for reviews section to load - time.sleep(2) - - # EARLY DETECTION: Check if there are no reviews before attempting to scrape - no_reviews, reason = check_no_reviews_early(driver) - if no_reviews: - log.info(f"Early detection: No reviews available. Reason: {reason}") - return { - "reviews": [], - "count": 0, - "total_reviews": 0, - "time": time.time() - start_time, - "success": True, - "message": f"No reviews available: {reason}" - } - - # Extract total review count from the page - total_reviews = extract_total_review_count(driver) - - # Double-check: If extracted count is 0, return early - if total_reviews == 0: - log.info("Total review count is 0, skipping scraping") - return { - "reviews": [], - "count": 0, - "total_reviews": 0, - "time": time.time() - start_time, - "success": True, - "message": "Business has 0 reviews" - } - - # Report initial progress with total count - if progress_callback and total_reviews: - try: - progress_callback(0, total_reviews) - except Exception as e: - log.warning(f"Progress callback failed: {e}") - - # Find scrollable pane - try multiple selectors (container-friendly) - pane = None - pane_selectors = [ - 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', - 'div.m6QErb.WNBkOb.XiKgde', - 'div.m6QErb', # Fallback to more general selector - 'div[role="main"]', - ] - - wait = WebDriverWait(driver, 5) - for selector in pane_selectors: - try: - pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector))) - log.info(f"Found pane with selector: {selector}") - break - except TimeoutException: - continue - - if not pane: - error_msg = "Could not find scrollable pane after trying all selectors" - log.error(error_msg) - return { - "reviews": [], - "count": 0, - "total_reviews": total_reviews, - "time": time.time() - start_time, - "success": False, - "error": error_msg - } - - # Wait longer for initial reviews to load (containers can be slower) - time.sleep(2) - - # Setup scroll - driver.execute_script("window.scrollablePane = arguments[0];", pane) - scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" - - # Trigger initial scroll and verify reviews are loading - driver.execute_script(scroll_script) - time.sleep(0.8) - - # Also scroll the main window (helps in some cases, especially containers) - driver.execute_script("window.scrollBy(0, 500);") - time.sleep(0.5) - - # JavaScript function to count reviews using ROBUST structural patterns - # Instead of relying on CSS classes, we look for containers with review-like structure - count_reviews_script = """ - // STRATEGY 1: Try known selectors first (fast path) - const knownSelectors = [ - 'div.jftiEf.fontBodyMedium', - 'div.jftiEf', - 'div[data-review-id]', - 'div[jsaction*="review"]' - ]; - - for (let selector of knownSelectors) { - const found = document.querySelectorAll(selector); - if (found.length > 0) { - return found.length; - } - } - - // STRATEGY 2: Structural pattern matching (robust, class-agnostic) - // Find containers that LOOK like reviews (have author + rating + text structure) - // IMPORTANT: Search only within the reviews pane, not the entire page! - const findReviewsByStructure = () => { - // Find the reviews pane first - const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || - document.querySelector('div.m6QErb') || - document.querySelector('div[role="main"]'); - - if (!pane) return 0; - - // Search only within the pane - const allDivs = pane.querySelectorAll('div'); - let reviewCount = 0; - - for (let div of allDivs) { - // Skip if too small (reviews have substantial content) - if (div.children.length < 2) continue; - - // Look for review indicators: - // - Has an author name (usually in a span/div with small text) - // - Has a rating (span with aria-label containing "star" or "rating") - // - Has review text (span/div with longer text content) - - const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); - const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); - const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); - const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année|dienų|savaitės)/i); - - // If it has at least 3 of these indicators, it's likely a review - const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; - if (indicators >= 3) { - reviewCount++; - } - } - - return reviewCount > 0 ? reviewCount : 0; - }; - - // STRATEGY 3: Look for role="article" with review-like content (within pane) - const pane3 = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || - document.querySelector('div.m6QErb') || - document.querySelector('div[role="main"]'); - if (pane3) { - const articles = pane3.querySelectorAll('[role="article"]'); - if (articles.length > 0) { - let validArticles = 0; - for (let article of articles) { - // Check if article looks like a review (has rating + text) - const hasRating = article.querySelector('[aria-label*="star" i]'); - const hasText = article.textContent.length > 30; - if (hasRating && hasText) { - validArticles++; - } - } - if (validArticles > 0) return validArticles; - } - } - - // Try structural matching as last resort - const structuralCount = findReviewsByStructure(); - return structuralCount; - """ - - # Check if reviews are actually loading - initial_count = driver.execute_script(count_reviews_script) - - if initial_count < 5: - # Reviews not loaded yet, wait more and try alternative scrolling - log.info(f"Waiting for reviews to load (found {initial_count})...") - - # Try clicking on the pane to focus it - try: - driver.execute_script("arguments[0].click();", pane) - time.sleep(0.5) - except: - pass - - # Scroll both pane and window - driver.execute_script(scroll_script) - driver.execute_script("window.scrollBy(0, 500);") - time.sleep(1.5) - - initial_count = driver.execute_script(count_reviews_script) - - log.info(f"After extra waiting: {initial_count} reviews") - - log.info(f"Scrolling to load all reviews (starting with {initial_count})...") - - # Fast scrolling to load all DOM elements - last_count = 0 - idle_count = 0 - - for i in range(max_scrolls): - # Scroll to load more - prev_count = driver.execute_script(count_reviews_script) - driver.execute_script(scroll_script) - - # SMART WAIT: Wait until new reviews actually load - max_wait = 1.2 - wait_step = 0.05 - waited = 0 - - while waited < max_wait: - time.sleep(wait_step) - waited += wait_step - - current_count = driver.execute_script(count_reviews_script) - - # If reviews loaded, continue immediately! - if current_count > prev_count: - idle_count = 0 # Reset idle counter - break - - # Give Google Maps more time to lazy-load (0.6s instead of 0.3s) - # Only exit early if we're confident nothing is loading - if waited >= 0.6 and current_count == prev_count: - break - - # Track consecutive idle scrolls - if current_count == prev_count: - idle_count += 1 - # Be VERY patient: wait for 12 consecutive idle scrolls to ensure we get ALL reviews - # (each with up to 1.2s wait = ~14.4s total idle time before giving up) - # This ensures Google Maps has plenty of time to lazy-load all content - if idle_count >= 12: - log.info(f"Reached end at {current_count} reviews (12 consecutive idle scrolls)") - # Double-check we got all reviews if we know the total - if total_reviews and current_count < total_reviews: - log.warning(f"Only got {current_count}/{total_reviews} reviews ({current_count/total_reviews*100:.1f}%). Some may be hidden or loading slowly.") - break - - # Progress logging and callback every 5 scrolls - if (i + 1) % 5 == 0: - log.info(f"{current_count} review elements loaded...") - if progress_callback and total_reviews: - try: - progress_callback(current_count, total_reviews) - except Exception as e: - log.warning(f"Progress callback failed: {e}") - - # Aggressive memory management every 20 scrolls - if (i + 1) % 20 == 0: - try: - # Clear console logs to prevent buildup - driver.execute_script("console.clear();") - - # Force garbage collection in browser - driver.execute_script(""" - if (window.gc) { window.gc(); } - // Remove image srcs to free memory (images reload on demand) - document.querySelectorAll('img').forEach(img => { - if (img.complete && !img.classList.contains('needed')) { - img.removeAttribute('src'); - } - }); - """) - - # Brief pause to let Chrome breathe - time.sleep(0.1) - except Exception: - pass # Ignore if fails - - last_count = current_count - - # Shorter final scroll - for _ in range(2): - driver.execute_script(scroll_script) - time.sleep(0.3) - - scroll_time = time.time() - start_time - log.info(f"Scrolling complete in {scroll_time:.2f}s") - - # Update progress: scrolling done, starting extraction - if progress_callback and total_reviews: - try: - progress_callback(current_count, total_reviews) - except Exception as e: - log.warning(f"Progress callback failed: {e}") - - # Extract ALL reviews using JavaScript (fast!) - log.info("Extracting reviews with JavaScript...") - extract_start = time.time() - - all_reviews = extract_all_reviews_js(driver) - - extract_time = time.time() - extract_start - log.info(f"Extraction complete in {extract_time:.2f}s") - - # Final progress update with actual extracted count - if progress_callback and total_reviews: - try: - progress_callback(len(all_reviews), total_reviews) - except Exception as e: - log.warning(f"Progress callback failed: {e}") - - elapsed = time.time() - start_time - - log.info(f"Fast scrape completed: {len(all_reviews)} reviews in {elapsed:.2f}s") - - result = { - "reviews": all_reviews, - "count": len(all_reviews), - "total_reviews": total_reviews, - "time": elapsed, - "scroll_time": scroll_time, - "extract_time": extract_time, - "success": True, - "error": None - } - - if return_driver: - result["driver"] = driver - - return result - - except Exception as e: - elapsed = time.time() - start_time - error_msg = f"Fast scrape failed: {str(e)}" - log.error(error_msg) - - # Check if this is a tab crash - try to extract what we have - partial_reviews = [] - is_tab_crash = "tab crashed" in str(e).lower() or "session deleted" in str(e).lower() - - if is_tab_crash and driver: - log.warning("Detected tab crash - attempting to extract partial reviews from DOM before crash...") - try: - # Try to extract reviews that were loaded before crash - partial_reviews = extract_all_reviews_js(driver) - log.info(f"Recovered {len(partial_reviews)} reviews from crashed session") - except Exception as recovery_error: - log.error(f"Could not recover reviews: {recovery_error}") - - # Return partial results if we got any - if partial_reviews: - result = { - "reviews": partial_reviews, - "count": len(partial_reviews), - "total_reviews": None, - "time": elapsed, - "success": False, # Mark as failed but with partial data - "error": f"{error_msg} (recovered {len(partial_reviews)} reviews)", - "partial": True - } - else: - result = { - "reviews": [], - "count": 0, - "total_reviews": None, - "time": elapsed, - "success": False, - "error": error_msg - } - - if return_driver: - result["driver"] = driver - - return result - - finally: - if should_close_driver and driver: - try: - driver.quit() - except: - pass - - -def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]: - """ - Extract business card information from Google Maps. - Uses the same reliable navigation logic as the main scraper. - - Returns business card with: - - name - - address - - rating (float) - - total_reviews (int) - - success/error - """ - import time as timing_module - start_time = timing_module.time() - log.info(f"[PROFILE] Getting business card info for: {url}") - - driver_provided = driver is not None - should_close_driver = not return_driver and not driver_provided - - try: - # Initialize driver if not provided - t0 = timing_module.time() - if not driver: - driver = Driver( - uc=True, - headless=headless, - page_load_strategy="normal", - agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - ) - log.info(f"[PROFILE] Driver initialization: {timing_module.time() - t0:.2f}s") - else: - log.info(f"[PROFILE] Using pooled driver (0.00s)") - - # Force English locale AND US region for consistent parsing/results - # This helps avoid geolocation-based variations in Google Maps results - if 'hl=' in url: - url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') - else: - separator = '&' if '?' in url else '?' - url = f"{url}{separator}hl=en" - - # Add US region parameter if not present - if 'gl=' not in url: - url = f"{url}&gl=us" - - # Set Chrome geolocation to US (Boston, MA) using CDP - # This ensures Google Maps shows US results regardless of server location - try: - driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { - 'latitude': 42.3601, - 'longitude': -71.0589, - 'accuracy': 100 - }) - log.info("Set geolocation to US (Boston, MA)") - except Exception as e: - log.warning(f"Could not set geolocation: {e}") - - log.info(f"Loading Google Maps page...") - t0 = timing_module.time() - driver.get(url) - log.info(f"[PROFILE] Page load (driver.get): {timing_module.time() - t0:.2f}s") - - t0 = timing_module.time() - time.sleep(0.5) # Initial wait - reduced from 2s - log.info(f"[PROFILE] Initial sleep: {timing_module.time() - t0:.2f}s") - - # Handle GDPR consent page - t0 = timing_module.time() - if 'consent.google.com' in driver.current_url: - log.info("Detected GDPR consent page, accepting...") - try: - form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') - for btn in form_btns: - btn_text = (btn.text or '').lower() - if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text: - log.info(f"Clicking GDPR consent: {btn.text}") - btn.click() - time.sleep(1) - break - else: - if len(form_btns) >= 2: - log.info("Using fallback: clicking second form button") - form_btns[1].click() - time.sleep(1) - except Exception as e: - log.warning(f"GDPR consent handling failed: {e}") - - # After GDPR consent, reload the original URL to ensure proper page state - log.info(f"Reloading original URL after GDPR consent...") - driver.get(url) - time.sleep(1) - log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s") - else: - log.info(f"[PROFILE] No GDPR consent page (0.00s)") - - # Dismiss cookie banner - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - log.info("Dismissing cookie banner...") - cookie_btns[0].click() - time.sleep(0.3) # Reduced from 0.5s - except: - pass - log.info(f"[PROFILE] Cookie banner dismissal: {timing_module.time() - t0:.2f}s") - - # Wait for page to load - use smart waits - t0 = timing_module.time() - try: - log.info("Waiting for Google Maps content to load...") - wait = WebDriverWait(driver, 10) - # Wait for basic page structure (h1 or heading) - wait.until( - lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]') - ) - log.info("Basic page structure loaded") - - # Wait for page to settle - search URLs redirect to place URLs - # which triggers additional content loading - time.sleep(2) - - # Wait specifically for review count element (aria-label ending with "reviews") - # This is the most reliable indicator that the business detail is loaded - try: - WebDriverWait(driver, 5).until( - lambda d: d.execute_script(""" - var elems = document.querySelectorAll('[aria-label]'); - for (var i = 0; i < elems.length; i++) { - var label = elems[i].getAttribute('aria-label') || ''; - if (/^[0-9]+ reviews?$/.test(label)) return true; - } - return false; - """) - ) - log.info("Review count element loaded") - except: - # Fallback: Try clicking Reviews tab or rating stars to expose the review count - log.info("Review count wait timeout, trying to click Reviews/rating...") - try: - # Try 1: Click Reviews tab (if exists) - clicked = driver.execute_script(""" - var tabs = document.querySelectorAll('[role="tab"]'); - for (var i = 0; i < tabs.length; i++) { - var txt = (tabs[i].textContent || '').toLowerCase(); - if (txt.includes('review')) { - tabs[i].click(); - return 'tab'; - } - } - // Try 2: Click the rating stars element (often links to reviews) - var stars = document.querySelector('[role="img"][aria-label*="star"]'); - if (stars) { - var parent = stars.parentElement; - if (parent && parent.tagName.toLowerCase() === 'button') { - parent.click(); - return 'stars_button'; - } - stars.click(); - return 'stars'; - } - // Try 3: Click "Write a review" or any review-related button - var btns = document.querySelectorAll('button[aria-label*="review" i]'); - for (var b = 0; b < btns.length; b++) { - var label = btns[b].getAttribute('aria-label') || ''; - if (!/write/i.test(label) && /review/i.test(label)) { - btns[b].click(); - return 'review_btn: ' + label; - } - } - return 'none'; - """) - log.info(f"Clicked: {clicked}") - time.sleep(2) # Wait for reviews panel to load - except Exception as e: - log.warning(f"Click attempt failed: {e}") - - except Exception as e: - log.warning(f"Timeout waiting for Maps content: {e}") - time.sleep(2) # Fallback wait - log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s") - log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...") - log.info(f"DEBUG: Page title: {driver.title}") - - # Extract business card information using JavaScript - t0 = timing_module.time() - extract_script = """ - const info = { - name: null, - address: null, - rating: null, - total_reviews: null - }; - - // ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============ - - // Helper: Parse review count from text, handling multiple formats - function parseReviewCount(text) { - if (!text) return null; - - // Pattern 1: Exact "N reviews" format (aria-labels, clean text) - // Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis" - var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i); - if (match) { - return parseInt(match[1].replace(/[,. ]/g, '')); - } - - // Pattern 2: "(N)" format often used in tabs like "Reviews (27)" - match = text.match(/[(]([0-9][0-9,.]*)[)]$/); - if (match) { - return parseInt(match[1].replace(/[,. ]/g, '')); - } - - // Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives) - if (text.length < 30) { - match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i); - if (match) { - return parseInt(match[1].replace(/[,. ]/g, '')); - } - } - - return null; - } - - // ============ EXTRACT BUSINESS NAME ============ - // Priority: h1 (semantic), then role="heading" - const h1 = document.querySelector('h1'); - if (h1 && h1.textContent) { - info.name = h1.textContent.trim(); - } - if (!info.name) { - const heading = document.querySelector('[role="heading"][aria-level="1"]'); - if (heading && heading.textContent) { - info.name = heading.textContent.trim(); - } - } - - // ============ EXTRACT ADDRESS ============ - // Priority: data-item-id (semantic), then aria-label containing "address" - const addressElem = document.querySelector('[data-item-id*="address"]'); - if (addressElem && addressElem.textContent) { - info.address = addressElem.textContent.trim(); - } - if (!info.address) { - const ariaAddress = document.querySelector('[aria-label*="ddress"]'); - if (ariaAddress && ariaAddress.textContent) { - info.address = ariaAddress.textContent.trim(); - } - } - - // ============ EXTRACT RATING ============ - // Priority: aria-label containing "star" on role="img" elements - info._debug_rating_context = []; - const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]'); - for (let elem of ratingElems) { - const ariaLabel = elem.getAttribute('aria-label') || ''; - // Match "4.9 stars" or "4,9 stars" (European format) - const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i); - if (match) { - info.rating = parseFloat(match[1].replace(',', '.')); - // DEBUG: Capture parent/sibling context to find review count - var parent = elem.parentElement; - if (parent) { - info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100)); - var grandparent = parent.parentElement; - if (grandparent) { - info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100)); - // Check all children of grandparent for review count - var gpChildren = grandparent.querySelectorAll('*'); - for (var c = 0; c < Math.min(gpChildren.length, 30); c++) { - var childText = (gpChildren[c].textContent || '').trim(); - if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) { - info._debug_rating_context.push('GP_CHILD: ' + childText); - } - } - // Also check great-grandparent - var ggp = grandparent.parentElement; - if (ggp) { - info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150)); - } - } - // Check siblings - var nextSib = parent.nextElementSibling; - if (nextSib) { - info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100)); - } - } - break; - } - } - - // ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============ - - // PRIORITY 1: aria-label with exact "N reviews" format (most reliable) - // Google Maps uses aria-label="27 reviews" for accessibility - info._debug_aria = []; - info._debug_all_numeric = []; - if (!info.total_reviews) { - var ariaElems = document.querySelectorAll('[aria-label]'); - for (var i = 0; i < ariaElems.length; i++) { - var ariaLabel = ariaElems[i].getAttribute('aria-label') || ''; - // Collect all labels containing "review" - if (ariaLabel.toLowerCase().indexOf('review') >= 0) { - info._debug_aria.push(ariaLabel); - } - // Collect all labels starting with a digit - if (/^[0-9]/.test(ariaLabel)) { - info._debug_all_numeric.push(ariaLabel); - } - var count = parseReviewCount(ariaLabel); - if (count && count > 0 && count < 100000) { - info.total_reviews = count; - info._debug_matched = ariaLabel; - break; - } - } - } - - // DEBUG: Find all text with parenthetical numbers like "(27)" - info._debug_parens = []; - info._debug_short_text = []; // All short text with numbers - var allSpans = document.querySelectorAll('span, div, a, button'); - for (var j = 0; j < Math.min(allSpans.length, 500); j++) { - var spanText = allSpans[j].textContent || ''; - // Capture parenthetical numbers - if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) { - info._debug_parens.push(spanText.trim()); - } - // Capture ALL short text containing numbers (for debugging) - if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) { - var cleaned = spanText.trim().replace(/\\s+/g, ' '); - if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) { - info._debug_short_text.push(cleaned); - } - } - } - - // PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page - // This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels - if (!info.total_reviews) { - var allElems = document.querySelectorAll('*'); - for (var k = 0; k < Math.min(allElems.length, 1000); k++) { - var elem = allElems[k]; - // Skip if has children (we want leaf nodes only) - if (elem.children.length > 0) continue; - var txt = (elem.textContent || '').trim(); - // Look for short text with both numbers and "review" word - if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) { - var match = txt.match(/([0-9][0-9,]*)/); - if (match) { - var count = parseInt(match[1].replace(/,/g, '')); - if (count > 0 && count < 100000) { - info.total_reviews = count; - info._debug_matched = 'LEAF: ' + txt; - break; - } - } - } - } - } - - // DEBUG: Collect all tab names - info._debug_tabs = []; - const tabs = document.querySelectorAll('[role="tab"]'); - for (let t = 0; t < tabs.length; t++) { - info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30)); - } - - // DEBUG: Collect all buttons with text (might contain review count) - info._debug_buttons = []; - const buttons = document.querySelectorAll('button'); - for (let b = 0; b < Math.min(buttons.length, 20); b++) { - var btnText = (buttons[b].textContent || '').trim(); - if (btnText && btnText.length < 40) { - info._debug_buttons.push(btnText.substring(0, 40)); - } - } - - // PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count) - if (!info.total_reviews) { - for (let tab of tabs) { - const text = (tab.textContent || '').trim(); - // Look for "Reviews" tab with count - if (text.toLowerCase().includes('review')) { - const count = parseReviewCount(text); - if (count && count > 0) { - info.total_reviews = count; - info._debug_matched = 'TAB: ' + text; - break; - } - } - } - } - - // PRIORITY 2.3: Reviews panel header (after clicking Reviews tab) - // Google Maps shows "27 reviews" as heading text in the reviews panel - if (!info.total_reviews) { - // Look for headings containing review count - var headings = document.querySelectorAll('h1, h2, [role="heading"]'); - for (var h = 0; h < headings.length; h++) { - var hText = (headings[h].textContent || '').trim(); - if (/review/i.test(hText)) { - var match = hText.match(/([0-9][0-9,]*)/); - if (match) { - var count = parseInt(match[1].replace(/,/g, '')); - if (count > 0 && count < 100000) { - info.total_reviews = count; - info._debug_matched = 'HEADING: ' + hText; - break; - } - } - } - } - } - - // PRIORITY 2.4: Look for sort button area which often has total count - // The sort dropdown area displays "Sort: Newest" and total reviews - if (!info.total_reviews) { - var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]'); - for (var s = 0; s < sortBtns.length; s++) { - var parent = sortBtns[s].parentElement; - if (parent) { - var pText = (parent.textContent || '').trim(); - if (/review/i.test(pText)) { - var match = pText.match(/([0-9][0-9,]*)\\s*review/i); - if (match) { - var count = parseInt(match[1].replace(/,/g, '')); - if (count > 0 && count < 100000) { - info.total_reviews = count; - info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50); - break; - } - } - } - } - } - } - - // PRIORITY 3: Elements with semantic review-related attributes - if (!info.total_reviews) { - const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]'); - for (let elem of reviewLinks) { - const text = (elem.textContent || '').trim(); - const count = parseReviewCount(text); - if (count && count > 0) { - info.total_reviews = count; - break; - } - } - } - - // PRIORITY 4: Look for standalone review count text near rating - // Find elements that contain ONLY "N reviews" pattern (not concatenated with rating) - if (!info.total_reviews) { - const allElements = document.querySelectorAll('span, a'); - for (let elem of allElements) { - // Get direct text content only (not nested children) - const text = (elem.textContent || '').trim(); - // Skip if too long (likely contains other content) - if (text.length > 50) continue; - // Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews") - if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue; - - const count = parseReviewCount(text); - if (count && count > 0 && count < 100000) { - info.total_reviews = count; - break; - } - } - } - - // PRIORITY 5: Parse from visible page text using regex on short text blocks - if (!info.total_reviews) { - const walker = document.createTreeWalker( - document.body, - NodeFilter.SHOW_TEXT, - null, - false - ); - while (walker.nextNode()) { - const text = walker.currentNode.textContent.trim(); - if (text.length >= 5 && text.length <= 30) { - // Match "27 reviews" but not "4.927 reviews" - const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i); - if (match) { - const count = parseInt(match[1].replace(/[,]/g, '')); - if (count > 0 && count < 100000) { - info.total_reviews = count; - info._debug_matched = 'WALKER: ' + text; - break; - } - } - } - } - } - - // PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts) - if (!info.total_reviews) { - var scripts = document.querySelectorAll('script'); - for (var sc = 0; sc < scripts.length; sc++) { - var scriptText = scripts[sc].textContent || ''; - // Look for patterns like "user_reviews":{"count":27} or reviews_count":27 - var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i); - if (jsonMatch) { - var count = parseInt(jsonMatch[1]); - if (count > 0 && count < 100000) { - info.total_reviews = count; - info._debug_matched = 'JSON_SCRIPT'; - break; - } - } - // Also look for review count in Google's data format like [\"27 reviews\"] - if (!info.total_reviews) { - var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i); - if (dataMatch) { - var count = parseInt(dataMatch[1]); - if (count > 0 && count < 100000) { - info.total_reviews = count; - info._debug_matched = 'JSON_DATA: ' + dataMatch[0]; - break; - } - } - } - } - } - - return info; - """ - - business_info = driver.execute_script(extract_script) - log.info(f"[PROFILE] Business card extraction: {timing_module.time() - t0:.2f}s") - - total_time = timing_module.time() - start_time - log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***") - log.info(f"Business card extracted: name={business_info.get('name')}, " - f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}") - # Debug: log what aria-labels were found - if business_info.get('_debug_aria'): - log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}") - if business_info.get('_debug_matched'): - log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}") - # Also log all numeric aria-labels (potential review counts) - if business_info.get('_debug_all_numeric'): - log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}") - # Log any text with parenthetical numbers like "(27)" - if business_info.get('_debug_parens'): - log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}") - # Log all short text containing numbers (for debugging review count detection) - if business_info.get('_debug_short_text'): - log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}") - # Log the context around the rating element - if business_info.get('_debug_rating_context'): - for ctx in business_info.get('_debug_rating_context', []): - log.info(f"DEBUG: Rating context: {ctx}") - # Log what tabs exist on the page - if business_info.get('_debug_tabs'): - log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}") - else: - log.info(f"DEBUG: No tabs found on page") - # Log buttons (might contain review count) - if business_info.get('_debug_buttons'): - log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}") - - result = { - "name": business_info.get('name'), - "address": business_info.get('address'), - "rating": business_info.get('rating'), - "total_reviews": business_info.get('total_reviews') or 0, - "has_reviews": (business_info.get('total_reviews') or 0) > 0, - "success": True, - "error": None - } - - if return_driver: - result["driver"] = driver - return result - - except Exception as e: - total_time = timing_module.time() - start_time - error_msg = f"Failed to get business card info: {str(e)}" - log.error(error_msg) - log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME (FAILED): {total_time:.2f}s ***") - result = { - "name": None, - "address": None, - "rating": None, - "total_reviews": 0, - "has_reviews": False, - "success": False, - "error": error_msg - } - if return_driver: - result["driver"] = driver - return result - - finally: - if should_close_driver and driver: - try: - driver.quit() - except: - pass - - -def check_reviews_available(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]: - """ - Lightweight check to see if a business has reviews available. - - This function just loads the page and checks for review count without - doing the full scraping. Used to enable/disable scrape button in UI. - - Args: - url: Google Maps URL to check - headless: Run in headless mode (default True) - driver: Existing driver instance to reuse (from worker pool) - return_driver: If True, don't close driver and return it in result - - Returns: - Dict containing: - - has_reviews: bool - whether reviews exist - - review_count: int - number of reviews (0 if none) - - business_name: str - name of business (if found) - - success: bool - whether check succeeded - - error: str - error message (if failed) - - driver: Driver instance (if return_driver=True) - """ - import time as timing_module - start_time = timing_module.time() - log.info(f"[PROFILE] Starting validation for: {url}") - - driver_provided = driver is not None - should_close_driver = not return_driver and not driver_provided - - try: - # Initialize driver if not provided - t0 = timing_module.time() - if not driver: - driver = Driver(uc=True, headless=headless) - driver.maximize_window() - log.info(f"[PROFILE] Driver initialization: {timing_module.time() - t0:.2f}s") - else: - log.info(f"[PROFILE] Using pooled driver (0.00s)") - - # Navigate to the URL - t0 = timing_module.time() - log.info(f"Loading Google Maps page...") - driver.get(url) - log.info(f"[PROFILE] Page load (driver.get): {timing_module.time() - t0:.2f}s") - - t0 = timing_module.time() - time.sleep(0.5) # Initial wait - reduced from 2s - log.info(f"[PROFILE] Initial sleep: {timing_module.time() - t0:.2f}s") - - # Handle GDPR consent page (CRITICAL for validation to work!) - t0 = timing_module.time() - if 'consent.google.com' in driver.current_url: - log.info("Detected GDPR consent page, accepting...") - try: - # Find all form buttons and click "Accept all" / "Aceptar todo" - form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') - for btn in form_btns: - btn_text = (btn.text or '').lower() - if 'aceptar todo' in btn_text or 'accept all' in btn_text: - log.info(f"Clicking GDPR consent: {btn.text}") - btn.click() - time.sleep(1) # Reduced from 2s - break - else: - # Fallback: click second button (usually "Accept all") - if len(form_btns) >= 2: - log.info("Using fallback: clicking second form button") - form_btns[1].click() - time.sleep(1) # Reduced from 2s - except Exception as e: - log.warning(f"GDPR consent handling failed: {e}") - log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s") - else: - log.info(f"[PROFILE] No GDPR consent page (0.00s)") - - # Dismiss cookie banner on Maps page - t0 = timing_module.time() - try: - cookie_btns = driver.find_elements(By.CSS_SELECTOR, - 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') - if cookie_btns: - log.info("Dismissing cookie banner...") - cookie_btns[0].click() - time.sleep(0.3) # Reduced from 0.5s - except: - pass - log.info(f"[PROFILE] Cookie banner dismissal: {timing_module.time() - t0:.2f}s") - - # Wait for page to fully load after consent - use smart waits - t0 = timing_module.time() - try: - # Wait for either business card OR search results to appear - log.info("Waiting for Google Maps content to load...") - wait = WebDriverWait(driver, 10) - wait.until( - lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]') - ) - log.info("Google Maps content loaded successfully") - except Exception as e: - log.warning(f"Timeout waiting for Maps content: {e}") - time.sleep(0.5) # Minimal fallback wait - log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s") - - # Try to extract business name - t0 = timing_module.time() - business_name = None - try: - business_name_script = """ - // Try to find business name from various locations - let name = null; - - // Method 1: Look for business name in the main panel (most reliable) - // This is where the actual business info appears - const businessPanelSelectors = [ - 'h1.DUwDvf', // Main business name heading - '[role="main"] h1', // H1 in main content - 'h1.fontHeadlineLarge', // Large headline font - 'button[jsaction*="pane.header.rating"] h1', // Near rating button - ]; - - for (const selector of businessPanelSelectors) { - const element = document.querySelector(selector); - if (element && element.textContent) { - const text = element.textContent.trim(); - // Filter out Google's placeholder/suggestion text - if (text && - !text.toLowerCase().includes('antes de ir') && - !text.toLowerCase().includes('before going') && - !text.toLowerCase().includes('google maps') && - text.length < 100) { // Business names shouldn't be super long - name = text; - break; - } - } - } - - // Method 2: h1 tag (fallback) - if (!name) { - const h1 = document.querySelector('h1'); - if (h1 && h1.textContent) { - const text = h1.textContent.trim(); - if (!text.toLowerCase().includes('antes de ir') && - !text.toLowerCase().includes('before going')) { - name = text; - } - } - } - - // Method 3: Title attribute (last resort) - if (!name) { - const title = document.title; - if (title && !title.includes('Google Maps')) { - name = title.split('-')[0].trim(); - } - } - - return name; - """ - business_name = driver.execute_script(business_name_script) - if business_name: - log.info(f"Found business name: {business_name}") - except Exception as e: - log.debug(f"Could not extract business name: {e}") - log.info(f"[PROFILE] Business name extraction: {timing_module.time() - t0:.2f}s") - - # Extract total review count - t0 = timing_module.time() - review_count = extract_total_review_count(driver) - log.info(f"[PROFILE] Review count extraction: {timing_module.time() - t0:.2f}s") - - if review_count is None: - log.warning("Could not determine review count") - total_time = timing_module.time() - start_time - log.info(f"[PROFILE] *** TOTAL VALIDATION TIME: {total_time:.2f}s ***") - result = { - "has_reviews": False, - "review_count": 0, - "business_name": business_name, - "success": True, - "error": "Could not find review count on page" - } - if return_driver: - result["driver"] = driver - return result - - log.info(f"Found {review_count} reviews available") - - total_time = timing_module.time() - start_time - log.info(f"[PROFILE] *** TOTAL VALIDATION TIME: {total_time:.2f}s ***") - - result = { - "has_reviews": review_count > 0, - "review_count": review_count, - "business_name": business_name, - "success": True, - "error": None - } - if return_driver: - result["driver"] = driver - return result - - except Exception as e: - total_time = timing_module.time() - start_time - error_msg = f"Failed to check reviews: {str(e)}" - log.error(error_msg) - log.info(f"[PROFILE] *** TOTAL VALIDATION TIME (FAILED): {total_time:.2f}s ***") - result = { - "has_reviews": False, - "review_count": 0, - "business_name": None, - "success": False, - "error": error_msg - } - if return_driver: - result["driver"] = driver - return result - - finally: - if should_close_driver and driver: - try: - driver.quit() - except: - pass diff --git a/modules/health_checks.py b/modules/health_checks.py index a2d4db6..2210559 100644 --- a/modules/health_checks.py +++ b/modules/health_checks.py @@ -90,7 +90,7 @@ class CanaryMonitor: - Scrape time is reasonable - Data structure is valid """ - from modules.fast_scraper import fast_scrape_reviews + from modules.scraper_clean import fast_scrape_reviews log.info(f"Running canary scrape test on {self.test_url[:60]}...") self.last_run = datetime.now()