#!/usr/bin/env python3 """ Fast DOM-only scraper module for API integration. Based on start_dom_only_fast.py - achieves ~18.9s for all reviews. This module provides a reusable function for the API server. """ import logging import time from typing import List, Dict, Any, Optional from seleniumbase import Driver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException log = logging.getLogger(__name__) def check_no_reviews_early(driver) -> tuple[bool, str]: """ Early detection for 'no reviews available' scenarios. Returns (has_no_reviews, reason) tuple. Uses structural patterns instead of fragile CSS classes for robustness. """ try: # Check for common "no reviews" messages in multiple languages no_review_patterns = [ 'no reviews yet', 'be the first to review', "there aren't any reviews", 'no hay reseñas', 'sin reseñas', "pas encore d'avis", 'noch keine bewertungen', 'nessuna recensione', 'まだレビューがありません', 'sem avaliações', 'belum ada ulasan' ] # Get page text page_text = driver.execute_script("return document.body.innerText.toLowerCase();") # Check for "no reviews" messages for pattern in no_review_patterns: if pattern in page_text: return True, f"Found 'no reviews' message: '{pattern}'" # Check if review count is explicitly 0 # IMPORTANT: Be very specific to avoid false positives! review_count_check = driver.execute_script(""" // Only check for EXACT "0 reviews" patterns, not loose matches const patterns = [ /^0\\s+reviews?$/im, // Exactly "0 reviews" on its own line /\\(0\\s+reviews?\\)/i, // "(0 reviews)" /\\b0\\s+reviews?\\b/i // "0 reviews" as a complete phrase ]; const text = document.body.innerText; // Split into lines and check each line independently to avoid false positives const lines = text.split('\\n'); for (let line of lines) { const trimmed = line.trim(); for (let pattern of patterns) { if (pattern.test(trimmed)) { // Double-check: line should be short (not a review text itself) if (trimmed.length < 50) { return 'Found explicit "0 reviews" text: ' + trimmed; } } } } return null; """) if review_count_check: return True, review_count_check # Check if reviews tab is disabled or not clickable reviews_disabled = driver.execute_script(""" const tabs = document.querySelectorAll('button[role="tab"]'); for (let tab of tabs) { const text = (tab.textContent || '').toLowerCase(); const aria = (tab.getAttribute('aria-label') || '').toLowerCase(); if (text.includes('review') || aria.includes('review')) { if (tab.disabled || tab.getAttribute('aria-disabled') === 'true') { return 'Reviews tab is disabled'; } } } return null; """) if reviews_disabled: return True, reviews_disabled return False, "" except Exception as e: log.warning(f"Error in early no-reviews detection: {e}") return False, "" def extract_total_review_count(driver) -> Optional[int]: """ Extract the total number of reviews from the Google Maps page. Looks for text patterns like "500 reviews" in various elements. Works on both search results pages and business detail pages. Returns: Total review count or None if not found """ extract_script = """ // Optimized review count extraction - removed verbose logging for speed let total = null; const parenthesesPattern = /\\((\\d[\\d,\\.\\s]*)\\)/; const numberPattern = /(\\d[\\d,\\.\\s]*)\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i; // PRIORITY 1: Search results page const searchResultsSelectors = [ 'a[href*="reviews"]', '[role="article"] span', '[role="article"] a', 'div.fontBodyMedium', 'span.UY7F9', ]; for (const selector of searchResultsSelectors) { const elements = document.querySelectorAll(selector); for (let i = 0; i < Math.min(elements.length, 20); i++) { const elem = elements[i]; const text = elem.textContent || ''; const href = elem.getAttribute('href') || ''; let match = text.match(numberPattern); if (match) { const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); if (num > 0 && num < 1000000) { total = num; break; } } if (href.includes('reviews')) { match = text.match(/(\\d[\\d,\\.\\s]*)/); if (match) { const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); if (num > 0 && num < 1000000) { total = num; break; } } } } if (total) break; } // PRIORITY 2: Tab buttons (business detail page) if (!total) { const buttons = document.querySelectorAll('button[role="tab"]'); for (let i = 0; i < buttons.length; i++) { const text = buttons[i].textContent || ''; let match = text.match(parenthesesPattern); if (match) { const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); total = num; break; } match = text.match(numberPattern); if (match) { const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); total = num; break; } } } // PRIORITY 3: Aria-labels if (!total) { const elements = document.querySelectorAll('[aria-label]'); for (let elem of elements) { const ariaLabel = elem.getAttribute('aria-label') || ''; let match = ariaLabel.match(parenthesesPattern); if (match) { const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); total = num; break; } match = ariaLabel.match(numberPattern); if (match) { const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); total = num; break; } } } // PRIORITY 4: Fallback - entire page text if (!total) { const match = document.body.innerText.match(parenthesesPattern); if (match) { const num = parseInt(match[1].replace(/[,\\.\\s]/g, '')); if (num > 0 && num < 1000000) { total = num; } } } return total; """ try: total = driver.execute_script(extract_script) # Get debug info from JavaScript debug_script = """ const info = { search_results_count: document.querySelectorAll('[role="article"]').length, links_with_reviews: document.querySelectorAll('a[href*="reviews"]').length, page_url: window.location.href, page_title: document.title, sample_texts: [] }; // Get sample text from links that might contain reviews const reviewLinks = document.querySelectorAll('a[href*="reviews"]'); for (let i = 0; i < Math.min(5, reviewLinks.length); i++) { info.sample_texts.push(reviewLinks[i].textContent.substring(0, 100)); } // Also check for text containing "review" keyword const allText = document.body.innerText.substring(0, 2000); const reviewMatches = allText.match(/\\d+[\\s,\\.]*(?:review|reseña|avis)/gi); if (reviewMatches) { info.review_patterns_found = reviewMatches.slice(0, 5); } return info; """ debug_info = driver.execute_script(debug_script) log.info(f"Page debug: URL={debug_info.get('page_url')}") log.info(f"Page debug: Found {debug_info.get('search_results_count')} search result articles") log.info(f"Page debug: Found {debug_info.get('links_with_reviews')} links containing 'reviews'") if debug_info.get('review_patterns_found'): log.info(f"Page debug: Review patterns in text: {debug_info.get('review_patterns_found')}") if debug_info.get('sample_texts'): log.info(f"Page debug: Sample link texts: {debug_info.get('sample_texts')}") if total and total > 0: log.info(f"Extracted total review count: {total}") return total else: log.warning(f"Could not extract total review count from page. Debug: {debug_info}") return None except Exception as e: log.error(f"Error extracting total review count: {e}") return None def extract_all_reviews_js(driver) -> List[Dict[str, Any]]: """Extract ALL reviews using JavaScript - single fast operation.""" extract_script = """ const reviews = []; // ROBUST SELECTOR STRATEGY: Try known selectors first, then fall back to structural matching let elements = null; // STRATEGY 1: Try known CSS selectors (fast path) const knownSelectors = [ 'div.jftiEf.fontBodyMedium', 'div.jftiEf', 'div[data-review-id]', 'div[jsaction*="review"]' ]; for (let selector of knownSelectors) { const found = document.querySelectorAll(selector); if (found.length > 0) { elements = found; console.log('Found', found.length, 'reviews using known selector:', selector); break; } } // STRATEGY 2: Structural matching for unknown page layouts // IMPORTANT: Search only within the reviews pane, not the entire page! if (!elements || elements.length === 0) { console.log('Known selectors failed, trying structural matching...'); // Find the reviews pane first const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb') || document.querySelector('div[role="main"]'); if (!pane) { console.warn('No reviews pane found'); return []; } // Find all divs that LOOK like reviews (have review structure) WITHIN the pane const allDivs = pane.querySelectorAll('div'); const reviewElements = []; for (let div of allDivs) { // Skip if too small if (div.children.length < 2) continue; // Check for review indicators const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|dienų|savaitės)/i); // Must have at least author, rating, and text to be a review const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; if (indicators >= 3) { reviewElements.push(div); } } if (reviewElements.length > 0) { elements = reviewElements; console.log('Found', reviewElements.length, 'reviews using structural matching'); } } // STRATEGY 3: Try role="article" as last resort (within pane) if (!elements || elements.length === 0) { const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb') || document.querySelector('div[role="main"]'); if (pane) { const articles = pane.querySelectorAll('[role="article"]'); const validArticles = []; for (let article of articles) { const hasRating = article.querySelector('[aria-label*="star" i]'); const hasText = article.textContent.length > 30; if (hasRating && hasText) { validArticles.push(article); } } if (validArticles.length > 0) { elements = validArticles; console.log('Found', validArticles.length, 'reviews using role=article'); } } } if (!elements || elements.length === 0) { console.warn('No review elements found with any strategy'); return []; } for (let i = 0; i < elements.length; i++) { const elem = elements[i]; const review = {}; try { // Author const authorElem = elem.querySelector('div.d4r55'); review.author = authorElem ? authorElem.textContent.trim() : null; // Rating const ratingElem = elem.querySelector('span.kvMYJc'); if (ratingElem) { const ariaLabel = ratingElem.getAttribute('aria-label'); if (ariaLabel) { const match = ariaLabel.match(/\\d+/); review.rating = match ? parseFloat(match[0]) : null; } } // Text const textElem = elem.querySelector('span.wiI7pd'); review.text = textElem ? textElem.textContent.trim() : null; // Date const dateElem = elem.querySelector('span.rsqaWe'); review.date_text = dateElem ? dateElem.textContent.trim() : null; // DEEP DIVE: Find where Google stores the actual timestamp review.timestamp = null; review.debug_date_info = {}; if (dateElem) { // 1. Check all attributes on date element const allAttrs = {}; for (let attr of dateElem.attributes) { allAttrs[attr.name] = attr.value; } review.debug_date_info.date_elem_attrs = allAttrs; // 2. Check parent elements for data let parent = dateElem.parentElement; let parentLevel = 0; while (parent && parentLevel < 3) { const parentAttrs = {}; for (let attr of parent.attributes) { if (attr.name.includes('data') || attr.name.includes('time') || attr.name.includes('date')) { parentAttrs[attr.name] = attr.value; } } if (Object.keys(parentAttrs).length > 0) { review.debug_date_info[`parent_${parentLevel}_attrs`] = parentAttrs; } parent = parent.parentElement; parentLevel++; } // 3. Check the entire review container for hidden data const reviewContainer = elem; const containerAttrs = {}; for (let attr of reviewContainer.attributes) { containerAttrs[attr.name] = attr.value; } review.debug_date_info.container_attrs = containerAttrs; // 4. Look for script tags or JSON data near the date const nearbyScripts = elem.querySelectorAll('script'); if (nearbyScripts.length > 0) { review.debug_date_info.has_nearby_scripts = nearbyScripts.length; } // 5. Check for any element with 'time' in class or data const timeElements = elem.querySelectorAll('[class*="time"], [data-timestamp], [datetime]'); if (timeElements.length > 0) { const timeData = []; timeElements.forEach(el => { timeData.push({ tag: el.tagName, classes: el.className, datetime: el.getAttribute('datetime'), timestamp: el.getAttribute('data-timestamp'), text: el.textContent.substring(0, 50) }); }); review.debug_date_info.time_elements = timeData; } } // Avatar const avatarElem = elem.querySelector('img.NBa7we'); review.avatar_url = avatarElem ? avatarElem.src : null; // Profile URL const profileElem = elem.querySelector('button.WEBjve'); review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null; if (review.author && review.date_text) { reviews.push(review); } } catch (e) { // Skip this review } } return reviews; """ # ADDITIONAL: Check for Google's internal state/data objects check_state_script = """ // Look for Google Maps' internal data stores const debugInfo = { global_keys: [], app_data: null, window_data: null }; // Check window object for Google Maps data for (let key in window) { if (key.includes('google') || key.includes('maps') || key.includes('APP') || key.includes('_')) { debugInfo.global_keys.push(key); } } // Check for common React/Angular state keys const stateKeys = ['__INITIAL_STATE__', '__NEXT_DATA__', '__APP_STATE__', 'APP_INITIALIZATION_STATE']; for (let key of stateKeys) { if (window[key]) { debugInfo.app_data = key; } } // Check for embedded JSON in script tags const scriptTags = document.querySelectorAll('script[type="application/json"], script[type="application/ld+json"]'); debugInfo.json_scripts_count = scriptTags.length; if (scriptTags.length > 0) { debugInfo.json_scripts_sample = Array.from(scriptTags).slice(0, 2).map(s => s.textContent.substring(0, 200)); } return debugInfo; """ try: reviews_data = driver.execute_script(extract_script) state_debug = driver.execute_script(check_state_script) # Log the global state debug info log.info(f"Google Maps state debug: {state_debug}") # Add review IDs reviews = [] for i, review_data in enumerate(reviews_data): review_id = f"review_{hash(review_data['author'] + review_data['date_text'])}" review_data['review_id'] = review_id # Add global state debug to first review only if i == 0: review_data['_google_state_debug'] = state_debug reviews.append(review_data) return reviews except Exception as e: log.error(f"Error in JavaScript extraction: {e}") return [] def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, progress_callback=None, driver=None, return_driver: bool = False) -> Dict[str, Any]: """ Ultra-fast DOM-only scraping with JavaScript extraction. Args: url: Google Maps URL to scrape headless: Run Chrome in headless mode (default: True) max_scrolls: Maximum scrolls safety limit (default: 999999 - effectively unlimited) The scraper stops automatically via idle detection when no new reviews load. progress_callback: Optional callback function(current_count, total_count) for progress updates driver: Existing driver instance to reuse (from worker pool) return_driver: If True, don't close driver and return it in result Returns: Dictionary with: - reviews: List of review dictionaries - count: Total number of reviews scraped - total_reviews: Total reviews available (from page counter) - time: Time taken in seconds - success: True if successful, False otherwise - error: Error message if failed - driver: Driver instance (if return_driver=True) """ start_time = time.time() log.info(f"Starting fast scrape for URL: {url[:80]}...") # Force English locale for consistent date parsing # English gives cleaner date formats: "3 months ago" vs "Hace 3 meses" # Store original URL in case we need to retry without locale override original_url = url locale_override_applied = False if 'hl=' in url: # Replace existing locale url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') locale_override_applied = True else: # Add English locale parameter separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" locale_override_applied = True log.info(f"Using English locale (hl=en) for consistent date parsing") # Track if driver was provided or created driver_provided = driver is not None should_close_driver = not return_driver and not driver_provided # Initialize driver with custom user agent to avoid headless detection # Even with headless=False + Xvfb, Chromium still reports as HeadlessChrome if not driver: driver = Driver( uc=True, headless=headless, page_load_strategy="normal", agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) try: # Navigate driver.get(url) time.sleep(1.5) # Handle GDPR consent page (CRITICAL FIX for headless mode!) if 'consent.google.com' in driver.current_url: try: # Find all form buttons and click "Accept all" / "Aceptar todo" form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') for btn in form_btns: btn_text = (btn.text or '').lower() if 'aceptar todo' in btn_text or 'accept all' in btn_text: log.info(f"Clicking GDPR consent: {btn.text}") btn.click() time.sleep(2) break else: # Fallback: click second button (usually "Accept all") if len(form_btns) >= 2: log.info("Using fallback: clicking second form button") form_btns[1].click() time.sleep(2) except Exception as e: log.warning(f"GDPR consent handling failed: {e}") # Dismiss cookie banner on Maps page try: cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') if cookie_btns: cookie_btns[0].click() time.sleep(0.3) except: pass # Click reviews tab with retry logic (important for containers) review_keywords = ['reviews', 'review', 'reseñas', 'reseña'] reviews_tab_clicked = False # Try multiple times to find and click reviews tab for attempt in range(3): if reviews_tab_clicked: break time.sleep(0.5) # Wait between attempts for selector in ['button[role="tab"]', '.LRkQ2', 'button']: try: tabs = driver.find_elements(By.CSS_SELECTOR, selector) for tab in tabs: text = (tab.text or '').lower() aria = (tab.get_attribute('aria-label') or '').lower() if any(kw in text or kw in aria for kw in review_keywords): log.info(f"Clicking reviews tab: {tab.text or aria[:30]}") driver.execute_script("arguments[0].click();", tab) time.sleep(1.5) # Wait for tab to load reviews_tab_clicked = True break if reviews_tab_clicked: break except Exception as e: log.debug(f"Tab search attempt {attempt+1} with {selector}: {e}") continue if not reviews_tab_clicked: log.warning("Could not find reviews tab with hl=en locale") # FALLBACK: If locale override was applied and tab not found, # retry without locale override (fixes regional pages where hl=en breaks tabs) if locale_override_applied: log.info("Retrying without locale override to find reviews tab...") # Reload page with original URL (no hl=en) driver.get(original_url) time.sleep(1.5) # Handle GDPR again if needed if 'consent.google.com' in driver.current_url: try: form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') for btn in form_btns: btn_text = (btn.text or '').lower() if any(keyword in btn_text for keyword in ['accept', 'aceptar', 'priim', 'принять', 'accepter']): log.info(f"Clicking GDPR consent: {btn.text}") btn.click() time.sleep(2) break else: if len(form_btns) >= 2: log.info("Using fallback: clicking second form button") form_btns[1].click() time.sleep(2) except Exception as e: log.warning(f"GDPR consent handling failed: {e}") # Dismiss cookie banner try: cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') if cookie_btns: cookie_btns[0].click() time.sleep(0.3) except: pass # Try to find reviews tab with multilingual keywords multilingual_keywords = [ 'review', 'reviews', # English 'reseña', 'reseñas', # Spanish 'avis', # French 'bewertung', 'bewertungen', # German 'recensione', 'recensioni', # Italian 'レビュー', # Japanese 'avaliação', 'avaliações', # Portuguese 'отзыв', 'отзывы', # Russian 'atsiliepimai', 'atsiliepi', # Lithuanian 'ulasan', # Indonesian '리뷰' # Korean ] for attempt in range(3): if reviews_tab_clicked: break time.sleep(0.5) for selector in ['button[role="tab"]', '.LRkQ2', 'button']: try: tabs = driver.find_elements(By.CSS_SELECTOR, selector) for tab in tabs: text = (tab.text or '').lower() aria = (tab.get_attribute('aria-label') or '').lower() if any(kw in text or kw in aria for kw in multilingual_keywords): log.info(f"Clicking reviews tab (native locale): {tab.text or aria[:30]}") driver.execute_script("arguments[0].click();", tab) time.sleep(1.5) reviews_tab_clicked = True break if reviews_tab_clicked: break except Exception as e: log.debug(f"Native locale tab search attempt {attempt+1} with {selector}: {e}") continue if not reviews_tab_clicked: log.warning("Could not find reviews tab even without locale override") # Wait for reviews section to load time.sleep(2) # EARLY DETECTION: Check if there are no reviews before attempting to scrape no_reviews, reason = check_no_reviews_early(driver) if no_reviews: log.info(f"Early detection: No reviews available. Reason: {reason}") return { "reviews": [], "count": 0, "total_reviews": 0, "time": time.time() - start_time, "success": True, "message": f"No reviews available: {reason}" } # Extract total review count from the page total_reviews = extract_total_review_count(driver) # Double-check: If extracted count is 0, return early if total_reviews == 0: log.info("Total review count is 0, skipping scraping") return { "reviews": [], "count": 0, "total_reviews": 0, "time": time.time() - start_time, "success": True, "message": "Business has 0 reviews" } # Report initial progress with total count if progress_callback and total_reviews: try: progress_callback(0, total_reviews) except Exception as e: log.warning(f"Progress callback failed: {e}") # Find scrollable pane - try multiple selectors (container-friendly) pane = None pane_selectors = [ 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', 'div.m6QErb.WNBkOb.XiKgde', 'div.m6QErb', # Fallback to more general selector 'div[role="main"]', ] wait = WebDriverWait(driver, 5) for selector in pane_selectors: try: pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector))) log.info(f"Found pane with selector: {selector}") break except TimeoutException: continue if not pane: error_msg = "Could not find scrollable pane after trying all selectors" log.error(error_msg) return { "reviews": [], "count": 0, "total_reviews": total_reviews, "time": time.time() - start_time, "success": False, "error": error_msg } # Wait longer for initial reviews to load (containers can be slower) time.sleep(2) # Setup scroll driver.execute_script("window.scrollablePane = arguments[0];", pane) scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" # Trigger initial scroll and verify reviews are loading driver.execute_script(scroll_script) time.sleep(0.8) # Also scroll the main window (helps in some cases, especially containers) driver.execute_script("window.scrollBy(0, 500);") time.sleep(0.5) # JavaScript function to count reviews using ROBUST structural patterns # Instead of relying on CSS classes, we look for containers with review-like structure count_reviews_script = """ // STRATEGY 1: Try known selectors first (fast path) const knownSelectors = [ 'div.jftiEf.fontBodyMedium', 'div.jftiEf', 'div[data-review-id]', 'div[jsaction*="review"]' ]; for (let selector of knownSelectors) { const found = document.querySelectorAll(selector); if (found.length > 0) { return found.length; } } // STRATEGY 2: Structural pattern matching (robust, class-agnostic) // Find containers that LOOK like reviews (have author + rating + text structure) // IMPORTANT: Search only within the reviews pane, not the entire page! const findReviewsByStructure = () => { // Find the reviews pane first const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb') || document.querySelector('div[role="main"]'); if (!pane) return 0; // Search only within the pane const allDivs = pane.querySelectorAll('div'); let reviewCount = 0; for (let div of allDivs) { // Skip if too small (reviews have substantial content) if (div.children.length < 2) continue; // Look for review indicators: // - Has an author name (usually in a span/div with small text) // - Has a rating (span with aria-label containing "star" or "rating") // - Has review text (span/div with longer text content) const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année|dienų|savaitės)/i); // If it has at least 3 of these indicators, it's likely a review const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; if (indicators >= 3) { reviewCount++; } } return reviewCount > 0 ? reviewCount : 0; }; // STRATEGY 3: Look for role="article" with review-like content (within pane) const pane3 = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb') || document.querySelector('div[role="main"]'); if (pane3) { const articles = pane3.querySelectorAll('[role="article"]'); if (articles.length > 0) { let validArticles = 0; for (let article of articles) { // Check if article looks like a review (has rating + text) const hasRating = article.querySelector('[aria-label*="star" i]'); const hasText = article.textContent.length > 30; if (hasRating && hasText) { validArticles++; } } if (validArticles > 0) return validArticles; } } // Try structural matching as last resort const structuralCount = findReviewsByStructure(); return structuralCount; """ # Check if reviews are actually loading initial_count = driver.execute_script(count_reviews_script) if initial_count < 5: # Reviews not loaded yet, wait more and try alternative scrolling log.info(f"Waiting for reviews to load (found {initial_count})...") # Try clicking on the pane to focus it try: driver.execute_script("arguments[0].click();", pane) time.sleep(0.5) except: pass # Scroll both pane and window driver.execute_script(scroll_script) driver.execute_script("window.scrollBy(0, 500);") time.sleep(1.5) initial_count = driver.execute_script(count_reviews_script) log.info(f"After extra waiting: {initial_count} reviews") log.info(f"Scrolling to load all reviews (starting with {initial_count})...") # Fast scrolling to load all DOM elements last_count = 0 idle_count = 0 for i in range(max_scrolls): # Scroll to load more prev_count = driver.execute_script(count_reviews_script) driver.execute_script(scroll_script) # SMART WAIT: Wait until new reviews actually load max_wait = 1.2 wait_step = 0.05 waited = 0 while waited < max_wait: time.sleep(wait_step) waited += wait_step current_count = driver.execute_script(count_reviews_script) # If reviews loaded, continue immediately! if current_count > prev_count: idle_count = 0 # Reset idle counter break # Give Google Maps more time to lazy-load (0.6s instead of 0.3s) # Only exit early if we're confident nothing is loading if waited >= 0.6 and current_count == prev_count: break # Track consecutive idle scrolls if current_count == prev_count: idle_count += 1 # Be VERY patient: wait for 12 consecutive idle scrolls to ensure we get ALL reviews # (each with up to 1.2s wait = ~14.4s total idle time before giving up) # This ensures Google Maps has plenty of time to lazy-load all content if idle_count >= 12: log.info(f"Reached end at {current_count} reviews (12 consecutive idle scrolls)") # Double-check we got all reviews if we know the total if total_reviews and current_count < total_reviews: log.warning(f"Only got {current_count}/{total_reviews} reviews ({current_count/total_reviews*100:.1f}%). Some may be hidden or loading slowly.") break # Progress logging and callback every 5 scrolls if (i + 1) % 5 == 0: log.info(f"{current_count} review elements loaded...") if progress_callback and total_reviews: try: progress_callback(current_count, total_reviews) except Exception as e: log.warning(f"Progress callback failed: {e}") # Aggressive memory management every 20 scrolls if (i + 1) % 20 == 0: try: # Clear console logs to prevent buildup driver.execute_script("console.clear();") # Force garbage collection in browser driver.execute_script(""" if (window.gc) { window.gc(); } // Remove image srcs to free memory (images reload on demand) document.querySelectorAll('img').forEach(img => { if (img.complete && !img.classList.contains('needed')) { img.removeAttribute('src'); } }); """) # Brief pause to let Chrome breathe time.sleep(0.1) except Exception: pass # Ignore if fails last_count = current_count # Shorter final scroll for _ in range(2): driver.execute_script(scroll_script) time.sleep(0.3) scroll_time = time.time() - start_time log.info(f"Scrolling complete in {scroll_time:.2f}s") # Update progress: scrolling done, starting extraction if progress_callback and total_reviews: try: progress_callback(current_count, total_reviews) except Exception as e: log.warning(f"Progress callback failed: {e}") # Extract ALL reviews using JavaScript (fast!) log.info("Extracting reviews with JavaScript...") extract_start = time.time() all_reviews = extract_all_reviews_js(driver) extract_time = time.time() - extract_start log.info(f"Extraction complete in {extract_time:.2f}s") # Final progress update with actual extracted count if progress_callback and total_reviews: try: progress_callback(len(all_reviews), total_reviews) except Exception as e: log.warning(f"Progress callback failed: {e}") elapsed = time.time() - start_time log.info(f"Fast scrape completed: {len(all_reviews)} reviews in {elapsed:.2f}s") result = { "reviews": all_reviews, "count": len(all_reviews), "total_reviews": total_reviews, "time": elapsed, "scroll_time": scroll_time, "extract_time": extract_time, "success": True, "error": None } if return_driver: result["driver"] = driver return result except Exception as e: elapsed = time.time() - start_time error_msg = f"Fast scrape failed: {str(e)}" log.error(error_msg) # Check if this is a tab crash - try to extract what we have partial_reviews = [] is_tab_crash = "tab crashed" in str(e).lower() or "session deleted" in str(e).lower() if is_tab_crash and driver: log.warning("Detected tab crash - attempting to extract partial reviews from DOM before crash...") try: # Try to extract reviews that were loaded before crash partial_reviews = extract_all_reviews_js(driver) log.info(f"Recovered {len(partial_reviews)} reviews from crashed session") except Exception as recovery_error: log.error(f"Could not recover reviews: {recovery_error}") # Return partial results if we got any if partial_reviews: result = { "reviews": partial_reviews, "count": len(partial_reviews), "total_reviews": None, "time": elapsed, "success": False, # Mark as failed but with partial data "error": f"{error_msg} (recovered {len(partial_reviews)} reviews)", "partial": True } else: result = { "reviews": [], "count": 0, "total_reviews": None, "time": elapsed, "success": False, "error": error_msg } if return_driver: result["driver"] = driver return result finally: if should_close_driver and driver: try: driver.quit() except: pass def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]: """ Extract business card information from Google Maps. Uses the same reliable navigation logic as the main scraper. Returns business card with: - name - address - rating (float) - total_reviews (int) - success/error """ import time as timing_module start_time = timing_module.time() log.info(f"[PROFILE] Getting business card info for: {url}") driver_provided = driver is not None should_close_driver = not return_driver and not driver_provided try: # Initialize driver if not provided t0 = timing_module.time() if not driver: driver = Driver( uc=True, headless=headless, page_load_strategy="normal", agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) log.info(f"[PROFILE] Driver initialization: {timing_module.time() - t0:.2f}s") else: log.info(f"[PROFILE] Using pooled driver (0.00s)") # Force English locale AND US region for consistent parsing/results # This helps avoid geolocation-based variations in Google Maps results if 'hl=' in url: url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') else: separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" # Add US region parameter if not present if 'gl=' not in url: url = f"{url}&gl=us" # Set Chrome geolocation to US (Boston, MA) using CDP # This ensures Google Maps shows US results regardless of server location try: driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { 'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100 }) log.info("Set geolocation to US (Boston, MA)") except Exception as e: log.warning(f"Could not set geolocation: {e}") log.info(f"Loading Google Maps page...") t0 = timing_module.time() driver.get(url) log.info(f"[PROFILE] Page load (driver.get): {timing_module.time() - t0:.2f}s") t0 = timing_module.time() time.sleep(0.5) # Initial wait - reduced from 2s log.info(f"[PROFILE] Initial sleep: {timing_module.time() - t0:.2f}s") # Handle GDPR consent page t0 = timing_module.time() if 'consent.google.com' in driver.current_url: log.info("Detected GDPR consent page, accepting...") try: form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') for btn in form_btns: btn_text = (btn.text or '').lower() if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text: log.info(f"Clicking GDPR consent: {btn.text}") btn.click() time.sleep(1) break else: if len(form_btns) >= 2: log.info("Using fallback: clicking second form button") form_btns[1].click() time.sleep(1) except Exception as e: log.warning(f"GDPR consent handling failed: {e}") # After GDPR consent, reload the original URL to ensure proper page state log.info(f"Reloading original URL after GDPR consent...") driver.get(url) time.sleep(1) log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s") else: log.info(f"[PROFILE] No GDPR consent page (0.00s)") # Dismiss cookie banner try: cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') if cookie_btns: log.info("Dismissing cookie banner...") cookie_btns[0].click() time.sleep(0.3) # Reduced from 0.5s except: pass log.info(f"[PROFILE] Cookie banner dismissal: {timing_module.time() - t0:.2f}s") # Wait for page to load - use smart waits t0 = timing_module.time() try: log.info("Waiting for Google Maps content to load...") wait = WebDriverWait(driver, 10) # Wait for basic page structure (h1 or heading) wait.until( lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]') ) log.info("Basic page structure loaded") # Wait for page to settle - search URLs redirect to place URLs # which triggers additional content loading time.sleep(2) # Wait specifically for review count element (aria-label ending with "reviews") # This is the most reliable indicator that the business detail is loaded try: WebDriverWait(driver, 5).until( lambda d: d.execute_script(""" var elems = document.querySelectorAll('[aria-label]'); for (var i = 0; i < elems.length; i++) { var label = elems[i].getAttribute('aria-label') || ''; if (/^[0-9]+ reviews?$/.test(label)) return true; } return false; """) ) log.info("Review count element loaded") except: # Fallback: Try clicking Reviews tab or rating stars to expose the review count log.info("Review count wait timeout, trying to click Reviews/rating...") try: # Try 1: Click Reviews tab (if exists) clicked = driver.execute_script(""" var tabs = document.querySelectorAll('[role="tab"]'); for (var i = 0; i < tabs.length; i++) { var txt = (tabs[i].textContent || '').toLowerCase(); if (txt.includes('review')) { tabs[i].click(); return 'tab'; } } // Try 2: Click the rating stars element (often links to reviews) var stars = document.querySelector('[role="img"][aria-label*="star"]'); if (stars) { var parent = stars.parentElement; if (parent && parent.tagName.toLowerCase() === 'button') { parent.click(); return 'stars_button'; } stars.click(); return 'stars'; } // Try 3: Click "Write a review" or any review-related button var btns = document.querySelectorAll('button[aria-label*="review" i]'); for (var b = 0; b < btns.length; b++) { var label = btns[b].getAttribute('aria-label') || ''; if (!/write/i.test(label) && /review/i.test(label)) { btns[b].click(); return 'review_btn: ' + label; } } return 'none'; """) log.info(f"Clicked: {clicked}") time.sleep(2) # Wait for reviews panel to load except Exception as e: log.warning(f"Click attempt failed: {e}") except Exception as e: log.warning(f"Timeout waiting for Maps content: {e}") time.sleep(2) # Fallback wait log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s") log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...") log.info(f"DEBUG: Page title: {driver.title}") # Extract business card information using JavaScript t0 = timing_module.time() extract_script = """ const info = { name: null, address: null, rating: null, total_reviews: null }; // ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============ // Helper: Parse review count from text, handling multiple formats function parseReviewCount(text) { if (!text) return null; // Pattern 1: Exact "N reviews" format (aria-labels, clean text) // Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis" var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i); if (match) { return parseInt(match[1].replace(/[,. ]/g, '')); } // Pattern 2: "(N)" format often used in tabs like "Reviews (27)" match = text.match(/[(]([0-9][0-9,.]*)[)]$/); if (match) { return parseInt(match[1].replace(/[,. ]/g, '')); } // Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives) if (text.length < 30) { match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i); if (match) { return parseInt(match[1].replace(/[,. ]/g, '')); } } return null; } // ============ EXTRACT BUSINESS NAME ============ // Priority: h1 (semantic), then role="heading" const h1 = document.querySelector('h1'); if (h1 && h1.textContent) { info.name = h1.textContent.trim(); } if (!info.name) { const heading = document.querySelector('[role="heading"][aria-level="1"]'); if (heading && heading.textContent) { info.name = heading.textContent.trim(); } } // ============ EXTRACT ADDRESS ============ // Priority: data-item-id (semantic), then aria-label containing "address" const addressElem = document.querySelector('[data-item-id*="address"]'); if (addressElem && addressElem.textContent) { info.address = addressElem.textContent.trim(); } if (!info.address) { const ariaAddress = document.querySelector('[aria-label*="ddress"]'); if (ariaAddress && ariaAddress.textContent) { info.address = ariaAddress.textContent.trim(); } } // ============ EXTRACT RATING ============ // Priority: aria-label containing "star" on role="img" elements info._debug_rating_context = []; const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]'); for (let elem of ratingElems) { const ariaLabel = elem.getAttribute('aria-label') || ''; // Match "4.9 stars" or "4,9 stars" (European format) const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i); if (match) { info.rating = parseFloat(match[1].replace(',', '.')); // DEBUG: Capture parent/sibling context to find review count var parent = elem.parentElement; if (parent) { info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100)); var grandparent = parent.parentElement; if (grandparent) { info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100)); // Check all children of grandparent for review count var gpChildren = grandparent.querySelectorAll('*'); for (var c = 0; c < Math.min(gpChildren.length, 30); c++) { var childText = (gpChildren[c].textContent || '').trim(); if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) { info._debug_rating_context.push('GP_CHILD: ' + childText); } } // Also check great-grandparent var ggp = grandparent.parentElement; if (ggp) { info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150)); } } // Check siblings var nextSib = parent.nextElementSibling; if (nextSib) { info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100)); } } break; } } // ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============ // PRIORITY 1: aria-label with exact "N reviews" format (most reliable) // Google Maps uses aria-label="27 reviews" for accessibility info._debug_aria = []; info._debug_all_numeric = []; if (!info.total_reviews) { var ariaElems = document.querySelectorAll('[aria-label]'); for (var i = 0; i < ariaElems.length; i++) { var ariaLabel = ariaElems[i].getAttribute('aria-label') || ''; // Collect all labels containing "review" if (ariaLabel.toLowerCase().indexOf('review') >= 0) { info._debug_aria.push(ariaLabel); } // Collect all labels starting with a digit if (/^[0-9]/.test(ariaLabel)) { info._debug_all_numeric.push(ariaLabel); } var count = parseReviewCount(ariaLabel); if (count && count > 0 && count < 100000) { info.total_reviews = count; info._debug_matched = ariaLabel; break; } } } // DEBUG: Find all text with parenthetical numbers like "(27)" info._debug_parens = []; info._debug_short_text = []; // All short text with numbers var allSpans = document.querySelectorAll('span, div, a, button'); for (var j = 0; j < Math.min(allSpans.length, 500); j++) { var spanText = allSpans[j].textContent || ''; // Capture parenthetical numbers if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) { info._debug_parens.push(spanText.trim()); } // Capture ALL short text containing numbers (for debugging) if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) { var cleaned = spanText.trim().replace(/\\s+/g, ' '); if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) { info._debug_short_text.push(cleaned); } } } // PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page // This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels if (!info.total_reviews) { var allElems = document.querySelectorAll('*'); for (var k = 0; k < Math.min(allElems.length, 1000); k++) { var elem = allElems[k]; // Skip if has children (we want leaf nodes only) if (elem.children.length > 0) continue; var txt = (elem.textContent || '').trim(); // Look for short text with both numbers and "review" word if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) { var match = txt.match(/([0-9][0-9,]*)/); if (match) { var count = parseInt(match[1].replace(/,/g, '')); if (count > 0 && count < 100000) { info.total_reviews = count; info._debug_matched = 'LEAF: ' + txt; break; } } } } } // DEBUG: Collect all tab names info._debug_tabs = []; const tabs = document.querySelectorAll('[role="tab"]'); for (let t = 0; t < tabs.length; t++) { info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30)); } // DEBUG: Collect all buttons with text (might contain review count) info._debug_buttons = []; const buttons = document.querySelectorAll('button'); for (let b = 0; b < Math.min(buttons.length, 20); b++) { var btnText = (buttons[b].textContent || '').trim(); if (btnText && btnText.length < 40) { info._debug_buttons.push(btnText.substring(0, 40)); } } // PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count) if (!info.total_reviews) { for (let tab of tabs) { const text = (tab.textContent || '').trim(); // Look for "Reviews" tab with count if (text.toLowerCase().includes('review')) { const count = parseReviewCount(text); if (count && count > 0) { info.total_reviews = count; info._debug_matched = 'TAB: ' + text; break; } } } } // PRIORITY 2.3: Reviews panel header (after clicking Reviews tab) // Google Maps shows "27 reviews" as heading text in the reviews panel if (!info.total_reviews) { // Look for headings containing review count var headings = document.querySelectorAll('h1, h2, [role="heading"]'); for (var h = 0; h < headings.length; h++) { var hText = (headings[h].textContent || '').trim(); if (/review/i.test(hText)) { var match = hText.match(/([0-9][0-9,]*)/); if (match) { var count = parseInt(match[1].replace(/,/g, '')); if (count > 0 && count < 100000) { info.total_reviews = count; info._debug_matched = 'HEADING: ' + hText; break; } } } } } // PRIORITY 2.4: Look for sort button area which often has total count // The sort dropdown area displays "Sort: Newest" and total reviews if (!info.total_reviews) { var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]'); for (var s = 0; s < sortBtns.length; s++) { var parent = sortBtns[s].parentElement; if (parent) { var pText = (parent.textContent || '').trim(); if (/review/i.test(pText)) { var match = pText.match(/([0-9][0-9,]*)\\s*review/i); if (match) { var count = parseInt(match[1].replace(/,/g, '')); if (count > 0 && count < 100000) { info.total_reviews = count; info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50); break; } } } } } } // PRIORITY 3: Elements with semantic review-related attributes if (!info.total_reviews) { const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]'); for (let elem of reviewLinks) { const text = (elem.textContent || '').trim(); const count = parseReviewCount(text); if (count && count > 0) { info.total_reviews = count; break; } } } // PRIORITY 4: Look for standalone review count text near rating // Find elements that contain ONLY "N reviews" pattern (not concatenated with rating) if (!info.total_reviews) { const allElements = document.querySelectorAll('span, a'); for (let elem of allElements) { // Get direct text content only (not nested children) const text = (elem.textContent || '').trim(); // Skip if too long (likely contains other content) if (text.length > 50) continue; // Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews") if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue; const count = parseReviewCount(text); if (count && count > 0 && count < 100000) { info.total_reviews = count; break; } } } // PRIORITY 5: Parse from visible page text using regex on short text blocks if (!info.total_reviews) { const walker = document.createTreeWalker( document.body, NodeFilter.SHOW_TEXT, null, false ); while (walker.nextNode()) { const text = walker.currentNode.textContent.trim(); if (text.length >= 5 && text.length <= 30) { // Match "27 reviews" but not "4.927 reviews" const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i); if (match) { const count = parseInt(match[1].replace(/[,]/g, '')); if (count > 0 && count < 100000) { info.total_reviews = count; info._debug_matched = 'WALKER: ' + text; break; } } } } } // PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts) if (!info.total_reviews) { var scripts = document.querySelectorAll('script'); for (var sc = 0; sc < scripts.length; sc++) { var scriptText = scripts[sc].textContent || ''; // Look for patterns like "user_reviews":{"count":27} or reviews_count":27 var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i); if (jsonMatch) { var count = parseInt(jsonMatch[1]); if (count > 0 && count < 100000) { info.total_reviews = count; info._debug_matched = 'JSON_SCRIPT'; break; } } // Also look for review count in Google's data format like [\"27 reviews\"] if (!info.total_reviews) { var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i); if (dataMatch) { var count = parseInt(dataMatch[1]); if (count > 0 && count < 100000) { info.total_reviews = count; info._debug_matched = 'JSON_DATA: ' + dataMatch[0]; break; } } } } } return info; """ business_info = driver.execute_script(extract_script) log.info(f"[PROFILE] Business card extraction: {timing_module.time() - t0:.2f}s") total_time = timing_module.time() - start_time log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***") log.info(f"Business card extracted: name={business_info.get('name')}, " f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}") # Debug: log what aria-labels were found if business_info.get('_debug_aria'): log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}") if business_info.get('_debug_matched'): log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}") # Also log all numeric aria-labels (potential review counts) if business_info.get('_debug_all_numeric'): log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}") # Log any text with parenthetical numbers like "(27)" if business_info.get('_debug_parens'): log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}") # Log all short text containing numbers (for debugging review count detection) if business_info.get('_debug_short_text'): log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}") # Log the context around the rating element if business_info.get('_debug_rating_context'): for ctx in business_info.get('_debug_rating_context', []): log.info(f"DEBUG: Rating context: {ctx}") # Log what tabs exist on the page if business_info.get('_debug_tabs'): log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}") else: log.info(f"DEBUG: No tabs found on page") # Log buttons (might contain review count) if business_info.get('_debug_buttons'): log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}") result = { "name": business_info.get('name'), "address": business_info.get('address'), "rating": business_info.get('rating'), "total_reviews": business_info.get('total_reviews') or 0, "has_reviews": (business_info.get('total_reviews') or 0) > 0, "success": True, "error": None } if return_driver: result["driver"] = driver return result except Exception as e: total_time = timing_module.time() - start_time error_msg = f"Failed to get business card info: {str(e)}" log.error(error_msg) log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME (FAILED): {total_time:.2f}s ***") result = { "name": None, "address": None, "rating": None, "total_reviews": 0, "has_reviews": False, "success": False, "error": error_msg } if return_driver: result["driver"] = driver return result finally: if should_close_driver and driver: try: driver.quit() except: pass def check_reviews_available(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]: """ Lightweight check to see if a business has reviews available. This function just loads the page and checks for review count without doing the full scraping. Used to enable/disable scrape button in UI. Args: url: Google Maps URL to check headless: Run in headless mode (default True) driver: Existing driver instance to reuse (from worker pool) return_driver: If True, don't close driver and return it in result Returns: Dict containing: - has_reviews: bool - whether reviews exist - review_count: int - number of reviews (0 if none) - business_name: str - name of business (if found) - success: bool - whether check succeeded - error: str - error message (if failed) - driver: Driver instance (if return_driver=True) """ import time as timing_module start_time = timing_module.time() log.info(f"[PROFILE] Starting validation for: {url}") driver_provided = driver is not None should_close_driver = not return_driver and not driver_provided try: # Initialize driver if not provided t0 = timing_module.time() if not driver: driver = Driver(uc=True, headless=headless) driver.maximize_window() log.info(f"[PROFILE] Driver initialization: {timing_module.time() - t0:.2f}s") else: log.info(f"[PROFILE] Using pooled driver (0.00s)") # Navigate to the URL t0 = timing_module.time() log.info(f"Loading Google Maps page...") driver.get(url) log.info(f"[PROFILE] Page load (driver.get): {timing_module.time() - t0:.2f}s") t0 = timing_module.time() time.sleep(0.5) # Initial wait - reduced from 2s log.info(f"[PROFILE] Initial sleep: {timing_module.time() - t0:.2f}s") # Handle GDPR consent page (CRITICAL for validation to work!) t0 = timing_module.time() if 'consent.google.com' in driver.current_url: log.info("Detected GDPR consent page, accepting...") try: # Find all form buttons and click "Accept all" / "Aceptar todo" form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') for btn in form_btns: btn_text = (btn.text or '').lower() if 'aceptar todo' in btn_text or 'accept all' in btn_text: log.info(f"Clicking GDPR consent: {btn.text}") btn.click() time.sleep(1) # Reduced from 2s break else: # Fallback: click second button (usually "Accept all") if len(form_btns) >= 2: log.info("Using fallback: clicking second form button") form_btns[1].click() time.sleep(1) # Reduced from 2s except Exception as e: log.warning(f"GDPR consent handling failed: {e}") log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s") else: log.info(f"[PROFILE] No GDPR consent page (0.00s)") # Dismiss cookie banner on Maps page t0 = timing_module.time() try: cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') if cookie_btns: log.info("Dismissing cookie banner...") cookie_btns[0].click() time.sleep(0.3) # Reduced from 0.5s except: pass log.info(f"[PROFILE] Cookie banner dismissal: {timing_module.time() - t0:.2f}s") # Wait for page to fully load after consent - use smart waits t0 = timing_module.time() try: # Wait for either business card OR search results to appear log.info("Waiting for Google Maps content to load...") wait = WebDriverWait(driver, 10) wait.until( lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]') ) log.info("Google Maps content loaded successfully") except Exception as e: log.warning(f"Timeout waiting for Maps content: {e}") time.sleep(0.5) # Minimal fallback wait log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s") # Try to extract business name t0 = timing_module.time() business_name = None try: business_name_script = """ // Try to find business name from various locations let name = null; // Method 1: Look for business name in the main panel (most reliable) // This is where the actual business info appears const businessPanelSelectors = [ 'h1.DUwDvf', // Main business name heading '[role="main"] h1', // H1 in main content 'h1.fontHeadlineLarge', // Large headline font 'button[jsaction*="pane.header.rating"] h1', // Near rating button ]; for (const selector of businessPanelSelectors) { const element = document.querySelector(selector); if (element && element.textContent) { const text = element.textContent.trim(); // Filter out Google's placeholder/suggestion text if (text && !text.toLowerCase().includes('antes de ir') && !text.toLowerCase().includes('before going') && !text.toLowerCase().includes('google maps') && text.length < 100) { // Business names shouldn't be super long name = text; break; } } } // Method 2: h1 tag (fallback) if (!name) { const h1 = document.querySelector('h1'); if (h1 && h1.textContent) { const text = h1.textContent.trim(); if (!text.toLowerCase().includes('antes de ir') && !text.toLowerCase().includes('before going')) { name = text; } } } // Method 3: Title attribute (last resort) if (!name) { const title = document.title; if (title && !title.includes('Google Maps')) { name = title.split('-')[0].trim(); } } return name; """ business_name = driver.execute_script(business_name_script) if business_name: log.info(f"Found business name: {business_name}") except Exception as e: log.debug(f"Could not extract business name: {e}") log.info(f"[PROFILE] Business name extraction: {timing_module.time() - t0:.2f}s") # Extract total review count t0 = timing_module.time() review_count = extract_total_review_count(driver) log.info(f"[PROFILE] Review count extraction: {timing_module.time() - t0:.2f}s") if review_count is None: log.warning("Could not determine review count") total_time = timing_module.time() - start_time log.info(f"[PROFILE] *** TOTAL VALIDATION TIME: {total_time:.2f}s ***") result = { "has_reviews": False, "review_count": 0, "business_name": business_name, "success": True, "error": "Could not find review count on page" } if return_driver: result["driver"] = driver return result log.info(f"Found {review_count} reviews available") total_time = timing_module.time() - start_time log.info(f"[PROFILE] *** TOTAL VALIDATION TIME: {total_time:.2f}s ***") result = { "has_reviews": review_count > 0, "review_count": review_count, "business_name": business_name, "success": True, "error": None } if return_driver: result["driver"] = driver return result except Exception as e: total_time = timing_module.time() - start_time error_msg = f"Failed to check reviews: {str(e)}" log.error(error_msg) log.info(f"[PROFILE] *** TOTAL VALIDATION TIME (FAILED): {total_time:.2f}s ***") result = { "has_reviews": False, "review_count": 0, "business_name": None, "success": False, "error": error_msg } if return_driver: result["driver"] = driver return result finally: if should_close_driver and driver: try: driver.quit() except: pass