#!/usr/bin/env python3
"""
DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.

Strategy:
1. Scroll to load all reviews
2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
3. Should be faster and simpler than API + DOM hybrid

Target: ~20-25 seconds for all 244 reviews with simpler code
"""

import hashlib
import json
import logging
import sys
import time

import yaml
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import Driver

logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Number of reviews the summary compares the result against.
EXPECTED_REVIEWS = 244
# Run time (seconds) that the "Speedup" figure in the summary is relative to.
BASELINE_SECONDS = 155
# Where the scraped reviews are written.
OUTPUT_PATH = 'google_reviews_dom_only_fast.json'

# Safety cap on scroll passes; the loop normally stops through idle detection.
MAX_SCROLLS = 999999
# Consecutive scroll passes without new reviews before scrolling stops.
IDLE_PASSES_BEFORE_STOP = 3
# Polling of the review count after each scroll: one poll every POLL_INTERVAL
# seconds, giving up after IDLE_POLLS polls (0.3 s) when the count is unchanged
# and after MAX_POLLS polls (1.0 s) in any case.
POLL_INTERVAL = 0.05
IDLE_POLLS = 6
MAX_POLLS = 20

# Candidate selectors for the scrollable reviews pane, tried in order.
PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.WNBkOb.XiKgde',
)

REVIEW_KEYWORDS = ['reviews', 'review', 'reseñas', 'reseña']

_COUNT_REVIEWS_JS = (
    "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
)
_SCROLL_JS = (
    "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
)

# Runs in the page and returns one plain object per review element.
# `rating` defaults to null so the key is always present in the result.
# NOTE(review): `profile_url` is filled from the button's `data-review-id`
# attribute, not from a URL -- the key name is kept for compatibility with
# existing output; confirm what consumers expect.
_EXTRACT_REVIEWS_JS = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');

    for (let i = 0; i < elements.length; i++) {
        const elem = elements[i];
        const review = {rating: null};

        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;

            // Rating
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }

            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;

            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;

            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;

            // Profile URL
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }

    return reviews;
"""


def load_config():
    """Load ``config.yaml`` from the current working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; callers use .get().
        return yaml.safe_load(f) or {}


def _make_review_id(author, date_text):
    """Return a review ID derived from author and date text.

    The built-in ``hash()`` is salted per process for strings, so it cannot
    be used for IDs that must be the same on every run; a SHA-256 digest is
    used instead.
    """
    key = f"{author}\n{date_text}".encode('utf-8')
    return f"review_{hashlib.sha256(key).hexdigest()[:16]}"


def extract_all_reviews_js(driver):
    """Extract ALL reviews using JavaScript - single fast operation.

    Args:
        driver: WebDriver whose current page holds the loaded review elements.

    Returns:
        A list of dicts with the keys ``author``, ``rating``, ``text``,
        ``date_text``, ``avatar_url``, ``profile_url`` and ``review_id``.
        Elements without an author or a date are skipped by the script.
        An empty list is returned when the script fails in the browser.
    """
    try:
        reviews_data = driver.execute_script(_EXTRACT_REVIEWS_JS) or []
    except WebDriverException as e:
        print(f"  Error in JavaScript extraction: {e}")
        return []

    for review_data in reviews_data:
        review_data['review_id'] = _make_review_id(
            review_data['author'], review_data['date_text']
        )
    return reviews_data


def _review_count(driver):
    """Return the number of review elements currently in the DOM."""
    return driver.execute_script(_COUNT_REVIEWS_JS)


def _dismiss_consent(driver):
    """Best-effort dismissal of the consent page and the cookie banner."""
    if 'consent.google.com' in driver.current_url:
        try:
            # Click "Accept all" / "Aceptar todo"
            consent_btns = driver.find_elements(
                By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]'
            ) or driver.find_elements(
                By.CSS_SELECTOR, 'button[aria-label*="Accept"]'
            )
            if consent_btns:
                consent_btns[0].click()
                time.sleep(1.5)
        except WebDriverException as exc:
            # Not fatal: the page may still be usable without the click.
            log.debug("Consent click failed: %s", exc)

    # Dismiss cookie banner on Maps page
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
        )
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.3)
    except WebDriverException as exc:
        log.debug("Cookie banner click failed: %s", exc)


def _open_reviews_tab(driver):
    """Click the first tab per selector whose text or aria-label names reviews."""
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in REVIEW_KEYWORDS):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.3)
                    break
        except WebDriverException as exc:
            # Elements can go stale while the page re-renders; try the next
            # selector instead of aborting.
            log.debug("Reviews tab lookup failed for %r: %s", selector, exc)


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None if no selector matches."""
    wait = WebDriverWait(driver, 3)
    for selector in PANE_SELECTORS:
        try:
            return wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except TimeoutException:
            continue
    return None


def _wait_for_new_reviews(driver, previous_count):
    """Poll the review count after a scroll and return the latest count.

    Returns as soon as the count exceeds ``previous_count``, after
    ``IDLE_POLLS`` polls when it is unchanged, or after ``MAX_POLLS`` polls.
    """
    count = previous_count
    # Polls are counted as integers so the thresholds do not depend on
    # accumulated floating-point sums.
    for poll in range(1, MAX_POLLS + 1):
        time.sleep(POLL_INTERVAL)
        count = _review_count(driver)
        if count > previous_count:
            break
        if count == previous_count and poll >= IDLE_POLLS:
            break
    return count


def _scroll_until_idle(driver):
    """Scroll the pane until several consecutive passes load nothing new."""
    idle_passes = 0
    for scroll_no in range(1, MAX_SCROLLS + 1):
        before = _review_count(driver)
        driver.execute_script(_SCROLL_JS)
        after = _wait_for_new_reviews(driver, before)

        # Progress logging every 10 scrolls
        if scroll_no % 10 == 0:
            print(f"  {after} review elements loaded...")

        if after != before:
            idle_passes = 0
            continue

        # Each idle pass is counted exactly once.
        idle_passes += 1
        if idle_passes >= IDLE_PASSES_BEFORE_STOP:
            print(f"  Reached end at {after} reviews")
            break


def dom_only_fast_scrape():
    """Ultra-fast DOM-only scraping with JavaScript extraction.

    Reads ``url`` and ``headless`` from ``config.yaml``, opens the page,
    scrolls the reviews pane until no more reviews load, extracts every
    review in a single JavaScript call, prints a timing summary and writes
    the result to ``google_reviews_dom_only_fast.json``.

    Returns:
        The list of review dicts; an empty list when no URL is configured
        or the reviews pane cannot be found.
    """
    config = load_config()
    url = config.get('url')
    if not url:
        # Fail before launching a browser instead of crashing on url[:80].
        print("ERROR: No 'url' set in config.yaml")
        return []
    headless = config.get('headless', False)

    print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_consent(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(0.8)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for initial reviews to load
        time.sleep(1.2)

        # The scroll script reads the pane from this page-level global.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)

        # Trigger initial scroll and check that reviews are loading
        driver.execute_script(_SCROLL_JS)
        time.sleep(0.8)

        initial_count = _review_count(driver)
        if initial_count < 5:
            # Reviews not loaded yet, wait more
            print(f"  Waiting for reviews to load (found {initial_count})...")
            time.sleep(1.5)
            driver.execute_script(_SCROLL_JS)
            time.sleep(0.8)
            initial_count = _review_count(driver)

        print(f"Scrolling to load all reviews (starting with {initial_count})...")
        _scroll_until_idle(driver)

        # Shorter final scroll
        for _ in range(2):
            driver.execute_script(_SCROLL_JS)
            time.sleep(0.3)

        scroll_time = time.time() - start_time
        print(f"  Scrolling complete in {scroll_time:.2f}s")

        # Extract ALL reviews using JavaScript (fast!)
        print("Extracting reviews with JavaScript...")
        extract_start = time.time()
        all_reviews = extract_all_reviews_js(driver)
        extract_time = time.time() - extract_start
        print(f"  Extraction complete in {extract_time:.2f}s")

        elapsed = time.time() - start_time
        total = len(all_reviews)

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {total}/{EXPECTED_REVIEWS} ({total/EXPECTED_REVIEWS*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"  - Scrolling: {scroll_time:.2f}s")
        print(f"  - Extraction: {extract_time:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        print(f"Speedup: {BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if total >= EXPECTED_REVIEWS:
            print(f"🎯 Got ALL {EXPECTED_REVIEWS} reviews!")
        elif total >= 240:
            print(f"⚠️  Missing {EXPECTED_REVIEWS-total} reviews")

        print()

        # Save
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to {OUTPUT_PATH}")

        if all_reviews:
            first = all_reviews[0]
            print(f"\nSample: {first['author']} - {first.get('rating')}★")

        return all_reviews

    finally:
        try:
            driver.quit()
        except WebDriverException as exc:
            # A failed shutdown must not hide the scrape result or error.
            log.debug("driver.quit() failed: %s", exc)


if __name__ == '__main__':
    try:
        reviews = dom_only_fast_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)