#!/usr/bin/env python3
"""
ULTRA-FAST API Scraper - Maximum speed optimization.

Opens the configured URL in a browser, switches to the reviews tab, installs
an API response interceptor and scrolls the reviews pane while collecting the
intercepted reviews. If fewer reviews than expected were intercepted, a small
DOM parse of the top of the list fills in the gaps. Results are written to
``google_reviews_ultra_fast.json``.

Optimizations (as implemented below):
1. Short fixed waits (0.4s after the tab click, 1.0s for the reviews view)
2. No wait for "initial reviews" before installing the interceptor
3. Fast scroll timing (0.27s between scrolls)
4. Responses are collected after every scroll (the original author notes
   the interceptor buffer clears otherwise)
5. Progress is printed only every 10 scrolls (I/O overhead)
6. Most common pane selector is tried first, with a single fallback

Target: ~15-20 seconds for ~240 reviews
"""

import hashlib
import json
import logging
import sys
import time
import traceback

import yaml
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)

# Only show INFO and above
log.setLevel(logging.INFO)

CONFIG_PATH = 'config.yaml'
OUTPUT_PATH = 'google_reviews_ultra_fast.json'

# Scrolling stops once this many reviews were intercepted.
TARGET_REVIEWS = 240
# The DOM fallback runs when fewer than this many reviews were intercepted.
EXPECTED_TOTAL_REVIEWS = 244
# Slightly more scrolls to compensate for the faster timing.
MAX_SCROLLS = 35
# Reference duration (seconds) used for the "Speedup" line in the summary.
BASELINE_SECONDS = 155

PRIMARY_PANE_SELECTOR = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
FALLBACK_PANE_SELECTOR = 'div.m6QErb.WNBkOb.XiKgde'
SCROLL_SCRIPT = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"


def load_config():
    """Load and return the parsed contents of ``config.yaml``.

    Returns whatever ``yaml.safe_load`` produces (``None`` for an empty file).
    """
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _dismiss_cookies(driver):
    """Click the first cookie "Accept" button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)  # Balanced wait
    except Exception as exc:
        # A failed cookie dismissal must never abort the scrape.
        log.debug("Cookie dismissal skipped: %s", exc)


def _click_reviews_tab(driver):
    """Click the reviews tab, matching on visible text or aria-label.

    Each selector is tried in turn; for each one the first element whose text
    or aria-label contains a review keyword is clicked via JavaScript.
    """
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)  # Balanced wait
                    break
        except Exception as exc:
            log.debug("Tab lookup with selector %r failed: %s", selector, exc)
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or ``None`` if it cannot be found.

    The most common selector is tried first; on timeout a single fallback
    selector is tried. Any failure of the fallback yields ``None``.
    """
    wait = WebDriverWait(driver, 3)  # Reduced from 5s
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, PRIMARY_PANE_SELECTOR)))
    except TimeoutException:
        pass
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, FALLBACK_PANE_SELECTOR)))
    except Exception as exc:
        log.debug("Fallback pane lookup failed: %s", exc)
        return None


def _collect_api_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into ``api_reviews`` (keyed by id).

    Reviews without an id, or whose id is already known, are ignored.
    Failures are logged and swallowed so one bad response does not stop
    the scrape.
    """
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            review_id = review.review_id
            if review_id and review_id not in api_reviews:
                api_reviews[review_id] = {
                    'review_id': review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                }
    except Exception as exc:
        log.warning("Collecting intercepted reviews failed: %s", exc)


def _first_match(parent, selector):
    """Return the first child of ``parent`` matching ``selector``, or ``None``.

    ``find_elements`` is used on purpose: ``find_element`` raises when nothing
    matches, which would discard reviews that simply lack an optional part
    (for example a rating-only review without text).
    """
    matches = parent.find_elements(By.CSS_SELECTOR, selector)
    return matches[0] if matches else None


def _parse_rating(label):
    """Parse the leading number of a rating aria-label; ``None`` if absent."""
    parts = (label or '').split()
    if not parts:
        return None
    try:
        # Some locales render the value with a decimal comma.
        return float(parts[0].replace(',', '.'))
    except ValueError:
        return None


def _dedup_key(review):
    """Key used to match DOM reviews against already collected reviews."""
    return (review.get('author', ''), (review.get('date_text', '') or '')[:20])


def _parse_dom_review(elem):
    """Extract one review dict from a review card element.

    Every key is always present; parts missing from the DOM are ``None``.
    """
    author_elem = _first_match(elem, 'div.d4r55')
    rating_elem = _first_match(elem, 'span.kvMYJc')
    text_elem = _first_match(elem, 'span.wiI7pd')
    date_elem = _first_match(elem, 'span.rsqaWe')
    avatar_elem = _first_match(elem, 'img.NBa7we')
    profile_elem = _first_match(elem, 'button.WEBjve')
    return {
        'author': author_elem.text if author_elem else None,
        'rating': _parse_rating(
            rating_elem.get_attribute('aria-label')) if rating_elem else None,
        'text': text_elem.text if text_elem else None,
        'date_text': date_elem.text if date_elem else None,
        'avatar_url': avatar_elem.get_attribute('src') if avatar_elem else None,
        # NOTE(review): this stores the button's data-review-id attribute
        # under 'profile_url', exactly as before - confirm this is intended.
        'profile_url': (profile_elem.get_attribute('data-review-id')
                        if profile_elem else None),
    }


def _add_dom_reviews(driver, api_reviews, missing):
    """Parse the top of the reviews list and add reviews not yet collected.

    At most ``min(missing + 5, 20)`` review cards are inspected. A DOM review
    is added when it has an author and its (author, date prefix) key is not
    already known. Returns the number of reviews added.
    """
    driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
    time.sleep(0.3)

    # Parse top reviews (most likely to be missing)
    review_elements = driver.find_elements(
        By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:min(missing + 5, 20)]

    known_keys = {_dedup_key(review) for review in api_reviews.values()}
    dom_added = 0
    for elem in review_elements:
        try:
            review_data = _parse_dom_review(elem)
        except Exception as exc:
            # E.g. the card went stale while scrolling; skip just this one.
            log.debug("Skipping unreadable DOM review: %s", exc)
            continue

        dom_key = _dedup_key(review_data)
        if not review_data['author'] or dom_key in known_keys:
            continue

        # hashlib instead of hash(): str hashes are salted per process, so
        # hash() would give the same review a different id on every run.
        identity = str(review_data['author']) + str(review_data['date_text'])
        digest = hashlib.sha1(identity.encode('utf-8')).hexdigest()[:16]
        review_id = f"dom_{digest}"
        api_reviews[review_id] = {'review_id': review_id, **review_data}
        known_keys.add(dom_key)
        dom_added += 1
    return dom_added


def ultra_fast_scrape():
    """Ultra-fast API-first scraping with all optimizations.

    Reads ``url`` and ``headless`` from ``config.yaml``, scrapes the reviews
    and writes them to ``google_reviews_ultra_fast.json``.

    Returns:
        A list of review dicts (``review_id``, ``author``, ``rating``,
        ``text``, ``date_text``, ``avatar_url``, ``profile_url``), or an
        empty list when the reviews pane could not be found.

    Raises:
        ValueError: if the configuration does not define a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("ULTRA-FAST SCRAPER - Starting...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate (minimal waits)
        driver.get(url)
        time.sleep(1.5)  # Stable wait

        _dismiss_cookies(driver)
        _click_reviews_tab(driver)

        # Brief wait for reviews page (balance speed vs stability)
        time.sleep(1.0)  # Reduced from 3s but needed for stability

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # NO wait for initial reviews - save 3s!
        # Setup API interceptor immediately
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)  # Minimal wait for interceptor

        # Keep the pane on window so the scroll scripts can reach it.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)

        # Trigger initial scroll
        driver.execute_script(SCROLL_SCRIPT)
        time.sleep(0.3)  # Minimal initial trigger wait

        print("Fast scrolling...")

        for i in range(MAX_SCROLLS):
            driver.execute_script(SCROLL_SCRIPT)
            time.sleep(0.27)  # Sweet spot for stability

            # Collect every scroll (can't skip or buffer clears)
            _collect_api_reviews(interceptor, api_reviews)

            # Only log every 10 scrolls to reduce I/O
            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")

            if len(api_reviews) >= TARGET_REVIEWS:
                break

        # Final collection
        _collect_api_reviews(interceptor, api_reviews)

        # Quick DOM parse for missing reviews (only if needed)
        missing = EXPECTED_TOTAL_REVIEWS - len(api_reviews)
        if missing > 0:
            print(f"\nQuick DOM parse for {missing} missing reviews...")
            try:
                dom_added = _add_dom_reviews(driver, api_reviews, missing)
                print(f" +{dom_added} reviews from DOM")
            except Exception as e:
                print(f" DOM parse failed: {e}")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print(f"\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {BASELINE_SECONDS/elapsed:.1f}x faster! 🚀\n")

        # Save
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to {OUTPUT_PATH}")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always release the browser, even on errors or Ctrl-C.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)


if __name__ == '__main__':
    try:
        reviews = ultra_fast_scrape()
        # Exit status 1 signals that no reviews were collected.
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        traceback.print_exc()
        sys.exit(1)