#!/usr/bin/env python3
"""
Fast API-First Scraper - Optimized version of start.py

Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Scroll rapidly JUST to trigger API calls (~15 seconds)
3. Collect all API responses during scrolling
4. Parse reviews from API responses
5. Skip DOM parsing entirely
6. Exit immediately

Expected time: ~30-40 seconds for 244 reviews (vs 155 seconds)
Speed improvement: ~4-5x faster!
"""

import sys
import yaml
import logging
import time
import json
from pathlib import Path

from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)

# Defaults for the scroll loop. The original notes estimate ~10 reviews per
# API call, i.e. about 25 calls for 244 reviews.
_DEFAULT_TARGET_REVIEWS = 240
_DEFAULT_MAX_SCROLLS = 30

# Zero-based scroll index on which extra diagnostics are logged.
_DEBUG_SCROLL_INDEX = 5

# Reference duration (seconds) of the older DOM-parsing approach, used only
# for the speed comparison printed at the end.
_BASELINE_SECONDS = 155

_COOKIE_BUTTON_SELECTOR = (
    'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]'
)

# Review keywords for multiple languages.
_REVIEW_KEYWORDS = (
    'reviews', 'review', 'reseñas', 'reseña', 'opiniones',
    'avis', 'bewertungen', 'recensioni', 'avaliações', 'ביקורות',
)

_TAB_SELECTORS = (
    '.LRkQ2',                 # Primary
    '.hh2c6',                 # Alternative
    '[data-tab-index="1"]',   # Tab index
    'button[role="tab"]',     # Button tabs
    'div[role="tab"]',        # Div tabs
)

# Ordered from most specific to most generic.
_PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Primary
    'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Without role="main"
    'div.m6QErb.WNBkOb.XiKgde',                # Alternative class combination
    'div[role="main"] div.m6QErb.XiKgde',      # Simplified with XiKgde
    'div.m6QErb.DxyBCb.XiKgde',                # Another variant
    'div[role="main"] div.m6QErb',             # Simplified version
    'div.m6QErb.DxyBCb',                       # Even more simplified
    'div[role="main"]',                        # Most generic
)


def load_config():
    """Load configuration from config.yaml.

    Returns:
        dict: The parsed top-level mapping. An empty file yields ``{}``.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML value is not a mapping.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty document; callers expect .get().
    if config is None:
        return {}
    if not isinstance(config, dict):
        raise ValueError(
            f"config.yaml must contain a mapping, got {type(config).__name__}"
        )
    return config


def _dismiss_cookie_dialog(driver):
    """Click the first cookie-consent button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(By.CSS_SELECTOR, _COOKIE_BUTTON_SELECTOR)
        if cookie_btns:
            cookie_btns[0].click()
            log.info("✓ Cookie dialog dismissed")
            time.sleep(1)
    except Exception as exc:
        # Failing to dismiss the dialog is not fatal; carry on.
        log.debug(f"Cookie dialog not dismissed: {exc}")


def _is_reviews_tab(tab):
    """Return True if the element's text or aria-label contains a review keyword."""
    text = (tab.text or '').lower()
    aria_label = (tab.get_attribute('aria-label') or '').lower()
    return any(
        keyword in text or keyword in aria_label
        for keyword in _REVIEW_KEYWORDS
    )


def _open_reviews_tab(driver):
    """Find and click the reviews tab; return True if a tab was clicked."""
    for selector in _TAB_SELECTORS:
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, selector)
        except Exception as exc:
            log.debug(f"Tab lookup failed for selector {selector}: {exc}")
            continue

        for tab in tabs:
            try:
                if not _is_reviews_tab(tab):
                    continue
                log.info(f"Found reviews tab with selector {selector}: '{tab.text}'")
                # Scroll into view
                driver.execute_script(
                    "arguments[0].scrollIntoView({block:'center'});", tab
                )
                time.sleep(0.5)
                # Click with JavaScript (most reliable)
                driver.execute_script("arguments[0].click();", tab)
                time.sleep(1.5)
                log.info("✓ Reviews tab clicked")
                return True
            except Exception as exc:
                # Any failure on one candidate: move on to the next one.
                log.debug(f"Skipping tab candidate for selector {selector}: {exc}")
    return False


def _find_reviews_pane(driver):
    """Return the first element matching a pane selector, or None if none match.

    Each selector is waited on for up to 5 seconds.
    """
    for selector in _PANE_SELECTORS:
        log.info(f"Trying selector: {selector}")
        try:
            pane = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except TimeoutException:
            log.debug(f"Pane not found with selector: {selector}")
            continue
        log.info(f"✓ Found reviews pane with: {selector}")
        return pane
    return None


def _save_debug_screenshot(driver):
    """Save a screenshot to pane_not_found.png for debugging (best effort)."""
    try:
        screenshot_path = 'pane_not_found.png'
        driver.save_screenshot(screenshot_path)
        log.info(f"Screenshot saved to {screenshot_path}")
    except Exception as exc:
        log.debug(f"Could not save screenshot: {exc}")


def _setup_interceptor(driver):
    """Create the API interceptor and try to activate it; return the interceptor.

    A setup failure is logged as a warning and does not abort the run.
    """
    interceptor = GoogleMapsAPIInterceptor(driver)
    try:
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        log.info("✓ API interceptor ready - capturing network responses")
    except Exception as e:
        log.warning(f"Failed to setup interceptor: {e}", exc_info=True)
    time.sleep(2)  # Extra wait for interception to be fully active
    return interceptor


def _prepare_scroll_script(driver, pane):
    """Register the pane on ``window`` and return the JS used for each scroll."""
    try:
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = (
            "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
        )
        log.info("✓ Scroll script setup complete")
    except Exception as e:
        log.warning(f"Error setting up scroll script: {e}")
        scroll_script = "window.scrollBy(0, 300);"  # Fallback
    return scroll_script


def _log_interceptor_status(driver):
    """Log the interceptor's browser-side state (best effort)."""
    try:
        is_injected = driver.execute_script(
            "return window.__reviewInterceptorInjected === true;"
        )
        stats = driver.execute_script("return window.__interceptorStats;")
        queue_length = driver.execute_script(
            "return window.__interceptedResponses ? window.__interceptedResponses.length : -1;"
        )
        log.info(
            f"Interceptor status: injected={is_injected}, queue={queue_length}, stats={stats}"
        )
    except Exception as e:
        log.warning(f"Could not check interceptor status: {e}")


def _log_interceptor_console(driver):
    """Log the last 10 browser console entries mentioning 'API Interceptor'."""
    try:
        console_logs = driver.get_log('browser')
        interceptor_logs = [
            entry for entry in console_logs
            if 'API Interceptor' in entry.get('message', '')
        ]
        if interceptor_logs:
            log.info("DEBUG: Interceptor console logs:")
            for entry in interceptor_logs[-10:]:  # Last 10
                log.info(f" {entry['message']}")
        else:
            log.info("DEBUG: No interceptor logs in console")
    except Exception as e:
        log.warning(f"Could not get console logs: {e}")


def _review_to_dict(review):
    """Convert a parsed review object into a JSON-serialisable dict."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_reviews(driver, interceptor, scroll_script, target_reviews, max_scrolls):
    """Scroll repeatedly and gather reviews from intercepted API responses.

    Returns:
        tuple: ``(reviews_by_id, scrolls_done)`` where ``reviews_by_id`` maps
        review id to review dict (insertion-ordered, de-duplicated) and
        ``scrolls_done`` is the number of scrolls performed.
    """
    api_reviews = {}
    scrolls_done = 0

    for index in range(max_scrolls):
        scrolls_done = index + 1
        debug_pass = index == _DEBUG_SCROLL_INDEX

        # Fast scroll
        driver.execute_script(scroll_script)
        time.sleep(0.3)  # Optimal timing - fast but captures all responses

        # Collect API responses
        try:
            responses = interceptor.get_intercepted_responses()

            if debug_pass:
                log.info(f"DEBUG: Got {len(responses)} responses from interceptor")
                _log_interceptor_console(driver)

            if responses:
                parsed = interceptor.parse_reviews_from_responses(responses)

                if debug_pass:
                    log.info(f"DEBUG: Parsed {len(parsed)} reviews from responses")

                for review in parsed:
                    if review.review_id and review.review_id not in api_reviews:
                        api_reviews[review.review_id] = _review_to_dict(review)

                if parsed:
                    log.info(
                        f"Scroll {scrolls_done}: +{len(parsed)} reviews | Total: {len(api_reviews)}"
                    )

                # Exit early if we have enough
                if len(api_reviews) >= target_reviews:
                    log.info(f"\n✓ Reached target of {target_reviews} reviews!")
                    break
        except Exception as e:
            # One failed collection pass must not stop the remaining scrolls.
            log.exception(f"Error collecting API responses: {e}")

        # Quick progress update
        if scrolls_done % 5 == 0:
            log.info(
                f"Progress: {scrolls_done}/{max_scrolls} scrolls, {len(api_reviews)} reviews collected"
            )

    return api_reviews, scrolls_done


def _save_reviews(all_reviews):
    """Write the reviews to google_reviews_fast.json as UTF-8 JSON."""
    output_file = 'google_reviews_fast.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_reviews, f, indent=2, ensure_ascii=False)
    log.info(f"💾 Saved {len(all_reviews)} reviews to {output_file}")


def _log_summary(all_reviews, scrolls_done, elapsed):
    """Log the completion banner with totals and throughput."""
    log.info("\n" + "="*60)
    log.info("✅ FAST SCRAPING COMPLETED!")
    log.info("="*60)
    log.info(f"Total reviews: {len(all_reviews)}")
    log.info(f"Scrolls performed: {scrolls_done}")
    log.info(f"Time elapsed: {elapsed:.2f} seconds")
    if all_reviews and elapsed > 0:
        log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
    log.info("="*60 + "\n")


def _log_sample_and_comparison(all_reviews, elapsed):
    """Log one sample review and the comparison against the old approach."""
    # Show sample
    if all_reviews:
        log.info("\n📝 Sample review:")
        sample = all_reviews[0]
        log.info(f" Author: {sample['author']}")
        log.info(f" Rating: {sample['rating']}★")
        log.info(f" Date: {sample['date_text']}")
        if sample['text']:
            log.info(f" Text: {sample['text'][:80]}...")

    # Stats comparison
    log.info("\n" + "="*60)
    log.info("SPEED COMPARISON")
    log.info("="*60)
    log.info("Old approach: ~155 seconds for 244 reviews")
    log.info(f"Fast approach: ~{elapsed:.0f} seconds for {len(all_reviews)} reviews")
    if elapsed > 0:
        log.info(f"Improvement: {_BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")
    log.info("="*60 + "\n")


def fast_scrape(*, target_reviews=_DEFAULT_TARGET_REVIEWS,
                max_scrolls=_DEFAULT_MAX_SCROLLS):
    """Fast API-first scraping.

    Reads ``url`` and ``headless`` from config.yaml, opens the page in a
    SeleniumBase UC-mode browser, opens the reviews tab, then scrolls the
    reviews pane while collecting reviews from intercepted API responses.
    The DOM is not parsed for review content.

    Args:
        target_reviews: Stop scrolling once this many unique reviews have
            been collected.
        max_scrolls: Upper bound on the number of scrolls performed.

    Returns:
        list[dict]: One dict per unique review with the keys ``review_id``,
        ``author``, ``rating``, ``text``, ``date_text``, ``avatar_url`` and
        ``profile_url``. An empty list is returned when no URL is configured
        or the reviews pane cannot be found.

    Side effects:
        Writes ``google_reviews_fast.json`` on success; may write
        ``pane_not_found.png`` when the pane is missing. The browser is
        always closed before returning.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)

    # Fail before launching a browser if there is nothing to open.
    if not url:
        log.error("No 'url' configured in config.yaml - nothing to scrape")
        return []

    log.info("="*60)
    log.info("FAST API-FIRST SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: API-first (skip DOM parsing)")
    log.info("="*60 + "\n")

    start_time = time.time()

    # Create driver using SeleniumBase UC Mode (like original scraper)
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate to reviews
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        _dismiss_cookie_dialog(driver)

        # Step 2: Click reviews tab
        log.info("Step 2: Opening reviews tab...")
        if not _open_reviews_tab(driver):
            log.warning(
                "Could not find/click reviews tab - may already be on reviews or page structure changed"
            )

        # CRITICAL: Wait after clicking reviews tab for page to load
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Step 3: Find reviews pane
        log.info("Step 3: Finding reviews pane...")
        log.info(f"Current URL: {driver.current_url}")
        pane = _find_reviews_pane(driver)

        if pane is None:
            log.error("Could not find reviews pane after all attempts!")
            log.error(f"Final URL: {driver.current_url}")
            _save_debug_screenshot(driver)
            return []

        # Wait for initial reviews to load
        log.info("Waiting for initial reviews to render...")
        time.sleep(3)

        # Check if any review cards are present (informational only)
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf')
            log.info(f"Found {len(cards)} initial review cards")
        except Exception:
            log.warning("Could not find initial review cards")

        # Step 4: Setup API interceptor (AFTER finding pane)
        log.info("Step 4: Setting up API interception...")
        interceptor = _setup_interceptor(driver)
        log.info("")

        # Step 5: Rapid scrolling to trigger API calls
        log.info("="*60)
        log.info("Step 5: Rapid scrolling to trigger API calls")
        log.info("="*60)

        scroll_script = _prepare_scroll_script(driver, pane)
        _log_interceptor_status(driver)

        # Trigger initial API call
        log.info("Triggering initial API call...")
        driver.execute_script(scroll_script)
        time.sleep(2)  # Wait for first API response
        log.info("")

        # Scroll rapidly - no DOM parsing!
        api_reviews, scrolls_done = _collect_reviews(
            driver, interceptor, scroll_script, target_reviews, max_scrolls
        )

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        _log_summary(all_reviews, scrolls_done, elapsed)
        _save_reviews(all_reviews)
        _log_sample_and_comparison(all_reviews, elapsed)

        return all_reviews

    finally:
        # Always close the driver
        try:
            driver.quit()
        except Exception as exc:
            log.debug(f"Error while closing driver: {exc}")


if __name__ == '__main__':
    try:
        reviews = fast_scrape()
        # Exit status 0 only when at least one review was collected.
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        log.exception(f"Fatal error: {e}")
        sys.exit(1)