#!/usr/bin/env python3
"""
Parallel API Scraper V2 - Fetch review pages with the browser's fetch API.

Strategy:
1. Open the browser and navigate to the place's reviews tab
2. Read the place ID from the resulting page URL
3. Run fetch() calls against the listugcposts endpoint from inside the page
4. Collect all responses at once and parse them in Python

NOTE: every page after the first is requested with the continuation token
returned by the previous response, so the page requests are issued one after
another rather than concurrently.
"""

import sys
import yaml
import logging
import time
import json
from pathlib import Path

from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)

# Upper bound (seconds) for the in-page fetch loop to call back to WebDriver.
_SCRIPT_TIMEOUT_SECONDS = 120

# Keywords (English / Spanish) that identify the reviews tab by text or label.
_REVIEW_KEYWORDS = ('reviews', 'review', 'reseñas', 'reseña', 'opiniones')

# Candidate CSS selectors for the tab strip, tried in order.
_TAB_SELECTORS = ('.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]')

# Candidate CSS selectors for the scrollable reviews pane, tried in order.
_PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.WNBkOb.XiKgde',
)

# JavaScript executed through execute_async_script.
#   arguments[0]  -> place ID
#   arguments[1]  -> maximum number of pages to request
#   last argument -> completion callback appended by WebDriver; the script
#                    MUST call it, otherwise the Python call only returns by
#                    timing out.
_FETCH_REVIEWS_SCRIPT = """
const placeId = arguments[0];
const numPages = arguments[1];
const done = arguments[arguments.length - 1];

async function fetchReviewPages(placeId, numPages) {
    const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
    const maxConsecutiveFailures = 3;
    const results = [];
    let token = null;
    let failures = 0;

    console.log('[Review Fetch] Fetching up to', numPages, 'pages');

    // Each request needs the continuation token from the previous response,
    // so the pages are fetched one after another.
    for (let i = 0; i < numPages; i++) {
        // The first page carries no token; later pages add `!2s<token>`.
        const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10`
            + (token ? `!2s${token}` : '')
            + `!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;

        const params = new URLSearchParams({
            authuser: '0',
            hl: 'es',
            gl: 'es',
            pb: pb
        });

        try {
            const response = await fetch(`${baseUrl}?${params}`);
            const text = await response.text();
            // Drop the leading )]}' marker (when present) before parsing JSON.
            const body = text.startsWith(")]}'") ? text.substring(4) : text;
            const data = JSON.parse(body);

            results.push({index: i, data: data});
            failures = 0;

            // A non-empty string at data[1] is the token for the next page.
            if (data && data.length > 1 && typeof data[1] === 'string' && data[1]) {
                token = data[1];
            } else {
                break;  // No more pages
            }
        } catch (e) {
            console.error('[Review Fetch] Error fetching page', i, e);
            // The next iteration retries with the same token; stop once the
            // endpoint keeps failing instead of hammering it.
            failures += 1;
            if (failures >= maxConsecutiveFailures) {
                break;
            }
        }
    }

    console.log('[Review Fetch] Completed! Total responses:', results.length);
    return results;
}

fetchReviewPages(placeId, numPages)
    .then(done)
    .catch(err => done({error: String(err)}));
"""


def load_config():
    """Load configuration from config.yaml.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def _dismiss_cookie_dialog(driver):
    """Click the first Accept/Aceptar cookie button, if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
        )
        if cookie_btns:
            cookie_btns[0].click()
            log.info("✓ Cookie dialog dismissed")
            time.sleep(1)
    except Exception as e:
        # The dialog is optional, so a failure here must not abort the run.
        log.debug(f"Cookie dialog not dismissed: {e}")


def _open_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews; return True if clicked."""
    for selector in _TAB_SELECTORS:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria_label = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria_label for kw in _REVIEW_KEYWORDS):
                    # JS click avoids "element not interactable" on overlays.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(2)
                    log.info("✓ Reviews tab clicked")
                    return True
        except Exception as e:
            # Any lookup/click failure: fall through to the next selector.
            log.debug(f"Tab selector {selector!r} failed: {e}")
    return False


def _find_reviews_pane(driver):
    """Return the first reviews-pane element located by a known selector, or None."""
    for selector in _PANE_SELECTORS:
        try:
            pane = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except TimeoutException:
            continue
        log.info(f"✓ Found reviews pane with: {selector}")
        return pane
    return None


def _extract_place_id(current_url):
    """Return the text between the first '!1s' and the next '!' in the URL, or None."""
    _, marker, tail = current_url.partition('!1s')
    if not marker:
        return None
    return tail.split('!')[0] or None


def _collect_reviews(results):
    """Parse raw API responses into a list of review dicts, de-duplicated by review_id."""
    interceptor = GoogleMapsAPIInterceptor(None)
    all_reviews = {}

    for result in results:
        if not (result and 'data' in result):
            continue
        try:
            parsed = interceptor._parse_listugcposts_response(result['data'])
            for review in parsed:
                if review.review_id and review.review_id not in all_reviews:
                    all_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
        except Exception as e:
            # One malformed response should not discard the other pages.
            log.warning(f"Error parsing response: {e}")

    return list(all_reviews.values())


def parallel_scrape(max_pages=25, output_file='google_reviews_parallel.json'):
    """Scrape reviews by calling the reviews API from inside the browser page.

    Reads ``url`` and ``headless`` from config.yaml, opens the place page,
    switches to the reviews tab, extracts the place ID from the URL, fetches
    up to ``max_pages`` API pages with the browser's fetch(), parses them and
    writes the de-duplicated reviews to ``output_file`` as JSON.

    Args:
        max_pages: Maximum number of API pages to request.
        output_file: Path of the JSON file the reviews are written to.

    Returns:
        A list of review dicts; empty when the URL is missing, the reviews
        pane or place ID cannot be found, or the in-page fetch fails.
    """
    config = load_config()

    url = config.get('url')
    if not url:
        log.error("No 'url' configured in config.yaml")
        return []
    headless = config.get('headless', False)

    log.info("="*60)
    log.info("PARALLEL API SCRAPER V2")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Browser fetch calls")
    log.info("="*60 + "\n")

    start_time = time.time()

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate and setup
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        _dismiss_cookie_dialog(driver)

        # Step 2: Click reviews tab
        log.info("Step 2: Opening reviews tab...")
        if not _open_reviews_tab(driver):
            log.warning("Reviews tab not found; continuing with current view")

        # Wait for reviews to load
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Step 3: Find reviews pane
        log.info("Step 3: Finding reviews pane...")
        if _find_reviews_pane(driver) is None:
            log.error("Could not find reviews pane")
            return []

        # Wait for initial reviews
        time.sleep(2)

        # Extract place ID from URL
        place_id = _extract_place_id(driver.current_url)
        if not place_id:
            log.error("Could not extract place ID from URL")
            return []
        log.info(f"✓ Extracted place ID: {place_id}")

        # Step 4: Fetch API pages using the browser's fetch
        log.info("\n" + "="*60)
        log.info("Step 4: Fetching review pages via browser fetch")
        log.info("="*60)
        log.info(f"Fetching up to {max_pages} pages via browser fetch...")

        api_start = time.time()
        try:
            # The default async-script timeout may be shorter than the time
            # needed to page through all results, so set it explicitly.
            driver.set_script_timeout(_SCRIPT_TIMEOUT_SECONDS)
            results = driver.execute_async_script(
                _FETCH_REVIEWS_SCRIPT, place_id, max_pages
            )
        except Exception as e:
            log.error(f"Browser fetch failed: {e}")
            return []
        api_elapsed = time.time() - api_start

        # The script reports an unexpected failure as {"error": "..."}.
        if isinstance(results, dict) and 'error' in results:
            log.error(f"Browser fetch failed: {results['error']}")
            return []
        results = results or []

        log.info(f"✓ Fetch completed in {api_elapsed:.2f} seconds")
        log.info(f"  Received {len(results)} API responses")

        # Step 5: Parse results
        log.info("\nStep 5: Parsing reviews from API responses...")
        reviews_list = _collect_reviews(results)

        elapsed = time.time() - start_time

        log.info(f"\n{'='*60}")
        log.info("✅ PARALLEL SCRAPING COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"API responses: {len(results)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f"  - Setup: {api_start - start_time:.2f}s")
        log.info(f"  - API fetch: {api_elapsed:.2f}s")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info(f"{'='*60}\n")

        # Save results
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)

        log.info(f"💾 Saved {len(reviews_list)} reviews to {output_file}")

        # Show sample
        if reviews_list:
            log.info("\n📝 Sample review:")
            sample = reviews_list[0]
            log.info(f"  Author: {sample['author']}")
            log.info(f"  Rating: {sample['rating']}★")
            log.info(f"  Date: {sample['date_text']}")
            if sample['text']:
                log.info(f"  Text: {sample['text'][:80]}...")

        # Stats comparison (the 155 s / 43 s baselines are fixed figures
        # carried over from earlier runs of the other scrapers).
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old DOM scraping:       ~155 seconds for 244 reviews (1.0x)")
        log.info("Fast API scrolling:     ~43 seconds for 234 reviews (3.6x faster)")
        log.info(f"Parallel browser fetch: ~{elapsed:.0f} seconds for {len(reviews_list)} reviews ({155/elapsed:.1f}x faster!) 🚀")
        log.info("="*60 + "\n")

        return reviews_list

    finally:
        # Always release the browser, but never mask the real outcome.
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"driver.quit() failed: {e}")


if __name__ == '__main__':
    try:
        reviews = parallel_scrape()
        # Exit status 0 only when at least one review was scraped.
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        log.error(f"Fatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)