#!/usr/bin/env python3
"""
Hybrid Parallel Scraper - Best of both worlds.

Strategy:
1. Open browser and get to reviews page (~15s)
2. Scroll quickly to collect ~5-10 continuation tokens (~5s)
3. Make parallel API calls in browser using JavaScript (~2-3s)
4. Total: ~22-25 seconds for 244 reviews

This approach:
- Uses browser's active session (no auth issues)
- Collects tokens sequentially (required by API)
- Makes parallel calls for remaining pages (fast!)
"""

import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)

# Anti-XSSI prefix that the RPC endpoint prepends to its JSON payloads.
_XSSI_PREFIX = ")]}'"

# Number of fast scrolls used to harvest continuation tokens.
_SCROLL_ROUNDS = 8

# Upper bound on the number of tokens fetched concurrently in the browser.
_MAX_PARALLEL_TOKENS = 15

# File the collected reviews are written to.
_OUTPUT_PATH = 'google_reviews_hybrid.json'

_COOKIE_BUTTON_SELECTOR = (
    'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]'
)
_REVIEW_KEYWORDS = ['reviews', 'review', 'reseñas']
_TAB_SELECTORS = ['.LRkQ2', '.hh2c6', 'button[role="tab"]']
_PANE_SELECTORS = [
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.WNBkOb.XiKgde',
]

# JavaScript run through ``execute_async_script``. Selenium appends a
# completion callback as the LAST argument; the script must call it to hand
# its result back to Python, so the promise is resolved into ``done`` rather
# than returned.
_PARALLEL_FETCH_SCRIPT = """
const done = arguments[arguments.length - 1];

async function fetchPages(placeId, tokens) {
    const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';

    const promises = tokens.map((token, idx) => {
        const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
        const params = new URLSearchParams({
            authuser: '0',
            hl: 'es',
            gl: 'es',
            pb: pb
        });

        return fetch(`${baseUrl}?${params}`)
            .then(r => r.text())
            .then(text => {
                const body = text.startsWith(")]}'") ? text.substring(4) : text;
                return {idx, data: JSON.parse(body)};
            })
            .catch(e => null);
    });

    const settled = await Promise.all(promises);
    return settled.filter(r => r !== null);
}

fetchPages(arguments[0], arguments[1]).then(done, () => done([]));
"""


def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty
        (``yaml.safe_load`` yields ``None`` in that case).

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def _review_to_dict(review):
    """Convert a parsed review object into the plain dict that gets saved."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _merge_reviews(all_reviews, parsed):
    """Add reviews from *parsed* to *all_reviews*, keyed by review id.

    Reviews without an id, or whose id is already present, are skipped.
    """
    for review in parsed:
        if review.review_id and review.review_id not in all_reviews:
            all_reviews[review.review_id] = _review_to_dict(review)


def _extract_place_id(current_url):
    """Return the segment following the first ``!1s`` marker in the URL.

    The segment ends at the next ``!``. Returns ``None`` when the marker is
    absent or the segment is empty.
    """
    _, marker, tail = current_url.partition('!1s')
    if not marker:
        return None
    return tail.split('!')[0] or None


def _extract_continuation_token(body):
    """Return the continuation token from a raw response body, if any.

    The body is JSON, optionally prefixed with the anti-XSSI marker. The
    token is taken from index 1 of the top-level list when that element is a
    non-empty string; anything else yields ``None``.
    """
    if not isinstance(body, str):
        return None
    if body.startswith(_XSSI_PREFIX):
        body = body[len(_XSSI_PREFIX):]
    try:
        data = json.loads(body)
    except ValueError:
        return None
    if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
        return data[1] or None
    return None


def _dismiss_cookie_banner(driver):
    """Click the first cookie-consent button, if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR, _COOKIE_BUTTON_SELECTOR
        )
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(1)
    except Exception as exc:
        log.debug(f"Cookie banner not dismissed: {exc}")


def _open_reviews_tab(driver):
    """Click the reviews tab using each known selector in turn (best effort).

    For every selector, the first element whose text or aria-label contains
    a review keyword is clicked. The remaining selectors are still tried
    afterwards, matching the original behaviour.
    """
    for selector in _TAB_SELECTORS:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in _REVIEW_KEYWORDS):
                    # JS click avoids "element not interactable" issues.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(2)
                    break
        except Exception as exc:
            log.debug(f"Tab selector {selector!r} failed: {exc}")
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane element, or ``None`` if not found."""
    for selector in _PANE_SELECTORS:
        try:
            wait = WebDriverWait(driver, 5)
            return wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except Exception as exc:
            log.debug(f"Pane selector {selector!r} failed: {exc}")
            continue
    return None


def _collect_by_scrolling(driver, interceptor, pane, all_reviews):
    """Scroll the pane quickly, harvesting reviews and continuation tokens.

    Reviews parsed from intercepted responses are merged into *all_reviews*
    in place.

    Returns:
        List of unique continuation tokens, in the order they were seen.
    """
    driver.execute_script("window.scrollablePane = arguments[0];", pane)
    scroll_script = (
        "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
    )

    tokens = []
    for _ in range(_SCROLL_ROUNDS):
        driver.execute_script(scroll_script)
        time.sleep(0.2)  # Very fast scrolling

        responses = interceptor.get_intercepted_responses()
        if not responses:
            continue

        _merge_reviews(
            all_reviews, interceptor.parse_reviews_from_responses(responses)
        )

        # Extract continuation token from each raw response.
        for resp in responses:
            try:
                token = _extract_continuation_token(resp.get('body', ''))
            except (AttributeError, TypeError) as exc:
                log.debug(f"Skipping malformed response: {exc}")
                continue
            if token and token not in tokens:
                tokens.append(token)
    return tokens


def _fetch_remaining_in_parallel(driver, interceptor, place_id, tokens,
                                 all_reviews):
    """Fetch the pages behind *tokens* concurrently inside the browser.

    Parsed reviews are merged into *all_reviews* in place. Any failure is
    logged as a warning and leaves *all_reviews* with what it already had.
    """
    try:
        parallel_start = time.time()
        results = driver.execute_async_script(
            _PARALLEL_FETCH_SCRIPT, place_id, tokens[:_MAX_PARALLEL_TOKENS]
        ) or []
        parallel_time = time.time() - parallel_start

        log.info(f"✓ Parallel fetch completed in {parallel_time:.2f}s")
        log.info(f"  Received {len(results)} responses")

        for result in results:
            if result and 'data' in result:
                try:
                    parsed = interceptor._parse_listugcposts_response(
                        result['data']
                    )
                    _merge_reviews(all_reviews, parsed)
                except Exception as e:
                    log.debug(f"Parse error: {e}")

        log.info(f"✓ Total reviews after parallel fetch: {len(all_reviews)}\n")
    except Exception as e:
        log.warning(f"Parallel fetch failed: {e}")


def hybrid_parallel_scrape():
    """Hybrid approach: Sequential token collection + Parallel fetch.

    Reads ``url`` and ``headless`` from the config, opens the page in a
    browser, switches to the reviews tab, collects reviews and continuation
    tokens by scrolling, fetches the remaining pages in parallel from inside
    the browser session, and writes the result to
    ``google_reviews_hybrid.json``.

    Returns:
        List of review dicts. Empty when the URL is not configured, or when
        the reviews pane or place id cannot be determined.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)

    # Fail early with a clear message instead of a TypeError on url[:80].
    if not url:
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("=" * 60)
    log.info("HYBRID PARALLEL SCRAPER")
    log.info("=" * 60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Sequential tokens + Parallel fetch")
    log.info("=" * 60 + "\n")

    start_time = time.time()

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # PHASE 1: Setup (~15s)
        log.info("Phase 1: Browser setup...")
        driver.get(url)
        time.sleep(2)

        _dismiss_cookie_banner(driver)
        _open_reviews_tab(driver)
        time.sleep(3)

        pane = _find_reviews_pane(driver)
        if not pane:
            log.error("Could not find pane")
            return []

        time.sleep(2)

        place_id = _extract_place_id(driver.current_url)
        if not place_id:
            log.error("Could not extract place ID")
            return []

        log.info(f"✓ Setup complete (place_id: {place_id})\n")

        # PHASE 2: Collect tokens via scrolling (~5s)
        log.info("Phase 2: Collecting continuation tokens...")

        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1)

        all_reviews = {}
        tokens = _collect_by_scrolling(driver, interceptor, pane, all_reviews)

        log.info(f"✓ Collected {len(tokens)} continuation tokens")
        log.info(f"✓ Got {len(all_reviews)} reviews from scrolling\n")

        # PHASE 3: Parallel fetch remaining pages (~2-3s)
        if tokens:
            log.info("Phase 3: Parallel fetch of remaining pages...")
            _fetch_remaining_in_parallel(
                driver, interceptor, place_id, tokens, all_reviews
            )

        reviews_list = list(all_reviews.values())
        elapsed = time.time() - start_time

        log.info("=" * 60)
        log.info("✅ HYBRID PARALLEL SCRAPING COMPLETED!")
        log.info("=" * 60)
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info("=" * 60 + "\n")

        # Save
        with open(_OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)

        log.info(f"💾 Saved {len(reviews_list)} reviews to google_reviews_hybrid.json")

        if reviews_list:
            log.info("\n📝 Sample:")
            s = reviews_list[0]
            log.info(f"  {s['author']} - {s['rating']}★ - {s['date_text']}")

        log.info("\n" + "=" * 60)
        log.info("SPEED COMPARISON")
        log.info("=" * 60)
        log.info("Old DOM:         ~155s for 244 reviews (1.0x)")
        log.info("Fast scrolling:  ~29s for 234 reviews (5.3x)")
        log.info(f"Hybrid parallel: ~{elapsed:.0f}s for {len(reviews_list)} reviews ({155/elapsed:.1f}x)! 🚀")
        log.info("=" * 60 + "\n")

        return reviews_list

    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception as exc:
            log.debug(f"driver.quit() failed: {exc}")


if __name__ == '__main__':
    try:
        reviews = hybrid_parallel_scrape()
        # Exit status 0 only when at least one review was collected.
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        log.error(f"Fatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)