#!/usr/bin/env python3
"""
Parallel API Scraper - Capture session, then parallel API calls.

Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Capture cookies and place ID from active session (~2 seconds)
3. Make parallel API calls using requests (~5-10 seconds)
4. Close browser immediately

Expected time: ~20-30 seconds for 244 reviews (vs 155 seconds)
Speed improvement: ~5-7x faster!
"""

import sys
import yaml
import logging
import time
import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from seleniumbase import Driver
from selenium.webdriver.common.by import By

from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)


def load_config():
    """Load configuration from ``config.yaml`` in the current directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file; this is ``None``
        when the file is empty, so callers must handle that case.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _extract_place_id(url):
    """Return the text between the first ``!1s`` marker and the next ``!``.

    Returns ``None`` when the marker is absent or the extracted text is empty.
    """
    _, marker, tail = url.partition('!1s')
    if not marker:
        return None
    return tail.split('!')[0] or None


def _review_to_dict(review):
    """Convert a parsed review object into the dict that is written to JSON."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _merge_reviews(reviews, seen_ids, all_reviews):
    """Append reviews not seen before to ``all_reviews``; return how many were new.

    A review is identified by ``review_id``, falling back to
    ``"<author>_<date_text>"`` when the id is empty. ``seen_ids`` and
    ``all_reviews`` are updated in place.
    """
    new_count = 0
    for review in reviews:
        rid = review.review_id or f"{review.author}_{review.date_text}"
        if rid in seen_ids:
            continue
        seen_ids.add(rid)
        all_reviews.append(_review_to_dict(review))
        new_count += 1
    return new_count


def capture_session(url: str, headless: bool = False):
    """
    Capture cookies and place ID from browser session.

    Opens the URL in a browser, tries to dismiss the cookie dialog and open
    the reviews tab, then copies the browser's cookies and user agent into a
    ``requests.Session``. The browser is always closed before returning.

    Args:
        url: Page to open in the browser.
        headless: Passed through to the browser driver.

    Returns:
        ``(session, place_id, interceptor)`` on success, or
        ``(None, None, None)`` when no place ID can be found in the URL the
        browser ends up on.
    """
    log.info("="*60)
    log.info("STEP 1: Capturing session from browser")
    log.info("="*60)

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate to place
        log.info("Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort: the dialog is not always shown)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
            )
            if cookie_btns:
                cookie_btns[0].click()
                log.info("✓ Cookie dialog dismissed")
                time.sleep(1)
        except Exception as exc:
            # Deliberately non-fatal, but do not swallow KeyboardInterrupt.
            log.debug(f"Cookie dialog not dismissed: {exc}")

        # Click reviews tab
        log.info("Opening reviews tab...")
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
        clicked = False

        for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    text = (tab.text or '').lower()
                    aria_label = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria_label for kw in review_keywords):
                        # JS click works even when the element is covered.
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(2)
                        log.info("✓ Reviews tab clicked")
                        clicked = True
                        break
                if clicked:
                    break
            except Exception as exc:
                # Try the next selector; a failure here is not fatal.
                log.debug(f"Selector {selector!r} failed: {exc}")
                continue

        # Wait for reviews to load
        time.sleep(3)

        # Extract place ID from URL
        place_id = _extract_place_id(driver.current_url)
        if not place_id:
            log.error("Could not extract place ID from URL")
            return None, None, None
        log.info(f"✓ Extracted place ID: {place_id}")

        # Capture ALL cookies using CDP
        log.info("Capturing cookies via CDP...")
        cdp_cookies = driver.execute_cdp_cmd('Network.getAllCookies', {})
        browser_cookies = cdp_cookies.get('cookies', [])
        log.info(f"✓ Captured {len(browser_cookies)} cookies")

        # Get user agent
        user_agent = driver.execute_script("return navigator.userAgent")

        # Create session with cookies
        session = requests.Session()
        for cookie in browser_cookies:
            session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/'),
            )

        # Set headers
        session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
        })

        # Create interceptor for parsing
        interceptor = GoogleMapsAPIInterceptor(None)

        log.info("✓ Session captured successfully\n")
        return session, place_id, interceptor

    finally:
        # Close browser immediately - we don't need it anymore!
        try:
            driver.quit()
            log.info("✓ Browser closed\n")
        except Exception as exc:
            log.debug(f"Browser did not close cleanly: {exc}")


def fetch_reviews_page(session, place_id, interceptor, continuation_token=None):
    """Fetch a single page of reviews via API.

    Args:
        session: ``requests.Session`` carrying the captured cookies/headers.
        place_id: Place identifier inserted into the ``pb`` query parameter.
        interceptor: Object whose ``_parse_listugcposts_response`` turns the
            decoded JSON into review objects.
        continuation_token: Token of the page to fetch; ``None`` requests the
            first page.

    Returns:
        ``(reviews, next_token)``. On a non-200 status or any exception the
        error is logged and ``([], None)`` is returned instead of raising.
    """
    # The two request shapes differ only by the "!2s<token>" segment.
    token_part = f"!2s{continuation_token}" if continuation_token else ""
    pb = (
        f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}"
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )

    params = {
        'authuser': '0',
        'hl': 'es',
        'gl': 'es',
        'pb': pb
    }

    try:
        url = 'https://www.google.com/maps/rpc/listugcposts'
        response = session.get(url, params=params, timeout=10)

        if response.status_code != 200:
            log.error(f"API error {response.status_code}")
            return [], None

        body = response.text
        # Strip the ")]}'" prefix that precedes the JSON payload.
        if body.startswith(")]}'"):
            body = body[4:].strip()

        data = json.loads(body)
        reviews = interceptor._parse_listugcposts_response(data)

        # Get next token
        next_token = None
        if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
            next_token = data[1]

        return reviews, next_token

    except Exception as e:
        log.error(f"Request failed: {e}")
        return [], None


def scrape_all_parallel(session, place_id, interceptor, max_workers=5):
    """
    Main scraping method with parallel API calls.

    Fetches the first page, follows up to four continuation tokens
    sequentially, then requests every collected token through a thread pool.
    Reviews are de-duplicated across all phases.

    Args:
        session: ``requests.Session`` used for every request.
        place_id: Place identifier passed to ``fetch_reviews_page``.
        interceptor: Response parser passed to ``fetch_reviews_page``.
        max_workers: Size of the thread pool.

    Returns:
        List of review dicts (see ``_review_to_dict``).
    """
    log.info("="*60)
    log.info("STEP 2: Parallel API scraping")
    log.info("="*60)

    start_time = time.time()
    all_reviews = []
    seen_ids = set()

    # Fetch first page to get continuation token
    log.info("Fetching first page...")
    reviews, token = fetch_reviews_page(session, place_id, interceptor, None)
    _merge_reviews(reviews, seen_ids, all_reviews)

    log.info(f"  → {len(reviews)} reviews | Total: {len(all_reviews)}")

    if not token:
        log.info("No continuation token - only one page of reviews")
        return all_reviews

    # Collect continuation tokens by fetching a few sequential pages
    # (We need to do this sequentially to get the tokens)
    tokens = [token]
    log.info("Collecting continuation tokens...")

    for _ in range(4):  # Get 5 total tokens
        reviews, next_token = fetch_reviews_page(session, place_id, interceptor, token)
        # Keep this page's reviews even when it turns out to be the last one.
        _merge_reviews(reviews, seen_ids, all_reviews)
        if not next_token:
            break
        tokens.append(next_token)
        token = next_token

    log.info(f"Collected {len(tokens)} tokens, {len(all_reviews)} reviews so far")
    log.info(f"Starting parallel fetch with {max_workers} workers...\n")

    # Now fetch remaining pages in parallel
    # NOTE(review): every token in `tokens` except possibly the last one was
    # already requested by the loop above, so this pool mostly re-downloads
    # pages that are then de-duplicated. It yields at most one new page and
    # the scrape ends after six pages in total - confirm whether the
    # continuation chain should be followed to exhaustion instead.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(fetch_reviews_page, session, place_id, interceptor, tok)
            for tok in tokens
        ]

        for done, future in enumerate(as_completed(futures), start=1):
            try:
                reviews, _ = future.result()
                new_count = _merge_reviews(reviews, seen_ids, all_reviews)
                log.info(f"  Completed {done}/{len(futures)}: +{new_count} new reviews | Total: {len(all_reviews)}")
            except Exception as e:
                log.error(f"  Error in parallel fetch: {e}")

    elapsed = time.time() - start_time
    # Guard against a zero interval on coarse clocks.
    rate = len(all_reviews) / elapsed if elapsed > 0 else 0.0

    log.info(f"\n{'='*60}")
    log.info("✅ PARALLEL SCRAPING COMPLETED!")
    log.info(f"{'='*60}")
    log.info(f"Total reviews: {len(all_reviews)}")
    log.info(f"Parallel workers: {max_workers}")
    log.info(f"API time: {elapsed:.2f} seconds")
    log.info(f"Speed: {rate:.1f} reviews/sec")
    log.info(f"{'='*60}\n")

    return all_reviews


def main():
    """Main entry point.

    Reads ``url`` and ``headless`` from the config, captures a browser
    session, scrapes reviews and writes them to
    ``google_reviews_parallel.json``.

    Returns:
        The list of review dicts; ``[]`` when the config has no URL or the
        session could not be captured.
    """
    # An empty config.yaml makes safe_load return None.
    config = load_config() or {}

    url = config.get('url')
    headless = config.get('headless', False)

    if not url:
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("PARALLEL API SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Parallel API calls (no scrolling)")
    log.info("="*60 + "\n")

    total_start = time.time()

    # Step 1: Capture session from browser
    session, place_id, interceptor = capture_session(url, headless)

    if not session or not place_id:
        log.error("Failed to capture session")
        return []

    # Step 2: Parallel API scraping
    reviews = scrape_all_parallel(session, place_id, interceptor, max_workers=5)

    total_elapsed = time.time() - total_start

    # Save results
    output_file = 'google_reviews_parallel.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info(f"💾 Saved {len(reviews)} reviews to {output_file}")

    # Show sample
    if reviews:
        log.info("\n📝 Sample review:")
        sample = reviews[0]
        log.info(f"  Author: {sample['author']}")
        log.info(f"  Rating: {sample['rating']}★")
        log.info(f"  Date: {sample['date_text']}")
        if sample['text']:
            log.info(f"  Text: {sample['text'][:80]}...")

    # Stats comparison
    speedup = 155 / total_elapsed if total_elapsed > 0 else float('inf')
    log.info("\n" + "="*60)
    log.info("SPEED COMPARISON")
    log.info("="*60)
    log.info("Old DOM scraping:      ~155 seconds for 244 reviews")
    log.info("Fast API scrolling:    ~43 seconds for 234 reviews (3.6x faster)")
    log.info(f"Parallel API calls:    ~{total_elapsed:.0f} seconds for {len(reviews)} reviews ({speedup:.1f}x faster!) 🚀")
    log.info("="*60 + "\n")

    return reviews


if __name__ == '__main__':
    try:
        reviews = main()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        log.error(f"Fatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)