#!/usr/bin/env python3
"""
Fast API scraper - Minimal browser usage, maximum API speed.

Strategy:
1. Start browser and navigate to reviews page
2. Capture cookies and user-agent from browser
3. Let one API call happen naturally (to warm up the session)
4. Close browser
5. Use requests library with captured session to make fast API calls
6. Paginate through all reviews without any scrolling

Expected: 10-25x faster than traditional scrolling approach.
"""

import json
import logging
import time
from typing import List, Optional, Tuple

import requests
from seleniumbase import SB

from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)


class FastAPIScraper:
    """Scrape reviews by bootstrapping a session in a browser, then paging via HTTP."""

    def __init__(self, url: str):
        """
        Store the target URL and create the HTTP session used for API calls.

        Args:
            url: Google Maps place URL that the browser opens during bootstrap.
        """
        self.url = url
        self.session = requests.Session()
        # Filled in by bootstrap_session(); stays None until a place ID is found.
        self.place_id = None
        # Only used for its response parser (see fetch_reviews_page).
        self.interceptor = GoogleMapsAPIInterceptor(None)

    @staticmethod
    def _extract_place_id(url: str) -> Optional[str]:
        """Return the text between the first '!1s' marker and the next '!', or None."""
        _, marker, tail = url.partition('!1s')
        if not marker:
            return None
        # An empty segment is treated the same as a missing one.
        return tail.split('!')[0] or None

    def bootstrap_session(self) -> bool:
        """
        Open the page in a browser, copy its cookies and user agent, then close it.

        The cookies and headers are copied onto ``self.session`` and the place ID
        is stored on ``self.place_id`` when one can be found in the browser URL
        (or, failing that, in the URL passed to the constructor).

        Returns:
            True when the browser steps ran to completion, False when any
            unexpected exception was raised (the exception is logged).
            A missing place ID is logged as a warning but does not change the
            return value.
        """
        log.info("Bootstrapping session with minimal browser usage...")

        try:
            with SB(uc=True, headless=False) as sb:
                # Navigate
                log.info("Opening Google Maps...")
                sb.open(self.url)
                sb.sleep(2)

                # Dismiss cookies (best effort: the banner is not always shown).
                # Catch Exception rather than everything so Ctrl-C still works.
                try:
                    sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
                except Exception as e:
                    log.debug(f"Cookie banner not dismissed: {e}")

                # Click reviews
                try:
                    sb.click('.LRkQ2', timeout=5)
                    log.info("✓ Opened reviews tab")
                    sb.sleep(2)
                except Exception:
                    log.warning("Could not click reviews tab")

                # Wait a bit to ensure page is loaded
                sb.sleep(1)

                # Extract place ID from the browser URL, falling back to the
                # URL we were given in case the browser URL lacks the marker.
                current_url = sb.get_current_url()
                place_id = (
                    self._extract_place_id(current_url)
                    or self._extract_place_id(self.url)
                )
                if place_id:
                    self.place_id = place_id
                    log.info(f"✓ Extracted place ID: {self.place_id}")
                else:
                    log.warning("Could not extract place ID from URL")

                # Get cookies from browser - do this while browser is still active
                try:
                    browser_cookies = sb.driver.get_cookies()
                    log.debug(f"Got {len(browser_cookies)} cookies")
                except Exception as e:
                    log.warning(f"Could not get cookies: {e}")
                    browser_cookies = []

                # Get user agent - do this while browser is still active
                try:
                    user_agent = sb.execute_script("return navigator.userAgent")
                    log.debug(f"User agent: {user_agent[:50]}...")
                except Exception as e:
                    log.warning(f"Could not get user agent: {e}")
                    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'

                # Copy the browser cookies onto the requests session.
                for cookie in browser_cookies:
                    try:
                        self.session.cookies.set(
                            name=cookie['name'],
                            value=cookie['value'],
                            domain=cookie.get('domain', '.google.com'),
                            path=cookie.get('path', '/'),
                        )
                    except Exception as e:
                        log.debug(f"Could not set cookie {cookie.get('name')}: {e}")

                # Set headers
                self.session.headers.update({
                    'User-Agent': user_agent,
                    'Accept': '*/*',
                    'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
                    'Referer': 'https://www.google.com/maps/',
                    'Origin': 'https://www.google.com',
                    'X-Requested-With': 'XMLHttpRequest',
                })

                log.info("✅ Session bootstrapped!")
                log.info(f" Cookies: {len(browser_cookies)}")
                log.info(f" Place ID: {self.place_id}")

                # Let browser stay open for a moment to ensure all operations complete
                sb.sleep(1)

            return True

        except Exception as e:
            # log.exception appends the traceback to the log record.
            log.exception(f"Bootstrap failed: {e}")
            return False

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
        """
        Fetch one page of reviews from the ``listugcposts`` endpoint.

        Args:
            continuation_token: Token returned with the previous page, or None
                to request the first page.

        Returns:
            ``(reviews, next_token)``. ``next_token`` is ``data[1]`` of the
            decoded response when that element is a string, else None.
            ``([], None)`` is returned when no place ID is known, when the
            HTTP status is not 200, or when the request/parsing raises.
        """
        if not self.place_id:
            # Without this guard the request would be built with the literal
            # text "None" in place of the place ID.
            log.error("No place ID available; run bootstrap_session() first")
            return [], None

        # Build pb parameter. The two variants differ only by the optional
        # "!2s<token>" segment.
        # NOTE(review): the "!1sByJsaaTKLK-bi-gPiqKAiQE" segment is hard-coded;
        # presumably captured from a real browser request - confirm it does not expire.
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        pb = (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
            "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )

        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': pb,
        }

        try:
            url = 'https://www.google.com/maps/rpc/listugcposts'
            response = self.session.get(url, params=params, timeout=10)

            if response.status_code != 200:
                log.error(f"API error {response.status_code}")
                log.error(f"Response: {response.text[:300]}")
                return [], None

            # Strip the )]}' prefix, if present, before decoding the JSON.
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Next token
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            return reviews, next_token

        except Exception as e:
            log.error(f"Request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 100) -> List[dict]:
        """
        Bootstrap the session, then page through the API collecting reviews.

        Paging stops when ``max_pages`` pages have been requested, a page
        returns no reviews, or a page returns no continuation token.

        Args:
            max_pages: Upper bound on the number of pages requested.

        Returns:
            De-duplicated reviews as plain dicts (keys: review_id, author,
            rating, text, date_text, avatar_url). Empty if bootstrap fails.
        """
        # Bootstrap
        if not self.bootstrap_session():
            return []

        # Scrape via API
        log.info("\n" + "="*60)
        log.info("STARTING FAST API SCRAPING")
        log.info("="*60 + "\n")

        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        token = None
        page = 0

        while page < max_pages:
            page += 1
            log.info(f"Fetching page {page}...")

            reviews, token = self.fetch_reviews_page(token)

            if not reviews:
                log.info("No more reviews")
                break

            # Dedup: fall back to author + date when the review has no ID.
            for review in reviews:
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid not in seen_ids:
                    seen_ids.add(rid)
                    all_reviews.append({
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                    })

            log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                break

            time.sleep(0.2)  # Small delay

        elapsed = time.time() - start_time
        # Guard the division: the clock may not have advanced if the first
        # request failed immediately.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info("\n" + "="*60)
        log.info("✅ FAST API SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Reviews: {len(all_reviews)}")
        log.info(f"Pages: {page}")
        log.info(f"Time: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/sec")
        log.info("="*60 + "\n")

        return all_reviews


def main():
    """Scrape a hard-coded place URL and write the reviews to fast_api_reviews.json."""
    url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

    scraper = FastAPIScraper(url)
    reviews = scraper.scrape_all(max_pages=50)

    # Save
    with open('fast_api_reviews.json', 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info("Saved to fast_api_reviews.json")


if __name__ == '__main__':
    main()