#!/usr/bin/env python3
"""
Hybrid API scraper - capture a session from the browser, then use direct API calls.

This combines the best of both worlds:
1. Browser establishes an authentic session with Google
2. We capture the headers the page sets on its real XHR/fetch requests
3. Replay those headers (plus the browser cookies) in direct API calls
4. No scrolling needed - just fast API pagination

Expected speed: 10-25x faster than traditional browser scrolling.
"""

import json
import logging
import time
import urllib.parse
from typing import List, Optional, Tuple, Dict

import requests
from seleniumbase import SB

from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)

# Horizontal rule used in the summary banners.
_RULE = '=' * 60

# JavaScript injected into the page to record requests to the reviews endpoint.
#
# XHR: setRequestHeader() may only be called *after* open(), so the headers
# cannot be snapshotted inside open(). Instead open() pushes a record holding a
# live header object, which later setRequestHeader() calls fill in. Python reads
# the records afterwards, when the headers are complete.
#
# fetch: the first argument may be a string, a URL or a Request object; headers
# may come from the init dict or from the Request itself. Capture errors are
# swallowed so the page's own request is never broken by the hook.
_CAPTURE_SCRIPT = """
window.__capturedRequests = [];

// Capture fetch
const originalFetch = window.fetch;
window.fetch = function(...args) {
    try {
        const input = args[0];
        const init = args[1] || {};
        const isRequest = typeof Request !== 'undefined' && input instanceof Request;
        const url = isRequest ? input.url : String(input);
        if (url.includes('listugcposts')) {
            console.log('[CAPTURE] Fetch to:', url);
            const headers = {};
            const source = init.headers || (isRequest ? input.headers : null);
            if (source) {
                new Headers(source).forEach(function(value, name) {
                    headers[name.toLowerCase()] = value;
                });
            }
            window.__capturedRequests.push({
                url: url,
                method: init.method || (isRequest ? input.method : 'GET'),
                headers: headers
            });
        }
    } catch (err) {
        console.log('[CAPTURE] Fetch capture failed:', err);
    }
    return originalFetch.apply(this, args);
};

// Capture XHR (more reliable for headers)
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
    const xhr = new originalXHR();
    const originalOpen = xhr.open;
    const originalSetRequestHeader = xhr.setRequestHeader;
    let headers = {};

    xhr.setRequestHeader = function(name, value) {
        headers[name.toLowerCase()] = value;
        return originalSetRequestHeader.apply(this, arguments);
    };

    xhr.open = function(method, url, ...rest) {
        // Fresh header object per request so a reused XHR does not leak
        // headers from its previous request into the next record.
        headers = {};
        const target = String(url);
        if (target.includes('listugcposts')) {
            console.log('[CAPTURE] XHR to:', target);
            window.__capturedRequests.push({
                url: target,
                method: method,
                headers: headers
            });
        }
        return originalOpen.apply(this, [method, url, ...rest]);
    };

    return xhr;
};

console.log('[CAPTURE] Request capture initialized');
"""


class HybridAPIScraper:
    """
    Capture session from browser, then scrape via direct API calls.
    """

    def __init__(self, url: str, headless: bool = False, *,
                 hl: str = 'es', gl: str = 'es'):
        """
        Initialize the hybrid scraper.

        Args:
            url: Google Maps place URL
            headless: Run browser in headless mode
            hl: Value sent as the ``hl`` query parameter of the API call
            gl: Value sent as the ``gl`` query parameter of the API call
        """
        self.url = url
        self.headless = headless
        self.hl = hl
        self.gl = gl
        self.captured_headers = None
        self.place_id = None
        self.session = requests.Session()

        # Initialize parser
        self.interceptor = GoogleMapsAPIInterceptor(None)

    @staticmethod
    def _extract_place_id(page_url: str, request_url: str) -> Optional[str]:
        """Return the place identifier found in the URLs, or None.

        Lookup order:
        1. a ``place_id:`` marker in the page URL;
        2. the ``!1s`` field of the captured request's ``pb`` query parameter
           (decoded with ``parse_qs`` so a percent-encoded ``!`` still matches);
        3. the first ``!1s`` field of the page URL itself.
        """
        if 'place_id:' in page_url:
            found = page_url.split('place_id:')[1].split('&')[0].split('/')[0]
            return found or None

        query = urllib.parse.urlparse(request_url).query
        pb = urllib.parse.parse_qs(query).get('pb', [''])[0]
        if '!1s' in pb:
            found = pb.split('!1s')[1].split('!')[0]
            if found:
                return found

        # NOTE(review): assumes the first "!1s" field of the place URL is the
        # same identifier the API expects in pb - confirm against live traffic.
        decoded_page_url = urllib.parse.unquote(page_url)
        if '!1s' in decoded_page_url:
            found = decoded_page_url.split('!1s')[1].split('!')[0]
            return found or None

        return None

    def capture_session_from_browser(self) -> bool:
        """
        Start a browser session, capture headers from actual API requests.

        On success ``self.place_id`` and ``self.captured_headers`` are set and
        the browser cookies are copied into ``self.session``.

        Returns:
            True if session captured successfully, False otherwise (no request
            captured, no place ID found, or any browser error).
        """
        log.info("Starting browser to capture session headers...")

        try:
            with SB(uc=True, headless=self.headless) as sb:
                # Navigate to the place
                log.info(f"Navigating to: {self.url[:80]}...")
                sb.open(self.url)
                sb.sleep(3)

                # Dismiss cookie consent (best effort: the dialog may not appear)
                try:
                    sb.click(
                        'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
                        timeout=5,
                    )
                    log.info("Cookie dialog dismissed")
                except Exception:
                    log.debug("No cookie dialog dismissed")

                # Click reviews tab
                log.info("Opening reviews...")
                try:
                    sb.click('.LRkQ2', timeout=5)
                    sb.sleep(3)
                except Exception:
                    log.warning("Could not click reviews tab")

                # Enable Chrome DevTools Protocol for network monitoring
                log.info("Enabling network interception...")
                sb.driver.execute_cdp_cmd('Network.enable', {})

                # Inject JS to capture fetch/XHR requests with headers
                sb.execute_script(_CAPTURE_SCRIPT)

                # Scroll to trigger an API call
                log.info("Scrolling to trigger API request...")
                captured_requests = []
                for _ in range(5):
                    sb.execute_script("window.scrollBy(0, 800)")
                    sb.sleep(1.5)

                    # Check captured requests
                    captured_requests = sb.execute_script(
                        "return window.__capturedRequests || []"
                    ) or []
                    if captured_requests:
                        log.info(f"✓ Captured {len(captured_requests)} API request(s)!")
                        break

                if not captured_requests:
                    log.error("Failed to capture API request")
                    return False

                # Prefer a request that actually carries headers; fall back to
                # the first one captured.
                captured_request = next(
                    (req for req in captured_requests if req.get('headers')),
                    captured_requests[0],
                )

                # Extract place ID
                self.place_id = self._extract_place_id(
                    self.url, captured_request.get('url') or ''
                )
                if not self.place_id:
                    # Without it every API call would be built around "None".
                    log.error("Could not determine place ID from URL or captured request")
                    return False

                # Store captured headers
                self.captured_headers = dict(captured_request.get('headers') or {})

                # Also get cookies from browser
                cookies = sb.driver.get_cookies()
                for cookie in cookies:
                    self.session.cookies.set(
                        cookie['name'], cookie['value'], domain=cookie.get('domain')
                    )

                log.info("\n%s", _RULE)
                log.info("✅ Session captured successfully!")
                log.info("%s", _RULE)
                log.info(f"Place ID: {self.place_id}")
                log.info(f"Headers captured: {len(self.captured_headers)}")
                log.info(f"Cookies captured: {len(cookies)}")
                log.info("%s\n", _RULE)

                # Print sample headers for debugging (truncated previews only)
                log.debug("Sample headers:")
                for key in ['cookie', 'x-goog-api-key', 'authorization', 'user-agent']:
                    if key in self.captured_headers:
                        value = self.captured_headers[key]
                        preview = value[:50] + '...' if len(value) > 50 else value
                        log.debug(f" {key}: {preview}")

                return True

        except Exception as e:
            # Top-level boundary for the browser phase: report with traceback
            # and signal failure to the caller instead of raising.
            log.exception(f"Failed to capture session: {e}")
            return False

    def fetch_reviews_page(
        self, continuation_token: Optional[str] = None
    ) -> Tuple[List[InterceptedReview], Optional[str]]:
        """
        Fetch reviews page using captured session.

        Args:
            continuation_token: Pagination token (None for the first page)

        Returns:
            Tuple of (reviews, next_token). On any failure (no place ID,
            non-200 status, network or parse error) returns ``([], None)``.
        """
        if not self.place_id:
            log.error("No place ID available - capture a session first")
            return [], None

        # Build pb parameter; the token field is only present on follow-up pages
        token_field = f"!2s{continuation_token}" if continuation_token else ""
        pb = (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_field}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
            "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )

        params = {
            'authuser': '0',
            'hl': self.hl,
            'gl': self.gl,
            'pb': pb,
        }

        try:
            log.info(f"Fetching page (token: {'initial' if not continuation_token else 'paginated'})...")

            # Make request with captured headers
            url = 'https://www.google.com/maps/rpc/listugcposts'
            response = self.session.get(
                url, params=params, headers=self.captured_headers, timeout=10
            )

            log.debug(f"Response status: {response.status_code}")

            if response.status_code != 200:
                log.error(f"API error {response.status_code}: {response.text[:500]}")
                return [], None

            # Parse response: drop the ")]}'" prefix the endpoint puts before the JSON
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            data = json.loads(body)

            # Extract reviews
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Get next token (second element of the payload when it is a string)
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            log.info(f"✓ Extracted {len(reviews)} reviews")

            return reviews, next_token

        except Exception as e:
            # Network, JSON and parser errors all end pagination the same way.
            log.error(f"API request failed: {e}")
            return [], None

    def scrape_all_reviews(self, max_pages: int = 100, delay: float = 0.3) -> List[dict]:
        """
        Scrape all reviews using hybrid approach.

        Args:
            max_pages: Maximum pages to fetch
            delay: Delay between API calls

        Returns:
            List of review dictionaries (empty if the session capture failed)
        """
        # Step 1: Capture session from browser
        if not self.capture_session_from_browser():
            log.error("Failed to capture session - aborting")
            return []

        # Step 2: Fetch all reviews via API
        log.info("\nStarting API-based scraping (no browser needed!)...")
        start_time = time.time()

        all_reviews = []
        seen_ids = set()
        continuation_token = None
        page = 0  # number of API calls issued so far

        while page < max_pages:
            page += 1

            reviews, continuation_token = self.fetch_reviews_page(continuation_token)

            if not reviews:
                log.info("No more reviews found")
                break

            # Deduplicate; fall back to author+date when the review has no ID
            for review in reviews:
                review_id = review.review_id or f"{review.author}_{review.date_text}"
                if review_id in seen_ids:
                    continue
                seen_ids.add(review_id)
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                })

            log.info(f"Page {page}: {len(all_reviews)} total unique reviews")

            if not continuation_token:
                log.info("No continuation token - finished")
                break

            if delay > 0:
                time.sleep(delay)

        elapsed = time.time() - start_time
        # Guard against a zero elapsed time on coarse clocks.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info("\n%s", _RULE)
        log.info("✅ API SCRAPING COMPLETED!")
        log.info("%s", _RULE)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"API calls: {page}")
        log.info(f"Time (API only): {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info("%s\n", _RULE)

        return all_reviews


def main():
    """Example usage: scrape one place and save the reviews as JSON."""
    url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

    scraper = HybridAPIScraper(url, headless=False)
    reviews = scraper.scrape_all_reviews(max_pages=50, delay=0.3)

    # Save results
    output_file = 'hybrid_api_reviews.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info(f"Saved {len(reviews)} reviews to {output_file}")

    # Show sample
    if reviews:
        log.info("\nSample review:")
        sample = reviews[0]
        log.info(f" Author: {sample['author']}")
        log.info(f" Rating: {sample['rating']}★")
        log.info(f" Text: {sample['text'][:80]}..." if sample['text'] else " Text: (none)")


if __name__ == '__main__':
    main()