#!/usr/bin/env python3
"""
Header Capture Scraper - capture a live review request from the browser, then replay it.

A real browser session is opened on a Google Maps place page, the first
``listugcposts`` request the page issues is intercepted, and the browser's
cookies and user agent are copied into a ``requests`` session. Subsequent
review pages are then fetched directly over HTTP, without the browser.

Note: only the request URL, the cookies and the user agent come from the
browser; the remaining request headers are set to fixed values below.
"""

import json
import logging
import sys
import time
import urllib.parse
from typing import List, Optional, Tuple

import requests
from seleniumbase import SB

from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)

# Place page scraped when no URL is given on the command line.
DEFAULT_URL = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

# Endpoint the page calls to list reviews; also the marker the interceptor looks for.
_REVIEWS_ENDPOINT = 'https://www.google.com/maps/rpc/listugcposts'

# (scroll offset in px, seconds to wait afterwards) for each capture attempt.
_CAPTURE_SCROLL_STEPS = ((800, 3), (600, 2), (600, 2), (600, 2))

# The two halves of the ``pb`` query parameter. The optional continuation token
# segment ("!2s<token>") is inserted between them.
# NOTE(review): the "!1sByJsaaTKLK-bi-gPiqKAiQE" segment looks like a value copied
# from one recorded browser request - confirm it does not need to be refreshed.
_PB_HEAD = "!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
_PB_TAIL = "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"

# JavaScript injected into the page to record the first matching request.
# It patches ``XMLHttpRequest.prototype.open`` (rather than replacing the
# constructor) so static members such as ``XMLHttpRequest.DONE`` and
# ``instanceof`` checks in the page's own code keep working, and it reads the
# URL from ``Request`` / ``URL`` objects instead of assuming a plain string.
_CAPTURE_SCRIPT = """
(function () {
    if (window.__captureInstalled) { return; }
    window.__captureInstalled = true;
    window.__capturedRequest = null;

    const record = function (rawUrl, method, label) {
        const url = String(rawUrl);
        if (url.includes('listugcposts')) {
            console.log('[CAPTURE] Intercepted ' + label + ':', url);
            window.__capturedRequest = { url: url, method: method || 'GET' };
        }
    };

    const originalFetch = window.fetch;
    window.fetch = function (...args) {
        try {
            const input = args[0];
            const isRequest = (typeof Request !== 'undefined') && (input instanceof Request);
            const init = args[1] || {};
            const method = init.method || (isRequest ? input.method : 'GET');
            record(isRequest ? input.url : input, method, 'request to');
        } catch (err) {
            /* capturing must never break the page's own request */
        }
        return originalFetch.apply(this, args);
    };

    const originalOpen = XMLHttpRequest.prototype.open;
    XMLHttpRequest.prototype.open = function (method, url, ...rest) {
        try {
            record(url, method, 'XHR');
        } catch (err) {
            /* capturing must never break the page's own request */
        }
        return originalOpen.call(this, method, url, ...rest);
    };

    console.log('[CAPTURE] Request interceptor ready');
})();
"""


class HeaderCaptureScraper:
    """Capture a review request from a real browser, then replay it for fast scraping."""

    def __init__(self, url: str, headless: bool = False):
        """
        Args:
            url: Google Maps place URL to open in the browser.
            headless: Whether the capture browser runs headless.
        """
        self.url = url
        self.headless = headless
        # Dict with 'url' and 'method' keys once a request has been intercepted.
        self.captured_request: Optional[dict] = None
        # Place identifier extracted from the captured request's ``pb`` parameter.
        self.place_id: Optional[str] = None
        # HTTP session that receives the browser's cookies and user agent.
        self.session = requests.Session()
        # Only used for its response parser.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    def capture_request(self) -> bool:
        """
        Capture an API request (URL, cookies, user agent) from a browser session.

        Returns:
            True when a ``listugcposts`` request was intercepted, a place ID
            was extracted from it and the browser's cookies were copied into
            ``self.session``; False otherwise. Errors are logged, not raised.
        """
        log.info("=" * 60)
        log.info("Capturing request from browser...")
        log.info("=" * 60)

        captured = False
        try:
            log.info("Starting browser...")
            with SB(uc=True, headless=self.headless) as sb:
                # Handle capture errors inside the ``with`` so the browser
                # context always sees a clean exit.
                try:
                    captured = self._capture_from_browser(sb)
                except Exception as e:
                    log.exception("Capture failed: %s", e)
                log.info("Closing browser...")
            log.info("✓ Browser closed\n")
        except Exception as e:
            # Browser start-up or teardown failed. A teardown failure does not
            # invalidate a capture that already succeeded.
            log.exception("Browser session error: %s", e)
        return captured

    def _capture_from_browser(self, sb) -> bool:
        """Drive an open browser until a review request is captured and adopted."""
        sb.open(self.url)
        time.sleep(2)

        # Dismiss the cookie banner if one is shown.
        self._try_click(
            sb,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
            timeout=3,
        )

        # Open the reviews panel if its button is present.
        if self._try_click(sb, '.LRkQ2', timeout=5):
            log.info("✓ Opened reviews")
            time.sleep(2)

        # Enable CDP network monitoring
        sb.driver.execute_cdp_cmd('Network.enable', {})
        log.info("✓ Network monitoring enabled")

        # Install the interceptor BEFORE scrolling so the request triggered by
        # the first scroll is not missed.
        sb.execute_script(_CAPTURE_SCRIPT)
        log.info("✓ Request interceptor injected")

        log.info("Scrolling to capture request...")
        for offset, delay in _CAPTURE_SCROLL_STEPS:
            sb.execute_script(f"window.scrollBy(0, {offset})")
            time.sleep(delay)
            intercepted = sb.execute_script("return window.__capturedRequest")
            if intercepted:
                log.info("✓ Captured request URL!")
                self.captured_request = intercepted
                break

        if not self.captured_request:
            log.error("Failed to capture request")
            return False

        self.place_id = self._extract_place_id(self.captured_request['url'])
        if not self.place_id:
            # Without a place ID every replayed request would carry "!1sNone".
            log.error("Captured request does not contain a place ID")
            return False

        all_cookies = self._adopt_browser_identity(sb)

        log.info("\n✅ Request captured successfully!")
        log.info(f" Place ID: {self.place_id}")
        log.info(f" Cookies: {len(all_cookies)}")
        log.info(f" Cookie names: {', '.join([c['name'] for c in all_cookies[:10]])}")
        return True

    @staticmethod
    def _try_click(sb, selector: str, timeout: int) -> bool:
        """Click an optional element; return False instead of raising if it fails."""
        try:
            sb.click(selector, timeout=timeout)
        except Exception as e:
            # Best effort: the element is optional, so only note the miss.
            log.debug("Optional click on %r skipped: %s", selector, e)
            return False
        return True

    @staticmethod
    def _extract_place_id(request_url: str) -> Optional[str]:
        """Return the value following the first '!1s' in the URL's ``pb`` parameter."""
        query = urllib.parse.urlparse(request_url).query
        pb = urllib.parse.parse_qs(query).get('pb', [''])[0]
        _, marker, tail = pb.partition('!1s')
        if not marker:
            return None
        return tail.split('!', 1)[0] or None

    def _adopt_browser_identity(self, sb) -> list:
        """Copy the browser's cookies and user agent into ``self.session``.

        Returns:
            The list of cookie dicts reported by CDP ``Network.getAllCookies``.
        """
        cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
        all_cookies = cdp_cookies.get('cookies', [])

        for cookie in all_cookies:
            self.session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/'),
            )

        user_agent = sb.execute_script("return navigator.userAgent")

        # Only the user agent comes from the browser; the other headers are fixed.
        self.session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
            'X-Requested-With': 'XMLHttpRequest',
        })
        return all_cookies

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
        """
        Fetch one page of reviews using the captured session.

        Args:
            continuation_token: Token returned with the previous page, or
                None for the first page.

        Returns:
            ``(reviews, next_token)``. On any failure (missing place ID,
            non-200 status, network error, unparsable body) the result is
            ``([], None)`` and the error is logged.
        """
        if not self.place_id:
            log.error("No place ID available; call capture_request() first")
            return [], None

        token_segment = f"!2s{continuation_token}" if continuation_token else ""
        pb = _PB_HEAD.format(place_id=self.place_id) + token_segment + _PB_TAIL
        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': pb,
        }

        try:
            response = self.session.get(_REVIEWS_ENDPOINT, params=params, timeout=10)
            if response.status_code != 200:
                log.error(f"API error {response.status_code}: {response.text[:200]}")
                return [], None

            body = response.text
            # Strip the ")]}'" prefix the endpoint puts in front of the JSON payload.
            if body.startswith(")]}'"):
                body = body[4:].strip()
            data = json.loads(body)

            reviews = self.interceptor._parse_listugcposts_response(data)

            # The continuation token, when present, is the string at index 1.
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
            return reviews, next_token
        except Exception as e:
            # Boundary for network, JSON and parser errors alike: callers
            # treat an empty page as "stop".
            log.error(f"Request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 50) -> List[dict]:
        """
        Capture a browser request, then page through all reviews over HTTP.

        Args:
            max_pages: Upper bound on the number of pages requested.

        Returns:
            De-duplicated reviews as plain dicts with the keys ``review_id``,
            ``author``, ``rating``, ``text``, ``date_text`` and ``avatar_url``.
            Empty when the capture step fails.
        """
        if not self.capture_request():
            return []

        log.info("=" * 60)
        log.info("Fast API scraping...")
        log.info("=" * 60)

        start_time = time.time()
        all_reviews: List[dict] = []
        seen_ids = set()
        token = None

        for page in range(1, max_pages + 1):
            log.info(f"Page {page}...")
            previous_token = token
            reviews, token = self.fetch_reviews_page(token)
            if not reviews:
                break

            for review in reviews:
                # Fall back to author + date when the review has no ID.
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid in seen_ids:
                    continue
                seen_ids.add(rid)
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                })

            log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                break
            if token == previous_token:
                # The same token would return the same page again.
                log.warning("Continuation token did not advance; stopping")
                break
            time.sleep(0.2)

        elapsed = time.time() - start_time
        # Guard against a zero interval on coarse clocks.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info(f"\n{'='*60}")
        log.info("✅ COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Reviews: {len(all_reviews)}")
        log.info(f"Time: {elapsed:.2f}s")
        log.info(f"Speed: {speed:.1f} reviews/sec")
        log.info(f"{'='*60}\n")
        return all_reviews


def main():
    """Scrape one place and save its reviews to ``header_capture_reviews.json``.

    The place URL is taken from the first command-line argument when given,
    otherwise ``DEFAULT_URL`` is used.
    """
    url = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_URL
    scraper = HeaderCaptureScraper(url, headless=False)
    reviews = scraper.scrape_all()

    if reviews:
        with open('header_capture_reviews.json', 'w', encoding='utf-8') as f:
            json.dump(reviews, f, indent=2, ensure_ascii=False)
        log.info("Saved to header_capture_reviews.json")


if __name__ == '__main__':
    main()