#!/usr/bin/env python3
"""
Cookie-based API scraper - capture fresh cookies on each run, then scrape via API.

Flow:
1. Start browser (15 seconds)
2. Capture cookies from active browser session (5 seconds)
3. Close browser
4. Use cookies for rapid API pagination (5-10 seconds)

Total time: ~25-35 seconds for 244 reviews (vs 155 seconds with scrolling)
"""

import json
import logging
import time
from typing import List, Optional, Tuple

import requests
from seleniumbase import SB

from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)


class CookieBasedScraper:
    """Capture cookies each run, then scrape via API."""

    # Endpoint queried by fetch_reviews_page().
    _API_URL = 'https://www.google.com/maps/rpc/listugcposts'

    # Fixed parts of the `pb` query parameter. The place ID is inserted after
    # the prefix's leading "!1m6!1s", and an optional "!2s<token>" segment
    # goes between prefix and suffix when paginating.
    _PB_PREFIX = "!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
    _PB_SUFFIX = (
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )

    # Domains queried individually via CDP in addition to Network.getAllCookies.
    _COOKIE_DOMAINS = ('.google.com', 'www.google.com', '.google.es', 'maps.google.com')

    # Cookie names reported as "auth cookies" after capture.
    _AUTH_COOKIE_NAMES = (
        'SID', 'HSID', 'SSID', 'APISID', 'SAPISID',
        '__Secure-1PSID', '__Secure-3PSID',
    )

    def __init__(self, url: str, headless: bool = False):
        """
        Args:
            url: Google Maps place URL opened in the browser.
            headless: Passed through to SeleniumBase's ``SB(headless=...)``.
        """
        self.url = url
        self.headless = headless
        self.session = requests.Session()
        self.place_id = None
        self.interceptor = GoogleMapsAPIInterceptor(None)

    # ------------------------------------------------------------------
    # Cookie capture
    # ------------------------------------------------------------------

    def capture_cookies(self) -> bool:
        """
        Capture cookies from a real browser session.

        Opens the place URL in a browser, extracts the place ID from the
        resulting URL, copies the browser's cookies and user agent into
        ``self.session``, and closes the browser.

        Returns:
            True if a place ID was extracted and the session was populated;
            False if the place ID could not be found or any step raised.
        """
        log.info("=" * 60)
        log.info("STEP 1: Capturing cookies from browser session")
        log.info("=" * 60)

        # The context manager is entered/exited by hand (rather than `with`)
        # so that an error while *closing* the browser stays best-effort and
        # cannot turn a successful capture into a failure.
        sb_context = None
        try:
            log.info("Starting browser...")
            sb_context = SB(uc=True, headless=self.headless)
            sb = sb_context.__enter__()
            return self._capture_from_browser(sb)
        except Exception as e:
            # Top-level boundary: log with traceback and report failure.
            log.exception(f"Cookie capture failed: {e}")
            return False
        finally:
            if sb_context:
                try:
                    log.info("Closing browser...")
                    sb_context.__exit__(None, None, None)
                    log.info("✓ Browser closed\n")
                except Exception as e:
                    log.debug(f"Error closing browser: {e}")

    def _capture_from_browser(self, sb) -> bool:
        """Drive the open browser ``sb`` and load its cookies into the session."""
        log.info("Opening Google Maps...")
        sb.open(self.url)
        time.sleep(2)

        # Dismiss cookie consent (best-effort: the dialog may not be shown).
        try:
            sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
            log.info("✓ Cookie dialog dismissed")
        except Exception as e:
            log.debug(f"Cookie dialog not dismissed: {e}")

        # Click reviews tab
        try:
            sb.click('.LRkQ2', timeout=5)
            log.info("✓ Opened reviews tab")
            time.sleep(3)  # Wait for reviews to load
        except Exception as e:
            log.warning(f"Could not click reviews tab: {e}")

        # Extract place ID from current URL
        place_id = self._extract_place_id(sb.get_current_url())
        if place_id is not None:
            self.place_id = place_id
            log.info(f"✓ Extracted place ID: {self.place_id}")

        if not self.place_id:
            log.error("Could not extract place ID")
            return False

        # CRITICAL: Scroll once to trigger an API call!
        # This causes Google to set the necessary session cookies
        log.info("Triggering API call by scrolling...")
        sb.execute_script("window.scrollBy(0, 500)")
        time.sleep(2)  # Wait for API call to complete
        log.info("✓ API call triggered - session cookies should now be set")

        browser_cookies = self._collect_cookies(sb)

        # CAPTURE USER AGENT while driver is active
        user_agent = sb.execute_script("return navigator.userAgent")
        log.info("✓ Captured user agent")

        self._load_session(browser_cookies, user_agent)
        self._report_capture(browser_cookies)
        return True

    @staticmethod
    def _extract_place_id(url: str) -> Optional[str]:
        """
        Return the text between the first ``!1s`` marker and the next ``!``.

        Returns None when the marker is absent; may return an empty string
        when the marker is immediately followed by ``!`` or the end of the URL.
        """
        _, marker, tail = url.partition('!1s')
        if not marker:
            return None
        return tail.split('!')[0]

    @staticmethod
    def _merge_cookies(base: List[dict], extra: List[dict]) -> List[dict]:
        """
        Append to ``base`` every cookie in ``extra`` not already present.

        Cookies are identified by (name, domain, path) so that same-named
        cookies scoped to a different domain or path are not discarded.
        ``base`` is modified in place and returned.
        """
        def identity(cookie: dict) -> tuple:
            return (cookie['name'], cookie.get('domain'), cookie.get('path'))

        seen = {identity(c) for c in base}
        for cookie in extra:
            key = identity(cookie)
            if key not in seen:
                seen.add(key)
                base.append(cookie)
        return base

    def _collect_cookies(self, sb) -> List[dict]:
        """
        Read cookies from the browser, preferring CDP over ``document.cookie``.

        CDP (gets httpOnly cookies too) is tried first; if it raises, falls
        back to JavaScript, which won't get httpOnly cookies.
        """
        log.info("Capturing cookies via CDP...")
        try:
            # Use Chrome DevTools Protocol to get ALL cookies from all domains
            cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
            browser_cookies = cdp_cookies.get('cookies', [])
            log.info(f"✓ Captured {len(browser_cookies)} cookies via CDP")

            # Also try getting cookies for specific Google domains
            for domain in self._COOKIE_DOMAINS:
                try:
                    domain_cookies = sb.driver.execute_cdp_cmd(
                        'Network.getCookies', {'urls': [f'https://{domain}']}
                    )
                    extra_cookies = domain_cookies.get('cookies', [])
                    if extra_cookies:
                        log.info(f" Found {len(extra_cookies)} cookies for {domain}")
                        self._merge_cookies(browser_cookies, extra_cookies)
                except Exception as e:
                    # Per-domain lookup is optional; keep what we already have.
                    log.debug(f"CDP cookie lookup failed for {domain}: {e}")

            log.info(f"✓ Total cookies after checking all domains: {len(browser_cookies)}")
            return browser_cookies
        except Exception as e:
            log.warning(f"CDP cookie capture failed: {e}")

        # Fallback to JavaScript (won't get httpOnly cookies)
        cookie_string = sb.execute_script("return document.cookie") or ''
        browser_cookies = []
        for cookie in cookie_string.split('; '):
            if '=' in cookie:
                name, value = cookie.split('=', 1)
                browser_cookies.append({
                    'name': name,
                    'value': value,
                    'domain': '.google.com',
                    'path': '/',
                })
        log.info(f"✓ Fallback: Captured {len(browser_cookies)} cookies via JS")
        return browser_cookies

    def _load_session(self, browser_cookies: List[dict], user_agent: str) -> None:
        """Copy captured cookies and browser-like headers into ``self.session``."""
        for cookie in browser_cookies:
            self.session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/'),
            )

        self.session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
            'X-Requested-With': 'XMLHttpRequest',
        })

    def _report_capture(self, browser_cookies: List[dict]) -> None:
        """Log which cookies were captured and warn if no auth cookie is present."""
        # Print ALL cookie names for debugging
        all_cookie_names = [c['name'] for c in browser_cookies]
        log.info(f"Cookie names: {', '.join(all_cookie_names)}")

        # Print important cookies for debugging
        found_cookies = [
            name for name in self._AUTH_COOKIE_NAMES if name in self.session.cookies
        ]
        log.info(f"✓ Found auth cookies: {', '.join(found_cookies) if found_cookies else 'NONE - this is the problem!'}")

        if not found_cookies:
            log.warning("\n" + "=" * 60)
            log.warning("⚠️ NO AUTHENTICATION COOKIES FOUND!")
            log.warning("=" * 60)
            log.warning("Google Maps API requires you to be logged into Google.")
            log.warning("")
            log.warning("To fix this:")
            log.warning("1. Log into your Google account in Chrome")
            log.warning("2. Visit google.com/maps while logged in")
            log.warning("3. Then run this scraper again")
            log.warning("")
            log.warning("Alternatively, use the hybrid scraper (start.py) which")
            log.warning("handles authentication automatically and already achieves")
            log.warning("95%+ API coverage with 100% parse rate!")
            log.warning("=" * 60 + "\n")
            # Continue anyway to show the error
            log.info("Continuing anyway to demonstrate the API error...")

        log.info("\n✅ Cookie capture successful!")
        log.info(f" Total cookies: {len(browser_cookies)}")
        log.info(f" Place ID: {self.place_id}")
        log.info(" Session ready: Yes\n")

    # ------------------------------------------------------------------
    # API pagination
    # ------------------------------------------------------------------

    def _build_pb(self, continuation_token: Optional[str] = None) -> str:
        """Build the ``pb`` query parameter for the current place ID."""
        pb = self._PB_PREFIX.format(place_id=self.place_id)
        if continuation_token:
            pb += f"!2s{continuation_token}"
        return pb + self._PB_SUFFIX

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List[InterceptedReview], Optional[str]]:
        """
        Fetch a page of reviews via API using captured cookies.

        Args:
            continuation_token: Token returned by the previous page, or None
                (or empty) for the first page.

        Returns:
            ``(reviews, next_token)``. On a non-200 status or any exception,
            returns ``([], None)`` after logging the error.
        """
        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': self._build_pb(continuation_token),
        }

        try:
            response = self.session.get(self._API_URL, params=params, timeout=10)

            if response.status_code != 200:
                log.error(f"API error {response.status_code}")
                log.error(f"Response: {response.text[:500]}")
                log.debug(f"Request URL: {response.url}")
                log.debug(f"Request headers: {dict(self.session.headers)}")
                return [], None

            # Strip the ")]}'" prefix the endpoint may put before the JSON body.
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()

            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            # The next-page token, when present, is the string at index 1.
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            return reviews, next_token
        except Exception as e:
            # Boundary around network I/O and the external parser: report and
            # let the caller stop paginating.
            log.error(f"API request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 100) -> List[dict]:
        """
        Main scraping method with cookie-based session.

        Captures cookies, then requests pages until a page is empty, no
        continuation token is returned, a token repeats, or ``max_pages``
        is reached.

        Args:
            max_pages: Upper bound on the number of API calls.

        Returns:
            De-duplicated reviews as plain dicts; empty if cookie capture failed.
        """
        # Step 1: Capture cookies from browser
        if not self.capture_cookies():
            log.error("Failed to capture cookies - aborting")
            return []

        # Step 2: Scrape via API
        log.info("=" * 60)
        log.info("STEP 2: Fast API scraping (no browser needed)")
        log.info("=" * 60)

        start_time = time.time()
        all_reviews = []
        seen_ids = set()
        seen_tokens = set()
        token = None
        page = 0

        while page < max_pages:
            page += 1
            log.info(f"Fetching page {page}...")

            reviews, token = self.fetch_reviews_page(token)

            if not reviews:
                if page == 1:
                    log.error("No reviews on first page - cookies may have expired or be invalid")
                else:
                    log.info("No more reviews found")
                break

            # Deduplicate; fall back to author+date when the review has no ID.
            for review in reviews:
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid in seen_ids:
                    continue
                seen_ids.add(rid)
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                })

            log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                log.info("No continuation token - all reviews fetched")
                break

            # A token we have already followed would just re-fetch the same
            # page until max_pages is exhausted.
            if token in seen_tokens:
                log.warning("Continuation token repeated - stopping pagination")
                break
            seen_tokens.add(token)

            # Small delay between requests
            time.sleep(0.2)

        elapsed = time.time() - start_time
        # Guard against a zero interval on coarse clocks.
        rate = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info("\n" + "=" * 60)
        log.info("✅ SCRAPING COMPLETED!")
        log.info("=" * 60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"API calls: {page}")
        log.info(f"API scraping time: {elapsed:.2f} seconds")
        log.info(f"Speed: {rate:.1f} reviews/second")
        log.info("=" * 60 + "\n")

        return all_reviews


def main():
    """Example usage: scrape one hard-coded place and save the reviews as JSON."""
    url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

    scraper = CookieBasedScraper(url, headless=False)
    reviews = scraper.scrape_all(max_pages=50)

    if not reviews:
        log.error("No reviews scraped!")
        return

    # Save results
    output_file = 'cookie_based_reviews.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info(f"💾 Saved {len(reviews)} reviews to {output_file}")

    # Show sample
    log.info("\nSample review:")
    sample = reviews[0]
    log.info(f" Author: {sample['author']}")
    log.info(f" Rating: {sample['rating']}★")
    log.info(f" Date: {sample['date_text']}")
    if sample['text']:
        log.info(f" Text: {sample['text'][:80]}...")


if __name__ == '__main__':
    main()