#!/usr/bin/env python3
"""
Direct API scraper - fetch Google Maps reviews via API without browser scrolling.

This is 10-25x faster than traditional browser-based scraping.
"""

import json
import logging
import time
import urllib.parse
from typing import List, Optional, Tuple

import requests

from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)


class DirectAPIScraper:
    """Fetch Google Maps reviews directly via API without browser automation.

    The scraper owns a ``requests.Session``. Call :meth:`close` when done, or
    use the instance as a context manager, to release its connections::

        with DirectAPIScraper(place_id) as scraper:
            reviews = scraper.fetch_all_reviews()
    """

    # Fixed trailing part of the ``pb`` query parameter. It is identical for
    # the first request and for paginated requests; only the optional
    # ``!2s<token>`` field placed in front of it differs.
    # NOTE(review): the embedded ``1sByJsaaTKLK-bi-gPiqKAiQE`` value looks like
    # a captured session/event identifier - confirm whether it expires.
    _PB_TAIL = (
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81"
        "!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1"
        "!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )

    def __init__(self, place_id: str, language: str = 'en', region: str = 'us'):
        """
        Initialize the direct API scraper.

        Args:
            place_id: Google Maps place ID
                (e.g., '0x46dd947294b213bf:0x864c7a232527adb4')
            language: Language code (e.g., 'en', 'es', 'de')
            region: Region/country code (e.g., 'us', 'es', 'de')
        """
        self.place_id = place_id
        self.language = language
        self.region = region
        self.base_url = 'https://www.google.com/maps/rpc/listugcposts'

        # Reuse the response parser from api_interceptor. The interceptor is
        # constructed with None because only its parsing method is used here.
        self.interceptor = GoogleMapsAPIInterceptor(None)

        # Session for maintaining cookies across requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0.0.0 Safari/537.36'
            ),
            'Accept': '*/*',
            'Accept-Language': f'{language},{language}-{region.upper()};q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'X-Requested-With': 'XMLHttpRequest',
        })

    def __enter__(self) -> 'DirectAPIScraper':
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the HTTP session on leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP session and release its connections."""
        self.session.close()

    def _build_pb_param(self, continuation_token: Optional[str] = None) -> str:
        """
        Build the Protocol Buffer (pb) parameter for the API request.

        Args:
            continuation_token: Pagination token from previous response.
                A falsy value (None or '') produces the first-page form.

        Returns:
            pb parameter string (NOT URL-encoded - that's done by requests)
        """
        # Leading part: place ID followed by the page-size field (!1i10).
        head = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
        # The token field is only present on paginated requests.
        token_field = f"!2s{continuation_token}" if continuation_token else ""
        return head + token_field + self._PB_TAIL

    def _establish_session(self):
        """Visit Google Maps page to establish session cookies.

        This is best-effort: any failure is logged as a warning and
        swallowed so that scraping can still be attempted.
        """
        try:
            # Visit the main maps page to get cookies
            maps_url = f"https://www.google.com/maps/place/?q=place_id:{self.place_id}"
            log.debug("Establishing session by visiting Google Maps...")
            response = self.session.get(maps_url, timeout=10)
            response.raise_for_status()
            log.debug(f"Session established (cookies: {len(self.session.cookies)})")
        except Exception as e:
            # Deliberately broad: a failed warm-up must not abort the scrape.
            log.warning(f"Failed to establish session: {e}")

    def fetch_reviews_page(
        self, continuation_token: Optional[str] = None
    ) -> Tuple[List[InterceptedReview], Optional[str]]:
        """
        Fetch a single page of reviews from the API.

        Args:
            continuation_token: Pagination token from previous response

        Returns:
            Tuple of (reviews list, next continuation token or None).
            On any request, parsing, or unexpected error the failure is
            logged and ``([], None)`` is returned, so callers cannot
            distinguish a failed request from an empty page.
        """
        # Build request parameters
        params = {
            'authuser': '0',
            'hl': self.language,
            'gl': self.region,
            'pb': self._build_pb_param(continuation_token),
        }

        try:
            log.info(f"Fetching reviews page (token: {'initial' if not continuation_token else 'paginated'})...")
            response = self.session.get(self.base_url, params=params, timeout=10)

            # Log response for debugging
            log.debug(f"Response status: {response.status_code}")
            if response.status_code != 200:
                log.error(f"Response body: {response.text[:500]}")
            response.raise_for_status()

            # Google returns responses with )]}' prefix - strip it
            body = response.text
            if body.startswith(")]}'"):
                body = body[4:].strip()
            log.debug(f"Response size: {len(body)} bytes")

            # Parse JSON response
            data = json.loads(body)

            # Extract reviews using the shared parser
            reviews = self.interceptor._parse_listugcposts_response(data)

            # The next continuation token, when present, is the string at
            # index 1 of the top-level list.
            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
                log.debug(f"Found continuation token: {next_token[:50]}...")

            log.info(f"✓ Extracted {len(reviews)} reviews from this page")
            return reviews, next_token

        except requests.exceptions.RequestException as e:
            log.error(f"API request failed: {e}")
            return [], None
        except json.JSONDecodeError as e:
            log.error(f"Failed to parse API response: {e}")
            return [], None
        except Exception as e:
            # log.exception keeps the traceback so unexpected failures
            # (e.g. inside the parser) can be diagnosed.
            log.exception(f"Unexpected error: {e}")
            return [], None

    def fetch_all_reviews(self, max_pages: int = 100, delay: float = 0.5) -> List[dict]:
        """
        Fetch all reviews by paginating through the API.

        Pagination stops when a page yields no reviews, when no continuation
        token is returned, when the token fails to advance, or after
        ``max_pages`` pages.

        Args:
            max_pages: Maximum number of pages to fetch (safety limit)
            delay: Delay between requests in seconds

        Returns:
            List of review dictionaries with the keys ``review_id``,
            ``author``, ``rating``, ``text``, ``date_text``, ``avatar_url``
            and ``profile_url``. Reviews are de-duplicated by review ID,
            falling back to author + date text when the ID is empty.
        """
        all_reviews = []
        seen_ids = set()
        continuation_token = None
        page = 0
        # Monotonic clock: elapsed time must not be affected by system
        # clock adjustments.
        start_time = time.monotonic()

        log.info(f"Starting direct API scraping for place: {self.place_id}")

        # Establish session first
        self._establish_session()

        while page < max_pages:
            page += 1

            # Fetch page, remembering the token we sent so a stalled
            # pagination can be detected below.
            previous_token = continuation_token
            reviews, continuation_token = self.fetch_reviews_page(continuation_token)

            if not reviews:
                log.info("No more reviews found - stopping")
                break

            # Deduplicate and add reviews
            for review in reviews:
                review_id = review.review_id or f"{review.author}_{review.date_text}"
                if review_id in seen_ids:
                    continue
                seen_ids.add(review_id)
                # Convert to dict
                all_reviews.append({
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                })

            log.info(f"Page {page}: {len(all_reviews)} total unique reviews")

            # Check if we have a continuation token
            if not continuation_token:
                log.info("No continuation token - all reviews fetched")
                break

            # Sending the same token again would just refetch the same page
            # until max_pages is reached.
            if continuation_token == previous_token:
                log.warning("Continuation token did not advance - stopping")
                break

            # Rate limiting
            if delay > 0 and page < max_pages:
                time.sleep(delay)

        elapsed = time.monotonic() - start_time
        # Guard against a zero interval (coarse timers, mocked sessions).
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info(f"\n{'='*60}")
        log.info("✅ Direct API scraping completed!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"Pages fetched: {page}")
        log.info(f"Time elapsed: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info(f"{'='*60}\n")

        return all_reviews


def main():
    """Example usage of the direct API scraper."""
    # Soho Club place ID from the test URL
    place_id = '0x46dd947294b213bf:0x864c7a232527adb4'

    # Create scraper; the context manager closes the HTTP session afterwards
    with DirectAPIScraper(place_id=place_id, language='es', region='es') as scraper:
        # Fetch all reviews
        reviews = scraper.fetch_all_reviews(max_pages=50, delay=0.5)

    # Save to JSON
    output_file = 'direct_api_reviews.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info(f"Saved {len(reviews)} reviews to {output_file}")

    # Show sample
    if reviews:
        log.info("\nSample review:")
        sample = reviews[0]
        log.info(f"  Author: {sample['author']}")
        log.info(f"  Rating: {sample['rating']}★")
        log.info(f"  Date: {sample['date_text']}")
        log.info(f"  Text: {sample['text'][:100]}..." if sample['text'] else "  Text: (no text)")


if __name__ == '__main__':
    main()