Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
249
direct_api_scraper.py
Normal file
249
direct_api_scraper.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Direct API scraper - fetch Google Maps reviews via API without browser scrolling.
|
||||
This is 10-25x faster than traditional browser-based scraping.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import urllib.parse
|
||||
from typing import List, Optional, Tuple
|
||||
import requests
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
|
||||
|
||||
# Configure the root logger at import time: INFO level, "[LEVEL] message" format.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

# Module-level logger used by the scraper class and by main().
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DirectAPIScraper:
    """Fetch Google Maps reviews directly via API without browser automation.

    The scraper keeps a single ``requests.Session`` (cookies plus browser-like
    headers), requests review pages from the ``listugcposts`` RPC endpoint and
    hands each decoded response to ``GoogleMapsAPIInterceptor`` for parsing.

    Instances may be used as context managers so the HTTP session is closed
    deterministically::

        with DirectAPIScraper(place_id) as scraper:
            reviews = scraper.fetch_all_reviews()
    """

    # Prefix Google puts in front of the JSON body; it must be removed before
    # the body can be decoded.
    _XSSI_PREFIX = ")]}'"

    # Opaque request options that follow the (optional) pagination token in
    # the ``pb`` parameter. Kept verbatim; the individual fields are not
    # interpreted anywhere in this class.
    _PB_TAIL = (
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )

    # Timeout, in seconds, applied to every HTTP request made by the scraper.
    _REQUEST_TIMEOUT = 10

    def __init__(self, place_id: str, language: str = 'en', region: str = 'us'):
        """
        Initialize the direct API scraper.

        Args:
            place_id: Google Maps place ID (e.g., '0x46dd947294b213bf:0x864c7a232527adb4')
            language: Language code (e.g., 'en', 'es', 'de')
            region: Region/country code (e.g., 'us', 'es', 'de')
        """
        self.place_id = place_id
        self.language = language
        self.region = region
        self.base_url = 'https://www.google.com/maps/rpc/listugcposts'

        # Only the interceptor's response parser is used by this class; it is
        # constructed with None in place of its usual argument
        # (presumably a browser driver - confirm against api_interceptor).
        self.interceptor = GoogleMapsAPIInterceptor(None)

        # Session for maintaining cookies across requests.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': f'{language},{language}-{region.upper()};q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'X-Requested-With': 'XMLHttpRequest',
        })

    def __enter__(self) -> "DirectAPIScraper":
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the HTTP session when leaving a ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP session and release its connections."""
        self.session.close()

    @classmethod
    def _strip_xssi_prefix(cls, body: str) -> str:
        """Return *body* without Google's ``)]}'`` prefix.

        Leading whitespace or a byte-order mark in front of the prefix is
        tolerated. A body that does not carry the prefix is returned unchanged.
        """
        trimmed = body.lstrip("\ufeff \t\r\n")
        if trimmed.startswith(cls._XSSI_PREFIX):
            return trimmed[len(cls._XSSI_PREFIX):].strip()
        return body

    @staticmethod
    def _extract_continuation_token(data) -> Optional[str]:
        """Return the pagination token from a decoded response, if present.

        The token is read from index 1 of the top-level list and is only
        accepted when it is a string; any other shape yields None.
        """
        if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
            return data[1]
        return None

    @staticmethod
    def _dedupe_key(review) -> str:
        """Return the key used to detect duplicate reviews.

        The review ID is preferred; when it is empty the author and date text
        are combined as a fallback.
        """
        return review.review_id or f"{review.author}_{review.date_text}"

    @staticmethod
    def _review_to_dict(review) -> dict:
        """Convert a parsed review object into a plain dictionary."""
        return {
            'review_id': review.review_id,
            'author': review.author,
            'rating': review.rating,
            'text': review.text,
            'date_text': review.date_text,
            'avatar_url': review.avatar_url,
            'profile_url': review.profile_url,
        }

    @staticmethod
    def _reviews_per_second(count: int, elapsed: float) -> float:
        """Return the scraping rate, or 0.0 when no measurable time elapsed."""
        if elapsed <= 0:
            return 0.0
        return count / elapsed

    def _build_pb_param(self, continuation_token: Optional[str] = None) -> str:
        """
        Build the Protocol Buffer (pb) parameter for the API request.

        Args:
            continuation_token: Pagination token from previous response

        Returns:
            pb parameter string (NOT URL-encoded - that's done by requests)
        """
        # Base structure with place ID; the pagination token is only inserted
        # for follow-up requests.
        head = f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return f"{head}{token_part}{self._PB_TAIL}"

    def _establish_session(self):
        """Visit Google Maps page to establish session cookies.

        This is best effort: any failure is logged as a warning and the
        scraper continues without the cookies.
        """
        try:
            # Visit the main maps page to get cookies
            maps_url = f"https://www.google.com/maps/place/?q=place_id:{self.place_id}"
            log.debug("Establishing session by visiting Google Maps...")
            response = self.session.get(maps_url, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
            log.debug("Session established (cookies: %s)", len(self.session.cookies))
        except Exception as e:
            log.warning("Failed to establish session: %s", e)

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> "Tuple[List[InterceptedReview], Optional[str]]":
        """
        Fetch a single page of reviews from the API.

        Errors are logged and never raised: on a failed request, an
        undecodable body or a parser error the result is ``([], None)``.

        Args:
            continuation_token: Pagination token from previous response

        Returns:
            Tuple of (reviews list, next continuation token or None)
        """
        # Build request parameters
        params = {
            'authuser': '0',
            'hl': self.language,
            'gl': self.region,
            'pb': self._build_pb_param(continuation_token),
        }

        try:
            log.info(
                "Fetching reviews page (token: %s)...",
                'paginated' if continuation_token else 'initial',
            )

            response = self.session.get(
                self.base_url, params=params, timeout=self._REQUEST_TIMEOUT
            )

            # Log response for debugging; the body is logged before raising so
            # the server's error text is not lost.
            log.debug("Response status: %s", response.status_code)
            if response.status_code != 200:
                log.error("Response body: %s", response.text[:500])

            response.raise_for_status()

            # Google returns responses with )]}' prefix - strip it
            body = self._strip_xssi_prefix(response.text)
            log.debug("Response size: %s bytes", len(body))

            # Parse JSON response
            data = json.loads(body)

            # Extract reviews using the interceptor's parser
            reviews = self.interceptor._parse_listugcposts_response(data)

            # Extract next continuation token
            next_token = self._extract_continuation_token(data)
            if next_token is not None:
                log.debug("Found continuation token: %s...", next_token[:50])

            log.info("✓ Extracted %s reviews from this page", len(reviews))

            return reviews, next_token

        except requests.exceptions.RequestException as e:
            log.error("API request failed: %s", e)
            return [], None
        except json.JSONDecodeError as e:
            log.error("Failed to parse API response: %s", e)
            return [], None
        except Exception as e:
            # Deliberate catch-all so one bad page cannot abort a long scrape;
            # the traceback is logged so parser bugs remain diagnosable.
            log.exception("Unexpected error: %s", e)
            return [], None

    def fetch_all_reviews(self, max_pages: int = 100, delay: float = 0.5) -> List[dict]:
        """
        Fetch all reviews by paginating through the API.

        Pagination stops when a page yields no reviews, when the response
        carries no continuation token, when a continuation token repeats, or
        when ``max_pages`` pages have been requested.

        Args:
            max_pages: Maximum number of pages to fetch (safety limit)
            delay: Delay between requests in seconds

        Returns:
            List of review dictionaries
        """
        all_reviews: List[dict] = []
        seen_ids = set()
        seen_tokens = set()
        continuation_token = None
        page = 0

        # Monotonic clock: the elapsed time cannot go negative if the system
        # clock is adjusted during the scrape.
        start_time = time.monotonic()
        log.info("Starting direct API scraping for place: %s", self.place_id)

        # Establish session first
        self._establish_session()

        while page < max_pages:
            page += 1

            # Fetch page
            reviews, continuation_token = self.fetch_reviews_page(continuation_token)

            if not reviews:
                log.info("No more reviews found - stopping")
                break

            # Deduplicate and add reviews
            for review in reviews:
                key = self._dedupe_key(review)
                if key in seen_ids:
                    continue
                seen_ids.add(key)
                all_reviews.append(self._review_to_dict(review))

            log.info("Page %s: %s total unique reviews", page, len(all_reviews))

            # Check if we have a continuation token
            if not continuation_token:
                log.info("No continuation token - all reviews fetched")
                break

            # A token seen before would request a page that was already
            # fetched; stop instead of repeating it until max_pages.
            if continuation_token in seen_tokens:
                log.warning("Continuation token repeated - stopping to avoid refetching the same page")
                break
            seen_tokens.add(continuation_token)

            # Rate limiting (skipped after the final allowed page)
            if delay > 0 and page < max_pages:
                time.sleep(delay)

        elapsed = time.monotonic() - start_time
        separator = '=' * 60
        log.info("\n%s", separator)
        log.info("✅ Direct API scraping completed!")
        log.info("%s", separator)
        log.info("Total reviews: %d", len(all_reviews))
        log.info("Pages fetched: %d", page)
        log.info("Time elapsed: %.2f seconds", elapsed)
        log.info("Speed: %.1f reviews/second", self._reviews_per_second(len(all_reviews), elapsed))
        log.info("%s\n", separator)

        return all_reviews
|
||||
|
||||
|
||||
def main():
    """Example usage of the direct API scraper.

    Scrapes the reviews of one hard-coded place, writes them to
    ``direct_api_reviews.json`` in the current directory and logs a short
    preview of the first review.
    """
    # Soho Club place ID from the test URL
    place_id = '0x46dd947294b213bf:0x864c7a232527adb4'

    # Create scraper
    scraper = DirectAPIScraper(
        place_id=place_id,
        language='es',
        region='es',
    )

    # Fetch all reviews
    reviews = scraper.fetch_all_reviews(max_pages=50, delay=0.5)

    # Save to JSON; ensure_ascii=False keeps non-ASCII review text readable.
    output_file = 'direct_api_reviews.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reviews, f, indent=2, ensure_ascii=False)

    log.info("Saved %s reviews to %s", len(reviews), output_file)

    # Show sample
    if reviews:
        log.info("\nSample review:")
        sample = reviews[0]
        log.info(" Author: %s", sample['author'])
        log.info(" Rating: %s★", sample['rating'])
        log.info(" Date: %s", sample['date_text'])

        # Only mark the preview with an ellipsis when text was actually cut.
        text = sample['text']
        if not text:
            preview = "(no text)"
        elif len(text) > 100:
            preview = f"{text[:100]}..."
        else:
            preview = text
        log.info(" Text: %s", preview)
|
||||
|
||||
|
||||
# Run the example only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user