Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
356 lines
14 KiB
Python
356 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cookie-based API scraper - Capture fresh cookies on each run, then fast API scraping.
|
|
|
|
Flow:
|
|
1. Start browser (15 seconds)
|
|
2. Capture cookies from active browser session (5 seconds)
|
|
3. Close browser
|
|
4. Use cookies for rapid API pagination (5-10 seconds)
|
|
|
|
Total time: ~25-35 seconds for 244 reviews (vs 155 seconds with scrolling)
|
|
"""
|
|
import json
|
|
import logging
|
|
import time
|
|
from typing import List, Optional, Tuple
|
|
import requests
|
|
from seleniumbase import SB
|
|
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
|
|
|
|
# Configure root logging once at import time: INFO and above, each record
# prefixed with its level name in square brackets.
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Module-level logger used by everything in this file.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class CookieBasedScraper:
    """Capture browser cookies on each run, then scrape reviews via HTTP.

    A SeleniumBase browser is opened once to load the place page, copy its
    cookies and user agent into a ``requests.Session`` and read the place ID
    from the URL.  The browser is then closed and the reviews are paged
    through the ``listugcposts`` endpoint using that session.
    """

    # Endpoint queried for each page of reviews.
    _REVIEWS_ENDPOINT = 'https://www.google.com/maps/rpc/listugcposts'

    # Prefix stripped from the response body before JSON decoding.
    _XSSI_PREFIX = ")]}'"

    # Fixed trailing part of the ``pb`` query parameter.
    # NOTE(review): the ``!1s...`` value embedded here looks like an ID copied
    # from a recorded request - confirm it does not expire.
    _PB_TAIL = (
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )

    # Domains queried individually after the global cookie dump.
    _EXTRA_COOKIE_DOMAINS = ('.google.com', 'www.google.com', '.google.es', 'maps.google.com')

    # Cookie names whose presence is reported as "auth cookies".
    _AUTH_COOKIE_NAMES = (
        'SID', 'HSID', 'SSID', 'APISID', 'SAPISID',
        '__Secure-1PSID', '__Secure-3PSID',
    )

    # Lines logged (at WARNING level) when none of the auth cookies is present.
    _NO_AUTH_WARNING = (
        "\n" + "="*60,
        "⚠️ NO AUTHENTICATION COOKIES FOUND!",
        "="*60,
        "Google Maps API requires you to be logged into Google.",
        "",
        "To fix this:",
        "1. Log into your Google account in Chrome",
        "2. Visit google.com/maps while logged in",
        "3. Then run this scraper again",
        "",
        "Alternatively, use the hybrid scraper (start.py) which",
        "handles authentication automatically and already achieves",
        "95%+ API coverage with 100% parse rate!",
        "="*60 + "\n",
    )

    def __init__(self, url: str, headless: bool = False):
        """Store the target URL and create an empty HTTP session.

        Args:
            url: Google Maps place URL to open in the browser.
            headless: Passed through to SeleniumBase when the browser starts.
        """
        self.url = url
        self.headless = headless
        self.session = requests.Session()
        # Filled in by capture_cookies() from the browser's current URL.
        self.place_id = None
        # Only used for its response parser; constructed with None.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    # ------------------------------------------------------------------
    # Step 1: browser session -> cookies
    # ------------------------------------------------------------------

    def capture_cookies(self) -> bool:
        """Capture cookies from a real browser session.

        Opens the place page, dismisses the consent dialog if one is found,
        opens the reviews tab, reads the place ID from the URL, scrolls once,
        and copies cookies plus the user agent into ``self.session``.

        Returns:
            True when a place ID was found and the session was populated;
            False when the place ID is missing or any step raised.
        """
        log.info("="*60)
        log.info("STEP 1: Capturing cookies from browser session")
        log.info("="*60)

        sb_context = None
        try:
            # The SB context is entered manually here and exited in `finally`.
            log.info("Starting browser...")
            sb_context = SB(uc=True, headless=self.headless)
            sb = sb_context.__enter__()

            self._open_reviews_tab(sb)

            # Keep any previously stored place ID if the URL has no marker.
            place_id = self._extract_place_id(sb.get_current_url())
            if place_id:
                self.place_id = place_id
                log.info(f"✓ Extracted place ID: {self.place_id}")

            if not self.place_id:
                log.error("Could not extract place ID")
                return False

            # CRITICAL: Scroll once to trigger an API call!
            # This causes Google to set the necessary session cookies
            log.info("Triggering API call by scrolling...")
            sb.execute_script("window.scrollBy(0, 500)")
            time.sleep(2)  # Wait for API call to complete
            log.info("✓ API call triggered - session cookies should now be set")

            browser_cookies = self._collect_cookies(sb)

            # The user agent must be read while the driver is still alive.
            user_agent = sb.execute_script("return navigator.userAgent")
            log.info("✓ Captured user agent")

            self._load_session(browser_cookies, user_agent)
            self._report_cookies(browser_cookies)

            log.info("\n✅ Cookie capture successful!")
            log.info(f" Total cookies: {len(browser_cookies)}")
            log.info(f" Place ID: {self.place_id}")
            log.info(" Session ready: Yes\n")
            return True

        except Exception as e:
            # Top-level boundary for the browser phase: log with traceback
            # and report failure to the caller instead of raising.
            log.exception(f"Cookie capture failed: {e}")
            return False

        finally:
            # IMPORTANT: Close browser properly
            if sb_context is not None:
                try:
                    log.info("Closing browser...")
                    sb_context.__exit__(None, None, None)
                    log.info("✓ Browser closed\n")
                except Exception as e:
                    # A failed close may leave a browser process behind, so
                    # make it visible at the default log level.
                    log.warning(f"Error closing browser: {e}")

    def _open_reviews_tab(self, sb) -> None:
        """Load the place URL, dismiss the consent dialog, open the reviews tab."""
        log.info("Opening Google Maps...")
        sb.open(self.url)
        time.sleep(2)

        # Dismiss cookie consent (best effort: the dialog is not always shown).
        try:
            sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
            log.info("✓ Cookie dialog dismissed")
        except Exception as e:
            log.debug(f"No cookie dialog dismissed: {e}")

        # Click reviews tab
        try:
            sb.click('.LRkQ2', timeout=5)
            log.info("✓ Opened reviews tab")
            time.sleep(3)  # Wait for reviews to load
        except Exception as e:
            log.warning(f"Could not click reviews tab: {e}")

    @staticmethod
    def _extract_place_id(current_url: str) -> Optional[str]:
        """Return the text between the first ``!1s`` and the next ``!``.

        Returns None when the marker is absent or the extracted text is empty.
        """
        _, marker, tail = current_url.partition('!1s')
        if not marker:
            return None
        return tail.split('!', 1)[0] or None

    @staticmethod
    def _cookie_key(cookie: dict) -> tuple:
        """Identity of a cookie for de-duplication: (name, domain, path)."""
        return (cookie.get('name'), cookie.get('domain'), cookie.get('path'))

    def _collect_cookies(self, sb) -> List[dict]:
        """Return the browser's cookies as a list of dicts.

        Uses CDP (``Network.getAllCookies`` plus per-domain
        ``Network.getCookies``).  If that raises, falls back to
        ``document.cookie``, which cannot see httpOnly cookies.
        """
        log.info("Capturing cookies via CDP...")
        try:
            dump = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
            cookies = dump.get('cookies', [])
            log.info(f"✓ Captured {len(cookies)} cookies via CDP")

            # De-duplicate on (name, domain, path) so that same-named cookies
            # from different domains are all kept.
            seen = {self._cookie_key(c) for c in cookies}
            for domain in self._EXTRA_COOKIE_DOMAINS:
                # A leading dot is cookie-domain syntax, not a valid URL host.
                lookup_url = f"https://{domain.lstrip('.')}"
                try:
                    scoped = sb.driver.execute_cdp_cmd('Network.getCookies', {'urls': [lookup_url]})
                except Exception as e:
                    # Per-domain lookups are optional extras; skip on failure.
                    log.debug(f"Cookie lookup for {domain} failed: {e}")
                    continue

                extra_cookies = scoped.get('cookies', [])
                if not extra_cookies:
                    continue
                log.info(f" Found {len(extra_cookies)} cookies for {domain}")
                for cookie in extra_cookies:
                    key = self._cookie_key(cookie)
                    if key not in seen:
                        seen.add(key)
                        cookies.append(cookie)

            log.info(f"✓ Total cookies after checking all domains: {len(cookies)}")
            return cookies

        except Exception as e:
            log.warning(f"CDP cookie capture failed: {e}")

        # Fallback to JavaScript (won't get httpOnly cookies)
        cookie_string = sb.execute_script("return document.cookie") or ''
        cookies = []
        for pair in cookie_string.split('; '):
            if '=' not in pair:
                continue
            name, value = pair.split('=', 1)
            cookies.append({
                'name': name,
                'value': value,
                'domain': '.google.com',
                'path': '/',
            })
        log.info(f"✓ Fallback: Captured {len(cookies)} cookies via JS")
        return cookies

    def _load_session(self, browser_cookies: List[dict], user_agent: str) -> None:
        """Copy the captured cookies and request headers into ``self.session``."""
        for cookie in browser_cookies:
            self.session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/'),
            )

        self.session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
            'X-Requested-With': 'XMLHttpRequest',
        })

    def _report_cookies(self, browser_cookies: List[dict]) -> None:
        """Log all cookie names and warn if no auth cookie is in the session."""
        all_cookie_names = [c['name'] for c in browser_cookies]
        log.info(f"Cookie names: {', '.join(all_cookie_names)}")

        found_cookies = [
            name for name in self._AUTH_COOKIE_NAMES
            if name in self.session.cookies
        ]
        log.info(f"✓ Found auth cookies: {', '.join(found_cookies) if found_cookies else 'NONE - this is the problem!'}")

        if found_cookies:
            return

        for line in self._NO_AUTH_WARNING:
            log.warning(line)
        # Continue anyway to show the error
        log.info("Continuing anyway to demonstrate the API error...")

    # ------------------------------------------------------------------
    # Step 2: HTTP pagination
    # ------------------------------------------------------------------

    def _build_pb(self, continuation_token: Optional[str]) -> str:
        """Build the ``pb`` query value for the current place.

        The continuation token, when given, is inserted as ``!2s<token>``
        right after the ``!1i10`` field; everything else is constant.
        """
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return (
            f"!1m6!1s{self.place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
            f"{token_part}{self._PB_TAIL}"
        )

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> "Tuple[List[InterceptedReview], Optional[str]]":
        """Fetch a page of reviews via API using captured cookies.

        Args:
            continuation_token: Token returned by the previous page, or None
                for the first page.

        Returns:
            ``(reviews, next_token)``.  ``next_token`` is ``data[1]`` of the
            decoded response when that element is a string, else None.  On a
            non-200 status, a missing place ID, or any exception the result
            is ``([], None)``.
        """
        if not self.place_id:
            # Without this guard the request would be sent with "!1sNone".
            log.error("Cannot fetch reviews: place ID is not set")
            return [], None

        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': self._build_pb(continuation_token),
        }

        try:
            response = self.session.get(self._REVIEWS_ENDPOINT, params=params, timeout=10)

            if response.status_code != 200:
                log.error(f"API error {response.status_code}")
                log.error(f"Response: {response.text[:500]}")
                log.debug(f"Request URL: {response.url}")
                log.debug(f"Request headers: {dict(self.session.headers)}")
                return [], None

            # Drop the leading guard prefix before decoding the JSON payload.
            body = response.text
            if body.startswith(self._XSSI_PREFIX):
                body = body[len(self._XSSI_PREFIX):].strip()

            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]

            return reviews, next_token

        except Exception as e:
            # Network, decoding and parser errors all end pagination the same
            # way, so they are reported and turned into an empty page.
            log.error(f"API request failed: {e}")
            return [], None

    @staticmethod
    def _review_to_dict(review) -> dict:
        """Convert a parsed review object into the plain dict that is returned."""
        return {
            'review_id': review.review_id,
            'author': review.author,
            'rating': review.rating,
            'text': review.text,
            'date_text': review.date_text,
            'avatar_url': review.avatar_url,
            'profile_url': review.profile_url,
        }

    def scrape_all(self, max_pages: int = 100) -> List[dict]:
        """Capture cookies, then page through the API collecting reviews.

        Args:
            max_pages: Upper bound on the number of API calls.

        Returns:
            De-duplicated reviews as dicts, in the order received.  Empty when
            cookie capture fails or the first page yields nothing.
        """
        # Step 1: Capture cookies from browser
        if not self.capture_cookies():
            log.error("Failed to capture cookies - aborting")
            return []

        # Step 2: Scrape via API
        log.info("="*60)
        log.info("STEP 2: Fast API scraping (no browser needed)")
        log.info("="*60)

        started = time.perf_counter()
        all_reviews = []
        seen_ids = set()
        token = None
        page = 0

        while page < max_pages:
            page += 1

            log.info(f"Fetching page {page}...")
            reviews, token = self.fetch_reviews_page(token)

            if not reviews:
                if page == 1:
                    log.error("No reviews on first page - cookies may have expired or be invalid")
                else:
                    log.info("No more reviews found")
                break

            # De-duplicate on review ID, falling back to author + date text.
            for review in reviews:
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid in seen_ids:
                    continue
                seen_ids.add(rid)
                all_reviews.append(self._review_to_dict(review))

            log.info(f" → {len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                log.info("No continuation token - all reviews fetched")
                break

            # Small delay between requests
            time.sleep(0.2)

        elapsed = time.perf_counter() - started
        # Guard against a zero interval (e.g. nothing was fetched).
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info("\n" + "="*60)
        log.info("✅ SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"API calls: {page}")
        log.info(f"API scraping time: {elapsed:.2f} seconds")
        log.info(f"Speed: {speed:.1f} reviews/second")
        log.info("="*60 + "\n")

        return all_reviews
|
|
|
|
|
|
def main():
    """Scrape a sample place, write the reviews to JSON and log one of them."""
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"

    collected = CookieBasedScraper(place_url, headless=False).scrape_all(max_pages=50)

    # Nothing came back: report it and stop before touching the filesystem.
    if not collected:
        log.error("No reviews scraped!")
        return

    # Persist every review as pretty-printed UTF-8 JSON.
    destination = 'cookie_based_reviews.json'
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(collected, handle, indent=2, ensure_ascii=False)

    log.info(f"💾 Saved {len(collected)} reviews to {destination}")

    # Log the first review as a quick sanity check of the output.
    log.info("\nSample review:")
    first = collected[0]
    log.info(f" Author: {first['author']}")
    log.info(f" Rating: {first['rating']}★")
    log.info(f" Date: {first['date_text']}")
    body = first['text']
    if body:
        log.info(f" Text: {body[:80]}...")
|
|
|
|
|
|
# Run the example scrape only when this file is executed as a script.
if __name__ == "__main__":
    main()
|