Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (~5.4x faster)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

249
fast_api_scraper.py Normal file
View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
"""
Fast API scraper - Minimal browser usage, maximum API speed.
Strategy:
1. Start browser and navigate to reviews page
2. Capture cookies and user-agent from browser
3. Let one API call happen naturally (to warm up the session)
4. Close browser
5. Use requests library with captured session to make fast API calls
6. Paginate through all reviews without any scrolling
Expected: 10-25x faster than traditional scrolling approach.
"""
import json
import logging
import time
from typing import List, Optional, Tuple
from urllib.parse import unquote

import requests
from seleniumbase import SB

from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
# Configure root logging once for the script: INFO and above, with each
# message prefixed by its level name in square brackets.
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Logger named after this module; used by the scraper and by main().
log = logging.getLogger(__name__)
class FastAPIScraper:
    """Fetch Google Maps reviews over HTTP after a one-off browser bootstrap.

    A browser is opened once to collect cookies, the user agent and the place
    ID. Review pages are then requested from the ``listugcposts`` RPC endpoint
    through a ``requests`` session, without any scrolling.
    """

    # RPC endpoint queried for every page of reviews.
    API_URL = 'https://www.google.com/maps/rpc/listugcposts'
    # Sent when the real user agent cannot be read from the browser.
    FALLBACK_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'

    def __init__(self, url: str):
        """Store the target *url* and create an empty HTTP session.

        Args:
            url: Google Maps place URL to scrape.
        """
        self.url = url
        self.session = requests.Session()
        # Filled in by bootstrap_session(); None until a place ID is found.
        self.place_id = None
        self.interceptor = GoogleMapsAPIInterceptor(None)

    @staticmethod
    def _extract_place_id(url: Optional[str]) -> Optional[str]:
        """Return the value following the first ``!1s`` marker in *url*.

        The value ends at the next ``!`` (or at a ``?`` query string) and is
        percent-decoded. Returns None when *url* is empty or has no marker.
        """
        if not url:
            return None
        _, marker, tail = url.partition('!1s')
        if not marker:
            return None
        raw_id = tail.split('!', 1)[0].split('?', 1)[0]
        # requests re-encodes query parameters, so keep the ID in decoded form
        # to avoid double-encoding a "%3A" that stands for ":".
        return unquote(raw_id) or None

    @staticmethod
    def _build_pb(place_id: str, continuation_token: Optional[str] = None) -> str:
        """Build the ``pb`` query parameter for one ``listugcposts`` request.

        The fixed segments are an opaque template (presumably copied from a
        captured browser request -- TODO confirm); only the place ID and the
        optional ``!2s<token>`` pagination segment vary.
        """
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return (
            f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10{token_part}"
            "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
            "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
        )

    @staticmethod
    def _review_to_dict(review: "InterceptedReview") -> dict:
        """Convert one parsed review object into a JSON-serialisable dict."""
        return {
            'review_id': review.review_id,
            'author': review.author,
            'rating': review.rating,
            'text': review.text,
            'date_text': review.date_text,
            'avatar_url': review.avatar_url,
        }

    def bootstrap_session(self) -> bool:
        """Establish the HTTP session using a short-lived browser.

        Opens the place URL, tries to dismiss the cookie dialog and open the
        reviews tab, then copies the browser's cookies and user agent into
        ``self.session`` and stores the place ID in ``self.place_id``.

        Returns:
            True when the session was populated and a place ID is known;
            False when no place ID could be determined or the browser failed.
        """
        log.info("Bootstrapping session with minimal browser usage...")
        try:
            with SB(uc=True, headless=False) as sb:
                log.info("Opening Google Maps...")
                sb.open(self.url)
                sb.sleep(2)

                # The consent dialog is not always shown, so a failed click is
                # expected. Catch Exception (not a bare except) so that Ctrl-C
                # still interrupts the run.
                try:
                    sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
                except Exception as e:
                    log.debug(f"Cookie dialog not dismissed: {e}")

                # Open the reviews tab; best-effort, bootstrap continues either way.
                try:
                    sb.click('.LRkQ2', timeout=5)
                    log.info("✓ Opened reviews tab")
                    sb.sleep(2)
                except Exception as e:
                    log.warning("Could not click reviews tab")
                    log.debug(f"Reviews tab click error: {e}")

                # Wait a bit to ensure the page is loaded.
                sb.sleep(1)

                # Prefer the URL the browser ended up on, fall back to the URL
                # we were given, and keep any previously known ID otherwise.
                self.place_id = (
                    self._extract_place_id(sb.get_current_url())
                    or self._extract_place_id(self.url)
                    or self.place_id
                )
                if self.place_id:
                    log.info(f"✓ Extracted place ID: {self.place_id}")

                # Read cookies while the browser is still alive.
                try:
                    browser_cookies = sb.driver.get_cookies()
                    log.debug(f"Got {len(browser_cookies)} cookies")
                except Exception as e:
                    log.warning(f"Could not get cookies: {e}")
                    browser_cookies = []

                # Read the user agent while the browser is still alive.
                try:
                    user_agent = sb.execute_script("return navigator.userAgent")
                    log.debug(f"User agent: {user_agent[:50]}...")
                except Exception as e:
                    log.warning(f"Could not get user agent: {e}")
                    user_agent = self.FALLBACK_USER_AGENT

                # Copy each browser cookie into the requests session.
                for cookie in browser_cookies:
                    try:
                        self.session.cookies.set(
                            name=cookie['name'],
                            value=cookie['value'],
                            domain=cookie.get('domain', '.google.com'),
                            path=cookie.get('path', '/')
                        )
                    except Exception as e:
                        log.debug(f"Could not set cookie {cookie.get('name')}: {e}")

                self.session.headers.update({
                    'User-Agent': user_agent,
                    'Accept': '*/*',
                    'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
                    'Referer': 'https://www.google.com/maps/',
                    'Origin': 'https://www.google.com',
                    'X-Requested-With': 'XMLHttpRequest',
                })

                # Without a place ID every API request would be built around
                # the literal "None", so report the bootstrap as failed.
                if not self.place_id:
                    log.error("Bootstrap failed: could not extract place ID from URL")
                    return False

                log.info("✅ Session bootstrapped!")
                log.info(f" Cookies: {len(browser_cookies)}")
                log.info(f" Place ID: {self.place_id}")

                # Let the browser stay open for a moment before it is closed.
                sb.sleep(1)
                return True
        except Exception as e:
            # log.exception records the traceback through the logger instead
            # of printing it straight to stderr.
            log.exception(f"Bootstrap failed: {e}")
            return False

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> Tuple[List["InterceptedReview"], Optional[str]]:
        """Fetch one page of reviews via the API.

        Args:
            continuation_token: Token returned with the previous page, or None
                for the first page.

        Returns:
            ``(reviews, next_token)``. ``reviews`` is whatever the interceptor
            parses from the response; ``next_token`` is ``data[1]`` of the
            decoded payload when that is a string, else None. On a missing
            place ID, a non-200 status or any error, returns ``([], None)``.
        """
        if not self.place_id:
            log.error("Cannot fetch reviews: place ID is not set")
            return [], None

        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': self._build_pb(self.place_id, continuation_token),
        }
        try:
            response = self.session.get(self.API_URL, params=params, timeout=10)
            if response.status_code != 200:
                log.error(f"API error {response.status_code}")
                log.error(f"Response: {response.text[:300]}")
                return [], None

            body = response.text
            # Drop the ")]}'" prefix that precedes the JSON payload, if present.
            if body.startswith(")]}'"):
                body = body[4:].strip()
            data = json.loads(body)
            reviews = self.interceptor._parse_listugcposts_response(data)

            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
            return reviews, next_token
        except Exception as e:
            # Deliberately broad: network, JSON and parser errors all end the
            # pagination gracefully instead of crashing the scrape.
            log.error(f"Request failed: {e}")
            return [], None

    def scrape_all(self, max_pages: int = 100) -> List[dict]:
        """Bootstrap the session, then page through all reviews.

        Args:
            max_pages: Upper bound on the number of API pages requested.

        Returns:
            De-duplicated reviews as dicts with the keys ``review_id``,
            ``author``, ``rating``, ``text``, ``date_text`` and ``avatar_url``.
            Empty when the bootstrap fails.
        """
        if not self.bootstrap_session():
            return []

        banner = "=" * 60
        log.info("\n" + banner)
        log.info("STARTING FAST API SCRAPING")
        log.info(banner + "\n")

        start_time = time.perf_counter()
        all_reviews = []
        seen_ids = set()
        seen_tokens = set()
        token = None
        page = 0
        while page < max_pages:
            page += 1
            log.info(f"Fetching page {page}...")
            reviews, token = self.fetch_reviews_page(token)
            if not reviews:
                log.info("No more reviews")
                break

            for review in reviews:
                # Reviews without an ID are keyed by author and date text.
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid in seen_ids:
                    continue
                seen_ids.add(rid)
                all_reviews.append(self._review_to_dict(review))
            log.info(f"{len(reviews)} reviews | Total: {len(all_reviews)}")

            if not token:
                break
            if token in seen_tokens:
                # The same token would return the same page again; stop rather
                # than refetching it until max_pages is reached.
                log.warning("Continuation token repeated; stopping pagination")
                break
            seen_tokens.add(token)
            time.sleep(0.2)  # Small delay between requests

        elapsed = time.perf_counter() - start_time
        # Guard against a zero interval on coarse clocks.
        rate = len(all_reviews) / elapsed if elapsed > 0 else 0.0
        log.info("\n" + banner)
        log.info("✅ FAST API SCRAPING COMPLETED!")
        log.info(banner)
        log.info(f"Reviews: {len(all_reviews)}")
        log.info(f"Pages: {page}")
        log.info(f"Time: {elapsed:.2f} seconds")
        log.info(f"Speed: {rate:.1f} reviews/sec")
        log.info(banner + "\n")
        return all_reviews
def main():
    """Scrape the sample place's reviews and save them as JSON.

    Runs the scraper against a fixed Google Maps URL (at most 50 API pages)
    and writes the result to ``fast_api_reviews.json`` in the working
    directory.
    """
    target_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
    collected = FastAPIScraper(target_url).scrape_all(max_pages=50)

    # Pretty-printed UTF-8 output; non-ASCII characters are written as-is.
    with open('fast_api_reviews.json', 'w', encoding='utf-8') as out_file:
        json.dump(collected, out_file, indent=2, ensure_ascii=False)
    log.info("Saved to fast_api_reviews.json")


# Run the scrape only when executed as a script, not when imported.
if __name__ == '__main__':
    main()