Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

305
header_capture_scraper.py Normal file
View File

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
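Header Capture Scraper - capture the reviews API request made by the browser, then replay it.
Opens the place page in a real browser, records the URL of the `listugcposts` request
together with the browser's cookies and user agent, and reuses them in a `requests`
session (with a fixed set of browser-like headers) for fast API scraping.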
"""
import json
import logging
import time
from typing import List, Optional, Tuple
import requests
from seleniumbase import SB
from modules.api_interceptor import GoogleMapsAPIInterceptor, InterceptedReview
# Configure the root logger once at import time: INFO and above, each record
# rendered as "[LEVEL] message".
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by everything in this file.
log = logging.getLogger(__name__)
class HeaderCaptureScraper:
    """Capture a reviews API request in a browser, then replay it over HTTP.

    Workflow (driven by :meth:`scrape_all`):

    1. :meth:`capture_request` opens ``url`` in a SeleniumBase browser, hooks
       ``fetch`` / ``XMLHttpRequest`` to record a request whose URL contains
       ``listugcposts``, and copies the browser's cookies and user agent into
       a ``requests`` session.
    2. :meth:`fetch_reviews_page` calls the same endpoint page by page, using
       the place ID taken from the captured URL.
    """

    # Endpoint replayed by fetch_reviews_page().
    _API_URL = 'https://www.google.com/maps/rpc/listugcposts'

    # Prefix stripped from the response body before JSON decoding.
    _XSSI_PREFIX = ")]}'"

    # Fixed trailing part of the ``pb`` parameter. It embeds a hard-coded
    # ``!1s...`` value -- presumably copied from a recorded request; TODO
    # confirm whether it ever needs refreshing.
    _PB_TAIL = (
        "!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1"
        "!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1"
    )

    # JavaScript injected into the page. It stores the URL and method of the
    # latest matching request in ``window.__capturedRequest``.
    #  * fetch() may receive a string, a URL or a Request object, so the URL is
    #    read from ``Request.url`` when applicable instead of ``toString()``.
    #  * XHR is hooked through ``XMLHttpRequest.prototype.open`` so the
    #    constructor, its static constants and ``instanceof`` stay intact.
    #  * The bookkeeping is wrapped in try/catch so it can never break the
    #    page's own request.
    _CAPTURE_SCRIPT = """
        window.__capturedRequest = null;
        if (!window.__captureHooksInstalled) {
            window.__captureHooksInstalled = true;

            const originalFetch = window.fetch;
            window.fetch = function(...args) {
                try {
                    const input = args[0];
                    const isRequest = (typeof Request !== 'undefined') && (input instanceof Request);
                    const url = isRequest ? input.url : String(input);
                    if (url.includes('listugcposts')) {
                        console.log('[CAPTURE] Intercepted request to:', url);
                        window.__capturedRequest = {
                            url: url,
                            method: (args[1] && args[1].method) || (isRequest ? input.method : 'GET')
                        };
                    }
                } catch (e) {}
                return originalFetch.apply(this, args);
            };

            const originalOpen = XMLHttpRequest.prototype.open;
            XMLHttpRequest.prototype.open = function(method, url, ...rest) {
                try {
                    const urlText = String(url);
                    if (urlText.includes('listugcposts')) {
                        console.log('[CAPTURE] Intercepted XHR:', urlText);
                        window.__capturedRequest = {
                            url: urlText,
                            method: method
                        };
                    }
                } catch (e) {}
                return originalOpen.apply(this, [method, url, ...rest]);
            };
        }
        console.log('[CAPTURE] Request interceptor ready');
    """

    def __init__(self, url: str, headless: bool = False):
        """Store the target page and create the HTTP session used for replay.

        Args:
            url: Page opened in the browser during :meth:`capture_request`.
            headless: Passed through to SeleniumBase's ``SB(headless=...)``.
        """
        self.url = url
        self.headless = headless
        # Dict with 'url' and 'method' once a request has been captured.
        self.captured_request: Optional[dict] = None
        # Value following the first '!1s' in the captured URL's 'pb' parameter.
        self.place_id: Optional[str] = None
        self.session = requests.Session()
        # Only its _parse_listugcposts_response() is used in this class; it is
        # constructed with None as in the original code.
        self.interceptor = GoogleMapsAPIInterceptor(None)

    def capture_request(self) -> bool:
        """Capture the API request URL, cookies and user agent from a browser.

        Returns:
            True when a matching request was recorded, a place ID could be
            extracted from it and the session was populated; False otherwise.
            Failures are logged, never raised.
        """
        log.info("="*60)
        log.info("Capturing request from browser...")
        log.info("="*60)

        captured = False
        try:
            log.info("Starting browser...")
            with SB(uc=True, headless=self.headless) as sb:
                # Errors are handled inside the ``with`` so the browser is
                # always shut down through a normal (exception-free) exit.
                try:
                    captured = self._capture_in_browser(sb)
                except Exception as exc:
                    log.exception(f"Capture failed: {exc}")
                log.info("Closing browser...")
            log.info("✓ Browser closed\n")
        except Exception as exc:
            # Start-up or shutdown failure. A capture that already succeeded
            # is still reported as successful.
            log.exception(f"Browser start-up or shutdown failed: {exc}")
        return captured

    def _capture_in_browser(self, sb) -> bool:
        """Run the capture steps against an already started browser ``sb``."""
        sb.open(self.url)
        time.sleep(2)
        self._open_reviews_panel(sb)

        # Enables the CDP Network domain. Nothing here reads network events;
        # kept from the original flow ahead of Network.getAllCookies.
        sb.driver.execute_cdp_cmd('Network.enable', {})
        log.info("✓ Network monitoring enabled")

        # Install the hooks BEFORE the first scroll so the request triggered
        # by that scroll can be recorded too.
        sb.execute_script(self._CAPTURE_SCRIPT)
        log.info("✓ Request interceptor injected")

        log.info("Scrolling to trigger API request...")
        sb.execute_script("window.scrollBy(0, 800)")
        time.sleep(3)

        log.info("Scrolling to capture request...")
        self.captured_request = self._poll_captured_request(sb)
        if not self.captured_request:
            log.error("Failed to capture request")
            return False

        self.place_id = self._extract_place_id(self.captured_request['url'])
        if not self.place_id:
            # Without a place ID every replayed request would be malformed.
            log.error("Captured request has no place ID in its 'pb' parameter")
            return False

        all_cookies = self._adopt_browser_session(sb)

        log.info(f"\n✅ Request captured successfully!")
        log.info(f"   Place ID: {self.place_id}")
        log.info(f"   Cookies: {len(all_cookies)}")
        log.info(f"   Cookie names: {', '.join([c['name'] for c in all_cookies[:10]])}")
        return True

    @staticmethod
    def _open_reviews_panel(sb) -> None:
        """Best-effort: dismiss the cookie banner and click the reviews entry."""
        try:
            sb.click('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]', timeout=3)
        except Exception as exc:
            # The banner is optional; its absence is not an error.
            log.debug(f"Cookie banner not dismissed: {exc}")

        try:
            sb.click('.LRkQ2', timeout=5)
        except Exception as exc:
            log.debug(f"Reviews element not clicked: {exc}")
        else:
            log.info("✓ Opened reviews")
            time.sleep(2)

    @staticmethod
    def _poll_captured_request(sb, attempts: int = 3) -> Optional[dict]:
        """Return ``window.__capturedRequest``, scrolling up to ``attempts`` times."""
        for attempt in range(attempts + 1):
            captured = sb.execute_script("return window.__capturedRequest")
            if captured:
                log.info("✓ Captured request URL!")
                return captured
            if attempt < attempts:
                sb.execute_script("window.scrollBy(0, 600)")
                time.sleep(2)
        return None

    @staticmethod
    def _extract_place_id(request_url: str) -> Optional[str]:
        """Return the text after the first ``!1s`` in the URL's ``pb`` parameter.

        The query string is decoded first, so percent-encoded ``pb`` values
        are handled. Returns None when there is no ``pb`` parameter, no
        ``!1s`` marker, or the value after the marker is empty.
        """
        import urllib.parse

        query = urllib.parse.urlparse(request_url).query
        pb = urllib.parse.parse_qs(query).get('pb', [''])[0]
        _, marker, remainder = pb.partition('!1s')
        if not marker:
            return None
        return remainder.split('!', 1)[0] or None

    def _adopt_browser_session(self, sb) -> list:
        """Copy the browser's cookies and user agent into ``self.session``.

        Returns:
            The cookie dicts reported by the CDP ``Network.getAllCookies`` call.
        """
        cdp_cookies = sb.driver.execute_cdp_cmd('Network.getAllCookies', {})
        all_cookies = cdp_cookies.get('cookies', [])
        for cookie in all_cookies:
            self.session.cookies.set(
                name=cookie['name'],
                value=cookie['value'],
                domain=cookie.get('domain', '.google.com'),
                path=cookie.get('path', '/'),
            )

        user_agent = sb.execute_script("return navigator.userAgent")
        self.session.headers.update({
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': 'es,es-ES;q=0.9,en;q=0.8',
            'Referer': 'https://www.google.com/maps/',
            'Origin': 'https://www.google.com',
            'X-Requested-With': 'XMLHttpRequest',
        })
        return all_cookies

    @staticmethod
    def _build_pb(place_id: str, continuation_token: Optional[str] = None) -> str:
        """Build the ``pb`` query value for the first page or a continuation page."""
        head = f"!1m6!1s{place_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10"
        token_part = f"!2s{continuation_token}" if continuation_token else ""
        return head + token_part + HeaderCaptureScraper._PB_TAIL

    @staticmethod
    def _strip_xssi_prefix(body: str) -> str:
        """Drop the leading ``)]}'`` guard (and surrounding whitespace) if present."""
        if body.startswith(HeaderCaptureScraper._XSSI_PREFIX):
            return body[len(HeaderCaptureScraper._XSSI_PREFIX):].strip()
        return body

    def fetch_reviews_page(self, continuation_token: Optional[str] = None) -> "Tuple[List[InterceptedReview], Optional[str]]":
        """Fetch one page of reviews using the captured session.

        Args:
            continuation_token: Token returned with the previous page, or
                None / empty for the first page.

        Returns:
            ``(reviews, next_token)``. ``reviews`` is whatever the interceptor's
            ``_parse_listugcposts_response`` returns for the decoded payload;
            ``next_token`` is element 1 of the payload when that is a string,
            else None. On any failure (no place ID, non-200 status, network or
            parsing error) the result is ``([], None)`` and the error is logged.
        """
        if not self.place_id:
            # Avoid sending a request with "!1sNone" in it.
            log.error("No place ID available; run capture_request() first")
            return [], None

        params = {
            'authuser': '0',
            'hl': 'es',
            'gl': 'es',
            'pb': self._build_pb(self.place_id, continuation_token),
        }
        try:
            response = self.session.get(self._API_URL, params=params, timeout=10)
            if response.status_code != 200:
                log.error(f"API error {response.status_code}: {response.text[:200]}")
                return [], None

            data = json.loads(self._strip_xssi_prefix(response.text))
            reviews = self.interceptor._parse_listugcposts_response(data)

            next_token = None
            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
                next_token = data[1]
            return reviews, next_token
        except Exception as e:
            # Deliberately broad: network, JSON and parser failures all end
            # pagination for the caller instead of aborting the whole scrape.
            log.error(f"Request failed: {e}")
            return [], None

    @staticmethod
    def _review_to_dict(review) -> dict:
        """Flatten a parsed review object into a JSON-serialisable dict."""
        return {
            'review_id': review.review_id,
            'author': review.author,
            'rating': review.rating,
            'text': review.text,
            'date_text': review.date_text,
            'avatar_url': review.avatar_url,
        }

    def scrape_all(self, max_pages: int = 50) -> List[dict]:
        """Capture a session, then page through the API collecting reviews.

        Args:
            max_pages: Upper bound on the number of pages requested.

        Returns:
            De-duplicated review dicts in arrival order, or an empty list when
            the capture step fails.
        """
        if not self.capture_request():
            return []

        log.info("="*60)
        log.info("Fast API scraping...")
        log.info("="*60)

        start_time = time.perf_counter()
        all_reviews = []
        seen_ids = set()
        token = None

        for page in range(1, max_pages + 1):
            log.info(f"Page {page}...")
            reviews, token = self.fetch_reviews_page(token)
            if not reviews:
                break

            for review in reviews:
                # Fall back to author + date when the review carries no ID.
                rid = review.review_id or f"{review.author}_{review.date_text}"
                if rid not in seen_ids:
                    seen_ids.add(rid)
                    all_reviews.append(self._review_to_dict(review))

            log.info(f"{len(reviews)} reviews | Total: {len(all_reviews)}")
            if not token:
                break
            # Short pause between consecutive page requests.
            time.sleep(0.2)

        elapsed = time.perf_counter() - start_time
        # Guard against a zero interval on coarse clocks.
        speed = len(all_reviews) / elapsed if elapsed > 0 else 0.0

        log.info(f"\n{'='*60}")
        log.info(f"✅ COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Reviews: {len(all_reviews)}")
        log.info(f"Time: {elapsed:.2f}s")
        log.info(f"Speed: {speed:.1f} reviews/sec")
        log.info(f"{'='*60}\n")
        return all_reviews
def main():
    """Scrape the hard-coded sample place and write its reviews to a JSON file.

    Nothing is written when the scrape returns no reviews.
    """
    place_url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=es&rclk=1"
    output_path = 'header_capture_reviews.json'

    collected = HeaderCaptureScraper(place_url, headless=False).scrape_all()
    if not collected:
        # Empty result: leave any existing output file untouched.
        return

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(collected, out_file, indent=2, ensure_ascii=False)
    log.info(f"Saved to {output_path}")


# Run the sample scrape only when executed as a script, not on import.
if __name__ == '__main__':
    main()