Files
whyrating-engine-legacy/start_hybrid_parallel.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

287 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Hybrid Parallel Scraper - Best of both worlds.
Strategy:
1. Open browser and get to reviews page (~15s)
2. Scroll quickly to collect ~5-10 continuation tokens (~5s)
3. Make parallel API calls in browser using JavaScript (~2-3s)
4. Total: ~22-25 seconds for 244 reviews
This approach:
- Uses browser's active session (no auth issues)
- Collects tokens sequentially (required by API)
- Makes parallel calls for remaining pages (fast!)
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents
        (the caller treats it as a mapping and reads ``url`` and
        ``headless`` from it).

    Raises:
        OSError: if ``config.yaml`` cannot be opened.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII values in the config are read the same way on
    # every OS (the script's JSON output is written as UTF-8 as well).
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
# Number of fast scrolls performed while collecting continuation tokens.
_TOKEN_SCROLLS = 8

# Upper bound on how many token pages are requested in one parallel batch.
_MAX_PARALLEL_PAGES = 15

# JavaScript run through ``execute_async_script``. Selenium appends a
# completion callback as the LAST argument and waits until it is invoked, so
# the script must hand its result to that callback instead of using
# ``return`` (a top-level ``await`` is not valid inside the wrapper function).
_PARALLEL_FETCH_SCRIPT = """
const done = arguments[arguments.length - 1];
async function fetchPages(placeId, tokens) {
    const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
    const promises = tokens.map((token, idx) => {
        const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
        const params = new URLSearchParams({
            authuser: '0',
            hl: 'es',
            gl: 'es',
            pb: pb
        });
        return fetch(`${baseUrl}?${params}`)
            .then(r => r.text())
            .then(text => {
                const body = text.startsWith(")]}'") ? text.substring(4) : text;
                return {idx, data: JSON.parse(body)};
            })
            .catch(e => null);
    });
    const settled = await Promise.all(promises);
    return settled.filter(r => r !== null);
}
fetchPages(arguments[0], arguments[1])
    .then(done)
    .catch(() => done([]));
"""


def _review_to_dict(review):
    """Convert a parsed review object into the plain dict that gets saved."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _merge_reviews(all_reviews, parsed):
    """Add each not-yet-seen review from *parsed* to *all_reviews* (keyed by id)."""
    for review in parsed:
        if review.review_id and review.review_id not in all_reviews:
            all_reviews[review.review_id] = _review_to_dict(review)


def _extract_continuation_token(resp):
    """Return the continuation token from one intercepted response, or None.

    The body may start with the ``)]}'`` prefix, which is stripped before JSON
    decoding. The token is taken from index 1 of the decoded list when that
    element is a non-empty string.
    """
    try:
        body = resp.get('body', '')
        if body.startswith(")]}'"):
            body = body[4:]
        data = json.loads(body)
    except (AttributeError, TypeError, ValueError):
        # Malformed or non-JSON response: it simply contributes no token.
        return None
    if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
        return data[1] or None
    return None


def hybrid_parallel_scrape():
    """Hybrid approach: Sequential token collection + Parallel fetch.

    Phases:
        1. Open the configured URL, dismiss the cookie banner, click the
           reviews tab, locate the scrollable pane and read the place id
           from the current URL (the segment following ``!1s``).
        2. Scroll the pane a fixed number of times, merging intercepted
           reviews and collecting continuation tokens from the raw responses.
        3. Request the pages for the collected tokens in parallel from inside
           the browser and merge the reviews they contain.

    The combined reviews are written to ``google_reviews_hybrid.json``.

    Returns:
        list[dict]: one dict per unique review; an empty list when the URL is
        not configured, the pane cannot be found, or no place id can be
        extracted.
    """
    # An empty YAML file makes safe_load return None; treat it as no settings.
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Without this guard the URL slice below raises TypeError on None.
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("HYBRID PARALLEL SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Sequential tokens + Parallel fetch")
    log.info("="*60 + "\n")

    start_time = time.time()
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # PHASE 1: Setup (~15s)
        log.info("Phase 1: Browser setup...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort: the banner is not always present)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(1)
        except Exception as e:
            log.debug(f"Cookie banner not dismissed: {e}")

        # Click reviews tab. Every selector is tried in turn; for each one the
        # first element whose text or aria-label mentions reviews is clicked.
        review_keywords = ['reviews', 'review', 'reseñas']
        for selector in ['.LRkQ2', '.hh2c6', 'button[role="tab"]']:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        # JS click avoids "element not interactable" issues.
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(2)
                        break
            except Exception as e:
                log.debug(f"Tab lookup failed for {selector}: {e}")
                continue
        time.sleep(3)

        # Find pane: first selector that appears within 5 seconds wins.
        pane = None
        for selector in ['div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
                         'div.m6QErb.WNBkOb.XiKgde']:
            try:
                wait = WebDriverWait(driver, 5)
                pane = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                break
            except TimeoutException:
                continue
        if not pane:
            log.error("Could not find pane")
            return []
        time.sleep(2)

        # Extract place ID: the URL segment between '!1s' and the next '!'.
        place_id = None
        current_url = driver.current_url
        if '!1s' in current_url:
            place_id = current_url.split('!1s')[1].split('!')[0]
        if not place_id:
            log.error("Could not extract place ID")
            return []
        log.info(f"✓ Setup complete (place_id: {place_id})\n")

        # PHASE 2: Collect tokens via scrolling (~5s)
        log.info("Phase 2: Collecting continuation tokens...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1)

        # Setup scroll: keep the pane on `window` so the script can reach it.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Collect tokens by scrolling quickly
        tokens = []        # ordered, de-duplicated continuation tokens
        all_reviews = {}   # review_id -> review dict
        for _ in range(_TOKEN_SCROLLS):
            driver.execute_script(scroll_script)
            time.sleep(0.2)  # Very fast scrolling

            # Collect responses
            responses = interceptor.get_intercepted_responses()
            if not responses:
                continue
            _merge_reviews(all_reviews,
                           interceptor.parse_reviews_from_responses(responses))

            # Extract continuation token from raw response
            for resp in responses:
                token = _extract_continuation_token(resp)
                if token and token not in tokens:
                    tokens.append(token)

        log.info(f"✓ Collected {len(tokens)} continuation tokens")
        log.info(f"✓ Got {len(all_reviews)} reviews from scrolling\n")

        # PHASE 3: Parallel fetch remaining pages (~2-3s)
        if tokens:
            log.info("Phase 3: Parallel fetch of remaining pages...")
            try:
                parallel_start = time.time()
                results = driver.execute_async_script(
                    _PARALLEL_FETCH_SCRIPT, place_id,
                    tokens[:_MAX_PARALLEL_PAGES]) or []
                parallel_time = time.time() - parallel_start
                log.info(f"✓ Parallel fetch completed in {parallel_time:.2f}s")
                log.info(f" Received {len(results)} responses")

                # Parse parallel results
                for result in results:
                    if result and 'data' in result:
                        try:
                            parsed = interceptor._parse_listugcposts_response(result['data'])
                            _merge_reviews(all_reviews, parsed)
                        except Exception as e:
                            log.debug(f"Parse error: {e}")
                log.info(f"✓ Total reviews after parallel fetch: {len(all_reviews)}\n")
            except Exception as e:
                # Keep whatever the scrolling phase already collected.
                log.warning(f"Parallel fetch failed: {e}")

        reviews_list = list(all_reviews.values())
        elapsed = time.time() - start_time

        log.info("="*60)
        log.info("✅ HYBRID PARALLEL SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save
        with open('google_reviews_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)
        log.info(f"💾 Saved {len(reviews_list)} reviews to google_reviews_hybrid.json")

        if reviews_list:
            log.info("\n📝 Sample:")
            s = reviews_list[0]
            log.info(f" {s['author']} - {s['rating']}★ - {s['date_text']}")

        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old DOM: ~155s for 244 reviews (1.0x)")
        log.info("Fast scrolling: ~29s for 234 reviews (5.3x)")
        log.info(f"Hybrid parallel: ~{elapsed:.0f}s for {len(reviews_list)} reviews ({155/elapsed:.1f}x)! 🚀")
        log.info("="*60 + "\n")
        return reviews_list
    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"driver.quit() failed: {e}")
if __name__ == '__main__':
    # Script entry point: exit status 0 only when at least one review was
    # scraped; any interruption or failure yields status 1.
    import traceback

    exit_code = 1
    try:
        if hybrid_parallel_scrape():
            exit_code = 0
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
    except Exception as exc:
        log.error(f"Fatal error: {exc}")
        traceback.print_exc()
    sys.exit(exit_code)