Files
whyrating-engine-legacy/start_parallel_v2.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

320 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Parallel API Scraper V2 - Use browser's fetch API for parallel calls.
Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Trigger initial API call to get place ID and pattern
3. Use JavaScript fetch API to make 25 parallel calls (~3-5 seconds)
4. Collect all results at once
Expected time: ~20-25 seconds for 244 reviews
Speed improvement: ~6-7x faster!
"""
import sys
import yaml
import logging
import time
import json
from pathlib import Path
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Route INFO-and-above records through the root logger using a compact
# "[LEVEL] message" layout (no timestamps, no logger names).
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Module-wide logger used by every function in this script.
log = logging.getLogger(__name__)
def load_config() -> dict:
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document. An empty (or comment-only) file yields an
        empty dict instead of ``None`` so callers can use ``.get`` directly.

    Raises:
        OSError: If ``config.yaml`` cannot be opened (e.g. it does not exist).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalise to {}.
        return yaml.safe_load(f) or {}
# JavaScript run through ``execute_async_script``.
#
# Two constraints shape this script:
#   * Selenium wraps the source in a plain (non-async) function and appends a
#     completion callback as the LAST argument. The script must call that
#     callback; a top-level ``return await ...`` never completes the call.
#   * Each continuation token is only known once the previous page has been
#     received, so pages are walked one after another inside the browser.
#
# On success the callback receives a list of ``{index, data}`` objects; on an
# unexpected failure it receives ``{error: "<message>"}``.
_FETCH_REVIEW_PAGES_SCRIPT = """
const placeId = arguments[0];
const numPages = arguments[1];
const done = arguments[arguments.length - 1];

async function fetchReviewPages(placeId, numPages) {
    const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';
    // The pb parameter is identical for every page except for the optional
    // "!2s<token>" continuation segment between head and tail.
    const pbHead = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10`;
    const pbTail = '!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1'
        + '!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1';
    const results = [];
    let token = null;

    for (let i = 0; i < numPages; i++) {
        const pb = token ? `${pbHead}!2s${token}${pbTail}` : `${pbHead}${pbTail}`;
        const params = new URLSearchParams({
            authuser: '0',
            hl: 'es',
            gl: 'es',
            pb: pb
        });
        try {
            const response = await fetch(`${baseUrl}?${params}`);
            if (!response.ok) {
                throw new Error('HTTP ' + response.status);
            }
            const text = await response.text();
            // Strip the anti-JSON-hijacking prefix when present.
            const body = text.startsWith(")]}'") ? text.substring(4) : text;
            const data = JSON.parse(body);
            results.push({index: i, data: data});
            // data[1] carries the token for the next page; stop when absent
            // or empty (an empty token would re-request the first page).
            if (data && data.length > 1 && typeof data[1] === 'string' && data[1].length > 0) {
                token = data[1];
            } else {
                break;
            }
        } catch (e) {
            console.error('[Parallel Fetch] Error fetching page', i, e);
            break;  // Keep the pages already collected.
        }
    }
    return results;
}

fetchReviewPages(placeId, numPages)
    .then(done)
    .catch(e => done({error: String(e)}));
"""


def _dismiss_cookie_dialog(driver):
    """Click the first cookie-consent button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
        )
        if cookie_btns:
            cookie_btns[0].click()
            log.info("✓ Cookie dialog dismissed")
            time.sleep(1)
    except Exception as e:
        # A missing or unclickable dialog must not abort the scrape.
        log.debug(f"Cookie dialog not dismissed: {e}")


def _click_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews.

    Returns True when a tab was clicked, False otherwise.
    """
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña', 'opiniones']
    for selector in ['.LRkQ2', '.hh2c6', '[data-tab-index="1"]', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria_label = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria_label for kw in review_keywords):
                    # JS click avoids "element not interactable" on overlays.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(2)
                    log.info("✓ Reviews tab clicked")
                    return True
        except Exception as e:
            # Fall through to the next candidate selector.
            log.debug(f"Selector {selector} failed: {e}")
    return False


def _find_reviews_pane(driver):
    """Return the first reviews-pane element found, or None if none appears."""
    pane_selectors = [
        'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
        'div.m6QErb.WNBkOb.XiKgde',
    ]
    for selector in pane_selectors:
        try:
            pane = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except TimeoutException:
            continue
        log.info(f"✓ Found reviews pane with: {selector}")
        return pane
    return None


def _extract_place_id(current_url):
    """Return the text between the first '!1s' and the next '!', or None."""
    _, marker, tail = current_url.partition('!1s')
    if not marker:
        return None
    return tail.split('!')[0] or None


def _collect_unique_reviews(results):
    """Parse API responses into review dicts, de-duplicated by review_id."""
    interceptor = GoogleMapsAPIInterceptor(None)
    all_reviews = {}
    for result in results:
        if not (result and 'data' in result):
            continue
        try:
            parsed = interceptor._parse_listugcposts_response(result['data'])
            for review in parsed:
                if review.review_id and review.review_id not in all_reviews:
                    all_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
        except Exception as e:
            # Logged at WARNING so dropped pages are visible at INFO level.
            log.warning(f"Error parsing response: {e}")
    return list(all_reviews.values())


def parallel_scrape(max_pages=25):
    """Scrape Google Maps reviews through the browser's own fetch API.

    Opens the URL from ``config.yaml`` in a SeleniumBase driver, opens the
    reviews tab, extracts the place ID from the resulting URL, then asks the
    page to fetch up to ``max_pages`` pages of the ``listugcposts`` endpoint
    from inside the browser session. Pages are requested one after another
    because each request needs the continuation token returned by the
    previous one. Parsed reviews are de-duplicated by ``review_id`` and
    written to ``google_reviews_parallel.json``.

    Args:
        max_pages: Upper bound on the number of API pages requested.

    Returns:
        A list of review dicts (``review_id``, ``author``, ``rating``,
        ``text``, ``date_text``, ``avatar_url``, ``profile_url``). An empty
        list is returned when the URL is not configured, the reviews pane or
        place ID cannot be found, or the in-browser fetch fails.
    """
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Fail early with a clear message instead of a TypeError on url[:80].
        log.error("No 'url' found in config.yaml")
        return []

    log.info("="*60)
    log.info("PARALLEL API SCRAPER V2")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Parallel browser fetch calls")
    log.info("="*60 + "\n")

    start_time = time.time()
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate and setup
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)
        _dismiss_cookie_dialog(driver)

        # Step 2: Open the reviews tab
        log.info("Step 2: Opening reviews tab...")
        if not _click_reviews_tab(driver):
            log.warning("Reviews tab not found; continuing with the current view")
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Step 3: Make sure the reviews pane is present
        log.info("Step 3: Finding reviews pane...")
        if _find_reviews_pane(driver) is None:
            log.error("Could not find reviews pane")
            return []
        time.sleep(2)  # Let the initial reviews render.

        place_id = _extract_place_id(driver.current_url)
        if not place_id:
            log.error("Could not extract place ID from URL")
            return []
        log.info(f"✓ Extracted place ID: {place_id}")

        # Step 4: Fetch review pages from inside the browser session
        log.info("\n" + "="*60)
        log.info("Step 4: Making parallel API calls via browser fetch")
        log.info("="*60)
        log.info(f"Fetching up to {max_pages} pages in parallel...")
        api_start = time.time()
        try:
            # Many chained requests can exceed the default script timeout.
            driver.set_script_timeout(60)
            results = driver.execute_async_script(
                _FETCH_REVIEW_PAGES_SCRIPT, place_id, max_pages
            )
        except Exception as e:
            log.error(f"Parallel fetch failed: {e}")
            return []
        if isinstance(results, dict):
            # The script reports unexpected failures as {error: "..."}.
            log.error(f"Parallel fetch failed: {results.get('error')}")
            return []
        results = results or []
        api_elapsed = time.time() - api_start
        log.info(f"✓ Parallel fetch completed in {api_elapsed:.2f} seconds")
        log.info(f" Received {len(results)} API responses")

        # Step 5: Parse results
        log.info("\nStep 5: Parsing reviews from API responses...")
        reviews_list = _collect_unique_reviews(results)

        elapsed = time.time() - start_time
        log.info(f"\n{'='*60}")
        log.info("✅ PARALLEL SCRAPING COMPLETED!")
        log.info(f"{'='*60}")
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"API responses: {len(results)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f" - Setup: {api_start - start_time:.2f}s")
        log.info(f" - Parallel API: {api_elapsed:.2f}s")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info(f"{'='*60}\n")

        # Save results
        output_file = 'google_reviews_parallel.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)
        log.info(f"💾 Saved {len(reviews_list)} reviews to {output_file}")

        # Show sample
        if reviews_list:
            log.info("\n📝 Sample review:")
            sample = reviews_list[0]
            log.info(f" Author: {sample['author']}")
            log.info(f" Rating: {sample['rating']}")
            log.info(f" Date: {sample['date_text']}")
            if sample['text']:
                log.info(f" Text: {sample['text'][:80]}...")

        # Stats comparison
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old DOM scraping: ~155 seconds for 244 reviews (1.0x)")
        log.info("Fast API scrolling: ~43 seconds for 234 reviews (3.6x faster)")
        log.info(f"Parallel browser fetch: ~{elapsed:.0f} seconds for {len(reviews_list)} reviews ({155/elapsed:.1f}x faster!) 🚀")
        log.info("="*60 + "\n")
        return reviews_list
    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"driver.quit() failed: {e}")
if __name__ == '__main__':
    import traceback

    # Exit status: 0 only when at least one review was scraped; 1 for an
    # empty result, a user interrupt, or any unexpected error.
    exit_code = 1
    try:
        if parallel_scrape():
            exit_code = 0
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
    except Exception as e:
        log.error(f"Fatal error: {e}")
        traceback.print_exc()
    sys.exit(exit_code)