Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
286
start_hybrid_parallel.py
Normal file
286
start_hybrid_parallel.py
Normal file
@@ -0,0 +1,286 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Hybrid Parallel Scraper - Best of both worlds.
|
||||
|
||||
Strategy:
|
||||
1. Open browser and get to reviews page (~15s)
|
||||
2. Scroll quickly to collect ~5-10 continuation tokens (~5s)
|
||||
3. Make parallel API calls in browser using JavaScript (~2-3s)
|
||||
4. Total: ~22-25 seconds for 244 reviews
|
||||
|
||||
This approach:
|
||||
- Uses browser's active session (no auth issues)
|
||||
- Collects tokens sequentially (required by API)
|
||||
- Makes parallel calls for remaining pages (fast!)
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file contains no
        document (``yaml.safe_load`` returns ``None`` in that case, which
        would otherwise break callers that immediately call ``.get``).

    Raises:
        OSError: if ``config.yaml`` cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return {} if config is None else config
|
||||
|
||||
|
||||
# Attributes copied from each parsed review object into the JSON output,
# in output key order.
_REVIEW_FIELDS = (
    'review_id', 'author', 'rating', 'text',
    'date_text', 'avatar_url', 'profile_url',
)

# Substrings that identify the reviews tab by visible text or aria-label.
_REVIEW_KEYWORDS = ('reviews', 'review', 'reseñas')

# CSS selectors tried, in order, to locate the reviews tab.
_TAB_SELECTORS = ('.LRkQ2', '.hh2c6', 'button[role="tab"]')

# CSS selectors tried, in order, to locate the scrollable reviews pane.
_PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.WNBkOb.XiKgde',
)

# Script for ``execute_async_script``. Selenium runs the text as the body of
# a plain (non-async) function and waits until the callback it appends as the
# LAST argument is invoked, so the result is handed to ``done`` instead of
# being returned with ``await`` (which is a syntax error in that context).
_PARALLEL_FETCH_SCRIPT = """
const done = arguments[arguments.length - 1];

async function fetchPages(placeId, tokens) {
    const baseUrl = 'https://www.google.com/maps/rpc/listugcposts';

    const promises = tokens.map((token, idx) => {
        const pb = `!1m6!1s${placeId}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${token}!5m2!1sByJsaaTKLK-bi-gPiqKAiQE!7e81!8m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!11m4!1e3!2e1!6m1!1i2!13m1!1e1`;
        const params = new URLSearchParams({
            authuser: '0',
            hl: 'es',
            gl: 'es',
            pb: pb
        });

        return fetch(`${baseUrl}?${params}`)
            .then(r => r.text())
            .then(text => {
                const body = text.startsWith(")]}'") ? text.substring(4) : text;
                return {idx, data: JSON.parse(body)};
            })
            .catch(e => null);
    });

    const settled = await Promise.all(promises);
    return settled.filter(r => r !== null);
}

fetchPages(arguments[0], arguments[1]).then(done, () => done([]));
"""


def _merge_reviews(all_reviews, parsed):
    """Add reviews from *parsed* whose ``review_id`` is not yet in *all_reviews*."""
    for review in parsed:
        if review.review_id and review.review_id not in all_reviews:
            all_reviews[review.review_id] = {
                field: getattr(review, field) for field in _REVIEW_FIELDS
            }


def _extract_place_id(current_url):
    """Return the URL segment between the first ``!1s`` and the next ``!``, or None."""
    _, marker, tail = current_url.partition('!1s')
    if not marker:
        return None
    return tail.split('!')[0] or None


def _extract_continuation_token(body):
    """Return ``data[1]`` of the JSON list in *body* when it is a non-empty string."""
    # Responses may carry a ")]}'" prefix that must be removed before decoding.
    if body.startswith(")]}'"):
        body = body[4:]
    data = json.loads(body)
    if isinstance(data, list) and len(data) > 1 and isinstance(data[1], str):
        return data[1] or None
    return None


def _open_reviews_tab(driver):
    """Click the first tab that mentions reviews; return True if one was clicked."""
    for selector in _TAB_SELECTORS:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in _REVIEW_KEYWORDS):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(2)
                    # Stop at the first hit: trying the remaining selectors
                    # would click the same tab again and waste 2s each time.
                    return True
        except Exception as exc:
            # Best effort: fall through to the next selector.
            log.debug("Tab selector %r failed: %s", selector, exc)
    return False


def _find_reviews_pane(driver):
    """Return the first pane element matched by ``_PANE_SELECTORS``, or None."""
    for selector in _PANE_SELECTORS:
        try:
            return WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        except Exception as exc:
            # Typically a timeout for a selector that does not match this layout.
            log.debug("Pane selector %r failed: %s", selector, exc)
    return None


def hybrid_parallel_scrape():
    """Scrape reviews by scrolling for tokens, then fetching pages in parallel.

    Reads ``url`` and ``headless`` from the mapping returned by
    ``load_config()``, drives a browser to the reviews pane, harvests reviews
    and continuation tokens from intercepted responses while scrolling, then
    requests the token pages concurrently from inside the page. The
    de-duplicated reviews are written to ``google_reviews_hybrid.json``.

    Returns:
        list[dict]: one dict per unique review (keys listed in
        ``_REVIEW_FIELDS``). An empty list is returned when no URL is
        configured, the reviews pane cannot be found, or no place ID can be
        read from the browser URL.
    """
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)

    if not url:
        # Fail before launching a browser; slicing a missing URL would raise.
        log.error("No 'url' configured in config.yaml")
        return []

    log.info("="*60)
    log.info("HYBRID PARALLEL SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: Sequential tokens + Parallel fetch")
    log.info("="*60 + "\n")

    start_time = time.time()
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # PHASE 1: Setup (~15s)
        log.info("Phase 1: Browser setup...")
        driver.get(url)
        time.sleep(2)

        # Dismiss the cookie banner if one is shown (best effort).
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(1)
        except Exception as exc:
            log.debug("Cookie banner not dismissed: %s", exc)

        _open_reviews_tab(driver)
        time.sleep(3)

        pane = _find_reviews_pane(driver)
        if not pane:
            log.error("Could not find pane")
            return []

        time.sleep(2)

        place_id = _extract_place_id(driver.current_url)
        if not place_id:
            log.error("Could not extract place ID")
            return []

        log.info(f"✓ Setup complete (place_id: {place_id})\n")

        # PHASE 2: Collect tokens via scrolling (~5s)
        log.info("Phase 2: Collecting continuation tokens...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1)

        # Keep a page-side reference to the pane so each scroll is one short script.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        tokens = []
        all_reviews = {}

        for _ in range(8):  # 8 scrolls to get ~8 tokens
            driver.execute_script(scroll_script)
            time.sleep(0.2)  # Very fast scrolling

            responses = interceptor.get_intercepted_responses()
            if not responses:
                continue

            _merge_reviews(all_reviews,
                           interceptor.parse_reviews_from_responses(responses))

            # Extract continuation tokens from the raw response bodies.
            for resp in responses:
                try:
                    token = _extract_continuation_token(resp.get('body', ''))
                except Exception as exc:
                    # Not every intercepted body is a token-bearing JSON list.
                    log.debug("Skipping response without token: %s", exc)
                    continue
                if token and token not in tokens:
                    tokens.append(token)

        log.info(f"✓ Collected {len(tokens)} continuation tokens")
        log.info(f"✓ Got {len(all_reviews)} reviews from scrolling\n")

        # PHASE 3: Parallel fetch remaining pages (~2-3s)
        if tokens:
            log.info("Phase 3: Parallel fetch of remaining pages...")

            try:
                parallel_start = time.time()
                # Limit to 15 parallel requests.
                results = driver.execute_async_script(
                    _PARALLEL_FETCH_SCRIPT, place_id, tokens[:15]) or []
                parallel_time = time.time() - parallel_start

                log.info(f"✓ Parallel fetch completed in {parallel_time:.2f}s")
                log.info(f"  Received {len(results)} responses")

                # Parse parallel results
                for result in results:
                    if result and 'data' in result:
                        try:
                            parsed = interceptor._parse_listugcposts_response(result['data'])
                            _merge_reviews(all_reviews, parsed)
                        except Exception as e:
                            log.debug(f"Parse error: {e}")

                log.info(f"✓ Total reviews after parallel fetch: {len(all_reviews)}\n")

            except Exception as e:
                # Reviews gathered while scrolling are still saved below.
                log.warning(f"Parallel fetch failed: {e}")

        reviews_list = list(all_reviews.values())
        elapsed = time.time() - start_time

        log.info("="*60)
        log.info("✅ HYBRID PARALLEL SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(reviews_list)}")
        log.info(f"Total time: {elapsed:.2f} seconds")
        log.info(f"Speed: {len(reviews_list)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save
        with open('google_reviews_hybrid.json', 'w', encoding='utf-8') as f:
            json.dump(reviews_list, f, indent=2, ensure_ascii=False)

        log.info(f"💾 Saved {len(reviews_list)} reviews to google_reviews_hybrid.json")

        if reviews_list:
            log.info("\n📝 Sample:")
            s = reviews_list[0]
            log.info(f"  {s['author']} - {s['rating']}★ - {s['date_text']}")

        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old DOM:          ~155s for 244 reviews (1.0x)")
        log.info("Fast scrolling:   ~29s for 234 reviews (5.3x)")
        log.info(f"Hybrid parallel:  ~{elapsed:.0f}s for {len(reviews_list)} reviews ({155/elapsed:.1f}x)! 🚀")
        log.info("="*60 + "\n")

        return reviews_list

    finally:
        # Always release the browser; a failure while quitting must not mask
        # the scrape's own result or exception.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: exit status is 0 only when at least one review was
    # scraped; an interrupt or any unexpected error exits with status 1.
    exit_code = 1
    try:
        exit_code = 0 if hybrid_parallel_scrape() else 1
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
    except Exception as exc:
        log.error(f"Fatal error: {exc}")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user