Files
whyrating-engine-legacy/start_parallel_hybrid.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

351 lines
13 KiB
Python

#!/usr/bin/env python3
"""
PARALLEL HYBRID Scraper - Collects API + DOM simultaneously while scrolling.
Strategy:
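1. During scrolling, collect API responses on every scroll; DOM elements are parsed only during the last scrolls (once most reviews have arrived via the API) and in one final sweep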
2. Deduplicate at the end
3. Should get all 244 reviews in ~20-25s (vs 34s sequential)
Optimization: No full separate DOM parsing phase - DOM parsing is deferred to the last scrolls plus one quick final sweep!
"""
import hashlib
import json
import logging
import sys
import time

import yaml
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import Driver

from modules.api_interceptor import GoogleMapsAPIInterceptor
# The root configuration only lets WARNING and above through; this module's
# own logger is then lowered to INFO so its informational messages are
# still emitted.
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.WARNING,
)
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config(path='config.yaml'):
    """Load the scraper configuration from a YAML file.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml`` in the
            current working directory, which is what existing callers rely on.

    Returns:
        The parsed YAML document. An empty file (which ``yaml.safe_load``
        parses to ``None``) is returned as an empty dict so callers can use
        ``.get`` unconditionally.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit UTF-8: without it, open() uses the platform's default
    # encoding, which mis-decodes non-ASCII values on non-UTF-8 systems.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def _find_text(elem, selector):
    """Return the text of the first descendant matching *selector*, or None."""
    try:
        return elem.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        # Best effort: a missing or stale node just means the field is absent.
        return None


def _find_attribute(elem, selector, attribute):
    """Return *attribute* of the first descendant matching *selector*, or None."""
    try:
        return elem.find_element(By.CSS_SELECTOR, selector).get_attribute(attribute)
    except Exception:
        # Best effort: a missing or stale node just means the field is absent.
        return None


def _parse_rating(label):
    """Parse the leading number of a rating aria-label into a float.

    Returns None when the label is missing, empty, or does not start with a
    number. A decimal comma (e.g. ``"4,5"``) is accepted as well as a point.
    """
    if not label:
        return None
    parts = label.split()
    if not parts:
        return None
    try:
        return float(parts[0].replace(',', '.'))
    except ValueError:
        return None


def parse_dom_review_element(elem):
    """Parse a single review element from DOM.

    Args:
        elem: A review container element exposing Selenium's
            ``find_element(by, selector)`` API.

    Returns:
        A dict with the keys ``author``, ``rating``, ``text``, ``date_text``,
        ``avatar_url``, ``profile_url`` and ``review_id`` (fields that cannot
        be read are ``None``), or ``None`` when no author name is found.
        Lookup failures never propagate to the caller.
    """
    author = _find_text(elem, 'div.d4r55')
    if not author:
        # Without an author there is nothing to identify the review by.
        return None

    review_data = {
        'author': author,
        # Always present (None when unparseable) so consumers can index it.
        'rating': _parse_rating(
            _find_attribute(elem, 'span.kvMYJc', 'aria-label')),
        'text': _find_text(elem, 'span.wiI7pd'),
        'date_text': _find_text(elem, 'span.rsqaWe'),
        'avatar_url': _find_attribute(elem, 'img.NBa7we', 'src'),
        # NOTE(review): this stores the button's `data-review-id` attribute
        # under the key 'profile_url'; the key name is kept so DOM records
        # have the same shape as before - confirm the intended attribute.
        'profile_url': _find_attribute(elem, 'button.WEBjve', 'data-review-id'),
    }

    # Generate ID from author + date + rating. A content digest is used
    # instead of the built-in hash(), whose value for strings changes between
    # interpreter runs, so the same review keeps the same ID across runs.
    # The separator keeps different field splits from colliding.
    fingerprint = '|'.join(
        str(review_data[key]) for key in ('author', 'date_text', 'rating'))
    digest = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()[:16]
    review_data['review_id'] = f"dom_{digest}"
    return review_data
# CSS selectors tried in order to locate the scrollable reviews pane.
_PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.WNBkOb.XiKgde',
)
# Selectors and lower-cased keywords used to find the reviews tab.
_TAB_SELECTORS = ('.LRkQ2', 'button[role="tab"]')
_REVIEW_KEYWORDS = ('reviews', 'review', 'reseñas', 'reseña')
# Selector for one review card, and how many cards a DOM sweep inspects.
_REVIEW_CARD_SELECTOR = 'div.jftiEf.fontBodyMedium'
_MAX_DOM_CARDS = 250
# Scroll loop tuning: DOM parsing only starts after `_DOM_PARSE_START`
# scrolls and once at least `_DOM_MIN_API_REVIEWS` API reviews are in.
_MAX_SCROLLS = 35
_DOM_PARSE_START = 25
_DOM_MIN_API_REVIEWS = 220
# Review count the summary compares against (the module docstring's target).
_EXPECTED_TOTAL = 244
# NOTE(review): presumably the runtime in seconds of an earlier scraper used
# as the "speedup" baseline; its origin is not shown in this file - confirm.
_BASELINE_SECONDS = 155
_OUTPUT_PATH = 'google_reviews_parallel_hybrid.json'


def _dismiss_cookie_banner(driver):
    """Click the first Accept/Aceptar cookie button if one is present."""
    try:
        buttons = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if buttons:
            buttons[0].click()
            time.sleep(0.4)
    except Exception:
        # Best effort: the banner is optional, so failures are not fatal.
        log.debug("Could not dismiss cookie banner", exc_info=True)


def _open_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews.

    Returns True as soon as one tab has been clicked, so later fallback
    selectors cannot click a tab a second time; False if nothing matched.
    """
    for selector in _TAB_SELECTORS:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in _REVIEW_KEYWORDS):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    return True
        except Exception:
            # Fall through to the next selector, as a fallback.
            log.debug("Tab lookup failed for %r", selector, exc_info=True)
    return False


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None if no selector matches."""
    wait = WebDriverWait(driver, 3)
    for selector in _PANE_SELECTORS:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _collect_api_reviews(interceptor, api_reviews):
    """Add newly intercepted reviews to *api_reviews*, keyed by review_id."""
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = {
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                }
    except Exception:
        # One failed poll must not abort the scroll loop; the next poll or
        # the final sweep gets another chance.
        log.debug("API collection failed", exc_info=True)


def _collect_dom_reviews(driver, dom_reviews):
    """Add review cards not yet seen to *dom_reviews*.

    Cards are keyed by ``(author, date_text[:20])``; the full parse only runs
    for keys that are not in *dom_reviews* yet.
    """
    try:
        cards = driver.find_elements(By.CSS_SELECTOR, _REVIEW_CARD_SELECTOR)
    except Exception:
        log.debug("DOM sweep failed", exc_info=True)
        return
    for card in cards[:_MAX_DOM_CARDS]:
        try:
            # Quick parse - just the two fields that form the dedup key.
            author = card.find_element(By.CSS_SELECTOR, 'div.d4r55').text
            date_text = card.find_element(By.CSS_SELECTOR, 'span.rsqaWe').text
            if not (author and date_text):
                continue
            dom_key = (author, date_text[:20])
            if dom_key in dom_reviews:
                continue
            dom_review = parse_dom_review_element(card)
            if dom_review:
                dom_reviews[dom_key] = dom_review
        except Exception:
            # Cards lacking these nodes (or gone stale) are skipped.
            continue


def _merge_reviews(api_reviews, dom_reviews):
    """Combine API and DOM reviews, API first.

    A DOM review is added only if its ``(author, date_text[:20])`` key matches
    no API review and its review_id is not already present.

    Returns:
        ``(all_reviews, dom_added)``: the merged list and the number of DOM
        reviews that were added.
    """
    api_keys = {
        (review.get('author', ''), (review.get('date_text', '') or '')[:20])
        for review in api_reviews.values()
    }
    merged = dict(api_reviews)
    dom_added = 0
    for dom_key, dom_review in dom_reviews.items():
        review_id = dom_review.get('review_id')
        if dom_key in api_keys or not review_id or review_id in merged:
            continue
        merged[review_id] = dom_review
        dom_added += 1
    return list(merged.values()), dom_added


def _print_summary(all_reviews, api_count, dom_added, elapsed):
    """Print the end-of-run statistics banner."""
    total = len(all_reviews)
    print(f"\n{'='*50}")
    print("✅ COMPLETED!")
    print(f"Reviews: {total}/{_EXPECTED_TOTAL} ({total/_EXPECTED_TOTAL*100:.1f}%)")
    print(f" - API: {api_count}")
    print(f" - DOM: {dom_added} unique")
    print(f"Time: {elapsed:.2f}s")
    print(f"Speed: {total/elapsed:.1f} reviews/sec")
    print(f"Speedup: {_BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")
    print(f"{'='*50}")
    if total >= _EXPECTED_TOTAL:
        print(f"🎯 Got ALL {_EXPECTED_TOTAL} reviews!")
    elif total >= _EXPECTED_TOTAL - 4:
        print(f"⚠️ Missing {_EXPECTED_TOTAL - total} reviews")
    print()


def parallel_hybrid_scrape():
    """Collect API + DOM simultaneously during scrolling.

    Reads ``url`` and ``headless`` from the configuration returned by
    ``load_config()``, opens the page, switches to the reviews tab, and
    scrolls the reviews pane. Intercepted API responses are polled on every
    scroll; DOM cards are parsed during the last scrolls and in a final
    sweep. The merged reviews are written to
    ``google_reviews_parallel_hybrid.json`` and progress is printed.

    Returns:
        A list of review dicts, or ``[]`` if the reviews pane is not found.

    Raises:
        ValueError: If the configuration has no ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("PARALLEL HYBRID SCRAPER - Collecting API + DOM simultaneously...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    dom_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)
        _dismiss_cookie_banner(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(1.0)
        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for reviews to start loading
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Keep a page-side reference to the pane so each scroll is a single
        # script call.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)

        print("Parallel collection (API + DOM simultaneously)...")
        for i in range(_MAX_SCROLLS):
            driver.execute_script(scroll_script)
            time.sleep(0.27)  # Optimal scroll timing

            # API responses are collected on every scroll.
            _collect_api_reviews(interceptor, api_reviews)

            # DOM parsing is slower, so it only runs near the end, once the
            # API has delivered most of the reviews.
            if i >= _DOM_PARSE_START and len(api_reviews) >= _DOM_MIN_API_REVIEWS:
                _collect_dom_reviews(driver, dom_reviews)

            if (i + 1) % 10 == 0:
                print(f" API: {len(api_reviews)}, DOM: {len(dom_reviews)} unique keys...")

        print("Final collection sweep...")
        _collect_api_reviews(interceptor, api_reviews)
        _collect_dom_reviews(driver, dom_reviews)

        # Merge: Start with API reviews, add DOM reviews that aren't duplicates
        print("\nMerging API + DOM reviews...")
        api_count = len(api_reviews)
        all_reviews, dom_added = _merge_reviews(api_reviews, dom_reviews)
        elapsed = time.time() - start_time

        _print_summary(all_reviews, api_count, dom_added, elapsed)

        with open(_OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to {_OUTPUT_PATH}")

        if all_reviews:
            sample = all_reviews[0]
            # .get(): a DOM-sourced record is not guaranteed to carry every key.
            print(f"\nSample: {sample.get('author')} - {sample.get('rating')}")
        return all_reviews
    finally:
        try:
            driver.quit()
        except Exception:
            # Shutdown problems must not mask the scrape result or error.
            log.debug("driver.quit() failed", exc_info=True)
if __name__ == '__main__':
    # Script entry point: exit status 0 only when at least one review was
    # scraped; interruption or any error exits with status 1.
    exit_code = 1
    try:
        if parallel_hybrid_scrape():
            exit_code = 0
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as exc:
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
    sys.exit(exit_code)