Files
whyrating-engine-legacy/start_fastest_stable.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

308 lines
11 KiB
Python

#!/usr/bin/env python3
"""
FASTEST STABLE Scraper - Best of both worlds.
Strategy:
1. Ultra-fast API scrolling (proven stable) → 234 reviews in ~19s
2. Instant JavaScript DOM extraction → 10 missing reviews in ~0.5s
3. Total: ~20 seconds for all 244 reviews with 100% stability
Combines stability of API approach with speed of JavaScript extraction.
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Configure the root logger at WARNING with a compact "[LEVEL] message" format
# (presumably to keep third-party library output quiet - confirm intent).
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
# Module-level logger for this script; lowered to INFO so its own
# informational messages are still emitted despite the WARNING root level.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load scraper settings from ``config.yaml`` in the working directory.

    The file is read as UTF-8 (the encoding YAML documents are expected to
    use) rather than the platform default, so non-ASCII values such as
    localized URLs parse the same on every OS.

    Returns:
        The parsed YAML content. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``.get(...)`` unconditionally.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalize to a dict.
        return yaml.safe_load(f) or {}
def extract_missing_reviews_js(driver, max_reviews=25):
    """Extract reviews currently rendered in the DOM with one JavaScript call.

    The script reads up to ``max_reviews`` elements matching
    ``div.jftiEf.fontBodyMedium`` and returns author, rating, text, date,
    avatar URL and the ``data-review-id`` attribute of the profile button
    (stored under ``profile_url``). Only entries that have both an author
    and a date are kept.

    Each kept review gets a synthetic ``review_id`` of the form
    ``dom_<16 hex chars>`` derived from author + date. A SHA-256 digest is
    used instead of the built-in ``hash()`` because string hashes are salted
    per interpreter process, which made the IDs differ between runs.

    Args:
        driver: Object exposing ``execute_script(script, *args)``
            (a Selenium/SeleniumBase driver).
        max_reviews: Upper bound on the number of DOM elements inspected.

    Returns:
        list[dict]: Review dicts with an added ``review_id`` key. Returns an
        empty list if the script fails or yields nothing; this function is
        best-effort and does not raise on driver errors.
    """
    # Local import keeps this block self-contained; hashlib is stdlib.
    import hashlib

    # NOTE: this is a regular (non-raw) Python string, so "\\d" reaches the
    # browser as the regex token "\d".
    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
    const maxCount = Math.min(arguments[0], elements.length);
    for (let i = 0; i < maxCount; i++) {
        const elem = elements[i];
        // rating defaults to null so every returned object has the same keys
        const review = {rating: null};
        try {
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip
        }
    }
    return reviews;
    """
    try:
        reviews_data = driver.execute_script(extract_script, max_reviews)
    except Exception as exc:
        # Best-effort: a failed DOM pass must not abort the whole scrape,
        # but the failure should be visible instead of silently swallowed.
        logging.getLogger(__name__).warning("DOM review extraction failed: %s", exc)
        return []

    if not isinstance(reviews_data, list):
        return []

    reviews = []
    for review_data in reviews_data:
        if not isinstance(review_data, dict):
            continue
        author = review_data.get('author')
        date_text = review_data.get('date_text')
        if not author or not date_text:
            # Skip only the malformed entry rather than discarding the batch.
            continue
        digest = hashlib.sha256(f"{author}|{date_text}".encode('utf-8')).hexdigest()[:16]
        review_data['review_id'] = f"dom_{digest}"
        reviews.append(review_data)
    return reviews
def _review_to_dict(review):
    """Flatten an interceptor review object into a JSON-serializable dict."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_api_reviews(interceptor, api_reviews):
    """Merge newly intercepted API reviews into ``api_reviews`` (keyed by review_id)."""
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = _review_to_dict(review)
    except Exception as exc:
        # Best-effort: one bad response must not stop the scroll loop.
        # `except Exception` (not bare) lets Ctrl-C propagate.
        log.debug("API review collection failed: %s", exc)


def _dismiss_cookie_banner(driver):
    """Click the first cookie "Accept"/"Aceptar" button if one is present."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception as exc:
        log.debug("Cookie banner dismissal skipped: %s", exc)


def _open_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews; return True if clicked."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    # Stop at the first hit; trying the next selector too
                    # could click the tab a second time.
                    return True
        except Exception as exc:
            log.debug("Reviews tab lookup failed for %r: %s", selector, exc)
            continue
    return False


def _find_reviews_pane(driver, timeout=3):
    """Return the scrollable reviews pane element, or None if neither known selector matches."""
    wait = WebDriverWait(driver, timeout)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
    except TimeoutException:
        pass
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
    except Exception as exc:
        log.debug("Fallback pane selector failed: %s", exc)
        return None


def _dedupe_key(review):
    """Build the (author, first 20 chars of date) key used to match DOM reviews to API reviews."""
    return (review.get('author', ''), (review.get('date_text', '') or '')[:20])


def fastest_stable_scrape():
    """Scrape reviews via API interception while scrolling, then fill gaps from the DOM.

    Steps:
      1. Open the URL from ``config.yaml``, dismiss the cookie banner, open
         the reviews tab and locate the scrollable pane.
      2. Phase 1: scroll the pane repeatedly, merging reviews parsed from
         intercepted API responses until the target count or scroll limit.
      3. Phase 2: if fewer than the expected total were collected, scroll to
         the top and extract rendered reviews with JavaScript, adding those
         whose (author, date) key is not already known.
      4. Print a summary and write all reviews to
         ``google_reviews_fastest_stable.json``.

    Returns:
        list[dict]: All collected reviews. An empty list if the reviews pane
        could not be found.

    Raises:
        ValueError: If the configuration has no ``url``.
    """
    # Figures tuned for the one listing this script was written against.
    expected_total = 244      # reviews the listing is known to have
    target_reviews = 240      # stop Phase 1 early once this many are collected
    max_scrolls = 35
    baseline_seconds = 155    # reference runtime used for the "Speedup" line
    output_path = 'google_reviews_fastest_stable.json'

    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("FASTEST STABLE SCRAPER - Ultra-fast API + instant JS...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookie_banner(driver)
        _open_reviews_tab(driver)

        # Wait for stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for initial reviews to load (critical for stability)
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Keep the pane on window so later scripts can scroll it without
        # passing the element again.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("[Phase 1] Ultra-fast API scrolling...")
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            time.sleep(0.27)  # Optimal timing
            _collect_api_reviews(interceptor, api_reviews)
            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")
            if len(api_reviews) >= target_reviews:
                break

        # Final API collection picks up responses that arrived after the last scroll.
        _collect_api_reviews(interceptor, api_reviews)
        api_time = time.time() - start_time
        print(f" ✅ Phase 1: {len(api_reviews)} reviews in {api_time:.2f}s")

        # [Phase 2] Instant JavaScript extraction for missing reviews
        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\n[Phase 2] Fast JS extraction for {missing} missing reviews...")
            # Scroll to top (missing reviews likely at top)
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);")
            time.sleep(0.3)
            dom_reviews = extract_missing_reviews_js(
                driver, max_reviews=min(missing + 10, 25))

            # DOM entries have no API id, so match on author + date instead.
            seen_keys = {_dedupe_key(r) for r in api_reviews.values()}
            dom_added = 0
            for dom_review in dom_reviews:
                dom_key = _dedupe_key(dom_review)
                if dom_key in seen_keys:
                    continue
                # Record the key so duplicate DOM rows are counted once.
                seen_keys.add(dom_key)
                api_reviews[dom_review['review_id']] = dom_review
                dom_added += 1
            dom_time = time.time() - start_time - api_time
            print(f" ✅ Phase 2: +{dom_added} reviews in {dom_time:.2f}s")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        total = len(all_reviews)
        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {total}/{expected_total} ({total/expected_total*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        print(f"Speedup: {baseline_seconds/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if total >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif total >= target_reviews:
            print(f"⚠️ Missing {expected_total-total} reviews")
        print()

        # Save
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to {output_path}")
        if all_reviews:
            sample = all_reviews[0]
            # .get: a DOM-sourced review may lack some keys.
            print(f"\nSample: {sample.get('author')} - {sample.get('rating')}")
        return all_reviews
    finally:
        # Always release the browser, even on error or Ctrl-C.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)
if __name__ == '__main__':
    # Script entry point: exit status 0 only when at least one review was
    # scraped; interruption or any error exits with status 1.
    import traceback

    exit_code = 1
    try:
        reviews = fastest_stable_scrape()
        if reviews:
            exit_code = 0
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as e:
        print(f"ERROR: {e}")
        traceback.print_exc()
    sys.exit(exit_code)