Files
whyrating-engine-legacy/start_dom_only_fast.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

332 lines
11 KiB
Python

#!/usr/bin/env python3
"""
DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.
Strategy:
1. Scroll to load all reviews
2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
3. Should be faster and simpler than API + DOM hybrid
Target: ~20-25 seconds for all 244 reviews with simpler code
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# Root logging is configured at WARNING with a "[LEVEL] message" format, while
# this module's own logger is set to emit INFO and above.
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.WARNING,
)
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
def load_config():
    """Load scraper settings from ``config.yaml`` in the current working directory.

    Returns:
        The parsed YAML document. An empty file (which ``yaml.safe_load``
        parses as ``None``) is returned as an empty dict so callers can
        always use ``.get(...)`` on the result.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII values in the config read the same everywhere.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def extract_all_reviews_js(driver):
    """Extract every review currently in the DOM with a single JavaScript call.

    The script walks all ``div.jftiEf.fontBodyMedium`` elements and collects
    author, rating, text, date text, avatar URL and the ``data-review-id``
    attribute of the ``button.WEBjve`` element. Reviews without both an author
    and a date text are dropped by the script.

    Args:
        driver: A WebDriver-like object exposing ``execute_script(script)``.

    Returns:
        A list of review dicts. Each dict always carries a ``rating`` key
        (``None`` when no rating could be read) and a ``review_id`` of the
        form ``review_<16 hex chars>`` derived from author + date text, so the
        same review gets the same ID on every run. Returns ``[]`` if the
        script fails or yields nothing.
    """
    # Function-scope import keeps this block self-contained.
    import hashlib

    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
    for (let i = 0; i < elements.length; i++) {
        const elem = elements[i];
        const review = {};
        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;
            // Rating (always defined so consumers can rely on the key)
            review.rating = null;
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }
            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;
            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;
            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;
            // Profile URL
            // NOTE(review): this stores the button's data-review-id attribute
            // under the name profile_url -- confirm that is intended.
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }
    return reviews;
    """
    try:
        # execute_script may hand back None; treat that as "no reviews".
        reviews_data = driver.execute_script(extract_script) or []
        reviews = []
        for review_data in reviews_data:
            identity = (review_data.get('author') or '') + (review_data.get('date_text') or '')
            # The builtin hash() of a str is salted per process, so it cannot
            # serve as a persistent ID; a content digest is stable across runs.
            # Two reviews with identical author and date text still share an ID.
            digest = hashlib.sha256(identity.encode('utf-8')).hexdigest()[:16]
            review_data['review_id'] = f"review_{digest}"
            review_data.setdefault('rating', None)
            reviews.append(review_data)
        return reviews
    except Exception as e:
        print(f" Error in JavaScript extraction: {e}")
        return []
def _count_review_elements(driver):
    """Return the number of review cards currently present in the DOM."""
    return driver.execute_script(
        "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
    )


def _dismiss_consent_dialogs(driver):
    """Best-effort click through the consent redirect and the Maps cookie banner."""
    # Handle GDPR consent page
    if 'consent.google.com' in driver.current_url:
        try:
            # Click "Accept all" / "Aceptar todo"
            consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]')
            if not consent_btns:
                consent_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept"]')
            if consent_btns:
                consent_btns[0].click()
                time.sleep(1.5)
        except Exception as exc:
            # Deliberately non-fatal, but no longer swallows Ctrl-C.
            log.debug("Consent page handling failed: %s", exc)
    # Dismiss cookie banner on Maps page
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.3)
    except Exception as exc:
        log.debug("Cookie banner handling failed: %s", exc)


def _open_reviews_tab(driver):
    """Click the first tab per selector whose text or aria-label mentions reviews."""
    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, selector)
            for tab in tabs:
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.3)
                    # Leaves only the inner loop: the next selector is still
                    # tried and may click a matching tab again.
                    break
        except Exception as exc:
            log.debug("Reviews tab lookup failed for %r: %s", selector, exc)
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None if neither selector matches."""
    wait = WebDriverWait(driver, 3)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
    except TimeoutException:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
        except Exception as exc:
            log.debug("Fallback pane lookup failed: %s", exc)
            return None


def _scroll_until_idle(driver, pane, scroll_script):
    """Scroll the pane repeatedly until the review count stops growing."""
    # No hard limit - stops automatically via idle detection
    max_scrolls = 999999
    idle_count = 0
    last_scroll_pos = 0
    for i in range(max_scrolls):
        prev_count = _count_review_elements(driver)
        driver.execute_script(scroll_script)
        # Poll for new review elements instead of sleeping a fixed delay.
        max_wait = 1.0   # Maximum 1 second
        wait_step = 0.05  # Check every 50ms
        waited = 0
        while waited < max_wait:
            time.sleep(wait_step)
            waited += wait_step
            new_count = _count_review_elements(driver)
            # New reviews appeared: move on to the next scroll immediately.
            if new_count > prev_count:
                break
            # Nothing new after 0.3s: check whether the pane actually moved.
            if waited >= 0.3 and new_count == prev_count:
                scroll_pos = driver.execute_script("return arguments[0].scrollTop;", pane)
                if scroll_pos == last_scroll_pos:
                    idle_count += 1
                    if idle_count >= 3:
                        print(f" Reached end at {new_count} reviews")
                        break
                last_scroll_pos = scroll_pos
                break
        current_count = new_count
        # Progress logging every 10 scrolls
        if (i + 1) % 10 == 0:
            print(f" {current_count} review elements loaded...")
        # NOTE(review): idle_count can be incremented both inside the polling
        # loop above and here within the same scroll, so the threshold of 3 can
        # be reached after two idle scrolls -- confirm this is intended.
        if current_count == prev_count:
            idle_count += 1
            if idle_count >= 3:
                break
        else:
            idle_count = 0


def dom_only_fast_scrape():
    """Scrape all reviews for the configured URL using DOM scrolling plus one JS extraction.

    Flow: load the config, open the URL in a SeleniumBase driver, dismiss
    consent/cookie dialogs, open the reviews tab, locate the scrollable pane,
    scroll until no new review elements appear, extract everything through
    ``extract_all_reviews_js``, print a timing summary and write the result to
    ``google_reviews_dom_only_fast.json``. The driver is always quit on exit.

    Returns:
        The list of extracted review dicts, or ``[]`` when the reviews pane
        cannot be found.

    Raises:
        ValueError: If the configuration does not define a non-empty ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    # NOTE(review): these figures look tied to one specific listing and one
    # earlier baseline run -- confirm before reusing for other URLs.
    expected_total = 244
    near_complete_threshold = 240
    baseline_seconds = 155

    print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_consent_dialogs(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(0.8)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for initial reviews to load
        time.sleep(1.2)

        # Keep the pane on window so the scroll script can reach it without arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger an initial scroll and verify reviews are loading
        driver.execute_script(scroll_script)
        time.sleep(0.8)
        initial_count = _count_review_elements(driver)
        if initial_count < 5:
            # Reviews not loaded yet, wait more
            print(f" Waiting for reviews to load (found {initial_count})...")
            time.sleep(1.5)
            driver.execute_script(scroll_script)
            time.sleep(0.8)
            initial_count = _count_review_elements(driver)
        print(f"Scrolling to load all reviews (starting with {initial_count})...")

        _scroll_until_idle(driver, pane, scroll_script)

        # Two short extra scrolls after the idle cut-off
        for _ in range(2):
            driver.execute_script(scroll_script)
            time.sleep(0.3)
        scroll_time = time.time() - start_time
        print(f" Scrolling complete in {scroll_time:.2f}s")

        # Extract ALL reviews using JavaScript
        print("Extracting reviews with JavaScript...")
        extract_start = time.time()
        all_reviews = extract_all_reviews_js(driver)
        extract_time = time.time() - extract_start
        print(f" Extraction complete in {extract_time:.2f}s")

        elapsed = time.time() - start_time
        total = len(all_reviews)
        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {total}/{expected_total} ({total/expected_total*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f" - Scrolling: {scroll_time:.2f}s")
        print(f" - Extraction: {extract_time:.2f}s")
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        print(f"Speedup: {baseline_seconds/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if total >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        elif total >= near_complete_threshold:
            print(f"⚠️ Missing {expected_total-total} reviews")
        print()

        # Save
        with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print("💾 Saved to google_reviews_dom_only_fast.json")
        if all_reviews:
            # .get: a review without a readable rating must not crash the
            # summary after the results were already saved.
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0].get('rating')}")
        return all_reviews
    finally:
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)
if __name__ == '__main__':
    # Exit status: 0 only when at least one review was scraped; 1 on an empty
    # result, on Ctrl-C, or on any other error.
    exit_code = 1
    try:
        reviews = dom_only_fast_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as exc:
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
    else:
        if reviews:
            exit_code = 0
    sys.exit(exit_code)