Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
331
start_dom_only_fast.py
Normal file
331
start_dom_only_fast.py
Normal file
@@ -0,0 +1,331 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.
|
||||
|
||||
Strategy:
|
||||
1. Scroll to load all reviews
|
||||
2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
|
||||
3. Should be faster and simpler than API + DOM hybrid
|
||||
|
||||
Target: ~20-25 seconds for all 244 reviews with simpler code
|
||||
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
# Third-party loggers stay quiet at WARNING through the root configuration,
# while this module's own logger is opened up to INFO.
_LOG_FORMAT = '[%(levelname)s] %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.WARNING)

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def load_config():
    """Load the scraper settings from ``config.yaml``.

    The path is relative, so the file is looked up in the current working
    directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file contents
        (``None`` when the file is empty).

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    # Pin the encoding: without it the platform default is used, which can
    # mis-decode non-ASCII values in the config on non-UTF-8 locales.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
||||
|
||||
|
||||
def extract_all_reviews_js(driver):
    """Extract every loaded review from the page with one JavaScript call.

    The script walks all ``div.jftiEf.fontBodyMedium`` elements and reads the
    author, rating, text, date, avatar and reviewer-button attribute from each
    one. Elements without both an author and a date are dropped, and an
    element whose extraction throws is skipped.

    Args:
        driver: Object exposing ``execute_script(script)``; the script's
            return value is expected to be a list of dicts.

    Returns:
        A list of review dicts with the keys ``author``, ``rating``, ``text``,
        ``date_text``, ``avatar_url``, ``profile_url`` and ``review_id``.
        Returns an empty list when the script fails or yields nothing.
    """
    # Local import keeps this block self-contained.
    import hashlib

    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');

    for (let i = 0; i < elements.length; i++) {
        const elem = elements[i];
        // rating starts as null so the key is always present on the Python
        // side, even when the rating element or its aria-label is missing.
        const review = { rating: null };

        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;

            // Rating: first run of digits in the aria-label
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }

            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;

            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;

            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;

            // Profile URL
            // NOTE(review): this stores the button's data-review-id attribute
            // under the name profile_url - confirm this is the intended field.
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }

    return reviews;
    """

    try:
        # A driver that returns nothing is treated as "no reviews".
        reviews_data = driver.execute_script(extract_script) or []

        reviews = []
        for review_data in reviews_data:
            # The built-in hash() of a str is salted per process, so it gave a
            # different ID for the same review on every run. A digest is
            # reproducible. The separator keeps ("ab", "c") and ("a", "bc")
            # from producing the same identity string.
            identity = f"{review_data['author']}\x1f{review_data['date_text']}"
            digest = hashlib.sha256(identity.encode('utf-8')).hexdigest()[:16]
            review_data['review_id'] = f"review_{digest}"
            reviews.append(review_data)

        return reviews

    except Exception as e:
        # Best effort: report the failure and let the caller see "no reviews".
        print(f" Error in JavaScript extraction: {e}")
        return []
|
||||
|
||||
|
||||
def _click_first_match(driver, selectors, pause):
    """Click the first element matched by any selector; return True if one was clicked."""
    for selector in selectors:
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        if matches:
            matches[0].click()
            time.sleep(pause)
            return True
    return False


def _open_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews; return True if clicked."""
    review_keywords = ('reviews', 'review', 'reseñas', 'reseña')
    for selector in ('.LRkQ2', 'button[role="tab"]'):
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.3)
                    # Stop at the first hit so the next selector cannot click
                    # the same tab a second time.
                    return True
        except Exception as exc:
            # Best effort: a stale or missing tab must not abort the scrape.
            log.debug("Reviews tab lookup failed for %s: %s", selector, exc)
    return False


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane element, or None when neither selector matches."""
    wait = WebDriverWait(driver, 3)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
    except TimeoutException:
        pass
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
    except Exception as exc:
        log.debug("Fallback pane lookup failed: %s", exc)
        return None


def _count_review_elements(driver):
    """Return how many review elements are currently present in the DOM."""
    return driver.execute_script(
        "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
    )


def _scroll_until_idle(driver, pane, scroll_script, idle_limit=3):
    """Scroll the pane until *idle_limit* consecutive scrolls load no new reviews.

    Returns the review element count seen after the last scroll.
    """
    poll_interval = 0.05   # seconds between DOM checks
    max_polls = 20         # 20 * 0.05s = at most 1.0s of waiting per scroll
    settle_polls = 6       # 6 * 0.05s = 0.3s grace before checking the pane

    idle_scrolls = 0
    last_scroll_pos = 0
    scroll_number = 0
    count_after = 0

    while True:
        scroll_number += 1
        count_before = _count_review_elements(driver)
        driver.execute_script(scroll_script)

        # Poll instead of sleeping a fixed time: continue as soon as new
        # reviews appear, or once the pane has stopped moving.
        count_after = count_before
        for poll in range(1, max_polls + 1):
            time.sleep(poll_interval)
            count_after = _count_review_elements(driver)
            if count_after > count_before:
                break
            if poll >= settle_polls and count_after == count_before:
                scroll_pos = driver.execute_script("return arguments[0].scrollTop;", pane)
                pane_settled = scroll_pos == last_scroll_pos
                last_scroll_pos = scroll_pos
                if pane_settled:
                    break

        # Progress logging every 10 scrolls
        if scroll_number % 10 == 0:
            print(f" {count_after} review elements loaded...")

        # Each scroll counts as idle at most once, so idle_limit really means
        # "this many consecutive scrolls without new reviews".
        if count_after == count_before:
            idle_scrolls += 1
            if idle_scrolls >= idle_limit:
                print(f" Reached end at {count_after} reviews")
                break
        else:
            idle_scrolls = 0

    return count_after


def dom_only_fast_scrape(expected_total=244, baseline_seconds=155):
    """Scrape all reviews from the configured page using DOM-only extraction.

    Loads ``config.yaml``, opens the URL in a SeleniumBase driver, dismisses
    consent and cookie prompts, opens the reviews tab, scrolls the reviews
    pane until no more reviews load, extracts them with
    ``extract_all_reviews_js`` and writes them to
    ``google_reviews_dom_only_fast.json``.

    Args:
        expected_total: Review count used only for the completeness lines of
            the printed report. Pass ``None`` or ``0`` to skip them.
        baseline_seconds: Reference duration used only for the printed
            "Speedup" figure. Pass ``None`` or ``0`` to skip it.

    Returns:
        The list of extracted review dicts, or an empty list when the reviews
        pane cannot be found.

    Raises:
        ValueError: If the configuration has no non-empty ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail before launching a browser instead of crashing on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        # Handle the GDPR consent page: click "Aceptar todo" / "Accept all".
        if 'consent.google.com' in driver.current_url:
            try:
                _click_first_match(
                    driver,
                    ('button[aria-label*="Aceptar"]', 'button[aria-label*="Accept"]'),
                    pause=1.5,
                )
            except Exception as exc:
                log.debug("Consent page handling failed: %s", exc)

        # Dismiss cookie banner on Maps page
        try:
            _click_first_match(
                driver,
                ('button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',),
                pause=0.3,
            )
        except Exception as exc:
            log.debug("Cookie banner handling failed: %s", exc)

        # Click reviews tab
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(0.8)

        # Find pane
        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for initial reviews to load
        time.sleep(1.2)

        # Setup scroll: keep the pane on window so the scroll script can reach it.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll and verify reviews are loading
        driver.execute_script(scroll_script)
        time.sleep(0.8)

        initial_count = _count_review_elements(driver)
        if initial_count < 5:
            # Reviews not loaded yet, wait more
            print(f" Waiting for reviews to load (found {initial_count})...")
            time.sleep(1.5)
            driver.execute_script(scroll_script)
            time.sleep(0.8)
            initial_count = _count_review_elements(driver)

        print(f"Scrolling to load all reviews (starting with {initial_count})...")

        # No hard limit on scrolls - stops automatically via idle detection
        _scroll_until_idle(driver, pane, scroll_script)

        # Short final scroll to flush anything still pending
        for _ in range(2):
            driver.execute_script(scroll_script)
            time.sleep(0.3)

        # Measured from start_time, so this includes navigation and setup.
        scroll_time = time.time() - start_time
        print(f" Scrolling complete in {scroll_time:.2f}s")

        # Extract ALL reviews using JavaScript
        print("Extracting reviews with JavaScript...")
        extract_start = time.time()

        all_reviews = extract_all_reviews_js(driver)

        extract_time = time.time() - extract_start
        print(f" Extraction complete in {extract_time:.2f}s")

        elapsed = time.time() - start_time
        found = len(all_reviews)

        print(f"\n{'='*50}")
        print(f"✅ COMPLETED!")
        if expected_total:
            print(f"Reviews: {found}/{expected_total} ({found/expected_total*100:.1f}%)")
        else:
            print(f"Reviews: {found}")
        print(f"Time: {elapsed:.2f}s")
        print(f" - Scrolling: {scroll_time:.2f}s")
        print(f" - Extraction: {extract_time:.2f}s")
        print(f"Speed: {found/elapsed:.1f} reviews/sec")
        if baseline_seconds:
            print(f"Speedup: {baseline_seconds/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if expected_total:
            if found >= expected_total:
                print(f"🎯 Got ALL {expected_total} reviews!")
            elif found >= expected_total - 4:
                print(f"⚠️ Missing {expected_total-found} reviews")

        print()

        # Save
        with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to google_reviews_dom_only_fast.json")

        if all_reviews:
            # .get(): a review without a rating element may lack the key.
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0].get('rating')}★")

        return all_reviews

    finally:
        # Always release the browser; a failure to quit must not mask the
        # result or an in-flight exception.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Exit status: 0 only when at least one review was scraped; 1 for an
    # empty result, a user interrupt, or any unexpected error.
    exit_status = 1
    try:
        reviews = dom_only_fast_scrape()
        if reviews:
            exit_status = 0
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
    sys.exit(exit_status)
|
||||
Reference in New Issue
Block a user