Files
whyrating-engine-legacy/start_fast.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

347 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Fast API-First Scraper - Optimized version of start.py
Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Scroll rapidly JUST to trigger API calls (~15 seconds)
3. Collect all API responses during scrolling
4. Parse reviews from API responses
5. Skip DOM parsing entirely
6. Exit immediately
Expected time: ~30-40 seconds for 244 reviews (vs 155 seconds)
Speed improvement: ~4-5x faster!
"""
import sys
import yaml
import logging
import time
import json
from pathlib import Path
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
def load_config():
    """Load the scraper configuration from ``config.yaml``.

    The file is looked up relative to the current working directory and is
    decoded as UTF-8 explicitly, so non-ASCII values (for example URLs that
    contain accented characters) are read the same way on every platform
    instead of depending on the locale's default encoding.

    Returns:
        The parsed YAML document. An empty file (for which ``safe_load``
        yields ``None``) is returned as an empty dict so callers can use
        ``.get`` unconditionally.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
# Lower-case substrings that identify the reviews tab in several UI languages.
_REVIEW_TAB_KEYWORDS = (
    'reviews', 'review', 'reseñas', 'reseña', 'opiniones', 'avis',
    'bewertungen', 'recensioni', 'avaliações', 'ביקורות',
)

# CSS selectors tried, in order, when looking for the reviews tab.
_TAB_SELECTORS = (
    '.LRkQ2',  # Primary
    '.hh2c6',  # Alternative
    '[data-tab-index="1"]',  # Tab index
    'button[role="tab"]',  # Button tabs
    'div[role="tab"]',  # Div tabs
)

# CSS selectors tried, most specific first, when looking for the reviews pane.
_PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Primary
    'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Without role="main"
    'div.m6QErb.WNBkOb.XiKgde',  # Alternative class combination
    'div[role="main"] div.m6QErb.XiKgde',  # Simplified with XiKgde
    'div.m6QErb.DxyBCb.XiKgde',  # Another variant
    'div[role="main"] div.m6QErb',  # Simplified version
    'div.m6QErb.DxyBCb',  # Even more simplified
    'div[role="main"]',  # Most generic
)


def _dismiss_cookie_dialog(driver):
    """Click the first cookie-consent button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
        )
        if cookie_btns:
            cookie_btns[0].click()
            log.info("✓ Cookie dialog dismissed")
            time.sleep(1)
    except Exception as e:
        # Deliberately non-fatal: the dialog is optional. ``Exception`` (not a
        # bare ``except``) keeps Ctrl+C working.
        log.debug(f"Cookie dialog not dismissed: {e}")


def _click_reviews_tab(driver):
    """Locate the reviews tab and click it; return True if a tab was clicked."""
    for selector in _TAB_SELECTORS:
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, selector)
        except Exception as e:
            log.debug(f"Tab lookup failed for selector {selector}: {e}")
            continue
        for tab in tabs:
            try:
                # A tab qualifies when its visible text or aria-label contains
                # one of the known "reviews" keywords.
                text = (tab.text or '').lower()
                aria_label = (tab.get_attribute('aria-label') or '').lower()
                if not any(
                    keyword in text or keyword in aria_label
                    for keyword in _REVIEW_TAB_KEYWORDS
                ):
                    continue
                log.info(f"Found reviews tab with selector {selector}: '{tab.text}'")
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", tab)
                time.sleep(0.5)
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", tab)
                time.sleep(1.5)
                log.info("✓ Reviews tab clicked")
                return True
            except Exception as e:
                # A stale or non-interactable candidate: try the next one.
                log.debug(f"Skipping tab candidate for selector {selector}: {e}")
                continue
    return False


def _find_reviews_pane(driver):
    """Return the first element matching a pane selector, or None if none match."""
    for selector in _PANE_SELECTORS:
        try:
            log.info(f"Trying selector: {selector}")
            wait = WebDriverWait(driver, 5)
            pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
            log.info(f"✓ Found reviews pane with: {selector}")
            return pane
        except TimeoutException:
            log.debug(f"Pane not found with selector: {selector}")
            continue
    return None


def _review_to_dict(review):
    """Convert a parsed review object into a JSON-serialisable dict."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def fast_scrape(*, target_reviews=240, max_scrolls=30):
    """Scrape reviews by scrolling the reviews pane and reading API responses.

    Opens the URL from ``config.yaml`` in a SeleniumBase UC-mode driver,
    dismisses the cookie dialog, opens the reviews tab, locates the scrollable
    pane, installs a ``GoogleMapsAPIInterceptor`` and then scrolls repeatedly.
    After each scroll the interceptor's captured responses are parsed and
    de-duplicated by ``review_id``. The collected reviews are written to
    ``google_reviews_fast.json`` in the current working directory.

    Args:
        target_reviews: Stop scrolling as soon as at least this many unique
            reviews have been collected.
        max_scrolls: Upper bound on the number of scrolls in the collection
            loop (the initial "trigger" scroll is not counted).

    Returns:
        A list of review dicts (keys: ``review_id``, ``author``, ``rating``,
        ``text``, ``date_text``, ``avatar_url``, ``profile_url``). An empty
        list is returned when the reviews pane cannot be found; in that case a
        screenshot is saved to ``pane_not_found.png`` if possible.

    Raises:
        ValueError: If the configuration has no ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on ``url[:80]``,
        # and before a browser is launched.
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    log.info("="*60)
    log.info("FAST API-FIRST SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: API-first (skip DOM parsing)")
    log.info("="*60 + "\n")

    start_time = time.time()
    api_reviews = {}  # review_id -> review dict; insertion order is kept

    # Create driver using SeleniumBase UC Mode (like original scraper)
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate to the page
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)
        _dismiss_cookie_dialog(driver)

        # Step 2: Open the reviews tab
        log.info("Step 2: Opening reviews tab...")
        if not _click_reviews_tab(driver):
            log.warning("Could not find/click reviews tab - may already be on reviews or page structure changed")

        # Give the reviews view time to load after the click.
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Step 3: Find the scrollable reviews pane
        log.info("Step 3: Finding reviews pane...")
        log.info(f"Current URL: {driver.current_url}")
        pane = _find_reviews_pane(driver)
        if pane is None:
            log.error("Could not find reviews pane after all attempts!")
            log.error(f"Final URL: {driver.current_url}")
            # Save screenshot for debugging (best effort)
            try:
                screenshot_path = 'pane_not_found.png'
                driver.save_screenshot(screenshot_path)
                log.info(f"Screenshot saved to {screenshot_path}")
            except Exception as e:
                log.debug(f"Could not save screenshot: {e}")
            return []

        # Wait for initial reviews to load
        log.info("Waiting for initial reviews to render...")
        time.sleep(3)

        # Informational only: report how many review cards are in the DOM.
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf')
            log.info(f"Found {len(cards)} initial review cards")
        except Exception:
            log.warning("Could not find initial review cards")

        # Step 4: Setup API interceptor (AFTER finding pane)
        log.info("Step 4: Setting up API interception...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        try:
            interceptor.setup_interception()
            interceptor.inject_response_interceptor()
            log.info("✓ API interceptor ready - capturing network responses")
        except Exception as e:
            # Non-fatal: scrolling continues; the traceback goes to the log.
            log.warning(f"Failed to setup interceptor: {e}", exc_info=True)
        time.sleep(2)  # Extra wait for interception to be fully active
        log.info("")

        # Step 5: Rapid scrolling to trigger API calls
        log.info("="*60)
        log.info("Step 5: Rapid scrolling to trigger API calls")
        log.info("="*60)

        # Stash the pane on ``window`` so each scroll is one short script.
        try:
            driver.execute_script("window.scrollablePane = arguments[0];", pane)
            scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
            log.info("✓ Scroll script setup complete")
        except Exception as e:
            log.warning(f"Error setting up scroll script: {e}")
            scroll_script = "window.scrollBy(0, 300);"  # Fallback

        # Report the page-side interceptor state (diagnostic only).
        try:
            is_injected = driver.execute_script("return window.__reviewInterceptorInjected === true;")
            stats = driver.execute_script("return window.__interceptorStats;")
            queue_length = driver.execute_script("return window.__interceptedResponses ? window.__interceptedResponses.length : -1;")
            log.info(f"Interceptor status: injected={is_injected}, queue={queue_length}, stats={stats}")
        except Exception as e:
            log.warning(f"Could not check interceptor status: {e}")

        # Trigger initial API call
        log.info("Triggering initial API call...")
        driver.execute_script(scroll_script)
        time.sleep(2)  # Wait for first API response
        log.info("")

        # Original author's estimate: about 25 API calls for 244 reviews
        # (10 per call). Scroll rapidly - no DOM parsing!
        debug_scroll_index = 5  # emit extra diagnostics once, on this iteration
        scrolls_performed = 0
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            scrolls_performed = i + 1
            time.sleep(0.3)  # short pause between scrolls (value kept from the original)

            try:
                responses = interceptor.get_intercepted_responses()

                if i == debug_scroll_index:
                    log.info(f"DEBUG: Got {len(responses)} responses from interceptor")
                    # Check browser console for interceptor messages
                    try:
                        console_logs = driver.get_log('browser')
                        interceptor_logs = [
                            entry for entry in console_logs
                            if 'API Interceptor' in entry.get('message', '')
                        ]
                        if interceptor_logs:
                            log.info("DEBUG: Interceptor console logs:")
                            for entry in interceptor_logs[-10:]:  # Last 10
                                log.info(f"  {entry['message']}")
                        else:
                            log.info("DEBUG: No interceptor logs in console")
                    except Exception as e:
                        log.warning(f"Could not get console logs: {e}")

                if responses:
                    parsed = interceptor.parse_reviews_from_responses(responses)
                    if i == debug_scroll_index:
                        log.info(f"DEBUG: Parsed {len(parsed)} reviews from responses")

                    # Keep the first occurrence of every review id.
                    for review in parsed:
                        if review.review_id and review.review_id not in api_reviews:
                            api_reviews[review.review_id] = _review_to_dict(review)

                    if parsed:
                        log.info(f"Scroll {i+1}: +{len(parsed)} reviews | Total: {len(api_reviews)}")

                    # Exit early if we have enough
                    if len(api_reviews) >= target_reviews:
                        log.info(f"\n✓ Reached target of {target_reviews} reviews!")
                        break
            except Exception as e:
                # One failed collection must not abort the remaining scrolls.
                log.error(f"Error collecting API responses: {e}", exc_info=True)

            # Quick progress update every fifth scroll
            if scrolls_performed % 5 == 0:
                log.info(f"Progress: {i+1}/{max_scrolls} scrolls, {len(api_reviews)} reviews collected")

        elapsed = time.time() - start_time

        # Convert to list
        all_reviews = list(api_reviews.values())

        log.info("\n" + "="*60)
        log.info("✅ FAST SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"Scrolls performed: {scrolls_performed}")
        log.info(f"Time elapsed: {elapsed:.2f} seconds")
        if all_reviews and elapsed > 0:
            log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save results
        output_file = 'google_reviews_fast.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        log.info(f"💾 Saved {len(all_reviews)} reviews to {output_file}")

        # Show sample
        if all_reviews:
            log.info("\n📝 Sample review:")
            sample = all_reviews[0]
            log.info(f"   Author: {sample['author']}")
            log.info(f"   Rating: {sample['rating']}")
            log.info(f"   Date: {sample['date_text']}")
            if sample['text']:
                log.info(f"   Text: {sample['text'][:80]}...")

        # Stats comparison against the older approach's reference timing
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old approach: ~155 seconds for 244 reviews")
        log.info(f"Fast approach: ~{elapsed:.0f} seconds for {len(all_reviews)} reviews")
        if elapsed > 0:
            log.info(f"Improvement: {155/elapsed:.1f}x faster! 🚀")
        log.info("="*60 + "\n")

        return all_reviews
    finally:
        # Always close the driver; a failure to quit must not mask the result
        # or an in-flight exception.
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"Error while closing driver: {e}")
if __name__ == '__main__':
    # Script entry point: exit status 0 only when at least one review was
    # scraped; 1 on an empty result, on Ctrl+C, or on any unexpected error.
    exit_code = 1
    try:
        if fast_scrape():
            exit_code = 0
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
    except Exception as err:
        import traceback

        log.error(f"Fatal error: {err}")
        traceback.print_exc()
    sys.exit(exit_code)