Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
346
start_fast.py
Normal file
346
start_fast.py
Normal file
@@ -0,0 +1,346 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
Fast API-First Scraper - Optimized version of start.py

Strategy:
1. Open browser and navigate to reviews (~15 seconds)
2. Scroll rapidly JUST to trigger API calls (~15 seconds)
3. Collect all API responses during scrolling
4. Parse reviews from API responses
5. Skip DOM parsing entirely
6. Exit immediately

Expected time: ~30-40 seconds for 244 reviews (vs 155 seconds)
Speed improvement: ~4-5x faster!
"""
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
# Root logging: INFO and above, rendered as "[LEVEL] message".
logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.INFO,
)

# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_config():
    """Load the scraper configuration from ``config.yaml``.

    The file is looked up relative to the current working directory and
    parsed with ``yaml.safe_load``.

    Returns:
        The parsed YAML document. An empty file yields ``{}`` instead of
        ``None`` so callers can use ``.get()`` on the result directly.

    Raises:
        FileNotFoundError: If ``config.yaml`` does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit UTF-8: the platform default encoding may differ (e.g. on Windows).
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalise to a dict.
        return yaml.safe_load(f) or {}
|
||||
|
||||
|
||||
def fast_scrape(*, target_reviews: int = 240, max_scrolls: int = 30):
    """Scrape Google Maps reviews by intercepting API responses while scrolling.

    Flow:
      1. Load ``config.yaml`` (keys read here: ``url``, ``headless``).
      2. Open the URL in a SeleniumBase UC-mode driver, dismiss the cookie
         dialog if present, and click the reviews tab.
      3. Locate the scrollable reviews pane using a list of fallback selectors.
      4. Install ``GoogleMapsAPIInterceptor`` and scroll the pane repeatedly,
         parsing reviews out of the intercepted responses (no DOM parsing).
      5. Write the de-duplicated reviews to ``google_reviews_fast.json``.

    Args:
        target_reviews: Stop scrolling as soon as this many unique reviews
            have been collected.
        max_scrolls: Upper bound on the number of scroll iterations.

    Returns:
        A list of review dicts with the keys ``review_id``, ``author``,
        ``rating``, ``text``, ``date_text``, ``avatar_url`` and
        ``profile_url``. An empty list is returned when no ``url`` is
        configured or when the reviews pane cannot be found (in the latter
        case a ``pane_not_found.png`` screenshot is attempted).

    Side effects:
        Starts and always quits a browser, and writes
        ``google_reviews_fast.json`` in the current working directory.
    """
    # load_config may return None for an empty YAML file; guard against that.
    config = load_config() or {}
    url = config.get('url')
    headless = config.get('headless', False)

    # Fail with a clear message instead of a TypeError on url[:80] below.
    if not url:
        log.error("No 'url' configured in config.yaml - nothing to scrape")
        return []

    log.info("="*60)
    log.info("FAST API-FIRST SCRAPER")
    log.info("="*60)
    log.info(f"URL: {url[:80]}...")
    log.info("Mode: API-first (skip DOM parsing)")
    log.info("="*60 + "\n")

    start_time = time.time()
    api_reviews = {}  # review_id -> review dict; the dict gives de-duplication

    # Create driver using SeleniumBase UC Mode (like original scraper)
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate to reviews
        log.info("Step 1: Opening Google Maps...")
        driver.get(url)
        time.sleep(2)

        # Dismiss cookies (best effort: the dialog is not always shown)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
            )
            if cookie_btns:
                cookie_btns[0].click()
                log.info("✓ Cookie dialog dismissed")
                time.sleep(1)
        except Exception as e:
            # Exception (not bare except) so Ctrl-C still aborts the run.
            log.debug(f"Cookie dialog not dismissed: {e}")

        # Click reviews tab - comprehensive approach
        log.info("Step 2: Opening reviews tab...")

        # Review keywords for multiple languages
        review_keywords = [
            'reviews', 'review', 'reseñas', 'reseña', 'opiniones', 'avis',
            'bewertungen', 'recensioni', 'avaliações', 'ביקורות',
        ]

        clicked = False
        tab_selectors = [
            '.LRkQ2',                # Primary
            '.hh2c6',                # Alternative
            '[data-tab-index="1"]',  # Tab index
            'button[role="tab"]',    # Button tabs
            'div[role="tab"]',       # Div tabs
        ]

        # Try each selector until one yields a tab whose text/aria-label
        # contains a review keyword.
        for selector in tab_selectors:
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                for tab in tabs:
                    try:
                        # Check if this is the reviews tab
                        text = (tab.text or '').lower()
                        aria_label = (tab.get_attribute('aria-label') or '').lower()

                        if any(keyword in text or keyword in aria_label
                               for keyword in review_keywords):
                            log.info(f"Found reviews tab with selector {selector}: '{tab.text}'")
                            # Scroll into view
                            driver.execute_script(
                                "arguments[0].scrollIntoView({block:'center'});", tab)
                            time.sleep(0.5)
                            # Click with JavaScript (most reliable)
                            driver.execute_script("arguments[0].click();", tab)
                            time.sleep(1.5)
                            log.info("✓ Reviews tab clicked")
                            clicked = True
                            break
                    except Exception:
                        # This candidate failed; move on to the next one.
                        continue
                if clicked:
                    break
            except Exception:
                continue

        if not clicked:
            log.warning("Could not find/click reviews tab - may already be on reviews or page structure changed")

        # CRITICAL: Wait after clicking reviews tab for page to load
        log.info("Waiting for reviews page to fully load...")
        time.sleep(3)

        # Find reviews pane
        log.info("Step 3: Finding reviews pane...")
        log.info(f"Current URL: {driver.current_url}")

        pane = None
        # Ordered from most specific to most generic.
        pane_selectors = [
            'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Primary
            'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',  # Without role="main"
            'div.m6QErb.WNBkOb.XiKgde',  # Alternative class combination
            'div[role="main"] div.m6QErb.XiKgde',  # Simplified with XiKgde
            'div.m6QErb.DxyBCb.XiKgde',  # Another variant
            'div[role="main"] div.m6QErb',  # Simplified version
            'div.m6QErb.DxyBCb',  # Even more simplified
            'div[role="main"]',  # Most generic
        ]

        for selector in pane_selectors:
            try:
                log.info(f"Trying selector: {selector}")
                wait = WebDriverWait(driver, 5)
                pane = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                log.info(f"✓ Found reviews pane with: {selector}")
                break
            except TimeoutException:
                log.debug(f"Pane not found with selector: {selector}")
                continue

        if not pane:
            log.error("Could not find reviews pane after all attempts!")
            log.error(f"Final URL: {driver.current_url}")
            # Save screenshot for debugging
            try:
                screenshot_path = 'pane_not_found.png'
                driver.save_screenshot(screenshot_path)
                log.info(f"Screenshot saved to {screenshot_path}")
            except Exception as e:
                log.debug(f"Could not save screenshot: {e}")
            return []

        # Wait for initial reviews to load
        log.info("Waiting for initial reviews to render...")
        time.sleep(3)

        # Check if any review cards are present (informational only)
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf')
            log.info(f"Found {len(cards)} initial review cards")
        except Exception:
            log.warning("Could not find initial review cards")

        # Step 4: Setup API interceptor (AFTER finding pane)
        log.info("Step 4: Setting up API interception...")
        interceptor = GoogleMapsAPIInterceptor(driver)
        try:
            interceptor.setup_interception()
            interceptor.inject_response_interceptor()
            log.info("✓ API interceptor ready - capturing network responses")
        except Exception as e:
            # exc_info attaches the traceback to the log record.
            log.warning(f"Failed to setup interceptor: {e}", exc_info=True)
        time.sleep(2)  # Extra wait for interception to be fully active
        log.info("")

        # Step 5: Rapid scrolling to trigger API calls
        log.info("="*60)
        log.info("Step 5: Rapid scrolling to trigger API calls")
        log.info("="*60)

        # Setup scroll script (same as original scraper)
        try:
            driver.execute_script("window.scrollablePane = arguments[0];", pane)
            scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
            log.info("✓ Scroll script setup complete")
        except Exception as e:
            log.warning(f"Error setting up scroll script: {e}")
            scroll_script = "window.scrollBy(0, 300);"  # Fallback

        # Verify interceptor is active (diagnostic logging only)
        try:
            is_injected = driver.execute_script(
                "return window.__reviewInterceptorInjected === true;")
            stats = driver.execute_script("return window.__interceptorStats;")
            queue_length = driver.execute_script(
                "return window.__interceptedResponses ? window.__interceptedResponses.length : -1;")
            log.info(f"Interceptor status: injected={is_injected}, queue={queue_length}, stats={stats}")
        except Exception as e:
            log.warning(f"Could not check interceptor status: {e}")

        # Trigger initial API call
        log.info("Triggering initial API call...")
        driver.execute_script(scroll_script)
        time.sleep(2)  # Wait for first API response
        log.info("")

        # We need about 25 API calls for 244 reviews (10 per call)
        # Scroll rapidly - no DOM parsing!
        scrolls_performed = 0  # tracked explicitly so the summary is valid even for max_scrolls == 0

        for i in range(max_scrolls):
            scrolls_performed = i + 1

            # Fast scroll
            driver.execute_script(scroll_script)
            time.sleep(0.3)  # Optimal timing - fast but captures all responses

            # Collect API responses
            try:
                responses = interceptor.get_intercepted_responses()
                if i == 5:  # Debug on scroll 5
                    log.info(f"DEBUG: Got {len(responses)} responses from interceptor")

                    # Check browser console
                    try:
                        console_logs = driver.get_log('browser')
                        interceptor_logs = [
                            entry for entry in console_logs
                            if 'API Interceptor' in entry.get('message', '')
                        ]
                        if interceptor_logs:
                            log.info("DEBUG: Interceptor console logs:")
                            for entry in interceptor_logs[-10:]:  # Last 10
                                log.info(f" {entry['message']}")
                        else:
                            log.info("DEBUG: No interceptor logs in console")
                    except Exception as e:
                        log.warning(f"Could not get console logs: {e}")

                if responses:
                    parsed = interceptor.parse_reviews_from_responses(responses)
                    if i == 5:  # Debug on scroll 5
                        log.info(f"DEBUG: Parsed {len(parsed)} reviews from responses")

                    for review in parsed:
                        # Skip reviews without an id and ones already seen.
                        if review.review_id and review.review_id not in api_reviews:
                            api_reviews[review.review_id] = {
                                'review_id': review.review_id,
                                'author': review.author,
                                'rating': review.rating,
                                'text': review.text,
                                'date_text': review.date_text,
                                'avatar_url': review.avatar_url,
                                'profile_url': review.profile_url,
                            }

                    if parsed:
                        log.info(f"Scroll {i+1}: +{len(parsed)} reviews | Total: {len(api_reviews)}")

                    # Exit early if we have enough
                    if len(api_reviews) >= target_reviews:
                        log.info(f"\n✓ Reached target of {target_reviews} reviews!")
                        break
            except Exception as e:
                # Keep scrolling: one bad batch should not abort the run.
                log.error(f"Error collecting API responses: {e}", exc_info=True)

            # Quick progress update
            if (i + 1) % 5 == 0:
                log.info(f"Progress: {i+1}/{max_scrolls} scrolls, {len(api_reviews)} reviews collected")

        elapsed = time.time() - start_time

        # Convert to list
        all_reviews = list(api_reviews.values())

        log.info("\n" + "="*60)
        log.info("✅ FAST SCRAPING COMPLETED!")
        log.info("="*60)
        log.info(f"Total reviews: {len(all_reviews)}")
        log.info(f"Scrolls performed: {scrolls_performed}")
        log.info(f"Time elapsed: {elapsed:.2f} seconds")
        if all_reviews and elapsed > 0:
            log.info(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/second")
        log.info("="*60 + "\n")

        # Save results
        output_file = 'google_reviews_fast.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        log.info(f"💾 Saved {len(all_reviews)} reviews to {output_file}")

        # Show sample
        if all_reviews:
            log.info("\n📝 Sample review:")
            sample = all_reviews[0]
            log.info(f" Author: {sample['author']}")
            log.info(f" Rating: {sample['rating']}★")
            log.info(f" Date: {sample['date_text']}")
            if sample['text']:
                log.info(f" Text: {sample['text'][:80]}...")

        # Stats comparison
        log.info("\n" + "="*60)
        log.info("SPEED COMPARISON")
        log.info("="*60)
        log.info("Old approach: ~155 seconds for 244 reviews")
        log.info(f"Fast approach: ~{elapsed:.0f} seconds for {len(all_reviews)} reviews")
        if elapsed > 0:
            log.info(f"Improvement: {155/elapsed:.1f}x faster! 🚀")
        log.info("="*60 + "\n")

        return all_reviews

    finally:
        # Always close the driver, even on Ctrl-C or an unexpected error.
        try:
            driver.quit()
        except Exception as e:
            log.debug(f"Error while closing driver: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Exit status: 0 when at least one review was scraped, 1 otherwise
    # (no reviews, user interrupt, or unexpected failure).
    exit_code = 1
    try:
        reviews = fast_scrape()
        if reviews:
            exit_code = 0
    except KeyboardInterrupt:
        log.info("\n\nInterrupted by user")
    except Exception as e:
        log.error(f"Fatal error: {e}")
        import traceback
        traceback.print_exc()
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user