Files
whyrating-engine-legacy/start_api_244.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

289 lines
10 KiB
Python

#!/usr/bin/env python3
"""
API-Only 244 Scraper - Attempt to get ALL 244 reviews via API alone.
Strategy:
1. More patient scrolling (more scrolls, longer waits)
2. Collect responses more frequently
3. Extra end-of-list collection
4. Slower timing near the end to ensure API completes
Goal: Get all 244 reviews via API without DOM parsing
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Module logger is created first, then the root handler is configured.
# The root logger stays at WARNING so third-party libraries are quiet,
# while this script's own logger is lowered to INFO.
log = logging.getLogger(__name__)
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.WARNING)
log.setLevel(logging.INFO)
def load_config(path='config.yaml'):
    """Load the scraper settings from a YAML file.

    Args:
        path: Location of the YAML configuration file. Defaults to
            ``config.yaml`` relative to the current working directory.

    Returns:
        The parsed YAML content, or an empty dict when the document is
        empty (or otherwise parses to a falsy value).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit UTF-8 so non-ASCII values decode identically on every platform
    # instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalise to {} so
        # callers can use .get() without a None check.
        return yaml.safe_load(f) or {}
# (reviews collected so far, seconds to pause after a scroll). The pause grows
# as the collection grows so late API responses have time to arrive.
_SCROLL_DELAYS = (
    (50, 0.30),
    (100, 0.35),
    (150, 0.40),
    (200, 0.50),
    (230, 0.60),
)
_FINAL_SCROLL_DELAY = 0.80


def _scroll_delay(collected):
    """Return the post-scroll pause (seconds) for the current review count."""
    for limit, delay in _SCROLL_DELAYS:
        if collected < limit:
            return delay
    return _FINAL_SCROLL_DELAY


def _review_to_dict(review):
    """Flatten a parsed review object into the plain dict that gets saved."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into ``api_reviews``; return the number added."""
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return 0
        for review in interceptor.parse_reviews_from_responses(responses):
            # Keyed by review_id, so repeated API pages never create duplicates.
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = _review_to_dict(review)
                added += 1
    except Exception as exc:
        # Best effort: one failed collection must not abort the whole run,
        # but it should be visible rather than silently discarded.
        log.warning("Failed to collect intercepted reviews: %s", exc)
    return added


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane element, or None if neither selector matches."""
    wait = WebDriverWait(driver, 3)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
    except TimeoutException:
        pass
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
    except Exception:
        log.debug("Fallback pane selector failed", exc_info=True)
        return None


def api_244_scrape(expected_total=244, output_path='google_reviews_api_244.json'):
    """Scrape reviews purely from intercepted API responses and save them as JSON.

    Opens the URL from the configuration, switches to the reviews tab, then
    scrolls the reviews pane repeatedly while gathering the responses captured
    by ``GoogleMapsAPIInterceptor``. A slower "aggressive" phase of ten extra
    scrolls and one last delayed collection follow the main loop.

    Args:
        expected_total: Number of reviews the listing is expected to have.
            Used for the progress/summary messages and for the early-stop
            threshold (the main loop may stop early only once at least
            ``expected_total - 4`` reviews were collected). Defaults to 244.
        output_path: File the collected reviews are written to as JSON.

    Returns:
        A list of review dicts (``review_id``, ``author``, ``rating``,
        ``text``, ``date_text``, ``avatar_url``, ``profile_url``). An empty
        list is returned when the reviews pane cannot be located.

    Raises:
        ValueError: If ``expected_total`` is not positive or the
            configuration has no ``url``.
    """
    if expected_total <= 0:
        raise ValueError("expected_total must be a positive integer")

    # Tolerate a loader that returns None for an empty configuration file.
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail before launching a browser instead of crashing on url[:80].
        raise ValueError("Configuration does not define a 'url' to scrape")
    headless = config.get('headless', False)

    # The main loop may stop early only when this many reviews are in hand.
    near_total = max(expected_total - 4, 0)

    print(f"API-{expected_total} SCRAPER - Getting ALL {expected_total} reviews via API...")
    print(f"URL: {url[:80]}...")
    start_time = time.time()
    api_reviews = {}
    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        # Dismiss the cookie banner if one is shown (English or Spanish label).
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.4)
        except Exception:
            log.debug("Cookie banner dismissal skipped", exc_info=True)

        # Click the reviews tab. The break only leaves the inner loop, so the
        # second selector is still tried after a match on the first one.
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        for selector in ['.LRkQ2', 'button[role="tab"]']:
            try:
                for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = (tab.text or '').lower()
                    aria = (tab.get_attribute('aria-label') or '').lower()
                    if any(kw in text or kw in aria for kw in review_keywords):
                        # JS click avoids "element not interactable" failures.
                        driver.execute_script("arguments[0].click();", tab)
                        time.sleep(0.4)
                        break
            except Exception:
                log.debug("Reviews tab lookup failed for %s", selector, exc_info=True)
                continue

        # Wait for page stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Longer wait to ensure interceptor is ready

        # Keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response
        print("Scrolling with extended collection strategy...")

        max_scrolls = 50
        max_idle = 15
        idle_scrolls = 0
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0
        for i in range(max_scrolls):
            driver.execute_script(scroll_script)
            # Progressive timing - slower as more reviews are collected.
            time.sleep(_scroll_delay(len(api_reviews)))
            _collect_reviews(interceptor, api_reviews)

            # Track consecutive scrolls that produced nothing new.
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
                # Progress line on every 10th scroll that brought new reviews.
                if (i + 1) % 10 == 0:
                    print(f" {current_count} reviews...")
            last_count = current_count

            # Track consecutive scrolls during which the pane did not move.
            try:
                current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception:
                log.debug("Could not read scroll position", exc_info=True)

            # Stop early only when idle, stuck AND close to the expected total.
            if (idle_scrolls >= max_idle and scroll_stuck_count >= 5
                    and current_count >= near_total):
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # AGGRESSIVE final collection phase: 10 more scrolls with long waits.
        print(f" Aggressive final collection (currently have {len(api_reviews)})...")
        for _ in range(10):
            driver.execute_script(scroll_script)
            time.sleep(1.2)  # Very long wait
            new_count = _collect_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Ultra-final wait and collect
        time.sleep(2.0)
        _collect_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        total = len(all_reviews)
        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {total}/{expected_total} ({total/expected_total*100:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        if elapsed > 0:
            # Both rates divide by elapsed, so both sit behind the same guard.
            print(f"Speed: {total/elapsed:.1f} reviews/sec")
            # NOTE(review): 155 looks like a baseline run time in seconds
            # for an earlier scraper - confirm before relying on it.
            print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")
        if total >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews via API!")
        elif total >= near_total:
            print(f"⚠️ Missing {expected_total-total} reviews - may need DOM parsing")
        else:
            print(f"⚠️ Missing {expected_total-total} reviews")
        print()

        # Save
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to {output_path}")
        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}")
        return all_reviews
    finally:
        # Always release the browser; a failing quit must not mask the result
        # or an exception already in flight.
        try:
            driver.quit()
        except Exception:
            log.debug("driver.quit() failed", exc_info=True)
if __name__ == '__main__':
    # Script entry point. Exit status is 0 when at least one review was
    # scraped, and 1 on an empty result, a user interrupt, or any error.
    try:
        scraped = api_244_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if scraped else 1)