Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector matched, to aid debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

142
debug_wait_for_results.py Normal file
View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Debug script - wait for search results to load before extracting.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# JavaScript run inside the page: it locates the first search-result card and
# pulls the business name, star rating and review count out of it, recording
# every step in `info.debug` so the Python side can print a trace.
# Backslashes are doubled because this is a regular (non-raw) Python string;
# the browser receives single-backslash regex escapes such as \d and \s.
EXTRACTION_SCRIPT = """
    const info = {
        businessName: null,
        rating: null,
        reviewCount: null,
        debug: []
    };
    // Find first result card
    const resultCard = document.querySelector('div[role="article"], a[href*="/place/"]');
    if (!resultCard) {
        info.debug.push('No result card found');
        return info;
    }
    info.debug.push('Found result card');
    // Get full text of card
    const cardText = resultCard.textContent || '';
    info.debug.push(`Card text length: ${cardText.length}`);
    info.debug.push(`Card text (first 300 chars): ${cardText.substring(0, 300)}`);
    // Extract business name (usually first h3 or div with specific class)
    const nameElem = resultCard.querySelector('h3, div.fontHeadlineSmall, div[class*="fontHeadline"]');
    if (nameElem) {
        info.businessName = nameElem.textContent.trim();
        info.debug.push(`Found name: ${info.businessName}`);
    }
    // Extract rating
    const ratingElem = resultCard.querySelector('[role="img"][aria-label*="star"]');
    if (ratingElem) {
        const ariaLabel = ratingElem.getAttribute('aria-label');
        const match = ariaLabel.match(/([0-9.]+)/);
        if (match) {
            info.rating = parseFloat(match[1]);
            info.debug.push(`Found rating: ${info.rating}`);
        }
    }
    // Extract review count - look for "N reviews" pattern
    const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
    const match = cardText.match(numberPattern);
    if (match) {
        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
        if (num > 0 && num < 1000000) {
            info.reviewCount = num;
            info.debug.push(`✓ Found review count: ${num}`);
        }
    } else {
        info.debug.push('No review count pattern found in card text');
        // Try checking individual child elements
        const allChildren = resultCard.querySelectorAll('*');
        info.debug.push(`Card has ${allChildren.length} child elements`);
        for (let child of allChildren) {
            const childText = child.textContent || '';
            if (childText.length < 100 && /review/i.test(childText)) {
                info.debug.push(`Element with "review": ${childText}`);
                const match = childText.match(numberPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    if (num > 0 && num < 1000000 && !info.reviewCount) {
                        info.reviewCount = num;
                        info.debug.push(`✓ Found via child element: ${num}`);
                    }
                }
            }
        }
    }
    return info;
"""

driver = Driver(uc=True, headless=True)
try:
    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
    print(f"Navigating to: {url}")
    driver.get(url)
    time.sleep(2)

    # Handle GDPR: if we were redirected to the consent page, click the
    # first form button whose label contains "accept all".
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR...")
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break

    print(f"Current URL: {driver.current_url}")
    print("Waiting for search results to load...\n")

    # Wait for search results to appear (but don't wait so long that Google
    # auto-navigates).
    try:
        # Wait for the first result card to appear
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[role="article"], a[href*="/place/"]')
        ))
        print("✓ Search results loaded!")
    except Exception as e:
        # Deliberately best-effort: this is a debug script, so report the
        # failure and still attempt extraction to see what the page holds.
        print(f"✗ Timeout waiting for results: {e}")

    # Give it just a tiny bit more time for content to render
    time.sleep(0.5)
    print(f"Current URL: {driver.current_url}")
    print(f"Still on search results: {'/search/' in driver.current_url}\n")

    # Extract
    result = driver.execute_script(EXTRACTION_SCRIPT)

    print("=" * 80)
    print("EXTRACTION RESULTS:")
    print("=" * 80)
    print(f"Business Name: {result['businessName']}")
    print(f"Rating: {result['rating']}")
    print(f"Review Count: {result['reviewCount']}\n")
    print("=" * 80)
    print("DEBUG INFO:")
    print("=" * 80)
    for debug_line in result['debug']:
        print(f" {debug_line}")

    # Take a screenshot of the search results. save_screenshot() reports
    # failure through its boolean return value, so only claim success when
    # it returned a truthy result.
    screenshot_path = '/tmp/search_results.png'
    if driver.save_screenshot(screenshot_path):
        print(f"\n✓ Screenshot saved to: {screenshot_path}")
    else:
        print(f"\n✗ Failed to save screenshot to: {screenshot_path}")
finally:
    # Always shut the browser down, even if a step above raised; otherwise
    # the headless browser process is left running.
    driver.quit()