Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

171
debug_search_results.py Normal file
View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
Debug script to extract review count from search results BEFORE auto-navigation.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By
# JavaScript run inside the Google Maps search-results page.
#
# It returns an object with these keys:
#   businessName       - text of the first element matching one of the name
#                        selectors, or null
#   rating             - first number found in the star aria-label, or null
#   reviewCount        - first number followed by "review"/"reseña"/"avis"
#                        (first result card first, then a scan of short
#                        sidebar texts), or null
#   searchResults      - declared but never filled by the script
#   allTextWithNumbers - every short sidebar text that matched the pattern
#                        (only populated when the sidebar scan runs)
#
# This is deliberately a plain (non-raw) Python string: the doubled
# backslashes below reach the browser as single regex escapes (\d, \., \s).
# The text is passed to the browser verbatim, so do not reformat it casually.
EXTRACTION_SCRIPT = """
const info = {
businessName: null,
rating: null,
reviewCount: null,
searchResults: [],
allTextWithNumbers: []
};
console.log('[EXTRACTION] Starting search results extraction...');
// Get business name from first result card
const nameSelectors = [
'div[role="article"] h3',
'div[role="article"] div.fontHeadlineSmall',
'div[aria-label*="Results"] h3',
'a[href*="/place/"] h3',
'div.Nv2PK h3' // Google Maps class for business name in search results
];
for (const selector of nameSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.businessName = elem.textContent.trim();
console.log(`[EXTRACTION] Found name via ${selector}: ${info.businessName}`);
break;
}
}
// Get rating from first result
const ratingElem = document.querySelector('div[role="article"] [role="img"][aria-label*="star"], a[href*="/place/"] [role="img"][aria-label*="star"]');
if (ratingElem) {
const ariaLabel = ratingElem.getAttribute('aria-label');
const match = ariaLabel.match(/([0-9.]+)/);
if (match) {
info.rating = parseFloat(match[1]);
console.log(`[EXTRACTION] Found rating: ${info.rating}`);
}
}
// CRITICAL: Extract review count from search results sidebar
// Look for patterns like "152 reviews", "247 reviews", etc.
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
// Strategy 1: Check first result card/article
const resultCards = document.querySelectorAll('div[role="article"], a[href*="/place/"], div.Nv2PK');
console.log(`[EXTRACTION] Found ${resultCards.length} result cards`);
for (let card of resultCards) {
const text = card.textContent || '';
console.log(`[EXTRACTION] Card text (first 200 chars): ${text.substring(0, 200)}`);
const match = text.match(numberPattern);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.reviewCount = num;
console.log(`[EXTRACTION] ✓ Found review count in card: ${num}`);
break;
}
}
// Only check first card
break;
}
// Strategy 2: Check all elements in left sidebar/panel
if (!info.reviewCount) {
console.log('[EXTRACTION] Strategy 2: Checking all sidebar elements...');
const leftPanel = document.querySelector('div[role="main"]') || document.querySelector('[aria-label*="Results"]') || document.body;
const allElements = leftPanel.querySelectorAll('span, div, a, button');
console.log(`[EXTRACTION] Checking ${allElements.length} elements in sidebar...`);
for (let elem of allElements) {
const text = elem.textContent || '';
// Skip very long text blocks (likely not the review count)
if (text.length > 0 && text.length < 150) {
const match = text.match(numberPattern);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.allTextWithNumbers.push({
tag: elem.tagName,
text: text,
number: num
});
if (!info.reviewCount) {
info.reviewCount = num;
console.log(`[EXTRACTION] ✓ Found via sidebar scan: ${num} from "${text}"`);
}
}
}
}
}
}
console.log(`[EXTRACTION] Final result: ${info.reviewCount} reviews`);
return info;
"""

driver = Driver(uc=True, headless=True)

# Everything after the browser is created runs inside try/finally so the
# Chrome process is always shut down, even when navigation, the injected
# script, or the reporting code raises.
try:
    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
    print(f"Navigating to: {url}")
    driver.get(url)
    time.sleep(2)

    # Handle GDPR: if we were redirected to the consent page, click the
    # first form button whose label contains "accept all".
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR...")
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break

    # SHORT WAIT - extract quickly before auto-navigation!
    time.sleep(1.5)

    print(f"Current URL (should still be /search/): {driver.current_url}")
    is_search = '/search/' in driver.current_url
    print(f"Still on search results: {is_search}\n")

    # FAST extraction from search results sidebar
    result = driver.execute_script(EXTRACTION_SCRIPT)

    print("="*80)
    print("EXTRACTION RESULTS (from search results page):")
    print("="*80)
    print(f"Business Name: {result['businessName']}")
    print(f"Rating: {result['rating']}")
    print(f"Review Count: {result['reviewCount']}")

    if result['allTextWithNumbers']:
        print(f"\n{'='*80}")
        print("ALL ELEMENTS WITH REVIEW NUMBERS (first 10):")
        print("="*80)
        for i, item in enumerate(result['allTextWithNumbers'][:10], 1):
            print(f"\n{i}. <{item['tag']}> Number: {item['number']}")
            print(f" Text: {item['text'][:100]}")

    # Check browser console: print only the entries tagged by the script.
    # NOTE(review): whether console.log output shows up here depends on the
    # browser logging level the driver was started with - confirm.
    console_logs = driver.get_log('browser')
    print(f"\n{'='*80}")
    print("BROWSER CONSOLE LOGS:")
    print("="*80)
    for log in console_logs:
        if '[EXTRACTION]' in log['message']:
            print(log['message'])

    # Wait a bit longer to see if Google auto-navigates
    print(f"\n{'='*80}")
    print("Waiting 5 more seconds to see if Google auto-navigates...")
    print("="*80)
    time.sleep(5)

    print(f"URL after waiting: {driver.current_url}")
    print(f"Still on search results: {'/search/' in driver.current_url}")
finally:
    driver.quit()