Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

217
debug_business_card.py Normal file
View File

@@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
Debug script to inspect the actual HTML structure on Google Maps search results.
This will help us identify where the review count is located in the DOM.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By
# Launch a headless Chrome session through SeleniumBase and open the
# Google Maps search page whose DOM this script inspects.
print("Starting Chrome...")
driver_options = {
    "uc": True,
    "headless": True,
    "page_load_strategy": "normal",
}
driver = Driver(**driver_options)

# Google Maps search for Instinto; hl=en requests the English UI.
url = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"
print(f"\nNavigating to: {url}")
driver.get(url)
# Fixed pause so a possible redirect (e.g. to the consent page) can settle.
time.sleep(3)
# Dismiss the GDPR consent interstitial when the navigation was redirected
# to consent.google.com. Best effort: any failure is printed and ignored.
if 'consent.google.com' in driver.current_url:
    print("Handling GDPR consent...")

    def _is_accept_all(button):
        """Return True when the button label is the English/Spanish 'accept all'."""
        label = (button.text or '').lower()
        return 'accept all' in label or 'aceptar todo' in label

    try:
        consent_buttons = driver.find_elements(By.CSS_SELECTOR, 'form button')
        # filter() is lazy, so buttons are examined in order and the search
        # stops at the first match.
        accept_button = next(filter(_is_accept_all, consent_buttons), None)
        if accept_button is not None:
            print(f"Clicking: {accept_button.text}")
            accept_button.click()
            time.sleep(3)
        elif len(consent_buttons) >= 2:
            # No labelled match: fall back to the second form button.
            print("Using fallback - clicking second button")
            consent_buttons[1].click()
            time.sleep(3)
    except Exception as exc:
        print(f"GDPR handling error: {exc}")
# Give the page a fixed amount of time to finish rendering, then dump the
# first 3000 characters of its visible text for manual inspection.
print("\nWaiting for page to fully load...")
time.sleep(5)
print(f"\nCurrent URL: {driver.current_url}")

page_text = driver.execute_script("return document.body.innerText;")
rule = "=" * 80
print("\n" + rule)
print("ALL TEXT ON PAGE (first 3000 chars):")
print(rule)
preview = page_text[:3000]
print(preview)
# Probe the DOM for short text fragments that contain "152" or look like
# "<digits> review", and print where each one lives.
divider = "=" * 80
print("\n" + divider)
print("SEARCHING FOR ELEMENTS CONTAINING '152' OR 'review':")
print(divider)

# The JavaScript is sent to the browser verbatim. For every element whose
# innerText is non-empty, shorter than 200 characters and matches, it records
# tag/class/id/text/href/role/aria-label, returning at most 50 records.
# NOTE: innerText includes the text of descendants, so ancestors of a
# matching node are reported too; only the 200-character cap limits that.
find_candidates_js = """
const results = [];
const allElements = document.querySelectorAll('*');
for (let elem of allElements) {
const text = elem.textContent || '';
const ownText = elem.innerText || '';
// Only check elements that directly contain the text (not nested)
if (ownText && ownText.length < 200 && (ownText.includes('152') || /\\d+\\s*review/i.test(ownText))) {
results.push({
tag: elem.tagName,
class: elem.className,
id: elem.id,
text: ownText.substring(0, 100),
href: elem.href || null,
role: elem.getAttribute('role'),
ariaLabel: elem.getAttribute('aria-label')
});
}
}
return results.slice(0, 50); // First 50 matches
"""
candidates = driver.execute_script(find_candidates_js)

# One numbered entry per match; optional attributes are printed only when set.
for position, found in enumerate(candidates, start=1):
    class_attr = found['class']
    shown_class = class_attr[:50] if class_attr else ''
    print(f"\n{position}. <{found['tag']}> class='{shown_class}' id='{found['id']}'")

    role = found['role']
    if role:
        print(f" role: {role}")

    aria_label = found['ariaLabel']
    if aria_label:
        print(f" aria-label: {aria_label[:100]}")

    link = found['href']
    if link:
        print(f" href: {link[:100]}")

    print(f" text: {found['text']}")
# Run the business-info extraction logic in the page and print what it finds,
# together with a trace of how each value was located.
print("\n" + "="*80)
print("RUNNING ACTUAL EXTRACTION SCRIPT:")
print("="*80)

# JavaScript executed in the page (sent verbatim). It returns an object with:
#   name          - text of the first matching heading selector
#   rating        - first number in the aria-label of a role="img" element
#                   whose label contains "star"
#   total_reviews - first "<number> review|reseña|avis" match, searched in the
#                   review link/button selectors first and then in every
#                   span/div/a whose text is shorter than 100 characters;
#                   only values with 0 < n < 1000000 are accepted
#   debug_info    - human-readable trace lines describing each step
extract_script = """
const info = {
name: null,
address: null,
rating: null,
total_reviews: null,
debug_info: []
};
// Extract business name
const nameSelectors = [
'h1.DUwDvf',
'[role="main"] h1',
'h1.fontHeadlineLarge'
];
for (const selector of nameSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.name = elem.textContent.trim();
info.debug_info.push(`Found name via: ${selector}`);
break;
}
}
// Extract rating
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
if (ratingElem) {
const ariaLabel = ratingElem.getAttribute('aria-label');
const match = ariaLabel.match(/([0-9.]+)/);
if (match) {
info.rating = parseFloat(match[1]);
info.debug_info.push(`Found rating: ${info.rating} from aria-label: ${ariaLabel}`);
}
}
// Extract total review count
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
// Check search panel selectors
const searchPanelSelectors = [
'a[href*="reviews"]',
'button[jsaction*="reviews"]',
'div[role="link"]',
];
for (const selector of searchPanelSelectors) {
const elements = document.querySelectorAll(selector);
info.debug_info.push(`Checking ${selector}: found ${elements.length} elements`);
for (let elem of elements) {
const text = elem.textContent || '';
if (text.length < 200) {
info.debug_info.push(` - text: "${text.substring(0, 100)}"`);
}
const match = text.match(numberPattern);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
info.debug_info.push(` ✓ FOUND via ${selector}: ${num}`);
break;
}
}
}
if (info.total_reviews) break;
}
// If not found, try all spans/divs
if (!info.total_reviews) {
const allElements = document.querySelectorAll('span, div, a');
info.debug_info.push(`Checking all spans/divs/links: ${allElements.length} elements`);
let checked = 0;
for (let elem of allElements) {
const text = elem.textContent || '';
if (text.length < 100) {
const match = text.match(numberPattern);
if (match) {
checked++;
if (checked <= 10) { // Log first 10 matches
info.debug_info.push(` - potential match: "${text.substring(0, 80)}"`);
}
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
info.debug_info.push(` ✓ FOUND via all elements: ${num} from "${text.substring(0, 80)}"`);
break;
}
}
}
}
}
return info;
"""

# try/finally guarantees the headless Chrome process is shut down even when
# the in-page script or the reporting below raises; previously an exception
# here skipped driver.quit() and left the browser running.
try:
    result = driver.execute_script(extract_script)

    print("\nExtracted Info:")
    print(f" Name: {result.get('name')}")
    print(f" Rating: {result.get('rating')}")
    print(f" Total Reviews: {result.get('total_reviews')}")

    print("\nDebug Info:")
    for debug_line in result.get('debug_info', []):
        print(f" {debug_line}")

    print("\n" + "="*80)
    print("Done! Closing browser.")
    print("="*80)
finally:
    driver.quit()