Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
217
debug_business_card.py
Normal file
217
debug_business_card.py
Normal file
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug script to inspect the actual HTML structure on Google Maps search results.
|
||||
This will help us identify where the review count is located in the DOM.
|
||||
"""
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
# ---------------------------------------------------------------------------
# Debug run: open a Google Maps search, dump the visible page text, list the
# elements whose text looks like a review count, then run the review-count
# extraction script and print what it found together with its debug trail.
#
# The whole session runs inside try/finally so the headless Chrome process is
# always shut down, even when one of the inspection steps raises.
# ---------------------------------------------------------------------------

# Literal number searched for in the page text. Presumably the review count
# shown for the sample business when this script was written; change it when
# debugging another listing.
EXPECTED_REVIEW_COUNT = "152"

SEPARATOR = "=" * 80

# Initialize driver
print("Starting Chrome...")
driver = Driver(
    uc=True,
    headless=True,
    page_load_strategy="normal"
)

try:
    # Navigate to Google Maps search for Instinto
    url = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"
    print(f"\nNavigating to: {url}")
    driver.get(url)
    time.sleep(3)

    # Handle GDPR consent if present
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR consent...")
        try:
            form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
            for btn in form_btns:
                btn_text = (btn.text or '').lower()
                if 'accept all' in btn_text or 'aceptar todo' in btn_text:
                    print(f"Clicking: {btn.text}")
                    btn.click()
                    time.sleep(3)
                    break
            else:
                # No button carried a recognised "accept" label: fall back to
                # the second form button.
                if len(form_btns) >= 2:
                    print("Using fallback - clicking second button")
                    form_btns[1].click()
                    time.sleep(3)
        except Exception as e:
            # Consent handling is best effort; keep going and inspect whatever
            # page we ended up on.
            print(f"GDPR handling error: {e}")

    # Wait for page to load
    print("\nWaiting for page to fully load...")
    time.sleep(5)

    print(f"\nCurrent URL: {driver.current_url}")

    # Get all text content on the page
    all_text = driver.execute_script("return document.body.innerText;")
    print("\n" + SEPARATOR)
    print("ALL TEXT ON PAGE (first 3000 chars):")
    print(SEPARATOR)
    print(all_text[:3000])

    # Search for elements containing the expected count or the word "review"
    print("\n" + SEPARATOR)
    print(f"SEARCHING FOR ELEMENTS CONTAINING '{EXPECTED_REVIEW_COUNT}' OR 'review':")
    print(SEPARATOR)

    # The expected count is passed as arguments[0] so the header above and the
    # in-page search can never drift apart.
    elements_with_numbers = driver.execute_script("""
        const needle = arguments[0];
        const results = [];

        for (const elem of document.querySelectorAll('*')) {
            // innerText also contains the text of descendants, so ancestors of
            // a match are reported too; the 200-character cap is what keeps
            // large containers out of the results.
            const visibleText = elem.innerText || '';
            if (!visibleText || visibleText.length >= 200) {
                continue;
            }
            if (!visibleText.includes(needle) && !/\\d+\\s*review/i.test(visibleText)) {
                continue;
            }

            results.push({
                tag: elem.tagName,
                // getAttribute always yields a string or null, unlike
                // className, which is an object on SVG elements.
                class: elem.getAttribute('class') || '',
                id: elem.id,
                text: visibleText.substring(0, 100),
                href: elem.href || null,
                role: elem.getAttribute('role'),
                ariaLabel: elem.getAttribute('aria-label')
            });
        }

        return results.slice(0, 50);  // First 50 matches
    """, EXPECTED_REVIEW_COUNT)

    for i, elem in enumerate(elements_with_numbers, 1):
        print(f"\n{i}. <{elem['tag']}> "
              f"class='{elem['class'][:50] if elem['class'] else ''}' "
              f"id='{elem['id']}'")
        if elem['role']:
            print(f"   role: {elem['role']}")
        if elem['ariaLabel']:
            print(f"   aria-label: {elem['ariaLabel'][:100]}")
        if elem['href']:
            print(f"   href: {elem['href'][:100]}")
        print(f"   text: {elem['text']}")

    # Also check what the extraction script would find
    print("\n" + SEPARATOR)
    print("RUNNING ACTUAL EXTRACTION SCRIPT:")
    print(SEPARATOR)

    # In-page script returning {name, address, rating, total_reviews,
    # debug_info}. `address` is never filled in here and stays null.
    extract_script = """
        const info = {
            name: null,
            address: null,
            rating: null,
            total_reviews: null,
            debug_info: []
        };

        // Extract business name: first selector that yields text wins.
        const nameSelectors = [
            'h1.DUwDvf',
            '[role="main"] h1',
            'h1.fontHeadlineLarge'
        ];

        for (const selector of nameSelectors) {
            const elem = document.querySelector(selector);
            if (elem && elem.textContent) {
                info.name = elem.textContent.trim();
                info.debug_info.push(`Found name via: ${selector}`);
                break;
            }
        }

        // Extract rating from the first number in the star widget's aria-label.
        const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
        if (ratingElem) {
            const ariaLabel = ratingElem.getAttribute('aria-label');
            const match = ariaLabel.match(/([0-9.]+)/);
            if (match) {
                info.rating = parseFloat(match[1]);
                info.debug_info.push(`Found rating: ${info.rating} from aria-label: ${ariaLabel}`);
            }
        }

        // Extract total review count
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

        // Strip thousands separators and parse as base 10; returns the count,
        // or null when it falls outside the plausible range (0, 1000000).
        const toCount = (match) => {
            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''), 10);
            return (num > 0 && num < 1000000) ? num : null;
        };

        // Check search panel selectors
        const searchPanelSelectors = [
            'a[href*="reviews"]',
            'button[jsaction*="reviews"]',
            'div[role="link"]',
        ];

        for (const selector of searchPanelSelectors) {
            const elements = document.querySelectorAll(selector);
            info.debug_info.push(`Checking ${selector}: found ${elements.length} elements`);

            for (const elem of elements) {
                const text = elem.textContent || '';
                if (text.length < 200) {
                    info.debug_info.push(`  - text: "${text.substring(0, 100)}"`);
                }

                const match = text.match(numberPattern);
                if (match) {
                    const num = toCount(match);
                    if (num !== null) {
                        info.total_reviews = num;
                        info.debug_info.push(`  ✓ FOUND via ${selector}: ${num}`);
                        break;
                    }
                }
            }
            if (info.total_reviews) break;
        }

        // If not found, try all spans/divs
        if (!info.total_reviews) {
            const allElements = document.querySelectorAll('span, div, a');
            info.debug_info.push(`Checking all spans/divs/links: ${allElements.length} elements`);

            let checked = 0;
            for (const elem of allElements) {
                const text = elem.textContent || '';
                if (text.length >= 100) {
                    continue;
                }
                const match = text.match(numberPattern);
                if (!match) {
                    continue;
                }

                checked++;
                if (checked <= 10) {  // Log first 10 matches
                    info.debug_info.push(`  - potential match: "${text.substring(0, 80)}"`);
                }

                const num = toCount(match);
                if (num !== null) {
                    info.total_reviews = num;
                    info.debug_info.push(`  ✓ FOUND via all elements: ${num} from "${text.substring(0, 80)}"`);
                    break;
                }
            }
        }

        return info;
    """

    result = driver.execute_script(extract_script)

    print("\nExtracted Info:")
    print(f"  Name: {result.get('name')}")
    print(f"  Rating: {result.get('rating')}")
    print(f"  Total Reviews: {result.get('total_reviews')}")

    print("\nDebug Info:")
    for debug_line in result.get('debug_info', []):
        print(f"  {debug_line}")

    print("\n" + SEPARATOR)
    print("Done! Closing browser.")
    print(SEPARATOR)
finally:
    # Always release the browser, including when a step above failed.
    driver.quit()
|
||||
Reference in New Issue
Block a user