Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions
--- a/debug_business_card.py
+++ b/debug_business_card.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+"""
+Debug script to inspect the actual HTML structure on Google Maps search results.
+This will help us identify where the review count is located in the DOM.
+
+Steps, in order: start headless Chrome, open a hard-coded Maps search, click
+through the GDPR consent page if the browser was redirected to it, print the
+visible page text, list elements whose text looks like a review count, then
+run the extraction JavaScript and print its result and debug trail.
+
+The browser is closed in a ``finally`` block so that a failing step cannot
+leave a headless Chrome process running.
+"""
+import time
+from seleniumbase import Driver
+from selenium.webdriver.common.by import By
+
+# JavaScript run in the page: returns up to 50 elements whose rendered text is
+# shorter than 200 characters and contains "152" or "<digits> review".
+find_elements_script = """
+    const results = [];
+    const allElements = document.querySelectorAll('*');
+
+    for (let elem of allElements) {
+        const visibleText = elem.innerText || '';
+
+        // innerText includes descendants' text, so ancestors of a match are
+        // reported too; the length cap only filters out large containers.
+        if (visibleText && visibleText.length < 200 && (visibleText.includes('152') || /\\d+\\s*review/i.test(visibleText))) {
+            results.push({
+                tag: elem.tagName,
+                class: elem.className,
+                id: elem.id,
+                text: visibleText.substring(0, 100),
+                href: elem.href || null,
+                role: elem.getAttribute('role'),
+                ariaLabel: elem.getAttribute('aria-label')
+            });
+        }
+    }
+
+    return results.slice(0, 50);  // First 50 matches
+"""
+
+# JavaScript run in the page: tries to read the business name, star rating and
+# total review count, recording each step it takes in `debug_info`.
+extract_script = """
+const info = {
+    name: null,
+    address: null,
+    rating: null,
+    total_reviews: null,
+    debug_info: []
+};
+
+// Extract business name
+const nameSelectors = [
+    'h1.DUwDvf',
+    '[role="main"] h1',
+    'h1.fontHeadlineLarge'
+];
+
+for (const selector of nameSelectors) {
+    const elem = document.querySelector(selector);
+    if (elem && elem.textContent) {
+        info.name = elem.textContent.trim();
+        info.debug_info.push(`Found name via: ${selector}`);
+        break;
+    }
+}
+
+// Extract rating
+const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
+if (ratingElem) {
+    const ariaLabel = ratingElem.getAttribute('aria-label');
+    const match = ariaLabel.match(/([0-9.]+)/);
+    if (match) {
+        info.rating = parseFloat(match[1]);
+        info.debug_info.push(`Found rating: ${info.rating} from aria-label: ${ariaLabel}`);
+    }
+}
+
+// Extract total review count
+const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
+
+// Check search panel selectors
+const searchPanelSelectors = [
+    'a[href*="reviews"]',
+    'button[jsaction*="reviews"]',
+    'div[role="link"]',
+];
+
+for (const selector of searchPanelSelectors) {
+    const elements = document.querySelectorAll(selector);
+    info.debug_info.push(`Checking ${selector}: found ${elements.length} elements`);
+
+    for (let elem of elements) {
+        const text = elem.textContent || '';
+        if (text.length < 200) {
+            info.debug_info.push(`  - text: "${text.substring(0, 100)}"`);
+        }
+
+        const match = text.match(numberPattern);
+        if (match) {
+            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
+            if (num > 0 && num < 1000000) {
+                info.total_reviews = num;
+                info.debug_info.push(`  ✓ FOUND via ${selector}: ${num}`);
+                break;
+            }
+        }
+    }
+    if (info.total_reviews) break;
+}
+
+// If not found, try all spans/divs
+if (!info.total_reviews) {
+    const allElements = document.querySelectorAll('span, div, a');
+    info.debug_info.push(`Checking all spans/divs/links: ${allElements.length} elements`);
+
+    let checked = 0;
+    for (let elem of allElements) {
+        const text = elem.textContent || '';
+        if (text.length < 100) {
+            const match = text.match(numberPattern);
+            if (match) {
+                checked++;
+                if (checked <= 10) {  // Log first 10 matches
+                    info.debug_info.push(`  - potential match: "${text.substring(0, 80)}"`);
+                }
+
+                const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
+                if (num > 0 && num < 1000000) {
+                    info.total_reviews = num;
+                    info.debug_info.push(`  ✓ FOUND via all elements: ${num} from "${text.substring(0, 80)}"`);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+return info;
+"""
+
+# Initialize driver
+print("Starting Chrome...")
+driver = Driver(
+    uc=True,
+    headless=True,
+    page_load_strategy="normal"
+)
+
+try:
+    # Navigate to Google Maps search for Instinto
+    url = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"
+    print(f"\nNavigating to: {url}")
+    driver.get(url)
+    time.sleep(3)
+
+    # Handle GDPR consent if present
+    if 'consent.google.com' in driver.current_url:
+        print("Handling GDPR consent...")
+        try:
+            form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
+            for btn in form_btns:
+                btn_text = (btn.text or '').lower()
+                if 'accept all' in btn_text or 'aceptar todo' in btn_text:
+                    print(f"Clicking: {btn.text}")
+                    btn.click()
+                    time.sleep(3)
+                    break
+            else:
+                # for/else: reached only when no button label matched above.
+                if len(form_btns) >= 2:
+                    print("Using fallback - clicking second button")
+                    form_btns[1].click()
+                    time.sleep(3)
+        except Exception as e:
+            # Best effort: a consent failure is reported and the run continues.
+            print(f"GDPR handling error: {e}")
+
+    # Wait for page to load
+    print("\nWaiting for page to fully load...")
+    time.sleep(5)
+
+    print(f"\nCurrent URL: {driver.current_url}")
+
+    # Get all text content on the page
+    all_text = driver.execute_script("return document.body.innerText;")
+    print("\n" + "="*80)
+    print("ALL TEXT ON PAGE (first 3000 chars):")
+    print("="*80)
+    print(all_text[:3000])
+
+    # Search for elements containing "152" or "review"
+    print("\n" + "="*80)
+    print("SEARCHING FOR ELEMENTS CONTAINING '152' OR 'review':")
+    print("="*80)
+
+    elements_with_numbers = driver.execute_script(find_elements_script)
+
+    for i, elem in enumerate(elements_with_numbers, 1):
+        print(f"\n{i}. <{elem['tag']}> "
+              f"class='{elem['class'][:50] if elem['class'] else ''}' "
+              f"id='{elem['id']}'")
+        if elem['role']:
+            print(f"   role: {elem['role']}")
+        if elem['ariaLabel']:
+            print(f"   aria-label: {elem['ariaLabel'][:100]}")
+        if elem['href']:
+            print(f"   href: {elem['href'][:100]}")
+        print(f"   text: {elem['text']}")
+
+    # Also check what the extraction script would find
+    print("\n" + "="*80)
+    print("RUNNING ACTUAL EXTRACTION SCRIPT:")
+    print("="*80)
+
+    result = driver.execute_script(extract_script)
+
+    print("\nExtracted Info:")
+    print(f"  Name: {result.get('name')}")
+    print(f"  Rating: {result.get('rating')}")
+    print(f"  Total Reviews: {result.get('total_reviews')}")
+
+    print("\nDebug Info:")
+    for debug_line in result.get('debug_info', []):
+        print(f"  {debug_line}")
+
+    print("\n" + "="*80)
+    print("Done! Closing browser.")
+    print("="*80)
+finally:
+    # Always release the browser, including when a step above raised.
+    driver.quit()