Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions
--- a/debug_wait_for_results.py
+++ b/debug_wait_for_results.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""
+Debug script - wait for search results to load before extracting.
+
+Opens a Google Maps search in a headless SeleniumBase driver, accepts the
+consent page if the browser lands on consent.google.com, waits for the first
+result card, extracts business name / rating / review count from that card
+with in-page JavaScript, prints the values plus a debug trace, and saves a
+screenshot. Once created, the driver is always quit, even if a later step raises.
+"""
+import time
+from seleniumbase import Driver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# JavaScript run in the page. It returns an object with businessName, rating,
+# reviewCount (each null when not found) and a list of debug strings.
+# Backslashes are doubled because this is a regular (non-raw) Python string.
+EXTRACT_SCRIPT = """
+    const info = {
+        businessName: null,
+        rating: null,
+        reviewCount: null,
+        debug: []
+    };
+
+    // Find first result card
+    const resultCard = document.querySelector('div[role="article"], a[href*="/place/"]');
+    if (!resultCard) {
+        info.debug.push('No result card found');
+        return info;
+    }
+
+    info.debug.push('Found result card');
+
+    // Get full text of card
+    const cardText = resultCard.textContent || '';
+    info.debug.push(`Card text length: ${cardText.length}`);
+    info.debug.push(`Card text (first 300 chars): ${cardText.substring(0, 300)}`);
+
+    // Extract business name (usually first h3 or div with specific class)
+    const nameElem = resultCard.querySelector('h3, div.fontHeadlineSmall, div[class*="fontHeadline"]');
+    if (nameElem) {
+        info.businessName = nameElem.textContent.trim();
+        info.debug.push(`Found name: ${info.businessName}`);
+    }
+
+    // Extract rating
+    const ratingElem = resultCard.querySelector('[role="img"][aria-label*="star"]');
+    if (ratingElem) {
+        const ariaLabel = ratingElem.getAttribute('aria-label');
+        const match = ariaLabel.match(/([0-9.]+)/);
+        if (match) {
+            info.rating = parseFloat(match[1]);
+            info.debug.push(`Found rating: ${info.rating}`);
+        }
+    }
+
+    // Extract review count - look for "N reviews" pattern
+    const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
+    const match = cardText.match(numberPattern);
+
+    if (match) {
+        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
+        if (num > 0 && num < 1000000) {
+            info.reviewCount = num;
+            info.debug.push(`✓ Found review count: ${num}`);
+        }
+    } else {
+        info.debug.push('No review count pattern found in card text');
+
+        // Try checking individual child elements
+        const allChildren = resultCard.querySelectorAll('*');
+        info.debug.push(`Card has ${allChildren.length} child elements`);
+
+        for (let child of allChildren) {
+            const childText = child.textContent || '';
+            if (childText.length < 100 && /review/i.test(childText)) {
+                info.debug.push(`Element with "review": ${childText}`);
+
+                const match = childText.match(numberPattern);
+                if (match) {
+                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
+                    if (num > 0 && num < 1000000 && !info.reviewCount) {
+                        info.reviewCount = num;
+                        info.debug.push(`✓ Found via child element: ${num}`);
+                    }
+                }
+            }
+        }
+    }
+
+    return info;
+"""
+
+driver = Driver(uc=True, headless=True)
+
+# Everything after driver creation runs under try/finally so the browser
+# process is shut down even when navigation or extraction raises.
+try:
+    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
+    print(f"Navigating to: {url}")
+    driver.get(url)
+    time.sleep(2)
+
+    # Handle GDPR
+    if 'consent.google.com' in driver.current_url:
+        print("Handling GDPR...")
+        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
+        for btn in form_btns:
+            if 'accept all' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(2)
+                break
+
+    print(f"Current URL: {driver.current_url}")
+    print("Waiting for search results to load...\n")
+
+    # Wait for search results to appear (but don't wait so long that Google auto-navigates)
+    try:
+        # Wait for the first result card to appear
+        wait = WebDriverWait(driver, 10)
+        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="article"], a[href*="/place/"]')))
+        print("✓ Search results loaded!")
+    except Exception as e:
+        # Best effort: report and continue so the debug output below still
+        # shows what (if anything) is on the page.
+        print(f"✗ Timeout waiting for results: {e}")
+
+    # Give it just a tiny bit more time for content to render
+    time.sleep(0.5)
+
+    print(f"Current URL: {driver.current_url}")
+    print(f"Still on search results: {'/search/' in driver.current_url}\n")
+
+    # Extract
+    result = driver.execute_script(EXTRACT_SCRIPT)
+
+    print("="*80)
+    print("EXTRACTION RESULTS:")
+    print("="*80)
+    print(f"Business Name: {result['businessName']}")
+    print(f"Rating: {result['rating']}")
+    print(f"Review Count: {result['reviewCount']}\n")
+
+    print("="*80)
+    print("DEBUG INFO:")
+    print("="*80)
+    for debug_line in result['debug']:
+        print(f"  {debug_line}")
+
+    # Take a screenshot of the search results
+    screenshot_path = '/tmp/search_results.png'
+    driver.save_screenshot(screenshot_path)
+    print(f"\n✓ Screenshot saved to: {screenshot_path}")
+finally:
+    # Always release the browser, including on errors above.
+    driver.quit()