Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions
--- a/debug_tabs.py
+++ b/debug_tabs.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""
+Debug script to find review count on business detail page tabs.
+
+Loads a Google Maps search URL in a headless SeleniumBase driver, accepts the
+consent page if one is shown, then prints every tab button found, the review
+count parsed from the tabs, and short page texts that mention "review".
+"""
+import time
+from seleniumbase import Driver
+from selenium.webdriver.common.by import By
+
+SEARCH_URL = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"
+
+# JavaScript run in the page. It returns an object with:
+#   tabs        - index, text and aria-label of every button[role="tab"]
+#   reviewCount - first number in 1..999999 parsed from a tab's text or
+#                 aria-label (either "(1,234)" or "1,234 reviews"), else null
+#   allText     - distinct trimmed textContent values shorter than 150 chars
+#                 that match /review/i
+EXTRACT_TABS_JS = """
+    const info = {
+        tabs: [],
+        reviewCount: null,
+        allText: []
+    };
+
+    // Get all tabs
+    const tabs = document.querySelectorAll('button[role="tab"]');
+    tabs.forEach((tab, i) => {
+        info.tabs.push({
+            index: i,
+            text: tab.textContent || '',
+            ariaLabel: tab.getAttribute('aria-label') || ''
+        });
+    });
+
+    // Look for review count patterns
+    const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
+    const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
+
+    for (let tab of tabs) {
+        const text = tab.textContent || '';
+        const ariaLabel = tab.getAttribute('aria-label') || '';
+
+        let match = text.match(reviewPattern);
+        if (!match) match = text.match(numberPattern);
+        if (!match) match = ariaLabel.match(reviewPattern);
+        if (!match) match = ariaLabel.match(numberPattern);
+
+        if (match) {
+            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
+            if (num > 0 && num < 1000000) {
+                info.reviewCount = num;
+                break;
+            }
+        }
+    }
+
+    // Also check all elements with "review" in text
+    const allElements = document.querySelectorAll('*');
+    for (let elem of allElements) {
+        const text = (elem.textContent || '').trim();
+        if (text.length > 0 && text.length < 150 && /review/i.test(text)) {
+            if (!info.allText.includes(text)) {
+                info.allText.push(text);
+            }
+        }
+    }
+
+    return info;
+"""
+
+driver = Driver(uc=True, headless=True)
+
+# Run everything inside try/finally so the browser is always shut down, even
+# when navigation or extraction raises; otherwise a failed run would leave the
+# headless browser process behind.
+try:
+    print(f"Navigating to: {SEARCH_URL}")
+    driver.get(SEARCH_URL)
+    time.sleep(3)
+
+    # Handle GDPR: if we landed on the consent page, click the first form
+    # button whose text contains "accept all" and pause before continuing.
+    if 'consent.google.com' in driver.current_url:
+        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
+        for btn in form_btns:
+            if 'accept all' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(3)
+                break
+
+    # Fixed pause before querying the DOM.
+    time.sleep(5)
+    print(f"Current URL: {driver.current_url}\n")
+
+    # Extract tabs and review count
+    result = driver.execute_script(EXTRACT_TABS_JS)
+
+    print("="*80)
+    print("TABS FOUND:")
+    print("="*80)
+    for tab in result['tabs']:
+        print(f"\nTab {tab['index']}:")
+        print(f"  Text: {tab['text']}")
+        print(f"  Aria-label: {tab['ariaLabel']}")
+
+    print(f"\n{'='*80}")
+    print(f"REVIEW COUNT EXTRACTED: {result['reviewCount']}")
+    print(f"{'='*80}\n")
+
+    print("="*80)
+    print("ALL TEXT CONTAINING 'review' (first 20):")
+    print("="*80)
+    for i, text in enumerate(result['allText'][:20], 1):
+        print(f"{i}. {text}")
+finally:
+    driver.quit()