Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions
--- a/debug_detail_page.py
+++ b/debug_detail_page.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+Debug script - check detail page after auto-navigation for review count.
+
+Loads a Google Maps search URL in a headless SeleniumBase driver, accepts the
+consent form if the browser is redirected to one, waits for the page to
+settle, then prints every number followed by a "review" keyword: first from
+the page's visible text, then per DOM element via injected JavaScript.
+"""
+import re
+import time
+
+from seleniumbase import Driver
+from selenium.webdriver.common.by import By
+
+# A number (digits with optional , or . separators) followed by a review
+# keyword in English, Spanish or French.
+REVIEW_PATTERN = r'(\d[\d,\.]*)\s*(?:review|reseña|avis)'
+SEPARATOR = "=" * 80
+
+driver = Driver(uc=True, headless=True)
+
+# Everything after driver creation runs under try/finally so the browser is
+# shut down even when a step raises part-way through.
+try:
+    url = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"
+    print(f"Navigating to: {url}")
+    driver.get(url)
+    time.sleep(2)
+
+    # Handle GDPR: click the first form button labelled "accept all".
+    if 'consent.google.com' in driver.current_url:
+        print("Handling GDPR...")
+        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
+        for btn in form_btns:
+            if 'accept all' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(2)
+                break
+
+    # Wait for auto-navigation to complete
+    print("Waiting for Google Maps to auto-navigate to business detail page...")
+    time.sleep(6)
+
+    print(f"Final URL: {driver.current_url}")
+    print(f"On detail page: {'/place/' in driver.current_url}\n")
+
+    # Dump ALL text on the page
+    all_text = driver.execute_script("return document.body.innerText;")
+
+    print(SEPARATOR)
+    print("SEARCHING FOR REVIEW NUMBERS IN PAGE TEXT:")
+    print(SEPARATOR)
+
+    # Find all numbers followed by "review"
+    matches = re.findall(REVIEW_PATTERN, all_text, re.IGNORECASE)
+
+    if matches:
+        print(f"✓ Found {len(matches)} potential review count(s) in text:")
+        for i, match in enumerate(matches, 1):
+            # Strip thousands separators to show the bare digits.
+            num = match.replace(',', '').replace('.', '')
+            print(f"  {i}. {match} ({num})")
+    else:
+        print("✗ No review count found in page text")
+
+    # Check specific patterns in the text
+    print(f"\n{SEPARATOR}")
+    print("PAGE TEXT ANALYSIS:")
+    print(SEPARATOR)
+
+    # Short (< 100 chars), non-empty lines that contain at least one digit
+    number_lines = [
+        stripped
+        for stripped in (line.strip() for line in all_text.split('\n'))
+        if stripped and len(stripped) < 100 and re.search(r'\d', stripped)
+    ]
+
+    print("Lines containing numbers (first 30):")
+    for i, line in enumerate(number_lines[:30], 1):
+        print(f"  {i}. {line}")
+
+    # Now use JavaScript to find exact element. Backslashes are doubled
+    # because this is a regular (non-raw) Python string.
+    result = driver.execute_script("""
+        const info = {
+            foundIn: [],
+            reviewCount: null
+        };
+
+        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
+
+        // Check ALL elements
+        const allElements = document.querySelectorAll('*');
+
+        for (let elem of allElements) {
+            // Check both textContent and innerText; the Set skips the second
+            // pass when both are identical so an element is not listed twice.
+            const texts = new Set([elem.textContent || '', elem.innerText || '']);
+
+            for (let txt of texts) {
+                if (txt && txt.length < 200) {
+                    const match = txt.match(numberPattern);
+                    if (match) {
+                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''), 10);
+                        if (num > 0 && num < 1000000) {
+                            info.foundIn.push({
+                                tag: elem.tagName,
+                                // getAttribute always yields a string or null;
+                                // className is an object on SVG elements.
+                                class: elem.getAttribute('class'),
+                                id: elem.id,
+                                role: elem.getAttribute('role'),
+                                ariaLabel: elem.getAttribute('aria-label'),
+                                text: txt.substring(0, 100),
+                                number: num
+                            });
+
+                            if (!info.reviewCount) {
+                                info.reviewCount = num;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return info;
+    """)
+
+    print(f"\n{SEPARATOR}")
+    print("JAVASCRIPT EXTRACTION:")
+    print(SEPARATOR)
+    print(f"Review Count Found: {result['reviewCount']}\n")
+
+    if result['foundIn']:
+        print("Elements containing review numbers (first 15):")
+        for i, elem in enumerate(result['foundIn'][:15], 1):
+            print(f"\n{i}. <{elem['tag']}> Number: {elem['number']}")
+            if elem['class']:
+                print(f"   class: {elem['class'][:60]}")
+            if elem['role']:
+                print(f"   role: {elem['role']}")
+            if elem['ariaLabel']:
+                print(f"   aria-label: {elem['ariaLabel'][:80]}")
+            print(f"   text: {elem['text']}")
+    else:
+        print("No elements with review numbers found")
+finally:
+    driver.quit()