Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions
--- a/debug_check.py
+++ b/debug_check.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""Quick debug to see what's happening.
+
+Loads the target URL from config.yaml, opens it in a visible Chrome window,
+gets past the Google consent page and cookie banner, clicks the reviews tab
+and prints how many review and pane elements the current selectors match.
+"""
+import yaml
+import time
+from seleniumbase import Driver
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.common.by import By
+
+# Consent-button locators, tried in order. Entries starting with '//' are
+# XPath, everything else is CSS. Selenium's CSS engine has no ':has-text()'
+# pseudo-class (that is Playwright syntax), so text matches use XPath.
+CONSENT_SELECTORS = [
+    '//button[contains(., "Accept all")]',
+    '//button[contains(., "Aceptar todo")]',
+    'button[aria-label*="Accept"]',
+    'button[aria-label*="Aceptar"]',
+    'form button[type="submit"]',
+    '//button[contains(., "Accept")]',
+    '//button[contains(., "Aceptar")]',
+]
+
+
+def load_config():
+    """Return the parsed config.yaml as a dict ({} if the file is empty)."""
+    with open('config.yaml', 'r', encoding='utf-8') as f:
+        # safe_load returns None for an empty document; fall back to {} so
+        # callers can always use .get().
+        return yaml.safe_load(f) or {}
+
+
+def accept_consent(driver):
+    """Click the first button matched by CONSENT_SELECTORS, if any."""
+    for selector in CONSENT_SELECTORS:
+        by = By.XPATH if selector.startswith('//') else By.CSS_SELECTOR
+        try:
+            btns = driver.find_elements(by, selector)
+            print(f"  Selector '{selector[:30]}...': found {len(btns)} buttons")
+            if btns:
+                print(f"  Clicking: {btns[0].text[:50]}")
+                btns[0].click()
+                time.sleep(2)
+                return
+        except WebDriverException as e:
+            # Best effort: report why this locator failed, then try the next.
+            print(f"  Selector '{selector[:30]}...' failed: {e}")
+
+
+config = load_config()
+url = config.get('url')
+if not url:
+    # Fail before launching a browser instead of crashing later on url[:100].
+    raise SystemExit("config.yaml has no 'url' entry")
+
+driver = Driver(uc=True, headless=False, page_load_strategy="normal")
+
+try:
+    print(f"Loading: {url[:100]}")
+    driver.get(url)
+    time.sleep(3)
+
+    print(f"Title: {driver.title}")
+    print(f"URL: {driver.current_url[:100]}")
+
+    time.sleep(2)
+
+    # Handle GDPR consent page
+    if 'consent.google.com' in driver.current_url:
+        print("On consent page, looking for accept button...")
+        try:
+            accept_consent(driver)
+            print(f"After consent click: {driver.current_url[:100]}")
+            time.sleep(3)
+        except WebDriverException as e:
+            print(f"Consent error: {e}")
+
+    # Now try cookie banner on Maps page
+    try:
+        cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i]')
+        print(f"Found {len(cookie_btns)} cookie buttons")
+        if cookie_btns:
+            cookie_btns[0].click()
+            time.sleep(1)
+    except WebDriverException as e:
+        print(f"Cookie error: {e}")
+
+    # Click reviews
+    tabs = driver.find_elements(By.CSS_SELECTOR, '.LRkQ2, button[role="tab"]')
+    print(f"Found {len(tabs)} tabs")
+    for tab in tabs:
+        text = (tab.text or '').lower()
+        if 'review' in text:
+            print(f"Clicking: {tab.text}")
+            driver.execute_script("arguments[0].click();", tab)
+            break
+    else:
+        # No tab text contained 'review'; say so instead of failing silently.
+        print("No reviews tab found")
+
+    time.sleep(3)
+
+    # Check reviews
+    reviews = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')
+    print(f"Found {len(reviews)} review elements")
+
+    # Check pane
+    panes = driver.find_elements(By.CSS_SELECTOR, 'div[role="main"] div.m6QErb')
+    print(f"Found {len(panes)} pane elements")
+
+    time.sleep(10)  # Keep browser open
+
+finally:
+    driver.quit()