Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions
--- a/start_dom_only_fast.py
+++ b/start_dom_only_fast.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python3
+"""
+DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.
+
+Strategy:
+1. Scroll to load all reviews
+2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
+3. Should be faster and simpler than API + DOM hybrid
+
+Target: ~20-25 seconds for all 244 reviews with simpler code
+"""
+import sys
+import yaml
+import logging
+import time
+import json
+from seleniumbase import Driver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException
+
+# The root logger is configured at WARNING (presumably to keep third-party
+# library output quiet - confirm), using a "[LEVEL] message" line format.
+logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
+# Module-level logger. NOTE(review): the visible code reports progress with
+# print() and never calls `log`.
+log = logging.getLogger(__name__)
+# This module's logger is set to INFO independently of the root level above.
+log.setLevel(logging.INFO)
+
+
+def load_config(path='config.yaml'):
+    """Load the scraper settings from a YAML file.
+
+    Args:
+        path: Location of the YAML file. Defaults to ``config.yaml`` relative
+            to the current working directory, which is what this function
+            always read before the parameter existed.
+
+    Returns:
+        The parsed YAML document. An empty file yields ``{}`` instead of
+        ``None`` so callers can use ``.get()`` without a prior check.
+
+    Raises:
+        OSError: If the file cannot be opened (e.g. it does not exist).
+        yaml.YAMLError: If the content is not valid YAML.
+    """
+    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
+    with open(path, 'r', encoding='utf-8') as f:
+        config = yaml.safe_load(f)
+
+    # safe_load returns None for an empty document.
+    return {} if config is None else config
+
+
+def extract_all_reviews_js(driver):
+    """Extract every loaded review from the page with one JavaScript call.
+
+    The script walks all ``div.jftiEf.fontBodyMedium`` elements and reads the
+    author, rating, text, date, avatar and profile fields of each. Elements
+    without both an author and a date are skipped.
+
+    Args:
+        driver: Object exposing ``execute_script(script)``; here the
+            SeleniumBase driver with the reviews already scrolled into the DOM.
+
+    Returns:
+        A list of dicts with the keys ``author``, ``rating``, ``text``,
+        ``date_text``, ``avatar_url``, ``profile_url`` (as produced by the
+        script) plus ``review_id``. Returns ``[]`` if anything fails; the
+        error is printed, never raised.
+
+    Notes:
+        ``review_id`` is derived from author + date text only, so two reviews
+        by the same author carrying the same date text share an ID.
+    """
+    # Local import keeps this function self-contained.
+    import hashlib
+
+    extract_script = """
+    const reviews = [];
+    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');
+
+    for (let i = 0; i < elements.length; i++) {
+        const elem = elements[i];
+        const review = {};
+
+        try {
+            // Author
+            const authorElem = elem.querySelector('div.d4r55');
+            review.author = authorElem ? authorElem.textContent.trim() : null;
+
+            // Rating - always set, so the key exists even without a star element
+            review.rating = null;
+            const ratingElem = elem.querySelector('span.kvMYJc');
+            if (ratingElem) {
+                const ariaLabel = ratingElem.getAttribute('aria-label');
+                if (ariaLabel) {
+                    const match = ariaLabel.match(/\\d+/);
+                    review.rating = match ? parseFloat(match[0]) : null;
+                }
+            }
+
+            // Text
+            const textElem = elem.querySelector('span.wiI7pd');
+            review.text = textElem ? textElem.textContent.trim() : null;
+
+            // Date
+            const dateElem = elem.querySelector('span.rsqaWe');
+            review.date_text = dateElem ? dateElem.textContent.trim() : null;
+
+            // Avatar
+            const avatarElem = elem.querySelector('img.NBa7we');
+            review.avatar_url = avatarElem ? avatarElem.src : null;
+
+            // Profile URL
+            // NOTE(review): the value stored here is the button's
+            // data-review-id attribute, not a URL - confirm this is intended.
+            const profileElem = elem.querySelector('button.WEBjve');
+            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;
+
+            if (review.author && review.date_text) {
+                reviews.push(review);
+            }
+        } catch (e) {
+            // Skip this review
+        }
+    }
+
+    return reviews;
+    """
+
+    try:
+        # Treat a missing result (None) as "no reviews" instead of failing
+        # while iterating it.
+        reviews_data = driver.execute_script(extract_script) or []
+
+        reviews = []
+        for review_data in reviews_data:
+            # The built-in hash() is salted per process for strings, which
+            # made IDs differ between runs. A SHA-256 digest is stable.
+            key = f"{review_data['author']}|{review_data['date_text']}"
+            digest = hashlib.sha256(key.encode('utf-8')).hexdigest()[:16]
+            review_data['review_id'] = f"review_{digest}"
+            reviews.append(review_data)
+
+        return reviews
+
+    except Exception as e:
+        # Best-effort: report the problem and hand back an empty result.
+        print(f"  Error in JavaScript extraction: {e}")
+        return []
+
+
+# JavaScript snippets and tuning values shared by the scraping helpers below.
+_COUNT_REVIEWS_JS = "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
+_SCROLL_PANE_JS = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
+_IDLE_PASSES_TO_STOP = 3  # consecutive scrolls that add no review elements
+_MAX_SCROLLS = 999999  # safety cap only; idle detection normally ends the loop
+_EXPECTED_REVIEWS = 244  # review count of the benchmark listing, used in the summary
+_BASELINE_SECONDS = 155  # NOTE(review): presumably an earlier scraper's runtime - confirm
+
+
+def _click_first(driver, css, pause):
+    """Click the first element matching ``css``; return True if one was clicked."""
+    try:
+        buttons = driver.find_elements(By.CSS_SELECTOR, css)
+        if buttons:
+            buttons[0].click()
+            time.sleep(pause)
+            return True
+    except Exception:
+        # Best-effort: a banner that cannot be dismissed must not abort the run.
+        # (Exception, not a bare except, so Ctrl-C still interrupts.)
+        pass
+    return False
+
+
+def _open_reviews_tab(driver):
+    """Click the first tab whose text or aria-label mentions reviews."""
+    review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
+    for selector in ['.LRkQ2', 'button[role="tab"]']:
+        try:
+            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
+                text = (tab.text or '').lower()
+                aria = (tab.get_attribute('aria-label') or '').lower()
+                if any(kw in text or kw in aria for kw in review_keywords):
+                    driver.execute_script("arguments[0].click();", tab)
+                    time.sleep(0.3)
+                    # Stop entirely: falling through to the next selector
+                    # could click a tab a second time.
+                    return True
+        except Exception:
+            continue
+    return False
+
+
+def _find_reviews_pane(driver):
+    """Return the scrollable reviews pane, or None if neither selector matches."""
+    wait = WebDriverWait(driver, 3)
+    try:
+        return wait.until(EC.presence_of_element_located(
+            (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
+    except TimeoutException:
+        pass
+    try:
+        return wait.until(EC.presence_of_element_located(
+            (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
+    except Exception:
+        return None
+
+
+def _wait_for_growth(driver, before, grace=0.3, timeout=1.0, step=0.05):
+    """Poll the review count until it exceeds ``before`` or the wait runs out.
+
+    Returns as soon as the count grows. If the count is still equal to
+    ``before`` once ``grace`` seconds have passed, gives up early; otherwise
+    polls until ``timeout``. Returns the last count read.
+    """
+    waited = 0.0
+    count = before
+    while waited < timeout:
+        time.sleep(step)
+        waited += step
+        count = driver.execute_script(_COUNT_REVIEWS_JS)
+        if count > before:
+            break
+        if waited >= grace and count == before:
+            break
+    return count
+
+
+def _scroll_until_idle(driver):
+    """Scroll the pane until no new reviews appear; return the final count.
+
+    A scroll that leaves the count unchanged is one idle pass, counted exactly
+    once. The loop stops after ``_IDLE_PASSES_TO_STOP`` consecutive idle
+    passes; any change in the count resets the streak.
+    """
+    idle_passes = 0
+    count = 0
+    for scroll_no in range(1, _MAX_SCROLLS + 1):
+        before = driver.execute_script(_COUNT_REVIEWS_JS)
+        driver.execute_script(_SCROLL_PANE_JS)
+        count = _wait_for_growth(driver, before)
+
+        # Progress logging every 10 scrolls
+        if scroll_no % 10 == 0:
+            print(f"  {count} review elements loaded...")
+
+        if count != before:
+            idle_passes = 0
+            continue
+
+        idle_passes += 1
+        if idle_passes >= _IDLE_PASSES_TO_STOP:
+            print(f"  Reached end at {count} reviews")
+            break
+    return count
+
+
+def dom_only_fast_scrape():
+    """Scrape all reviews of the configured Google Maps place via the DOM.
+
+    Steps: load the config, open the URL in an undetected-Chrome SeleniumBase
+    driver, dismiss consent/cookie prompts, open the reviews tab, scroll the
+    reviews pane until no new review elements appear, extract everything with
+    ``extract_all_reviews_js``, print a timing summary and write the result
+    to ``google_reviews_dom_only_fast.json``.
+
+    Returns:
+        The list of review dicts, or ``[]`` when the reviews pane cannot be
+        found.
+
+    Raises:
+        ValueError: If the config has no ``url``.
+
+    The browser is always closed, even on errors.
+    """
+    # `or {}` guards against an empty config file, which parses to None.
+    config = load_config() or {}
+    url = config.get('url')
+    headless = config.get('headless', False)
+
+    if not url:
+        # Fail with a clear message instead of a TypeError from url[:80].
+        raise ValueError("config.yaml must define a non-empty 'url'")
+
+    print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
+    print(f"URL: {url[:80]}...")
+
+    start_time = time.time()
+
+    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")
+
+    try:
+        # Navigate
+        driver.get(url)
+        time.sleep(1.5)
+
+        # GDPR consent page: try the Spanish label first, then the English one.
+        if 'consent.google.com' in driver.current_url:
+            if not _click_first(driver, 'button[aria-label*="Aceptar"]', 1.5):
+                _click_first(driver, 'button[aria-label*="Accept"]', 1.5)
+
+        # Dismiss cookie banner on Maps page
+        _click_first(
+            driver,
+            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]',
+            0.3,
+        )
+
+        _open_reviews_tab(driver)
+
+        # Wait for page stability
+        time.sleep(0.8)
+
+        pane = _find_reviews_pane(driver)
+        if pane is None:
+            print("ERROR: Could not find pane")
+            return []
+
+        # Wait for initial reviews to load
+        time.sleep(1.2)
+
+        # Expose the pane to the page so the scroll snippet can reach it.
+        driver.execute_script("window.scrollablePane = arguments[0];", pane)
+
+        # Trigger an initial scroll and check that reviews are loading.
+        driver.execute_script(_SCROLL_PANE_JS)
+        time.sleep(0.8)
+        initial_count = driver.execute_script(_COUNT_REVIEWS_JS)
+
+        if initial_count < 5:
+            # Reviews not loaded yet, wait more
+            print(f"  Waiting for reviews to load (found {initial_count})...")
+            time.sleep(1.5)
+            driver.execute_script(_SCROLL_PANE_JS)
+            time.sleep(0.8)
+            initial_count = driver.execute_script(_COUNT_REVIEWS_JS)
+
+        print(f"Scrolling to load all reviews (starting with {initial_count})...")
+
+        _scroll_until_idle(driver)
+
+        # Two short extra scrolls to flush anything still pending.
+        for _ in range(2):
+            driver.execute_script(_SCROLL_PANE_JS)
+            time.sleep(0.3)
+
+        # Measured from start_time, so this includes navigation and setup.
+        scroll_time = time.time() - start_time
+        print(f"  Scrolling complete in {scroll_time:.2f}s")
+
+        # Extract ALL reviews using JavaScript
+        print("Extracting reviews with JavaScript...")
+        extract_start = time.time()
+
+        all_reviews = extract_all_reviews_js(driver)
+
+        extract_time = time.time() - extract_start
+        print(f"  Extraction complete in {extract_time:.2f}s")
+
+        elapsed = time.time() - start_time
+        found = len(all_reviews)
+
+        print(f"\n{'='*50}")
+        print(f"✅ COMPLETED!")
+        print(f"Reviews: {found}/{_EXPECTED_REVIEWS} ({found/_EXPECTED_REVIEWS*100:.1f}%)")
+        print(f"Time: {elapsed:.2f}s")
+        print(f"  - Scrolling: {scroll_time:.2f}s")
+        print(f"  - Extraction: {extract_time:.2f}s")
+        print(f"Speed: {found/elapsed:.1f} reviews/sec")
+        print(f"Speedup: {_BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")
+        print(f"{'='*50}")
+
+        # NOTE(review): the shortfall warning only fires within 4 reviews of
+        # the target; larger gaps print nothing. Kept as-is - confirm intent.
+        if found >= _EXPECTED_REVIEWS:
+            print(f"🎯 Got ALL {_EXPECTED_REVIEWS} reviews!")
+        elif found >= _EXPECTED_REVIEWS - 4:
+            print(f"⚠️  Missing {_EXPECTED_REVIEWS-found} reviews")
+
+        print()
+
+        # Save
+        with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f:
+            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
+
+        print(f"💾 Saved to google_reviews_dom_only_fast.json")
+
+        if all_reviews:
+            # .get(): the rating key may be absent when no star element was found.
+            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0].get('rating')}★")
+
+        return all_reviews
+
+    finally:
+        try:
+            driver.quit()
+        except Exception:
+            # Closing is best-effort; the browser may already be gone.
+            pass
+
+
+if __name__ == '__main__':
+    # Script entry point: exit status 0 only when at least one review was
+    # scraped; an empty result, Ctrl-C or any error exits with 1.
+    import traceback
+
+    exit_code = 1
+    try:
+        reviews = dom_only_fast_scrape()
+    except KeyboardInterrupt:
+        print("\n\nInterrupted by user")
+    except Exception as e:
+        print(f"ERROR: {e}")
+        traceback.print_exc()
+    else:
+        if reviews:
+            exit_code = 0
+    sys.exit(exit_code)