# whyrating-engine-legacy/start_dom_only_fast.py

#!/usr/bin/env python3
"""
DOM-ONLY FAST Scraper - Uses JavaScript for ultra-fast DOM extraction.

Strategy:
1. Scroll to load all reviews
2. Extract ALL data using JavaScript in one shot (no slow Selenium queries)
3. Should be faster and simpler than API + DOM hybrid

Target: ~20-25 seconds for all 244 reviews with simpler code
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Root logger: only warnings and above, each line prefixed with its level name.
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.WARNING)

# This module's own logger is set to INFO, independent of the root threshold.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)


def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    The file is decoded as UTF-8 explicitly so that non-ASCII values do not
    depend on the platform's default encoding.

    Returns:
        The parsed YAML document. An empty file (which ``yaml.safe_load``
        parses as ``None``) yields an empty dict so callers can always use
        ``.get``.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def extract_all_reviews_js(driver):
    """Extract every review currently in the DOM with one JavaScript call.

    The script walks all ``div.jftiEf.fontBodyMedium`` elements and collects
    author, rating, text, date text, avatar URL and the ``data-review-id``
    attribute of the ``button.WEBjve`` element. Cards lacking an author or a
    date text are dropped by the script itself.

    Args:
        driver: Object exposing ``execute_script(script)`` (the browser driver).

    Returns:
        A list of review dicts, each extended with a ``review_id`` key. An
        empty list is returned if the script fails or any other error occurs;
        this function does not raise.
    """
    # Local import keeps this block self-contained.
    import hashlib

    extract_script = """
    const reviews = [];
    const elements = document.querySelectorAll('div.jftiEf.fontBodyMedium');

    for (let i = 0; i < elements.length; i++) {
        const elem = elements[i];
        const review = {};

        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;

            // Rating (always present in the result, null when it cannot be read)
            review.rating = null;
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }

            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;

            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;

            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;

            // Profile URL
            // NOTE: the value stored here is the button's data-review-id attribute.
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }

    return reviews;
    """

    try:
        reviews_data = driver.execute_script(extract_script)

        reviews = []
        for review_data in reviews_data or []:
            # Guarantee the key exists even if the page script omitted it.
            review_data.setdefault('rating', None)

            # Built-in hash() on str is salted per process, so it would give a
            # different ID on every run. A SHA-256 prefix of author + date text
            # is reproducible across runs and machines.
            identity = f"{review_data['author']}\x1f{review_data['date_text']}"
            digest = hashlib.sha256(identity.encode('utf-8')).hexdigest()[:16]
            review_data['review_id'] = f"review_{digest}"
            reviews.append(review_data)

        return reviews

    except Exception as e:
        print(f"  Error in JavaScript extraction: {e}")
        return []


# JavaScript that counts the review cards currently present in the DOM.
_REVIEW_COUNT_SCRIPT = (
    "return document.querySelectorAll('div.jftiEf.fontBodyMedium').length;"
)


def _dismiss_consent_dialogs(driver):
    """Best-effort: accept the Google consent page and the Maps cookie banner."""
    # Handle GDPR consent page (the browser may have been redirected to it).
    if 'consent.google.com' in driver.current_url:
        try:
            # Click "Accept all" / "Aceptar todo" (Spanish label tried first).
            consent_btns = driver.find_elements(
                By.CSS_SELECTOR, 'button[aria-label*="Aceptar"]')
            if not consent_btns:
                consent_btns = driver.find_elements(
                    By.CSS_SELECTOR, 'button[aria-label*="Accept"]')
            if consent_btns:
                consent_btns[0].click()
                time.sleep(1.5)
        except Exception as exc:
            # Deliberately non-fatal; Ctrl+C is no longer swallowed here.
            log.debug("Consent page handling failed: %s", exc)

    # Dismiss cookie banner on the Maps page itself.
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.3)
    except Exception as exc:
        log.debug("Cookie banner handling failed: %s", exc)


def _open_reviews_tab(driver):
    """Click the first element per selector whose text or aria-label mentions reviews."""
    review_keywords = ('reviews', 'review', 'reseñas', 'reseña')
    # Each selector is tried independently: the inner break stops only the
    # scan for that selector, so up to one click per selector is issued.
    for selector in ('.LRkQ2', 'button[role="tab"]'):
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    # JS click instead of WebElement.click().
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.3)
                    break
        except Exception as exc:
            log.debug("Reviews tab lookup failed for %r: %s", selector, exc)
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane element, or None if it cannot be found."""
    wait = WebDriverWait(driver, 3)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
    except TimeoutException:
        pass

    # Fallback selector; any failure here means there is no usable pane.
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
    except Exception as exc:
        log.debug("Fallback pane lookup failed: %s", exc)
        return None


def _scroll_to_load_reviews(driver, pane):
    """Scroll the pane repeatedly until no further review cards appear."""
    # Stash the pane on window so every scroll is one short script call.
    driver.execute_script("window.scrollablePane = arguments[0];", pane)
    scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

    # Trigger an initial scroll and verify that reviews are loading.
    driver.execute_script(scroll_script)
    time.sleep(0.8)

    initial_count = driver.execute_script(_REVIEW_COUNT_SCRIPT)
    if initial_count < 5:
        # Reviews not loaded yet: wait longer and scroll once more.
        print(f"  Waiting for reviews to load (found {initial_count})...")
        time.sleep(1.5)
        driver.execute_script(scroll_script)
        time.sleep(0.8)
        initial_count = driver.execute_script(_REVIEW_COUNT_SCRIPT)

    print(f"Scrolling to load all reviews (starting with {initial_count})...")

    # Effectively unbounded; the loop ends through idle detection below.
    max_scrolls = 999999
    max_wait = 1.0    # longest time to poll after one scroll (seconds)
    wait_step = 0.05  # polling interval (seconds)
    idle_count = 0
    last_scroll_pos = 0

    for i in range(max_scrolls):
        prev_count = driver.execute_script(_REVIEW_COUNT_SCRIPT)
        driver.execute_script(scroll_script)

        # Poll until new cards show up instead of sleeping a fixed delay.
        waited = 0
        while waited < max_wait:
            time.sleep(wait_step)
            waited += wait_step

            new_count = driver.execute_script(_REVIEW_COUNT_SCRIPT)

            # New reviews arrived: move on to the next scroll immediately.
            if new_count > prev_count:
                break

            # Nothing new after 0.3s: check whether the pane moved at all.
            if waited >= 0.3 and new_count == prev_count:
                scroll_pos = driver.execute_script("return arguments[0].scrollTop;", pane)
                if scroll_pos == last_scroll_pos:
                    # NOTE: idle_count is also bumped after this polling loop,
                    # so one idle round can count twice. Kept unchanged because
                    # the timings were tuned around this behaviour.
                    idle_count += 1
                    if idle_count >= 3:
                        print(f"  Reached end at {new_count} reviews")
                        break
                last_scroll_pos = scroll_pos
                break

        current_count = new_count

        # Progress logging every 10 scrolls.
        if (i + 1) % 10 == 0:
            print(f"  {current_count} review elements loaded...")

        # Idle detection: stop once the counter reaches 3; any change in the
        # card count resets it.
        if current_count == prev_count:
            idle_count += 1
            if idle_count >= 3:
                break
        else:
            idle_count = 0

    # Two short final scrolls to flush anything still pending.
    for _ in range(2):
        driver.execute_script(scroll_script)
        time.sleep(0.3)


def dom_only_fast_scrape(expected_total=244, baseline_seconds=155):
    """Scrape all reviews of the configured page via scrolling plus one JS extraction.

    Reads ``url`` and ``headless`` from the config, opens the page, dismisses
    consent dialogs, opens the reviews tab, scrolls the reviews pane until it
    stops growing, extracts the reviews and writes them to
    ``google_reviews_dom_only_fast.json``. The browser is always closed.

    Args:
        expected_total: Number of reviews the target is expected to have; used
            only for the summary report. A falsy value skips the percentage
            and completeness lines.
        baseline_seconds: Reference runtime used for the "Speedup" line of the
            report. A falsy value skips that line.

    Returns:
        The list of extracted review dicts, or an empty list when the reviews
        pane cannot be located.

    Raises:
        ValueError: If the configuration does not provide a ``url``.
    """
    # An empty config file parses to None; treat it as "no settings".
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail with a clear message instead of a TypeError on url[:80].
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("DOM-ONLY FAST SCRAPER - JavaScript extraction...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_consent_dialogs(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(0.8)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Give the initial batch of reviews time to render before scrolling.
        time.sleep(1.2)

        _scroll_to_load_reviews(driver, pane)

        # Measured from start_time, so this includes navigation and setup.
        scroll_time = time.time() - start_time
        print(f"  Scrolling complete in {scroll_time:.2f}s")

        # Extract ALL reviews using JavaScript (single round trip).
        print("Extracting reviews with JavaScript...")
        extract_start = time.time()

        all_reviews = extract_all_reviews_js(driver)

        extract_time = time.time() - extract_start
        print(f"  Extraction complete in {extract_time:.2f}s")

        elapsed = time.time() - start_time
        found = len(all_reviews)

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        if expected_total:
            print(f"Reviews: {found}/{expected_total} ({found/expected_total*100:.1f}%)")
        else:
            print(f"Reviews: {found}")
        print(f"Time: {elapsed:.2f}s")
        print(f"  - Scrolling: {scroll_time:.2f}s")
        print(f"  - Extraction: {extract_time:.2f}s")
        print(f"Speed: {found/elapsed:.1f} reviews/sec")
        if baseline_seconds:
            print(f"Speedup: {baseline_seconds/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if expected_total:
            if found >= expected_total:
                print(f"🎯 Got ALL {expected_total} reviews!")
            elif found >= expected_total - 4:
                print(f"⚠️  Missing {expected_total-found} reviews")

        print()

        # Save
        with open('google_reviews_dom_only_fast.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_dom_only_fast.json")

        if all_reviews:
            # .get(): the rating key may be absent when the page had no rating element.
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0].get('rating')}★")

        return all_reviews

    finally:
        # Always close the browser; a failing quit() must not mask the result.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)


if __name__ == '__main__':
    # Script entry point: exit status 0 only when at least one review was
    # scraped; interruption and any other failure both exit with status 1.
    exit_code = 1
    try:
        reviews = dom_only_fast_scrape()
        if reviews:
            exit_code = 0
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as e:
        import traceback

        print(f"ERROR: {e}")
        traceback.print_exc()
    sys.exit(exit_code)