# whyrating-engine-legacy/start_api_244.py

#!/usr/bin/env python3
"""
API-Only 244 Scraper - Attempt to get ALL 244 reviews via API alone.

Strategy:
1. More patient scrolling (more scrolls, longer waits)
2. Collect responses more frequently
3. Extra end-of-list collection
4. Slower timing near the end to ensure API completes

Goal: Get all 244 reviews via API without DOM parsing
"""
import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor

# Configure the root logger at WARNING with a compact "[LEVEL] message" format.
# Loggers that do not set their own level inherit this WARNING threshold.
logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Lower only this module's logger to INFO so its own INFO records are emitted
# even though the root threshold stays at WARNING.
log.setLevel(logging.INFO)


def load_config(path: str = 'config.yaml') -> dict:
    """Load the scraper settings from a YAML file.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml``,
            resolved against the current working directory.

    Returns:
        The parsed top-level YAML value (expected to be a mapping), or an
        empty dict when the file contains no document at all.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load() yields None for an empty file; hand callers an empty
    # mapping so that ``config.get(...)`` keeps working.
    return {} if config is None else config


# Same logger object as the module-level ``log`` (both are keyed on __name__).
_LOG = logging.getLogger(__name__)

# (exclusive upper bound on collected reviews, pause in seconds after a scroll).
# The pause grows as the collection approaches the expected total of 244.
_SCROLL_DELAY_STEPS = (
    (50, 0.30),
    (100, 0.35),
    (150, 0.40),
    (200, 0.50),
    (230, 0.60),
)
_FINAL_SCROLL_DELAY = 0.80

_PANE_SELECTOR = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
_PANE_FALLBACK_SELECTOR = 'div.m6QErb.WNBkOb.XiKgde'


def _scroll_delay(collected):
    """Return the pause (seconds) to apply after a scroll, given reviews collected so far."""
    for upper_bound, delay in _SCROLL_DELAY_STEPS:
        if collected < upper_bound:
            return delay
    return _FINAL_SCROLL_DELAY


def _dismiss_cookie_banner(driver):
    """Click the first cookie-consent button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception:
        # Best effort only; `except Exception` (not a bare except) lets
        # Ctrl+C / SystemExit propagate.
        _LOG.debug("Cookie banner dismissal failed", exc_info=True)


def _open_reviews_tab(driver):
    """Click the tab whose text or aria-label mentions reviews (English/Spanish)."""
    review_keywords = ('reviews', 'review', 'reseñas', 'reseña')
    # NOTE: every selector is tried, even after a successful click; the
    # `break` below only stops scanning the tabs of the current selector.
    for selector in ('.LRkQ2', 'button[role="tab"]'):
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in review_keywords):
                    # JS click avoids failures when the element is overlapped.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    break
        except Exception:
            _LOG.debug("Reviews tab lookup failed for selector %r", selector,
                       exc_info=True)


def _find_reviews_pane(driver, timeout=3):
    """Return the scrollable reviews pane element, or None if neither selector matches."""
    wait = WebDriverWait(driver, timeout)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, _PANE_SELECTOR)))
    except TimeoutException:
        pass  # fall through to the alternative layout selector

    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, _PANE_FALLBACK_SELECTOR)))
    except Exception:
        _LOG.debug("Fallback pane lookup failed", exc_info=True)
        return None


def _collect_new_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into ``api_reviews``; return how many were added.

    Reviews are keyed by ``review_id``; entries without an id or already
    present are skipped. Any error while fetching or parsing is logged at
    DEBUG level and whatever was merged before the error is kept.
    """
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = {
                        'review_id': review.review_id,
                        'author': review.author,
                        'rating': review.rating,
                        'text': review.text,
                        'date_text': review.date_text,
                        'avatar_url': review.avatar_url,
                        'profile_url': review.profile_url,
                    }
                    added += 1
    except Exception:
        _LOG.debug("Collecting intercepted reviews failed", exc_info=True)
    return added


def _print_summary(all_reviews, elapsed):
    """Print the completion banner, throughput figures and the missing-review verdict."""
    total = len(all_reviews)

    print(f"\n{'='*50}")
    print("✅ COMPLETED!")
    print(f"Reviews: {total}/244 ({total/244*100:.1f}%)")
    print(f"Time: {elapsed:.2f}s")

    # Both rates divide by the elapsed time, so both sit behind the same guard.
    if elapsed > 0:
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        # NOTE(review): 155 is presumably the runtime in seconds of a slower
        # baseline scraper -- confirm.
        print(f"Speedup: {155/elapsed:.1f}x faster! 🚀")

    print(f"{'='*50}")

    if total >= 244:
        print("🎯 Got ALL 244 reviews via API!")
    elif total >= 240:
        print(f"⚠️  Missing {244-total} reviews - may need DOM parsing")
    else:
        print(f"⚠️  Missing {244-total} reviews")

    print()


def api_244_scrape():
    """Collect reviews for the configured URL from intercepted API responses.

    Opens the URL from ``load_config()`` in a SeleniumBase driver, dismisses
    the cookie banner, opens the reviews tab, then scrolls the reviews pane
    repeatedly while draining the responses captured by
    ``GoogleMapsAPIInterceptor``. Scrolling runs for at most 50 iterations
    (ending early only once at least 240 reviews are held and both the review
    count and the scroll position have stopped changing), followed by 10
    slower scrolls and one last delayed collection.

    Side effects:
        Prints progress to stdout and writes the collected reviews to
        ``google_reviews_api_244.json`` in the current working directory.

    Returns:
        A list of review dicts (``review_id``, ``author``, ``rating``,
        ``text``, ``date_text``, ``avatar_url``, ``profile_url``), or an
        empty list when the reviews pane cannot be located.

    Raises:
        ValueError: If the configuration does not provide a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail before launching a browser, with a message that names the cause.
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("API-244 SCRAPER - Getting ALL 244 reviews via API...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}  # review_id -> review dict; dict keys de-duplicate

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookie_banner(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Longer wait to ensure interceptor is ready

        # Keep a page-side reference to the pane so each scroll is a short script.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("Scrolling with extended collection strategy...")

        max_scrolls = 50   # hard cap on main-phase scrolls
        max_idle = 15      # scrolls without new reviews before stopping is allowed
        idle_scrolls = 0
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0

        for i in range(max_scrolls):
            driver.execute_script(scroll_script)

            # Progressive timing: the pause grows with the number collected.
            time.sleep(_scroll_delay(len(api_reviews)))

            _collect_new_reviews(interceptor, api_reviews)

            # Track how long the review count has been flat.
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
                if (i + 1) % 10 == 0:
                    print(f"  {current_count} reviews...")
            last_count = current_count

            # Track how long the pane's scroll offset has been flat.
            try:
                current_scroll = driver.execute_script(
                    "return arguments[0].scrollTop;", pane)
            except Exception:
                _LOG.debug("Reading scroll position failed", exc_info=True)
            else:
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll

            # Stop conditions - but only if we have at least 240 reviews
            if idle_scrolls >= max_idle and scroll_stuck_count >= 5 and current_count >= 240:
                print(f"  Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # AGGRESSIVE final collection phase
        print(f"  Aggressive final collection (currently have {len(api_reviews)})...")

        # Do 10 more scrolls with very long waits
        for _ in range(10):
            driver.execute_script(scroll_script)
            time.sleep(1.2)  # Very long wait

            new_count = _collect_new_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f"    +{new_count} more reviews (total: {len(api_reviews)})")

        # Ultra-final wait and collect
        time.sleep(2.0)
        _collect_new_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        _print_summary(all_reviews, elapsed)

        # Save
        with open('google_reviews_api_244.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_api_244.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always release the browser; a failure to quit must not mask the
        # result or the original exception.
        try:
            driver.quit()
        except Exception:
            _LOG.debug("driver.quit() failed", exc_info=True)


def _run_cli():
    """Run the scraper once and return the process exit status.

    Returns 0 when at least one review was collected, otherwise 1
    (no reviews, user interruption, or any unexpected error).
    """
    try:
        reviews = api_244_scrape()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        return 1
    except Exception as e:
        print(f"ERROR: {e}")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
        return 1

    if reviews:
        return 0
    return 1


if __name__ == '__main__':
    sys.exit(_run_cli())