#!/usr/bin/env python3
"""
API-Only 244 Scraper - Attempt to get ALL 244 reviews via API alone.

Strategy:
1. More patient scrolling (more scrolls, longer waits)
2. Collect responses more frequently
3. Extra end-of-list collection
4. Slower timing near the end to ensure API completes

Goal: Get all 244 reviews via API without DOM parsing
"""

import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Number of reviews this run is expected to collect.
EXPECTED_REVIEWS = 244
# The main scroll loop is only allowed to stop early once this many reviews
# have been collected.
MIN_REVIEWS_BEFORE_STOP = 240
# Divisor for the "Speedup" figure in the summary.
# NOTE(review): presumably the runtime in seconds of an earlier scraper - confirm.
BASELINE_SECONDS = 155
OUTPUT_PATH = 'google_reviews_api_244.json'

# (exclusive upper bound on collected reviews, seconds to sleep after a scroll).
# The pause grows as the collection nears the expected total.
_SCROLL_DELAYS = (
    (50, 0.30),
    (100, 0.35),
    (150, 0.40),
    (200, 0.50),
    (230, 0.60),
)
_FINAL_SCROLL_DELAY = 0.80

_COOKIE_BUTTON_SELECTOR = (
    'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]'
)
_REVIEW_KEYWORDS = ['reviews', 'review', 'reseñas', 'reseña']
_TAB_SELECTORS = ['.LRkQ2', 'button[role="tab"]']
_PRIMARY_PANE_SELECTOR = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
_FALLBACK_PANE_SELECTOR = 'div.m6QErb.WNBkOb.XiKgde'
_SCROLL_SCRIPT = (
    "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
)


def load_config():
    """Return the parsed contents of ``config.yaml`` from the working directory."""
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _scroll_delay(collected):
    """Return the post-scroll pause in seconds for ``collected`` reviews so far."""
    for limit, delay in _SCROLL_DELAYS:
        if collected < limit:
            return delay
    return _FINAL_SCROLL_DELAY


def _review_to_dict(review):
    """Copy the fields that get saved from a parsed review object into a dict."""
    return {
        'review_id': review.review_id,
        'author': review.author,
        'rating': review.rating,
        'text': review.text,
        'date_text': review.date_text,
        'avatar_url': review.avatar_url,
        'profile_url': review.profile_url,
    }


def _collect_new_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into ``api_reviews``, keyed by review id.

    Collection is best-effort: a failure while fetching or parsing responses
    is logged at debug level and whatever was merged before it is kept.

    Returns the number of reviews added by this call.
    """
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if responses:
            for review in interceptor.parse_reviews_from_responses(responses):
                # Skip reviews without an id and ones already collected.
                if review.review_id and review.review_id not in api_reviews:
                    api_reviews[review.review_id] = _review_to_dict(review)
                    added += 1
    except Exception as exc:  # not a bare except: Ctrl-C must still propagate
        log.debug("Collecting intercepted reviews failed: %s", exc)
    return added


def _dismiss_cookies(driver):
    """Click the first cookie-consent button, if one is present (best-effort)."""
    try:
        cookie_btns = driver.find_elements(By.CSS_SELECTOR, _COOKIE_BUTTON_SELECTOR)
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception as exc:
        log.debug("Cookie dismissal skipped: %s", exc)


def _open_reviews_tab(driver):
    """Click the tab whose text or aria-label mentions reviews (best-effort)."""
    for selector in _TAB_SELECTORS:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in _REVIEW_KEYWORDS):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    # Only leaves the inner loop: the next selector is still
                    # tried, so the tab may be clicked once per selector.
                    break
        except Exception as exc:
            log.debug("Reviews tab lookup via %r failed: %s", selector, exc)
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane element, or ``None`` if not found.

    The primary selector is tried first; the fallback selector is tried only
    when the primary one times out.
    """
    wait = WebDriverWait(driver, 3)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, _PRIMARY_PANE_SELECTOR)))
    except TimeoutException:
        pass
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, _FALLBACK_PANE_SELECTOR)))
    except Exception as exc:
        log.debug("Fallback pane lookup failed: %s", exc)
        return None


def _print_summary(all_reviews, elapsed):
    """Print the completion banner, throughput figures and coverage verdict."""
    total = len(all_reviews)
    print(f"\n{'='*50}")
    print(f"✅ COMPLETED!")
    print(f"Reviews: {total}/{EXPECTED_REVIEWS} ({total/EXPECTED_REVIEWS*100:.1f}%)")
    print(f"Time: {elapsed:.2f}s")
    # Both rate figures divide by the elapsed time, so guard them together.
    if elapsed > 0:
        print(f"Speed: {total/elapsed:.1f} reviews/sec")
        print(f"Speedup: {BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")
    print(f"{'='*50}")

    missing = EXPECTED_REVIEWS - total
    if total >= EXPECTED_REVIEWS:
        print(f"🎯 Got ALL {EXPECTED_REVIEWS} reviews via API!")
    elif total >= MIN_REVIEWS_BEFORE_STOP:
        print(f"⚠️ Missing {missing} reviews - may need DOM parsing")
    else:
        print(f"⚠️ Missing {missing} reviews")
    print()


def api_244_scrape():
    """Get all 244 reviews purely via API with aggressive collection.

    Reads ``url`` and ``headless`` from ``config.yaml``, opens the page in a
    browser, scrolls the reviews pane while collecting intercepted API
    responses, and writes the de-duplicated reviews to
    ``google_reviews_api_244.json``.

    Returns:
        A list of review dicts (``review_id``, ``author``, ``rating``,
        ``text``, ``date_text``, ``avatar_url``, ``profile_url``), or an
        empty list when the reviews pane cannot be found.

    Raises:
        ValueError: if the configuration does not provide a ``url``.
    """
    config = load_config()
    url = config.get('url')
    headless = config.get('headless', False)
    if not url:
        # Fail with a clear message before a browser is launched.
        raise ValueError("config.yaml does not define 'url'")

    print("API-244 SCRAPER - Getting ALL 244 reviews via API...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookies(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Longer wait to ensure interceptor is ready

        # Setup scroll: the pane is stored on window for the scroll script.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)

        # Trigger initial scroll
        driver.execute_script(_SCROLL_SCRIPT)
        time.sleep(1.0)  # Wait for first API response

        print("Scrolling with extended collection strategy...")

        # Extended scrolling - MORE scrolls, SLOWER timing
        max_scrolls = 50
        max_idle = 15
        idle_scrolls = 0
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0

        for i in range(max_scrolls):
            driver.execute_script(_SCROLL_SCRIPT)

            # Progressive timing - slower and slower
            time.sleep(_scroll_delay(len(api_reviews)))

            _collect_new_reviews(interceptor, api_reviews)

            # Check if we got new reviews
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
            # NOTE(review): the nesting of this progress print could not be
            # recovered from the collapsed source; it is reported on every
            # 10th scroll here - confirm.
            if (i + 1) % 10 == 0:
                print(f" {current_count} reviews...")
            last_count = current_count

            # Check scroll position
            try:
                current_scroll = driver.execute_script(
                    "return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception as exc:
                log.debug("Could not read scroll position: %s", exc)

            # Stop conditions - but only if we have at least 240 reviews
            if (idle_scrolls >= max_idle
                    and scroll_stuck_count >= 5
                    and current_count >= MIN_REVIEWS_BEFORE_STOP):
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # AGGRESSIVE final collection phase
        print(f" Aggressive final collection (currently have {len(api_reviews)})...")

        # Do 10 more scrolls with very long waits
        for _ in range(10):
            driver.execute_script(_SCROLL_SCRIPT)
            time.sleep(1.2)  # Very long wait

            new_count = _collect_new_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Ultra-final wait and collect
        time.sleep(2.0)
        _collect_new_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        _print_summary(all_reviews, elapsed)

        # Save
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to google_reviews_api_244.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always close the browser; a failure to quit must not mask the result.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)


if __name__ == '__main__':
    try:
        reviews = api_244_scrape()
        # Exit status 0 only when at least one review was collected.
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)