#!/usr/bin/env python3
"""
Complete Scraper - Gets ALL reviews while staying fast.

Strategy:
1. Scroll until no new reviews arrive for MAX_IDLE_SCROLLS consecutive
   scrolls AND the scroll position has stopped moving
2. Check scroll position to detect end
3. Do extra scrolls at the end to catch stragglers
4. Adaptive timing - faster at start, slower at end

Target: Get all 244 reviews in ~22-25 seconds
"""

import sys
import yaml
import logging
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Review count the summary output compares against.
TARGET_REVIEW_COUNT = 244
# At or above this count (but below the target) the summary reports the gap.
NEAR_TARGET_COUNT = 240
# Reference duration (seconds) used for the "Speedup" line in the summary.
BASELINE_SECONDS = 155

# Hard cap on scroll iterations in the main loop.
MAX_SCROLLS = 60
# Consecutive scrolls without a new review required before stopping.
MAX_IDLE_SCROLLS = 12
# Consecutive unchanged scrollTop readings required before stopping.
MAX_STUCK_SCROLLS = 3
# Extra slow scrolls performed after the main loop.
FINAL_SWEEP_SCROLLS = 5

REVIEW_KEYWORDS = ['reviews', 'review', 'reseñas', 'reseña']


def load_config():
    """Load and return the parsed contents of ``config.yaml``.

    Returns whatever ``yaml.safe_load`` produces, which is ``None`` for an
    empty file. The file is read as UTF-8 so that non-ASCII values do not
    depend on the platform's default encoding.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _scroll_delay(collected):
    """Return the pause (seconds) after a scroll, given reviews collected so far."""
    if collected < 100:
        return 0.27  # Fast at beginning
    if collected < 200:
        return 0.30  # Medium in middle
    if collected < 235:
        return 0.40  # Slower near end
    return 0.50  # Very slow at the very end to catch stragglers


def _collect_new_reviews(interceptor, api_reviews):
    """Drain intercepted responses into ``api_reviews``; return how many were new.

    ``api_reviews`` maps review_id -> plain dict and is mutated in place.
    Reviews without an id, or whose id is already present, are skipped.
    Any failure while reading or parsing is logged at debug level and
    ignored (best effort); reviews stored before the failure are kept.
    """
    added = 0
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return 0
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = {
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                }
                added += 1
    except Exception:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt.
        log.debug("Collecting intercepted reviews failed", exc_info=True)
    return added


def _dismiss_cookies(driver):
    """Click the first cookie-consent button found, if any (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception:
        log.debug("Cookie dismissal failed", exc_info=True)


def _open_reviews_tab(driver):
    """Click the reviews tab: for each selector, the first element matching a keyword."""
    for selector in ['.LRkQ2', 'button[role="tab"]']:
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, selector)
            for tab in tabs:
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in REVIEW_KEYWORDS):
                    # JS click avoids "element not interactable" overlays.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    break
        except Exception:
            log.debug("Reviews tab lookup failed for %s", selector, exc_info=True)
            continue


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane element, or ``None`` if not found.

    Tries the primary selector first and falls back to a second selector
    when the first one times out. Each wait is up to 3 seconds.
    """
    wait = WebDriverWait(driver, 3)
    try:
        return wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde')))
    except TimeoutException:
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.m6QErb.WNBkOb.XiKgde')))
        except Exception:
            log.debug("Fallback pane lookup failed", exc_info=True)
            return None


def complete_scrape():
    """Get ALL reviews with intelligent scrolling.

    Reads ``url`` and ``headless`` from ``config.yaml``, opens the page in a
    browser, scrolls the reviews pane while collecting reviews through
    ``GoogleMapsAPIInterceptor``, prints a summary and writes the result to
    ``google_reviews_complete.json``.

    Returns:
        A list of review dicts (keys: review_id, author, rating, text,
        date_text, avatar_url, profile_url). Empty when the reviews pane
        cannot be located.

    Raises:
        ValueError: if the configuration has no ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        # Fail before launching a browser, with a message that names the cause.
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("COMPLETE SCRAPER - Getting ALL reviews...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookies(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for initial reviews to load
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("Scrolling with intelligent stopping...")

        idle_scrolls = 0  # Count scrolls with no new reviews
        last_count = 0
        last_scroll_pos = 0
        scroll_stuck_count = 0

        for i in range(MAX_SCROLLS):
            driver.execute_script(scroll_script)

            # Adaptive timing - faster at start, slower near end
            time.sleep(_scroll_delay(len(api_reviews)))

            _collect_new_reviews(interceptor, api_reviews)

            # Check if we got new reviews
            current_count = len(api_reviews)
            if current_count == last_count:
                idle_scrolls += 1
            else:
                idle_scrolls = 0

            if (i + 1) % 10 == 0:
                print(f" {current_count} reviews...")

            last_count = current_count

            # Check scroll position to detect if stuck at bottom
            try:
                current_scroll = driver.execute_script(
                    "return arguments[0].scrollTop;", pane)
                if current_scroll == last_scroll_pos:
                    scroll_stuck_count += 1
                else:
                    scroll_stuck_count = 0
                last_scroll_pos = current_scroll
            except Exception:
                log.debug("Reading scroll position failed", exc_info=True)

            # Stop only when BOTH signals agree that the end was reached.
            if idle_scrolls >= MAX_IDLE_SCROLLS and scroll_stuck_count >= MAX_STUCK_SCROLLS:
                print(f" Reached end (no new reviews for {idle_scrolls} scrolls)")
                break

        # Extra thorough collection at the end
        print(f" Final collection sweep (currently have {len(api_reviews)})...")

        # Do a few more scrolls with longer waits
        for _ in range(FINAL_SWEEP_SCROLLS):
            driver.execute_script(scroll_script)
            time.sleep(0.8)  # Longer wait to ensure API completes

            new_count = _collect_new_reviews(interceptor, api_reviews)
            if new_count > 0:
                print(f" +{new_count} more reviews (total: {len(api_reviews)})")

        # Final wait and collect
        time.sleep(1.0)
        _collect_new_reviews(interceptor, api_reviews)

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print("\n✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)} (target: {TARGET_REVIEW_COUNT})")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")

        if len(all_reviews) >= TARGET_REVIEW_COUNT:
            print("🎯 Got ALL reviews!")
        elif len(all_reviews) >= NEAR_TARGET_COUNT:
            print(f"⚠️ Missing {TARGET_REVIEW_COUNT-len(all_reviews)} reviews")

        print()

        # Save
        with open('google_reviews_complete.json', 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_complete.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception:
            log.debug("driver.quit() failed", exc_info=True)


if __name__ == '__main__':
    try:
        reviews = complete_scrape()
        # Exit status 1 when nothing was scraped so callers can detect failure.
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)