#!/usr/bin/env python3
"""
OPTIMIZED HYBRID Scraper - fast API interception with a minimal DOM fallback.

Strategy:
1. Ultra-fast API scrolling (no DOM parsing during the scroll loop!)
2. After scrolling, compare the number of collected reviews with the
   expected total (minimal overhead)
3. If needed, targeted DOM parse at the very end for missing reviews
4. Goal: ~22-25s for all 244 reviews

Key: Keep the scroll loop FAST, only parse the DOM if absolutely needed at
the very end.
"""

import hashlib
import json
import logging
import sys
import time
import traceback

import yaml
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Number of reviews the target place is expected to have when neither the
# caller nor config.yaml overrides it.
DEFAULT_EXPECTED_REVIEWS = 244
# Reference duration used for the "Speedup" line.
# NOTE(review): presumably the runtime in seconds of an earlier scraper - confirm.
BASELINE_SECONDS = 155
# Upper bound on scroll iterations in the API collection loop.
MAX_SCROLLS = 35

OUTPUT_FILE = 'google_reviews_optimized_hybrid.json'
REVIEW_KEYWORDS = ('reviews', 'review', 'reseñas', 'reseña')
# Candidate selectors for the scrollable reviews pane, tried in order.
PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.WNBkOb.XiKgde',
)


def load_config():
    """Return the parsed contents of ``config.yaml`` from the working directory."""
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _find_text(parent, css):
    """Return the text of the first descendant matching ``css``, or None."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css).text
    except Exception:
        # Missing or stale element: the field is simply unavailable.
        return None


def _find_attr(parent, css, attr):
    """Return attribute ``attr`` of the first descendant matching ``css``, or None."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css).get_attribute(attr)
    except Exception:
        return None


def _parse_rating(aria_label):
    """Return the leading number of a rating aria-label as a float, or None.

    The first whitespace-separated token is parsed; a comma decimal
    separator is accepted as well as a dot.
    """
    if not aria_label:
        return None
    parts = aria_label.split()
    if not parts:
        return None
    try:
        return float(parts[0].replace(',', '.'))
    except ValueError:
        return None


def _dom_review_id(author, date_text):
    """Build a synthetic ID for a DOM-parsed review from author and date.

    A SHA-256 digest is used instead of the builtin ``hash()`` because string
    hashes are salted per process, which made the saved IDs differ between
    runs for the very same review.
    """
    digest = hashlib.sha256(f"{author}{date_text}".encode('utf-8')).hexdigest()
    return f"dom_{digest[:16]}"


def _parse_dom_review(elem):
    """Extract one review dict from a review container element.

    Every field is always present (None when it could not be read);
    ``review_id`` is only added when an author was found.
    """
    review_data = {
        'author': _find_text(elem, 'div.d4r55'),
        'rating': _parse_rating(_find_attr(elem, 'span.kvMYJc', 'aria-label')),
        'text': _find_text(elem, 'span.wiI7pd'),
        'date_text': _find_text(elem, 'span.rsqaWe'),
        'avatar_url': _find_attr(elem, 'img.NBa7we', 'src'),
        # NOTE(review): this stores the button's data-review-id attribute under
        # 'profile_url'; kept as-is for output compatibility - confirm intent.
        'profile_url': _find_attr(elem, 'button.WEBjve', 'data-review-id'),
    }
    if review_data['author']:
        review_data['review_id'] = _dom_review_id(
            review_data['author'], review_data['date_text'])
    return review_data


def quick_dom_parse_top_reviews(driver, count=15):
    """Quick parse of just the top N reviews from the DOM.

    Args:
        driver: WebDriver-like object exposing ``find_elements``.
        count: Maximum number of review elements (from the top) to parse.

    Returns:
        A list of review dicts; elements without an author are skipped.
        Lookup or parsing failures yield fewer (possibly zero) results
        rather than raising.
    """
    dom_reviews = []
    try:
        # Only the first N elements: the ones most likely to be missing from the API.
        review_elements = driver.find_elements(
            By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')[:count]
    except Exception as exc:
        log.debug("DOM review lookup failed: %s", exc)
        return dom_reviews

    for elem in review_elements:
        try:
            review_data = _parse_dom_review(elem)
        except Exception as exc:
            log.debug("Skipping unparsable DOM review: %s", exc)
            continue
        if review_data.get('author'):
            dom_reviews.append(review_data)
    return dom_reviews


def _dismiss_cookie_banner(driver):
    """Click the first Accept/Aceptar button if one is present (best effort)."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except Exception as exc:
        log.debug("Cookie banner dismissal skipped: %s", exc)


def _click_reviews_tab(driver):
    """Click the reviews tab, matching by visible text or aria-label.

    Each selector is tried in turn and the first matching tab under each one
    is clicked; a match under the first selector does not stop the second
    selector from being tried.
    """
    for selector in ('.LRkQ2', 'button[role="tab"]'):
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in REVIEW_KEYWORDS):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    break
        except Exception as exc:
            log.debug("Reviews tab lookup failed for %r: %s", selector, exc)


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane element, or None if none appears.

    Each candidate selector gets up to 3 seconds to show up.
    """
    wait = WebDriverWait(driver, 3)
    for css in PANE_SELECTORS:
        try:
            return wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, css)))
        except TimeoutException:
            continue
    return None


def _collect_api_reviews(interceptor, api_reviews):
    """Merge newly intercepted reviews into ``api_reviews`` (keyed by review ID).

    Reviews already present are left untouched. Any failure while fetching or
    parsing responses is logged at DEBUG level and otherwise ignored so the
    scroll loop keeps running.
    """
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = {
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                }
    except Exception as exc:
        log.debug("API review collection failed: %s", exc)


def _dedupe_key(review):
    """Return the (author, first 20 chars of date_text) key used for de-duplication."""
    return (review.get('author', ''), (review.get('date_text', '') or '')[:20])


def _merge_dom_reviews(api_reviews, dom_reviews):
    """Add DOM reviews not already covered by ``api_reviews``; return how many.

    A DOM review is added when its (author, date) key matches no API review
    and its ID is not stored yet, so duplicates are not counted twice.
    """
    api_keys = {_dedupe_key(review) for review in api_reviews.values()}
    dom_added = 0
    for dom_review in dom_reviews:
        review_id = dom_review.get('review_id')
        if not review_id or review_id in api_reviews:
            continue
        if _dedupe_key(dom_review) in api_keys:
            continue
        api_reviews[review_id] = dom_review
        dom_added += 1
    return dom_added


def optimized_hybrid_scrape(expected_total=None):
    """Ultra-fast API scrolling + minimal targeted DOM parse.

    Reads ``url`` and ``headless`` from ``config.yaml``, scrolls the reviews
    pane while collecting intercepted API responses, falls back to a short DOM
    parse when fewer than ``expected_total`` reviews were collected, and saves
    the result to ``google_reviews_optimized_hybrid.json``.

    Args:
        expected_total: Number of reviews the place is expected to have. When
            None, the optional ``expected_reviews`` config key is used, and
            244 if that is absent too.

    Returns:
        The list of collected review dicts, or an empty list when the reviews
        pane could not be located.

    Raises:
        ValueError: If the configuration does not define a ``url``.
    """
    config = load_config() or {}
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)
    if expected_total is None:
        expected_total = config.get('expected_reviews', DEFAULT_EXPECTED_REVIEWS)
    expected_total = int(expected_total)

    print("OPTIMIZED HYBRID SCRAPER - Ultra-fast API + minimal DOM...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookie_banner(driver)
        _click_reviews_tab(driver)

        # Brief wait for reviews page (balance speed vs stability)
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Setup API interceptor immediately
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(0.3)  # Minimal wait for interceptor

        # Keep the pane on window so the scroll script needs no arguments.
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = (
            "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);")

        # Trigger initial scroll
        driver.execute_script(scroll_script)
        time.sleep(0.3)

        print("Ultra-fast API scrolling...")

        # FAST API-only scrolling (NO DOM parsing overhead!)
        for i in range(MAX_SCROLLS):
            driver.execute_script(scroll_script)
            time.sleep(0.27)
            _collect_api_reviews(interceptor, api_reviews)
            if (i + 1) % 10 == 0:
                print(f" {len(api_reviews)} reviews...")

        # Final API collection picks up responses that arrived after the last scroll.
        _collect_api_reviews(interceptor, api_reviews)

        api_time = time.time() - start_time
        print(f" ✅ API complete: {len(api_reviews)} reviews in {api_time:.2f}s")

        # Targeted DOM parse ONLY if we're missing reviews
        missing = expected_total - len(api_reviews)
        if missing > 0:
            print(f"\nQuick DOM parse for {missing} missing reviews...")

            # Scroll to top: the top reviews are the ones most likely to be missing.
            driver.execute_script("window.scrollablePane.scrollTo(0, 0);", pane)
            time.sleep(0.5)

            dom_reviews = quick_dom_parse_top_reviews(
                driver, count=min(missing + 5, 20))
            dom_added = _merge_dom_reviews(api_reviews, dom_reviews)

            dom_time = time.time() - start_time - api_time
            print(f" ✅ DOM complete: +{dom_added} reviews in {dom_time:.2f}s")

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())
        # Guard against a zero/negative expected total from configuration.
        coverage = len(all_reviews) / expected_total * 100 if expected_total > 0 else 0.0

        print(f"\n{'='*50}")
        print(f"✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/{expected_total} ({coverage:.1f}%)")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if len(all_reviews) >= expected_total:
            print(f"🎯 Got ALL {expected_total} reviews!")
        else:
            print(f"⚠️ Missing {expected_total-len(all_reviews)} reviews")

        print()

        # Save
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print(f"💾 Saved to google_reviews_optimized_hybrid.json")

        if all_reviews:
            sample = all_reviews[0]
            print(f"\nSample: {sample.get('author')} - {sample.get('rating')}★")

        return all_reviews

    finally:
        # Always release the browser, even on errors or Ctrl-C.
        try:
            driver.quit()
        except Exception as exc:
            log.debug("driver.quit() failed: %s", exc)


if __name__ == '__main__':
    try:
        reviews = optimized_hybrid_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        traceback.print_exc()
        sys.exit(1)