#!/usr/bin/env python3
"""
PARALLEL HYBRID Scraper - Collects API + DOM simultaneously while scrolling.

Strategy:
1. During scrolling, collect BOTH API responses AND DOM elements in parallel
2. Deduplicate at the end
3. Should get all 244 reviews in ~20-25s (vs 34s sequential)

Optimization: No separate DOM parsing phase - everything happens during scroll!
"""

import hashlib
import json
import logging
import sys
import time

import yaml
from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)

from modules.api_interceptor import GoogleMapsAPIInterceptor

logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Review count used in the summary output and the baseline run time (seconds)
# used for the "Speedup" line. Both values come from the original script.
EXPECTED_REVIEWS = 244
BASELINE_SECONDS = 155

# Scroll loop tuning.
MAX_SCROLLS = 35
DOM_PARSE_START = 25        # only start DOM parsing after this many scrolls
DOM_PARSE_MIN_API = 220     # ...and once this many API reviews were collected
DOM_ELEMENT_LIMIT = 250     # inspect at most this many review cards per sweep

REVIEW_CARD_SELECTOR = 'div.jftiEf.fontBodyMedium'
REVIEW_KEYWORDS = ('reviews', 'review', 'reseñas', 'reseña')
REVIEW_TAB_SELECTORS = ('.LRkQ2', 'button[role="tab"]')
PANE_SELECTORS = (
    'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
    'div.m6QErb.WNBkOb.XiKgde',
)
OUTPUT_FILE = 'google_reviews_parallel_hybrid.json'


def load_config():
    """Load ``config.yaml`` from the current working directory.

    Returns:
        The parsed YAML document, or an empty dict when the file is empty
        (``yaml.safe_load`` returns ``None`` in that case).
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


def _find_text(elem, selector):
    """Return the text of the first descendant matching *selector*, or None."""
    try:
        return elem.find_element(By.CSS_SELECTOR, selector).text
    except WebDriverException:
        # Covers NoSuchElementException and StaleElementReferenceException.
        return None


def _find_attribute(elem, selector, attribute):
    """Return *attribute* of the first descendant matching *selector*, or None."""
    try:
        return elem.find_element(By.CSS_SELECTOR, selector).get_attribute(attribute)
    except WebDriverException:
        return None


def _find_rating(elem):
    """Return the leading number of the rating ``aria-label`` as a float, or None."""
    label = _find_attribute(elem, 'span.kvMYJc', 'aria-label')
    if not label:
        return None
    parts = label.split()
    if not parts:
        return None
    try:
        return float(parts[0])
    except ValueError:
        return None


def parse_dom_review_element(elem):
    """Parse a single review element from DOM.

    Args:
        elem: Selenium element for one review card.

    Returns:
        A dict with the keys ``author``, ``rating``, ``text``, ``date_text``,
        ``avatar_url``, ``profile_url`` and ``review_id``; fields that cannot
        be read are ``None``. Returns ``None`` when the author is missing or
        the element cannot be parsed at all. Never raises.
    """
    try:
        review_data = {
            'author': _find_text(elem, 'div.d4r55'),
            # Always present (possibly None) so consumers can index it safely.
            'rating': _find_rating(elem),
            'text': _find_text(elem, 'span.wiI7pd'),
            'date_text': _find_text(elem, 'span.rsqaWe'),
            'avatar_url': _find_attribute(elem, 'img.NBa7we', 'src'),
            'profile_url': _find_attribute(elem, 'button.WEBjve', 'data-review-id'),
        }

        if not review_data['author']:
            return None

        # Generate ID from author + date + rating. A digest is used instead of
        # the builtin hash(), which is salted per process for strings and would
        # therefore produce a different ID for the same review on every run.
        identity = '|'.join(
            str(review_data[field]) for field in ('author', 'date_text', 'rating')
        )
        digest = hashlib.sha256(identity.encode('utf-8')).hexdigest()[:16]
        review_data['review_id'] = f"dom_{digest}"
        return review_data
    except Exception as exc:  # best-effort parse: report nothing on failure
        log.debug("Could not parse review element: %s", exc)
        return None


def _dismiss_cookie_banner(driver):
    """Click the first cookie "Accept"/"Aceptar" button if one is present."""
    try:
        cookie_btns = driver.find_elements(
            By.CSS_SELECTOR,
            'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(0.4)
    except WebDriverException as exc:
        log.debug("Cookie banner not dismissed: %s", exc)


def _open_reviews_tab(driver):
    """Click the first tab whose text or aria-label mentions reviews.

    Returns True once a tab was clicked (no further selectors are tried, so
    the tab is not clicked a second time), False when none was found.
    """
    for selector in REVIEW_TAB_SELECTORS:
        try:
            for tab in driver.find_elements(By.CSS_SELECTOR, selector):
                text = (tab.text or '').lower()
                aria = (tab.get_attribute('aria-label') or '').lower()
                if any(kw in text or kw in aria for kw in REVIEW_KEYWORDS):
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(0.4)
                    return True
        except WebDriverException as exc:
            log.debug("Tab lookup with %r failed: %s", selector, exc)
    return False


def _find_reviews_pane(driver):
    """Return the scrollable reviews pane, or None if no known selector matches.

    Each selector is waited on for up to 3 seconds.
    """
    wait = WebDriverWait(driver, 3)
    for selector in PANE_SELECTORS:
        try:
            return wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        except TimeoutException:
            continue
    return None


def _collect_api_reviews(interceptor, api_reviews):
    """Add newly intercepted API reviews to *api_reviews* (keyed by review_id).

    Failures are logged at debug level and otherwise ignored so a single bad
    response does not abort the scroll loop.
    """
    try:
        responses = interceptor.get_intercepted_responses()
        if not responses:
            return
        for review in interceptor.parse_reviews_from_responses(responses):
            if review.review_id and review.review_id not in api_reviews:
                api_reviews[review.review_id] = {
                    'review_id': review.review_id,
                    'author': review.author,
                    'rating': review.rating,
                    'text': review.text,
                    'date_text': review.date_text,
                    'avatar_url': review.avatar_url,
                    'profile_url': review.profile_url,
                }
    except Exception as exc:  # interceptor is project code; stay best-effort
        log.debug("API collection failed: %s", exc)


def _collect_dom_reviews(driver, dom_reviews):
    """Add review cards currently in the DOM to *dom_reviews*.

    Cards are keyed by ``(author, date_text[:20])``; a card is only fully
    parsed when its key has not been seen before.
    """
    try:
        review_elements = driver.find_elements(By.CSS_SELECTOR, REVIEW_CARD_SELECTOR)
    except WebDriverException as exc:
        log.debug("DOM collection failed: %s", exc)
        return

    for elem in review_elements[:DOM_ELEMENT_LIMIT]:
        # Quick parse - just the two fields that make up the dedup key.
        author = _find_text(elem, 'div.d4r55')
        date_text = _find_text(elem, 'span.rsqaWe')
        if not (author and date_text):
            continue
        dom_key = (author, date_text[:20])
        if dom_key in dom_reviews:
            continue
        dom_review = parse_dom_review_element(elem)
        if dom_review:
            dom_reviews[dom_key] = dom_review


def parallel_hybrid_scrape():
    """Collect API + DOM simultaneously during scrolling.

    Reads ``url`` and ``headless`` from ``config.yaml``, opens the page,
    scrolls the reviews pane while gathering intercepted API responses and
    (near the end) DOM review cards, merges both sources, prints a summary and
    writes the result to ``google_reviews_parallel_hybrid.json``.

    Returns:
        A list of review dicts; an empty list when the reviews pane cannot be
        found.

    Raises:
        ValueError: if the configuration does not define ``url``.
    """
    config = load_config()
    url = config.get('url')
    if not url:
        raise ValueError("config.yaml must define a non-empty 'url'")
    headless = config.get('headless', False)

    print("PARALLEL HYBRID SCRAPER - Collecting API + DOM simultaneously...")
    print(f"URL: {url[:80]}...")

    start_time = time.time()
    api_reviews = {}
    dom_reviews = {}

    driver = Driver(uc=True, headless=headless, page_load_strategy="normal")

    try:
        # Step 1: Navigate
        driver.get(url)
        time.sleep(1.5)

        _dismiss_cookie_banner(driver)
        _open_reviews_tab(driver)

        # Wait for page stability
        time.sleep(1.0)

        pane = _find_reviews_pane(driver)
        if pane is None:
            print("ERROR: Could not find pane")
            return []

        # Wait for reviews to start loading
        time.sleep(1.5)

        # Setup API interceptor
        interceptor = GoogleMapsAPIInterceptor(driver)
        interceptor.setup_interception()
        interceptor.inject_response_interceptor()
        time.sleep(1.0)  # Important: wait for interceptor to be ready

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = (
            "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
        )

        # Trigger initial scroll to get first API response
        driver.execute_script(scroll_script)
        time.sleep(1.0)  # Wait for first API response

        print("Parallel collection (API + DOM simultaneously)...")

        for i in range(MAX_SCROLLS):
            driver.execute_script(scroll_script)
            time.sleep(0.27)  # Optimal scroll timing

            # Collection 1: API responses (always)
            _collect_api_reviews(interceptor, api_reviews)

            # Collection 2: DOM elements (only in the last scrolls, once most
            # reviews have already arrived through the API)
            if i >= DOM_PARSE_START and len(api_reviews) >= DOM_PARSE_MIN_API:
                _collect_dom_reviews(driver, dom_reviews)

            # Progress logging
            if (i + 1) % 10 == 0:
                print(f"  API: {len(api_reviews)}, DOM: {len(dom_reviews)} unique keys...")

        # Final collections
        print("Final collection sweep...")
        _collect_api_reviews(interceptor, api_reviews)
        _collect_dom_reviews(driver, dom_reviews)

        # Merge: Start with API reviews, add DOM reviews that aren't duplicates
        print("\nMerging API + DOM reviews...")

        # Build set of API keys for deduplication (author + date)
        api_keys = {
            (
                api_review.get('author', ''),
                (api_review.get('date_text', '') or '')[:20],
            )
            for api_review in api_reviews.values()
        }

        # Add unique DOM reviews. The count is derived from the dict size so
        # that two DOM entries sharing one review_id are not counted twice.
        api_count = len(api_reviews)
        for dom_key, dom_review in dom_reviews.items():
            if dom_key not in api_keys and dom_review.get('review_id'):
                api_reviews[dom_review['review_id']] = dom_review
        dom_added = len(api_reviews) - api_count

        elapsed = time.time() - start_time
        all_reviews = list(api_reviews.values())

        print(f"\n{'='*50}")
        print("✅ COMPLETED!")
        print(f"Reviews: {len(all_reviews)}/244 ({len(all_reviews)/EXPECTED_REVIEWS*100:.1f}%)")
        print(f"  - API: {len(api_reviews) - dom_added}")
        print(f"  - DOM: {dom_added} unique")
        print(f"Time: {elapsed:.2f}s")
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
        print(f"Speedup: {BASELINE_SECONDS/elapsed:.1f}x faster! 🚀")
        print(f"{'='*50}")

        if len(all_reviews) >= EXPECTED_REVIEWS:
            print("🎯 Got ALL 244 reviews!")
        elif len(all_reviews) >= 240:
            print(f"⚠️  Missing {EXPECTED_REVIEWS-len(all_reviews)} reviews")

        print()

        # Save
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(all_reviews, f, indent=2, ensure_ascii=False)

        print("💾 Saved to google_reviews_parallel_hybrid.json")

        if all_reviews:
            print(f"\nSample: {all_reviews[0]['author']} - {all_reviews[0]['rating']}★")

        return all_reviews

    finally:
        try:
            driver.quit()
        except Exception as exc:  # never mask the original error during cleanup
            log.debug("driver.quit() failed: %s", exc)


if __name__ == '__main__':
    try:
        reviews = parallel_hybrid_scrape()
        sys.exit(0 if reviews else 1)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)