diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index a65f432..1f3b15d 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -200,9 +200,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Storage - use review ID as key reviews = {} # review_id -> review - # Force English language - if "hl=" not in url: - url = url + ("&" if "?" in url else "?") + "hl=en" + # Don't force language - let Google show all reviews in user's locale # Navigate to URL print(f"๐ŸŒ Loading: {url[:80]}...") @@ -380,10 +378,47 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in pass return api_revs - # Store pane in window for scroll thread - driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) + # Sort by newest first (helps with loading) + try: + sort_btn = driver.execute_script(""" + var btns = document.querySelectorAll('button[data-value="sort"]'); + if (btns.length) return btns[0]; + // Try aria-label + var all = document.querySelectorAll('button[aria-label*="Sort"]'); + if (all.length) return all[0]; + return null; + """) + if sort_btn: + sort_btn.click() + time.sleep(0.3) + # Click "Newest" option + driver.execute_script(""" + var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]'); + for (var i = 0; i < items.length; i++) { + var txt = items[i].textContent.toLowerCase(); + if (txt.includes('newest') || txt.includes('recent') || txt.includes('mรกs reciente')) { + items[i].click(); + break; + } + } + """) + time.sleep(0.5) + print(" ๐Ÿ“… Sorted by newest") + except: + pass - # Background scroll thread (fast, continuous) + # Block images to speed up scrolling (use CDP) + try: + driver.execute_cdp_cmd('Network.setBlockedURLs', { + 'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*'] + }) + driver.execute_cdp_cmd('Network.enable', {}) + print(" ๐Ÿšซ Blocking images for faster scrolling") + except Exception as e: + pass # CDP might not be available in all setups + + # Simple scroll - scrollTop = scrollHeight (proven to work) + driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) stop_scrolling = threading.Event() def scroll_worker(): @@ -395,11 +430,57 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in """) except: pass - time.sleep(0.1) # 10x per second + time.sleep(0.1) scroll_thread = threading.Thread(target=scroll_worker, daemon=True) scroll_thread.start() + # Recovery function - use real mouse actions when stuck + from selenium.webdriver.common.action_chains import ActionChains + from selenium.webdriver.common.keys import Keys + recovery_count = [0] + + def unstick_scroll(): + recovery_count[0] += 1 + method = recovery_count[0] % 4 + try: + if method == 1: + # Method 1: Click pane and send Page Down keys + scroll_container.click() + ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform() + ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform() + elif method == 2: + # Method 2: Real mouse wheel scroll + ActionChains(driver).move_to_element(scroll_container)\ + .scroll_by_amount(0, 800).perform() + elif method == 3: + # Method 3: Scroll up significantly then back down (force reload) + driver.execute_script(""" + var p = window.scrollablePane; + if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000); + """) + time.sleep(0.3) + driver.execute_script(""" + var p = window.scrollablePane; + if (p) p.scrollTop = p.scrollHeight; + """) + else: + # Method 4: Click last review card to focus, then scroll + driver.execute_script(""" + var cards = document.querySelectorAll('div.jftiEf[data-review-id]'); + if (cards.length > 0) { + cards[cards.length - 1].scrollIntoView({block: 'end'}); + cards[cards.length - 1].click(); + } + """) + time.sleep(0.2) + driver.execute_script(""" + var p = window.scrollablePane; + if (p) p.scrollTop = p.scrollHeight; + """) + except: + pass + # Main collection loop last_new_time = time.time() last_count = len(reviews) @@ -417,19 +498,17 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in key = f"api_{rev['author'][:20]}_{rev['rating']}" reviews[key] = rev - # Collect review IDs via JavaScript (doesn't affect scroll position!) - # Use specific selector to only get actual review cards, not buttons + # Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!) + # We must parse NOW, not later try: - review_ids = driver.execute_script(""" - var ids = []; - document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) { - ids.push(el.getAttribute('data-review-id')); - }); - return ids; - """) - for rid in (review_ids or []): + cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]") + for card in cards: + rid = card.get_attribute("data-review-id") if rid and rid not in reviews: - reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True} + # Parse immediately - element may be gone later! + review = parse_dom_review(card) + if review: + reviews[rid] = review except: pass @@ -440,6 +519,30 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in last_new_time = time.time() last_count = current_count + # Check if loading (spinner visible OR network activity) + try: + loading_status = driver.execute_script(""" + var status = {spinner: false, network: false}; + // Check for Google's loading indicators + var spinner = document.querySelector('div[role="progressbar"]'); + if (spinner && spinner.offsetParent !== null) status.spinner = true; + var loading = document.querySelector('.qjESne, .loading'); + if (loading && loading.offsetParent !== null) status.spinner = true; + // Check for recent network activity (API interceptor) + var responses = window.__interceptedResponses || []; + var lastCount = window.__lastResponseCount || 0; + if (responses.length > lastCount) { + status.network = true; + window.__lastResponseCount = responses.length; + } + return status; + """) + is_loading = loading_status.get('spinner') or loading_status.get('network') + if is_loading: + last_new_time = time.time() # Reset timer while loading + except: + is_loading = False + # Progress update elapsed = time.time() - last_new_time if total_reviews: @@ -448,37 +551,58 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in else: print(f" ๐Ÿ“Š {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) + # STUCK DETECTION: If no new reviews for 3s+, try to unstick + # Trigger at 3s, 6s, 9s... (every 3 seconds while stuck) + if elapsed >= 3 and int(elapsed) % 3 == 0: + print(f" ๐Ÿ”ง Recovery attempt #{recovery_count[0] + 1}...", flush=True) + unstick_scroll() + # Stop conditions if current_count >= max_reviews: print(f"โœ… Reached max: {current_count}") stop_scrolling.set() break - if total_reviews and current_count >= total_reviews: - print(f"โœ… Got all {total_reviews} reviews!") + # Check scroll state - track if content is still being added + try: + scroll_state = driver.execute_script(""" + var p = window.scrollablePane; + if (!p) return {atBottom: true, height: 0}; + var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50); + var height = p.scrollHeight; + var lastHeight = window.__lastScrollHeight || 0; + var growing = height > lastHeight; + window.__lastScrollHeight = height; + return {atBottom: atBottom, height: height, growing: growing}; + """) + at_bottom = scroll_state.get('atBottom', True) + content_growing = scroll_state.get('growing', False) + except: + at_bottom = True + content_growing = False + + # Reset timer if content is growing (new reviews loading) + if content_growing: + last_new_time = time.time() + + # Dynamic timeout based on state and recovery attempts + # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed + # - 15s max otherwise (keep trying) + recovery_failed = recovery_count[0] >= 5 and elapsed >= 5 + truly_done = at_bottom and not content_growing and recovery_failed + timeout_hit = elapsed >= 15 + + if truly_done or timeout_hit: + print(f"โœ… All reviews loaded: {current_count}") stop_scrolling.set() break - if time.time() - last_new_time >= timeout_no_new: - print(f"โฑ๏ธ Timeout: no new reviews for {timeout_no_new}s") - stop_scrolling.set() - break + # Reviews already parsed during scrolling (real-time parsing) + print("๐Ÿ“ Finalizing review data...") - # FINAL PHASE: Parse full review data from DOM (scroll is stopped) - print("๐Ÿ“ Parsing full review data...") + # Separate API and DOM reviews api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"} - reviews.clear() - - # Parse all DOM cards now that scrolling is done - # Use specific selector to only get actual review cards (div.jftiEf), not buttons - try: - cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]") - for card in cards: - review = parse_dom_review(card) - if review and review.get("id"): - reviews[review["id"]] = review - except Exception as e: - print(f" Warning: DOM parse error: {e}") + dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"} # Merge API reviews (only add if not already in DOM) api_added = 0