""" Clean Google Maps Reviews Scraper - Simple down scrolling - DOM scraping + API interception """ import re import json import time import threading from selenium.webdriver.common.by import By def parse_api_review(raw: list) -> dict: """Parse a review from API response array.""" try: if not isinstance(raw, list) or len(raw) < 5: return None author = raw[0] if len(raw) > 0 and isinstance(raw[0], str) else "" timestamp = raw[1] if len(raw) > 1 else "" text = raw[3] if len(raw) > 3 and isinstance(raw[3], str) else "" rating = raw[4] if len(raw) > 4 and isinstance(raw[4], int) else 0 if not (1 <= rating <= 5): return None # Filter out garbage data (language codes, metadata, etc.) if len(author) <= 3: # Real names are longer than 3 chars return None if author.lower() in ['google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt']: return None # Timestamp should look like a date, not a URL or language code if timestamp and ('http' in str(timestamp) or len(str(timestamp)) <= 3): return None # Owner response owner_response = None for idx in [9, 18]: if len(raw) > idx and raw[idx] and isinstance(raw[idx], list): resp = raw[idx] if len(resp) > 1: owner_response = {"text": resp[1], "timestamp": resp[0] if resp[0] else ""} break return { "author": author, "text": text, "rating": rating, "timestamp": timestamp, "owner_response": owner_response, "source": "api" } except: return None def extract_reviews_from_api_body(body: str) -> list: """Extract reviews from API response body using correct Google Maps structure.""" reviews = [] try: # Remove )]}' prefix if body.startswith(")]}'"): body = body[4:].strip() data = json.loads(body) # Google Maps API structure: data[2] contains review arrays # Each review: data[2][X][0] where: # Author: [1][4][5][0] # Rating: [2][0][0] # Text: [2][15][0][0] # Time: [1][6] if not isinstance(data, list) or len(data) < 3: return reviews reviews_area = data[2] if not isinstance(reviews_area, list): return reviews for item in reviews_area: try: if not isinstance(item, list) or len(item) < 1: continue review_data = item[0] if not isinstance(review_data, list) or len(review_data) < 3: continue # Extract fields using correct paths review_id = "" author = "" rating = 0 text = "" timestamp = "" # Review ID: [0] - same format as DOM's data-review-id try: review_id = review_data[0] except (IndexError, TypeError): pass # Author: [1][4][5][0] try: author = review_data[1][4][5][0] except (IndexError, TypeError): pass # Rating: [2][0][0] try: rating = review_data[2][0][0] except (IndexError, TypeError): pass # Text: [2][15][0][0] try: text = review_data[2][15][0][0] except (IndexError, TypeError): pass # Timestamp: [1][6] try: timestamp = review_data[1][6] except (IndexError, TypeError): pass # Validate and add (include review_id for deduplication) if author and isinstance(rating, int) and 1 <= rating <= 5: reviews.append({ "review_id": review_id, "author": author, "text": text or "", "rating": rating, "timestamp": timestamp or "", "source": "api" }) except: continue except: pass return reviews def parse_dom_review(card) -> dict: """Parse a review from DOM element.""" try: # Get review ID review_id = card.get_attribute("data-review-id") or "" if not review_id: try: id_el = card.find_element(By.CSS_SELECTOR, "[data-review-id]") review_id = id_el.get_attribute("data-review-id") or "" except: pass # Author - multiple selectors author = "" for sel in ['div[class*="d4r55"]', '.d4r55', 'button[data-review-id] + div']: try: author_el = card.find_element(By.CSS_SELECTOR, sel) author = author_el.text.strip() if author: break except: pass # Rating from aria-label on span[role="img"] rating = 0 try: stars_el = card.find_element(By.CSS_SELECTOR, 'span[role="img"]') aria = stars_el.get_attribute("aria-label") or "" # Extract number from label (handles "5 stars", "5 estrellas", etc.) num = re.search(r'[\d\.]+', aria.replace(',', '.')) if num: rating = int(float(num.group())) except: pass # Review text - try multiple selectors text = "" for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', 'div.MyEned span.wiI7pd', '.wiI7pd']: try: text_el = card.find_element(By.CSS_SELECTOR, sel) text = text_el.text.strip() if text: break except: pass # Note: "More" button clicking removed for speed # Full text can be expanded later if needed # Timestamp timestamp = "" try: time_el = card.find_element(By.CSS_SELECTOR, 'span[class*="rsqaWe"]') timestamp = time_el.text.strip() except: pass # Owner response owner_response = None try: resp_box = card.find_element(By.CSS_SELECTOR, "div.CDe7pd") if resp_box: resp_text = "" resp_date = "" try: resp_text_el = resp_box.find_element(By.CSS_SELECTOR, "div.wiI7pd") resp_text = resp_text_el.text.strip() except: pass try: resp_date_el = resp_box.find_element(By.CSS_SELECTOR, "span.DZSIDd") resp_date = resp_date_el.text.strip() except: pass if resp_text: owner_response = {"text": resp_text, "timestamp": resp_date} except: pass if not review_id and not author: return None return { "id": review_id, "author": author, "text": text, "rating": rating, "timestamp": timestamp, "owner_response": owner_response, "source": "dom" } except Exception: return None def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, flush_callback=None, flush_batch_size: int = 500) -> dict: """ Scrape Google Maps reviews. Args: driver: Selenium WebDriver instance url: Google Maps place URL max_reviews: Maximum reviews to collect timeout_no_new: Seconds to wait with no new reviews before stopping flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews This allows streaming data to disk and freeing memory flush_batch_size: Number of reviews to collect before flushing (default 500) Returns: dict with reviews list and metadata """ # Storage - use review ID as key reviews = {} # review_id -> review seen_ids = set() # Track all IDs we've seen (persists after flush) total_flushed = [0] # Use list for closure mutation review_order = {} # review_id -> position (DOM visual order for sorting) order_counter = [0] # Current order position # Track total reviews (persists across refreshes) total_reviews = [None] # Use list for closure mutation # Hard refresh counter hard_refresh_count = [0] max_hard_refreshes = 3 # Max number of hard refreshes before giving up # Find scrollable reviews container helper def find_scroll_container(): selectors = [ "div.m6QErb.DxyBCb.kA9KIf.dS8AEf", "div.m6QErb.DxyBCb.kA9KIf", "div.m6QErb.DxyBCb", "div.m6QErb[aria-label]", "div.DxyBCb.kA9KIf.dS8AEf", "div[role='main'] div.m6QErb", ] for sel in selectors: try: els = driver.find_elements(By.CSS_SELECTOR, sel) for el in els: if el.is_displayed() and el.size['height'] > 100: return el except: pass return None def setup_reviews_page(is_refresh=False): """ Setup the reviews page for scraping. Returns (scroll_container, stop_scrolling_event) or (None, None) on failure. Can be called after initial load or after a hard refresh. """ nonlocal total_reviews refresh_label = " (after refresh)" if is_refresh else "" # Navigate to URL (only on initial load or refresh) if not is_refresh: print(f"🌐 Loading: {url[:80]}...") else: print(f"πŸ”„ Hard refresh #{hard_refresh_count[0]}: reloading page...") driver.get(url) # Handle consent popup if redirected (poll with tiny sleep) start = time.time() while time.time() - start < 5: # Max 5s for consent if "consent.google" in driver.current_url: print(" Handling consent popup...") try: for btn in driver.find_elements(By.CSS_SELECTOR, "button"): txt = btn.text.lower() if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: btn.click() # Reload original URL after consent print(" Reloading after consent...") driver.get(url) break except: pass break # Check if we're already on the target page if "maps/place" in driver.current_url and "consent" not in driver.current_url: break time.sleep(0.01) # 10ms - responsive but low CPU # Extract total review count BEFORE clicking reviews tab (it's on Overview) # Only on first load (don't overwrite if we already have it) if total_reviews[0] is None: start = time.time() while time.time() - start < 5: try: count = driver.execute_script(""" var reviewSpans = document.querySelectorAll('span[role="img"]'); for (var i = 0; i < reviewSpans.length; i++) { var label = reviewSpans[i].getAttribute('aria-label') || ''; var match = label.match(/^([\\d,\\.]+)\\s*review/i); if (match) { return parseInt(match[1].replace(/[,\\.]/g, '')); } } return null; """) if count: total_reviews[0] = count print(f"πŸ“Š Total reviews on page: {count}") break except: pass time.sleep(0.1) # Click reviews tab - poll until found review_keywords = ["review", "reseΓ±a", "avis", "bewertung", "recensione", "opiniones"] start = time.time() tab_clicked = False while time.time() - start < 5: # Max 5s for tabs try: tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']") for tab in tabs: tab_text = tab.text.lower() if any(kw in tab_text for kw in review_keywords): if not is_refresh: print(f" Clicking reviews tab: '{tab.text}'") tab.click() tab_clicked = True break if tab_clicked: break time.sleep(0.01) # 10ms between polls except: time.sleep(0.01) # Poll for scroll container (10ms intervals - fast but low CPU) scroll_container = None start = time.time() last_print = 0 while time.time() - start < 10: # Max 10s scroll_container = find_scroll_container() if scroll_container: break elapsed = int(time.time() - start) if elapsed > last_print: print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)") last_print = elapsed time.sleep(0.01) # 10ms - responsive but low CPU if not scroll_container: print(f"❌ Could not find reviews scroll container{refresh_label}") try: print("Page title:", driver.title) print("Current URL:", driver.current_url[:100]) except: pass return None, None print(f"βœ… Found scroll container{refresh_label}") # Inject API interceptor (needs to be re-injected after refresh) if not is_refresh: print("πŸ”Œ Injecting API interceptor...") driver.execute_script(""" // Always re-setup on refresh window.__reviewInterceptorInjected = true; window.__interceptedResponses = window.__interceptedResponses || []; // Intercept fetch (only if not already patched) if (!window.__fetchPatched) { window.__fetchPatched = true; const originalFetch = window.fetch; window.fetch = async function(...args) { const url = args[0].toString(); const response = await originalFetch.apply(this, args); if (url.includes('listugcposts') || url.includes('review')) { try { const clone = response.clone(); const text = await clone.text(); window.__interceptedResponses.push({url: url, body: text}); } catch(e) {} } return response; }; } // Intercept XHR (only if not already patched) if (!window.__xhrPatched) { window.__xhrPatched = true; const originalXHR = window.XMLHttpRequest; window.XMLHttpRequest = function() { const xhr = new originalXHR(); const originalOpen = xhr.open; let reqUrl = ''; xhr.open = function(method, url, ...rest) { reqUrl = url; return originalOpen.apply(this, [method, url, ...rest]); }; xhr.addEventListener('load', function() { if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) { try { window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText}); } catch(e) {} } }); return xhr; }; for (let prop of Object.getOwnPropertyNames(originalXHR)) { try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {} } } """) # Sort by newest first try: sort_btn = driver.execute_script(""" var btns = document.querySelectorAll('button[data-value="sort"]'); if (btns.length) return btns[0]; var all = document.querySelectorAll('button[aria-label*="Sort"]'); if (all.length) return all[0]; return null; """) if sort_btn: sort_btn.click() time.sleep(0.3) driver.execute_script(""" var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]'); for (var i = 0; i < items.length; i++) { var txt = items[i].textContent.toLowerCase(); if (txt.includes('newest') || txt.includes('recent') || txt.includes('mΓ‘s reciente')) { items[i].click(); break; } } """) time.sleep(0.5) print(" πŸ“… Sorted by newest") # Re-find scroll container after sorting (DOM may be recreated) new_container = find_scroll_container() if new_container: scroll_container = new_container print(" πŸ”„ Refreshed scroll container reference") except: pass # Expand "More" buttons for full text try: expanded = driver.execute_script(""" var buttons = document.querySelectorAll('button.w8nwRe.kyuRq'); var count = 0; for (var i = 0; i < buttons.length; i++) { if (buttons[i].textContent.trim() === 'More') { buttons[i].click(); count++; } } return count; """) if expanded > 0: print(f" πŸ“ Expanded {expanded} truncated reviews") except: pass # Block images to speed up scrolling (use CDP) try: driver.execute_cdp_cmd('Network.setBlockedURLs', { 'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*'] }) driver.execute_cdp_cmd('Network.enable', {}) if not is_refresh: print(" 🚫 Blocking images for faster scrolling") except: pass # Setup scrollable pane reference driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) # Create scroll worker stop_scrolling = threading.Event() def scroll_worker(): while not stop_scrolling.is_set(): try: driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; """) except: pass time.sleep(0.1) scroll_thread = threading.Thread(target=scroll_worker, daemon=True) scroll_thread.start() return scroll_container, stop_scrolling # Initial page setup scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False) if not scroll_container: return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"} def get_api_reviews(): """Get reviews from intercepted API responses.""" api_revs = [] try: responses = driver.execute_script(""" var r = window.__interceptedResponses || []; window.__interceptedResponses = []; return r; """) for resp in (responses or []): body = resp.get("body", "") api_revs.extend(extract_reviews_from_api_body(body)) except: pass return api_revs # Recovery function - use real mouse actions when stuck from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.keys import Keys recovery_count = [0] def unstick_scroll(): nonlocal scroll_container recovery_count[0] += 1 method = recovery_count[0] % 4 try: if method == 1: # Method 1: Click pane and send Page Down keys scroll_container.click() ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform() ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform() elif method == 2: # Method 2: Real mouse wheel scroll ActionChains(driver).move_to_element(scroll_container)\ .scroll_by_amount(0, 800).perform() elif method == 3: # Method 3: Scroll up significantly then back down (force reload) driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = Math.max(0, p.scrollTop - 2000); """) time.sleep(0.3) driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; """) else: # Method 4: Scroll last card into view, then scroll pane (no click to avoid opening profile) driver.execute_script(""" var cards = document.querySelectorAll('[data-review-id]'); if (cards.length > 0) { cards[cards.length - 1].scrollIntoView({block: 'end', behavior: 'smooth'}); } """) time.sleep(0.3) driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; """) except: pass def do_hard_refresh(): """Hard refresh the page and re-setup everything. Returns True on success.""" nonlocal scroll_container, stop_scrolling hard_refresh_count[0] += 1 if hard_refresh_count[0] > max_hard_refreshes: print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up") return False # Stop current scroll worker stop_scrolling.set() time.sleep(0.2) # Re-setup page new_container, new_stop = setup_reviews_page(is_refresh=True) if new_container: scroll_container = new_container stop_scrolling = new_stop recovery_count[0] = 0 # Reset recovery count after successful refresh print(f" βœ… Hard refresh successful, resuming with {len(seen_ids)} reviews already collected") return True else: print(f" ❌ Hard refresh failed to find scroll container") return False # Main collection loop last_new_time = time.time() last_count = len(reviews) check_num = 0 print(f"πŸ”„ Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True) cycle_start = time.time() while True: check_num += 1 time.sleep(1.0) # Check every second # TIMING: Track cycle performance t0 = time.time() cycle_delta = t0 - cycle_start cycle_start = t0 # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language # Use review_id as key to avoid duplicates with DOM t1 = time.time() for rev in get_api_reviews(): rid = rev.get('review_id', '') if rid and rid not in seen_ids: reviews[rid] = rev seen_ids.add(rid) api_time = time.time() - t1 # Expand any new "More" buttons for full text (batch click, fast) try: driver.execute_script(""" var buttons = document.querySelectorAll('button.w8nwRe.kyuRq'); for (var i = 0; i < buttons.length; i++) { if (buttons[i].textContent.trim() === 'More') { buttons[i].click(); } } """) except: pass # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) # This survives Google's CSS class name changes # Also removes separators from previously-hidden cards to keep DOM light t2 = time.time() dom_cards = 0 try: seen_list = list(seen_ids) parsed_reviews = driver.execute_script(""" var seenSet = new Set(arguments[0]); var results = []; var processedIds = new Set(); var sepsRemoved = 0; // ROBUST: Find cards by data attribute only (not class names) var cards = document.querySelectorAll('[data-review-id]'); for (var i = 0; i < cards.length; i++) { var card = cards[i]; var rid = card.getAttribute('data-review-id'); var isHidden = card.style.display === 'none'; // CLEANUP: Remove separators adjacent to already-hidden cards // This keeps DOM light without breaking Google's virtual scroll if (isHidden) { var sibling = card.nextElementSibling; while (sibling) { var nextSib = sibling.nextElementSibling; var classes = sibling.className || ''; if (classes.includes('AyRUI') || classes.includes('TFQHme')) { sibling.remove(); sepsRemoved++; sibling = nextSib; } else { break; } } continue; } // Skip if no ID or already processed this cycle if (!rid || processedIds.has(rid)) continue; // Only process top-level review cards (have aria-label with author name) if (!card.getAttribute('aria-label')) continue; processedIds.add(rid); // Already seen from API - just track order, skip content if (seenSet.has(rid)) { results.push({id: rid, orderOnly: true}); continue; } var author = '', text = '', rating = 0, timestamp = ''; // AUTHOR: Extract from "Photo of {Name}" button aria-label var photoBtn = card.querySelector('button[aria-label^="Photo of"]'); if (photoBtn) { author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim(); } // Fallback: card's own aria-label is the author name if (!author) { author = card.getAttribute('aria-label') || ''; } // RATING: span with role="img" and aria-label containing "star" var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]'); if (ratingEl) { var match = ratingEl.getAttribute('aria-label').match(/(\\d)/); if (match) rating = parseInt(match[1]); } // TIMESTAMP: Find span with "X time ago" pattern var spans = card.querySelectorAll('span'); for (var j = 0; j < spans.length; j++) { var spanText = spans[j].textContent.trim(); if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) { timestamp = spanText; break; } } // TEXT: Find longest text span (not timestamp/UI elements) var longestText = ''; for (var j = 0; j < spans.length; j++) { var spanText = spans[j].textContent.trim(); if (spanText === timestamp) continue; if (spanText.match(/^\\d+ stars?$/i)) continue; if (spanText === 'More' || spanText === 'Less') continue; if (spanText.match(/^(Like\\d*|Share)$/)) continue; if (spanText.length > longestText.length && spanText.length > 10) { longestText = spanText; } } text = longestText; if (author && rating >= 1 && rating <= 5) { results.push({ id: rid, orderOnly: false, author: author, text: text, rating: rating, timestamp: timestamp, source: 'dom' }); // Hide processed card (separators removed on next cycle) card.style.display = 'none'; card.innerHTML = ''; } } return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved}; """, seen_list) dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0 new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else [] for rev in new_reviews: rid = rev.pop('id') order_only = rev.pop('orderOnly', False) # Track DOM order for ALL reviews (for sorting output) if rid not in review_order: review_order[rid] = order_counter[0] order_counter[0] += 1 # Only add content for new reviews (not already from API) if not order_only: reviews[rid] = rev seen_ids.add(rid) except Exception as e: print(f" ❌ DOM parse error: {e}") dom_time = time.time() - t2 # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory # Sort by DOM order before flushing t3 = time.time() if flush_callback and len(reviews) >= flush_batch_size: print(f" πŸ’Ύ Flushing {len(reviews)} reviews to disk...") sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) reviews.clear() # Free memory, but keep seen_ids and review_order flush_time = time.time() - t3 current_count = total_flushed[0] + len(reviews) # TIMING: Print if cycle is slow (>2s) if cycle_delta > 2.0: print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})") # Check for new reviews if current_count > last_count: last_new_time = time.time() last_count = current_count # Check if loading (spinner visible OR network activity) try: loading_status = driver.execute_script(""" var status = {spinner: false, network: false}; // Check for Google's loading indicators var spinner = document.querySelector('div[role="progressbar"]'); if (spinner && spinner.offsetParent !== null) status.spinner = true; var loading = document.querySelector('.qjESne, .loading'); if (loading && loading.offsetParent !== null) status.spinner = true; // Check for recent network activity (API interceptor) var responses = window.__interceptedResponses || []; var lastCount = window.__lastResponseCount || 0; if (responses.length > lastCount) { status.network = true; window.__lastResponseCount = responses.length; } return status; """) is_loading = loading_status.get('spinner') or loading_status.get('network') if is_loading: last_new_time = time.time() # Reset timer while loading except: is_loading = False # Progress update elapsed = time.time() - last_new_time if total_reviews[0]: pct = (current_count / total_reviews[0]) * 100 print(f" πŸ“Š {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True) else: print(f" πŸ“Š {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) # Stop conditions - check BEFORE recovery attempts if current_count >= max_reviews: print(f"βœ… Reached max: {current_count}") stop_scrolling.set() break # Also stop if we have all reviews from the page if total_reviews[0] and current_count >= total_reviews[0]: print(f"βœ… All {current_count} reviews collected") stop_scrolling.set() break # STUCK DETECTION: If no new reviews for 3s+, try to unstick # Only if we haven't collected all reviews yet if elapsed >= 3 and int(elapsed) % 3 == 0: # After 8+ failed recovery attempts, try hard refresh if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: print(f" πŸ”„ Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True) if do_hard_refresh(): last_new_time = time.time() # Reset timer after refresh continue # Skip to next iteration else: print(f" πŸ”§ Recovery attempt #{recovery_count[0] + 1}...", flush=True) unstick_scroll() # Check scroll state - track if content is still being added try: scroll_state = driver.execute_script(""" var p = window.scrollablePane; if (!p) return {atBottom: true, height: 0}; var atBottom = (p.scrollTop + p.clientHeight >= p.scrollHeight - 50); var height = p.scrollHeight; var lastHeight = window.__lastScrollHeight || 0; var growing = height > lastHeight; window.__lastScrollHeight = height; return {atBottom: atBottom, height: height, growing: growing}; """) at_bottom = scroll_state.get('atBottom', True) content_growing = scroll_state.get('growing', False) except: at_bottom = True content_growing = False # Reset timer if content is growing (new reviews loading) if content_growing: last_new_time = time.time() # Dynamic timeout based on state and recovery attempts # - Try hard refresh before giving up if we still have refreshes left # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed # - 15s max otherwise (keep trying) recovery_failed = recovery_count[0] >= 5 and elapsed >= 5 truly_done = at_bottom and not content_growing and recovery_failed timeout_hit = elapsed >= timeout_no_new if truly_done or timeout_hit: # Last chance: try hard refresh before giving up if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): print(f" πŸ”„ Timeout reached, trying hard refresh before giving up...", flush=True) if do_hard_refresh(): last_new_time = time.time() continue # Keep trying print(f"βœ… All reviews loaded: {current_count}") stop_scrolling.set() break # Flush any remaining reviews (sorted by DOM order) if flush_callback and reviews: print(f" πŸ’Ύ Final flush: {len(reviews)} reviews...") sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) reviews.clear() # Reviews already parsed during scrolling (real-time parsing) print("πŸ“ Finalizing review data...") # Final results (sorted by DOM order) sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) review_list = [r for _, r in sorted_items] grand_total = total_flushed[0] + len(review_list) dom_count = sum(1 for r in review_list if r.get("source") == "dom") api_count = sum(1 for r in review_list if r.get("source") == "api") if total_flushed[0] > 0: print(f"\nπŸ“‹ Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})") else: print(f"\nπŸ“‹ Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") return { "reviews": review_list, # Only unflushed reviews (flushed already sent to callback) "total": grand_total, "total_flushed": total_flushed[0], "checks": check_num, "url": url } def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, progress_callback=None, driver=None, return_driver: bool = False): """ Production-compatible wrapper for scrape_reviews. Matches the API expected by job_manager.py. Args: url: Google Maps URL to scrape headless: Run Chrome in headless mode max_scrolls: Not used (kept for API compatibility) progress_callback: Optional callback(current_count, total_count) for progress driver: Existing driver instance to reuse return_driver: If True, return driver in result Returns: Dictionary with: reviews, count, total_reviews, time, success, error, driver """ from seleniumbase import Driver start_time = time.time() driver_provided = driver is not None should_close_driver = not return_driver and not driver_provided try: # Create driver if not provided if not driver: driver = Driver( uc=True, headless=headless, page_load_strategy="normal", agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) driver.set_window_size(1200, 900) # Proper viewport for Google Maps # Create progress wrapper if callback provided flush_callback = None if progress_callback: collected = [0] def flush_with_progress(reviews_batch): collected[0] += len(reviews_batch) progress_callback(collected[0], None) flush_callback = flush_with_progress # Run the scraper result = scrape_reviews( driver=driver, url=url, max_reviews=999999, # Effectively unlimited timeout_no_new=15, flush_callback=flush_callback, flush_batch_size=100 # Smaller batches for more frequent progress ) elapsed = time.time() - start_time # Return in expected format response = { "reviews": result.get("reviews", []), "count": result.get("total", 0), "total_reviews": result.get("total", 0), "time": elapsed, "success": True, "error": None } if return_driver: response["driver"] = driver elif should_close_driver: try: driver.quit() except: pass return response except Exception as e: elapsed = time.time() - start_time if should_close_driver and driver: try: driver.quit() except: pass return { "reviews": [], "count": 0, "total_reviews": 0, "time": elapsed, "success": False, "error": str(e), "driver": driver if return_driver else None } # Test function if __name__ == "__main__": from seleniumbase import Driver # Test URL - 79 reviews TEST_URL = "https://www.google.com/maps/place/R.+Fleitas+Peluqueros/@28.1302986,-15.4448111,821m/data=!3m1!1e3!4m6!3m5!1s0xc40951a43c21f19:0x85f89601b9909c72!8m2!3d28.1299805!4d-15.4436854!16s%2Fg%2F11gbwtk8c8" print("πŸš€ Starting clean scraper test...") # Set up driver driver = Driver(uc=True, headless=False) driver.set_window_size(1200, 900) try: result = scrape_reviews(driver, TEST_URL, max_reviews=100, timeout_no_new=15) print(f"\nβœ… Got {result['total']} reviews in {result['checks']} checks") # Show sample if result["reviews"]: print("\nπŸ“ Sample review:") sample = result["reviews"][0] print(f" Author: {sample['author']}") print(f" Rating: {sample['rating']}⭐") print(f" Text: {sample['text'][:100]}..." if sample['text'] else " Text: (none)") finally: driver.quit() print("\n🏁 Done")