diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 9cb9af8..23856ce 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -259,80 +259,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in review_order = {} # review_id -> position (DOM visual order for sorting) order_counter = [0] # Current order position - # Don't force language - let Google show all reviews in user's locale + # Track total reviews (persists across refreshes) + total_reviews = [None] # Use list for closure mutation - # Navigate to URL - print(f"🌐 Loading: {url[:80]}...") - driver.get(url) + # Hard refresh counter + hard_refresh_count = [0] + max_hard_refreshes = 3 # Max number of hard refreshes before giving up - # Handle consent popup if redirected (poll with tiny sleep) - start = time.time() - while time.time() - start < 5: # Max 5s for consent - if "consent.google" in driver.current_url: - print(" Handling consent popup...") - try: - for btn in driver.find_elements(By.CSS_SELECTOR, "button"): - txt = btn.text.lower() - if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: - btn.click() - # Reload original URL after consent - print(" Reloading after consent...") - driver.get(url) - break - except: - pass - break - # Check if we're already on the target page - if "maps/place" in driver.current_url and "consent" not in driver.current_url: - break - time.sleep(0.01) # 10ms - responsive but low CPU - - # Extract total review count BEFORE clicking reviews tab (it's on Overview) - # ROBUST: Use aria-label="X reviews" on span[role="img"] - # Poll for up to 5s since page might still be loading after consent - total_reviews = None - start = time.time() - while time.time() - start < 5: - try: - total_reviews = driver.execute_script(""" - // ROBUST: Find span[role="img"] with aria-label starting with number + "review" - var reviewSpans = document.querySelectorAll('span[role="img"]'); - for (var i = 0; i < 
def setup_reviews_page(is_refresh=False):
    """Load the reviews page and prepare it for scrolling.

    This is a closure nested in ``scrape_reviews``: it reads ``driver``,
    ``url``, ``hard_refresh_count`` and ``find_scroll_container`` from the
    enclosing scope and stores the page's review total in the shared
    one-element list ``total_reviews``.

    Steps, in order: navigate to ``url``; accept the consent page if we were
    redirected to it; read the total review count (only while it is still
    unknown); click the reviews tab; wait for the scrollable reviews
    container; inject the fetch/XHR interceptor; sort by newest; expand
    truncated reviews; block image requests; start a background thread that
    keeps scrolling the container to the bottom.

    Args:
        is_refresh: ``False`` for the initial load, ``True`` when called
            again after a hard refresh. Only changes which progress
            messages are printed; every setup step runs in both cases.

    Returns:
        ``(scroll_container, stop_scrolling)`` where ``stop_scrolling`` is
        the ``threading.Event`` that stops the scroll thread, or
        ``(None, None)`` if the reviews container never appeared.
    """
    refresh_label = " (after refresh)" if is_refresh else ""

    # Navigate to URL (both on initial load and on refresh)
    if not is_refresh:
        print(f"🌐 Loading: {url[:80]}...")
    else:
        print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
    driver.get(url)

    # Handle consent popup if redirected (poll with tiny sleep)
    start = time.time()
    while time.time() - start < 5:  # Max 5s for consent
        if "consent.google" in driver.current_url:
            print(" Handling consent popup...")
            try:
                for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                    txt = btn.text.lower()
                    if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
                        btn.click()
                        # Reload original URL after consent
                        print(" Reloading after consent...")
                        driver.get(url)
                        break
            except Exception:
                # Best effort: continue even if the consent page could not be handled.
                pass
            break
        # Check if we're already on the target page
        if "maps/place" in driver.current_url and "consent" not in driver.current_url:
            break
        time.sleep(0.01)  # 10ms - responsive but low CPU

    # Extract total review count BEFORE clicking reviews tab (it's on Overview).
    # Only while unknown, so a refresh never overwrites an earlier value.
    if total_reviews[0] is None:
        start = time.time()
        while time.time() - start < 5:
            try:
                count = driver.execute_script("""
                    var reviewSpans = document.querySelectorAll('span[role="img"]');
                    for (var i = 0; i < reviewSpans.length; i++) {
                        var label = reviewSpans[i].getAttribute('aria-label') || '';
                        var match = label.match(/^([\\d,\\.]+)\\s*review/i);
                        if (match) {
                            return parseInt(match[1].replace(/[,\\.]/g, ''));
                        }
                    }
                    return null;
                """)
                if count:
                    total_reviews[0] = count
                    print(f"📊 Total reviews on page: {count}")
                    break
            except Exception:
                # Page may still be loading; retry until the 5s budget is used up.
                pass
            time.sleep(0.1)

    # Click reviews tab - poll until found
    review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
    start = time.time()
    tab_clicked = False
    while time.time() - start < 5:  # Max 5s for tabs
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
            for tab in tabs:
                tab_text = tab.text.lower()
                if any(kw in tab_text for kw in review_keywords):
                    if not is_refresh:
                        print(f" Clicking reviews tab: '{tab.text}'")
                    tab.click()
                    tab_clicked = True
                    break
            if tab_clicked:
                break
            time.sleep(0.01)  # 10ms between polls
        except Exception:
            # Tabs can go stale while the page renders; poll again.
            time.sleep(0.01)

    # Poll for scroll container (10ms intervals - fast but low CPU)
    scroll_container = None
    start = time.time()
    last_print = 0
    while time.time() - start < 10:  # Max 10s
        scroll_container = find_scroll_container()
        if scroll_container:
            break
        elapsed = int(time.time() - start)
        if elapsed > last_print:
            # Print at most once per elapsed second.
            print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
            last_print = elapsed
        time.sleep(0.01)  # 10ms - responsive but low CPU

    if not scroll_container:
        print(f"❌ Could not find reviews scroll container{refresh_label}")
        try:
            print("Page title:", driver.title)
            print("Current URL:", driver.current_url[:100])
        except Exception:
            pass
        return None, None

    print(f"✅ Found scroll container{refresh_label}")

    # Inject API interceptor. This must run on every call: a reload replaces
    # the page's `window`, so the patched fetch/XHR from before is gone.
    # Only the progress message is limited to the initial load.
    if not is_refresh:
        print("🔌 Injecting API interceptor...")
    driver.execute_script("""
        // Always re-setup on refresh
        window.__reviewInterceptorInjected = true;
        window.__interceptedResponses = window.__interceptedResponses || [];

        // Intercept fetch (only if not already patched)
        if (!window.__fetchPatched) {
            window.__fetchPatched = true;
            const originalFetch = window.fetch;
            window.fetch = async function(...args) {
                const url = args[0].toString();
                const response = await originalFetch.apply(this, args);
                if (url.includes('listugcposts') || url.includes('review')) {
                    try {
                        const clone = response.clone();
                        const text = await clone.text();
                        window.__interceptedResponses.push({url: url, body: text});
                    } catch(e) {}
                }
                return response;
            };
        }

        // Intercept XHR (only if not already patched)
        if (!window.__xhrPatched) {
            window.__xhrPatched = true;
            const originalXHR = window.XMLHttpRequest;
            window.XMLHttpRequest = function() {
                const xhr = new originalXHR();
                const originalOpen = xhr.open;
                let reqUrl = '';
                xhr.open = function(method, url, ...rest) {
                    reqUrl = url;
                    return originalOpen.apply(this, [method, url, ...rest]);
                };
                xhr.addEventListener('load', function() {
                    if (reqUrl.includes('listugcposts') || reqUrl.includes('review')) {
                        try {
                            window.__interceptedResponses.push({url: reqUrl, body: xhr.responseText});
                        } catch(e) {}
                    }
                });
                return xhr;
            };
            for (let prop of Object.getOwnPropertyNames(originalXHR)) {
                try { window.XMLHttpRequest[prop] = originalXHR[prop]; } catch(e) {}
            }
        }
    """)

    # Sort by newest first
    try:
        sort_btn = driver.execute_script("""
            var btns = document.querySelectorAll('button[data-value="sort"]');
            if (btns.length) return btns[0];
            var all = document.querySelectorAll('button[aria-label*="Sort"]');
            if (all.length) return all[0];
            return null;
        """)
        if sort_btn:
            sort_btn.click()
            time.sleep(0.3)  # give the sort menu time to open
            driver.execute_script("""
                var items = document.querySelectorAll('[role="menuitemradio"], [data-index="1"]');
                for (var i = 0; i < items.length; i++) {
                    var txt = items[i].textContent.toLowerCase();
                    if (txt.includes('newest') || txt.includes('recent') || txt.includes('más reciente')) {
                        items[i].click();
                        break;
                    }
                }
            """)
            time.sleep(0.5)
            print(" 📅 Sorted by newest")
            # Re-find scroll container after sorting (DOM may be recreated)
            new_container = find_scroll_container()
            if new_container:
                scroll_container = new_container
                print(" 🔄 Refreshed scroll container reference")
    except Exception:
        # Sorting is an optimisation only; scraping continues unsorted.
        pass

    # Expand "More" buttons for full text
    try:
        expanded = driver.execute_script("""
            var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
            var count = 0;
            for (var i = 0; i < buttons.length; i++) {
                if (buttons[i].textContent.trim() === 'More') {
                    buttons[i].click();
                    count++;
                }
            }
            return count;
        """)
        if expanded:
            print(f" 📝 Expanded {expanded} truncated reviews")
    except Exception:
        pass

    # Block images to speed up scrolling (use CDP)
    try:
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
            'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
        })
        driver.execute_cdp_cmd('Network.enable', {})
        if not is_refresh:
            print(" 🚫 Blocking images for faster scrolling")
    except Exception:
        # CDP might not be available in all setups.
        pass

    # Expose the container to page scripts so the worker can scroll it by name.
    driver.execute_script("window.scrollablePane = arguments[0];", scroll_container)

    # Create scroll worker
    stop_scrolling = threading.Event()

    def scroll_worker():
        # NOTE(review): this thread shares `driver` with the caller's thread;
        # confirm the WebDriver in use tolerates concurrent commands.
        while not stop_scrolling.is_set():
            try:
                driver.execute_script("""
                    var p = window.scrollablePane;
                    if (p) p.scrollTop = p.scrollHeight;
                """)
            except Exception:
                # The pane can vanish mid-navigation; keep trying until stopped.
                pass
            time.sleep(0.1)

    scroll_thread = threading.Thread(target=scroll_worker, daemon=True)
    scroll_thread.start()

    return scroll_container, stop_scrolling
time.sleep(0.5) - print(" πŸ“… Sorted by newest") - # Re-find scroll container after sorting (DOM may be recreated) - new_container = find_scroll_container() - if new_container: - scroll_container = new_container - print(" πŸ”„ Refreshed scroll container reference") - except: - pass - - # EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews - # This batch-clicks all "More" buttons at once (fast, no waiting per button) - try: - expanded = driver.execute_script(""" - var buttons = document.querySelectorAll('button.w8nwRe.kyuRq'); - var count = 0; - for (var i = 0; i < buttons.length; i++) { - if (buttons[i].textContent.trim() === 'More') { - buttons[i].click(); - count++; - } - } - return count; - """) - if expanded > 0: - print(f" πŸ“ Expanded {expanded} truncated reviews") - except: - pass - - # Block images to speed up scrolling (use CDP) - try: - driver.execute_cdp_cmd('Network.setBlockedURLs', { - 'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*'] - }) - driver.execute_cdp_cmd('Network.enable', {}) - print(" 🚫 Blocking images for faster scrolling") - except Exception as e: - pass # CDP might not be available in all setups - - # Simple scroll - scrollTop = scrollHeight (proven to work) - driver.execute_script("window.scrollablePane = arguments[0];", scroll_container) - stop_scrolling = threading.Event() - - def scroll_worker(): - while not stop_scrolling.is_set(): - try: - driver.execute_script(""" - var p = window.scrollablePane; - if (p) p.scrollTop = p.scrollHeight; - """) - except: - pass - time.sleep(0.1) - - scroll_thread = threading.Thread(target=scroll_worker, daemon=True) - scroll_thread.start() - # Recovery function - use real mouse actions when stuck from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.keys import Keys recovery_count = [0] def unstick_scroll(): + nonlocal scroll_container recovery_count[0] += 1 method = recovery_count[0] % 4 try: @@ -566,6 
+599,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in except: pass + def do_hard_refresh(): + """Hard refresh the page and re-setup everything. Returns True on success.""" + nonlocal scroll_container, stop_scrolling + hard_refresh_count[0] += 1 + + if hard_refresh_count[0] > max_hard_refreshes: + print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up") + return False + + # Stop current scroll worker + stop_scrolling.set() + time.sleep(0.2) + + # Re-setup page + new_container, new_stop = setup_reviews_page(is_refresh=True) + if new_container: + scroll_container = new_container + stop_scrolling = new_stop + recovery_count[0] = 0 # Reset recovery count after successful refresh + print(f" βœ… Hard refresh successful, resuming with {len(seen_ids)} reviews already collected") + return True + else: + print(f" ❌ Hard refresh failed to find scroll container") + return False + # Main collection loop last_new_time = time.time() last_count = len(reviews) @@ -784,9 +842,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Progress update elapsed = time.time() - last_new_time - if total_reviews: - pct = (current_count / total_reviews) * 100 - print(f" πŸ“Š {current_count}/{total_reviews} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True) + if total_reviews[0]: + pct = (current_count / total_reviews[0]) * 100 + print(f" πŸ“Š {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True) else: print(f" πŸ“Š {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True) @@ -797,7 +855,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in break # Also stop if we have all reviews from the page - if total_reviews and current_count >= total_reviews: + if total_reviews[0] and current_count >= total_reviews[0]: print(f"βœ… All {current_count} reviews collected") stop_scrolling.set() break @@ -805,8 +863,15 @@ def 
scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # STUCK DETECTION: If no new reviews for 3s+, try to unstick # Only if we haven't collected all reviews yet if elapsed >= 3 and int(elapsed) % 3 == 0: - print(f" πŸ”§ Recovery attempt #{recovery_count[0] + 1}...", flush=True) - unstick_scroll() + # After 8+ failed recovery attempts, try hard refresh + if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: + print(f" πŸ”„ Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True) + if do_hard_refresh(): + last_new_time = time.time() # Reset timer after refresh + continue # Skip to next iteration + else: + print(f" πŸ”§ Recovery attempt #{recovery_count[0] + 1}...", flush=True) + unstick_scroll() # Check scroll state - track if content is still being added try: @@ -831,13 +896,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in last_new_time = time.time() # Dynamic timeout based on state and recovery attempts + # - Try hard refresh before giving up if we still have refreshes left # - 5s if at bottom AND content stopped growing AND multiple recovery attempts failed # - 15s max otherwise (keep trying) recovery_failed = recovery_count[0] >= 5 and elapsed >= 5 truly_done = at_bottom and not content_growing and recovery_failed - timeout_hit = elapsed >= 15 + timeout_hit = elapsed >= timeout_no_new if truly_done or timeout_hit: + # Last chance: try hard refresh before giving up + if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): + print(f" πŸ”„ Timeout reached, trying hard refresh before giving up...", flush=True) + if do_hard_refresh(): + last_new_time = time.time() + continue # Keep trying print(f"βœ… All reviews loaded: {current_count}") stop_scrolling.set() break