From 6a75159ebe11f571fb38ea59666cca1bfe1dda1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Wed, 21 Jan 2026 20:52:18 +0000 Subject: [PATCH] Use immediate element detection with 10ms polling - Replace fixed waits with tight polling loops - 10ms sleep between polls (responsive but low CPU) - Consent, tabs, scroll container all detected immediately - Total time reduced to ~11-12 seconds Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 80 ++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index a9bba13..a65f432 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -207,43 +207,47 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Navigate to URL print(f"🌐 Loading: {url[:80]}...") driver.get(url) - time.sleep(1) # Minimal wait for redirect check - # Handle consent popup if present - if "consent.google" in driver.current_url: - print(" Handling consent popup...") - try: - accept_btns = driver.find_elements(By.CSS_SELECTOR, "button") - for btn in accept_btns: - txt = btn.text.lower() - if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: - btn.click() - time.sleep(0.5) # Brief wait for consent processing - break - except: - pass - # Reload original URL after consent (redirect can corrupt URL) - print(" Reloading after consent...") - driver.get(url) - time.sleep(1) # Minimal wait for page start + # Handle consent popup if redirected (poll with tiny sleep) + start = time.time() + while time.time() - start < 5: # Max 5s for consent + if "consent.google" in driver.current_url: + print(" Handling consent popup...") + try: + for btn in driver.find_elements(By.CSS_SELECTOR, "button"): + txt = btn.text.lower() + if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: + btn.click() + # Reload original URL after consent + print(" Reloading after consent...") + driver.get(url) + break + except: + pass + break + # Check if we're already on the target page + if "maps/place" in driver.current_url and "consent" not in driver.current_url: + break + time.sleep(0.01) # 10ms - responsive but low CPU - # Click reviews tab if present (multilingual support) + # Click reviews tab - poll until found review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] - for _ in range(10): # Wait up to 2.5s for tabs to appear + start = time.time() + while time.time() - start < 5: # Max 5s for tabs try: tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']") - if tabs: - for tab in tabs: - tab_text = tab.text.lower() - if any(kw in tab_text for kw in review_keywords): - print(f" Clicking reviews tab: '{tab.text}'") - tab.click() - time.sleep(0.3) # Brief wait for tab switch - break - break + for tab in tabs: + tab_text = tab.text.lower() + if any(kw in tab_text for kw in review_keywords): + print(f" Clicking reviews tab: '{tab.text}'") + tab.click() + break + else: + time.sleep(0.01) # 10ms between polls + continue + break # Found and clicked except: - pass - time.sleep(0.25) + time.sleep(0.01) # Find scrollable reviews container def find_scroll_container(): @@ -265,15 +269,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in pass return None - # Wait and retry for scroll container (faster polling) + # Poll for scroll container (10ms intervals - fast but low CPU) scroll_container = None - for attempt in range(20): # 20 x 0.25s = 5s max + start = time.time() + last_print = 0 + while time.time() - start < 10: # Max 10s scroll_container = find_scroll_container() if scroll_container: break - if attempt % 4 == 3: # Print every 1s - print(f" Waiting for reviews panel... ({(attempt+1)//4 + 1}s)") - time.sleep(0.25) + elapsed = int(time.time() - start) + if elapsed > last_print: + print(f" Waiting for reviews panel... ({elapsed}s)") + last_print = elapsed + time.sleep(0.01) # 10ms - responsive but low CPU if not scroll_container: print("❌ Could not find reviews scroll container")