Use immediate element detection with 10ms polling
- Replace fixed waits with tight polling loops - 10ms sleep between polls (responsive but low CPU) - Consent, tabs, scroll container all detected immediately - Total time reduced to ~11-12 seconds Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -207,43 +207,47 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# Navigate to URL
|
||||
print(f"🌐 Loading: {url[:80]}...")
|
||||
driver.get(url)
|
||||
time.sleep(1) # Minimal wait for redirect check
|
||||
|
||||
# Handle consent popup if present
|
||||
if "consent.google" in driver.current_url:
|
||||
print(" Handling consent popup...")
|
||||
try:
|
||||
accept_btns = driver.find_elements(By.CSS_SELECTOR, "button")
|
||||
for btn in accept_btns:
|
||||
txt = btn.text.lower()
|
||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
time.sleep(0.5) # Brief wait for consent processing
|
||||
break
|
||||
except:
|
||||
pass
|
||||
# Reload original URL after consent (redirect can corrupt URL)
|
||||
print(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
time.sleep(1) # Minimal wait for page start
|
||||
# Handle consent popup if redirected (poll with tiny sleep)
|
||||
start = time.time()
|
||||
while time.time() - start < 5: # Max 5s for consent
|
||||
if "consent.google" in driver.current_url:
|
||||
print(" Handling consent popup...")
|
||||
try:
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||
txt = btn.text.lower()
|
||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
# Reload original URL after consent
|
||||
print(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
break
|
||||
except:
|
||||
pass
|
||||
break
|
||||
# Check if we're already on the target page
|
||||
if "maps/place" in driver.current_url and "consent" not in driver.current_url:
|
||||
break
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
# Click reviews tab if present (multilingual support)
|
||||
# Click reviews tab - poll until found
|
||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||
for _ in range(10): # Wait up to 2.5s for tabs to appear
|
||||
start = time.time()
|
||||
while time.time() - start < 5: # Max 5s for tabs
|
||||
try:
|
||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||
if tabs:
|
||||
for tab in tabs:
|
||||
tab_text = tab.text.lower()
|
||||
if any(kw in tab_text for kw in review_keywords):
|
||||
print(f" Clicking reviews tab: '{tab.text}'")
|
||||
tab.click()
|
||||
time.sleep(0.3) # Brief wait for tab switch
|
||||
break
|
||||
break
|
||||
for tab in tabs:
|
||||
tab_text = tab.text.lower()
|
||||
if any(kw in tab_text for kw in review_keywords):
|
||||
print(f" Clicking reviews tab: '{tab.text}'")
|
||||
tab.click()
|
||||
break
|
||||
else:
|
||||
time.sleep(0.01) # 10ms between polls
|
||||
continue
|
||||
break # Found and clicked
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.25)
|
||||
time.sleep(0.01)
|
||||
|
||||
# Find scrollable reviews container
|
||||
def find_scroll_container():
|
||||
@@ -265,15 +269,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
pass
|
||||
return None
|
||||
|
||||
# Wait and retry for scroll container (faster polling)
|
||||
# Poll for scroll container (10ms intervals - fast but low CPU)
|
||||
scroll_container = None
|
||||
for attempt in range(20): # 20 x 0.25s = 5s max
|
||||
start = time.time()
|
||||
last_print = 0
|
||||
while time.time() - start < 10: # Max 10s
|
||||
scroll_container = find_scroll_container()
|
||||
if scroll_container:
|
||||
break
|
||||
if attempt % 4 == 3: # Print every 1s
|
||||
print(f" Waiting for reviews panel... ({(attempt+1)//4 + 1}s)")
|
||||
time.sleep(0.25)
|
||||
elapsed = int(time.time() - start)
|
||||
if elapsed > last_print:
|
||||
print(f" Waiting for reviews panel... ({elapsed}s)")
|
||||
last_print = elapsed
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
if not scroll_container:
|
||||
print("❌ Could not find reviews scroll container")
|
||||
|
||||
Reference in New Issue
Block a user