Optimize wait times for faster scraping

- Reduce initial page load wait: 3s -> 1s
- Reduce consent click wait: 2s -> 0.5s
- Reduce post-consent reload wait: 3s -> 1s
- Reduce tab click wait: 2s -> 0.3s
- Poll for the reviews tab instead of a single lookup (up to 10 tries at 0.25s intervals, max ~2.5s)
- Use faster scroll container polling (20 tries at 0.25s intervals instead of 10 at 1s; note the maximum wait drops from 10s to 5s)
- Remove redundant 2s wait after reviews load

Total execution time reduced from ~22s to ~13s

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-21 20:49:12 +00:00
parent 218927bd9b
commit 4f48fb28cd

View File

@@ -207,7 +207,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Navigate to URL
print(f"🌐 Loading: {url[:80]}...")
driver.get(url)
time.sleep(3)
time.sleep(1) # Minimal wait for redirect check
# Handle consent popup if present
if "consent.google" in driver.current_url:
@@ -218,28 +218,32 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
time.sleep(2)
time.sleep(0.5) # Brief wait for consent processing
break
except:
pass
# Reload original URL after consent (redirect can corrupt URL)
print(" Reloading after consent...")
driver.get(url)
time.sleep(3)
time.sleep(1) # Minimal wait for page start
# Click reviews tab if present (multilingual support)
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
print(f" Clicking reviews tab: '{tab.text}'")
tab.click()
time.sleep(2)
for _ in range(10): # Wait up to 2.5s for tabs to appear
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
if tabs:
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
print(f" Clicking reviews tab: '{tab.text}'")
tab.click()
time.sleep(0.3) # Brief wait for tab switch
break
break
except:
pass
except:
pass
time.sleep(0.25)
# Find scrollable reviews container
def find_scroll_container():
@@ -261,14 +265,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass
return None
# Wait and retry for scroll container
# Wait and retry for scroll container (faster polling)
scroll_container = None
for attempt in range(10):
for attempt in range(20): # 20 x 0.25s = 5s max
scroll_container = find_scroll_container()
if scroll_container:
break
print(f" Waiting for reviews panel... ({attempt+1}/10)")
time.sleep(1)
if attempt % 4 == 3: # Print every 1s
print(f" Waiting for reviews panel... ({(attempt+1)//4 + 1}s)")
time.sleep(0.25)
if not scroll_container:
print("❌ Could not find reviews scroll container")
@@ -305,9 +310,6 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
except:
pass
# Wait for reviews to fully load after tab click
time.sleep(2)
# PHASE 2: Inject API interceptor for scroll-loaded reviews
print("🔌 Injecting API interceptor...")
driver.execute_script("""