Optimize wait times for faster scraping
- Reduce initial page load wait: 3s -> 1s
- Reduce consent click wait: 2s -> 0.5s
- Reduce post-consent reload wait: 3s -> 1s
- Reduce tab click wait: 2s -> 0.3s
- Use smart polling for tabs (0.25s intervals, up to 2.5s)
- Use faster scroll container polling (0.25s intervals)
- Remove redundant 2s wait after reviews load

Total execution time reduced from ~22s to ~13s

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -207,7 +207,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
# Navigate to URL
|
# Navigate to URL
|
||||||
print(f"🌐 Loading: {url[:80]}...")
|
print(f"🌐 Loading: {url[:80]}...")
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
time.sleep(3)
|
time.sleep(1) # Minimal wait for redirect check
|
||||||
|
|
||||||
# Handle consent popup if present
|
# Handle consent popup if present
|
||||||
if "consent.google" in driver.current_url:
|
if "consent.google" in driver.current_url:
|
||||||
@@ -218,28 +218,32 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
txt = btn.text.lower()
|
txt = btn.text.lower()
|
||||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||||
btn.click()
|
btn.click()
|
||||||
time.sleep(2)
|
time.sleep(0.5) # Brief wait for consent processing
|
||||||
break
|
break
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
# Reload original URL after consent (redirect can corrupt URL)
|
# Reload original URL after consent (redirect can corrupt URL)
|
||||||
print(" Reloading after consent...")
|
print(" Reloading after consent...")
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
time.sleep(3)
|
time.sleep(1) # Minimal wait for page start
|
||||||
|
|
||||||
# Click reviews tab if present (multilingual support)
|
# Click reviews tab if present (multilingual support)
|
||||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||||
try:
|
for _ in range(10): # Wait up to 2.5s for tabs to appear
|
||||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
try:
|
||||||
for tab in tabs:
|
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||||
tab_text = tab.text.lower()
|
if tabs:
|
||||||
if any(kw in tab_text for kw in review_keywords):
|
for tab in tabs:
|
||||||
print(f" Clicking reviews tab: '{tab.text}'")
|
tab_text = tab.text.lower()
|
||||||
tab.click()
|
if any(kw in tab_text for kw in review_keywords):
|
||||||
time.sleep(2)
|
print(f" Clicking reviews tab: '{tab.text}'")
|
||||||
|
tab.click()
|
||||||
|
time.sleep(0.3) # Brief wait for tab switch
|
||||||
|
break
|
||||||
break
|
break
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
time.sleep(0.25)
|
||||||
|
|
||||||
# Find scrollable reviews container
|
# Find scrollable reviews container
|
||||||
def find_scroll_container():
|
def find_scroll_container():
|
||||||
@@ -261,14 +265,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
pass
|
pass
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Wait and retry for scroll container
|
# Wait and retry for scroll container (faster polling)
|
||||||
scroll_container = None
|
scroll_container = None
|
||||||
for attempt in range(10):
|
for attempt in range(20): # 20 x 0.25s = 5s max
|
||||||
scroll_container = find_scroll_container()
|
scroll_container = find_scroll_container()
|
||||||
if scroll_container:
|
if scroll_container:
|
||||||
break
|
break
|
||||||
print(f" Waiting for reviews panel... ({attempt+1}/10)")
|
if attempt % 4 == 3: # Print every 1s
|
||||||
time.sleep(1)
|
print(f" Waiting for reviews panel... ({(attempt+1)//4 + 1}s)")
|
||||||
|
time.sleep(0.25)
|
||||||
|
|
||||||
if not scroll_container:
|
if not scroll_container:
|
||||||
print("❌ Could not find reviews scroll container")
|
print("❌ Could not find reviews scroll container")
|
||||||
@@ -305,9 +310,6 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Wait for reviews to fully load after tab click
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
# PHASE 2: Inject API interceptor for scroll-loaded reviews
|
# PHASE 2: Inject API interceptor for scroll-loaded reviews
|
||||||
print("🔌 Injecting API interceptor...")
|
print("🔌 Injecting API interceptor...")
|
||||||
driver.execute_script("""
|
driver.execute_script("""
|
||||||
|
|||||||
Reference in New Issue
Block a user