Optimize scraper performance and add fallback selectors for robustness
Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
97
debug_check.py
Normal file
97
debug_check.py
Normal file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Quick debug to see what's happening"""
|
||||
import yaml
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
def load_config():
    """Load the scraper settings from ``config.yaml`` in the working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file contents
        (``None`` if the file is empty).

    Raises:
        FileNotFoundError: if ``config.yaml`` does not exist.
        yaml.YAMLError: if the file is not valid YAML.
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # which can mis-decode non-ASCII values (e.g. place names in URLs).
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
||||
|
||||
# An empty config.yaml makes safe_load return None; fall back to an empty
# mapping so the lookup below does not fail with AttributeError.
config = load_config() or {}
url = config.get('url')

# Fail fast with a clear message instead of launching a browser and then
# crashing on `url[:100]` when the config has no 'url' entry.
if not url:
    raise SystemExit("config.yaml has no 'url' entry; nothing to load")

driver = Driver(uc=True, headless=False, page_load_strategy="normal")

try:
    print(f"Loading: {url[:100]}")
    driver.get(url)
    time.sleep(3)

    print(f"Title: {driver.title}")
    print(f"URL: {driver.current_url[:100]}")

    time.sleep(2)

    # Handle GDPR consent page
    if 'consent.google.com' in driver.current_url:
        print("On consent page, looking for accept button...")
        try:
            # Candidate locators for the consent button, tried in order.
            # Entries starting with '//' are XPath, everything else is CSS.
            # The text-based ones are XPath because ':has-text()' is not
            # valid CSS for Selenium and would always raise.
            consent_selectors = [
                '//button[contains(., "Accept all")]',
                '//button[contains(., "Aceptar todo")]',
                'button[aria-label*="Accept"]',
                'button[aria-label*="Aceptar"]',
                'form button[type="submit"]',
                '//button[contains(., "Accept")]',
                '//button[contains(., "Aceptar")]',
            ]

            for selector in consent_selectors:
                try:
                    if selector.startswith('//'):
                        btns = driver.find_elements(By.XPATH, selector)
                    else:
                        btns = driver.find_elements(By.CSS_SELECTOR, selector)

                    print(f" Selector '{selector[:30]}...': found {len(btns)} buttons")
                    if btns:
                        print(f" Clicking: {btns[0].text[:50]}")
                        btns[0].click()
                        time.sleep(2)
                        break
                except Exception as e:
                    # Report the failure and move on to the next selector;
                    # a debug script should not hide why a locator failed.
                    print(f" Selector '{selector[:30]}...' failed: {e}")
                    continue

            print(f"After consent click: {driver.current_url[:100]}")
            time.sleep(3)

        except Exception as e:
            print(f"Consent error: {e}")

    # Now try cookie banner on Maps page
    try:
        cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i]')
        print(f"Found {len(cookie_btns)} cookie buttons")
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(1)
    except Exception as e:
        print(f"Cookie error: {e}")

    # Click the first tab whose visible text mentions reviews
    tabs = driver.find_elements(By.CSS_SELECTOR, '.LRkQ2, button[role="tab"]')
    print(f"Found {len(tabs)} tabs")
    for tab in tabs:
        text = (tab.text or '').lower()
        if 'review' in text:
            print(f"Clicking: {tab.text}")
            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", tab)
            break

    time.sleep(3)

    # Count review elements matched by the primary selector
    reviews = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')
    print(f"Found {len(reviews)} review elements")

    # Count pane elements under the main region
    panes = driver.find_elements(By.CSS_SELECTOR, 'div[role="main"] div.m6QErb')
    print(f"Found {len(panes)} pane elements")

    time.sleep(10)  # Keep browser open

finally:
    # Always close the browser, even if a step above raised.
    driver.quit()
|
||||
Reference in New Issue
Block a user