Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
98 lines
3.0 KiB
Python
98 lines
3.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Quick debug to see what's happening"""
|
|
import yaml
|
|
import time
|
|
from seleniumbase import Driver
|
|
from selenium.webdriver.common.by import By
|
|
|
|
def load_config():
    """Load the debug settings from ``config.yaml``.

    The path is relative, so the file is looked up in the current working
    directory of the process.

    Returns:
        The object produced by ``yaml.safe_load`` for the file contents
        (``None`` when the file is empty).

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit UTF-8 so non-ASCII values are decoded the same way on every
    # platform instead of depending on the locale's default encoding.
    with open('config.yaml', 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
|
|
|
# An empty config.yaml makes safe_load() return None; fall back to an empty
# mapping so the lookup below fails with a clear message instead of an
# AttributeError.
config = load_config() or {}
url = config.get('url')

# Validate before launching Chrome: without a usable URL the script would
# open a browser only to crash on the first `url[:100]` slice.
if not isinstance(url, str) or not url.strip():
    raise SystemExit("config.yaml must define a non-empty 'url' string")

# Visible (non-headless) undetected-Chrome session so the page can be
# inspected by eye while the script runs.
driver = Driver(uc=True, headless=False, page_load_strategy="normal")
|
|
|
|
# Debug walk-through: load the configured page, dismiss consent/cookie
# prompts, open the reviews tab and report how many review/pane elements the
# known selectors find. The browser is always closed in the `finally` clause.
try:
    print(f"Loading: {url[:100]}")
    driver.get(url)
    time.sleep(3)

    print(f"Title: {driver.title}")
    print(f"URL: {driver.current_url[:100]}")

    time.sleep(2)

    # Handle GDPR consent page
    if 'consent.google.com' in driver.current_url:
        print("On consent page, looking for accept button...")
        try:
            # Candidate accept buttons, tried in order. Entries starting with
            # '//' are XPath, everything else is CSS. Text matching has to be
            # XPath: Selenium's CSS engine has no text pseudo-class (the
            # Playwright-only `:has-text()` is rejected as an invalid
            # selector).
            consent_selectors = [
                '//button[contains(., "Accept all")]',
                '//button[contains(., "Aceptar todo")]',
                'button[aria-label*="Accept"]',
                'button[aria-label*="Aceptar"]',
                'form button[type="submit"]',
                '//button[contains(., "Accept")]',
                '//button[contains(., "Aceptar")]',
            ]

            for selector in consent_selectors:
                try:
                    by = By.XPATH if selector.startswith('//') else By.CSS_SELECTOR
                    btns = driver.find_elements(by, selector)

                    print(f" Selector '{selector[:30]}...': found {len(btns)} buttons")
                    if btns:
                        print(f" Clicking: {btns[0].text[:50]}")
                        btns[0].click()
                        time.sleep(2)
                        break
                except Exception as exc:
                    # Best effort: report the failure and try the next
                    # selector. `Exception` (not a bare except) so Ctrl-C
                    # still interrupts the script.
                    print(f" Selector '{selector[:30]}...' failed: {type(exc).__name__}")
                    continue

            print(f"After consent click: {driver.current_url[:100]}")
            time.sleep(3)

        except Exception as e:
            print(f"Consent error: {e}")

    # Now try cookie banner on Maps page
    try:
        cookie_btns = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Accept" i]')
        print(f"Found {len(cookie_btns)} cookie buttons")
        if cookie_btns:
            cookie_btns[0].click()
            time.sleep(1)
    except Exception as e:
        print(f"Cookie error: {e}")

    # Click reviews
    tabs = driver.find_elements(By.CSS_SELECTOR, '.LRkQ2, button[role="tab"]')
    print(f"Found {len(tabs)} tabs")
    # Spanish labels are included because the consent handling above also
    # targets the Spanish UI.
    review_keywords = ('review', 'reseña', 'opinion')
    for tab in tabs:
        text = (tab.text or '').lower()
        if any(keyword in text for keyword in review_keywords):
            print(f"Clicking: {tab.text}")
            # JavaScript click so an overlay covering the tab cannot
            # intercept it.
            driver.execute_script("arguments[0].click();", tab)
            break
    else:
        print("No reviews tab matched")

    time.sleep(3)

    # Check reviews
    reviews = driver.find_elements(By.CSS_SELECTOR, 'div.jftiEf.fontBodyMedium')
    print(f"Found {len(reviews)} review elements")

    # Check pane
    panes = driver.find_elements(By.CSS_SELECTOR, 'div[role="main"] div.m6QErb')
    print(f"Found {len(panes)} pane elements")

    time.sleep(10)  # Keep browser open

finally:
    driver.quit()
|