Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
130
debug_detail_page.py
Normal file
130
debug_detail_page.py
Normal file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
"""
Debug script - check detail page after auto-navigation for review count.

Opens a fixed Google Maps search URL in a SeleniumBase driver, accepts the
consent form if the browser is redirected to one, waits for the page to
settle, and then prints three diagnostic reports:

1. every number in the page text that is followed by "review"/"reseña"/"avis",
2. the first 30 short lines of page text that contain a digit,
3. the DOM elements whose text matches the same review-count pattern.

Run it directly (``python debug_detail_page.py``); importing the module only
defines the helpers and does not start a browser.
"""
import re
import time

SEARCH_URL = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"

# A number (digits with optional , or . separators) followed by the word
# "review" in English, Spanish or French. Compiled once, matched
# case-insensitively.
REVIEW_PATTERN = re.compile(r'(\d[\d,\.]*)\s*(?:review|reseña|avis)', re.IGNORECASE)

SEPARATOR = "=" * 80

# Executed in the page. Returns {foundIn: [...], reviewCount: number|null}.
FIND_REVIEW_ELEMENTS_JS = r"""
const info = {
    foundIn: [],
    reviewCount: null
};

const numberPattern = /(\d[\d,\.]*)\s*(?:review|reseña|avis)/i;

// Check ALL elements
for (const elem of document.querySelectorAll('*')) {
    const text = elem.textContent || '';
    const renderedText = elem.innerText || '';

    // Check both textContent and innerText; the Set keeps an element from
    // being reported twice when the two strings are identical.
    for (const txt of new Set([text, renderedText])) {
        if (!txt || txt.length >= 200) {
            continue;
        }
        const match = txt.match(numberPattern);
        if (!match) {
            continue;
        }
        const num = parseInt(match[1].replace(/[,\.\s]/g, ''), 10);
        if (num > 0 && num < 1000000) {
            info.foundIn.push({
                tag: elem.tagName,
                // getAttribute always yields a string or null; className is
                // an SVGAnimatedString object on SVG elements, which the
                // Python side cannot slice.
                class: elem.getAttribute('class'),
                id: elem.id,
                role: elem.getAttribute('role'),
                ariaLabel: elem.getAttribute('aria-label'),
                text: txt.substring(0, 100),
                number: num
            });

            if (!info.reviewCount) {
                info.reviewCount = num;
            }
        }
    }
}

return info;
"""


def find_review_counts(text):
    """Return every number in *text* that directly precedes a review keyword.

    The numbers are returned as matched (separators included), in order of
    appearance. ``None`` or empty text yields an empty list.
    """
    return REVIEW_PATTERN.findall(text or "")


def strip_separators(number_text):
    """Return *number_text* with every ',' and '.' removed."""
    return number_text.replace(',', '').replace('.', '')


def numeric_lines(text, max_length=100):
    """Return the stripped, non-empty lines of *text* that contain a digit.

    Only lines whose stripped length is below *max_length* are kept.
    """
    stripped_lines = (line.strip() for line in (text or "").split('\n'))
    return [
        line
        for line in stripped_lines
        if 0 < len(line) < max_length and re.search(r'\d', line)
    ]


def report_text_matches(all_text):
    """Print the review-count candidates found in the page's plain text."""
    print(SEPARATOR)
    print("SEARCHING FOR REVIEW NUMBERS IN PAGE TEXT:")
    print(SEPARATOR)

    matches = find_review_counts(all_text)
    if not matches:
        print("✗ No review count found in page text")
        return

    print(f"✓ Found {len(matches)} potential review count(s) in text:")
    for i, match in enumerate(matches, 1):
        print(f" {i}. {match} ({strip_separators(match)})")


def report_numeric_lines(all_text):
    """Print the first 30 short page-text lines that contain a digit."""
    print(f"\n{SEPARATOR}")
    print("PAGE TEXT ANALYSIS:")
    print(SEPARATOR)

    print("Lines containing numbers (first 30):")
    for i, line in enumerate(numeric_lines(all_text)[:30], 1):
        print(f" {i}. {line}")


def report_dom_matches(result):
    """Print the outcome of ``FIND_REVIEW_ELEMENTS_JS``.

    *result* is the dict returned by the script; ``None`` is treated as an
    empty result so a failed extraction still produces a report.
    """
    result = result or {}
    found_in = result.get('foundIn') or []

    print(f"\n{SEPARATOR}")
    print("JAVASCRIPT EXTRACTION:")
    print(SEPARATOR)
    print(f"Review Count Found: {result.get('reviewCount')}\n")

    if not found_in:
        print("No elements with review numbers found")
        return

    print("Elements containing review numbers (first 15):")
    for i, elem in enumerate(found_in[:15], 1):
        print(f"\n{i}. <{elem['tag']}> Number: {elem['number']}")
        css_class = elem.get('class')
        role = elem.get('role')
        aria_label = elem.get('ariaLabel')
        if css_class:
            print(f" class: {css_class[:60]}")
        if role:
            print(f" role: {role}")
        if aria_label:
            print(f" aria-label: {aria_label[:80]}")
        print(f" text: {elem['text']}")


def main():
    """Drive the browser through the search URL and print all three reports."""
    # Browser dependencies are imported here rather than at module level so
    # the text helpers above stay importable without Selenium installed.
    from selenium.webdriver.common.by import By
    from seleniumbase import Driver

    driver = Driver(uc=True, headless=True)
    try:
        print(f"Navigating to: {SEARCH_URL}")
        driver.get(SEARCH_URL)
        time.sleep(2)

        # Handle GDPR
        if 'consent.google.com' in driver.current_url:
            print("Handling GDPR...")
            for btn in driver.find_elements(By.CSS_SELECTOR, 'form button'):
                if 'accept all' in (btn.text or '').lower():
                    btn.click()
                    time.sleep(2)
                    break

        # Wait for auto-navigation to complete
        print("Waiting for Google Maps to auto-navigate to business detail page...")
        time.sleep(6)

        current_url = driver.current_url
        print(f"Final URL: {current_url}")
        print(f"On detail page: {'/place/' in current_url}\n")

        # Dump ALL text on the page; fall back to "" if the script returns
        # nothing so the text reports below still run.
        all_text = driver.execute_script("return document.body.innerText;") or ""

        report_text_matches(all_text)
        report_numeric_lines(all_text)

        # Now use JavaScript to find exact element
        report_dom_matches(driver.execute_script(FIND_REVIEW_ELEMENTS_JS))
    finally:
        # Always release the browser, even when a step above raises.
        driver.quit()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user