Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
131 lines
4.0 KiB
Python
131 lines
4.0 KiB
Python
#!/usr/bin/env python3
"""
Debug script - check detail page after auto-navigation for review count.

Loads a Google Maps search URL in a headless browser, accepts the consent
page when one is shown, pauses to give Maps time to auto-navigate to the
business detail page, and then prints three diagnostics:

1. every "<number> review" style match found in the page's visible text,
2. the first 30 short lines of page text that contain a digit,
3. the DOM elements whose text matches the same pattern (found via
   injected JavaScript), together with the first review count seen.
"""
import re
import time

SEARCH_URL = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"

# A number (digits with optional , or . separators) followed by the word
# "review" in English, Spanish or French.
REVIEW_PATTERN = r'(\d[\d,\.]*)\s*(?:review|reseña|avis)'

RULE = "=" * 80

# Not a raw string on purpose: each "\\" below reaches the browser as a
# single backslash inside the JavaScript regex literals.
JS_FIND_REVIEW_ELEMENTS = """
    const info = {
        foundIn: [],
        reviewCount: null
    };

    const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

    // Check ALL elements
    const allElements = document.querySelectorAll('*');

    for (let elem of allElements) {
        const text = elem.textContent || '';
        const ownText = elem.innerText || '';

        // Check both textContent and innerText
        for (let txt of [text, ownText]) {
            if (txt && txt.length < 200) {
                const match = txt.match(numberPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    if (num > 0 && num < 1000000) {
                        info.foundIn.push({
                            tag: elem.tagName,
                            class: elem.className,
                            id: elem.id,
                            role: elem.getAttribute('role'),
                            ariaLabel: elem.getAttribute('aria-label'),
                            text: txt.substring(0, 100),
                            number: num
                        });

                        if (!info.reviewCount) {
                            info.reviewCount = num;
                        }
                    }
                }
            }
        }
    }

    return info;
"""


def find_review_counts(page_text: str) -> list:
    """Return the raw number strings that precede a "review" word in *page_text*."""
    return re.findall(REVIEW_PATTERN, page_text, re.IGNORECASE)


def normalize_count(raw: str) -> str:
    """Strip ',' and '.' separators from a matched number string."""
    return raw.replace(',', '').replace('.', '')


def numeric_lines(page_text: str, max_len: int = 100) -> list:
    """Return stripped, non-empty lines shorter than *max_len* that contain a digit."""
    stripped = (line.strip() for line in page_text.split('\n'))
    return [s for s in stripped if 0 < len(s) < max_len and re.search(r'\d+', s)]


def report_text_matches(page_text: str) -> None:
    """Print every review-count candidate found in the visible page text."""
    print(RULE)
    print("SEARCHING FOR REVIEW NUMBERS IN PAGE TEXT:")
    print(RULE)

    matches = find_review_counts(page_text)
    if matches:
        print(f"✓ Found {len(matches)} potential review count(s) in text:")
        for i, match in enumerate(matches, 1):
            print(f" {i}. {match} ({normalize_count(match)})")
    else:
        print("✗ No review count found in page text")


def report_numeric_lines(page_text: str) -> None:
    """Print the first 30 short lines of page text that contain a number."""
    print(f"\n{RULE}")
    print("PAGE TEXT ANALYSIS:")
    print(RULE)

    print("Lines containing numbers (first 30):")
    for i, line in enumerate(numeric_lines(page_text)[:30], 1):
        print(f" {i}. {line}")


def report_js_result(result: dict) -> None:
    """Print the review count and matching elements returned by the injected JS."""
    print(f"\n{RULE}")
    print("JAVASCRIPT EXTRACTION:")
    print(RULE)
    print(f"Review Count Found: {result.get('reviewCount')}\n")

    found = result.get('foundIn')
    if not found:
        print("No elements with review numbers found")
        return

    print("Elements containing review numbers (first 15):")
    for i, elem in enumerate(found[:15], 1):
        print(f"\n{i}. <{elem['tag']}> Number: {elem['number']}")
        if elem['class']:
            # On SVG elements className is an SVGAnimatedString object, not a
            # string, so coerce before slicing.
            print(f" class: {str(elem['class'])[:60]}")
        if elem['role']:
            print(f" role: {elem['role']}")
        if elem['ariaLabel']:
            print(f" aria-label: {elem['ariaLabel'][:80]}")
        print(f" text: {elem['text']}")


def main() -> None:
    """Drive the browser through the search page and print all diagnostics."""
    # Imported here so the pure text helpers above stay importable on a
    # machine without the browser stack installed.
    from seleniumbase import Driver
    from selenium.webdriver.common.by import By

    driver = Driver(uc=True, headless=True)
    try:
        print(f"Navigating to: {SEARCH_URL}")
        driver.get(SEARCH_URL)
        time.sleep(2)

        # Handle GDPR
        if 'consent.google.com' in driver.current_url:
            print("Handling GDPR...")
            for btn in driver.find_elements(By.CSS_SELECTOR, 'form button'):
                if 'accept all' in (btn.text or '').lower():
                    btn.click()
                    time.sleep(2)
                    break

        # Wait for auto-navigation to complete
        print("Waiting for Google Maps to auto-navigate to business detail page...")
        time.sleep(6)

        print(f"Final URL: {driver.current_url}")
        print(f"On detail page: {'/place/' in driver.current_url}\n")

        # Dump ALL text on the page
        all_text = driver.execute_script("return document.body.innerText;") or ""

        report_text_matches(all_text)
        report_numeric_lines(all_text)

        # Now use JavaScript to find exact element
        result = driver.execute_script(JS_FIND_REVIEW_ELEMENTS) or {}
        report_js_result(result)
    finally:
        # Always close the browser, even when a step above raises.
        driver.quit()


if __name__ == "__main__":
    main()