Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
218 lines
6.4 KiB
Python
218 lines
6.4 KiB
Python
#!/usr/bin/env python3
"""
Debug script to inspect the actual HTML structure on Google Maps search results.
This will help us identify where the review count is located in the DOM.

The script performs the following steps and prints everything to stdout:

1. Starts a headless Chrome session and opens a Google Maps search URL.
2. Clicks through the GDPR consent form if the browser was redirected to it.
3. Dumps the first 3000 characters of the page's visible text.
4. Lists elements whose short visible text contains the expected review
   count or looks like "<number> review".
5. Runs the business-info extraction JavaScript and prints its result together
   with the debug trail the JavaScript collected.

The browser is always closed at the end, even when one of the steps raises.
"""
import time

from seleniumbase import Driver
from selenium.webdriver.common.by import By

# Review-count text to look for in the DOM; handed to the probe script as
# arguments[0] so the value lives in exactly one place.
EXPECTED_COUNT_TEXT = "152"

# JavaScript probe: returns up to 50 elements whose visible text is short and
# contains the needle (arguments[0]) or matches "<digits> review".
# Backslashes are doubled because this is a regular (non-raw) Python string.
find_elements_script = """
const needle = arguments[0];
const results = [];

for (const elem of document.querySelectorAll('*')) {
    // innerText includes the text of descendants, so ancestors of a matching
    // node are reported too; the 200-character cap filters out big containers.
    const visibleText = elem.innerText || '';

    if (visibleText && visibleText.length < 200 && (visibleText.includes(needle) || /\\d+\\s*review/i.test(visibleText))) {
        results.push({
            tag: elem.tagName,
            class: elem.className,
            id: elem.id,
            text: visibleText.substring(0, 100),
            href: elem.href || null,
            role: elem.getAttribute('role'),
            ariaLabel: elem.getAttribute('aria-label')
        });
    }
}

return results.slice(0, 50);  // First 50 matches
"""

# JavaScript extraction: returns {name, address, rating, total_reviews,
# debug_info}. `address` is never filled in by this script and stays null.
extract_script = """
const info = {
    name: null,
    address: null,
    rating: null,
    total_reviews: null,
    debug_info: []
};

// Extract business name: first selector that yields text wins
const nameSelectors = [
    'h1.DUwDvf',
    '[role="main"] h1',
    'h1.fontHeadlineLarge'
];

for (const selector of nameSelectors) {
    const elem = document.querySelector(selector);
    if (elem && elem.textContent) {
        info.name = elem.textContent.trim();
        info.debug_info.push(`Found name via: ${selector}`);
        break;
    }
}

// Extract rating from the first star image's aria-label
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
if (ratingElem) {
    const ariaLabel = ratingElem.getAttribute('aria-label');
    const match = ariaLabel.match(/([0-9.]+)/);
    if (match) {
        info.rating = parseFloat(match[1]);
        info.debug_info.push(`Found rating: ${info.rating} from aria-label: ${ariaLabel}`);
    }
}

// Extract total review count
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

// Check search panel selectors
const searchPanelSelectors = [
    'a[href*="reviews"]',
    'button[jsaction*="reviews"]',
    'div[role="link"]',
];

for (const selector of searchPanelSelectors) {
    const elements = document.querySelectorAll(selector);
    info.debug_info.push(`Checking ${selector}: found ${elements.length} elements`);

    for (const elem of elements) {
        const text = elem.textContent || '';
        if (text.length < 200) {
            info.debug_info.push(` - text: "${text.substring(0, 100)}"`);
        }

        const match = text.match(numberPattern);
        if (match) {
            // Strip thousands separators before parsing; always base 10.
            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''), 10);
            if (num > 0 && num < 1000000) {
                info.total_reviews = num;
                info.debug_info.push(` ✓ FOUND via ${selector}: ${num}`);
                break;
            }
        }
    }
    if (info.total_reviews) break;
}

// If not found, try all spans/divs
if (!info.total_reviews) {
    const allElements = document.querySelectorAll('span, div, a');
    info.debug_info.push(`Checking all spans/divs/links: ${allElements.length} elements`);

    let checked = 0;
    for (const elem of allElements) {
        const text = elem.textContent || '';
        if (text.length < 100) {
            const match = text.match(numberPattern);
            if (match) {
                checked++;
                if (checked <= 10) {  // Log first 10 matches
                    info.debug_info.push(` - potential match: "${text.substring(0, 80)}"`);
                }

                const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''), 10);
                if (num > 0 && num < 1000000) {
                    info.total_reviews = num;
                    info.debug_info.push(` ✓ FOUND via all elements: ${num} from "${text.substring(0, 80)}"`);
                    break;
                }
            }
        }
    }
}

return info;
"""

# Initialize driver
print("Starting Chrome...")
driver = Driver(
    uc=True,
    headless=True,
    page_load_strategy="normal"
)

# Everything after the browser starts runs inside try/finally so the Chrome
# process is shut down even if navigation or a script call raises.
try:
    # Navigate to Google Maps search for Instinto
    url = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"
    print(f"\nNavigating to: {url}")
    driver.get(url)
    time.sleep(3)

    # Handle GDPR consent if present
    if 'consent.google.com' in driver.current_url:
        print("Handling GDPR consent...")
        try:
            form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
            for btn in form_btns:
                btn_text = (btn.text or '').lower()
                if 'accept all' in btn_text or 'aceptar todo' in btn_text:
                    print(f"Clicking: {btn.text}")
                    btn.click()
                    time.sleep(3)
                    break
            else:
                # for/else: reached only when no button text matched above.
                if len(form_btns) >= 2:
                    print("Using fallback - clicking second button")
                    form_btns[1].click()
                    time.sleep(3)
        except Exception as e:
            # Best effort: report the problem and carry on with the inspection.
            print(f"GDPR handling error: {e}")

    # Wait for page to load
    print("\nWaiting for page to fully load...")
    time.sleep(5)

    print(f"\nCurrent URL: {driver.current_url}")

    # Get all text content on the page
    all_text = driver.execute_script("return document.body.innerText;")
    print("\n" + "=" * 80)
    print("ALL TEXT ON PAGE (first 3000 chars):")
    print("=" * 80)
    # Guard against a null result so slicing cannot raise.
    print((all_text or "")[:3000])

    # Search for elements containing the expected count or "review"
    print("\n" + "=" * 80)
    print(f"SEARCHING FOR ELEMENTS CONTAINING '{EXPECTED_COUNT_TEXT}' OR 'review':")
    print("=" * 80)

    elements_with_numbers = driver.execute_script(
        find_elements_script, EXPECTED_COUNT_TEXT
    ) or []

    for i, elem in enumerate(elements_with_numbers, 1):
        print(f"\n{i}. <{elem['tag']}> "
              f"class='{elem['class'][:50] if elem['class'] else ''}' "
              f"id='{elem['id']}'")
        if elem['role']:
            print(f" role: {elem['role']}")
        if elem['ariaLabel']:
            print(f" aria-label: {elem['ariaLabel'][:100]}")
        if elem['href']:
            print(f" href: {elem['href'][:100]}")
        print(f" text: {elem['text']}")

    # Also check what the extraction script would find
    print("\n" + "=" * 80)
    print("RUNNING ACTUAL EXTRACTION SCRIPT:")
    print("=" * 80)

    # Fall back to an empty mapping so the .get() calls below cannot fail.
    result = driver.execute_script(extract_script) or {}

    print("\nExtracted Info:")
    print(f" Name: {result.get('name')}")
    print(f" Rating: {result.get('rating')}")
    print(f" Total Reviews: {result.get('total_reviews')}")

    print("\nDebug Info:")
    for debug_line in result.get('debug_info', []):
        print(f" {debug_line}")

    print("\n" + "=" * 80)
    print("Done! Closing browser.")
    print("=" * 80)
finally:
    driver.quit()