Files
whyrating-engine-legacy/debug_detail_page.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

131 lines
4.0 KiB
Python

#!/usr/bin/env python3
"""
Debug script - check detail page after auto-navigation for review count.
"""
import re
import time

from selenium.webdriver.common.by import By
from seleniumbase import Driver
# Horizontal rule printed above and below every report section title.
BANNER = "=" * 80

# Maps search URL that is expected to auto-navigate to the business page.
SEARCH_URL = "https://www.google.com/maps/search/?api=1&query=soho+vilna+club&hl=en"

# A number (digits with optional "," / "." separators) immediately followed by
# the word "review" in English, Spanish or French.
REVIEW_COUNT_RE = re.compile(r'(\d[\d,\.]*)\s*(?:review|reseña|avis)', re.IGNORECASE)

# Browser-side scan: walks every element and records those whose short text
# (< 200 chars) contains a review count. Returns {foundIn: [...], reviewCount}.
FIND_REVIEW_ELEMENTS_JS = r"""
const info = {
    foundIn: [],
    reviewCount: null
};
const numberPattern = /(\d[\d,\.]*)\s*(?:review|reseña|avis)/i;

for (const elem of document.querySelectorAll('*')) {
    // Try textContent first, then innerText; the two can differ for the
    // same element, so either one may be the one that matches.
    const candidates = [elem.textContent || '', elem.innerText || ''];
    for (const txt of candidates) {
        if (!txt || txt.length >= 200) {
            continue;
        }
        const match = txt.match(numberPattern);
        if (!match) {
            continue;
        }
        const num = parseInt(match[1].replace(/[,\.\s]/g, ''), 10);
        if (!(num > 0 && num < 1000000)) {
            continue;
        }
        info.foundIn.push({
            tag: elem.tagName,
            // getAttribute always yields a string (or null); className is
            // an SVGAnimatedString object on SVG elements.
            class: elem.getAttribute('class') || '',
            id: elem.id,
            role: elem.getAttribute('role'),
            ariaLabel: elem.getAttribute('aria-label'),
            text: txt.substring(0, 100),
            number: num
        });
        if (!info.reviewCount) {
            info.reviewCount = num;
        }
        // One entry per element: do not report it again for innerText.
        break;
    }
}
return info;
"""


def find_review_counts(text) -> list:
    """Return each number in *text* that directly precedes a "review" word.

    The numbers are returned as the raw matched strings (separators kept).
    ``None`` or empty text yields an empty list.
    """
    return REVIEW_COUNT_RE.findall(text or "")


def normalize_count(raw: str) -> str:
    """Strip "," and "." separators from a matched review-count string."""
    return raw.replace(',', '').replace('.', '')


def numeric_lines(text, max_len: int = 100) -> list:
    """Return stripped, non-empty lines shorter than *max_len* containing a digit."""
    stripped = (line.strip() for line in (text or "").split('\n'))
    return [
        line for line in stripped
        if line and len(line) < max_len and re.search(r'\d', line)
    ]


def handle_consent(driver) -> None:
    """Click the "Accept all" button if the browser landed on the consent page."""
    if 'consent.google.com' not in driver.current_url:
        return
    print("Handling GDPR...")
    for btn in driver.find_elements(By.CSS_SELECTOR, 'form button'):
        if 'accept all' in (btn.text or '').lower():
            btn.click()
            time.sleep(2)
            break


def report_text_matches(all_text) -> None:
    """Print every review count found in the page's visible text."""
    print(BANNER)
    print("SEARCHING FOR REVIEW NUMBERS IN PAGE TEXT:")
    print(BANNER)

    matches = find_review_counts(all_text)
    if matches:
        print(f"✓ Found {len(matches)} potential review count(s) in text:")
        for i, match in enumerate(matches, 1):
            print(f" {i}. {match} ({normalize_count(match)})")
    else:
        print("✗ No review count found in page text")


def report_number_lines(all_text) -> None:
    """Print the first 30 short page-text lines that contain a number."""
    print(f"\n{BANNER}")
    print("PAGE TEXT ANALYSIS:")
    print(BANNER)

    print("Lines containing numbers (first 30):")
    for i, line in enumerate(numeric_lines(all_text)[:30], 1):
        print(f" {i}. {line}")


def report_js_result(result) -> None:
    """Print the review count and matching elements returned by the JS scan."""
    result = result or {}
    found_in = result.get('foundIn') or []

    print(f"\n{BANNER}")
    print("JAVASCRIPT EXTRACTION:")
    print(BANNER)
    print(f"Review Count Found: {result.get('reviewCount')}\n")

    if not found_in:
        print("No elements with review numbers found")
        return

    print("Elements containing review numbers (first 15):")
    for i, elem in enumerate(found_in[:15], 1):
        print(f"\n{i}. <{elem['tag']}> Number: {elem['number']}")
        if elem['class']:
            print(f" class: {elem['class'][:60]}")
        if elem['role']:
            print(f" role: {elem['role']}")
        if elem['ariaLabel']:
            print(f" aria-label: {elem['ariaLabel'][:80]}")
        print(f" text: {elem['text']}")


def main() -> None:
    """Open the search URL, wait for the detail page, and dump review-count hints."""
    driver = Driver(uc=True, headless=True)
    # try/finally so the browser is shut down even if a step below raises.
    try:
        print(f"Navigating to: {SEARCH_URL}")
        driver.get(SEARCH_URL)
        time.sleep(2)

        handle_consent(driver)

        # Wait for auto-navigation to complete
        print("Waiting for Google Maps to auto-navigate to business detail page...")
        time.sleep(6)
        print(f"Final URL: {driver.current_url}")
        print(f"On detail page: {'/place/' in driver.current_url}\n")

        # Dump ALL text on the page
        all_text = driver.execute_script("return document.body.innerText;")
        report_text_matches(all_text)
        report_number_lines(all_text)

        # Now use JavaScript to find the exact element(s)
        report_js_result(driver.execute_script(FIND_REVIEW_ELEMENTS_JS))
    finally:
        driver.quit()


if __name__ == "__main__":
    main()