Files
whyrating-engine-legacy/debug_soho.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

145 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
Debug script for the business the user actually tried: Soho Vilna Club.
"""
import time
from urllib.parse import quote_plus

from selenium.webdriver.common.by import By
from seleniumbase import Driver
# Business looked up when the script is run without overrides.
DEFAULT_QUERY = "soho vilna club"

# Fixed pauses (seconds) kept from the original debug flow.
PAGE_LOAD_WAIT_S = 3
CONSENT_WAIT_S = 3
SETTLE_WAIT_S = 5

# JavaScript executed in the page. It returns an object with the keys
# `tabs`, `reviewCount`, `businessName`, `rating` and `searchResults`.
# NOTE: this is a regular (non-raw) Python string, so every regex backslash
# is doubled here and reaches the browser as a single backslash.
EXTRACT_INFO_JS = """
    const info = {
        tabs: [],
        reviewCount: null,
        businessName: null,
        rating: null,
        searchResults: []
    };
    const isSearchPage = window.location.href.includes('/search/');
    // Get business name
    const nameElem = document.querySelector('h1.DUwDvf, [role="main"] h1, h1.fontHeadlineLarge');
    if (nameElem) {
        info.businessName = nameElem.textContent.trim();
    }
    // Get rating
    const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
    if (ratingElem) {
        const ariaLabel = ratingElem.getAttribute('aria-label');
        const match = ariaLabel.match(/([0-9.]+)/);
        if (match) {
            info.rating = parseFloat(match[1]);
        }
    }
    // Get all tabs
    const tabs = document.querySelectorAll('button[role="tab"]');
    tabs.forEach((tab, i) => {
        const text = tab.textContent || '';
        const ariaLabel = tab.getAttribute('aria-label') || '';
        info.tabs.push({
            index: i,
            text: text,
            ariaLabel: ariaLabel
        });
        // Try to extract review count from tabs
        const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
        let match = text.match(reviewPattern);
        if (!match) match = text.match(numberPattern);
        if (!match) match = ariaLabel.match(reviewPattern);
        if (!match) match = ariaLabel.match(numberPattern);
        if (match) {
            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
            if (num > 0 && num < 1000000) {
                info.reviewCount = num;
            }
        }
    });
    // If on search results, try to get review count from search panel
    if (isSearchPage || !info.reviewCount) {
        const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
        // Check all elements
        const allElements = document.querySelectorAll('a, span, div');
        for (let elem of allElements) {
            const text = elem.textContent || '';
            if (text.length > 0 && text.length < 150) {
                const match = text.match(numberPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    if (num > 0 && num < 1000000) {
                        info.searchResults.push({
                            tag: elem.tagName,
                            class: elem.className,
                            text: text,
                            number: num
                        });
                        if (!info.reviewCount) {
                            info.reviewCount = num;
                        }
                    }
                }
            }
        }
    }
    return info;
"""


def build_search_url(query: str = DEFAULT_QUERY) -> str:
    """Return the Google Maps search URL (English UI) for *query*.

    The query is form-encoded, so spaces become ``+`` and reserved
    characters are percent-escaped.
    """
    return (
        "https://www.google.com/maps/search/"
        f"?api=1&query={quote_plus(query)}&hl=en"
    )


def accept_consent(driver) -> bool:
    """Click "Accept all" if the browser is on the Google consent page.

    Args:
        driver: The browser driver created by ``Driver(...)``.

    Returns:
        True when an "accept all" button was found and clicked, False when
        the current URL is not the consent page or no such button exists.
    """
    if 'consent.google.com' not in driver.current_url:
        return False
    for btn in driver.find_elements(By.CSS_SELECTOR, 'form button'):
        # btn.text may be empty/None for icon-only buttons.
        if 'accept all' in (btn.text or '').lower():
            btn.click()
            time.sleep(CONSENT_WAIT_S)
            return True
    return False


def format_report(result: dict) -> str:
    """Render the dict returned by ``EXTRACT_INFO_JS`` as the console report.

    Args:
        result: Mapping with the keys ``businessName``, ``rating``,
            ``reviewCount``, ``tabs`` and ``searchResults``.

    Returns:
        The report text, without a trailing newline. Only the first 10
        search results are listed; class names are cut to 40 characters and
        texts to 100.
    """
    rule = "=" * 80
    lines = [
        rule,
        "BUSINESS INFO:",
        rule,
        f"Name: {result['businessName']}",
        f"Rating: {result['rating']}",
        f"Review Count: {result['reviewCount']}\n",
        rule,
        "TABS FOUND:",
        rule,
    ]
    for tab in result['tabs']:
        lines.append(f"\nTab {tab['index']}:")
        lines.append(f" Text: {tab['text']}")
        lines.append(f" Aria-label: {tab['ariaLabel']}")
    if result['searchResults']:
        lines.append(f"\n{rule}")
        lines.append("SEARCH RESULTS WITH NUMBERS (first 10):")
        lines.append(rule)
        for i, sr in enumerate(result['searchResults'][:10], 1):
            lines.append(f"\n{i}. <{sr['tag']}> class='{sr['class'][:40]}'")
            lines.append(f" Number found: {sr['number']}")
            lines.append(f" Text: {sr['text'][:100]}")
    return "\n".join(lines)


def main(query: str = DEFAULT_QUERY) -> None:
    """Open the Maps search for *query* and print what the page exposes.

    The browser is always shut down, even when navigation or extraction
    raises, so a failed run does not leave a headless Chrome behind.
    """
    driver = Driver(uc=True, headless=True)
    try:
        url = build_search_url(query)
        print(f"Navigating to: {url}")
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT_S)

        # Handle GDPR
        accept_consent(driver)
        time.sleep(SETTLE_WAIT_S)

        print(f"Current URL: {driver.current_url}\n")
        # Check if still on search results or navigated to business page
        is_search_results = '/search/' in driver.current_url
        print(f"On search results page: {is_search_results}\n")

        # Extract info
        result = driver.execute_script(EXTRACT_INFO_JS)
        print(format_report(result))
    finally:
        driver.quit()


if __name__ == "__main__":
    main()