# whyrating-engine-legacy/debug_business_card.py

#!/usr/bin/env python3
"""
Debug script to inspect the actual HTML structure on Google Maps search results.
This will help us identify where the review count is located in the DOM.
"""
import time
from seleniumbase import Driver
from selenium.webdriver.common.by import By

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Google Maps search URL for the business under investigation.
SEARCH_URL = "https://www.google.com/maps/search/?api=1&query=instinto+las+palmas&hl=en"

# Literal text looked for in the DOM. Presumably the review count currently
# shown for this business -- TODO confirm and update if the listing changes.
EXPECTED_COUNT_TEXT = "152"

# Lower-cased button labels that accept the Google consent dialog.
CONSENT_ACCEPT_PHRASES = ("accept all", "aceptar todo")

SETTLE_SECONDS = 3            # pause after navigation and after a consent click
FINAL_LOAD_SECONDS = 5        # extra pause before inspecting the page
PAGE_TEXT_PREVIEW_CHARS = 3000
SEPARATOR = "=" * 80

# Collects up to 50 elements whose rendered text contains the needle passed as
# arguments[0] or looks like "<number> review".
FIND_CANDIDATES_SCRIPT = """
    const needle = arguments[0];
    const results = [];
    const allElements = document.querySelectorAll('*');

    for (let elem of allElements) {
        // innerText covers the element AND its descendants, so ancestors of a
        // match are reported too; the length cap filters out big containers.
        const ownText = elem.innerText || '';

        if (ownText && ownText.length < 200 && (ownText.includes(needle) || /\\d+\\s*review/i.test(ownText))) {
            results.push({
                tag: elem.tagName,
                class: elem.className,
                id: elem.id,
                text: ownText.substring(0, 100),
                href: elem.href || null,
                role: elem.getAttribute('role'),
                ariaLabel: elem.getAttribute('aria-label')
            });
        }
    }

    return results.slice(0, 50);  // First 50 matches
"""

# The script's output labels this the "actual extraction script"; it is kept
# unchanged on purpose so the debug run shows what that logic finds.
# NOTE(review): presumably mirrors the production extractor -- keep in sync.
EXTRACT_SCRIPT = """
const info = {
    name: null,
    address: null,
    rating: null,
    total_reviews: null,
    debug_info: []
};

// Extract business name
const nameSelectors = [
    'h1.DUwDvf',
    '[role="main"] h1',
    'h1.fontHeadlineLarge'
];

for (const selector of nameSelectors) {
    const elem = document.querySelector(selector);
    if (elem && elem.textContent) {
        info.name = elem.textContent.trim();
        info.debug_info.push(`Found name via: ${selector}`);
        break;
    }
}

// Extract rating
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
if (ratingElem) {
    const ariaLabel = ratingElem.getAttribute('aria-label');
    const match = ariaLabel.match(/([0-9.]+)/);
    if (match) {
        info.rating = parseFloat(match[1]);
        info.debug_info.push(`Found rating: ${info.rating} from aria-label: ${ariaLabel}`);
    }
}

// Extract total review count
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

// Check search panel selectors
const searchPanelSelectors = [
    'a[href*="reviews"]',
    'button[jsaction*="reviews"]',
    'div[role="link"]',
];

for (const selector of searchPanelSelectors) {
    const elements = document.querySelectorAll(selector);
    info.debug_info.push(`Checking ${selector}: found ${elements.length} elements`);

    for (let elem of elements) {
        const text = elem.textContent || '';
        if (text.length < 200) {
            info.debug_info.push(`  - text: "${text.substring(0, 100)}"`);
        }

        const match = text.match(numberPattern);
        if (match) {
            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
            if (num > 0 && num < 1000000) {
                info.total_reviews = num;
                info.debug_info.push(`  ✓ FOUND via ${selector}: ${num}`);
                break;
            }
        }
    }
    if (info.total_reviews) break;
}

// If not found, try all spans/divs
if (!info.total_reviews) {
    const allElements = document.querySelectorAll('span, div, a');
    info.debug_info.push(`Checking all spans/divs/links: ${allElements.length} elements`);

    let checked = 0;
    for (let elem of allElements) {
        const text = elem.textContent || '';
        if (text.length < 100) {
            const match = text.match(numberPattern);
            if (match) {
                checked++;
                if (checked <= 10) {  // Log first 10 matches
                    info.debug_info.push(`  - potential match: "${text.substring(0, 80)}"`);
                }

                const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                if (num > 0 && num < 1000000) {
                    info.total_reviews = num;
                    info.debug_info.push(`  ✓ FOUND via all elements: ${num} from "${text.substring(0, 80)}"`);
                    break;
                }
            }
        }
    }
}

return info;
"""


# ---------------------------------------------------------------------------
# Pure formatting helpers
# ---------------------------------------------------------------------------

def clip(value, limit: int) -> str:
    """Return *value* as text truncated to *limit* characters.

    ``None`` becomes an empty string and non-string values are converted with
    ``str`` first, so an unexpected value from the browser cannot break the
    report with a slicing error.
    """
    if value is None:
        return ""
    return str(value)[:limit]


def describe_match(index: int, elem: dict) -> list:
    """Build the report lines for one element found by the DOM search.

    Args:
        index: 1-based position of the element in the result list.
        elem: Mapping with the keys produced by ``FIND_CANDIDATES_SCRIPT``
            (``tag``, ``class``, ``id``, ``text``, ``href``, ``role``,
            ``ariaLabel``). Missing or empty optional keys are skipped.

    Returns:
        The lines to print, in order. The first line starts with a newline so
        consecutive entries are separated by a blank line.
    """
    lines = [
        f"\n{index}. <{elem.get('tag')}> "
        f"class='{clip(elem.get('class'), 50)}' "
        f"id='{elem.get('id') or ''}'"
    ]
    if elem.get('role'):
        lines.append(f"   role: {elem['role']}")
    if elem.get('ariaLabel'):
        lines.append(f"   aria-label: {clip(elem['ariaLabel'], 100)}")
    if elem.get('href'):
        lines.append(f"   href: {clip(elem['href'], 100)}")
    lines.append(f"   text: {elem.get('text', '')}")
    return lines


def _print_banner(title: str) -> None:
    """Print *title* framed by separator lines, preceded by a blank line."""
    print("\n" + SEPARATOR)
    print(title)
    print(SEPARATOR)


# ---------------------------------------------------------------------------
# Browser steps (each takes the object returned by ``Driver(...)``)
# ---------------------------------------------------------------------------

def _handle_gdpr_consent(driver) -> None:
    """Dismiss the Google consent page if the browser was redirected to it.

    Clicks the first form button whose label contains one of
    ``CONSENT_ACCEPT_PHRASES``; if none matches and at least two buttons
    exist, clicks the second one. Any error is printed and swallowed: this is
    best effort and the rest of the debug run should still proceed.
    """
    if 'consent.google.com' not in driver.current_url:
        return

    print("Handling GDPR consent...")
    try:
        form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
        for btn in form_btns:
            label = btn.text or ''
            if any(phrase in label.lower() for phrase in CONSENT_ACCEPT_PHRASES):
                print(f"Clicking: {label}")
                btn.click()
                time.sleep(SETTLE_SECONDS)
                return

        if len(form_btns) >= 2:
            print("Using fallback - clicking second button")
            form_btns[1].click()
            time.sleep(SETTLE_SECONDS)
        else:
            print("No consent button found")
    except Exception as e:  # deliberate best effort; report and continue
        print(f"GDPR handling error: {e}")


def _dump_page_text(driver) -> None:
    """Print the beginning of the page's rendered text."""
    all_text = driver.execute_script("return document.body.innerText;")
    _print_banner(f"ALL TEXT ON PAGE (first {PAGE_TEXT_PREVIEW_CHARS} chars):")
    # execute_script yields None when the JS value is null/undefined.
    print((all_text or "")[:PAGE_TEXT_PREVIEW_CHARS])


def _dump_candidate_elements(driver) -> None:
    """Print every element whose text contains the needle or '<n> review'."""
    _print_banner(
        f"SEARCHING FOR ELEMENTS CONTAINING '{EXPECTED_COUNT_TEXT}' OR 'review':"
    )
    matches = driver.execute_script(FIND_CANDIDATES_SCRIPT, EXPECTED_COUNT_TEXT)
    for i, elem in enumerate(matches or [], 1):
        print("\n".join(describe_match(i, elem)))


def _run_extraction(driver) -> None:
    """Run ``EXTRACT_SCRIPT`` in the page and print its result and trace."""
    _print_banner("RUNNING ACTUAL EXTRACTION SCRIPT:")
    result = driver.execute_script(EXTRACT_SCRIPT) or {}

    print("\nExtracted Info:")
    print(f"  Name: {result.get('name')}")
    print(f"  Rating: {result.get('rating')}")
    print(f"  Total Reviews: {result.get('total_reviews')}")

    print("\nDebug Info:")
    for debug_line in result.get('debug_info', []):
        print(f"  {debug_line}")


def main() -> None:
    """Open the Maps search, dump diagnostics, and always close the browser."""
    print("Starting Chrome...")
    driver = Driver(
        uc=True,
        headless=True,
        page_load_strategy="normal",
    )
    try:
        print(f"\nNavigating to: {SEARCH_URL}")
        driver.get(SEARCH_URL)
        time.sleep(SETTLE_SECONDS)

        _handle_gdpr_consent(driver)

        print("\nWaiting for page to fully load...")
        time.sleep(FINAL_LOAD_SECONDS)
        print(f"\nCurrent URL: {driver.current_url}")

        _dump_page_text(driver)
        _dump_candidate_elements(driver)
        _run_extraction(driver)

        _print_banner("Done! Closing browser.")
    finally:
        # Quit even when a step above raised, so no Chrome process is leaked.
        driver.quit()


if __name__ == "__main__":
    main()