#!/usr/bin/env python3
"""
Test metadata extraction: category, review topics, about info.
Uses robust selectors (aria-labels, roles, jsaction) to avoid breakage.
"""

import json
import time
import traceback

from selenium.webdriver.common.by import By
from seleniumbase import Driver

# Expected values for validation
EXPECTED = {
    "name": "R. Fleitas Peluqueros",
    "category": "Barber shop",
    "review_topics": ["hair salon", "cutting", "price", "siblings", "beard"],
    "about_sections": ["Accessibility", "Amenities", "Planning", "Payments", "Children"],
}

# Seconds to poll for the consent redirect before giving up.
_CONSENT_TIMEOUT_S = 5
# Delay between consent polls (10ms, same as production scraper).
_CONSENT_POLL_S = 0.01
# Substrings (lower-case) that identify the "accept" button on the consent page.
_CONSENT_ACCEPT_WORDS = ("accept", "aceptar", "alle akzeptieren")
# Seconds to wait after navigation / tab clicks for the page to settle.
_PAGE_SETTLE_S = 1
_TAB_SETTLE_S = 1.5
# Minimum number of expected topics / sections that must be found to pass.
_MIN_MATCHES = 3

# NOTE: the scripts below are regular (non-raw) strings on purpose: "\\d" in
# Python becomes "\d" in the JavaScript that the browser receives.

_OVERVIEW_JS = """
    var data = {name: null, category: null, rating: null, total_reviews: null};

    // Business name - h1 is stable
    var h1 = document.querySelector('h1');
    if (h1) data.name = h1.textContent.trim();

    // Category - use jsaction attribute (more stable than class)
    var catBtn = document.querySelector('button[jsaction*="category"]');
    if (catBtn) data.category = catBtn.textContent.trim();

    // Fallback: look for button after rating that's not a link
    if (!data.category) {
        var buttons = document.querySelectorAll('button');
        for (var btn of buttons) {
            var text = btn.textContent.trim();
            // Categories are short words, no numbers, not navigation
            if (text && text.length < 50 && !text.match(/^[0-9]/) &&
                !text.match(/review|star|direction|save|share|photo/i)) {
                // Check if it's near the rating area
                var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
                if (parent) {
                    data.category = text;
                    break;
                }
            }
        }
    }

    // Rating and reviews from aria-labels (stable)
    var spans = document.querySelectorAll('span[role="img"]');
    for (var span of spans) {
        var label = span.getAttribute('aria-label') || '';
        // Rating: "4.8 stars"
        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
        if (rMatch && !data.rating) {
            data.rating = parseFloat(rMatch[1].replace(',', '.'));
        }
        // Reviews: "79 reviews"
        var revMatch = label.match(/^([\\d,]+)\\s*review/i);
        if (revMatch && !data.total_reviews) {
            data.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
        }
    }

    return data;
"""

# Arguments: [0] aria-label fragment, [1] zero-based tab index,
# [2] lower-case keyword the button text must contain,
# [3] lower-case exact button text used by the last-resort fallback.
_CLICK_TAB_JS = """
    var ariaFragment = arguments[0];
    var tabIndex = arguments[1];
    var keyword = arguments[2];
    var exactText = arguments[3];

    // Try multiple selectors for the tab
    var selectors = [
        'button[aria-label*="' + ariaFragment + '"]',
        'button[data-tab-index="' + tabIndex + '"]',
        'div[role="tablist"] button:nth-child(' + (tabIndex + 1) + ')',
        'button[jsaction*="' + keyword + '"]'
    ];

    for (var sel of selectors) {
        var btn = document.querySelector(sel);
        if (btn && btn.textContent.toLowerCase().includes(keyword)) {
            btn.click();
            return true;
        }
    }

    // Fallback: find by text content
    var buttons = document.querySelectorAll('button');
    for (var btn of buttons) {
        if (btn.textContent.trim().toLowerCase() === exactText) {
            btn.click();
            return true;
        }
    }
    return false;
"""

_REVIEW_TOPICS_JS = """
    var topics = [];

    // Primary: use role="radiogroup" with aria-label="Refine reviews"
    var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');

    if (!container) {
        // Fallback: any radiogroup in the reviews area
        container = document.querySelector('div[role="radiogroup"]');
    }

    if (container) {
        var buttons = container.querySelectorAll('button[role="radio"]');
        for (var btn of buttons) {
            var label = btn.getAttribute('aria-label') || '';
            // Parse "hair salon, mentioned in 4 reviews" or just get the topic name
            var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
            if (match) {
                topics.push({
                    topic: match[1].trim(),
                    count: parseInt(match[2])
                });
            } else if (label && !label.toLowerCase().includes('all review')) {
                // Might be in different format
                var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
                var nameSpan = btn.querySelector('.uEubGf, span:first-child');
                if (nameSpan) {
                    var name = nameSpan.textContent.trim();
                    var count = countSpan ? parseInt(countSpan.textContent) : 0;
                    if (name && name.toLowerCase() !== 'all') {
                        topics.push({topic: name, count: count});
                    }
                }
            }
        }
    }

    return topics;
"""

_ABOUT_JS = """
    var about = {};

    // Find the about region by aria-label or role
    var container = document.querySelector('div[role="region"][aria-label*="About"]');
    if (!container) {
        // Fallback: look for the scrollable area with sections
        container = document.querySelector('.m6QErb[aria-label*="About"]');
    }
    if (!container) {
        // Last resort: find sections by h2 headers
        container = document;
    }

    // Find all section headers (h2 elements)
    var sections = container.querySelectorAll('h2');
    for (var h2 of sections) {
        var sectionName = h2.textContent.trim();
        var items = [];

        // Find the ul list following this h2
        var parent = h2.closest('.iP2t7d, div');
        if (parent) {
            var listItems = parent.querySelectorAll('li span[aria-label]');
            for (var li of listItems) {
                var label = li.getAttribute('aria-label');
                if (label) {
                    // Parse "Has toilet" or "No wheelchair-accessible car park"
                    var hasFeature = !label.toLowerCase().startsWith('no ');
                    var featureName = label.replace(/^(Has |No )/i, '');
                    items.push({
                        feature: featureName,
                        available: hasFeature
                    });
                }
            }
        }

        if (sectionName && items.length > 0) {
            about[sectionName] = items;
        }
    }

    return about;
"""


def _force_english(url: str) -> str:
    """Append ``hl=en&gl=us`` to *url* unless it already carries an ``hl=`` parameter."""
    if 'hl=' in url:
        return url
    separator = '&' if '?' in url else '?'
    return f"{url}{separator}hl=en&gl=us"


def _accept_consent(driver, url: str) -> None:
    """Click the consent page's accept button and reload *url* (best effort)."""
    print(" šŸŖ Consent page detected, clicking accept...")
    try:
        for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
            # Read the label BEFORE clicking: the click navigates away, after
            # which the element is stale and reading .text would raise,
            # skipping the reload below.
            label = btn.text
            lowered = label.lower()
            if any(word in lowered for word in _CONSENT_ACCEPT_WORDS):
                btn.click()
                print(f" āœ… Clicked: '{label}', reloading...")
                driver.get(url)
                break
    except Exception as exc:
        # Consent handling is best effort; report and carry on with
        # whatever page is currently loaded.
        print(f" Consent handling failed: {exc}")


def _handle_consent(driver, url: str) -> None:
    """Poll briefly for a consent redirect and dismiss it if one shows up."""
    deadline = time.monotonic() + _CONSENT_TIMEOUT_S
    while time.monotonic() < deadline:
        current = driver.current_url
        if "consent.google" in current:
            _accept_consent(driver, url)
            return
        if "maps/place" in current or ("maps" in current and "consent" not in current):
            return
        time.sleep(_CONSENT_POLL_S)  # 10ms polling


def _click_tab(driver, aria_fragment: str, tab_index: int, keyword: str, exact_text: str) -> bool:
    """Click a place-page tab; return True when a matching button was clicked."""
    return bool(driver.execute_script(_CLICK_TAB_JS, aria_fragment, tab_index, keyword, exact_text))


def extract_metadata(driver, url: str) -> dict:
    """Extract all business metadata from Google Maps.

    Args:
        driver: Browser driver exposing ``get``, ``current_url``,
            ``find_elements`` and ``execute_script``.
        url: Google Maps URL of the business; ``hl=en&gl=us`` is appended
            when no ``hl=`` parameter is present.

    Returns:
        A dict with keys ``name``, ``category``, ``rating``,
        ``total_reviews``, ``review_topics`` (list of
        ``{"topic", "count"}`` dicts) and ``about`` (section name mapped to
        a list of ``{"feature", "available"}`` dicts). Values that could not
        be found stay ``None`` or empty.
    """
    # Force English
    url = _force_english(url)

    print(f" Loading URL: {url[:70]}...")
    driver.get(url)

    # Handle consent popup - poll with 10ms sleep (same as production scraper)
    _handle_consent(driver, url)

    # Wait for page to stabilize
    time.sleep(_PAGE_SETTLE_S)

    result = {
        "name": None,
        "category": None,
        "rating": None,
        "total_reviews": None,
        "review_topics": [],
        "about": {},
    }

    # ========== OVERVIEW TAB (default) ==========
    print("\nšŸ“ Extracting from OVERVIEW tab...")
    result.update(driver.execute_script(_OVERVIEW_JS) or {})
    print(f" Name: {result['name']}")
    print(f" Category: {result['category']}")
    print(f" Rating: {result['rating']}")
    print(f" Reviews: {result['total_reviews']}")

    # ========== REVIEWS TAB ==========
    print("\nšŸ“ Clicking REVIEWS tab...")
    if _click_tab(driver, "Review", 1, "review", "reviews"):
        time.sleep(_TAB_SETTLE_S)  # Wait for tab to load

        # Extract review topics from radiogroup (very stable selector)
        topics = driver.execute_script(_REVIEW_TOPICS_JS) or []
        result['review_topics'] = topics
        print(f" Found {len(topics)} review topics:")
        for t in topics:
            print(f" - {t['topic']}: {t['count']} mentions")
    else:
        print(" āš ļø Could not click Reviews tab")

    # ========== ABOUT TAB ==========
    print("\nšŸ“‹ Clicking ABOUT tab...")
    if _click_tab(driver, "About", 2, "about", "about"):
        time.sleep(_TAB_SETTLE_S)  # Wait for tab to load

        # Extract about sections using aria-label and role (stable)
        about = driver.execute_script(_ABOUT_JS) or {}
        result['about'] = about
        print(f" Found {len(about)} about sections:")
        for section, items in about.items():
            print(f" {section}:")
            for item in items:
                status = "āœ“" if item['available'] else "āœ—"
                print(f" {status} {item['feature']}")
    else:
        print(" āš ļø Could not click About tab")

    return result


def _check_exact(label: str, got, expected) -> bool:
    """Print and return whether *got* equals *expected* for the field *label*."""
    if got == expected:
        print(f" āœ… {label}: {got}")
        return True
    print(f" āŒ {label}: got '{got}', expected '{expected}'")
    return False


def _check_overlap(label: str, expected: list, got: list) -> bool:
    """Print and return whether at least ``_MIN_MATCHES`` expected items are in *got*."""
    present = set(got)
    matched = [item for item in expected if item in present]
    if len(matched) >= _MIN_MATCHES:
        print(f" āœ… {label}: {len(matched)}/{len(expected)} matched")
        return True
    print(f" āŒ {label}: only {len(matched)}/{len(expected)} matched")
    print(f" Expected: {expected}")
    print(f" Got: {got}")
    return False


def validate_results(result: dict) -> bool:
    """Validate extracted data against expected values.

    Name and category must match ``EXPECTED`` exactly. Review topics
    (compared case-insensitively) and about-section names (compared
    case-sensitively) pass when at least three expected entries are present.

    Args:
        result: Dict in the shape returned by ``extract_metadata``.

    Returns:
        True when every check passed, False otherwise. Each check prints
        its own outcome.
    """
    print("\n" + "="*60)
    print("šŸ” VALIDATION:")
    print("="*60)

    extracted_topics = [t['topic'].lower() for t in result.get('review_topics', [])]
    expected_topics = [t.lower() for t in EXPECTED['review_topics']]
    about_sections = list(result.get('about', {}))

    # Run every check (no short-circuit) so all outcomes are reported.
    checks = [
        _check_exact("Name", result.get('name'), EXPECTED['name']),
        _check_exact("Category", result.get('category'), EXPECTED['category']),
        # At least some topics / sections should match.
        _check_overlap("Review topics", expected_topics, extracted_topics),
        _check_overlap("About sections", EXPECTED['about_sections'], about_sections),
    ]
    return all(checks)


def main():
    """Run the extraction against the reference business and print a report."""
    url = "https://www.google.com/maps/search/?api=1&query=R.+Fleitas+Peluqueros+Gran+Canaria"

    print("šŸš€ Starting metadata extraction test...")
    print(f" URL: {url[:60]}...")

    driver = Driver(uc=True, headless=False)

    try:
        # Set geolocation
        try:
            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                'latitude': 42.3601,
                'longitude': -71.0589,
                'accuracy': 100,
            })
        except Exception as exc:
            # Best effort: the test still runs without the override.
            print(f" Could not set geolocation override: {exc}")

        result = extract_metadata(driver, url)

        print("\n" + "="*60)
        print("šŸ“Š FULL RESULT:")
        print("="*60)
        print(json.dumps(result, indent=2, ensure_ascii=False))

        passed = validate_results(result)

        print("\n" + "="*60)
        if passed:
            print("šŸŽ‰ ALL VALIDATIONS PASSED!")
        else:
            print("āš ļø SOME VALIDATIONS FAILED")
        print("="*60)

        print("\nšŸ‘€ Browser stays open for 15 seconds...")
        time.sleep(15)

    except Exception as e:
        print(f"\nāŒ Error: {e}")
        traceback.print_exc()
        # Keep the browser open briefly so the failing page can be inspected.
        time.sleep(10)
    finally:
        driver.quit()
        print("šŸ”’ Browser closed")


if __name__ == "__main__":
    main()