Add ReviewIQ pipeline spec and metadata extraction test
- reviewiq-pipeline-v1-final.md: Earlier pipeline specification
- test_metadata_extraction.py: Test script for metadata extraction

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
398
test_metadata_extraction.py
Normal file
398
test_metadata_extraction.py
Normal file
@@ -0,0 +1,398 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test metadata extraction: category, review topics, about info.
|
||||
Uses robust selectors (aria-labels, roles, jsaction) to avoid breakage.
|
||||
"""
|
||||
import time
|
||||
import json
|
||||
from seleniumbase import Driver
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
# Reference values for the listing under test; the validation step compares
# the scraped metadata against these.
EXPECTED: dict = {
    # Compared for exact equality.
    "name": "R. Fleitas Peluqueros",
    "category": "Barber shop",
    # Compared as a subset: only some of these need to be found.
    "review_topics": ["hair salon", "cutting", "price", "siblings", "beard"],
    "about_sections": ["Accessibility", "Amenities", "Planning", "Payments", "Children"],
}
|
||||
|
||||
# Lower-cased fragments that identify the "accept" button on the consent page.
_CONSENT_ACCEPT_WORDS = ("accept", "aceptar", "alle akzeptieren")

# Reads name, category, rating and review count from the default Overview tab.
_OVERVIEW_JS = """
    var data = {name: null, category: null, rating: null, total_reviews: null};

    // Business name - h1 is stable
    var h1 = document.querySelector('h1');
    if (h1) data.name = h1.textContent.trim();

    // Category - use jsaction attribute (more stable than class)
    var catBtn = document.querySelector('button[jsaction*="category"]');
    if (catBtn) data.category = catBtn.textContent.trim();

    // Fallback: look for button after rating that's not a link
    if (!data.category) {
        var buttons = document.querySelectorAll('button');
        for (var btn of buttons) {
            var text = btn.textContent.trim();
            // Categories are short words, no numbers, not navigation
            if (text && text.length < 50 && !text.match(/^[0-9]/) &&
                !text.match(/review|star|direction|save|share|photo/i)) {
                // Check if it's near the rating area
                var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
                if (parent) {
                    data.category = text;
                    break;
                }
            }
        }
    }

    // Rating and reviews from aria-labels (stable)
    var spans = document.querySelectorAll('span[role="img"]');
    for (var span of spans) {
        var label = span.getAttribute('aria-label') || '';

        // Rating: "4.8 stars"
        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
        if (rMatch && !data.rating) {
            data.rating = parseFloat(rMatch[1].replace(',', '.'));
        }

        // Reviews: "79 reviews"
        var revMatch = label.match(/^([\\d,]+)\\s*review/i);
        if (revMatch && !data.total_reviews) {
            data.total_reviews = parseInt(revMatch[1].replace(/,/g, ''), 10);
        }
    }

    return data;
"""

# Clicks a tab in the place panel. Arguments:
#   arguments[0] - lowercase keyword expected inside the tab text ('review')
#   arguments[1] - lowercase full label used by the text fallback ('reviews')
#   arguments[2] - zero-based tab position (data-tab-index)
# Returns true when a button was clicked, false otherwise.
_CLICK_TAB_JS = """
    var keyword = arguments[0];
    var exactText = arguments[1];
    var tabIndex = arguments[2];
    var ariaFragment = keyword.charAt(0).toUpperCase() + keyword.slice(1);

    // Try multiple selectors for the tab
    var selectors = [
        'button[aria-label*="' + ariaFragment + '"]',
        'button[data-tab-index="' + tabIndex + '"]',
        'div[role="tablist"] button:nth-child(' + (tabIndex + 1) + ')',
        'button[jsaction*="' + keyword + '"]'
    ];

    for (var sel of selectors) {
        var btn = document.querySelector(sel);
        // Positional selectors can hit the wrong tab, so confirm via the text.
        if (btn && btn.textContent.toLowerCase().includes(keyword)) {
            btn.click();
            return true;
        }
    }

    // Fallback: find by text content
    var buttons = document.querySelectorAll('button');
    for (var btn of buttons) {
        if (btn.textContent.trim().toLowerCase() === exactText) {
            btn.click();
            return true;
        }
    }
    return false;
"""

# Collects the review-topic chips shown on the Reviews tab as {topic, count}.
_REVIEW_TOPICS_JS = """
    var topics = [];

    // Primary: use role="radiogroup" with aria-label="Refine reviews"
    var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');

    if (!container) {
        // Fallback: any radiogroup in the reviews area
        container = document.querySelector('div[role="radiogroup"]');
    }

    if (container) {
        var buttons = container.querySelectorAll('button[role="radio"]');
        for (var btn of buttons) {
            var label = btn.getAttribute('aria-label') || '';
            // Parse "hair salon, mentioned in 4 reviews" or just get the topic name
            var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
            if (match) {
                topics.push({
                    topic: match[1].trim(),
                    count: parseInt(match[2], 10)
                });
            } else if (label && !label.toLowerCase().includes('all review')) {
                // Might be in different format
                var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
                var nameSpan = btn.querySelector('.uEubGf, span:first-child');
                if (nameSpan) {
                    var name = nameSpan.textContent.trim();
                    // "|| 0" keeps a non-numeric count from becoming NaN
                    var count = countSpan ? (parseInt(countSpan.textContent, 10) || 0) : 0;
                    if (name && name.toLowerCase() !== 'all') {
                        topics.push({topic: name, count: count});
                    }
                }
            }
        }
    }

    return topics;
"""

# Collects the About tab as {section name: [{feature, available}, ...]}.
_ABOUT_JS = """
    var about = {};

    // Find the about region by aria-label or role
    var container = document.querySelector('div[role="region"][aria-label*="About"]');

    if (!container) {
        // Fallback: look for the scrollable area with sections
        container = document.querySelector('.m6QErb[aria-label*="About"]');
    }

    if (!container) {
        // Last resort: find sections by h2 headers
        container = document;
    }

    // Find all section headers (h2 elements)
    var sections = container.querySelectorAll('h2');

    for (var h2 of sections) {
        var sectionName = h2.textContent.trim();
        var items = [];

        // Find the ul list following this h2
        var parent = h2.closest('.iP2t7d, div');
        if (parent) {
            var listItems = parent.querySelectorAll('li span[aria-label]');
            for (var li of listItems) {
                var label = li.getAttribute('aria-label');
                if (label) {
                    // Parse "Has toilet" or "No wheelchair-accessible car park"
                    var hasFeature = !label.toLowerCase().startsWith('no ');
                    var featureName = label.replace(/^(Has |No )/i, '');
                    items.push({
                        feature: featureName,
                        available: hasFeature
                    });
                }
            }
        }

        if (sectionName && items.length > 0) {
            about[sectionName] = items;
        }
    }

    return about;
"""


def _accept_consent_if_shown(driver, url: str, timeout: float = 5.0) -> None:
    """Dismiss the Google consent page if the browser lands on it.

    Polls ``driver.current_url`` every 10ms for up to *timeout* seconds.
    Returns as soon as a Maps URL is reached or one attempt has been made
    to click an accept button and reload *url*.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        current = driver.current_url
        if "consent.google" in current:
            print(" 🍪 Consent page detected, clicking accept...")
            try:
                for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                    # Read the label before clicking: the click may navigate
                    # away and leave the element reference stale.
                    label = btn.text
                    if any(word in label.lower() for word in _CONSENT_ACCEPT_WORDS):
                        btn.click()
                        print(f" ✅ Clicked: '{label}', reloading...")
                        driver.get(url)
                        break
            except Exception as exc:
                # Best effort: carry on with extraction, but say what failed.
                print(f" ⚠️ Could not dismiss consent page: {exc}")
            return
        if "maps/place" in current or ("maps" in current and "consent" not in current):
            return
        time.sleep(0.01)  # 10ms polling


def _click_tab(driver, keyword: str, exact_text: str, tab_index: int) -> bool:
    """Click the place-panel tab matching *keyword*; return True on success."""
    return bool(driver.execute_script(_CLICK_TAB_JS, keyword, exact_text, tab_index))


def extract_metadata(driver, url: str) -> dict:
    """Extract all business metadata from Google Maps.

    Loads *url* (adding ``hl=en&gl=us`` when no ``hl=`` parameter is present),
    dismisses the consent page if it appears, then reads the Overview,
    Reviews and About tabs in turn. Progress is printed along the way.

    Args:
        driver: Selenium-style driver providing ``get``, ``current_url``,
            ``find_elements`` and ``execute_script``.
        url: Google Maps URL of the business.

    Returns:
        A dict with keys ``name``, ``category``, ``rating``,
        ``total_reviews``, ``review_topics`` (list of
        ``{"topic", "count"}`` dicts) and ``about`` (section name mapped to
        a list of ``{"feature", "available"}`` dicts). Values that could not
        be found keep their defaults (``None``, ``[]`` or ``{}``).
    """
    # Force English so the text and aria-label matching below applies.
    if 'hl=' not in url:
        url = f"{url}{'&' if '?' in url else '?'}hl=en&gl=us"

    print(f" Loading URL: {url[:70]}...")
    driver.get(url)

    _accept_consent_if_shown(driver, url)

    # Wait for page to stabilize
    time.sleep(1)

    result = {
        "name": None,
        "category": None,
        "rating": None,
        "total_reviews": None,
        "review_topics": [],
        "about": {},
    }

    # ========== OVERVIEW TAB (default) ==========
    print("\n📍 Extracting from OVERVIEW tab...")

    # "or {}" guards against the script returning nothing.
    result.update(driver.execute_script(_OVERVIEW_JS) or {})
    print(f" Name: {result['name']}")
    print(f" Category: {result['category']}")
    print(f" Rating: {result['rating']}")
    print(f" Reviews: {result['total_reviews']}")

    # ========== REVIEWS TAB ==========
    print("\n📝 Clicking REVIEWS tab...")

    if _click_tab(driver, "review", "reviews", 1):
        time.sleep(1.5)  # Wait for tab to load

        topics = driver.execute_script(_REVIEW_TOPICS_JS) or []
        result['review_topics'] = topics
        print(f" Found {len(topics)} review topics:")
        for t in topics:
            print(f" - {t['topic']}: {t['count']} mentions")
    else:
        print(" ⚠️ Could not click Reviews tab")

    # ========== ABOUT TAB ==========
    print("\n📋 Clicking ABOUT tab...")

    if _click_tab(driver, "about", "about", 2):
        time.sleep(1.5)  # Wait for tab to load

        about = driver.execute_script(_ABOUT_JS) or {}
        result['about'] = about
        print(f" Found {len(about)} about sections:")
        for section, items in about.items():
            print(f" {section}:")
            for item in items:
                status = "✓" if item['available'] else "✗"
                print(f" {status} {item['feature']}")
    else:
        print(" ⚠️ Could not click About tab")

    return result
|
||||
|
||||
|
||||
def validate_results(result: dict) -> bool:
    """Compare scraped metadata with ``EXPECTED`` and print a report.

    ``name`` and ``category`` must match exactly. For review topics
    (compared case-insensitively) and about sections (compared as-is), at
    least 3 of the expected entries must be present.

    Args:
        result: Metadata dict with ``name`` and ``category`` keys and,
            optionally, ``review_topics`` and ``about``.

    Returns:
        True when every check passes, False otherwise.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("🔍 VALIDATION:")
    print(divider)

    verdicts = []

    # Exact-match fields: name, then category.
    for label, key in (("Name", "name"), ("Category", "category")):
        actual = result[key]
        wanted = EXPECTED[key]
        ok = actual == wanted
        if ok:
            print(f" ✅ {label}: {actual}")
        else:
            print(f" ❌ {label}: got '{actual}', expected '{wanted}'")
        verdicts.append(ok)

    # Review topics: a partial overlap is enough.
    got_topics = [entry['topic'].lower() for entry in result.get('review_topics', [])]
    want_topics = [topic.lower() for topic in EXPECTED['review_topics']]
    topic_hits = sum(1 for topic in want_topics if topic in got_topics)
    topics_ok = topic_hits >= 3  # At least 3 topics should match
    if topics_ok:
        print(f" ✅ Review topics: {topic_hits}/{len(want_topics)} matched")
    else:
        print(f" ❌ Review topics: only {topic_hits}/{len(want_topics)} matched")
        print(f" Expected: {want_topics}")
        print(f" Got: {got_topics}")
    verdicts.append(topics_ok)

    # About sections: likewise, a partial overlap is enough.
    got_sections = list(result.get('about', {}).keys())
    want_sections = EXPECTED['about_sections']
    section_hits = sum(1 for section in want_sections if section in got_sections)
    sections_ok = section_hits >= 3
    if sections_ok:
        print(f" ✅ About sections: {section_hits}/{len(want_sections)} matched")
    else:
        print(f" ❌ About sections: only {section_hits}/{len(want_sections)} matched")
        print(f" Expected: {want_sections}")
        print(f" Got: {got_sections}")
    verdicts.append(sections_ok)

    return all(verdicts)
|
||||
|
||||
|
||||
def main():
    """Run the metadata extraction test against one hard-coded listing.

    Starts a visible browser, extracts the metadata, prints the full result
    as JSON and validates it. The browser stays open briefly afterwards
    (15 seconds on success, 10 after an error) so the page can be inspected,
    and is always closed at the end.
    """
    url = "https://www.google.com/maps/search/?api=1&query=R.+Fleitas+Peluqueros+Gran+Canaria"

    print("🚀 Starting metadata extraction test...")
    print(f" URL: {url[:60]}...")

    driver = Driver(uc=True, headless=False)

    try:
        # Set geolocation. Best effort: the run continues without the
        # override, but the failure is reported instead of hidden.
        # NOTE(review): the coordinates look like a fixed US location - confirm.
        try:
            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
            })
        except Exception as exc:
            print(f" ⚠️ Could not set geolocation: {exc}")

        result = extract_metadata(driver, url)

        print("\n" + "="*60)
        print("📊 FULL RESULT:")
        print("="*60)
        print(json.dumps(result, indent=2, ensure_ascii=False))

        passed = validate_results(result)

        print("\n" + "="*60)
        if passed:
            print("🎉 ALL VALIDATIONS PASSED!")
        else:
            print("⚠️ SOME VALIDATIONS FAILED")
        print("="*60)

        print("\n👀 Browser stays open for 15 seconds...")
        time.sleep(15)

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        time.sleep(10)  # Leave the browser up briefly before it is closed
    finally:
        # Always release the browser, including on Ctrl-C.
        driver.quit()
        print("🔒 Browser closed")
|
||||
|
||||
|
||||
# Run the browser test only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user