Add ReviewIQ pipeline spec and metadata extraction test

- reviewiq-pipeline-v1-final.md: Earlier pipeline specification
- test_metadata_extraction.py: Test script for metadata extraction

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 11:21:33 +00:00
parent 59368a5bd5
commit 3da243be79
2 changed files with 1390 additions and 0 deletions

398
test_metadata_extraction.py Normal file
View File

@@ -0,0 +1,398 @@
#!/usr/bin/env python3
"""
Test metadata extraction: category, review topics, about info.
Uses robust selectors (aria-labels, roles, jsaction) to avoid breakage.
"""
import time
import json
from seleniumbase import Driver
from selenium.webdriver.common.by import By
# Reference values for the business under test; validate_results() compares
# the scraped metadata against these.
EXPECTED = {
    # Exact-match fields.
    "name": "R. Fleitas Peluqueros",
    "category": "Barber shop",
    # Topic chips expected on the Reviews tab (compared case-insensitively).
    "review_topics": [
        "hair salon",
        "cutting",
        "price",
        "siblings",
        "beard",
    ],
    # Section headings expected on the About tab.
    "about_sections": [
        "Accessibility",
        "Amenities",
        "Planning",
        "Payments",
        "Children",
    ],
}
# Lower-cased fragments that identify the "accept" button on the consent page.
_CONSENT_ACCEPT_WORDS = ("accept", "aceptar", "alle akzeptieren")

# Reads name, category, rating and review count from the Overview tab.
_OVERVIEW_JS = """
    var data = {name: null, category: null, rating: null, total_reviews: null};

    // Business name - h1 is stable
    var h1 = document.querySelector('h1');
    if (h1) data.name = h1.textContent.trim();

    // Category - use jsaction attribute (more stable than class)
    var catBtn = document.querySelector('button[jsaction*="category"]');
    if (catBtn) data.category = catBtn.textContent.trim();

    // Fallback: look for button after rating that's not a link
    if (!data.category) {
        var buttons = document.querySelectorAll('button');
        for (var btn of buttons) {
            var text = btn.textContent.trim();
            // Categories are short words, no numbers, not navigation
            if (text && text.length < 50 && !text.match(/^[0-9]/) &&
                !text.match(/review|star|direction|save|share|photo/i)) {
                // Check if it's near the rating area
                var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
                if (parent) {
                    data.category = text;
                    break;
                }
            }
        }
    }

    // Rating and reviews from aria-labels (stable)
    var spans = document.querySelectorAll('span[role="img"]');
    for (var span of spans) {
        var label = span.getAttribute('aria-label') || '';
        // Rating: "4.8 stars"
        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
        if (rMatch && !data.rating) {
            data.rating = parseFloat(rMatch[1].replace(',', '.'));
        }
        // Reviews: "79 reviews"
        var revMatch = label.match(/^([\\d,]+)\\s*review/i);
        if (revMatch && !data.total_reviews) {
            data.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
        }
    }
    return data;
"""

# Clicks a tab button. arguments[0] = label fragment as it appears in the
# aria-label (e.g. "Review"), arguments[1] = zero-based tab index,
# arguments[2] = exact lower-cased button text used by the last-resort match.
_CLICK_TAB_JS = """
    var label = arguments[0];
    var tabIndex = arguments[1];
    var exactText = arguments[2];
    var keyword = label.toLowerCase();

    // Try multiple selectors for the tab
    var selectors = [
        'button[aria-label*="' + label + '"]',
        'button[data-tab-index="' + tabIndex + '"]',
        'div[role="tablist"] button:nth-child(' + (tabIndex + 1) + ')',
        'button[jsaction*="' + keyword + '"]'
    ];
    for (var sel of selectors) {
        var btn = document.querySelector(sel);
        if (btn && btn.textContent.toLowerCase().includes(keyword)) {
            btn.click();
            return true;
        }
    }

    // Fallback: find by text content
    var buttons = document.querySelectorAll('button');
    for (var candidate of buttons) {
        if (candidate.textContent.trim().toLowerCase() === exactText) {
            candidate.click();
            return true;
        }
    }
    return false;
"""

# Collects the "refine reviews" topic chips as [{topic, count}, ...].
_REVIEW_TOPICS_JS = """
    var topics = [];

    // Primary: use role="radiogroup" with aria-label="Refine reviews"
    var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
    if (!container) {
        // Fallback: any radiogroup in the reviews area
        container = document.querySelector('div[role="radiogroup"]');
    }

    if (container) {
        var buttons = container.querySelectorAll('button[role="radio"]');
        for (var btn of buttons) {
            var label = btn.getAttribute('aria-label') || '';
            // Parse "hair salon, mentioned in 4 reviews" or just get the topic name
            var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
            if (match) {
                topics.push({
                    topic: match[1].trim(),
                    count: parseInt(match[2])
                });
            } else if (label && !label.toLowerCase().includes('all review')) {
                // Might be in different format
                var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
                var nameSpan = btn.querySelector('.uEubGf, span:first-child');
                if (nameSpan) {
                    var name = nameSpan.textContent.trim();
                    var count = countSpan ? parseInt(countSpan.textContent) : 0;
                    if (name && name.toLowerCase() !== 'all') {
                        topics.push({topic: name, count: count});
                    }
                }
            }
        }
    }
    return topics;
"""

# Collects About-tab sections as {heading: [{feature, available}, ...]}.
_ABOUT_JS = """
    var about = {};

    // Find the about region by aria-label or role
    var container = document.querySelector('div[role="region"][aria-label*="About"]');
    if (!container) {
        // Fallback: look for the scrollable area with sections
        container = document.querySelector('.m6QErb[aria-label*="About"]');
    }
    if (!container) {
        // Last resort: find sections by h2 headers
        container = document;
    }

    // Find all section headers (h2 elements)
    var sections = container.querySelectorAll('h2');
    for (var h2 of sections) {
        var sectionName = h2.textContent.trim();
        var items = [];

        // Find the ul list following this h2
        var parent = h2.closest('.iP2t7d, div');
        if (parent) {
            var listItems = parent.querySelectorAll('li span[aria-label]');
            for (var li of listItems) {
                var label = li.getAttribute('aria-label');
                if (label) {
                    // Parse "Has toilet" or "No wheelchair-accessible car park"
                    var hasFeature = !label.toLowerCase().startsWith('no ');
                    var featureName = label.replace(/^(Has |No )/i, '');
                    items.push({
                        feature: featureName,
                        available: hasFeature
                    });
                }
            }
        }

        if (sectionName && items.length > 0) {
            about[sectionName] = items;
        }
    }
    return about;
"""


def _dismiss_consent(driver, url: str, timeout: float = 5.0) -> None:
    """Poll until the Maps page is showing, accepting the consent page if it appears."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        current = driver.current_url
        if "consent.google" in current:
            print(" 🍪 Consent page detected, clicking accept...")
            try:
                for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
                    # Read the text BEFORE clicking: the click may navigate
                    # away, after which the element reference can go stale.
                    label = btn.text
                    lowered = label.lower()
                    if any(word in lowered for word in _CONSENT_ACCEPT_WORDS):
                        btn.click()
                        print(f" ✅ Clicked: '{label}', reloading...")
                        driver.get(url)
                        break
            except Exception as exc:
                # Best effort: report the problem and continue with whatever
                # page is loaded instead of failing the whole extraction.
                print(f" ⚠️ Could not dismiss consent page: {exc}")
            return
        if "maps/place" in current or ("maps" in current and "consent" not in current):
            return
        time.sleep(0.01)  # 10ms polling


def _click_tab(driver, label: str, tab_index: int, exact_text: str) -> bool:
    """Click the tab identified by *label* / *tab_index*; return True if a button was clicked."""
    return bool(driver.execute_script(_CLICK_TAB_JS, label, tab_index, exact_text))


def extract_metadata(driver, url: str, *, settle_delay: float = 1.0,
                     tab_delay: float = 1.5) -> dict:
    """Extract business metadata from a Google Maps place page.

    Loads *url* (forcing the English UI unless the URL already carries an
    ``hl=`` parameter), dismisses the consent page if one is shown, then reads
    the Overview, Reviews and About tabs through injected JavaScript.

    Args:
        driver: Selenium-style driver; must provide ``get``, ``current_url``,
            ``find_elements`` and ``execute_script``.
        url: Google Maps URL of the business.
        settle_delay: Seconds to wait after navigation before scraping.
        tab_delay: Seconds to wait after clicking a tab for it to load.

    Returns:
        A dict with keys ``name``, ``category``, ``rating``, ``total_reviews``,
        ``review_topics`` (list of ``{"topic", "count"}`` dicts) and ``about``
        (section heading -> list of ``{"feature", "available"}`` dicts).
        Fields that could not be extracted keep their empty defaults.
    """
    # Force English so the aria-label / text matching in the scripts applies.
    if 'hl=' not in url:
        url = f"{url}{'&' if '?' in url else '?'}hl=en&gl=us"

    print(f" Loading URL: {url[:70]}...")
    driver.get(url)

    _dismiss_consent(driver, url)

    # Wait for page to stabilize
    time.sleep(settle_delay)

    result = {
        "name": None,
        "category": None,
        "rating": None,
        "total_reviews": None,
        "review_topics": [],
        "about": {},
    }

    # ========== OVERVIEW TAB (default) ==========
    print("\n📍 Extracting from OVERVIEW tab...")
    overview_data = driver.execute_script(_OVERVIEW_JS)
    # The script can come back empty if the page failed to render; keep the
    # defaults rather than raising TypeError on update(None).
    result.update(overview_data or {})

    print(f" Name: {result['name']}")
    print(f" Category: {result['category']}")
    print(f" Rating: {result['rating']}")
    print(f" Reviews: {result['total_reviews']}")

    # ========== REVIEWS TAB ==========
    print("\n📝 Clicking REVIEWS tab...")
    if _click_tab(driver, "Review", 1, "reviews"):
        time.sleep(tab_delay)  # Wait for tab to load
        topics = driver.execute_script(_REVIEW_TOPICS_JS) or []
        result['review_topics'] = topics
        print(f" Found {len(topics)} review topics:")
        for t in topics:
            print(f" - {t['topic']}: {t['count']} mentions")
    else:
        print(" ⚠️ Could not click Reviews tab")

    # ========== ABOUT TAB ==========
    print("\n📋 Clicking ABOUT tab...")
    if _click_tab(driver, "About", 2, "about"):
        time.sleep(tab_delay)  # Wait for tab to load
        about = driver.execute_script(_ABOUT_JS) or {}
        result['about'] = about
        print(f" Found {len(about)} about sections:")
        for section, items in about.items():
            print(f" {section}:")
            for item in items:
                # Distinct markers so availability is visible in the log.
                status = "✅" if item['available'] else "❌"
                print(f" {status} {item['feature']}")
    else:
        print(" ⚠️ Could not click About tab")

    return result
def validate_results(result: dict, expected=None, min_matches: int = 3) -> bool:
    """Validate extracted metadata against expected values and print a report.

    Name and category must match exactly. Review topics (compared
    case-insensitively) and About section headings (compared exactly) pass
    when at least ``min_matches`` of the expected entries are present.

    Args:
        result: Metadata dict as produced by ``extract_metadata``. Missing or
            ``None`` fields are reported as failures rather than raising.
        expected: Dict with ``name``, ``category``, ``review_topics`` and
            ``about_sections``; defaults to the module-level ``EXPECTED``.
        min_matches: Minimum number of expected topics / sections that must be
            found. Capped at the number of expected entries so a short
            expectation list can still pass.

    Returns:
        True if every check passed, False otherwise.
    """
    if expected is None:
        expected = EXPECTED

    print("\n" + "=" * 60)
    print("🔍 VALIDATION:")
    print("=" * 60)

    all_passed = True

    # Exact-match fields. Use .get so a partial result is reported as a
    # failure instead of raising KeyError.
    for key, title in (("name", "Name"), ("category", "Category")):
        got = result.get(key)
        want = expected[key]
        if got == want:
            print(f" ✅ {title}: {got}")
        else:
            print(f" ❌ {title}: got '{got}', expected '{want}'")
            all_passed = False

    # Check review topics (at least some should match)
    extracted_topics = [
        (entry.get("topic") or "").lower()
        for entry in (result.get("review_topics") or [])
    ]
    expected_topics = [t.lower() for t in expected["review_topics"]]
    seen_topics = set(extracted_topics)
    matching = [t for t in expected_topics if t in seen_topics]
    if len(matching) >= min(min_matches, len(expected_topics)):
        print(f" ✅ Review topics: {len(matching)}/{len(expected_topics)} matched")
    else:
        print(f" ❌ Review topics: only {len(matching)}/{len(expected_topics)} matched")
        print(f" Expected: {expected_topics}")
        print(f" Got: {extracted_topics}")
        all_passed = False

    # Check about sections (at least some should be present)
    about_sections = list(result.get("about") or {})
    expected_sections = expected["about_sections"]
    matching_sections = [s for s in expected_sections if s in about_sections]
    if len(matching_sections) >= min(min_matches, len(expected_sections)):
        print(f" ✅ About sections: {len(matching_sections)}/{len(expected_sections)} matched")
    else:
        print(f" ❌ About sections: only {len(matching_sections)}/{len(expected_sections)} matched")
        print(f" Expected: {expected_sections}")
        print(f" Got: {about_sections}")
        all_passed = False

    return all_passed
def main():
    """Run the metadata extraction against a live Google Maps page and report.

    Opens a visible (non-headless) browser, extracts the metadata, prints the
    full result as JSON, validates it, and keeps the browser open briefly for
    manual inspection. The browser is always closed on exit.
    """
    url = "https://www.google.com/maps/search/?api=1&query=R.+Fleitas+Peluqueros+Gran+Canaria"

    print("🚀 Starting metadata extraction test...")
    print(f" URL: {url[:60]}...")

    driver = Driver(uc=True, headless=False)
    try:
        # Set geolocation. Best effort: the override is optional, so a failure
        # is reported and the run continues.
        # NOTE(review): presumably a fixed US location is chosen to match the
        # hl=en&gl=us locale forced by extract_metadata — confirm.
        try:
            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
                'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
            })
        except Exception as exc:
            # Narrow from a bare except so Ctrl-C / SystemExit still propagate.
            print(f" ⚠️ Could not override geolocation: {exc}")

        result = extract_metadata(driver, url)

        print("\n" + "=" * 60)
        print("📊 FULL RESULT:")
        print("=" * 60)
        print(json.dumps(result, indent=2, ensure_ascii=False))

        passed = validate_results(result)

        print("\n" + "=" * 60)
        if passed:
            print("🎉 ALL VALIDATIONS PASSED!")
        else:
            print("⚠️ SOME VALIDATIONS FAILED")
        print("=" * 60)

        print("\n👀 Browser stays open for 15 seconds...")
        time.sleep(15)
    except Exception as e:
        # Top-level boundary: show the failure, then pause so the browser
        # state can be inspected before it is closed.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        time.sleep(10)
    finally:
        driver.quit()
        print("🔒 Browser closed")
# Run the live-browser check only when executed as a script, not on import.
if __name__ == "__main__":
    main()