Add ReviewIQ pipeline spec and metadata extraction test

- reviewiq-pipeline-v1-final.md: Earlier pipeline specification
- test_metadata_extraction.py: Test script for metadata extraction

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 11:21:33 +00:00
parent 59368a5bd5
commit 3da243be79
2 changed files with 1390 additions and 0 deletions
--- a/test_metadata_extraction.py
+++ b/test_metadata_extraction.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python3
+"""
+Test metadata extraction: category, review topics, about info.
+Uses robust selectors (aria-labels, roles, jsaction) to avoid breakage.
+"""
+import time
+import json
+from seleniumbase import Driver
+from selenium.webdriver.common.by import By
+
+# Reference values that validate_results() compares the live extraction against
+EXPECTED = {
+    "name": "R. Fleitas Peluqueros",
+    "category": "Barber shop",
+    "review_topics": [
+        "hair salon",
+        "cutting",
+        "price",
+        "siblings",
+        "beard",
+    ],
+    "about_sections": [
+        "Accessibility",
+        "Amenities",
+        "Planning",
+        "Payments",
+        "Children",
+    ],
+}
+
+def extract_metadata(driver, url: str) -> dict:
+    """Extract business metadata from a Google Maps page.
+
+    Loads ``url`` (appending ``hl=en&gl=us`` when it has no ``hl=``
+    parameter), dismisses the Google consent page if one appears, then
+    scrapes the Overview, Reviews and About tabs in that order, printing
+    progress along the way.
+
+    Args:
+        driver: Selenium-style driver providing ``get``, ``current_url``,
+            ``find_elements`` and ``execute_script``.
+        url: Google Maps URL of the business.
+
+    Returns:
+        dict with the keys ``name``, ``category``, ``rating`` and
+        ``total_reviews`` (each ``None`` when not found),
+        ``review_topics`` (list of ``{"topic", "count"}`` dicts) and
+        ``about`` (section name -> list of ``{"feature", "available"}``
+        dicts). The last two stay empty when their tab cannot be opened.
+    """
+
+    # Force English
+    if 'hl=' not in url:
+        url = f"{url}{'&' if '?' in url else '?'}hl=en&gl=us"
+
+    print(f"   Loading URL: {url[:70]}...")
+    driver.get(url)
+
+    _accept_consent(driver, url)
+
+    # Wait for page to stabilize
+    time.sleep(1)
+
+    result = {
+        "name": None,
+        "category": None,
+        "rating": None,
+        "total_reviews": None,
+        "review_topics": [],
+        "about": {}
+    }
+
+    # ========== OVERVIEW TAB (default) ==========
+    print("\n📍 Extracting from OVERVIEW tab...")
+
+    result.update(_extract_overview(driver))
+    print(f"   Name: {result['name']}")
+    print(f"   Category: {result['category']}")
+    print(f"   Rating: {result['rating']}")
+    print(f"   Reviews: {result['total_reviews']}")
+
+    # ========== REVIEWS TAB ==========
+    print("\n📝 Clicking REVIEWS tab...")
+
+    if _click_tab(driver, **_REVIEWS_TAB):
+        time.sleep(1.5)  # Wait for tab to load
+
+        topics = _extract_review_topics(driver)
+        result['review_topics'] = topics
+        print(f"   Found {len(topics)} review topics:")
+        for t in topics:
+            print(f"      - {t['topic']}: {t['count']} mentions")
+    else:
+        print("   ⚠️ Could not click Reviews tab")
+
+    # ========== ABOUT TAB ==========
+    print("\n📋 Clicking ABOUT tab...")
+
+    if _click_tab(driver, **_ABOUT_TAB):
+        time.sleep(1.5)  # Wait for tab to load
+
+        about = _extract_about(driver)
+        result['about'] = about
+        print(f"   Found {len(about)} about sections:")
+        for section, items in about.items():
+            print(f"      {section}:")
+            for item in items:
+                status = "✓" if item['available'] else "✗"
+                print(f"         {status} {item['feature']}")
+    else:
+        print("   ⚠️ Could not click About tab")
+
+    return result
+
+
+# Lower-case fragments identifying the "accept" button on the consent page
+# (English, Spanish, German).
+_CONSENT_ACCEPT_WORDS = ("accept", "aceptar", "alle akzeptieren")
+
+# How to find each tab button: CSS selectors tried in order (the match must
+# also contain ``keyword`` in its text), then a fallback on any button whose
+# trimmed, lower-cased text equals ``exact_label``.
+_REVIEWS_TAB = {
+    "selectors": [
+        'button[aria-label*="Review"]',
+        'button[data-tab-index="1"]',
+        'div[role="tablist"] button:nth-child(2)',
+        'button[jsaction*="review"]',
+    ],
+    "keyword": "review",
+    "exact_label": "reviews",
+}
+_ABOUT_TAB = {
+    "selectors": [
+        'button[aria-label*="About"]',
+        'button[data-tab-index="2"]',
+        'div[role="tablist"] button:nth-child(3)',
+        'button[jsaction*="about"]',
+    ],
+    "keyword": "about",
+    "exact_label": "about",
+}
+
+
+def _accept_consent(driver, url: str, timeout: float = 5.0) -> None:
+    """Poll for the Google consent page and accept it, then reload ``url``.
+
+    Returns as soon as a consent page has been handled, a Maps URL is
+    reached, or ``timeout`` seconds have elapsed. Failures are reported
+    and otherwise ignored (best effort).
+    """
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        current = driver.current_url
+        if "consent.google" in current:
+            print("   🍪 Consent page detected, clicking accept...")
+            try:
+                for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
+                    # Read the label BEFORE clicking: the click navigates
+                    # away, after which the element may be stale and
+                    # reading its text would abort the reload below.
+                    label = btn.text
+                    lowered = label.lower()
+                    if any(word in lowered for word in _CONSENT_ACCEPT_WORDS):
+                        btn.click()
+                        print(f"   ✅ Clicked: '{label}', reloading...")
+                        driver.get(url)
+                        break
+            except Exception as exc:
+                # Best effort only, but make the failure visible.
+                print(f"   ⚠️ Consent handling failed: {exc}")
+            return
+        if "maps/place" in current or ("maps" in current and "consent" not in current):
+            return
+        time.sleep(0.01)  # 10ms polling
+
+
+def _extract_overview(driver) -> dict:
+    """Read name, category, rating and review count from the Overview tab."""
+    data = driver.execute_script("""
+        var data = {name: null, category: null, rating: null, total_reviews: null};
+
+        // Business name - h1 is stable
+        var h1 = document.querySelector('h1');
+        if (h1) data.name = h1.textContent.trim();
+
+        // Category - use jsaction attribute (more stable than class)
+        var catBtn = document.querySelector('button[jsaction*="category"]');
+        if (catBtn) data.category = catBtn.textContent.trim();
+
+        // Fallback: look for button after rating that's not a link
+        if (!data.category) {
+            var buttons = document.querySelectorAll('button');
+            for (var btn of buttons) {
+                var text = btn.textContent.trim();
+                // Categories are short words, no numbers, not navigation
+                if (text && text.length < 50 && !text.match(/^[0-9]/) &&
+                    !text.match(/review|star|direction|save|share|photo/i)) {
+                    // Check if it's near the rating area
+                    var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
+                    if (parent) {
+                        data.category = text;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // Rating and reviews from aria-labels (stable)
+        var spans = document.querySelectorAll('span[role="img"]');
+        for (var span of spans) {
+            var label = span.getAttribute('aria-label') || '';
+
+            // Rating: "4.8 stars"
+            var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
+            if (rMatch && !data.rating) {
+                data.rating = parseFloat(rMatch[1].replace(',', '.'));
+            }
+
+            // Reviews: "79 reviews"
+            var revMatch = label.match(/^([\\d,]+)\\s*review/i);
+            if (revMatch && !data.total_reviews) {
+                data.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
+            }
+        }
+
+        return data;
+    """)
+    # Guard against a null script result so the caller can always update().
+    return data or {}
+
+
+def _click_tab(driver, selectors: list, keyword: str, exact_label: str) -> bool:
+    """Click a tab button in the page; return True when one was clicked."""
+    clicked = driver.execute_script("""
+        var selectors = arguments[0];
+        var keyword = arguments[1];
+        var exactLabel = arguments[2];
+
+        // Try multiple selectors for the tab
+        for (var sel of selectors) {
+            var btn = document.querySelector(sel);
+            if (btn && btn.textContent.toLowerCase().includes(keyword)) {
+                btn.click();
+                return true;
+            }
+        }
+
+        // Fallback: find by text content
+        var buttons = document.querySelectorAll('button');
+        for (var btn of buttons) {
+            if (btn.textContent.trim().toLowerCase() === exactLabel) {
+                btn.click();
+                return true;
+            }
+        }
+        return false;
+    """, list(selectors), keyword, exact_label)
+    return bool(clicked)
+
+
+def _extract_review_topics(driver) -> list:
+    """Read the review-topic chips (topic name and mention count)."""
+    # Extract review topics from radiogroup (very stable selector)
+    topics = driver.execute_script("""
+        var topics = [];
+
+        // Primary: use role="radiogroup" with aria-label="Refine reviews"
+        var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
+
+        if (!container) {
+            // Fallback: any radiogroup in the reviews area
+            container = document.querySelector('div[role="radiogroup"]');
+        }
+
+        if (container) {
+            var buttons = container.querySelectorAll('button[role="radio"]');
+            for (var btn of buttons) {
+                var label = btn.getAttribute('aria-label') || '';
+                // Parse "hair salon, mentioned in 4 reviews" or just get the topic name
+                var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
+                if (match) {
+                    topics.push({
+                        topic: match[1].trim(),
+                        count: parseInt(match[2])
+                    });
+                } else if (label && !label.toLowerCase().includes('all review')) {
+                    // Might be in different format
+                    var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
+                    var nameSpan = btn.querySelector('.uEubGf, span:first-child');
+                    if (nameSpan) {
+                        var name = nameSpan.textContent.trim();
+                        var count = countSpan ? parseInt(countSpan.textContent) : 0;
+                        if (name && name.toLowerCase() !== 'all') {
+                            topics.push({topic: name, count: count});
+                        }
+                    }
+                }
+            }
+        }
+
+        return topics;
+    """)
+    # Guard against a null script result so the caller can always len()/iterate.
+    return topics or []
+
+
+def _extract_about(driver) -> dict:
+    """Read the About tab: section name -> list of feature/availability dicts."""
+    # Extract about sections using aria-label and role (stable)
+    about = driver.execute_script("""
+        var about = {};
+
+        // Find the about region by aria-label or role
+        var container = document.querySelector('div[role="region"][aria-label*="About"]');
+
+        if (!container) {
+            // Fallback: look for the scrollable area with sections
+            container = document.querySelector('.m6QErb[aria-label*="About"]');
+        }
+
+        if (!container) {
+            // Last resort: find sections by h2 headers
+            container = document;
+        }
+
+        // Find all section headers (h2 elements)
+        var sections = container.querySelectorAll('h2');
+
+        for (var h2 of sections) {
+            var sectionName = h2.textContent.trim();
+            var items = [];
+
+            // Find the ul list following this h2
+            var parent = h2.closest('.iP2t7d, div');
+            if (parent) {
+                var listItems = parent.querySelectorAll('li span[aria-label]');
+                for (var li of listItems) {
+                    var label = li.getAttribute('aria-label');
+                    if (label) {
+                        // Parse "Has toilet" or "No wheelchair-accessible car park"
+                        var hasFeature = !label.toLowerCase().startsWith('no ');
+                        var featureName = label.replace(/^(Has |No )/i, '');
+                        items.push({
+                            feature: featureName,
+                            available: hasFeature
+                        });
+                    }
+                }
+            }
+
+            if (sectionName && items.length > 0) {
+                about[sectionName] = items;
+            }
+        }
+
+        return about;
+    """)
+    # Guard against a null script result so the caller can always iterate.
+    return about or {}
+
+
+def validate_results(result: dict) -> bool:
+    """Check an extraction result against ``EXPECTED`` and print a report.
+
+    Name and category must match exactly. At least 3 of the expected review
+    topics (compared lower-cased) and at least 3 of the expected about
+    section names must be present.
+
+    Returns:
+        True when every check passed, False otherwise.
+    """
+    rule = "=" * 60
+    print("\n" + rule)
+    print("🔍 VALIDATION:")
+    print(rule)
+
+    failures = 0
+
+    # Name: exact match required
+    if result['name'] != EXPECTED['name']:
+        print(f"   ❌ Name: got '{result['name']}', expected '{EXPECTED['name']}'")
+        failures += 1
+    else:
+        print(f"   ✅ Name: {result['name']}")
+
+    # Category: exact match required
+    if result['category'] != EXPECTED['category']:
+        print(f"   ❌ Category: got '{result['category']}', expected '{EXPECTED['category']}'")
+        failures += 1
+    else:
+        print(f"   ✅ Category: {result['category']}")
+
+    # Review topics: case-insensitive, at least 3 expected topics must appear
+    extracted_topics = [entry['topic'].lower() for entry in result.get('review_topics', [])]
+    expected_topics = [name.lower() for name in EXPECTED['review_topics']]
+    topic_hits = len([name for name in expected_topics if name in extracted_topics])
+
+    if topic_hits < 3:
+        print(f"   ❌ Review topics: only {topic_hits}/{len(expected_topics)} matched")
+        print(f"      Expected: {expected_topics}")
+        print(f"      Got: {extracted_topics}")
+        failures += 1
+    else:
+        print(f"   ✅ Review topics: {topic_hits}/{len(expected_topics)} matched")
+
+    # About sections: at least 3 expected section names must appear
+    about_sections = list(result.get('about', {}).keys())
+    expected_sections = EXPECTED['about_sections']
+    section_hits = len([name for name in expected_sections if name in about_sections])
+
+    if section_hits < 3:
+        print(f"   ❌ About sections: only {section_hits}/{len(expected_sections)} matched")
+        print(f"      Expected: {expected_sections}")
+        print(f"      Got: {about_sections}")
+        failures += 1
+    else:
+        print(f"   ✅ About sections: {section_hits}/{len(expected_sections)} matched")
+
+    return failures == 0
+
+
+def main():
+    """Run the metadata extraction test against a live Google Maps page.
+
+    Starts a visible browser, extracts metadata for the hard-coded search
+    URL, prints the full result as JSON, validates it and reports the
+    outcome. The browser is kept open briefly for inspection and is always
+    closed at the end, including after an error.
+    """
+    url = "https://www.google.com/maps/search/?api=1&query=R.+Fleitas+Peluqueros+Gran+Canaria"
+
+    print("🚀 Starting metadata extraction test...")
+    print(f"   URL: {url[:60]}...")
+
+    driver = Driver(uc=True, headless=False)
+
+    try:
+        # Set geolocation
+        # NOTE(review): these coordinates are in the US while the query
+        # targets Gran Canaria - presumably paired with gl=us; confirm.
+        try:
+            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+                'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
+            })
+        except Exception as exc:
+            # Best effort: carry on without the override, but say so.
+            # Catching Exception (not a bare except) lets Ctrl-C through.
+            print(f"   ⚠️ Could not set geolocation override: {exc}")
+
+        result = extract_metadata(driver, url)
+
+        print("\n" + "="*60)
+        print("📊 FULL RESULT:")
+        print("="*60)
+        print(json.dumps(result, indent=2, ensure_ascii=False))
+
+        passed = validate_results(result)
+
+        print("\n" + "="*60)
+        if passed:
+            print("🎉 ALL VALIDATIONS PASSED!")
+        else:
+            print("⚠️ SOME VALIDATIONS FAILED")
+        print("="*60)
+
+        print("\n👀 Browser stays open for 15 seconds...")
+        time.sleep(15)
+
+    except Exception as e:
+        print(f"\n❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        # Leave the browser up briefly so the failing page can be inspected.
+        time.sleep(10)
+    finally:
+        driver.quit()
+        print("🔒 Browser closed")
+
+
+# Run the live extraction test only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()