From 0682c0ec611352e12d8e83ed125bfaf9b0695c8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?=
 <35082514+alezmad@users.noreply.github.com>
Date: Fri, 23 Jan 2026 17:52:06 +0000
Subject: [PATCH] Add get_business_card_info to scraper_clean with multilingual
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces fast_scraper validation with efficient polling-based extraction
using the same navigation pattern as scrape_reviews:
- 10ms polling for consent handling (no fixed waits)
- 100ms polling for data extraction
- Exits early when data found

Supports multiple languages:
- Rating: stars/estrellas/étoiles/sterne/stelle
- Reviews: reviews/reseñas/avis/bewertungen/recensioni
- Handles comma decimals (4,8 -> 4.8)

Result: 6.3s to extract name, address, rating, total_reviews

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 api_server_production.py |   3 +-
 modules/scraper_clean.py | 124 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/api_server_production.py b/api_server_production.py
index dad6f96..133ae8b 100644
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -21,8 +21,7 @@ from fastapi.responses import JSONResponse, StreamingResponse
 from modules.database import DatabaseManager, JobStatus
 from modules.webhooks import WebhookDispatcher, WebhookManager
 from modules.health_checks import HealthCheckSystem
-from modules.scraper_clean import fast_scrape_reviews, LogCapture  # Clean scraper with hard refresh recovery
-from modules.fast_scraper import check_reviews_available, get_business_card_info  # Helper functions
+from modules.scraper_clean import fast_scrape_reviews, LogCapture, get_business_card_info  # Clean scraper
 from modules.chrome_pool import (
     start_worker_pools,
     stop_worker_pools,
diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py
index 1648749..d9a762e 100644
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -1148,3 +1148,127 @@ if __name__ == "__main__":
     finally:
         driver.quit()
         print("\n🏁 Done")
+
+
+def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> dict:
+    """
+    Extract name, address, rating and review count from a Google Maps place page.
+
+    Polls instead of fixed waits: 10ms for consent, 100ms for data (5s cap each).
+
+    Args:
+        url: Google Maps place URL to open.
+        headless: Used only when a new seleniumbase Driver is created here.
+        driver: Existing driver to reuse; this function never quits it.
+        return_driver: Keep the driver open and return it under "driver".
+
+    Returns:
+        dict with: name, address, rating, total_reviews, success, error, time
+        (plus "driver" if return_driver). Scraping exceptions land in "error".
+    """
+    from seleniumbase import Driver
+
+    start_time = time.time()
+    should_close_driver = not return_driver and driver is None
+
+    def _result(info, error=None):
+        # Shared by the success and failure paths so both return the same keys.
+        result = {key: info.get(key) for key in ("name", "address", "rating", "total_reviews")}
+        result.update(success=error is None and bool(result["name"]), error=error, time=time.time() - start_time)
+        if return_driver:
+            # Hand the open browser back; otherwise the caller can never quit it.
+            result["driver"] = driver
+        return result
+
+    try:
+        if driver is None:
+            driver = Driver(uc=True, headless=headless)
+
+        # Set geolocation to US (best effort - a failure here is ignored)
+        try:
+            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+                'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
+            })
+        except Exception:
+            pass
+
+        driver.get(url)
+
+        # Handle consent popup - poll every 10ms until a button was clicked or Maps loaded
+        consent_done = False
+        start = time.time()
+        while not consent_done and time.time() - start < 5:
+            current_url = driver.current_url
+            if "consent.google" in current_url:
+                try:
+                    for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
+                        txt = btn.text.lower()
+                        if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
+                            btn.click()
+                            consent_done = True
+                            driver.get(url)
+                            break
+                except Exception:
+                    pass  # click/lookup failed (e.g. stale element); retry on the next poll
+            elif "maps/place" in current_url or ("maps" in current_url and "consent" not in current_url):
+                break
+            time.sleep(0.01)  # 10ms - responsive but low CPU
+
+        # Poll for business info (same pattern as total_reviews extraction)
+        info = {"name": None, "rating": None, "total_reviews": None, "address": None}
+        start = time.time()
+        while time.time() - start < 5:
+            try:
+                info = driver.execute_script("""
+                    var result = {name: null, rating: null, total_reviews: null, address: null};
+
+                    // Business name from h1
+                    var h1 = document.querySelector('h1');
+                    if (h1) result.name = h1.textContent.trim();
+
+                    // Rating and reviews from span[role="img"] aria-labels
+                    // Handles multiple languages: stars/estrellas/étoiles, reviews/reseñas/avis
+                    var spans = document.querySelectorAll('span[role="img"]');
+                    for (var i = 0; i < spans.length; i++) {
+                        var label = spans[i].getAttribute('aria-label') || '';
+
+                        // Rating: "4.8 stars" or "4,8 estrellas" or "4,8 étoiles"
+                        var rMatch = label.match(/^([\\d,.]+)\\s*(stars?|estrellas?|étoiles?|sterne?|stelle)/i);
+                        if (rMatch && !result.rating) {
+                            result.rating = parseFloat(rMatch[1].replace(',', '.'));
+                        }
+
+                        // Reviews: "79 reviews" or "79 reseñas" or "79 avis"
+                        var revMatch = label.match(/^([\\d,\\.]+)\\s*(reviews?|reseñas?|avis|bewertungen|recensioni)/i);
+                        if (revMatch && !result.total_reviews) {
+                            result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
+                        }
+                    }
+
+                    // Address from button
+                    var addrBtn = document.querySelector('button[data-item-id="address"]');
+                    if (addrBtn) {
+                        var label = addrBtn.getAttribute('aria-label');
+                        if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
+                    }
+
+                    return result;
+                """)
+                # Exit early if we have the essentials
+                if info.get("name") and info.get("total_reviews") is not None:
+                    break
+            except Exception:
+                pass
+            time.sleep(0.1)  # 100ms between polls
+
+        return _result(info)
+
+    except Exception as e:
+        return _result({}, str(e))
+
+    finally:
+        if should_close_driver and driver:
+            try:
+                driver.quit()
+            except Exception:
+                pass