From 0682c0ec611352e12d8e83ed125bfaf9b0695c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:52:06 +0000 Subject: [PATCH] Add get_business_card_info to scraper_clean with multilingual support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces fast_scraper validation with efficient polling-based extraction using the same navigation pattern as scrape_reviews: - 10ms polling for consent handling (no fixed waits) - 100ms polling for data extraction - Exits early when data found Supports multiple languages: - Rating: stars/estrellas/étoiles/sterne/stelle - Reviews: reviews/reseñas/avis/bewertungen/recensioni - Handles comma decimals (4,8 -> 4.8) Result: 6.3s to extract name, address, rating, total_reviews Co-Authored-By: Claude Opus 4.5 --- api_server_production.py | 3 +- modules/scraper_clean.py | 124 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 2 deletions(-) diff --git a/api_server_production.py b/api_server_production.py index dad6f96..133ae8b 100644 --- a/api_server_production.py +++ b/api_server_production.py @@ -21,8 +21,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from modules.database import DatabaseManager, JobStatus from modules.webhooks import WebhookDispatcher, WebhookManager from modules.health_checks import HealthCheckSystem -from modules.scraper_clean import fast_scrape_reviews, LogCapture # Clean scraper with hard refresh recovery -from modules.fast_scraper import check_reviews_available, get_business_card_info # Helper functions +from modules.scraper_clean import fast_scrape_reviews, LogCapture, get_business_card_info # Clean scraper from modules.chrome_pool import ( start_worker_pools, stop_worker_pools, diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 1648749..d9a762e 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -1148,3 +1148,127 @@ if __name__ == "__main__": finally: driver.quit() print("\n🏁 Done") + + +def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> dict: + """ + Extract business card info from Google Maps. + Uses the same efficient polling navigation as scrape_reviews (no fixed waits). + + Returns: + dict with: name, address, rating, total_reviews, success, error, time + """ + from seleniumbase import Driver + + start_time = time.time() + driver_provided = driver is not None + should_close_driver = not return_driver and not driver_provided + + try: + # Create driver if not provided + if not driver: + driver = Driver(uc=True, headless=headless) + + # Set geolocation to US + try: + driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { + 'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100 + }) + except: + pass + + # Navigate to URL + driver.get(url) + + # Handle consent popup - poll with 10ms sleep (same as scrape_reviews) + start = time.time() + while time.time() - start < 5: + if "consent.google" in driver.current_url: + try: + for btn in driver.find_elements(By.CSS_SELECTOR, "button"): + txt = btn.text.lower() + if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: + btn.click() + driver.get(url) + break + except: + pass + break + if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url): + break + time.sleep(0.01) # 10ms - responsive but low CPU + + # Poll for business info (same pattern as total_reviews extraction) + info = {"name": None, "rating": None, "total_reviews": None, "address": None} + start = time.time() + while time.time() - start < 5: + try: + info = driver.execute_script(""" + var result = {name: null, rating: null, total_reviews: null, address: null}; + + // Business name from h1 + var h1 = document.querySelector('h1'); + if (h1) result.name = h1.textContent.trim(); + + // Rating and reviews from span[role="img"] aria-labels + // Handles multiple languages: stars/estrellas/étoiles, reviews/reseñas/avis + var spans = document.querySelectorAll('span[role="img"]'); + for (var i = 0; i < spans.length; i++) { + var label = spans[i].getAttribute('aria-label') || ''; + + // Rating: "4.8 stars" or "4,8 estrellas" or "4,8 étoiles" + var rMatch = label.match(/^([\\d,.]+)\\s*(stars?|estrellas?|étoiles?|sterne?|stelle)/i); + if (rMatch && !result.rating) { + result.rating = parseFloat(rMatch[1].replace(',', '.')); + } + + // Reviews: "79 reviews" or "79 reseñas" or "79 avis" + var revMatch = label.match(/^([\\d,\\.]+)\\s*(reviews?|reseñas?|avis|bewertungen|recensioni)/i); + if (revMatch && !result.total_reviews) { + result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, '')); + } + } + + // Address from button + var addrBtn = document.querySelector('button[data-item-id="address"]'); + if (addrBtn) { + var label = addrBtn.getAttribute('aria-label'); + if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, ''); + } + + return result; + """) + # Exit early if we have the essentials + if info.get("name") and info.get("total_reviews") is not None: + break + except: + pass + time.sleep(0.1) # 100ms between polls + + return { + "name": info.get("name"), + "address": info.get("address"), + "rating": info.get("rating"), + "total_reviews": info.get("total_reviews"), + "success": bool(info.get("name")), + "error": None, + "time": time.time() - start_time + } + + except Exception as e: + return { + "name": None, + "address": None, + "rating": None, + "total_reviews": None, + "success": False, + "error": str(e), + "time": time.time() - start_time + } + + finally: + if should_close_driver and driver: + try: + driver.quit() + except: + pass