Add get_business_card_info to scraper_clean with multilingual support

Replaces fast_scraper validation with efficient polling-based extraction
using the same navigation pattern as scrape_reviews:
- 10ms polling for consent handling (no fixed waits)
- 100ms polling for data extraction
- Exits early when data found

Supports multiple languages:
- Rating: stars/estrellas/étoiles/sterne/stelle
- Reviews: reviews/reseñas/avis/bewertungen/recensioni
- Handles comma decimals (4,8 -> 4.8)

Result: 6.3s to extract name, address, rating, total_reviews

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:52:06 +00:00
parent 47bb032011
commit 0682c0ec61
2 changed files with 125 additions and 2 deletions

View File

@@ -21,8 +21,7 @@ from fastapi.responses import JSONResponse, StreamingResponse
from modules.database import DatabaseManager, JobStatus
from modules.webhooks import WebhookDispatcher, WebhookManager
from modules.health_checks import HealthCheckSystem
from modules.scraper_clean import fast_scrape_reviews, LogCapture # Clean scraper with hard refresh recovery
from modules.fast_scraper import check_reviews_available, get_business_card_info # Helper functions
from modules.scraper_clean import fast_scrape_reviews, LogCapture, get_business_card_info # Clean scraper
from modules.chrome_pool import (
start_worker_pools,
stop_worker_pools,

View File

@@ -1148,3 +1148,127 @@ if __name__ == "__main__":
finally:
driver.quit()
print("\n🏁 Done")
# JavaScript evaluated in the page to read the business card. It returns an
# object with name / rating / total_reviews / address (null when not found).
# Rating and review count come from span[role="img"] aria-labels, matched in
# several languages; a comma decimal ("4,8") is normalised to a dot.
_BUSINESS_CARD_JS = """
    var result = {name: null, rating: null, total_reviews: null, address: null};
    // Business name from h1
    var h1 = document.querySelector('h1');
    if (h1) result.name = h1.textContent.trim();
    // Rating and reviews from span[role="img"] aria-labels
    // Handles multiple languages: stars/estrellas/étoiles, reviews/reseñas/avis
    var spans = document.querySelectorAll('span[role="img"]');
    for (var i = 0; i < spans.length; i++) {
        var label = spans[i].getAttribute('aria-label') || '';
        // Rating: "4.8 stars" or "4,8 estrellas" or "4,8 étoiles"
        var rMatch = label.match(/^([\\d,.]+)\\s*(stars?|estrellas?|étoiles?|sterne?|stelle)/i);
        if (rMatch && !result.rating) {
            result.rating = parseFloat(rMatch[1].replace(',', '.'));
        }
        // Reviews: "79 reviews" or "79 reseñas" or "79 avis"
        var revMatch = label.match(/^([\\d,\\.]+)\\s*(reviews?|reseñas?|avis|bewertungen|recensioni)/i);
        if (revMatch && !result.total_reviews) {
            result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
        }
    }
    // Address from button
    var addrBtn = document.querySelector('button[data-item-id="address"]');
    if (addrBtn) {
        var label = addrBtn.getAttribute('aria-label');
        if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
    }
    return result;
"""

# Lower-cased substrings that identify the "accept" button on the consent page.
_CONSENT_ACCEPT_WORDS = ("accept", "aceptar", "alle akzeptieren")


def _new_card_driver(headless: bool):
    """Create a fresh browser driver and try to pin its geolocation to the US."""
    # Imported lazily so callers that pass their own driver do not need
    # seleniumbase to be importable.
    from seleniumbase import Driver

    new_driver = Driver(uc=True, headless=headless)
    try:
        new_driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
            'latitude': 42.3601, 'longitude': -71.0589, 'accuracy': 100
        })
    except Exception:
        # Best effort: the override is optional, scraping works without it.
        pass
    return new_driver


def _accept_consent(driver, url: str) -> bool:
    """Click the consent "accept" button and reload *url*; True if clicked."""
    try:
        for button in driver.find_elements(By.CSS_SELECTOR, "button"):
            label = button.text.lower()
            if any(word in label for word in _CONSENT_ACCEPT_WORDS):
                button.click()
                driver.get(url)
                return True
    except Exception:
        # Buttons can be missing or go stale while the page is still
        # rendering; the caller simply polls again.
        pass
    return False


def _dismiss_consent(driver, url: str, timeout: float) -> None:
    """Poll until the consent page is accepted or a Maps page is showing."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        current = driver.current_url
        if "consent.google" in current:
            # Keep polling until the accept button is actually clicked, so a
            # half-rendered consent page is never mistaken for the target.
            if _accept_consent(driver, url):
                return
        elif "maps/place" in current or ("maps" in current and "consent" not in current):
            return
        time.sleep(0.01)  # 10ms - responsive but low CPU


def _poll_business_card(driver, timeout: float) -> dict:
    """Run the extraction script until name and review count are found.

    At least one attempt is made even when *timeout* is 0. The last dict the
    script returned is kept, so partial data survives a timeout.
    """
    info = {"name": None, "rating": None, "total_reviews": None, "address": None}
    deadline = time.time() + timeout
    while True:
        try:
            scraped = driver.execute_script(_BUSINESS_CARD_JS)
        except Exception:
            # Page may still be loading or navigating; retry on the next poll.
            scraped = None
        if isinstance(scraped, dict):
            info = scraped
            # Exit early if we have the essentials
            if info.get("name") and info.get("total_reviews") is not None:
                break
        if time.time() >= deadline:
            break
        time.sleep(0.1)  # 100ms between polls
    return info


def _card_result(info: dict, error, started: float) -> dict:
    """Build the result dict returned by get_business_card_info."""
    return {
        "name": info.get("name"),
        "address": info.get("address"),
        "rating": info.get("rating"),
        "total_reviews": info.get("total_reviews"),
        "success": error is None and bool(info.get("name")),
        "error": error,
        "time": time.time() - started,
    }


def get_business_card_info(url: str, headless: bool = True, driver=None,
                           return_driver: bool = False,
                           poll_timeout: float = 5.0) -> dict:
    """
    Extract business card info from Google Maps.

    Navigates to *url*, dismisses the Google consent page if it appears, then
    polls the page (no fixed waits) until a name and a review count are found
    or the time budget runs out.

    Args:
        url: Google Maps place URL to open.
        headless: Run the browser headless; only used when a driver is created.
        driver: Existing driver to reuse. When None a new one is created.
        return_driver: When True the driver is left open and handed back under
            the "driver" key of the result, so the caller can reuse or quit it.
            Previously an internally created driver was left running with no
            way for the caller to reach it.
        poll_timeout: Seconds allowed for each polling phase (consent handling
            and data extraction).

    Returns:
        dict with: name, address, rating, total_reviews, success, error, time
        (plus "driver" when return_driver is True). success is True when a
        name was found. Failures, including a failure to create the driver,
        are reported through "error" rather than raised.
    """
    started = time.time()
    # Only a driver created here, and not requested back, is ours to close.
    close_when_done = driver is None and not return_driver
    try:
        if driver is None:
            driver = _new_card_driver(headless)
        driver.get(url)
        _dismiss_consent(driver, url, poll_timeout)
        info = _poll_business_card(driver, poll_timeout)
        result = _card_result(info, None, started)
    except Exception as exc:
        result = _card_result({}, str(exc), started)
    finally:
        if close_when_done and driver is not None:
            try:
                driver.quit()
            except Exception:
                # The browser may already be gone; nothing more to clean up.
                pass
    if return_driver:
        result["driver"] = driver
    return result