From 1bd30c0789d43e2c6cc68f53221556927b595e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:09:51 +0000 Subject: [PATCH] Fix get_business_card_info for pooled workers - Clear cookies and navigate to about:blank before loading URL (ensures clean state when reusing pooled driver) - Simplified regex patterns for rating/reviews extraction - Uses partial word matching like scrape_reviews Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index d9a762e..a60d114 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -1177,6 +1177,14 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ except: pass + # Clear state if reusing a pooled driver (ensures clean page load) + if driver_provided: + try: + driver.delete_all_cookies() + driver.get("about:blank") + except: + pass + # Navigate to URL driver.get(url) @@ -1211,19 +1219,20 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_ if (h1) result.name = h1.textContent.trim(); // Rating and reviews from span[role="img"] aria-labels - // Handles multiple languages: stars/estrellas/étoiles, reviews/reseñas/avis + // Same pattern as scrape_reviews for consistency var spans = document.querySelectorAll('span[role="img"]'); for (var i = 0; i < spans.length; i++) { var label = spans[i].getAttribute('aria-label') || ''; - // Rating: "4.8 stars" or "4,8 estrellas" or "4,8 étoiles" - var rMatch = label.match(/^([\\d,.]+)\\s*(stars?|estrellas?|étoiles?|sterne?|stelle)/i); + // Rating: "4.8 stars", "4,8 estrellas", etc (partial match) + var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i); if (rMatch && !result.rating) { result.rating = parseFloat(rMatch[1].replace(',', '.')); } - // Reviews: "79 
reviews" or "79 reseñas" or "79 avis" - var revMatch = label.match(/^([\\d,\\.]+)\\s*(reviews?|reseñas?|avis|bewertungen|recensioni)/i); + // Reviews: same as scrape_reviews - /^([\\d,.]+)\\s*review/i + // Plus Spanish "reseña" which doesn't contain "review" + var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i); if (revMatch && !result.total_reviews) { result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, '')); }