Add browser fingerprint support and analytics metadata display

- Transfer user's browser fingerprint (user-agent, viewport, timezone, language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 10:36:06 +00:00
parent 1bd30c0789
commit a540ab97b1
9 changed files with 1214 additions and 231 deletions
--- a/modules/database.py
+++ b/modules/database.py
@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
+    PARTIAL = "partial"  # Job crashed but has partial reviews saved


 class DatabaseManager:
@@ -69,6 +70,7 @@ class DatabaseManager:
                    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP,
+                    updated_at TIMESTAMP,

                    reviews_count INTEGER,
                    total_reviews INTEGER,
@@ -79,7 +81,7 @@ class DatabaseManager:
                    metadata JSONB,
                    scrape_logs JSONB,

-                    CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
+                    CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
                );
            """)

@@ -88,6 +90,24 @@ class DatabaseManager:
                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
            """)

+            # Add updated_at column if it doesn't exist (for incremental progress tracking)
+            await conn.execute("""
+                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
+            """)
+
+            # Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
+            await conn.execute("""
+                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
+            """)
+
+            # Update constraint to include 'partial' status (for existing databases)
+            await conn.execute("""
+                ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
+            """)
+            await conn.execute("""
+                ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
+            """)
+
            # Create indexes
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -187,13 +207,15 @@ class DatabaseManager:
                    created_at,
                    started_at,
                    completed_at,
+                    updated_at,
                    reviews_count,
                    total_reviews,
                    reviews_data,
                    scrape_time,
                    error_message,
                    metadata,
-                    scrape_logs
+                    scrape_logs,
+                    review_topics
                FROM jobs
                WHERE job_id = $1
            """, job_id)
@@ -203,22 +225,32 @@ class DatabaseManager:

            return dict(row)

-    async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
+    async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
        """
        Get reviews for a specific job.

        Args:
            job_id: Job UUID
+            include_partial: If True, also return reviews for running and partial jobs

        Returns:
-            List of reviews or None if not found/not completed
+            List of reviews or None if not found/no reviews
        """
        async with self.pool.acquire() as conn:
-            reviews_data = await conn.fetchval("""
-                SELECT reviews_data
-                FROM jobs
-                WHERE job_id = $1 AND status = 'completed'
-            """, job_id)
+            if include_partial:
+                # Return reviews for completed, running, or partial jobs
+                reviews_data = await conn.fetchval("""
+                    SELECT reviews_data
+                    FROM jobs
+                    WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
+                """, job_id)
+            else:
+                # Only return reviews for completed jobs
+                reviews_data = await conn.fetchval("""
+                    SELECT reviews_data
+                    FROM jobs
+                    WHERE job_id = $1 AND status = 'completed'
+                """, job_id)

            if not reviews_data:
                return None
@@ -278,7 +310,8 @@ class DatabaseManager:
        reviews: List[Dict[str, Any]],
        scrape_time: float,
        total_reviews: Optional[int] = None,
-        scrape_logs: Optional[List[Dict[str, Any]]] = None
+        scrape_logs: Optional[List[Dict[str, Any]]] = None,
+        review_topics: Optional[List[Dict[str, Any]]] = None
    ):
        """
        Save scraping results to database.
@@ -289,8 +322,33 @@ class DatabaseManager:
            scrape_time: Time taken to scrape in seconds
            total_reviews: Total reviews available (from page counter)
            scrape_logs: List of log entries from the scraper
+            review_topics: List of topic filter dictionaries with topic and count
        """
        async with self.pool.acquire() as conn:
+            # If reviews list is empty, check if job already has reviews from incremental saves
+            # This happens when flush_callback was used during scraping
+            if not reviews:
+                existing = await conn.fetchval(
+                    "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
+                )
+                if existing and existing > 0:
+                    # Job has reviews from incremental saves, don't overwrite reviews_data
+                    await conn.execute("""
+                        UPDATE jobs
+                        SET
+                            status = 'completed',
+                            completed_at = NOW(),
+                            total_reviews = COALESCE($2, total_reviews),
+                            scrape_time = $3,
+                            scrape_logs = $4::jsonb,
+                            review_topics = $5::jsonb
+                        WHERE job_id = $1
+                    """, job_id, total_reviews, scrape_time,
+                        json.dumps(scrape_logs) if scrape_logs else None,
+                        json.dumps(review_topics) if review_topics else None)
+                    log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
+                    return
+
            await conn.execute("""
                UPDATE jobs
                SET
@@ -300,13 +358,70 @@ class DatabaseManager:
                    total_reviews = $3,
                    reviews_data = $4::jsonb,
                    scrape_time = $5,
-                    scrape_logs = $6::jsonb
+                    scrape_logs = $6::jsonb,
+                    review_topics = $7::jsonb
                WHERE job_id = $1
            """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
-                json.dumps(scrape_logs) if scrape_logs else None)
+                json.dumps(scrape_logs) if scrape_logs else None,
+                json.dumps(review_topics) if review_topics else None)

            log.info(f"Saved {len(reviews)} reviews for job {job_id}")

+    async def save_reviews_incremental(
+        self,
+        job_id: UUID,
+        reviews: List[Dict[str, Any]],
+        total_reviews: Optional[int] = None
+    ):
+        """
+        Save reviews incrementally during scraping.
+        Called on each flush to preserve progress in case of crash.
+
+        Args:
+            job_id: Job UUID
+            reviews: ALL reviews collected so far (not just new ones)
+            total_reviews: Total reviews available (from page counter)
+        """
+        async with self.pool.acquire() as conn:
+            await conn.execute("""
+                UPDATE jobs
+                SET
+                    reviews_count = $2,
+                    total_reviews = COALESCE($3, total_reviews),
+                    reviews_data = $4::jsonb,
+                    updated_at = NOW()
+                WHERE job_id = $1 AND status = 'running'
+            """, job_id, len(reviews), total_reviews, json.dumps(reviews))
+
+            log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
+
+    async def mark_job_partial(
+        self,
+        job_id: UUID,
+        error_message: str,
+        scrape_logs: Optional[List[Dict[str, Any]]] = None
+    ):
+        """
+        Mark a job as partial (crashed but has some reviews saved).
+
+        Args:
+            job_id: Job UUID
+            error_message: Error that caused the crash
+            scrape_logs: Log entries from the scraper
+        """
+        async with self.pool.acquire() as conn:
+            await conn.execute("""
+                UPDATE jobs
+                SET
+                    status = 'partial',
+                    completed_at = NOW(),
+                    error_message = $2,
+                    scrape_logs = $3::jsonb
+                WHERE job_id = $1
+            """, job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None)
+
+            log.info(f"Marked job {job_id} as partial due to: {error_message}")
+
    async def list_jobs(
        self,
        status: Optional[JobStatus] = None,
@@ -337,7 +452,8 @@ class DatabaseManager:
                        total_reviews,
                        scrape_time,
                        error_message,
-                        metadata
+                        metadata,
+                        review_topics
                    FROM jobs
                    WHERE status = $1
                    ORDER BY created_at DESC
@@ -355,7 +471,8 @@ class DatabaseManager:
                        total_reviews,
                        scrape_time,
                        error_message,
-                        metadata
+                        metadata,
+                        review_topics
                    FROM jobs
                    ORDER BY created_at DESC
                    LIMIT $1 OFFSET $2
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:

 def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
                   flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
-                   progress_callback=None) -> dict:
+                   progress_callback=None, validation_only: bool = False) -> dict:
    """
    Scrape Google Maps reviews.

@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    # Track total reviews (persists across refreshes)
    total_reviews = [None]  # Use list for closure mutation

+    # Store business info extracted from overview (before clicking reviews tab)
+    business_info_cache = [None]
+
    # Hard refresh counter
    hard_refresh_count = [0]
    max_hard_refreshes = 3  # Max number of hard refreshes before giving up
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                pass
        return None

-    def setup_reviews_page(is_refresh=False):
+    def setup_reviews_page(is_refresh=False, validation_only_mode=False):
        """
        Setup the reviews page for scraping.
        Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
        Can be called after initial load or after a hard refresh.
+
+        If validation_only_mode=True, returns early after extracting business info
+        without clicking reviews tab or finding scroll container.
        """
        nonlocal total_reviews

@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in

        # Navigate to URL (only on initial load or refresh)
        if not is_refresh:
+            # Reset browser state by navigating to blank page first
+            # This clears any stale state from pooled browser sessions
+            try:
+                driver.get("about:blank")
+                time.sleep(0.1)
+            except:
+                pass
            log.info(f"🌐 Loading: {url[:80]}...")
        else:
            log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                            # Reload original URL after consent
                            log.info("  Reloading after consent...")
                            driver.get(url)
+                            # Wait for page to settle after consent reload
+                            time.sleep(1)
                            break
                except:
                    pass
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                break
            time.sleep(0.01)  # 10ms - responsive but low CPU

-        # Extract total review count BEFORE clicking reviews tab (it's on Overview)
+        # Extract business info and total review count BEFORE clicking reviews tab (on Overview)
+        # This captures name, rating, category, address while they're visible
        # Only on first load (don't overwrite if we already have it)
-        if total_reviews[0] is None:
+        if total_reviews[0] is None or business_info_cache[0] is None:
            start = time.time()
            while time.time() - start < 5:
                try:
-                    count = driver.execute_script("""
-                        var reviewSpans = document.querySelectorAll('span[role="img"]');
-                        for (var i = 0; i < reviewSpans.length; i++) {
-                            var label = reviewSpans[i].getAttribute('aria-label') || '';
-                            var match = label.match(/^([\\d,\\.]+)\\s*review/i);
-                            if (match) {
-                                return parseInt(match[1].replace(/[,\\.]/g, ''));
+                    info = driver.execute_script("""
+                        var result = {
+                            total_reviews: null,
+                            name: null,
+                            rating: null,
+                            category: null,
+                            address: null
+                        };
+
+                        // Business name from h1
+                        var h1 = document.querySelector('h1');
+                        if (h1) result.name = h1.textContent.trim();
+
+                        // Category - use jsaction attribute (robust selector)
+                        var catBtn = document.querySelector('button[jsaction*="category"]');
+                        if (catBtn) result.category = catBtn.textContent.trim();
+
+                        // Rating and review count from span[role="img"] aria-labels
+                        var spans = document.querySelectorAll('span[role="img"]');
+                        for (var i = 0; i < spans.length; i++) {
+                            var label = spans[i].getAttribute('aria-label') || '';
+
+                            // Rating: "4.8 stars"
+                            var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
+                            if (rMatch && !result.rating) {
+                                result.rating = parseFloat(rMatch[1].replace(',', '.'));
+                            }
+
+                            // Reviews: "79 reviews"
+                            var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
+                            if (revMatch && !result.total_reviews) {
+                                result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
                            }
                        }
-                        return null;
+
+                        // Address from button
+                        var addrBtn = document.querySelector('button[data-item-id="address"]');
+                        if (addrBtn) {
+                            var label = addrBtn.getAttribute('aria-label');
+                            if (label) result.address = label.replace(/^Address:\\s*/i, '');
+                        }
+
+                        return result;
                    """)
-                    if count:
-                        total_reviews[0] = count
-                        log.info(f"📊 Total reviews on page: {count}")
-                        break
+
+                    if info:
+                        if info.get('total_reviews') and total_reviews[0] is None:
+                            total_reviews[0] = info['total_reviews']
+                            log.info(f"📊 Total reviews on page: {total_reviews[0]}")
+                        if info.get('name') and business_info_cache[0] is None:
+                            business_info_cache[0] = info
+                            log.info(f"📍 Business: {info.get('name')}")
+                        if total_reviews[0] and business_info_cache[0]:
+                            break
                except:
                    pass
                time.sleep(0.1)

+        # VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
+        if validation_only_mode:
+            log.info("📋 Validation mode: returning early (skipping reviews tab)")
+            return ("validation_done", None)
+
        # Click reviews tab - poll until found
        review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
        start = time.time()
        tab_clicked = False
+        tabs_logged = False
        while time.time() - start < 5:  # Max 5s for tabs
            try:
                tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
+                # Log available tabs once for debugging
+                if not tabs_logged and tabs:
+                    tabs_logged = True
+                    tab_texts = [t.text for t in tabs]
+                    log.info(f"  Available tabs: {tab_texts}")
                for tab in tabs:
                    tab_text = tab.text.lower()
                    if any(kw in tab_text for kw in review_keywords):
                        if not is_refresh:
                            log.info(f"  Clicking reviews tab: '{tab.text}'")
+                        # Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
+                        if total_reviews[0] is None:
+                            import re
+                            # Try pattern with parentheses: "Reviews (79)"
+                            match = re.search(r'\((\d+)\)', tab.text)
+                            if match:
+                                total_reviews[0] = int(match.group(1))
+                                log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
+                            else:
+                                # Try pattern with newline: "Reviews\n79"
+                                match = re.search(r'(\d+)', tab.text)
+                                if match:
+                                    total_reviews[0] = int(match.group(1))
+                                    log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
                        tab.click()
                        tab_clicked = True
                        break
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in

        return scroll_container, stop_scrolling

-    # Initial page setup
-    scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
+    # Helper to extract review topics from the reviews tab
+    def extract_review_topics():
+        """Extract review topic filters from radiogroup (robust selectors)."""
+        try:
+            topics = driver.execute_script("""
+                var topics = [];
+
+                // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
+                var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
+
+                if (!container) {
+                    // Fallback: any radiogroup in the reviews area
+                    container = document.querySelector('div[role="radiogroup"]');
+                }
+
+                if (container) {
+                    var buttons = container.querySelectorAll('button[role="radio"]');
+                    for (var btn of buttons) {
+                        var label = btn.getAttribute('aria-label') || '';
+                        // Parse "hair salon, mentioned in 4 reviews" format
+                        var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
+                        if (match) {
+                            topics.push({
+                                topic: match[1].trim(),
+                                count: parseInt(match[2])
+                            });
+                        } else if (label && !label.toLowerCase().includes('all review')) {
+                            // Fallback: try to extract from child spans
+                            var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
+                            var nameSpan = btn.querySelector('.uEubGf, span:first-child');
+                            if (nameSpan) {
+                                var name = nameSpan.textContent.trim();
+                                var count = countSpan ? parseInt(countSpan.textContent) : 0;
+                                if (name && name.toLowerCase() !== 'all') {
+                                    topics.push({topic: name, count: count || 0});
+                                }
+                            }
+                        }
+                    }
+                }
+
+                return topics;
+            """)
+            return topics or []
+        except:
+            return []
+
+    # Initial page setup (pass validation_only to skip unnecessary steps)
+    scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
+
+    # VALIDATION_ONLY MODE: Return early with just total_reviews and business info
+    # setup_reviews_page returns ("validation_done", None) in this case
+    if validation_only or scroll_container == "validation_done":
+        # Use the business info captured from Overview (before clicking reviews tab)
+        business_info = business_info_cache[0] or {}
+
+        return {
+            "reviews": [],
+            "total": total_reviews[0] or 0,
+            "scrolls": 0,
+            "error": None,
+            "validation_info": {
+                "name": business_info.get("name"),
+                "rating": business_info.get("rating"),
+                "category": business_info.get("category"),
+                "address": business_info.get("address"),
+                "total_reviews": total_reviews[0]
+            }
+        }
+
    if not scroll_container:
        return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}

+    # Extract review topics after reviews tab is loaded (before scrolling begins)
+    time.sleep(0.5)  # Brief wait for topic filters to render
+    review_topics = extract_review_topics()
+    if review_topics:
+        log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
+
    def get_api_reviews():
        """Get reviews from intercepted API responses."""
        api_revs = []
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        "total_flushed": total_flushed[0],
        "checks": check_num,
        "url": url,
-        "logs": log.get_logs()
+        "logs": log.get_logs(),
+        "review_topics": review_topics  # Topic filters with mention counts
    }


 def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False,
-                        log_capture: LogCapture = None):
+                        log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
+                        browser_fingerprint: dict = None):
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        driver: Existing driver instance to reuse
        return_driver: If True, return driver in result
        log_capture: Optional LogCapture instance for real-time log access
+        browser_fingerprint: Optional dict with user's browser fingerprint:
+            - geolocation: {lat, lng}
+            - userAgent: string
+            - viewport: {width, height}
+            - timezone: string (e.g., "Europe/Madrid")
+            - language: string (e.g., "en-US")
+            - platform: string (e.g., "MacIntel")

    Returns:
        Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
    log_capture = log_capture or LogCapture()

    try:
+        # Extract fingerprint settings
+        fp = browser_fingerprint or {}
+        user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+        viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
+        geolocation = fp.get('geolocation')
+        timezone = fp.get('timezone')
+        language = fp.get('language', 'en-US')
+
        # Create driver if not provided
        if not driver:
            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
-                agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+                agent=user_agent  # Use user's actual user agent
            )
-            driver.set_window_size(1200, 900)  # Proper viewport for Google Maps
+            # Set viewport to match user's screen
+            driver.set_window_size(viewport['width'], viewport['height'])

-        # Set Chrome geolocation to US (Boston, MA) using CDP
-        # This ensures Google Maps shows US results regardless of server location
+        # Apply browser fingerprint settings via CDP
        try:
-            driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
-                'latitude': 42.3601,
-                'longitude': -71.0589,
-                'accuracy': 100
-            })
-            log_capture.info("Set geolocation to US (Boston, MA)")
+            # Set timezone if provided
+            if timezone:
+                driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
+                log_capture.info(f"Set timezone to {timezone}")
+
+            # Set locale/language
+            driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
+
+            # Set geolocation
+            if geolocation and 'lat' in geolocation and 'lng' in geolocation:
+                driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+                    'latitude': geolocation['lat'],
+                    'longitude': geolocation['lng'],
+                    'accuracy': 1000  # ~1km accuracy for IP-based location
+                })
+                log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
+            else:
+                # Default to US (Boston, MA) if no geolocation provided
+                driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+                    'latitude': 42.3601,
+                    'longitude': -71.0589,
+                    'accuracy': 100
+                })
+                log_capture.info("Set geolocation to US (Boston, MA) [default]")
+
+            if fp:
+                log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
        except Exception as e:
-            log_capture.warning(f"Could not set geolocation: {e}")
+            log_capture.warning(f"Could not apply fingerprint settings: {e}")

        # Add URL parameters for consistent results
        if 'hl=' not in url:
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        if 'gl=' not in url:
            url = f"{url}&gl=us"

-        # Create progress wrapper if callback provided
-        flush_callback = None
-        if progress_callback:
+        # Create combined flush callback for progress + external handler
+        external_flush = flush_callback  # Save external callback
+        internal_flush = None
+        if progress_callback or external_flush:
            collected = [0]
-            def flush_with_progress(reviews_batch):
-                collected[0] += len(reviews_batch)
-                progress_callback(collected[0], None)
-            flush_callback = flush_with_progress
+            def combined_flush(reviews_batch):
+                collected[0] = len(reviews_batch)  # reviews_batch is ALL reviews so far
+                if progress_callback:
+                    progress_callback(collected[0], None)
+                if external_flush:
+                    external_flush(reviews_batch)  # Pass reviews to external handler
+            internal_flush = combined_flush

        # Run the scraper with progress callback for real-time updates
        result = scrape_reviews(
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
-            flush_callback=flush_callback,
+            flush_callback=internal_flush,
            flush_batch_size=100,  # Smaller batches for more frequent progress
            log_capture=log_capture,
-            progress_callback=progress_callback  # Pass through for real-time log updates
+            progress_callback=progress_callback,  # Pass through for real-time log updates
+            validation_only=validation_only  # Return early if just validating
        )

        elapsed = time.time() - start_time
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "time": elapsed,
            "success": True,
            "error": None,
-            "logs": result.get("logs", [])
+            "logs": result.get("logs", []),
+            "review_topics": result.get("review_topics", [])  # Topic filters with mention counts
        }

+        # Include validation_info if in validation_only mode
+        if validation_only and "validation_info" in result:
+            response["validation_info"] = result["validation_info"]
+
        if return_driver:
            response["driver"] = driver
        elif should_close_driver:
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        }


+def extract_about_info(driver, url: str = None) -> dict:
+    """
+    Extract About section info from Google Maps (Accessibility, Amenities, etc.).
+
+    Call AFTER reviews are scraped (it switches tabs). Parsing is English-only, so
+    a given ``url`` gets ``hl`` forced to ``en`` and ``gl=us`` added when missing.
+
+    Args:
+        driver: Selenium WebDriver instance (already on the business page)
+        url: Optional URL to navigate to first (if not already on the page)
+
+    Returns:
+        dict of section name -> list of {"feature": str, "available": bool};
+        {} when the About tab cannot be clicked, {"error": str} on any exception
+    """
+    import re  # function-scope import keeps the fix local to this block
+    try:
+        if url:
+            # Overwrite any hl=<lang>: localized links (?hl=es) never match "About"
+            hl_param = re.compile(r"([?&]hl=)[^&#]*")
+            if hl_param.search(url):
+                url = hl_param.sub(r"\g<1>en", url, count=1)
+            else:
+                url += ("&" if "?" in url else "?") + "hl=en"
+            if not re.search(r"[?&]gl=", url):
+                url += ("&" if "?" in url else "?") + "gl=us"
+            driver.get(url)
+            time.sleep(1)
+
+        # Click About tab using robust selectors
+        clicked = driver.execute_script("""
+            // Try multiple selectors for about tab
+            var selectors = [
+                'button[aria-label*="About"]',
+                'button[data-tab-index="2"]',
+                'div[role="tablist"] button:nth-child(3)',
+                'button[jsaction*="about"]'
+            ];
+
+            for (var sel of selectors) {
+                var btn = document.querySelector(sel);
+                if (btn && btn.textContent.toLowerCase().includes('about')) {
+                    btn.click();
+                    return true;
+                }
+            }
+
+            // Fallback: find by text content
+            var buttons = document.querySelectorAll('button');
+            for (var btn of buttons) {
+                if (btn.textContent.trim().toLowerCase() === 'about') {
+                    btn.click();
+                    return true;
+                }
+            }
+            return false;
+        """)
+        if not clicked:
+            return {}
+
+        time.sleep(1.5)  # Wait for about tab to load
+        # Extract about sections using aria-labels (robust)
+        about = driver.execute_script("""
+            var about = {};
+
+            // Find the about region by aria-label or role
+            var container = document.querySelector('div[role="region"][aria-label*="About"]');
+
+            if (!container) {
+                // Fallback: look for the scrollable area with sections
+                container = document.querySelector('.m6QErb[aria-label*="About"]');
+            }
+
+            if (!container) {
+                // Last resort: find sections by h2 headers
+                container = document;
+            }
+
+            // Find all section headers (h2 elements)
+            var sections = container.querySelectorAll('h2');
+
+            for (var h2 of sections) {
+                var sectionName = h2.textContent.trim();
+                var items = [];
+
+                // Find the ul list following this h2
+                var parent = h2.closest('.iP2t7d, div');
+                if (parent) {
+                    var listItems = parent.querySelectorAll('li span[aria-label]');
+                    for (var li of listItems) {
+                        var label = li.getAttribute('aria-label');
+                        if (label) {
+                            // Parse "Has toilet" or "No wheelchair-accessible car park"
+                            var hasFeature = !label.toLowerCase().startsWith('no ');
+                            var featureName = label.replace(/^(Has |No )/i, '');
+                            items.push({
+                                feature: featureName,
+                                available: hasFeature
+                            });
+                        }
+                    }
+                }
+
+                if (sectionName && items.length > 0) {
+                    about[sectionName] = items;
+                }
+            }
+
+            return about;
+        """)
+        return about or {}
+
+    except Exception as e:
+        return {"error": str(e)}
+
+
 # Test function
 if __name__ == "__main__":
    from seleniumbase import Driver
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
        dict with: name, address, rating, total_reviews, success, error, time
    """
    from seleniumbase import Driver
+    import logging
+    log = logging.getLogger(__name__)

    start_time = time.time()
    driver_provided = driver is not None
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
        except:
            pass

-        # Clear state if reusing a pooled driver (ensures clean page load)
-        if driver_provided:
-            try:
-                driver.delete_all_cookies()
-                driver.get("about:blank")
-            except:
-                pass
+        # Don't clear state - Google may serve different content based on session history
+        # The scraper doesn't reset state, so validation shouldn't either
+
+        # Force English interface for consistent parsing
+        if 'hl=' not in url:
+            separator = '&' if '?' in url else '?'
+            url = f"{url}{separator}hl=en"
+        if 'gl=' not in url:
+            url = f"{url}&gl=us"

        # Navigate to URL
        driver.get(url)
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
        while time.time() - start < 5:
            if "consent.google" in driver.current_url:
                try:
-                    for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
-                        txt = btn.text.lower()
-                        if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
-                            btn.click()
-                            driver.get(url)
-                            break
-                except:
+                    # Try multiple approaches to find and click accept button
+                    clicked = False
+
+                    # Method 1: Find by aria-label (most reliable for Google consent)
+                    for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
+                        btn.click()
+                        clicked = True
+                        break
+
+                    # Method 2: Find by text content
+                    if not clicked:
+                        for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
+                            txt = btn.text.lower()
+                            if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
+                                btn.click()
+                                clicked = True
+                                break
+
+                    if clicked:
+                        time.sleep(0.5)  # Brief wait for consent to process
+                        driver.get(url)  # Reload the target URL
+                        time.sleep(0.5)  # Wait for reload
+                except Exception as e:
                    pass
                break
            if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
                break
            time.sleep(0.01)  # 10ms - responsive but low CPU

+        # Log current URL after consent handling
+        try:
+            current_url = driver.current_url
+            log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
+        except:
+            pass
+
+        # Wait for page to fully render before polling (tabs may load dynamically)
+        time.sleep(2)
+
        # Poll for business info (same pattern as total_reviews extraction)
-        info = {"name": None, "rating": None, "total_reviews": None, "address": None}
+        # Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
+        info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
        start = time.time()
-        while time.time() - start < 5:
+        debug_logged = False
+        while time.time() - start < 10:
            try:
                info = driver.execute_script("""
-                    var result = {name: null, rating: null, total_reviews: null, address: null};
+                    var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};

                    // Business name from h1
                    var h1 = document.querySelector('h1');
                    if (h1) result.name = h1.textContent.trim();

-                    // Rating and reviews from span[role="img"] aria-labels
-                    // Same pattern as scrape_reviews for consistency
+                    // Category - use jsaction attribute (robust, survives class changes)
+                    var catBtn = document.querySelector('button[jsaction*="category"]');
+                    if (catBtn) result.category = catBtn.textContent.trim();
+
+                    // Fallback: look for button after rating that's not a link
+                    if (!result.category) {
+                        var buttons = document.querySelectorAll('button');
+                        for (var btn of buttons) {
+                            var text = btn.textContent.trim();
+                            // Categories are short words, no numbers, not navigation
+                            if (text && text.length < 50 && !text.match(/^[0-9]/) &&
+                                !text.match(/review|star|direction|save|share|photo/i)) {
+                                // Check if it's near the rating area
+                                var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
+                                if (parent) {
+                                    result.category = text;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+
+                    // Rating from span[role="img"] aria-labels
                    var spans = document.querySelectorAll('span[role="img"]');
                    for (var i = 0; i < spans.length; i++) {
                        var label = spans[i].getAttribute('aria-label') || '';

-                        // Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
-                        var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
+                        // Collect debug info for all aria-labels
+                        if (label) {
+                            result.debug.push('img-aria: ' + label);
+                        }
+
+                        // Rating: "4.8 stars" (English forced via hl=en)
+                        var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
                        if (rMatch && !result.rating) {
                            result.rating = parseFloat(rMatch[1].replace(',', '.'));
                        }

-                        // Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
-                        // Plus Spanish "reseña" which doesn't contain "review"
-                        var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
+                        // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
+                        // Try direct format first: "79 reviews"
+                        var revMatch = label.match(/^([\\d,]+)\\s*review/i);
                        if (revMatch && !result.total_reviews) {
-                            result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
+                            result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
+                        }
+
+                        // Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
+                        if (!result.total_reviews) {
+                            var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
+                            if (combinedMatch) {
+                                var countStr = combinedMatch[1].replace(/,/g, '');
+                                if (countStr.includes('k')) {
+                                    // Handle "9k+" format
+                                    result.total_reviews = parseInt(countStr) * 1000;
+                                } else {
+                                    result.total_reviews = parseInt(countStr);
+                                }
+                            }
+                        }
+                    }
+
+                    // Also collect tab button texts for debugging (include full text including numbers)
+                    var tabs = document.querySelectorAll('button[role="tab"]');
+                    for (var j = 0; j < tabs.length; j++) {
+                        var tabText = tabs[j].textContent.trim();
+                        result.debug.push('tab: ' + tabText);
+                        // Also try to extract review count from tab text like "Reviews (79)"
+                        if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
+                            var tabMatch = tabText.match(/\\((\\d+)\\)/);
+                            if (tabMatch) {
+                                result.total_reviews = parseInt(tabMatch[1]);
+                                result.debug.push('Found reviews in tab: ' + tabText);
+                            }
+                        }
+                    }
+
+                    // Also check ALL buttons for reviews count
+                    var allButtons = document.querySelectorAll('button');
+                    for (var b = 0; b < allButtons.length; b++) {
+                        var btnText = allButtons[b].textContent || '';
+                        if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
+                            var numMatch = btnText.match(/\\((\\d+)\\)/);
+                            if (numMatch && !result.total_reviews) {
+                                result.total_reviews = parseInt(numMatch[1]);
+                                result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
+                            }
+                        }
+                    }
+
+                    // Check if we're on search results vs place page
+                    result.debug.push('title: ' + document.title);
+                    result.debug.push('url: ' + window.location.href.substring(0, 80));
+
+                    // Check for search results list
+                    var searchResults = document.querySelectorAll('div[role="feed"] > div');
+                    result.debug.push('search_results_count: ' + searchResults.length);
+
+                    // Fallback: Get review count from Reviews tab button "Reviews (79)"
+                    // Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
+                    if (!result.total_reviews) {
+                        var tabs = document.querySelectorAll('button[role="tab"]');
+                        for (var tab of tabs) {
+                            var text = tab.textContent.toLowerCase();
+                            if (text.includes('review')) {
+                                var match = tab.textContent.match(/\\((\\d+)\\)/);
+                                if (match) {
+                                    result.total_reviews = parseInt(match[1]);
+                                    break;
+                                }
+                            }
+                        }
+                    }
+
+                    // Fallback 2: Look for any button with "Reviews" and a number
+                    if (!result.total_reviews) {
+                        var buttons = document.querySelectorAll('button');
+                        for (var btn of buttons) {
+                            var text = btn.textContent;
+                            if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
+                                var numMatch = text.match(/\\((\\d+)\\)/);
+                                if (numMatch) {
+                                    result.total_reviews = parseInt(numMatch[1]);
+                                    break;
+                                }
+                            }
                        }
                    }

@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
                    var addrBtn = document.querySelector('button[data-item-id="address"]');
                    if (addrBtn) {
                        var label = addrBtn.getAttribute('aria-label');
-                        if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
+                        if (label) result.address = label.replace(/^Address:\\s*/i, '');
                    }

                    return result;
                """)
-                # Exit early if we have the essentials
-                if info.get("name") and info.get("total_reviews") is not None:
+                # Exit early if we have the essentials (name found AND reviews count > 0)
+                if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
                    break
+
+                # Log debug info once after 3 seconds
+                if not debug_logged and time.time() - start > 3:
+                    debug_logged = True
+                    debug_info = info.get("debug", [])
+                    if debug_info:
+                        log.info(f"🔍 Validation debug - URL: {url[:50]}...")
+                        log.info(f"   Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
+                        for d in debug_info[:10]:  # First 10 debug items
+                            log.info(f"   {d}")
            except:
                pass
            time.sleep(0.1)  # 100ms between polls

+        # Final debug log if still no reviews
+        if not info.get("total_reviews"):
+            debug_info = info.get("debug", [])
+            log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
+            if debug_info:
+                log.warning(f"   Debug items: {debug_info[:10]}")
+
        return {
            "name": info.get("name"),
            "address": info.get("address"),
            "rating": info.get("rating"),
            "total_reviews": info.get("total_reviews"),
+            "category": info.get("category"),
            "success": bool(info.get("name")),
            "error": None,
            "time": time.time() - start_time
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
            "address": None,
            "rating": None,
            "total_reviews": None,
+            "category": None,
            "success": False,
            "error": str(e),
            "time": time.time() - start_time