Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/scrapers/google_reviews/session_manager.py
+++ b/scrapers/google_reviews/session_manager.py
@@ -0,0 +1,266 @@
+"""
+Session Manager for Google Reviews Scraper
+
+Manages browser sessions between validation and scraping phases.
+Allows reusing the same browser instance to avoid duplicate navigation.
+
+Usage:
+    # During validation
+    session_id = session_manager.create_session(driver, url, business_info, total_reviews)
+    return {"session_id": session_id, "business_info": business_info}
+
+    # During scraping (with session_id from validation)
+    session = session_manager.get_session(session_id)
+    if session:
+        driver = session.driver  # BrowserSession is a dataclass, not a dict
+        # Continue from where validation left off
+"""
+
+import uuid
+import time
+import threading
+from typing import Optional, Dict, Any
+from dataclasses import dataclass, field
+from datetime import datetime
+
+
+@dataclass
+class BrowserSession:
+    """Represents a validated browser session ready for scraping.
+
+    Instances are created by SessionManager.create_session and stored in the
+    manager's session table keyed by session_id. Field order is part of the
+    generated constructor signature, so do not reorder fields.
+    """
+    # Key under which SessionManager stores this session
+    session_id: str
+    # Object exposing quit(); SessionManager calls driver.quit() on release
+    driver: Any  # WebDriver instance
+    # URL passed to create_session (documented there as the validated Google Maps URL)
+    url: str
+    # Business data as passed to create_session; the manager only reads its "name" key
+    business_info: Dict[str, Any]
+    # Review count as passed to create_session
+    total_reviews: int
+    # time.time() value captured when the session was created
+    created_at: float
+    # time.time() value after which the manager treats the session as expired
+    expires_at: float
+    # Optional fingerprint settings as passed to create_session
+    browser_fingerprint: Optional[Dict[str, Any]] = None
+    log_capture: Any = None  # LogCapture instance
+    # Track session state
+    # NOTE(review): within this module only "validated" (at creation) and
+    # "scraping" (in claim_session) are ever assigned; "completed"/"expired"
+    # are presumably set elsewhere, if at all - confirm.
+    state: str = "validated"  # validated -> scraping -> completed/expired
+
+
+class SessionManager:
+    """
+    Manages browser sessions between validation and scraping.
+
+    Sessions have a TTL (default 5 minutes) after which they're automatically
+    cleaned up and the browser is closed.
+
+    All access to the session table is guarded by a re-entrant lock. Browsers
+    are always closed *outside* that lock so that a slow ``driver.quit()``
+    cannot stall other callers.
+    """
+
+    DEFAULT_TTL_SECONDS = 300  # 5 minutes
+    CLEANUP_INTERVAL_SECONDS = 30  # Check for expired sessions every 30s
+    SCRAPING_TTL_SECONDS = 3600  # TTL granted once a session is claimed (1 hour)
+
+    def __init__(self, ttl_seconds: Optional[int] = None):
+        """
+        Args:
+            ttl_seconds: Default TTL for new sessions. ``None`` (or 0) falls
+                back to ``DEFAULT_TTL_SECONDS``.
+        """
+        self.ttl_seconds = ttl_seconds or self.DEFAULT_TTL_SECONDS
+        self._sessions: Dict[str, "BrowserSession"] = {}
+        self._lock = threading.RLock()
+        self._cleanup_thread: Optional[threading.Thread] = None
+        self._running = False
+        # Set by stop() to wake the cleanup thread; replaced on every start().
+        self._stop_event = threading.Event()
+
+    def start(self):
+        """Start the background cleanup thread (no-op if already running)."""
+        with self._lock:
+            if self._running:
+                return
+            self._running = True
+            # Fresh event per run: a previous thread that is still exiting
+            # keeps its own (already set) event and cannot be revived.
+            stop_event = threading.Event()
+            self._stop_event = stop_event
+            self._cleanup_thread = threading.Thread(
+                target=self._cleanup_loop, args=(stop_event,), daemon=True
+            )
+            self._cleanup_thread.start()
+
+    def stop(self):
+        """
+        Stop the background cleanup thread.
+
+        Wakes the thread immediately instead of waiting out its sleep
+        interval, then waits up to 5 seconds for it to exit. Sessions that
+        are still registered are left untouched.
+        """
+        with self._lock:
+            self._running = False
+            self._stop_event.set()
+            thread = self._cleanup_thread
+        # Join outside the lock: the cleanup thread needs the lock to finish.
+        if thread is not None and thread is not threading.current_thread():
+            thread.join(timeout=5)
+
+    def _cleanup_loop(self, stop_event: Optional[threading.Event] = None):
+        """Background loop to clean up expired sessions."""
+        if stop_event is None:
+            stop_event = self._stop_event
+        while self._running and not stop_event.is_set():
+            try:
+                self._cleanup_expired()
+            except Exception as e:
+                print(f"[SessionManager] Cleanup error: {e}")
+            # Interruptible sleep: returns as soon as stop() sets the event.
+            stop_event.wait(self.CLEANUP_INTERVAL_SECONDS)
+
+    def _cleanup_expired(self):
+        """Remove expired sessions and close their browsers."""
+        now = time.time()
+
+        # Decide and remove atomically so a session extended or claimed by
+        # another thread in the meantime is never closed by mistake.
+        with self._lock:
+            expired_ids = [
+                sid for sid, s in self._sessions.items() if now > s.expires_at
+            ]
+            expired = [(sid, self._sessions.pop(sid)) for sid in expired_ids]
+
+        for sid, session in expired:
+            self._close_session(sid, session, reason="expired")
+
+    def _close_session(self, session_id: str, session: Any, reason: str):
+        """Log the release and quit the session's browser (best effort)."""
+        print(f"[SessionManager] Releasing session {session_id} ({reason})")
+        try:
+            if session.driver:
+                session.driver.quit()
+        except Exception as e:
+            print(f"[SessionManager] Error closing driver for {session_id}: {e}")
+
+    def _lookup_locked(self, session_id: str) -> tuple:
+        """
+        Look up a session; the caller must hold ``self._lock``.
+
+        Returns:
+            ``(live_session, expired_session)`` where at most one element is
+            not None. An expired session is removed from the table here so no
+            other thread can obtain it; the caller must close it after
+            releasing the lock.
+        """
+        session = self._sessions.get(session_id)
+        if not session:
+            print(f"[SessionManager] Session {session_id} not found")
+            return None, None
+
+        if time.time() > session.expires_at:
+            print(f"[SessionManager] Session {session_id} expired")
+            del self._sessions[session_id]
+            return None, session
+
+        return session, None
+
+    def create_session(
+        self,
+        driver: Any,
+        url: str,
+        business_info: Dict[str, Any],
+        total_reviews: int,
+        browser_fingerprint: Optional[Dict[str, Any]] = None,
+        log_capture: Any = None,
+        ttl_seconds: Optional[int] = None
+    ) -> str:
+        """
+        Create a new browser session after validation.
+
+        Args:
+            driver: WebDriver instance (positioned on Google Maps page)
+            url: The validated Google Maps URL
+            business_info: Extracted business information
+            total_reviews: Total review count from page
+            browser_fingerprint: Browser fingerprint settings used
+            log_capture: LogCapture instance for logging
+            ttl_seconds: Custom TTL for this session (default: 5 min)
+
+        Returns:
+            session_id: Unique identifier for this session
+        """
+        ttl = ttl_seconds or self.ttl_seconds
+        now = time.time()
+
+        with self._lock:
+            # Short ID for easier use. Regenerate on the (unlikely) collision
+            # so an existing session and its browser are never overwritten.
+            session_id = str(uuid.uuid4())[:8]
+            while session_id in self._sessions:
+                session_id = str(uuid.uuid4())[:8]
+
+            self._sessions[session_id] = BrowserSession(
+                session_id=session_id,
+                driver=driver,
+                url=url,
+                business_info=business_info,
+                total_reviews=total_reviews,
+                created_at=now,
+                expires_at=now + ttl,
+                browser_fingerprint=browser_fingerprint,
+                log_capture=log_capture,
+                state="validated"
+            )
+
+        # Tolerate a missing business_info: failing here would lose the id of
+        # a session that is already registered.
+        business_name = (business_info or {}).get('name', 'unknown')
+        print(f"[SessionManager] Created session {session_id} for {business_name} (TTL: {ttl}s)")
+        return session_id
+
+    def get_session(self, session_id: str) -> Optional["BrowserSession"]:
+        """
+        Retrieve a session by ID.
+
+        If the session has expired it is removed and its browser is closed.
+
+        Args:
+            session_id: The session identifier
+
+        Returns:
+            BrowserSession if found and not expired, None otherwise
+        """
+        with self._lock:
+            session, stale = self._lookup_locked(session_id)
+
+        if stale is not None:
+            self._close_session(session_id, stale, reason="expired")
+        return session
+
+    def claim_session(self, session_id: str) -> Optional["BrowserSession"]:
+        """
+        Claim a session for scraping (marks it as in-use).
+
+        Only a session in the "validated" state can be claimed; claiming moves
+        it to "scraping" and resets its expiry to ``SCRAPING_TTL_SECONDS``
+        from now.
+
+        Args:
+            session_id: The session identifier
+
+        Returns:
+            BrowserSession if successfully claimed, None otherwise
+        """
+        claimed = None
+        with self._lock:
+            session, stale = self._lookup_locked(session_id)
+            if session is not None:
+                if session.state != "validated":
+                    print(f"[SessionManager] Session {session_id} already in state: {session.state}")
+                else:
+                    session.state = "scraping"
+                    # Extend TTL during scraping (1 hour max)
+                    session.expires_at = time.time() + self.SCRAPING_TTL_SECONDS
+                    print(f"[SessionManager] Claimed session {session_id} for scraping")
+                    claimed = session
+
+        if stale is not None:
+            self._close_session(session_id, stale, reason="expired")
+        return claimed
+
+    def release_session(self, session_id: str, reason: str = "completed"):
+        """
+        Release a session and close the browser.
+
+        Unknown session IDs are ignored.
+
+        Args:
+            session_id: The session identifier
+            reason: Why the session is being released
+        """
+        with self._lock:
+            session = self._sessions.pop(session_id, None)
+
+        if session:
+            self._close_session(session_id, session, reason)
+
+    def extend_session(self, session_id: str, additional_seconds: int = 300) -> bool:
+        """
+        Extend a session's TTL.
+
+        Guarantees the session stays alive for at least ``additional_seconds``
+        from now. The expiry is never moved earlier, so extending a session
+        that already has a longer TTL (e.g. one claimed for scraping) leaves
+        it unchanged.
+
+        Args:
+            session_id: The session identifier
+            additional_seconds: Minimum remaining lifetime, in seconds
+
+        Returns:
+            True if extended, False if session not found
+        """
+        with self._lock:
+            session = self._sessions.get(session_id)
+            if not session:
+                return False
+            session.expires_at = max(
+                session.expires_at, time.time() + additional_seconds
+            )
+            return True
+
+    def get_stats(self) -> Dict[str, Any]:
+        """
+        Get session manager statistics.
+
+        Returns:
+            Dict with "total_sessions" and a "sessions" list holding, per
+            session, its id, business name, state, age and remaining TTL
+            (whole seconds; remaining TTL is negative once expired).
+        """
+        with self._lock:
+            now = time.time()
+            sessions = [
+                {
+                    "session_id": sid,
+                    "business": (s.business_info or {}).get("name", "unknown"),
+                    "state": s.state,
+                    "age_seconds": int(now - s.created_at),
+                    "ttl_remaining": int(s.expires_at - now)
+                }
+                for sid, s in self._sessions.items()
+            ]
+            return {
+                "total_sessions": len(self._sessions),
+                "sessions": sessions
+            }
+
+    def list_sessions(self) -> list:
+        """List the IDs of all registered sessions."""
+        with self._lock:
+            return list(self._sessions)
+
+
+# Global singleton instance
+_session_manager: Optional[SessionManager] = None
+
+
+def get_session_manager() -> SessionManager:
+    """Return the process-wide SessionManager, creating and starting it on first use."""
+    global _session_manager
+    # Fast path: the singleton already exists.
+    if _session_manager is not None:
+        return _session_manager
+
+    # First call: build the manager, then launch its cleanup thread.
+    _session_manager = SessionManager()
+    _session_manager.start()
+    return _session_manager
--- a/scrapers/google_reviews/v1_1_0.py
+++ b/scrapers/google_reviews/v1_1_0.py
@@ -732,7 +732,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                   progress_callback=None, validation_only: bool = False,
                   sort_strategy: str = SORT_AUTO, sort_order: List[str] = None,
                   multi_sort_threshold: int = MULTI_SORT_THRESHOLD,
-                   close_enough_pct: float = 95.0) -> dict:
+                   close_enough_pct: float = 95.0, initial_sort: str = None) -> dict:
    """
    Scrape Google Maps reviews with optional multi-sort strategy.

@@ -754,6 +754,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant)
        multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000)
        close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0)
+        initial_sort: Initial sort order to use (default: newest). Used for retry with different sort

    Returns:
        dict with reviews list and metadata
@@ -1381,8 +1382,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                log.info('browser', "Sort button found")
                break

+        # Track bot detection - if sort button hidden, Google likely detected bot
+        bot_detected = not sort_found
        if not sort_found:
-            log.warn('browser', "Sort button not found after waiting, continuing without sorting")
+            log.warn('browser', "Sort button not found after waiting, continuing without sorting (bot detection likely)")

        # Sort by specified order (default: newest)
        target_sort = initial_sort or SORT_NEWEST
@@ -1815,6 +1818,71 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    }
                    text = longestText;

+                    // OWNER RESPONSE: Find by "Response from the owner" text anchor
+                    var ownerResponse = null;
+                    var ownerSpan = null;
+                    var cardSpans = card.querySelectorAll('span');
+                    for (var k = 0; k < cardSpans.length; k++) {
+                        if (cardSpans[k].textContent.trim() === 'Response from the owner') {
+                            ownerSpan = cardSpans[k];
+                            break;
+                        }
+                    }
+
+                    if (ownerSpan) {
+                        // Navigate: span -> header div -> container div
+                        var headerDiv = ownerSpan.closest('div');
+                        var respContainer = headerDiv ? headerDiv.parentElement : null;
+
+                        if (respContainer) {
+                            // Click expand button if exists and not expanded
+                            var expandBtn = respContainer.querySelector('button[aria-label="See more"]');
+                            if (expandBtn && expandBtn.getAttribute('aria-expanded') !== 'true') {
+                                expandBtn.click();
+                            }
+
+                            // Get timestamp from header spans
+                            var respTimestamp = '';
+                            var headerSpans = headerDiv.querySelectorAll('span');
+                            for (var m = 0; m < headerSpans.length; m++) {
+                                var spanTxt = headerSpans[m].textContent.trim();
+                                if (spanTxt.match(/ago$/i)) {
+                                    respTimestamp = spanTxt;
+                                    break;
+                                }
+                            }
+
+                            // Get response text from direct child div[lang]
+                            var respText = '';
+                            var langDivs = respContainer.children;
+                            for (var m = 0; m < langDivs.length; m++) {
+                                if (langDivs[m].tagName === 'DIV' && langDivs[m].hasAttribute('lang')) {
+                                    respText = langDivs[m].textContent.trim();
+                                    respText = respText.replace(/(More|Less)$/, '').trim();
+                                    break;
+                                }
+                            }
+
+                            // Fallback: find longest text div that's not the header
+                            if (!respText) {
+                                for (var m = 0; m < langDivs.length; m++) {
+                                    if (langDivs[m].tagName === 'DIV') {
+                                        var divTxt = langDivs[m].textContent.trim();
+                                        if (divTxt.includes('Response from the owner')) continue;
+                                        divTxt = divTxt.replace(/(More|Less)$/, '').trim();
+                                        if (divTxt.length > respText.length) {
+                                            respText = divTxt;
+                                        }
+                                    }
+                                }
+                            }
+
+                            if (respText) {
+                                ownerResponse = {text: respText, timestamp: respTimestamp};
+                            }
+                        }
+                    }
+
                    if (author && rating >= 1 && rating <= 5) {
                        results.push({
                            id: rid,
@@ -1823,6 +1891,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                            text: text,
                            rating: rating,
                            timestamp: timestamp,
+                            owner_response: ownerResponse,
                            source: 'dom'
                        });
                    }
@@ -2198,6 +2267,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                topics_inferred_count += 1
        log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})

+    # Include business info captured from Overview page
+    business_info = business_info_cache[0] or {}
+
    return {
        "reviews": review_list,  # Only unflushed reviews (flushed already sent to callback)
        "total": grand_total,
@@ -2209,10 +2281,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        "metrics_history": metrics_history,  # For crash detection
        "start_time": start_time,  # For crash report elapsed time
        "session_fingerprint": session_fingerprint,  # Browser fingerprint for bot detection analysis
+        "bot_detected": bot_detected if 'bot_detected' in dir() else False,  # True if sort button was hidden
+        "initial_sort_used": target_sort if 'target_sort' in dir() else SORT_NEWEST,  # Sort order used for first pass
        "multi_sort": {
            "enabled": should_multi_sort if 'should_multi_sort' in dir() else False,
            "completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST],
            "first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total
+        },
+        # Business info captured from Google Maps page
+        "business_info": {
+            "name": business_info.get("name"),
+            "category": business_info.get("category"),
+            "address": business_info.get("address"),
+            "rating": business_info.get("rating")
        }
    }

@@ -2220,7 +2301,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
 def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False,
                        log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
-                        browser_fingerprint: dict = None):
+                        browser_fingerprint: dict = None, initial_sort: str = None,
+                        sort_strategy: str = SORT_AUTO, max_reviews: int = None):
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.
@@ -2240,6 +2322,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            - timezone: string (e.g., "Europe/Madrid")
            - language: string (e.g., "en-US")
            - platform: string (e.g., "MacIntel")
+        initial_sort: Initial sort order to use ("newest", "lowest", "highest", "relevant")
+                     Used for retry with different sort strategy
+        sort_strategy: Sort strategy ("auto", "multi", "single", or specific sort)
+        max_reviews: Maximum reviews to collect (for testing). None = unlimited (default: 5000)

    Returns:
        Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -2329,13 +2415,15 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        result = scrape_reviews(
            driver=driver,
            url=url,
-            max_reviews=999999,  # Effectively unlimited
+            max_reviews=max_reviews if max_reviews else 999999,  # Unlimited by default, or custom limit for testing
            timeout_no_new=15,
            flush_callback=internal_flush,
            flush_batch_size=100,  # Smaller batches for more frequent progress
            log_capture=log_capture,
            progress_callback=progress_callback,  # Pass through for real-time log updates
-            validation_only=validation_only  # Return early if just validating
+            validation_only=validation_only,  # Return early if just validating
+            sort_strategy=sort_strategy,  # Sort strategy (auto, multi, single)
+            initial_sort=initial_sort  # Initial sort order for retry with different sort
        )

        elapsed = time.time() - start_time
@@ -2350,7 +2438,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "error": None,
            "logs": result.get("logs", []),
            "review_topics": result.get("review_topics", []),  # Topic filters with mention counts
-            "session_fingerprint": result.get("session_fingerprint")  # Browser fingerprint for bot detection
+            "session_fingerprint": result.get("session_fingerprint"),  # Browser fingerprint for bot detection
+            # Tracking info for retry strategy
+            "bot_detected": result.get("bot_detected", False),  # True if sort button was hidden by Google
+            "initial_sort_used": result.get("initial_sort_used", "newest"),  # Sort order used
+            "multi_sort": result.get("multi_sort", {}),  # Multi-sort completion info
+            # Business info captured from Google Maps page
+            "business_info": result.get("business_info", {})
        }

        # Include validation_info if in validation_only mode
--- a/scrapers/google_reviews/v1_2_0.py
+++ b/scrapers/google_reviews/v1_2_0.py