Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/scrapers/google_reviews/v1_1_0.py
+++ b/scrapers/google_reviews/v1_1_0.py
@@ -732,7 +732,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                   progress_callback=None, validation_only: bool = False,
                   sort_strategy: str = SORT_AUTO, sort_order: List[str] = None,
                   multi_sort_threshold: int = MULTI_SORT_THRESHOLD,
-                   close_enough_pct: float = 95.0) -> dict:
+                   close_enough_pct: float = 95.0, initial_sort: str = None) -> dict:
    """
    Scrape Google Maps reviews with optional multi-sort strategy.

@@ -754,6 +754,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant)
        multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000)
        close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0)
+        initial_sort: Sort order for the first pass (default: newest). Lets a retry use a different sort order

    Returns:
        dict with reviews list and metadata
@@ -1381,8 +1382,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                log.info('browser', "Sort button found")
                break

+        # Track bot detection - if sort button hidden, Google likely detected bot
+        bot_detected = not sort_found
        if not sort_found:
-            log.warn('browser', "Sort button not found after waiting, continuing without sorting")
+            log.warn('browser', "Sort button not found after waiting, continuing without sorting (bot detection likely)")

        # Sort by specified order (default: newest)
        target_sort = initial_sort or SORT_NEWEST
@@ -1815,6 +1818,71 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                    }
                    text = longestText;

+                    // OWNER RESPONSE: Find by "Response from the owner" text anchor
+                    var ownerResponse = null;
+                    var ownerSpan = null;
+                    var cardSpans = card.querySelectorAll('span');
+                    for (var k = 0; k < cardSpans.length; k++) {
+                        if (cardSpans[k].textContent.trim() === 'Response from the owner') {
+                            ownerSpan = cardSpans[k];
+                            break;
+                        }
+                    }
+
+                    if (ownerSpan) {
+                        // Navigate: span -> header div -> container div
+                        var headerDiv = ownerSpan.closest('div');
+                        var respContainer = headerDiv ? headerDiv.parentElement : null;
+
+                        if (respContainer) {
+                            // Click expand button if exists and not expanded
+                            var expandBtn = respContainer.querySelector('button[aria-label="See more"]');
+                            if (expandBtn && expandBtn.getAttribute('aria-expanded') !== 'true') {
+                                expandBtn.click();
+                            }
+
+                            // Get timestamp from header spans
+                            var respTimestamp = '';
+                            var headerSpans = headerDiv.querySelectorAll('span');
+                            for (var m = 0; m < headerSpans.length; m++) {
+                                var spanTxt = headerSpans[m].textContent.trim();
+                                if (spanTxt.match(/ago$/i)) {
+                                    respTimestamp = spanTxt;
+                                    break;
+                                }
+                            }
+
+                            // Get response text from direct child div[lang]
+                            var respText = '';
+                            var langDivs = respContainer.children;
+                            for (var m = 0; m < langDivs.length; m++) {
+                                if (langDivs[m].tagName === 'DIV' && langDivs[m].hasAttribute('lang')) {
+                                    respText = langDivs[m].textContent.trim();
+                                    respText = respText.replace(/(More|Less)$/, '').trim();
+                                    break;
+                                }
+                            }
+
+                            // Fallback: find longest text div that's not the header
+                            if (!respText) {
+                                for (var m = 0; m < langDivs.length; m++) {
+                                    if (langDivs[m].tagName === 'DIV') {
+                                        var divTxt = langDivs[m].textContent.trim();
+                                        if (divTxt.includes('Response from the owner')) continue;
+                                        divTxt = divTxt.replace(/(More|Less)$/, '').trim();
+                                        if (divTxt.length > respText.length) {
+                                            respText = divTxt;
+                                        }
+                                    }
+                                }
+                            }
+
+                            if (respText) {
+                                ownerResponse = {text: respText, timestamp: respTimestamp};
+                            }
+                        }
+                    }
+
                    if (author && rating >= 1 && rating <= 5) {
                        results.push({
                            id: rid,
@@ -1823,6 +1891,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                            text: text,
                            rating: rating,
                            timestamp: timestamp,
+                            owner_response: ownerResponse,
                            source: 'dom'
                        });
                    }
@@ -2198,6 +2267,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                topics_inferred_count += 1
        log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})

+    # Include business info captured from Overview page
+    business_info = business_info_cache[0] or {}
+
    return {
        "reviews": review_list,  # Only unflushed reviews (flushed already sent to callback)
        "total": grand_total,
@@ -2209,10 +2281,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        "metrics_history": metrics_history,  # For crash detection
        "start_time": start_time,  # For crash report elapsed time
        "session_fingerprint": session_fingerprint,  # Browser fingerprint for bot detection analysis
+        "bot_detected": bot_detected if 'bot_detected' in dir() else False,  # True if sort button was hidden
+        "initial_sort_used": target_sort if 'target_sort' in dir() else SORT_NEWEST,  # Sort order used for first pass
        "multi_sort": {
            "enabled": should_multi_sort if 'should_multi_sort' in dir() else False,
            "completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST],
            "first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total
+        },
+        # Business info captured from Google Maps page
+        "business_info": {
+            "name": business_info.get("name"),
+            "category": business_info.get("category"),
+            "address": business_info.get("address"),
+            "rating": business_info.get("rating")
        }
    }

@@ -2220,7 +2301,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
 def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False,
                        log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
-                        browser_fingerprint: dict = None):
+                        browser_fingerprint: dict = None, initial_sort: str = None,
+                        sort_strategy: str = SORT_AUTO, max_reviews: int = None):
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.
@@ -2240,6 +2322,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            - timezone: string (e.g., "Europe/Madrid")
            - language: string (e.g., "en-US")
            - platform: string (e.g., "MacIntel")
+        initial_sort: Initial sort order to use ("newest", "lowest", "highest", "relevant")
+                     Used for retry with different sort strategy
+        sort_strategy: Sort strategy ("auto", "multi", "single", or specific sort)
+        max_reviews: Maximum reviews to collect (for testing). None (the default) or 0 = effectively unlimited (999999)

    Returns:
        Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -2329,13 +2415,15 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
        result = scrape_reviews(
            driver=driver,
            url=url,
-            max_reviews=999999,  # Effectively unlimited
+            max_reviews=max_reviews if max_reviews else 999999,  # Unlimited by default, or custom limit for testing
            timeout_no_new=15,
            flush_callback=internal_flush,
            flush_batch_size=100,  # Smaller batches for more frequent progress
            log_capture=log_capture,
            progress_callback=progress_callback,  # Pass through for real-time log updates
-            validation_only=validation_only  # Return early if just validating
+            validation_only=validation_only,  # Return early if just validating
+            sort_strategy=sort_strategy,  # Sort strategy (auto, multi, single)
+            initial_sort=initial_sort  # Initial sort order for retry with different sort
        )

        elapsed = time.time() - start_time
@@ -2350,7 +2438,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
            "error": None,
            "logs": result.get("logs", []),
            "review_topics": result.get("review_topics", []),  # Topic filters with mention counts
-            "session_fingerprint": result.get("session_fingerprint")  # Browser fingerprint for bot detection
+            "session_fingerprint": result.get("session_fingerprint"),  # Browser fingerprint for bot detection
+            # Tracking info for retry strategy
+            "bot_detected": result.get("bot_detected", False),  # True if sort button was hidden by Google
+            "initial_sort_used": result.get("initial_sort_used", "newest"),  # Sort order used
+            "multi_sort": result.get("multi_sort", {}),  # Multi-sort completion info
+            # Business info captured from Google Maps page
+            "business_info": result.get("business_info", {})
        }

        # Include validation_info if in validation_only mode