feat: Add multi-sort scraper v1.1.0 and improve v1.0.0 reliability

v1.0.0 improvements:
- Add captcha detection (reCAPTCHA, unusual traffic, challenges)
- Block fonts, analytics, maps tiles for faster scrolling
- Add 95% close-enough threshold to skip unnecessary retries
- Stop immediately if captcha detected instead of retrying

v1.1.0 new features:
- Multi-sort strategy to bypass ~1000 review limit
- Cycles through newest/lowest/highest/relevant sorts
- Auto mode: enables multi-sort when total > 1000
- Diminishing returns detection (stops if <5% new per pass)
- Configurable sort order and thresholds

Also adds test_scraper_v110.py CLI tool for testing multi-sort.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:47:30 +00:00
parent e2d7f6f118
commit fbd61ff7f7
3 changed files with 3120 additions and 5 deletions
--- a/scrapers/google_reviews/v1_0_0.py
+++ b/scrapers/google_reviews/v1_0_0.py
@@ -801,6 +801,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                time.sleep(0.1)
            except:
                pass
+
            log.info('browser', f"Loading: {url[:80]}...")
        else:
            log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -1069,14 +1070,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        except:
            pass

-        # Block images to speed up scrolling (use CDP)
+        # Block heavy resources to speed up scrolling (use CDP)
        try:
            driver.execute_cdp_cmd('Network.setBlockedURLs', {
-                'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
+                'urls': [
+                    # Images
+                    '*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg',
+                    '*googleusercontent.com/*',
+                    # Fonts
+                    '*.woff', '*.woff2', '*.ttf', '*.otf',
+                    # Analytics/tracking
+                    '*google-analytics.com/*', '*googletagmanager.com/*',
+                    '*doubleclick.net/*', '*googlesyndication.com/*',
+                    # Maps tiles (not needed for reviews)
+                    '*khms*.google.com/*', '*maps.googleapis.com/maps/vt*'
+                ]
            })
            driver.execute_cdp_cmd('Network.enable', {})
            if not is_refresh:
-                log.info('browser', "Blocking images for faster scrolling")
+                log.info('browser', "Blocking heavy resources for faster scrolling")
        except:
            pass

@@ -1198,6 +1210,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            pass
        return api_revs

+    # Captcha detection helper
+    def detect_captcha():
+        """Check if a captcha or challenge is blocking the page. Returns captcha type or None."""
+        try:
+            return driver.execute_script("""
+                // Check for reCAPTCHA iframe or checkbox
+                var recaptcha = document.querySelector('iframe[src*="recaptcha"], iframe[title*="reCAPTCHA"]');
+                if (recaptcha) return 'recaptcha';
+
+                // Check for "unusual traffic" message
+                var body = document.body ? document.body.innerText : '';
+                if (body.includes('unusual traffic') || body.includes('not a robot')) return 'unusual_traffic';
+
+                // Check for challenge frame
+                var challenge = document.querySelector('iframe[src*="challenge"]');
+                if (challenge) return 'challenge';
+
+                return null;
+            """)
+        except:
+            return None
+
    # Recovery function - use real mouse actions when stuck
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.keys import Keys
@@ -1557,6 +1591,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        if elapsed >= 3 and int(elapsed) % 3 == 0:
            # After 8+ failed recovery attempts, try hard refresh
            if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
+                # Check for captcha before hard refresh - no point refreshing if blocked
+                captcha_type = detect_captcha()
+                if captcha_type:
+                    log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
+                    stop_scrolling.set()
+                    return {
+                        "reviews": [],
+                        "total": current_count,
+                        "error": f"Captcha detected: {captcha_type}. Please solve manually and retry.",
+                        "captcha_detected": True
+                    }
+
                log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]})
                if do_hard_refresh():
                    last_new_time = time.time()  # Reset timer after refresh
@@ -1596,8 +1642,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        timeout_hit = elapsed >= timeout_no_new

        if truly_done or timeout_hit:
-            # Last chance: try hard refresh before giving up
-            if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
+            # Check if we're close enough to total (95%+ threshold)
+            # If we have 95%+ of reviews, don't waste time with hard refreshes
+            close_enough = False
+            if total_reviews[0] and current_count > 0:
+                pct_complete = (current_count / total_reviews[0]) * 100
+                close_enough = pct_complete >= 95
+                if close_enough:
+                    log.info('scraper', f"Close enough ({pct_complete:.1f}% complete), skipping further retries", metrics={'pct_complete': pct_complete})
+
+            # Last chance: try hard refresh before giving up (only if not close enough)
+            if not close_enough and hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
+                # Check for captcha first
+                captcha_type = detect_captcha()
+                if captcha_type:
+                    log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
+                    stop_scrolling.set()
+                    break
+
                log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed})
                if do_hard_refresh():
                    last_new_time = time.time()
--- a/scrapers/google_reviews/v1_1_0.py
+++ b/scrapers/google_reviews/v1_1_0.py