feat: Add multi-sort scraper v1.1.0 and improve v1.0.0 reliability

v1.0.0 improvements:
- Add captcha detection (reCAPTCHA, unusual traffic, challenges)
- Block fonts, analytics, maps tiles for faster scrolling
- Add 95% close-enough threshold to skip unnecessary retries
- Stop immediately if captcha detected instead of retrying

v1.1.0 new features:
- Multi-sort strategy to bypass ~1000 review limit
- Cycles through newest/lowest/highest/relevant sorts
- Auto mode: enables multi-sort when total > 1000
- Diminishing returns detection (stops if <5% new per pass)
- Configurable sort order and thresholds

Also adds test_scraper_v110.py CLI tool for testing multi-sort.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:47:30 +00:00
parent e2d7f6f118
commit fbd61ff7f7
3 changed files with 3120 additions and 5 deletions
--- a/scrapers/google_reviews/v1_0_0.py
+++ b/scrapers/google_reviews/v1_0_0.py
@@ -801,6 +801,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                time.sleep(0.1)
            except:
                pass
+
            log.info('browser', f"Loading: {url[:80]}...")
        else:
            log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -1069,14 +1070,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        except:
            pass

-        # Block images to speed up scrolling (use CDP)
+        # Block heavy resources to speed up scrolling (use CDP)
        try:
            driver.execute_cdp_cmd('Network.setBlockedURLs', {
-                'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
+                'urls': [
+                    # Images
+                    '*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg',
+                    '*googleusercontent.com/*',
+                    # Fonts
+                    '*.woff', '*.woff2', '*.ttf', '*.otf',
+                    # Analytics/tracking
+                    '*google-analytics.com/*', '*googletagmanager.com/*',
+                    '*doubleclick.net/*', '*googlesyndication.com/*',
+                    # Maps tiles (not needed for reviews)
+                    '*khms*.google.com/*', '*maps.googleapis.com/maps/vt*'
+                ]
            })
            driver.execute_cdp_cmd('Network.enable', {})
            if not is_refresh:
-                log.info('browser', "Blocking images for faster scrolling")
+                log.info('browser', "Blocking heavy resources for faster scrolling")
        except:
            pass

@@ -1198,6 +1210,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
            pass
        return api_revs

+    # Captcha detection helper
+    def detect_captcha():
+        """Probe the page via JS for a blocking captcha; returns 'recaptcha', 'unusual_traffic', 'challenge', or None."""
+        try:
+            return driver.execute_script("""
+                // Check for reCAPTCHA iframe or checkbox
+                var recaptcha = document.querySelector('iframe[src*="recaptcha"], iframe[title*="reCAPTCHA"]');
+                if (recaptcha) return 'recaptcha';
+
+                // Check for "unusual traffic" message
+                var body = document.body ? document.body.innerText : '';
+                if (body.includes('unusual traffic') || body.includes('not a robot')) return 'unusual_traffic';
+
+                // Check for challenge frame
+                var challenge = document.querySelector('iframe[src*="challenge"]');
+                if (challenge) return 'challenge';
+
+                return null;
+            """)
+        except Exception:  # best-effort probe: a failed script call reads as "no captcha"; Ctrl+C/SystemExit still propagate
+            return None
+
    # Recovery function - use real mouse actions when stuck
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.keys import Keys
@@ -1557,6 +1591,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        if elapsed >= 3 and int(elapsed) % 3 == 0:
            # After 8+ failed recovery attempts, try hard refresh
            if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
+                # Check for captcha before hard refresh - no point refreshing if blocked
+                captcha_type = detect_captcha()
+                if captcha_type:
+                    log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
+                    stop_scrolling.set()
+                    return {
+                        "reviews": [],
+                        "total": current_count,
+                        "error": f"Captcha detected: {captcha_type}. Please solve manually and retry.",
+                        "captcha_detected": True
+                    }
+
                log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]})
                if do_hard_refresh():
                    last_new_time = time.time()  # Reset timer after refresh
@@ -1596,8 +1642,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        timeout_hit = elapsed >= timeout_no_new

        if truly_done or timeout_hit:
-            # Last chance: try hard refresh before giving up
-            if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
+            # Check if we're close enough to total (95%+ threshold)
+            # If we have 95%+ of reviews, don't waste time with hard refreshes
+            close_enough = False
+            if total_reviews[0] and current_count > 0:
+                pct_complete = (current_count / total_reviews[0]) * 100
+                close_enough = pct_complete >= 95
+                if close_enough:
+                    log.info('scraper', f"Close enough ({pct_complete:.1f}% complete), skipping further retries", metrics={'pct_complete': pct_complete})
+
+            # Last chance: try hard refresh before giving up (only if not close enough)
+            if not close_enough and hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
+                # Check for captcha first
+                captcha_type = detect_captcha()
+                if captcha_type:
+                    log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
+                    stop_scrolling.set()
+                    break
+
                log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed})
                if do_hard_refresh():
                    last_new_time = time.time()
--- a/scrapers/google_reviews/v1_1_0.py
+++ b/scrapers/google_reviews/v1_1_0.py
--- a/tools/test_scraper_v110.py
+++ b/tools/test_scraper_v110.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""
+Quick CLI tool to test v1.1.0 scraper with multi-sort support.
+
+Usage:
+    # Basic test (auto mode - enables multi-sort if needed)
+    python tools/test_scraper_v110.py "ClickRent Gran Canaria"
+
+    # Force multi-sort through all sort orders
+    python tools/test_scraper_v110.py "White Hart Hotel Boston UK" --multi-sort
+
+    # Custom sort order
+    python tools/test_scraper_v110.py "Business" --multi-sort --sort-order "newest,lowest,highest"
+
+    # Single sort mode
+    python tools/test_scraper_v110.py "Business" --sort newest
+
+    # Set close-enough threshold
+    python tools/test_scraper_v110.py "Business" --close-enough 90
+"""
+
+import sys
+import os
+import argparse
+import time
+import json
+from datetime import datetime
+
+# Add project root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+def main():
+    """Run the v1.1.0 scraper once from CLI args and print a summary; returns 0 on success, 1 on any error."""
+    parser = argparse.ArgumentParser(
+        description='Test Google Reviews scraper v1.1.0 with multi-sort',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument('query', nargs='?', help='Business name to search')
+    parser.add_argument('--url', '-u', help='Direct Google Maps URL (overrides query)')
+    parser.add_argument('--max', '-m', type=int, default=2000, help='Max reviews to scrape (default: 2000)')
+    parser.add_argument('--timeout', '-t', type=int, default=15, help='Timeout for no new reviews (default: 15s)')
+    parser.add_argument('--headless', action='store_true', help='Run in headless mode')
+    parser.add_argument('--output', '-o', help='Output JSON file')
+    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose logging')
+
+    # Multi-sort options
+    parser.add_argument('--sort', choices=['auto', 'newest', 'lowest', 'highest', 'relevant', 'multi'],
+                       default='auto', help='Sort strategy (default: auto)')
+    parser.add_argument('--multi-sort', action='store_true', help='Force multi-sort mode')
+    parser.add_argument('--sort-order', help='Custom sort order, comma-separated (e.g., "newest,lowest,highest")')
+    parser.add_argument('--close-enough', type=float, default=95.0, help='Stop retrying at this %% (default: 95)')
+
+    args = parser.parse_args()
+
+    if not args.query and not args.url:
+        parser.error('Either query or --url is required')
+
+    # Build URL (an explicit --url wins over the search query)
+    if args.url:
+        url = args.url
+    else:
+        from urllib.parse import quote
+        url = f"https://www.google.com/maps/search/?api=1&query={quote(args.query)}&hl=en"
+
+    # Determine sort strategy (--multi-sort overrides --sort); tolerate spaces and empty items in --sort-order
+    sort_strategy = 'multi' if args.multi_sort else args.sort
+    sort_order = [s.strip() for s in (args.sort_order or '').split(',') if s.strip()] or None
+
+    print(f"\n{'='*60}")
+    print("🔍 SCRAPER TEST v1.1.0 (Multi-Sort)")
+    print(f"{'='*60}")
+    print(f"URL: {url}")
+    print(f"Max reviews: {args.max}")
+    print(f"Sort strategy: {sort_strategy}")
+    if sort_order:
+        print(f"Sort order: {sort_order}")
+    print(f"Close enough: {args.close_enough}%")
+    print(f"Timeout: {args.timeout}s")
+    print(f"Headless: {args.headless}")
+    print(f"{'='*60}\n")
+
+    # Import v1.1.0 scraper here, after argument parsing, so --help and usage errors do not need seleniumbase
+    from seleniumbase import Driver
+    from scrapers.google_reviews.v1_1_0 import scrape_reviews, LogCapture
+
+    # Set up log capture
+    log_capture = LogCapture()
+
+    # Reviews handed over through flush_callback while the scrape is still running
+    reviews_collected = []
+
+    def progress_callback(current, total):
+        if args.verbose:
+            print(f"  Progress: {current}/{total or '?'}")
+
+    def flush_callback(reviews):
+        reviews_collected.extend(reviews)
+        print(f"  📥 Flushed {len(reviews)} reviews (total: {len(reviews_collected)})")
+
+    # Set up driver
+    print("🚀 Starting browser...")
+    driver = Driver(uc=True, headless=args.headless)
+
+    try:
+        driver.set_window_size(1200, 900)  # inside the try so a failure here still reaches driver.quit()
+        start_time = time.time()
+        result = scrape_reviews(
+            driver=driver,
+            url=url,
+            max_reviews=args.max,
+            timeout_no_new=args.timeout,
+            log_capture=log_capture,
+            flush_callback=flush_callback,
+            progress_callback=progress_callback,
+            flush_batch_size=100,
+            sort_strategy=sort_strategy,
+            sort_order=sort_order,
+            close_enough_pct=args.close_enough
+        )
+
+        elapsed = time.time() - start_time
+
+        # Combine flushed + remaining reviews (assumes result['reviews'] holds only the unflushed tail - TODO confirm)
+        all_reviews = reviews_collected + (result.get('reviews') or [])
+
+        print(f"\n{'='*60}")
+        print("✅ SCRAPE COMPLETE")
+        print(f"{'='*60}")
+        print(f"Total reviews: {len(all_reviews)}")
+        print(f"Time: {elapsed:.1f}s")
+        if len(all_reviews) > 0 and elapsed > 0:
+            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
+
+        # Multi-sort info (a missing or None entry is treated as "not enabled")
+        multi_sort_info = result.get('multi_sort') or {}
+        if multi_sort_info.get('enabled'):
+            print("\n🔄 Multi-Sort:")
+            print(f"   Sorts used: {multi_sort_info.get('completed_sorts', [])}")
+            print(f"   First pass: {multi_sort_info.get('first_pass_count', 0)} reviews")
+
+        if result.get('error'):
+            print(f"⚠️  Error: {result['error']}")
+
+        # Show sample review
+        if all_reviews:
+            print("\n📝 Sample review:")
+            sample = all_reviews[0]
+            print(f"   Author: {sample.get('author', 'N/A')}")
+            print(f"   Rating: {'⭐' * int(sample.get('rating') or 0)}")  # None/float rating must not abort before the save below
+            if sample.get('text'):
+                text = sample['text'] if len(sample['text']) <= 100 else sample['text'][:100] + '...'
+                print(f"   Text: {text}")
+
+        # Save output if requested
+        if args.output:
+            output_data = {
+                'timestamp': datetime.now().isoformat(),
+                'url': url,
+                'query': args.query,
+                'total_reviews': len(all_reviews),
+                'elapsed_seconds': elapsed,
+                'speed': len(all_reviews)/elapsed if elapsed > 0 else 0,
+                'multi_sort': multi_sort_info,
+                'error': result.get('error'),
+                'reviews': all_reviews
+            }
+            with open(args.output, 'w', encoding='utf-8') as f:
+                json.dump(output_data, f, indent=2)
+            print(f"\n💾 Saved to: {args.output}")
+
+        print(f"{'='*60}\n")
+
+        return 1 if result.get('error') else 0
+
+    except Exception as e:
+        print(f"\n❌ SCRAPE FAILED: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    finally:
+        print("🛑 Closing browser...")
+        driver.quit()
+
+
+if __name__ == '__main__':
+    sys.exit(main())