feat: Add multi-sort scraper v1.1.0 and improve v1.0.0 reliability

v1.0.0 improvements:
- Add captcha detection (reCAPTCHA, unusual traffic, challenges)
- Block fonts, analytics, maps tiles for faster scrolling
- Add 95% close-enough threshold to skip unnecessary retries
- Stop immediately if captcha detected instead of retrying

v1.1.0 new features:
- Multi-sort strategy to bypass ~1000 review limit
- Cycles through newest/lowest/highest/relevant sorts
- Auto mode: enables multi-sort when total > 1000
- Diminishing returns detection (stops if <5% new per pass)
- Configurable sort order and thresholds

Also adds test_scraper_v110.py CLI tool for testing multi-sort.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:47:30 +00:00
parent e2d7f6f118
commit fbd61ff7f7
3 changed files with 3120 additions and 5 deletions
--- a/tools/test_scraper_v110.py
+++ b/tools/test_scraper_v110.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""
+Quick CLI tool to test v1.1.0 scraper with multi-sort support.
+
+Usage:
+    # Basic test (auto mode - enables multi-sort if needed)
+    python tools/test_scraper_v110.py "ClickRent Gran Canaria"
+
+    # Force multi-sort through all sort orders
+    python tools/test_scraper_v110.py "White Hart Hotel Boston UK" --multi-sort
+
+    # Custom sort order
+    python tools/test_scraper_v110.py "Business" --multi-sort --sort-order "newest,lowest,highest"
+
+    # Single sort mode
+    python tools/test_scraper_v110.py "Business" --sort newest
+
+    # Set close-enough threshold
+    python tools/test_scraper_v110.py "Business" --close-enough 90
+"""
+
+import sys
+import os
+import argparse
+import time
+import json
+from datetime import datetime
+
+# Add project root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+def main():
+    """Run one scrape from the command line and print a summary.
+
+    Returns:
+        Exit code: 0 if the result carries no error; 1 on error or exception.
+    """
+    parser = argparse.ArgumentParser(
+        description='Test Google Reviews scraper v1.1.0 with multi-sort',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument('query', nargs='?', help='Business name to search')
+    parser.add_argument('--url', '-u', help='Direct Google Maps URL (overrides query)')
+    parser.add_argument('--max', '-m', type=int, default=2000, help='Max reviews to scrape (default: 2000)')
+    parser.add_argument('--timeout', '-t', type=int, default=15, help='Timeout for no new reviews (default: 15s)')
+    parser.add_argument('--headless', action='store_true', help='Run in headless mode')
+    parser.add_argument('--output', '-o', help='Output JSON file')
+    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose logging')
+
+    # Multi-sort options
+    parser.add_argument('--sort', choices=['auto', 'newest', 'lowest', 'highest', 'relevant', 'multi'],
+                       default='auto', help='Sort strategy (default: auto)')
+    parser.add_argument('--multi-sort', action='store_true', help='Force multi-sort mode')
+    parser.add_argument('--sort-order', help='Custom sort order, comma-separated (e.g., "newest,lowest,highest")')
+    parser.add_argument('--close-enough', type=float, default=95.0, help='Stop retrying at this %% (default: 95)')
+    args = parser.parse_args()
+
+    if not args.query and not args.url:
+        parser.error('Either query or --url is required')
+
+    # An explicit --url wins; otherwise build a Maps search URL from the query.
+    if args.url:
+        url = args.url
+    else:
+        from urllib.parse import quote
+        url = f"https://www.google.com/maps/search/?api=1&query={quote(args.query)}&hl=en"
+
+    # --multi-sort overrides whatever --sort says.
+    sort_strategy = 'multi' if args.multi_sort else args.sort
+    # Strip whitespace and drop empty entries so "newest, lowest," parses cleanly.
+    sort_order = [s.strip() for s in (args.sort_order or '').split(',') if s.strip()] or None
+
+    print(f"\n{'='*60}")
+    print(f"🔍 SCRAPER TEST v1.1.0 (Multi-Sort)")
+    print(f"{'='*60}")
+    print(f"URL: {url}")
+    print(f"Max reviews: {args.max}")
+    print(f"Sort strategy: {sort_strategy}")
+    if sort_order:
+        print(f"Sort order: {sort_order}")
+    print(f"Close enough: {args.close_enough}%")
+    print(f"Timeout: {args.timeout}s")
+    print(f"Headless: {args.headless}")
+    print(f"{'='*60}\n")
+
+    # Imported lazily so --help and argument errors work without the scraper stack.
+    from seleniumbase import Driver
+    from scrapers.google_reviews.v1_1_0 import scrape_reviews, LogCapture
+    log_capture = LogCapture()
+    reviews_collected = []  # reviews delivered through flush_callback
+
+    def progress_callback(current, total):
+        if args.verbose:
+            print(f"  Progress: {current}/{total or '?'}")
+
+    def flush_callback(reviews):
+        reviews_collected.extend(reviews)
+        print(f"  📥 Flushed {len(reviews)} reviews (total: {len(reviews_collected)})")
+
+    print("🚀 Starting browser...")
+    driver = Driver(uc=True, headless=args.headless)
+
+    try:
+        # Inside the try so a failure here still reaches driver.quit().
+        driver.set_window_size(1200, 900)
+        start_time = time.time()
+        result = scrape_reviews(
+            driver=driver,
+            url=url,
+            max_reviews=args.max,
+            timeout_no_new=args.timeout,
+            log_capture=log_capture,
+            flush_callback=flush_callback,
+            progress_callback=progress_callback,
+            flush_batch_size=100,
+            sort_strategy=sort_strategy,
+            sort_order=sort_order,
+            close_enough_pct=args.close_enough
+        )
+        elapsed = time.time() - start_time
+
+        # NOTE(review): assumes result['reviews'] holds only reviews that were
+        # not already flushed; otherwise this double-counts — confirm.
+        all_reviews = reviews_collected + result.get('reviews', [])
+
+        print(f"\n{'='*60}")
+        print(f"✅ SCRAPE COMPLETE")
+        print(f"{'='*60}")
+        print(f"Total reviews: {len(all_reviews)}")
+        print(f"Time: {elapsed:.1f}s")
+        if len(all_reviews) > 0 and elapsed > 0:
+            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")
+
+        multi_sort_info = result.get('multi_sort', {})
+        if multi_sort_info.get('enabled'):
+            print(f"\n🔄 Multi-Sort:")
+            print(f"   Sorts used: {multi_sort_info.get('completed_sorts', [])}")
+            print(f"   First pass: {multi_sort_info.get('first_pass_count', 0)} reviews")
+
+        if result.get('error'):
+            print(f"⚠️  Error: {result['error']}")
+
+        if all_reviews:
+            print(f"\n📝 Sample review:")
+            sample = all_reviews[0]
+            print(f"   Author: {sample.get('author', 'N/A')}")
+            # A missing or non-integer rating must not abort before the output is saved.
+            try:
+                stars = '⭐' * int(sample.get('rating') or 0)
+            except (TypeError, ValueError):
+                stars = ''
+            print(f"   Rating: {stars}")
+            text = sample.get('text')
+            if text:
+                snippet = text[:100] + '...' if len(text) > 100 else text
+                print(f"   Text: {snippet}")
+
+        if args.output:
+            output_data = {
+                'timestamp': datetime.now().isoformat(),
+                'url': url,
+                'query': args.query,
+                'total_reviews': len(all_reviews),
+                'elapsed_seconds': elapsed,
+                'speed': len(all_reviews)/elapsed if elapsed > 0 else 0,
+                'multi_sort': multi_sort_info,
+                'error': result.get('error'),
+                'reviews': all_reviews
+            }
+            with open(args.output, 'w', encoding='utf-8') as f:
+                json.dump(output_data, f, indent=2)
+            print(f"\n💾 Saved to: {args.output}")
+
+        print(f"{'='*60}\n")
+        return 0 if not result.get('error') else 1
+    except Exception as e:
+        print(f"\n❌ SCRAPE FAILED: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+    finally:
+        print("🛑 Closing browser...")
+        driver.quit()
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())  # main()'s return value becomes the exit status