feat: Add multi-sort scraper v1.1.0 and improve v1.0.0 reliability

v1.0.0 improvements:
- Add captcha detection (reCAPTCHA, unusual traffic, challenges)
- Block fonts, analytics, maps tiles for faster scrolling
- Add 95% close-enough threshold to skip unnecessary retries
- Stop immediately if captcha detected instead of retrying

v1.1.0 new features:
- Multi-sort strategy to bypass ~1000 review limit
- Cycles through newest/lowest/highest/relevant sorts
- Auto mode: enables multi-sort when total > 1000
- Diminishing returns detection (stops if <5% new per pass)
- Configurable sort order and thresholds

Also adds test_scraper_v110.py CLI tool for testing multi-sort.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 18:47:30 +00:00
parent e2d7f6f118
commit fbd61ff7f7
3 changed files with 3120 additions and 5 deletions

188
tools/test_scraper_v110.py Normal file
View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Quick CLI tool to test v1.1.0 scraper with multi-sort support.
Usage:
# Basic test (auto mode - enables multi-sort if needed)
python tools/test_scraper_v110.py "ClickRent Gran Canaria"
# Force multi-sort through all sort orders
python tools/test_scraper_v110.py "White Hart Hotel Boston UK" --multi-sort
# Custom sort order
python tools/test_scraper_v110.py "Business" --multi-sort --sort-order "newest,lowest,highest"
# Single sort mode
python tools/test_scraper_v110.py "Business" --sort newest
# Set close-enough threshold
python tools/test_scraper_v110.py "Business" --close-enough 90
"""
import sys
import os
import argparse
import time
import json
from datetime import datetime
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def _build_parser():
    """Create the argument parser for the v1.1.0 scraper test tool."""
    parser = argparse.ArgumentParser(
        description='Test Google Reviews scraper v1.1.0 with multi-sort',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('query', nargs='?', help='Business name to search')
    parser.add_argument('--url', '-u', help='Direct Google Maps URL (overrides query)')
    parser.add_argument('--max', '-m', type=int, default=2000, help='Max reviews to scrape (default: 2000)')
    parser.add_argument('--timeout', '-t', type=int, default=15, help='Timeout for no new reviews (default: 15s)')
    parser.add_argument('--headless', action='store_true', help='Run in headless mode')
    parser.add_argument('--output', '-o', help='Output JSON file')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose logging')
    # Multi-sort options
    parser.add_argument('--sort', choices=['auto', 'newest', 'lowest', 'highest', 'relevant', 'multi'],
                        default='auto', help='Sort strategy (default: auto)')
    parser.add_argument('--multi-sort', action='store_true', help='Force multi-sort mode')
    parser.add_argument('--sort-order', help='Custom sort order, comma-separated (e.g., "newest,lowest,highest")')
    parser.add_argument('--close-enough', type=float, default=95.0, help='Stop retrying at this %% (default: 95)')
    return parser


def _parse_sort_order(raw):
    """Split a comma-separated ``--sort-order`` value into a list of names.

    Whitespace around each name is stripped and empty entries are dropped, so
    ``"newest, lowest,"`` yields ``['newest', 'lowest']``. Returns ``None``
    when *raw* is missing or contains no names.
    """
    if not raw:
        return None
    names = [part.strip() for part in raw.split(',')]
    names = [name for name in names if name]
    return names or None


def _format_rating(rating):
    """Render a review rating as a row of star characters.

    The rating is coerced to ``int`` so float or missing values do not raise;
    a value that cannot be coerced is shown as its plain string form.
    """
    try:
        stars = int(rating or 0)
    except (TypeError, ValueError):
        return str(rating)
    return '⭐' * stars


def main():
    """Run the v1.1.0 scraper once from the command line and print a summary.

    Parses the CLI arguments, builds the target URL (from ``--url`` or from the
    search query), starts a SeleniumBase browser, calls ``scrape_reviews`` and
    prints totals, multi-sort information and a sample review. When
    ``--output`` is given the collected data is also written as JSON.

    Returns:
        int: 0 when the scrape result carries no ``error`` entry; 1 when it
        does or when an exception was raised while scraping.

    Raises:
        SystemExit: via ``parser.error`` when neither a query nor ``--url``
            is supplied (argparse exits with status 2).
    """
    parser = _build_parser()
    args = parser.parse_args()
    if not args.query and not args.url:
        parser.error('Either query or --url is required')

    # Build URL
    if args.url:
        url = args.url
    else:
        from urllib.parse import quote
        url = f"https://www.google.com/maps/search/?api=1&query={quote(args.query)}&hl=en"

    # Determine sort strategy: --multi-sort takes precedence over --sort.
    sort_strategy = 'multi' if args.multi_sort else args.sort
    sort_order = _parse_sort_order(args.sort_order)

    rule = '=' * 60
    print(f"\n{rule}")
    print("🔍 SCRAPER TEST v1.1.0 (Multi-Sort)")
    print(rule)
    print(f"URL: {url}")
    print(f"Max reviews: {args.max}")
    print(f"Sort strategy: {sort_strategy}")
    if sort_order:
        print(f"Sort order: {sort_order}")
    print(f"Close enough: {args.close_enough}%")
    print(f"Timeout: {args.timeout}s")
    print(f"Headless: {args.headless}")
    print(f"{rule}\n")

    # Import the scraper lazily so argument errors and --help work without
    # the browser stack being importable.
    from seleniumbase import Driver
    from scrapers.google_reviews.v1_1_0 import scrape_reviews, LogCapture

    # Set up log capture
    log_capture = LogCapture()

    # Reviews handed over through flush_callback while the scrape is running.
    reviews_collected = []

    def progress_callback(current, total):
        if args.verbose:
            print(f" Progress: {current}/{total or '?'}")

    def flush_callback(reviews):
        reviews_collected.extend(reviews)
        print(f" 📥 Flushed {len(reviews)} reviews (total: {len(reviews_collected)})")

    # Set up driver
    print("🚀 Starting browser...")
    driver = Driver(uc=True, headless=args.headless)
    try:
        # Inside the try so a failure here still reaches driver.quit().
        driver.set_window_size(1200, 900)
        start_time = time.time()
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=args.max,
            timeout_no_new=args.timeout,
            log_capture=log_capture,
            flush_callback=flush_callback,
            progress_callback=progress_callback,
            flush_batch_size=100,
            sort_strategy=sort_strategy,
            sort_order=sort_order,
            close_enough_pct=args.close_enough
        )
        elapsed = time.time() - start_time

        # Combine flushed + remaining reviews
        all_reviews = reviews_collected + result.get('reviews', [])

        print(f"\n{rule}")
        print("✅ SCRAPE COMPLETE")
        print(rule)
        print(f"Total reviews: {len(all_reviews)}")
        print(f"Time: {elapsed:.1f}s")
        if all_reviews and elapsed > 0:
            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")

        # Multi-sort info
        multi_sort_info = result.get('multi_sort', {})
        if multi_sort_info.get('enabled'):
            print("\n🔄 Multi-Sort:")
            print(f" Sorts used: {multi_sort_info.get('completed_sorts', [])}")
            print(f" First pass: {multi_sort_info.get('first_pass_count', 0)} reviews")

        if result.get('error'):
            print(f"⚠️ Error: {result['error']}")

        # Show sample review
        if all_reviews:
            print("\n📝 Sample review:")
            sample = all_reviews[0]
            print(f" Author: {sample.get('author', 'N/A')}")
            print(f" Rating: {_format_rating(sample.get('rating', 0))}")
            text = sample.get('text')
            if text:
                if len(text) > 100:
                    text = text[:100] + '...'
                print(f" Text: {text}")

        # Save output if requested
        if args.output:
            output_data = {
                'timestamp': datetime.now().isoformat(),
                'url': url,
                'query': args.query,
                'total_reviews': len(all_reviews),
                'elapsed_seconds': elapsed,
                'speed': len(all_reviews)/elapsed if elapsed > 0 else 0,
                'multi_sort': multi_sort_info,
                'error': result.get('error'),
                'reviews': all_reviews
            }
            with open(args.output, 'w') as f:
                json.dump(output_data, f, indent=2)
            print(f"\n💾 Saved to: {args.output}")

        print(f"{rule}\n")
        return 0 if not result.get('error') else 1
    except Exception as e:
        # Top-level boundary of a CLI tool: report, show the traceback, and
        # signal failure through the exit status.
        print(f"\n❌ SCRAPE FAILED: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        print("🛑 Closing browser...")
        driver.quit()
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())