#!/usr/bin/env python3
"""
Quick CLI tool to test v1.1.0 scraper with multi-sort support.

Usage:
    # Basic test (auto mode - enables multi-sort if needed)
    python tools/test_scraper_v110.py "ClickRent Gran Canaria"

    # Force multi-sort through all sort orders
    python tools/test_scraper_v110.py "White Hart Hotel Boston UK" --multi-sort

    # Custom sort order
    python tools/test_scraper_v110.py "Business" --multi-sort --sort-order "newest,lowest,highest"

    # Single sort mode
    python tools/test_scraper_v110.py "Business" --sort newest

    # Set close-enough threshold
    python tools/test_scraper_v110.py "Business" --close-enough 90
"""

import sys
import os
import argparse
import time
import json
from datetime import datetime

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def _build_parser():
    """Create the argument parser describing every CLI option of this tool."""
    parser = argparse.ArgumentParser(
        description='Test Google Reviews scraper v1.1.0 with multi-sort',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('query', nargs='?', help='Business name to search')
    parser.add_argument('--url', '-u', help='Direct Google Maps URL (overrides query)')
    parser.add_argument('--max', '-m', type=int, default=2000,
                        help='Max reviews to scrape (default: 2000)')
    parser.add_argument('--timeout', '-t', type=int, default=15,
                        help='Timeout for no new reviews (default: 15s)')
    parser.add_argument('--headless', action='store_true', help='Run in headless mode')
    parser.add_argument('--output', '-o', help='Output JSON file')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose logging')

    # Multi-sort options
    parser.add_argument('--sort',
                        choices=['auto', 'newest', 'lowest', 'highest', 'relevant', 'multi'],
                        default='auto', help='Sort strategy (default: auto)')
    parser.add_argument('--multi-sort', action='store_true', help='Force multi-sort mode')
    parser.add_argument('--sort-order',
                        help='Custom sort order, comma-separated (e.g., "newest,lowest,highest")')
    parser.add_argument('--close-enough', type=float, default=95.0,
                        help='Stop retrying at this %% (default: 95)')
    return parser


def _parse_sort_order(raw):
    """Turn a comma-separated ``--sort-order`` value into a list of names.

    Returns ``None`` when the option was not given (or is an empty string).
    Whitespace around each name is stripped and blank entries are dropped,
    so ``"newest, lowest,"`` yields ``['newest', 'lowest']``. The result may
    be an empty list when the value contained only separators/whitespace.
    """
    if not raw:
        return None
    return [name.strip() for name in raw.split(',') if name.strip()]


def _build_url(url, query):
    """Return ``url`` if given, otherwise a Google Maps search URL for ``query``."""
    if url:
        return url
    from urllib.parse import quote
    return f"https://www.google.com/maps/search/?api=1&query={quote(query)}&hl=en"


def _rating_stars(rating):
    """Render ``rating`` as a row of stars; unusable values give ``''``.

    The sample display is purely informational, so a missing, ``None`` or
    non-numeric rating must never raise and abort the run.
    """
    try:
        count = int(rating)
    except (TypeError, ValueError, OverflowError):
        return ''
    return '⭐' * max(count, 0)


def _print_banner(url, args, sort_strategy, sort_order):
    """Print the effective run configuration before the browser starts."""
    print(f"\n{'='*60}")
    print("🔍 SCRAPER TEST v1.1.0 (Multi-Sort)")
    print(f"{'='*60}")
    print(f"URL: {url}")
    print(f"Max reviews: {args.max}")
    print(f"Sort strategy: {sort_strategy}")
    if sort_order:
        print(f"Sort order: {sort_order}")
    print(f"Close enough: {args.close_enough}%")
    print(f"Timeout: {args.timeout}s")
    print(f"Headless: {args.headless}")
    print(f"{'='*60}\n")


def _print_summary(all_reviews, elapsed, result, multi_sort_info):
    """Print totals, multi-sort details, any reported error and one sample review."""
    print(f"\n{'='*60}")
    print("✅ SCRAPE COMPLETE")
    print(f"{'='*60}")
    print(f"Total reviews: {len(all_reviews)}")
    print(f"Time: {elapsed:.1f}s")
    if all_reviews and elapsed > 0:
        print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")

    # Multi-sort info
    if multi_sort_info.get('enabled'):
        print("\n🔄 Multi-Sort:")
        print(f" Sorts used: {multi_sort_info.get('completed_sorts', [])}")
        print(f" First pass: {multi_sort_info.get('first_pass_count', 0)} reviews")

    if result.get('error'):
        print(f"⚠️ Error: {result['error']}")

    # Show sample review
    if all_reviews:
        sample = all_reviews[0]
        print("\n📝 Sample review:")
        print(f" Author: {sample.get('author', 'N/A')}")
        print(f" Rating: {_rating_stars(sample.get('rating', 0))}")
        text = sample.get('text')
        if text:
            text = str(text)
            if len(text) > 100:
                text = text[:100] + '...'
            print(f" Text: {text}")


def _save_output(path, url, query, all_reviews, elapsed, result, multi_sort_info):
    """Write the run metadata and all collected reviews to ``path`` as JSON."""
    output_data = {
        'timestamp': datetime.now().isoformat(),
        'url': url,
        'query': query,
        'total_reviews': len(all_reviews),
        'elapsed_seconds': elapsed,
        'speed': len(all_reviews)/elapsed if elapsed > 0 else 0,
        'multi_sort': multi_sort_info,
        'error': result.get('error'),
        'reviews': all_reviews
    }
    # Explicit encoding so the file does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)
    print(f"\n💾 Saved to: {path}")


def main(argv=None):
    """Run one scrape from the command line and report the outcome.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` when the scrape finished and the result carries no ``error``;
        ``1`` when the result reports an error or an exception was raised
        while scraping.

    Raises:
        SystemExit: Raised by argparse (status 2) when neither a query nor
            ``--url`` is given, or when ``--sort-order`` contains no names.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if not args.query and not args.url:
        parser.error('Either query or --url is required')

    # Build URL
    url = _build_url(args.url, args.query)

    # Determine sort strategy
    sort_strategy = 'multi' if args.multi_sort else args.sort
    sort_order = _parse_sort_order(args.sort_order)
    if sort_order is not None and not sort_order:
        # e.g. --sort-order "," : reject instead of handing the scraper blanks.
        parser.error('--sort-order must contain at least one sort name')

    _print_banner(url, args, sort_strategy, sort_order)

    # Import v1.1.0 scraper (deferred so argument errors surface without
    # needing the browser stack to be importable)
    from seleniumbase import Driver
    from scrapers.google_reviews.v1_1_0 import scrape_reviews, LogCapture

    # Set up log capture
    log_capture = LogCapture()

    # Track reviews for real-time progress
    reviews_collected = []

    def progress_callback(current, total):
        if args.verbose:
            print(f" Progress: {current}/{total or '?'}")

    def flush_callback(reviews):
        reviews_collected.extend(reviews)
        print(f" 📥 Flushed {len(reviews)} reviews (total: {len(reviews_collected)})")

    # Set up driver
    print("🚀 Starting browser...")
    driver = Driver(uc=True, headless=args.headless)

    try:
        # Inside the try so a failure here still closes the browser.
        driver.set_window_size(1200, 900)

        start_time = time.time()
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=args.max,
            timeout_no_new=args.timeout,
            log_capture=log_capture,
            flush_callback=flush_callback,
            progress_callback=progress_callback,
            flush_batch_size=100,
            sort_strategy=sort_strategy,
            sort_order=sort_order,
            close_enough_pct=args.close_enough
        )
        elapsed = time.time() - start_time

        # Combine flushed + remaining reviews; `or` also covers a key that is
        # present but set to None.
        all_reviews = reviews_collected + (result.get('reviews') or [])
        multi_sort_info = result.get('multi_sort') or {}

        _print_summary(all_reviews, elapsed, result, multi_sort_info)

        # Save output if requested
        if args.output:
            _save_output(args.output, url, args.query, all_reviews,
                         elapsed, result, multi_sort_info)

        print(f"{'='*60}\n")

        return 1 if result.get('error') else 0

    except Exception as e:
        # Top-level boundary of a CLI tool: report, show the traceback, and
        # signal failure through the exit status.
        print(f"\n❌ SCRAPE FAILED: {e}")
        import traceback
        traceback.print_exc()
        return 1

    finally:
        print("🛑 Closing browser...")
        driver.quit()


if __name__ == '__main__':
    sys.exit(main())