feat: Add multi-sort scraper v1.1.0 and improve v1.0.0 reliability
v1.0.0 improvements:
- Add captcha detection (reCAPTCHA, unusual traffic, challenges)
- Block fonts, analytics, and maps tiles for faster scrolling
- Add a 95% close-enough threshold to skip unnecessary retries
- Stop immediately if a captcha is detected instead of retrying

v1.1.0 new features:
- Multi-sort strategy to bypass the ~1000-review limit
- Cycles through newest/lowest/highest/relevant sorts
- Auto mode: enables multi-sort when total > 1000
- Diminishing-returns detection (stops if <5% new per pass)
- Configurable sort order and thresholds

Also adds the test_scraper_v110.py CLI tool for testing multi-sort.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -801,6 +801,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
time.sleep(0.1)
|
||||
except:
|
||||
pass
|
||||
|
||||
log.info('browser', f"Loading: {url[:80]}...")
|
||||
else:
|
||||
log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||
@@ -1069,14 +1070,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
except:
|
||||
pass
|
||||
|
||||
# Block images to speed up scrolling (use CDP)
|
||||
# Block heavy resources to speed up scrolling (use CDP)
|
||||
try:
|
||||
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||
'urls': ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg', '*googleusercontent.com/*']
|
||||
'urls': [
|
||||
# Images
|
||||
'*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.svg',
|
||||
'*googleusercontent.com/*',
|
||||
# Fonts
|
||||
'*.woff', '*.woff2', '*.ttf', '*.otf',
|
||||
# Analytics/tracking
|
||||
'*google-analytics.com/*', '*googletagmanager.com/*',
|
||||
'*doubleclick.net/*', '*googlesyndication.com/*',
|
||||
# Maps tiles (not needed for reviews)
|
||||
'*khms*.google.com/*', '*maps.googleapis.com/maps/vt*'
|
||||
]
|
||||
})
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
if not is_refresh:
|
||||
log.info('browser', "Blocking images for faster scrolling")
|
||||
log.info('browser', "Blocking heavy resources for faster scrolling")
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -1198,6 +1210,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
pass
|
||||
return api_revs
|
||||
|
||||
# Captcha detection helper
|
||||
def detect_captcha():
    """Check if a captcha or challenge is blocking the page.

    Runs a small script in the current page through
    ``driver.execute_script`` (``driver`` comes from the enclosing scope)
    and inspects the DOM for three signals, in this order:

    * a reCAPTCHA iframe (matched by ``src`` or ``title``),
    * the phrases "unusual traffic" or "not a robot" in the body text,
    * an iframe whose ``src`` contains "challenge".

    Returns:
        ``'recaptcha'``, ``'unusual_traffic'`` or ``'challenge'`` for the
        first signal that matches; ``None`` when nothing matches or when
        the script could not be executed.
    """
    try:
        return driver.execute_script("""
            // Check for reCAPTCHA iframe or checkbox
            var recaptcha = document.querySelector('iframe[src*="recaptcha"], iframe[title*="reCAPTCHA"]');
            if (recaptcha) return 'recaptcha';

            // Check for "unusual traffic" message
            var body = document.body ? document.body.innerText : '';
            if (body.includes('unusual traffic') || body.includes('not a robot')) return 'unusual_traffic';

            // Check for challenge frame
            var challenge = document.querySelector('iframe[src*="challenge"]');
            if (challenge) return 'challenge';

            return null;
        """)
    except Exception:
        # Best-effort probe: a failed script call is reported as "no captcha"
        # so the caller's retry logic keeps running. Only Exception is caught
        # so that KeyboardInterrupt / SystemExit still propagate.
        return None
|
||||
|
||||
# Recovery function - use real mouse actions when stuck
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
@@ -1557,6 +1591,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||
# After 8+ failed recovery attempts, try hard refresh
|
||||
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
|
||||
# Check for captcha before hard refresh - no point refreshing if blocked
|
||||
captcha_type = detect_captcha()
|
||||
if captcha_type:
|
||||
log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
|
||||
stop_scrolling.set()
|
||||
return {
|
||||
"reviews": [],
|
||||
"total": current_count,
|
||||
"error": f"Captcha detected: {captcha_type}. Please solve manually and retry.",
|
||||
"captcha_detected": True
|
||||
}
|
||||
|
||||
log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]})
|
||||
if do_hard_refresh():
|
||||
last_new_time = time.time() # Reset timer after refresh
|
||||
@@ -1596,8 +1642,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
timeout_hit = elapsed >= timeout_no_new
|
||||
|
||||
if truly_done or timeout_hit:
|
||||
# Last chance: try hard refresh before giving up
|
||||
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
||||
# Check if we're close enough to total (95%+ threshold)
|
||||
# If we have 95%+ of reviews, don't waste time with hard refreshes
|
||||
close_enough = False
|
||||
if total_reviews[0] and current_count > 0:
|
||||
pct_complete = (current_count / total_reviews[0]) * 100
|
||||
close_enough = pct_complete >= 95
|
||||
if close_enough:
|
||||
log.info('scraper', f"Close enough ({pct_complete:.1f}% complete), skipping further retries", metrics={'pct_complete': pct_complete})
|
||||
|
||||
# Last chance: try hard refresh before giving up (only if not close enough)
|
||||
if not close_enough and hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
||||
# Check for captcha first
|
||||
captcha_type = detect_captcha()
|
||||
if captcha_type:
|
||||
log.warn('browser', f"Captcha detected ({captcha_type}), stopping - human intervention needed", metrics={'captcha_type': captcha_type})
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed})
|
||||
if do_hard_refresh():
|
||||
last_new_time = time.time()
|
||||
|
||||
2865
scrapers/google_reviews/v1_1_0.py
Normal file
2865
scrapers/google_reviews/v1_1_0.py
Normal file
File diff suppressed because it is too large
Load Diff
188
tools/test_scraper_v110.py
Normal file
188
tools/test_scraper_v110.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick CLI tool to test v1.1.0 scraper with multi-sort support.
|
||||
|
||||
Usage:
|
||||
# Basic test (auto mode - enables multi-sort if needed)
|
||||
python tools/test_scraper_v110.py "ClickRent Gran Canaria"
|
||||
|
||||
# Force multi-sort through all sort orders
|
||||
python tools/test_scraper_v110.py "White Hart Hotel Boston UK" --multi-sort
|
||||
|
||||
# Custom sort order
|
||||
python tools/test_scraper_v110.py "Business" --multi-sort --sort-order "newest,lowest,highest"
|
||||
|
||||
# Single sort mode
|
||||
python tools/test_scraper_v110.py "Business" --sort newest
|
||||
|
||||
# Set close-enough threshold
|
||||
python tools/test_scraper_v110.py "Business" --close-enough 90
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import time
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
def main():
    """Run one scrape with the v1.1.0 scraper and print a summary.

    Parses the command line, builds the Google Maps URL (``--url`` wins over
    the positional query), starts a SeleniumBase driver, calls
    ``scrape_reviews`` and prints the outcome. When ``--output`` is given,
    the collected reviews and run metadata are also written there as JSON.

    Returns:
        int: 0 when the scrape result carries no ``error`` entry; 1 when it
        does, or when an exception is raised while scraping or reporting.
    """
    parser = argparse.ArgumentParser(
        description='Test Google Reviews scraper v1.1.0 with multi-sort',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument('query', nargs='?', help='Business name to search')
    parser.add_argument('--url', '-u', help='Direct Google Maps URL (overrides query)')
    parser.add_argument('--max', '-m', type=int, default=2000, help='Max reviews to scrape (default: 2000)')
    parser.add_argument('--timeout', '-t', type=int, default=15, help='Timeout for no new reviews (default: 15s)')
    parser.add_argument('--headless', action='store_true', help='Run in headless mode')
    parser.add_argument('--output', '-o', help='Output JSON file')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose logging')

    # Multi-sort options
    parser.add_argument('--sort', choices=['auto', 'newest', 'lowest', 'highest', 'relevant', 'multi'],
                        default='auto', help='Sort strategy (default: auto)')
    parser.add_argument('--multi-sort', action='store_true', help='Force multi-sort mode')
    parser.add_argument('--sort-order', help='Custom sort order, comma-separated (e.g., "newest,lowest,highest")')
    parser.add_argument('--close-enough', type=float, default=95.0, help='Stop retrying at this %% (default: 95)')

    args = parser.parse_args()

    if not args.query and not args.url:
        parser.error('Either query or --url is required')

    # Build URL
    if args.url:
        url = args.url
    else:
        from urllib.parse import quote
        url = f"https://www.google.com/maps/search/?api=1&query={quote(args.query)}&hl=en"

    # Determine sort strategy; --multi-sort takes precedence over --sort.
    sort_strategy = 'multi' if args.multi_sort else args.sort

    # Strip whitespace around each entry so "newest, lowest" is accepted,
    # and drop empty entries left by stray commas.
    sort_order = None
    if args.sort_order:
        sort_order = [name.strip() for name in args.sort_order.split(',') if name.strip()] or None

    print(f"\n{'='*60}")
    print("🔍 SCRAPER TEST v1.1.0 (Multi-Sort)")
    print(f"{'='*60}")
    print(f"URL: {url}")
    print(f"Max reviews: {args.max}")
    print(f"Sort strategy: {sort_strategy}")
    if sort_order:
        print(f"Sort order: {sort_order}")
    print(f"Close enough: {args.close_enough}%")
    print(f"Timeout: {args.timeout}s")
    print(f"Headless: {args.headless}")
    print(f"{'='*60}\n")

    # Import v1.1.0 scraper lazily so argument errors and --help do not
    # require the browser stack to be importable.
    from seleniumbase import Driver
    from scrapers.google_reviews.v1_1_0 import scrape_reviews, LogCapture

    # Set up log capture
    log_capture = LogCapture()

    # Reviews handed over through flush_callback while the scrape is running
    reviews_collected = []

    def progress_callback(current, total):
        if args.verbose:
            print(f" Progress: {current}/{total or '?'}")

    def flush_callback(reviews):
        reviews_collected.extend(reviews)
        print(f" 📥 Flushed {len(reviews)} reviews (total: {len(reviews_collected)})")

    # Set up driver
    print("🚀 Starting browser...")
    driver = Driver(uc=True, headless=args.headless)

    try:
        # Inside the try block so the browser is closed even if sizing fails.
        driver.set_window_size(1200, 900)

        start_time = time.time()

        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=args.max,
            timeout_no_new=args.timeout,
            log_capture=log_capture,
            flush_callback=flush_callback,
            progress_callback=progress_callback,
            flush_batch_size=100,
            sort_strategy=sort_strategy,
            sort_order=sort_order,
            close_enough_pct=args.close_enough,
        )

        elapsed = time.time() - start_time

        # Combine flushed + remaining reviews ("or []" tolerates a None value)
        all_reviews = reviews_collected + (result.get('reviews') or [])

        print(f"\n{'='*60}")
        print("✅ SCRAPE COMPLETE")
        print(f"{'='*60}")
        print(f"Total reviews: {len(all_reviews)}")
        print(f"Time: {elapsed:.1f}s")
        if len(all_reviews) > 0 and elapsed > 0:
            print(f"Speed: {len(all_reviews)/elapsed:.1f} reviews/sec")

        # Multi-sort info
        multi_sort_info = result.get('multi_sort') or {}
        if multi_sort_info.get('enabled'):
            print("\n🔄 Multi-Sort:")
            print(f" Sorts used: {multi_sort_info.get('completed_sorts', [])}")
            print(f" First pass: {multi_sort_info.get('first_pass_count', 0)} reviews")

        if result.get('error'):
            print(f"⚠️ Error: {result['error']}")

        # Show sample review
        if all_reviews:
            print("\n📝 Sample review:")
            sample = all_reviews[0]
            print(f" Author: {sample.get('author', 'N/A')}")

            # A missing, None or non-integer rating must not abort the report
            # (and the --output save below) after a successful scrape.
            rating = sample.get('rating') or 0
            try:
                stars = '⭐' * int(rating)
            except (TypeError, ValueError):
                stars = str(rating)
            print(f" Rating: {stars}")

            text = sample.get('text')
            if text:
                if len(text) > 100:
                    text = text[:100] + '...'
                print(f" Text: {text}")

        # Save output if requested
        if args.output:
            output_data = {
                'timestamp': datetime.now().isoformat(),
                'url': url,
                'query': args.query,
                'total_reviews': len(all_reviews),
                'elapsed_seconds': elapsed,
                'speed': len(all_reviews)/elapsed if elapsed > 0 else 0,
                'multi_sort': multi_sort_info,
                'error': result.get('error'),
                'reviews': all_reviews,
            }
            with open(args.output, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, indent=2)
            print(f"\n💾 Saved to: {args.output}")

        print(f"{'='*60}\n")

        return 0 if not result.get('error') else 1

    except Exception as e:
        # Top-level boundary of the CLI: report the failure and exit non-zero.
        print(f"\n❌ SCRAPE FAILED: {e}")
        import traceback
        traceback.print_exc()
        return 1

    finally:
        print("🛑 Closing browser...")
        driver.quit()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user