Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
162
test_fast_api.py
Normal file
162
test_fast_api.py
Normal file
@@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the Fast API server.
|
||||
Demonstrates how to use the updated API with the fast scraper (18.9s).
|
||||
"""
|
||||
import requests
|
||||
import time
|
||||
import json
|
||||
|
||||
# API base URL
|
||||
BASE_URL = "http://localhost:8000"
|
||||
|
||||
def test_api(request_timeout=30, poll_interval=2, max_wait=None):
    """Exercise the scraper API end to end against ``BASE_URL``.

    Runs six steps and prints the outcome of each: a health check
    (``GET /``), starting a job (``POST /scrape``) for the ``url`` read from
    ``config.yaml``, polling ``GET /jobs/{job_id}`` until the job reports
    ``completed``, ``failed`` or ``cancelled``, printing the final job
    details, downloading the reviews of a completed job into
    ``api_reviews_<job id prefix>.json``, and printing ``GET /stats``.

    Args:
        request_timeout: Seconds passed as ``timeout`` to every HTTP request
            so a stalled server cannot hang the script forever.
        poll_interval: Seconds to sleep between two status polls.
        max_wait: Optional cap, in seconds, on the total polling time.
            ``None`` (the default) polls until a terminal status is seen.

    Returns:
        None. Returns early (after printing a message) when no URL is
        configured, when the job cannot be started, or when the job endpoint
        stops reporting a status.

    Raises:
        requests.exceptions.RequestException: On connection failures or
            request timeouts.
        OSError: If ``config.yaml`` cannot be read or the output file cannot
            be written.
    """
    print("=" * 60)
    print("Testing Fast Google Reviews Scraper API")
    print("=" * 60)
    print()

    # 1. Health check
    print("1. Health Check")
    response = requests.get(f"{BASE_URL}/", timeout=request_timeout)
    print(f" Status: {response.status_code}")
    print(f" Response: {response.json()}")
    print()

    # 2. Start a scraping job
    print("2. Starting Scraping Job")

    # Read URL from config
    import yaml
    with open('config.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to an empty
        # mapping so the lookup below cannot raise AttributeError.
        config = yaml.safe_load(f) or {}
    url = config.get('url')
    if not url:
        print("❌ No 'url' configured in config.yaml!")
        return

    scrape_request = {
        "url": url,
        "headless": True,  # Run in headless mode
    }

    response = requests.post(
        f"{BASE_URL}/scrape", json=scrape_request, timeout=request_timeout
    )
    print(f" Status: {response.status_code}")
    result = response.json()
    print(f" Response: {result}")
    print()

    job_id = result.get('job_id')
    if not job_id:
        print("❌ Failed to start job!")
        return

    print(f" Job ID: {job_id}")
    print()

    # 3. Poll job status
    print("3. Polling Job Status")
    terminal_states = ('completed', 'failed', 'cancelled')
    # Monotonic clock: elapsed time stays correct if the wall clock changes.
    start_time = time.monotonic()

    while True:
        response = requests.get(
            f"{BASE_URL}/jobs/{job_id}", timeout=request_timeout
        )
        job = response.json()

        status = job.get('status')
        # "progress" may be present but null, so `or {}` rather than a default.
        progress = job.get('progress') or {}

        elapsed = time.monotonic() - start_time
        print(f" [{elapsed:.1f}s] Status: {status} - {progress.get('message', '')}")

        if status in terminal_states:
            break

        if status is None:
            # An error payload (no "status" key) will never become terminal;
            # stop instead of polling forever.
            print(f" ❌ Job status unavailable: {response.status_code} {job}")
            return

        if max_wait is not None and elapsed >= max_wait:
            print(f" ⚠️ Stopped polling after {elapsed:.1f}s (max_wait={max_wait}s)")
            break

        time.sleep(poll_interval)  # Poll every `poll_interval` seconds

    print()

    # 4. Get final job details
    print("4. Final Job Details")
    response = requests.get(f"{BASE_URL}/jobs/{job_id}", timeout=request_timeout)
    job = response.json()
    final_status = job.get('status')

    print(f" Status: {final_status}")
    # `or 0` also covers fields that are present but null in the JSON.
    print(f" Reviews Count: {job.get('reviews_count') or 0}")
    print(f" Scrape Time: {(job.get('scrape_time') or 0):.1f}s")

    if job.get('error_message'):
        print(f" Error: {job['error_message']}")

    progress = job.get('progress') or {}
    if progress.get('scroll_time') is not None:
        print(f" Scroll Time: {progress['scroll_time']:.1f}s")
    if progress.get('extract_time') is not None:
        print(f" Extract Time: {progress['extract_time']:.2f}s")

    print()

    # 5. Get reviews data
    if final_status == 'completed':
        print("5. Retrieving Reviews Data")
        response = requests.get(
            f"{BASE_URL}/jobs/{job_id}/reviews", timeout=request_timeout
        )

        if response.status_code == 200:
            reviews_data = response.json()
            reviews = reviews_data.get('reviews') or []
            count = reviews_data.get('count', len(reviews))

            print(f" Total Reviews: {count}")
            print()

            # Show first 3 reviews
            print(" Sample Reviews:")
            for i, review in enumerate(reviews[:3], 1):
                print(f" {i}. {review.get('author', 'Unknown')} - {review.get('rating', 0)}★")
                text = review.get('text', '')
                if text:
                    preview = text[:60] + "..." if len(text) > 60 else text
                    print(f" \"{preview}\"")
            print()

            # Save to file
            output_file = f"api_reviews_{str(job_id)[:8]}.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(reviews, f, indent=2, ensure_ascii=False)
            print(f" 💾 Saved all reviews to: {output_file}")

        else:
            print(f" ❌ Failed to get reviews: {response.status_code}")
            # The error body is not guaranteed to be JSON; fall back to text.
            try:
                error_body = response.json()
            except ValueError:
                error_body = response.text
            print(f" {error_body}")

        print()

    # 6. Get statistics
    print("6. Job Statistics")
    response = requests.get(f"{BASE_URL}/stats", timeout=request_timeout)
    stats = response.json()

    print(f" Total Jobs: {stats['total_jobs']}")
    print(f" Running Jobs: {stats['running_jobs']}/{stats['max_concurrent_jobs']}")
    print(f" By Status: {stats['by_status']}")
    print()

    print("=" * 60)
    print("✅ API Test Complete!")
    print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the API walkthrough and translate failures into
    # a readable message plus a non-zero exit status, so shells and CI jobs
    # can tell a failed run from a successful one.
    try:
        test_api()
    except requests.exceptions.ConnectionError:
        print("❌ Error: Could not connect to API server!")
        print()
        print("Please start the API server first:")
        print(" python api_server.py")
        print()
        raise SystemExit(1)
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user")
        # 130 = 128 + SIGINT, the conventional status for Ctrl-C.
        raise SystemExit(130)
    except Exception as e:
        # Top-level boundary: report anything unexpected with its traceback.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        raise SystemExit(1)
|
||||
Reference in New Issue
Block a user