Switch production to scraper_clean with hard refresh recovery
- Add fast_scrape_reviews() wrapper to scraper_clean.py for API compatibility
- Set window size (1200x900) in wrapper to ensure proper Google Maps rendering
- Update job_manager.py to import from scraper_clean instead of fast_scraper
- Production now uses clean scraper with:
  - Hard refresh recovery when stuck after 8+ soft recovery attempts
  - API interception + DOM parsing for complete data collection
  - Automatic deduplication across refreshes

Tested: 589/589 reviews collected in 55s

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -946,6 +946,101 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
}
|
||||
|
||||
|
||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False):
    """
    Production-compatible wrapper for scrape_reviews.

    Matches the API expected by job_manager.py. Any exception raised while
    creating the driver or scraping is caught and reported through the
    returned dictionary (``success=False`` and ``error`` set) instead of
    propagating to the caller.

    Args:
        url: Google Maps URL to scrape
        headless: Run Chrome in headless mode (only used when a driver is
            created here)
        max_scrolls: Not used (kept for API compatibility)
        progress_callback: Optional callback(current_count, total_count) for
            progress. total_count is always passed as None by this wrapper;
            current_count is the running sum of flushed batch sizes.
        driver: Existing driver instance to reuse. A driver supplied by the
            caller is never closed by this function.
        return_driver: If True, return driver in result (and leave it open)

    Returns:
        Dictionary with: reviews, count, total_reviews, time, success, error,
        driver. "driver" is None unless return_driver is True.
    """
    start_time = time.time()

    # Only a driver created by this call, and not handed back to the caller,
    # is ours to close.
    should_close_driver = driver is None and not return_driver

    try:
        # Create driver if not provided
        if driver is None:
            # Imported lazily and inside the try block: callers that pass
            # their own driver do not need seleniumbase, and an import failure
            # is reported through the normal error dictionary.
            from seleniumbase import Driver

            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
                agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            )
            driver.set_window_size(1200, 900)  # Proper viewport for Google Maps

        # Create progress wrapper if callback provided
        flush_callback = None
        if progress_callback:
            collected = 0

            def flush_with_progress(reviews_batch):
                # Accumulate the batch sizes so the callback sees a running total.
                nonlocal collected
                collected += len(reviews_batch)
                progress_callback(collected, None)

            flush_callback = flush_with_progress

        # Run the scraper
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
            flush_callback=flush_callback,
            flush_batch_size=100,  # Smaller batches for more frequent progress
        )

        # Return in expected format. The "driver" key is always present so the
        # success and failure dictionaries have the same shape.
        return {
            "reviews": result.get("reviews", []),
            "count": result.get("total", 0),
            "total_reviews": result.get("total", 0),
            "time": time.time() - start_time,
            "success": True,
            "error": None,
            "driver": driver if return_driver else None,
        }

    except Exception as e:
        return {
            "reviews": [],
            "count": 0,
            "total_reviews": 0,
            "time": time.time() - start_time,
            "success": False,
            "error": str(e),
            "driver": driver if return_driver else None,
        }

    finally:
        # Runs after the result dictionary is built, so "time" excludes the
        # shutdown cost, as before.
        if should_close_driver and driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: a failing quit() must not mask the
                # scrape result. KeyboardInterrupt/SystemExit still propagate.
                pass
|
||||
|
||||
|
||||
# Test function
|
||||
if __name__ == "__main__":
|
||||
from seleniumbase import Driver
|
||||
|
||||
Reference in New Issue
Block a user