Switch production to scraper_clean with hard refresh recovery
- Add fast_scrape_reviews() wrapper to scraper_clean.py for API compatibility - Set window size (1200x900) in wrapper to ensure proper Google Maps rendering - Update job_manager.py to import from scraper_clean instead of fast_scraper - Production now uses clean scraper with: - Hard refresh recovery when stuck after 8+ soft recovery attempts - API interception + DOM parsing for complete data collection - Automatic deduplication across refreshes Tested: 589/589 reviews collected in 55s Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,7 @@ from dataclasses import dataclass, asdict
|
|||||||
|
|
||||||
from modules.config import load_config
|
from modules.config import load_config
|
||||||
from modules.scraper import GoogleReviewsScraper
|
from modules.scraper import GoogleReviewsScraper
|
||||||
from modules.fast_scraper import fast_scrape_reviews
|
from modules.scraper_clean import fast_scrape_reviews # Updated to use clean scraper with hard refresh recovery
|
||||||
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
|
from modules.chrome_pool import get_scraping_worker, release_scraping_worker
|
||||||
|
|
||||||
log = logging.getLogger("scraper")
|
log = logging.getLogger("scraper")
|
||||||
|
|||||||
@@ -946,6 +946,101 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||||
|
progress_callback=None, driver=None, return_driver: bool = False):
|
||||||
|
"""
|
||||||
|
Production-compatible wrapper for scrape_reviews.
|
||||||
|
Matches the API expected by job_manager.py.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: Google Maps URL to scrape
|
||||||
|
headless: Run Chrome in headless mode
|
||||||
|
max_scrolls: Not used (kept for API compatibility)
|
||||||
|
progress_callback: Optional callback(current_count, total_count) for progress
|
||||||
|
driver: Existing driver instance to reuse
|
||||||
|
return_driver: If True, return driver in result
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with: reviews, count, total_reviews, time, success, error, driver
|
||||||
|
"""
|
||||||
|
from seleniumbase import Driver
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
driver_provided = driver is not None
|
||||||
|
should_close_driver = not return_driver and not driver_provided
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create driver if not provided
|
||||||
|
if not driver:
|
||||||
|
driver = Driver(
|
||||||
|
uc=True,
|
||||||
|
headless=headless,
|
||||||
|
page_load_strategy="normal",
|
||||||
|
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
|
)
|
||||||
|
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
|
||||||
|
|
||||||
|
# Create progress wrapper if callback provided
|
||||||
|
flush_callback = None
|
||||||
|
if progress_callback:
|
||||||
|
collected = [0]
|
||||||
|
def flush_with_progress(reviews_batch):
|
||||||
|
collected[0] += len(reviews_batch)
|
||||||
|
progress_callback(collected[0], None)
|
||||||
|
flush_callback = flush_with_progress
|
||||||
|
|
||||||
|
# Run the scraper
|
||||||
|
result = scrape_reviews(
|
||||||
|
driver=driver,
|
||||||
|
url=url,
|
||||||
|
max_reviews=999999, # Effectively unlimited
|
||||||
|
timeout_no_new=15,
|
||||||
|
flush_callback=flush_callback,
|
||||||
|
flush_batch_size=100 # Smaller batches for more frequent progress
|
||||||
|
)
|
||||||
|
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
|
||||||
|
# Return in expected format
|
||||||
|
response = {
|
||||||
|
"reviews": result.get("reviews", []),
|
||||||
|
"count": result.get("total", 0),
|
||||||
|
"total_reviews": result.get("total", 0),
|
||||||
|
"time": elapsed,
|
||||||
|
"success": True,
|
||||||
|
"error": None
|
||||||
|
}
|
||||||
|
|
||||||
|
if return_driver:
|
||||||
|
response["driver"] = driver
|
||||||
|
elif should_close_driver:
|
||||||
|
try:
|
||||||
|
driver.quit()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
|
||||||
|
if should_close_driver and driver:
|
||||||
|
try:
|
||||||
|
driver.quit()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return {
|
||||||
|
"reviews": [],
|
||||||
|
"count": 0,
|
||||||
|
"total_reviews": 0,
|
||||||
|
"time": elapsed,
|
||||||
|
"success": False,
|
||||||
|
"error": str(e),
|
||||||
|
"driver": driver if return_driver else None
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# Test function
|
# Test function
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
from seleniumbase import Driver
|
from seleniumbase import Driver
|
||||||
|
|||||||
Reference in New Issue
Block a user