# Provenance (from the patch this code was delivered in):
#   commit a6d65315431eaca409a291e23233eb1071e17d6b, Thu, 22 Jan 2026 14:18:10 +0000
#   Subject: Switch production to scraper_clean with hard refresh recovery
#   - Adds the fast_scrape_reviews() wrapper to modules/scraper_clean.py for
#     API compatibility and sets the window size (1200x900) in the wrapper.
#   - modules/job_manager.py now imports fast_scrape_reviews from
#     modules.scraper_clean instead of modules.fast_scraper.
#   - Commit message reports: 589/589 reviews collected in 55s.


def _quit_driver_quietly(driver) -> None:
    """Shut down *driver*, ignoring any error raised by ``quit()``.

    Shutdown is best-effort: a failure to close the browser must not mask
    the scrape result (or the original scrape error) being returned.
    """
    try:
        driver.quit()
    except Exception:
        # Deliberately ignored; there is nothing useful the caller can do.
        pass


def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
                        progress_callback=None, driver=None, return_driver: bool = False) -> dict:
    """
    Production-compatible wrapper for scrape_reviews.
    Matches the API expected by job_manager.py.

    Args:
        url: Google Maps URL to scrape
        headless: Run Chrome in headless mode (only used when a driver is created here)
        max_scrolls: Not used (kept for API compatibility)
        progress_callback: Optional callback(current_count, total_count) for progress.
            It is invoked once per flushed batch with the running number of
            flushed reviews; total_count is always passed as None.
        driver: Existing driver instance to reuse. When given, it is never
            closed by this function and seleniumbase is not imported.
        return_driver: If True, return driver in result (and do not close it)

    Returns:
        Dictionary with: reviews, count, total_reviews, time (elapsed seconds),
        success, error. On success the "driver" key is present only when
        return_driver is True. On failure "driver" is always present (None
        unless return_driver is True), "success" is False and "error" holds
        str(exception). Exceptions, including a failure to import or create
        the driver, are reported through the result rather than raised.
    """
    start_time = time.time()
    driver_provided = driver is not None
    # Only close a driver that was created here and is not handed back.
    should_close_driver = not return_driver and not driver_provided

    try:
        # Create driver if not provided
        if driver is None:
            # Imported lazily so callers that supply their own driver do not
            # need seleniumbase to be importable.
            from seleniumbase import Driver

            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
                agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
        driver.set_window_size(1200, 900)  # Proper viewport for Google Maps

        # Create progress wrapper if callback provided
        flush_callback = None
        if progress_callback:
            collected = 0

            def flush_with_progress(reviews_batch):
                nonlocal collected
                collected += len(reviews_batch)
                progress_callback(collected, None)

            flush_callback = flush_with_progress

        # Run the scraper
        result = scrape_reviews(
            driver=driver,
            url=url,
            max_reviews=999999,  # Effectively unlimited
            timeout_no_new=15,
            flush_callback=flush_callback,
            flush_batch_size=100  # Smaller batches for more frequent progress
        )

        # Return in expected format
        response = {
            "reviews": result.get("reviews", []),
            "count": result.get("total", 0),
            "total_reviews": result.get("total", 0),
            "time": time.time() - start_time,
            "success": True,
            "error": None
        }
        if return_driver:
            response["driver"] = driver
        return response

    except Exception as e:
        return {
            "reviews": [],
            "count": 0,
            "total_reviews": 0,
            "time": time.time() - start_time,
            "success": False,
            "error": str(e),
            "driver": driver if return_driver else None
        }

    finally:
        # Single cleanup point: runs on success, on handled errors, and on
        # KeyboardInterrupt/SystemExit, so an owned browser is never leaked.
        if should_close_driver and driver is not None:
            _quit_driver_quietly(driver)