Switch production to scraper_clean with hard refresh recovery

- Add fast_scrape_reviews() wrapper to scraper_clean.py for API compatibility - Set window size (1200x900) in wrapper to ensure proper Google Maps rendering - Update job_manager.py to import from scraper_clean instead of fast_scraper - Production now uses clean scraper with: - Hard refresh recovery when stuck after 8+ soft recovery attempts - API interception + DOM parsing for complete data collection - Automatic deduplication across refreshes Tested: 589/589 reviews collected in 55s Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 14:18:10 +00:00
parent ff03a4a1b7
commit a6d6531543
2 changed files with 96 additions and 1 deletions
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -946,6 +946,101 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    }


+def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
+                        progress_callback=None, driver=None, return_driver: bool = False):
+    """
+    Production-compatible wrapper for scrape_reviews.
+    Matches the API expected by job_manager.py.
+
+    Args:
+        url: Google Maps URL to scrape
+        headless: Run Chrome in headless mode
+        max_scrolls: Not used (kept for API compatibility)
+        progress_callback: Optional callback(current_count, total_count) for progress
+        driver: Existing driver instance to reuse
+        return_driver: If True, return driver in result
+
+    Returns:
+        Dictionary with: reviews, count, total_reviews, time, success, error, driver
+    """
+    from seleniumbase import Driver
+
+    start_time = time.time()
+    driver_provided = driver is not None
+    should_close_driver = not return_driver and not driver_provided
+
+    try:
+        # Create driver if not provided
+        if not driver:
+            driver = Driver(
+                uc=True,
+                headless=headless,
+                page_load_strategy="normal",
+                agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            )
+            driver.set_window_size(1200, 900)  # Proper viewport for Google Maps
+
+        # Create progress wrapper if callback provided
+        flush_callback = None
+        if progress_callback:
+            collected = [0]
+            def flush_with_progress(reviews_batch):
+                collected[0] += len(reviews_batch)
+                progress_callback(collected[0], None)
+            flush_callback = flush_with_progress
+
+        # Run the scraper
+        result = scrape_reviews(
+            driver=driver,
+            url=url,
+            max_reviews=999999,  # Effectively unlimited
+            timeout_no_new=15,
+            flush_callback=flush_callback,
+            flush_batch_size=100  # Smaller batches for more frequent progress
+        )
+
+        elapsed = time.time() - start_time
+
+        # Return in expected format
+        response = {
+            "reviews": result.get("reviews", []),
+            "count": result.get("total", 0),
+            "total_reviews": result.get("total", 0),
+            "time": elapsed,
+            "success": True,
+            "error": None
+        }
+
+        if return_driver:
+            response["driver"] = driver
+        elif should_close_driver:
+            try:
+                driver.quit()
+            except:
+                pass
+
+        return response
+
+    except Exception as e:
+        elapsed = time.time() - start_time
+
+        if should_close_driver and driver:
+            try:
+                driver.quit()
+            except:
+                pass
+
+        return {
+            "reviews": [],
+            "count": 0,
+            "total_reviews": 0,
+            "time": elapsed,
+            "success": False,
+            "error": str(e),
+            "driver": driver if return_driver else None
+        }
+
+
 # Test function
 if __name__ == "__main__":
    from seleniumbase import Driver