Remove old scraper files - consolidate to scraper_clean
Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -35,16 +35,45 @@ class ChromeWorker:
|
||||
|
||||
# SeleniumBase Driver automatically includes UC mode anti-detection
|
||||
# Initialize with longer timeouts for large scraping jobs
|
||||
# Chrome arguments for Docker stability
|
||||
chrome_args = [
|
||||
"--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker)
|
||||
"--disable-gpu", # Disable GPU acceleration
|
||||
"--no-sandbox", # Required for Docker
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-extensions",
|
||||
"--disable-background-networking",
|
||||
"--disable-default-apps",
|
||||
"--disable-sync",
|
||||
"--metrics-recording-only",
|
||||
"--mute-audio",
|
||||
"--no-first-run",
|
||||
"--safebrowsing-disable-auto-update",
|
||||
]
|
||||
|
||||
self.driver = Driver(
|
||||
uc=True,
|
||||
headless=self.headless,
|
||||
page_load_strategy="normal"
|
||||
page_load_strategy="normal",
|
||||
chromium_arg=",".join(chrome_args)
|
||||
)
|
||||
|
||||
# Set generous timeouts for large scraping jobs
|
||||
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
|
||||
self.driver.set_script_timeout(60) # 1 minute for complex extraction
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
|
||||
# This prevents location-based variations in search results
|
||||
try:
|
||||
self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
|
||||
except Exception as e:
|
||||
log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")
|
||||
|
||||
self.driver.maximize_window()
|
||||
self.created_at = time.time()
|
||||
self.last_used = time.time()
|
||||
|
||||
Reference in New Issue
Block a user