Remove old scraper files - consolidate to scraper_clean

Production (api_server_production.py) uses only:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Removes ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:25:00 +00:00
parent 80e7771c00
commit 8ccf72a489
37 changed files with 859 additions and 11116 deletions

View File

@@ -35,16 +35,45 @@ class ChromeWorker:
# SeleniumBase Driver with uc=True enables UC mode anti-detection
# Initialize with longer timeouts for large scraping jobs
# Chrome arguments for Docker stability
chrome_args = [
"--disable-dev-shm-usage", # Use /tmp instead of /dev/shm (critical for Docker)
"--disable-gpu", # Disable GPU acceleration
"--no-sandbox", # Required for Docker
"--disable-software-rasterizer",
"--disable-extensions",
"--disable-background-networking",
"--disable-default-apps",
"--disable-sync",
"--metrics-recording-only",
"--mute-audio",
"--no-first-run",
"--safebrowsing-disable-auto-update",
]
self.driver = Driver(
uc=True,
headless=self.headless,
page_load_strategy="normal"
page_load_strategy="normal",
chromium_arg=",".join(chrome_args)
)
# Set generous timeouts for large scraping jobs
self.driver.set_page_load_timeout(120) # 2 minutes for slow networks
self.driver.set_script_timeout(60) # 1 minute for complex extraction
# Set Chrome geolocation to US (Boston, MA) for consistent Google Maps results
# This prevents location-based variations in search results
try:
self.driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log.info(f"Worker {self.worker_id}: Geolocation set to US (Boston, MA)")
except Exception as e:
log.warning(f"Worker {self.worker_id}: Could not set geolocation: {e}")
self.driver.maximize_window()
self.created_at = time.time()
self.last_used = time.time()