Remove old scraper files - consolidate to scraper_clean

Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files, including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Removes ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:25:00 +00:00
parent 80e7771c00
commit 8ccf72a489
37 changed files with 859 additions and 11116 deletions

View File

@@ -77,11 +77,17 @@ class DatabaseManager:
error_message TEXT,
metadata JSONB,
scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
""")
# Add scrape_logs column if it doesn't exist (for existing databases)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""")
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -182,10 +188,12 @@ class DatabaseManager:
started_at,
completed_at,
reviews_count,
total_reviews,
reviews_data,
scrape_time,
error_message,
metadata
metadata,
scrape_logs
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -246,8 +254,13 @@ class DatabaseManager:
kwargs['completed_at'] = datetime.now()
for key, value in kwargs.items():
set_clauses.append(f"{key} = ${param_idx}")
params.append(value)
# Handle JSONB fields specially
if key == 'scrape_logs' and value is not None:
set_clauses.append(f"{key} = ${param_idx}::jsonb")
params.append(json.dumps(value) if not isinstance(value, str) else value)
else:
set_clauses.append(f"{key} = ${param_idx}")
params.append(value)
param_idx += 1
query = f"""
@@ -264,7 +277,8 @@ class DatabaseManager:
job_id: UUID,
reviews: List[Dict[str, Any]],
scrape_time: float,
total_reviews: Optional[int] = None
total_reviews: Optional[int] = None,
scrape_logs: Optional[List[Dict[str, Any]]] = None
):
"""
Save scraping results to database.
@@ -274,6 +288,7 @@ class DatabaseManager:
reviews: List of review dictionaries
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper
"""
async with self.pool.acquire() as conn:
await conn.execute("""
@@ -284,9 +299,11 @@ class DatabaseManager:
reviews_count = $2,
total_reviews = $3,
reviews_data = $4::jsonb,
scrape_time = $5
scrape_time = $5,
scrape_logs = $6::jsonb
WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
json.dumps(scrape_logs) if scrape_logs else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
@@ -317,8 +334,10 @@ class DatabaseManager:
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message
error_message,
metadata
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
@@ -333,8 +352,10 @@ class DatabaseManager:
created_at,
completed_at,
reviews_count,
total_reviews,
scrape_time,
error_message
error_message,
metadata
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2