Remove old scraper files - consolidate to scraper_clean
Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -77,11 +77,17 @@ class DatabaseManager:
|
||||
|
||||
error_message TEXT,
|
||||
metadata JSONB,
|
||||
scrape_logs JSONB,
|
||||
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
||||
);
|
||||
""")
|
||||
|
||||
# Add scrape_logs column if it doesn't exist (for existing databases)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||
@@ -182,10 +188,12 @@ class DatabaseManager:
|
||||
started_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
reviews_data,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
metadata,
|
||||
scrape_logs
|
||||
FROM jobs
|
||||
WHERE job_id = $1
|
||||
""", job_id)
|
||||
@@ -246,8 +254,13 @@ class DatabaseManager:
|
||||
kwargs['completed_at'] = datetime.now()
|
||||
|
||||
for key, value in kwargs.items():
|
||||
set_clauses.append(f"{key} = ${param_idx}")
|
||||
params.append(value)
|
||||
# Handle JSONB fields specially
|
||||
if key == 'scrape_logs' and value is not None:
|
||||
set_clauses.append(f"{key} = ${param_idx}::jsonb")
|
||||
params.append(json.dumps(value) if not isinstance(value, str) else value)
|
||||
else:
|
||||
set_clauses.append(f"{key} = ${param_idx}")
|
||||
params.append(value)
|
||||
param_idx += 1
|
||||
|
||||
query = f"""
|
||||
@@ -264,7 +277,8 @@ class DatabaseManager:
|
||||
job_id: UUID,
|
||||
reviews: List[Dict[str, Any]],
|
||||
scrape_time: float,
|
||||
total_reviews: Optional[int] = None
|
||||
total_reviews: Optional[int] = None,
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
||||
):
|
||||
"""
|
||||
Save scraping results to database.
|
||||
@@ -274,6 +288,7 @@ class DatabaseManager:
|
||||
reviews: List of review dictionaries
|
||||
scrape_time: Time taken to scrape in seconds
|
||||
total_reviews: Total reviews available (from page counter)
|
||||
scrape_logs: List of log entries from the scraper
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
@@ -284,9 +299,11 @@ class DatabaseManager:
|
||||
reviews_count = $2,
|
||||
total_reviews = $3,
|
||||
reviews_data = $4::jsonb,
|
||||
scrape_time = $5
|
||||
scrape_time = $5,
|
||||
scrape_logs = $6::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time)
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
|
||||
json.dumps(scrape_logs) if scrape_logs else None)
|
||||
|
||||
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
@@ -317,8 +334,10 @@ class DatabaseManager:
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message
|
||||
error_message,
|
||||
metadata
|
||||
FROM jobs
|
||||
WHERE status = $1
|
||||
ORDER BY created_at DESC
|
||||
@@ -333,8 +352,10 @@ class DatabaseManager:
|
||||
created_at,
|
||||
completed_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message
|
||||
error_message,
|
||||
metadata
|
||||
FROM jobs
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
|
||||
Reference in New Issue
Block a user