Wave 2: Migrate scraper to StructuredLogger, add crash detection & topic tags

- Task #2: Migrate scraper_clean.py to use StructuredLogger with categories
  (37 log calls with metrics across browser/scraper/network/system)
- Task #4: Add crash_reports table schema and database methods
  (save_crash_report, get_crash_report, get_crash_stats)
- Task #9: Implement crash detection wrapper with metrics sampling
  (get_chrome_memory, get_dom_node_count, classify_crash)
- Task #17: Add topic tags to frontend ReviewAnalytics
  (topic filter UI, tags on cards, topics in modal)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 12:17:23 +00:00
parent 313e32f358
commit 9e1bcde981
4 changed files with 526 additions and 74 deletions

View File

@@ -154,6 +154,41 @@ class DatabaseManager:
CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
""")
# Add session_fingerprint and metrics_history columns to jobs table
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS session_fingerprint JSONB;
""")
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS metrics_history JSONB;
""")
# Create crash_reports table
await conn.execute("""
CREATE TABLE IF NOT EXISTS crash_reports (
crash_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
job_id UUID REFERENCES jobs(job_id) ON DELETE CASCADE,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
crash_type VARCHAR(50) NOT NULL,
error_message TEXT,
state JSONB NOT NULL,
metrics_history JSONB,
logs_before_crash JSONB,
analysis JSONB,
screenshot_url TEXT,
dom_snapshot_id UUID
);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_crash_reports_job ON crash_reports(job_id);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_crash_reports_type ON crash_reports(crash_type);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_crash_reports_created ON crash_reports(created_at DESC);
""")
log.info("Database schema initialized")
# ==================== Job Operations ====================
@@ -657,3 +692,150 @@ class DatabaseManager:
INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
VALUES ($1, $2, $3, $4, $5, $6)
""", job_id, attempt_number, success, status_code, error_message, response_time_ms)
# ==================== Crash Reports ====================
async def save_crash_report(self, job_id: str, crash_data: dict) -> str:
    """
    Save a crash report and return the crash_id.

    Args:
        job_id: Job UUID as string (a UUID instance is also accepted)
        crash_data: Dictionary containing crash report data:
            - crash_type: Type of crash (required)
            - error_message: Error message (optional)
            - state: Current state at crash time (required; an absent
              key is stored as an empty JSON object)
            - metrics_history: Historical metrics (optional)
            - logs_before_crash: Log entries before crash (optional)
            - analysis: Crash analysis data (optional)
            - screenshot_url: URL to screenshot (optional)
            - dom_snapshot_id: UUID of DOM snapshot, as a string or a
              UUID instance (optional)

    Returns:
        UUID of created crash report as string
    """
    def _as_uuid(value):
        # Only strings need parsing; UUID(...) raises AttributeError when
        # handed an existing UUID instance, so those pass through untouched.
        return UUID(value) if isinstance(value, str) else value

    def _as_json(value):
        # Falsy optional payloads (None, {}, []) are passed as None
        # instead of being serialized.
        return json.dumps(value) if value else None

    async with self.pool.acquire() as conn:
        dom_snapshot_id = crash_data.get('dom_snapshot_id')

        crash_id = await conn.fetchval(
            """
            INSERT INTO crash_reports (
                job_id,
                crash_type,
                error_message,
                state,
                metrics_history,
                logs_before_crash,
                analysis,
                screenshot_url,
                dom_snapshot_id
            )
            VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb, $8, $9)
            RETURNING crash_id
            """,
            _as_uuid(job_id),
            crash_data.get('crash_type'),
            crash_data.get('error_message'),
            json.dumps(crash_data.get('state', {})),
            _as_json(crash_data.get('metrics_history')),
            _as_json(crash_data.get('logs_before_crash')),
            _as_json(crash_data.get('analysis')),
            crash_data.get('screenshot_url'),
            _as_uuid(dom_snapshot_id) if dom_snapshot_id else None,
        )

        log.info(f"Saved crash report {crash_id} for job {job_id}, type: {crash_data.get('crash_type')}")
        return str(crash_id)
async def get_crash_report(self, job_id: str) -> Optional[dict]:
    """
    Fetch the most recent crash report recorded for a job.

    Args:
        job_id: Job UUID as string

    Returns:
        The newest crash report (by created_at) as a dictionary, with
        crash_id, job_id and any dom_snapshot_id converted to strings,
        or None if the job has no crash report
    """
    latest_report_sql = """
        SELECT
            crash_id,
            job_id,
            created_at,
            crash_type,
            error_message,
            state,
            metrics_history,
            logs_before_crash,
            analysis,
            screenshot_url,
            dom_snapshot_id
        FROM crash_reports
        WHERE job_id = $1
        ORDER BY created_at DESC
        LIMIT 1
    """

    async with self.pool.acquire() as conn:
        if isinstance(job_id, str):
            lookup_id = UUID(job_id)
        else:
            lookup_id = job_id

        record = await conn.fetchrow(latest_report_sql, lookup_id)
        if not record:
            return None

        report = dict(record)

        # UUID objects are not JSON serializable, so hand back strings.
        for key in ('crash_id', 'job_id'):
            report[key] = str(report[key])
        # The snapshot reference is optional; leave a missing one as-is.
        if report.get('dom_snapshot_id'):
            report['dom_snapshot_id'] = str(report['dom_snapshot_id'])

        return report
async def get_crash_stats(self, days: int = 7) -> dict:
    """
    Get crash statistics for the last N days.

    Args:
        days: Number of days to look back (default: 7)

    Returns:
        Dictionary with:
            - total: Total number of crashes
            - by_type: Dict mapping crash type to count
            - by_day: List of dicts with date and count
    """
    # The look-back window is bound as a real query parameter ($1) and
    # turned into an interval server-side. A '%s' inside a quoted
    # interval literal is never substituted, so the query would declare
    # no parameters while one is still being passed.
    async with self.pool.acquire() as conn:
        # Get total count
        total = await conn.fetchval("""
            SELECT COUNT(*)
            FROM crash_reports
            WHERE created_at >= NOW() - make_interval(days => $1)
        """, days)

        # Get counts by type
        type_rows = await conn.fetch("""
            SELECT crash_type, COUNT(*) as count
            FROM crash_reports
            WHERE created_at >= NOW() - make_interval(days => $1)
            GROUP BY crash_type
            ORDER BY count DESC
        """, days)
        by_type = {row['crash_type']: row['count'] for row in type_rows}

        # Get counts by day
        day_rows = await conn.fetch("""
            SELECT DATE(created_at) as date, COUNT(*) as count
            FROM crash_reports
            WHERE created_at >= NOW() - make_interval(days => $1)
            GROUP BY DATE(created_at)
            ORDER BY date DESC
        """, days)
        by_day = [{'date': str(row['date']), 'count': row['count']} for row in day_rows]

        return {
            'total': total or 0,
            'by_type': by_type,
            'by_day': by_day
        }