From 65fcaf43e8103e9e6a0ab10eb709aac69f174d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Sat, 24 Jan 2026 11:10:34 +0000 Subject: [PATCH] Add Job DevTools specification document Comprehensive spec for observability suite including: - Structured logging system with categories - Crash intelligence and pattern analysis - Copy/export functionality - Session fingerprint panel - Real-time metrics dashboard - Review topics inference Organized by priority (P0-P3) with parallel implementation tracks. Co-Authored-By: Claude Opus 4.5 --- .artifacts/job-devtools-spec.md | 620 ++++++++++++++++++++++++++++++++ 1 file changed, 620 insertions(+) create mode 100644 .artifacts/job-devtools-spec.md diff --git a/.artifacts/job-devtools-spec.md b/.artifacts/job-devtools-spec.md new file mode 100644 index 0000000..8555cf6 --- /dev/null +++ b/.artifacts/job-devtools-spec.md @@ -0,0 +1,620 @@ +# Job DevTools - Observability Suite Specification + +## Executive Summary + +A comprehensive observability system for scraping jobs that provides real-time monitoring, crash analysis, session transparency, and debugging capabilities. Transforms opaque job execution into a fully inspectable process. 
+ +--- + +## Priority Matrix + +| Priority | Feature | Business Value | Complexity | +|----------|---------|----------------|------------| +| P0 | Structured Logging System | Foundation for all other features | Medium | +| P0 | Crash Intelligence | Reduce job failures, debug tab crashes | High | +| P0 | Copy & Export System | User productivity, support debugging | Low | +| P1 | Session Fingerprint Panel | Trust, transparency, debugging | Medium | +| P1 | Tabbed Log Viewer | Organized debugging experience | Medium | +| P2 | Real-time Metrics Dashboard | Visual monitoring, performance insight | Medium | +| P2 | Review Topics Inference | Enhanced analytics value | Low | +| P3 | Network Inspector | Deep debugging for edge cases | High | +| P3 | DOM Snapshots | Root cause analysis for crashes | High | +| P3 | Job Comparison | Performance optimization insights | Medium | + +--- + +## P0: Critical Features + +### 1. Structured Logging System + +**Problem:** Current logs are flat text strings without categorization, making it impossible to filter or analyze specific aspects of job execution. + +**Solution:** JSON-structured log entries with metadata. 
+ +**Log Entry Schema:** +```typescript +interface LogEntry { + timestamp: string; // ISO 8601 with milliseconds + timestamp_ms: number; // Unix ms for sorting/graphing + level: 'DEBUG' | 'INFO' | 'WARN' | 'ERROR' | 'FATAL'; + category: 'scraper' | 'browser' | 'network' | 'system'; + message: string; + + // Optional contextual data + metrics?: { + memory_mb?: number; + reviews_count?: number; + scroll_position?: number; + dom_nodes?: number; + }; + + // For crash correlation + snapshot_id?: string; + + // For network events + network?: { + url?: string; + method?: string; + status?: number; + size_bytes?: number; + duration_ms?: number; + }; +} +``` + +**Category Definitions:** + +| Category | What it captures | +|----------|------------------| +| `scraper` | Review extraction, batch progress, data parsing, topic extraction | +| `browser` | Page navigation, consent handling, tab clicks, scroll events, element waits | +| `network` | API interceptions, request/response data, rate limiting, failures | +| `system` | Memory pressure, Chrome process health, worker pool status, timeouts | + +**Backend Changes:** +- Replace `LogCapture` class with `StructuredLogger` +- All log calls include category and optional metrics +- Store as JSONB array in database (already `scrape_logs jsonb`) +- Stream via SSE with category field + +**Example Logs:** +```json +{"timestamp": "2024-01-24T14:32:01.234Z", "timestamp_ms": 1706106721234, "level": "INFO", "category": "browser", "message": "Navigating to Google Maps URL", "metrics": {"memory_mb": 245}} +{"timestamp": "2024-01-24T14:32:02.456Z", "timestamp_ms": 1706106722456, "level": "WARN", "category": "browser", "message": "Consent popup detected, handling...", "metrics": {"memory_mb": 248}} +{"timestamp": "2024-01-24T14:32:03.789Z", "timestamp_ms": 1706106723789, "level": "INFO", "category": "scraper", "message": "Extracted batch of 50 reviews from API", "metrics": {"reviews_count": 50, "memory_mb": 267}} +{"timestamp": 
"2024-01-24T14:32:45.123Z", "timestamp_ms": 1706106765123, "level": "ERROR", "category": "system", "message": "Chrome memory pressure critical", "metrics": {"memory_mb": 489, "dom_nodes": 12847}} +``` + +--- + +### 2. Crash Intelligence System + +**Problem:** Tab crashes are frequent but opaque. No visibility into what caused the crash or how to prevent it. + +**Solution:** Comprehensive crash detection, analysis, and remediation suggestions. + +**Crash Report Schema:** +```typescript +interface CrashReport { + crash_id: string; + job_id: string; + timestamp: string; + + // Crash classification + crash_type: 'tab_crash' | 'memory_exhaustion' | 'timeout' | 'network_failure' | 'element_not_found' | 'rate_limited'; + error_message: string; + error_code?: string; + + // State at crash + state: { + reviews_extracted: number; + total_expected: number; + scroll_count: number; + scroll_position: number; + elapsed_seconds: number; + }; + + // Resource metrics (last 10 readings) + metrics_history: Array<{ + timestamp_ms: number; + memory_mb: number; + dom_nodes: number; + cpu_percent?: number; + }>; + + // Last N log entries before crash + logs_before_crash: LogEntry[]; // Last 20 entries + + // Recovery info + last_successful_review_id?: string; + checkpoint_available: boolean; + + // Analysis + analysis: { + pattern: string; // e.g., "memory_exhaustion", "rate_limit_cascade" + confidence: number; // 0-100 + similar_crashes: number; // Count in last 7 days + suggested_fix: string; + auto_fixable: boolean; + }; + + // Optional artifacts + screenshot_url?: string; + dom_snapshot_id?: string; +} +``` + +**Crash Patterns to Detect:** + +| Pattern | Indicators | Suggested Fix | +|---------|------------|---------------| +| Memory Exhaustion | memory_mb > 450, rapid growth | Enable aggressive DOM cleanup | +| DOM Bloat | dom_nodes > 10000, not decreasing | Increase card hiding frequency | +| Rate Limited | Multiple 429 responses | Increase delays, rotate proxy | +| Consent Loop | 
Repeated consent URL detection | Clear cookies, different fingerprint | +| Element Timeout | Multiple "element not found" | Increase wait times, check selectors | +| Network Stall | No network activity > 30s | Refresh page, check connectivity | + +**Backend Implementation:** +- Wrap scraper execution in try/catch with crash capture +- Periodic metrics sampling (every 5 seconds) stored in ring buffer +- On crash: compile report, analyze pattern, store to database +- New table: `crash_reports` with JSONB data + +**Frontend Display:** +- Dedicated "Crash" section when job fails +- Timeline visualization showing metrics leading to crash +- Pattern explanation with confidence score +- One-click "Apply Fix & Retry" when auto_fixable=true + +--- + +### 3. Copy & Export System + +**Problem:** Users can't easily copy logs for debugging, sharing, or support requests. + +**Solution:** Multi-level copy functionality with various export formats. + +**Copy Levels:** + +| Action | What it copies | +|--------|----------------| +| Click log line | Single log entry as text | +| Shift+Click range | Selected range of logs | +| "Copy All" button | Entire log as formatted text | +| "Export JSON" | Full structured data with metrics | +| "Export TXT" | Human-readable plain text | +| "Share" | Generate shareable link (optional) | + +**Frontend Implementation:** +```typescript +interface CopySystem { + // Single line copy (click handler on each log row) + copyLine(entry: LogEntry): void; + + // Range selection (shift+click) + copyRange(startIndex: number, endIndex: number): void; + + // Full export + exportJSON(logs: LogEntry[], includeMetrics: boolean): string; + exportTXT(logs: LogEntry[]): string; + + // Clipboard with feedback + copyToClipboard(text: string): Promise<void>; // Shows toast on success +} +``` + +**UI Elements:** +- Each log row has hover-visible copy icon +- Selection highlight for range copy +- Top toolbar: [Copy All] [Export JSON] [Export TXT] +- Toast notification: "Copied 
to clipboard" +- Keyboard shortcuts: Ctrl+C for selected, Ctrl+Shift+C for all + +--- + +## P1: Important Features + +### 4. Session Fingerprint Panel + +**Problem:** Users don't know what browser identity was used during scraping, making it hard to debug location-specific issues or understand detection risks. + +**Solution:** Display all fingerprint parameters used during the session. + +**Session Info Schema:** +```typescript +interface SessionFingerprint { + // Identity + ip_address: string; // Server's outbound IP + ip_location: string; // "Frankfurt, DE" + user_agent: string; + platform: string; // "MacIntel" + language: string; // "es-ES" + + // Geolocation + geolocation: { + lat: number; + lng: number; + city: string; + accuracy_meters: number; + }; + timezone: string; // "Atlantic/Canary" + + // Viewport + viewport: { + width: number; + height: number; + device_pixel_ratio: number; + }; + + // Anti-detection status + bot_detection: { + webdriver_hidden: boolean; + headless_hidden: boolean; + plugins_spoofed: boolean; + canvas_fingerprint: 'unique' | 'generic' | 'blocked'; + webgl_fingerprint: 'unique' | 'generic' | 'blocked'; + }; + + // Source + fingerprint_source: 'user_browser' | 'randomized' | 'default'; +} +``` + +**Backend Implementation:** +- Capture fingerprint at job start +- Store in job metadata +- Include bot detection test results (run fingerprint tests on page load) +- Return in job status response + +**Frontend Display:** +- Collapsible panel in job details +- Visual indicators for detection risk (green/yellow/red) +- "What Google Saw" framing for user understanding + +--- + +### 5. Tabbed Log Viewer + +**Problem:** All logs mixed together makes it hard to focus on specific aspects. + +**Solution:** Category-based tabs with filtering. 
+ +**Tab Structure:** +``` +┌──────────┬──────────┬──────────┬──────────┬──────────┐ +│ All │ Scraper │ Browser │ Network │ System │ +│ (847) │ (423) │ (201) │ (156) │ (67) │ +└──────────┴──────────┴──────────┴──────────┴──────────┘ +``` + +**Features per Tab:** +- Log count badge +- Level filter dropdown (DEBUG/INFO/WARN/ERROR/FATAL) +- Search within tab +- Auto-scroll toggle +- Timestamp format toggle (relative/absolute) + +**Frontend Implementation:** +```typescript +interface LogViewerState { + activeTab: 'all' | 'scraper' | 'browser' | 'network' | 'system'; + levelFilter: Set<LogEntry['level']>; + searchQuery: string; + autoScroll: boolean; + timestampFormat: 'relative' | 'absolute'; +} +``` + +--- + +## P2: Enhanced Features + +### 6. Real-time Metrics Dashboard + +**Problem:** No visual insight into job performance during execution. + +**Solution:** Live-updating charts showing key metrics. + +**Metrics to Display:** + +| Metric | Chart Type | Update Frequency | +|--------|------------|------------------| +| Extraction Rate | Line chart (reviews/sec) | 5s | +| Cumulative Reviews | Area chart | 5s | +| Memory Usage | Line chart (MB) | 5s | +| Network Transfer | Line chart (KB) | 5s | +| Data Source Ratio | Pie chart (API vs DOM) | On change | + +**Data Structure:** +```typescript +interface MetricsSnapshot { + timestamp_ms: number; + reviews_total: number; + reviews_delta: number; // Since last snapshot + memory_mb: number; + network_bytes: number; + api_reviews: number; + dom_reviews: number; +} +``` + +**Backend Implementation:** +- Emit metrics via SSE every 5 seconds during job execution +- Store final metrics summary in job record + +**Frontend Implementation:** +- Recharts line/area charts +- 60-second rolling window during execution +- Full history available after completion + +--- + +### 7. Review Topics Inference + +**Problem:** We extract topic filters from Google but don't know which topics apply to each review. 
+ +**Solution:** Post-processing to infer topic matches from review text. + +**Algorithm:** +```python +def infer_review_topics(review_text: str, topics: List[dict]) -> List[str]: + """ + Match review text against extracted topic keywords. + + Args: + review_text: The review content + topics: List of {"topic": "cutting", "count": 3} + + Returns: + List of matched topic names + """ + matched = [] + text_lower = review_text.lower() + + for topic in topics: + keyword = topic['topic'].lower() + # Direct match + if keyword in text_lower: + matched.append(topic['topic']) + # Stemmed/variant match (e.g., "cut" matches "cutting") + elif any(variant in text_lower for variant in get_variants(keyword)): + matched.append(topic['topic']) + + return matched +``` + +**Storage:** +- Add `topics: string[]` field to each review object +- Process during scrape (after topics extracted, before reviews saved) + +**Frontend Display:** +- Topic tags on each review card +- Filter reviews by topic +- Topic distribution in analytics + +--- + +## P3: Advanced Features + +### 8. Network Inspector + +**Problem:** No visibility into API requests/responses for debugging rate limits or data issues. + +**Solution:** Chrome DevTools Network-style inspector. + +**Implementation:** +- Use Chrome DevTools Protocol (CDP) to intercept all network requests +- Filter for relevant domains (google.com/maps) +- Capture: URL, method, status, headers, timing, size +- Store subset in logs (full data in separate table if needed) + +**Display:** +- Sortable table of requests +- Click to expand: headers, response preview, timing breakdown +- Filter by: status (2xx/4xx/5xx), type (XHR/image/etc) + +--- + +### 9. DOM Snapshots + +**Problem:** Can't see what the page looked like at crash time or key moments. + +**Solution:** Periodic DOM state captures. 
+ +**Implementation:** +- Capture serialized DOM at key events (page load, tab click, every N scrolls, before crash) +- Store compressed in blob storage or base64 in database +- Include screenshot (CDP Page.captureScreenshot) + +**Display:** +- Snapshot timeline with thumbnails +- Side-by-side comparison between snapshots +- Diff view showing DOM changes + +--- + +### 10. Job Comparison + +**Problem:** No way to know if a job performed better or worse than typical. + +**Solution:** Compare metrics against historical baselines. + +**Metrics to Compare:** +- Total time +- Extraction rate (reviews/second) +- Memory peak +- Network transfer +- Success rate (extracted/expected) + +**Display:** +- Bar chart comparing this job vs average +- Percentile ranking ("faster than 73% of jobs") +- Anomaly detection ("This job used 2x more memory than typical") + +--- + +## Database Schema Changes + +```sql +-- Crash reports table +CREATE TABLE crash_reports ( + crash_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + job_id UUID REFERENCES jobs(job_id) ON DELETE CASCADE, + created_at TIMESTAMP NOT NULL DEFAULT NOW(), + crash_type VARCHAR(50) NOT NULL, + error_message TEXT, + state JSONB NOT NULL, + metrics_history JSONB, + logs_before_crash JSONB, + analysis JSONB, + screenshot_url TEXT, + dom_snapshot_id UUID +); + +CREATE INDEX idx_crash_reports_job ON crash_reports(job_id); +CREATE INDEX idx_crash_reports_type ON crash_reports(crash_type); +CREATE INDEX idx_crash_reports_created ON crash_reports(created_at DESC); + +-- Add session fingerprint to jobs +ALTER TABLE jobs ADD COLUMN session_fingerprint JSONB; + +-- Add metrics snapshots for completed jobs +ALTER TABLE jobs ADD COLUMN metrics_history JSONB; +``` + +--- + +## API Changes + +### New Endpoints + +``` +GET /jobs/{job_id}/logs?category=scraper&level=ERROR +GET /jobs/{job_id}/crash-report +GET /jobs/{job_id}/session +GET /jobs/{job_id}/metrics +POST /jobs/{job_id}/retry?apply_fix=memory_cleanup +``` + +### SSE Stream 
Changes + +Current: `{"type": "log", "message": "..."}` + +New: +```json +{ + "type": "log", + "data": { + "timestamp": "2024-01-24T14:32:01.234Z", + "timestamp_ms": 1706106721234, + "level": "INFO", + "category": "browser", + "message": "Navigating to Google Maps URL", + "metrics": {"memory_mb": 245} + } +} + +{ + "type": "metrics", + "data": { + "timestamp_ms": 1706106721234, + "reviews_total": 150, + "memory_mb": 312, + "network_bytes": 1248576 + } +} + +{ + "type": "crash", + "data": { /* CrashReport object */ } +} +``` + +--- + +## Frontend Component Structure + +``` +components/ + JobDevTools/ + index.tsx # Main container with tabs + LogViewer.tsx # Tabbed log display + LogEntry.tsx # Single log row with copy + CopyToolbar.tsx # Export buttons + MetricsDashboard.tsx # Charts container + SessionPanel.tsx # Fingerprint display + CrashReport.tsx # Crash analysis view + NetworkInspector.tsx # Request table (P3) + DOMSnapshots.tsx # Snapshot viewer (P3) +``` + +--- + +## Implementation Dependencies + +``` +P0: Structured Logging ──┬──▶ P1: Tabbed Log Viewer + │ + ├──▶ P0: Crash Intelligence + │ + └──▶ P2: Metrics Dashboard + +P0: Copy System ─────────▶ (independent, can parallel) + +P1: Session Fingerprint ─▶ (independent, can parallel) + +P2: Topics Inference ────▶ (independent, can parallel) + +P3: Network Inspector ───▶ Requires: Structured Logging +P3: DOM Snapshots ───────▶ Requires: Crash Intelligence +P3: Job Comparison ──────▶ Requires: Metrics Dashboard +``` + +--- + +## Success Metrics + +| Feature | Success Metric | +|---------|----------------| +| Structured Logging | 100% of logs have category + timestamp_ms | +| Crash Intelligence | 80% of crashes have identified pattern | +| Copy System | < 200ms copy operation, toast feedback | +| Session Panel | All 15+ fingerprint fields populated | +| Tabbed Viewer | < 50ms tab switch, correct counts | +| Metrics Dashboard | < 100ms chart update, no memory leak | +| Topics Inference | > 70% accuracy vs manual 
labeling | + +--- + +## Parallel Implementation Tracks + +### Track A: Backend Logging Infrastructure +1. StructuredLogger class +2. Database schema changes +3. SSE stream updates +4. Crash detection wrapper + +### Track B: Frontend Log Viewer +1. JobDevTools container +2. LogViewer with tabs +3. LogEntry with copy +4. CopyToolbar + +### Track C: Crash Analysis +1. CrashReport schema +2. Pattern detection algorithms +3. CrashReport frontend component +4. Retry with fix functionality + +### Track D: Session & Metrics +1. Fingerprint capture +2. SessionPanel component +3. Metrics streaming +4. MetricsDashboard charts + +### Track E: Review Topics +1. Topic inference algorithm +2. Add topics to review schema +3. Frontend topic tags +4. Topic filter in analytics