From 12d37e350b1a7ff46d7a1bf6ca34323586ff5fbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Sat, 24 Jan 2026 15:13:19 +0000 Subject: [PATCH] Fix JobDevTools contrast + log normalization, add Platform Spec - Fix contrast issues in JobDevTools (level badges, text colors, timestamps) - Make log normalization more robust (handles old/new formats, edge cases) - Add ReviewIQ Platform Spec v1.2 defining: - Multi-tenant scraping-as-a-service architecture - Requester metadata, batches, webhooks, priority - Scraper versioning with A/B testing (stable/beta/canary) - API endpoints for job types, dashboard, admin - Output schemas for external service integration - Project structure reorganization plan Co-Authored-By: Claude Opus 4.5 --- .artifacts/ReviewIQ-Platform-Spec.md | 734 +++++++++++++++++++++++++++ web/app/jobs/[id]/page.tsx | 121 +++-- web/components/JobDevTools/index.tsx | 52 +- 3 files changed, 825 insertions(+), 82 deletions(-) create mode 100644 .artifacts/ReviewIQ-Platform-Spec.md diff --git a/.artifacts/ReviewIQ-Platform-Spec.md b/.artifacts/ReviewIQ-Platform-Spec.md new file mode 100644 index 0000000..ec0906b --- /dev/null +++ b/.artifacts/ReviewIQ-Platform-Spec.md @@ -0,0 +1,734 @@ +# ReviewIQ Scraping Platform - Specification + +> **Purpose**: Define WHAT the platform should do, not HOW. This document serves as the source of truth during implementation. + +--- + +## 1. Vision + +Transform the current Google Reviews scraper into a **multi-tenant scraping-as-a-service platform** that: + +- Serves external clients via API (initially veritasreview.com) +- Supports multiple scraping job types (reviews, business info, etc.) +- Provides full observability into system performance and problems +- Enables safe scraper iteration through versioning and A/B testing + +--- + +## 2. 
Core Concepts + +### 2.1 Job Types +The platform executes different types of scraping jobs: +- `google_reviews` (current, primary) +- Future: `yelp_reviews`, `tripadvisor_reviews`, `google_business_info`, etc. + +Each job type has its own: +- Input parameters +- Output schema +- Scraper implementation(s) + +### 2.2 Requesters +External systems that request scraping jobs: +- Identified by `client_id` (e.g., "veritas_client_123") +- Originate from a `source` (e.g., "veritasreview.com") +- Have a `purpose` for scraping: + - `client_report` - generating reports for their clients + - `prospect_screening` - evaluating potential clients + - `market_research` - competitive/market analysis + +### 2.3 Batches +Jobs can be grouped into batches: +- A batch is a collection of related jobs (e.g., "Q1 Prospect List") +- Batches have their own completion callback +- Dashboard shows batch progress and aggregate stats + +### 2.4 Scraper Versions +Each job type can have multiple scraper versions: +- **Variants**: `stable`, `beta`, `canary` +- **Traffic routing**: A/B testing via percentage allocation +- **Version pinning**: Clients can request specific versions +- **Safe rollouts**: Promote canary → beta → stable + +### 2.5 Priority Levels +Jobs have priority that affects execution order: +- `0` = normal +- `1` = high +- `2` = urgent + +--- + +## 3. 
Features + +### 3.1 API - Job Submission + +**Single job submission:** +- Submit a scraping job for a specific job type +- Include requester identification +- Optionally specify priority, callback URL, scraper variant +- Returns job ID immediately + +**Batch submission:** +- Submit multiple URLs as a single batch +- Batch has a name and optional batch-level callback +- Individual jobs track their position in batch +- Batch callback fires when all jobs complete + +### 3.2 API - Job Management + +- Get job status and results +- Cancel pending/running jobs +- Retry failed jobs +- List jobs with filtering (by client, status, date, batch, job type) + +### 3.3 API - Webhooks + +When a job completes (success or failure): +- POST to the provided `callback_url` +- Include job ID, status, summary results, error info if failed +- Track callback delivery status (pending, sent, failed) +- Retry failed callbacks + +When a batch completes: +- POST to batch-level callback +- Include batch summary (total, succeeded, failed) + +### 3.4 Main Dashboard + +**System Overview:** +- Total jobs (24h / 7d / 30d) +- Success rate trend +- Currently running jobs +- Recent failures / problems requiring attention + +**By Client/Source:** +- Jobs per client +- Top consumers (volume) +- Error rates by client +- Purpose breakdown per client + +**By Job Type:** +- Volume per job type +- Success rate per type +- Average duration per type + +**By Scraper Version:** +- Performance comparison across versions +- Success rate by version +- Duration by version +- Ability to identify when beta outperforms stable + +**Problems & Alerts:** +- Recent failures with error types +- Slow jobs (exceeding expected duration) +- Callback delivery failures +- Clients with elevated error rates + +### 3.5 Job Detail View (existing, enhanced) + +Current functionality preserved, plus: +- Show requester info (client, source, purpose) +- Show batch membership if applicable +- Show scraper version that executed +- Link to 
related jobs (same batch, same client) + +### 3.6 Analytics View + +Per-job analytics (existing) remains for Google Reviews: +- Rating distribution +- Sentiment analysis +- Review topics +- Timeline + +Future: type-specific analytics for other job types. + +--- + +## 4. Data Model + +### 4.1 Jobs (enhanced) + +**Existing fields preserved.** + +**New requester fields:** +- `requester_client_id` - which client requested this +- `requester_source` - origin system (veritasreview.com) +- `scrape_purpose` - why (client_report, prospect_screening, market_research) +- `requester_metadata` - flexible JSON for additional context + +**New batch fields:** +- `batch_id` - links to batch if part of one +- `batch_index` - position in batch (1, 2, 3...) + +**New execution fields:** +- `job_type` - type of scraping job (google_reviews, etc.) +- `scraper_version` - exact version that executed (1.2.0) +- `scraper_variant` - variant used (stable, beta, canary) +- `priority` - execution priority (0, 1, 2) + +**New callback fields:** +- `callback_url` - where to POST on completion +- `callback_status` - pending, sent, failed +- `callback_sent_at` - when callback was delivered +- `callback_attempts` - retry count + +### 4.2 Batches (new) + +- `id` - unique identifier +- `name` - human readable name +- `requester_client_id` - client who submitted +- `requester_source` - origin system +- `scrape_purpose` - purpose for all jobs in batch +- `total_jobs` - count of jobs in batch +- `completed_jobs` - count finished (success or fail) +- `failed_jobs` - count failed +- `status` - pending, running, completed +- `callback_url` - batch completion webhook +- `callback_status` - pending, sent, failed +- `created_at` - when batch was created +- `completed_at` - when last job finished +- `metadata` - flexible JSON + +### 4.3 Scraper Registry (new) + +- `id` - unique identifier +- `job_type` - which job type this scraper handles +- `version` - semantic version (1.2.0, 2.0.0-beta) +- `variant` - stable, 
beta, canary +- `module_path` - Python module path +- `function_name` - entry function +- `is_default` - use if no version specified +- `traffic_pct` - percentage of traffic for A/B testing +- `min_priority` - only use for jobs at or above this priority +- `created_at` - when registered +- `deprecated_at` - when marked deprecated (null if active) +- `config` - version-specific configuration JSON + +### 4.4 Generic Result Summary + +Jobs have a `result_summary` JSON field for cross-type dashboard: +```json +{ + "item_count": 150, + "primary_metric": 4.2, + "primary_metric_label": "rating", + "secondary_metrics": { + "reviews_with_text": 120, + "avg_review_length": 45 + } +} +``` + +This enables the dashboard to show unified metrics across job types. + +--- + +## 5. API Endpoints + +### 5.1 Scraping Endpoints + +``` +POST /api/scrape/google-reviews +POST /api/scrape/yelp-reviews (future) +POST /api/scrape/tripadvisor-reviews (future) +``` + +Each accepts type-specific parameters plus common fields: +- `requester` object (client_id, source, purpose, metadata) +- `priority` (0, 1, 2) +- `callback_url` +- `scraper_version` or `scraper_variant` (optional) + +### 5.2 Batch Endpoint + +``` +POST /api/scrape/google-reviews/batch +``` + +Accepts: +- `name` - batch name +- `urls` - array of URLs +- `requester` object +- `priority` +- `callback_url` - called when entire batch completes + +### 5.3 Management Endpoints + +``` +GET /api/jobs - list with filters +GET /api/jobs/{id} - job detail +DELETE /api/jobs/{id} - cancel job +POST /api/jobs/{id}/retry - retry failed job + +GET /api/batches - list batches +GET /api/batches/{id} - batch detail with job list +DELETE /api/batches/{id} - cancel all pending jobs in batch +``` + +### 5.4 Dashboard Endpoints + +``` +GET /api/dashboard/overview - system stats +GET /api/dashboard/by-client - breakdown by client +GET /api/dashboard/by-job-type - breakdown by job type +GET /api/dashboard/by-version - scraper version comparison +GET 
/api/dashboard/problems - recent failures, alerts +``` + +### 5.5 Admin Endpoints + +``` +GET /api/admin/scrapers - list registered scrapers +POST /api/admin/scrapers - register new scraper version +PUT /api/admin/scrapers/{id}/traffic - update traffic percentage +POST /api/admin/scrapers/{id}/deprecate - mark deprecated +POST /api/admin/scrapers/{id}/promote - promote to stable +``` + +--- + +## 6. Output Schemas + +Each job type has a defined output schema. External services (like veritasreview.com) consume this data to generate insights. + +### 6.1 Google Reviews Output + +**Business Summary:** +```json +{ + "business": { + "name": "Acme Restaurant", + "place_id": "ChIJ...", + "address": "123 Main St, City, State", + "category": "Restaurant", + "total_reviews": 1250, + "rating": 4.3, + "rating_distribution": { + "5": 720, + "4": 280, + "3": 120, + "2": 80, + "1": 50 + }, + "scraped_at": "2025-01-24T10:30:00Z" + } +} +``` + +**Review Object:** +```json +{ + "review_id": "abc123", + "author": { + "name": "John D.", + "profile_url": "https://...", + "is_local_guide": true, + "review_count": 42, + "photo_count": 15 + }, + "rating": 4, + "text": "Great food and service...", + "language": "en", + "published_at": "2025-01-15T14:30:00Z", + "photos": [ + { "url": "https://...", "caption": null } + ], + "owner_response": { + "text": "Thank you for your feedback...", + "responded_at": "2025-01-16T09:00:00Z" + }, + "metadata": { + "source": "dom", + "extracted_at": "2025-01-24T10:35:00Z" + } +} +``` + +**Key fields for insights service:** +- `rating` + `text` → Sentiment analysis, rating correlation +- `published_at` → Trend analysis, seasonality +- `language` → Multi-language support +- `owner_response` → Engagement metrics, response rate +- `author.is_local_guide` → Review credibility weighting +- `rating_distribution` → Rating spread analysis + +### 6.2 Future Job Types + +Other scrapers (Yelp, TripAdvisor, etc.) 
will have their own schemas but follow similar patterns: +- Business summary with ratings +- Individual review objects +- Author metadata +- Timestamps for trend analysis + +--- + +## 7. Webhook Payloads + +### 7.1 Job Completion + +```json +{ + "event": "job.completed", + "job_id": "uuid", + "job_type": "google_reviews", + "status": "completed", + "url": "https://google.com/maps/...", + "result_summary": { + "item_count": 150, + "primary_metric": 4.2 + }, + "scraper_version": "1.2.0", + "duration_seconds": 45.2, + "completed_at": "2024-01-15T10:30:00Z" +} +``` + +### 7.2 Job Failed + +```json +{ + "event": "job.failed", + "job_id": "uuid", + "job_type": "google_reviews", + "status": "failed", + "url": "https://google.com/maps/...", + "error": { + "type": "rate_limited", + "message": "Google rate limit detected" + }, + "scraper_version": "1.2.0", + "duration_seconds": 12.5, + "failed_at": "2024-01-15T10:30:00Z" +} +``` + +### 7.3 Batch Completion + +```json +{ + "event": "batch.completed", + "batch_id": "uuid", + "name": "Q1 Prospects", + "total_jobs": 50, + "succeeded": 47, + "failed": 3, + "completed_at": "2024-01-15T10:30:00Z", + "failed_job_ids": ["uuid1", "uuid2", "uuid3"] +} +``` + +--- + +## 8. UI Pages + +### 8.1 Main Dashboard (`/dashboard`) +- System health at a glance +- Key metrics with trends +- Problem alerts +- Quick links to drill down + +### 8.2 Clients View (`/dashboard/clients`) +- Table of clients with job counts, success rates +- Click to see client's jobs + +### 8.3 Scrapers View (`/dashboard/scrapers`) +- Registered scraper versions +- Performance comparison +- Traffic allocation controls +- Promote/deprecate actions + +### 8.4 Jobs View (`/jobs`) - enhanced +- Add filters: client, job type, batch, scraper version +- Show requester info in job cards + +### 8.5 Batches View (`/batches`) +- List of batches with progress +- Click to see batch detail and jobs + +--- + +## 9. 
Project Structure + +### 9.1 Backend Structure + +``` +reviewiq/ # Root (renamed from google-reviews-scraper-pro) +│ +├── api/ +│ ├── __init__.py +│ ├── server.py # FastAPI app, startup, middleware +│ ├── routes/ +│ │ ├── __init__.py +│ │ ├── scrape.py # /api/scrape/* endpoints +│ │ ├── jobs.py # /api/jobs/* endpoints +│ │ ├── batches.py # /api/batches/* endpoints +│ │ ├── dashboard.py # /api/dashboard/* endpoints +│ │ └── admin.py # /api/admin/* endpoints +│ └── middleware/ +│ ├── __init__.py +│ └── auth.py # API key authentication +│ +├── scrapers/ +│ ├── __init__.py +│ ├── registry.py # ScraperRegistry - version routing +│ ├── base.py # BaseScraper interface +│ │ +│ ├── google_reviews/ +│ │ ├── __init__.py +│ │ ├── v1_0_0.py # Current stable (migrated from scraper_clean.py) +│ │ └── parsers.py # Review parsing logic +│ │ +│ └── yelp_reviews/ # Future +│ ├── __init__.py +│ └── v1_0_0.py +│ +├── core/ +│ ├── __init__.py +│ ├── database.py # Database manager +│ ├── models.py # Pydantic models (Job, Batch, etc.) +│ ├── enums.py # JobStatus, JobType, Priority, etc. 
+│ └── config.py # Settings, environment variables +│ +├── services/ +│ ├── __init__.py +│ ├── job_service.py # Job creation, management +│ ├── batch_service.py # Batch operations +│ ├── webhook_service.py # Callback delivery +│ └── dashboard_service.py # Aggregate queries +│ +├── workers/ +│ ├── __init__.py +│ ├── chrome_pool.py # Browser pool management +│ ├── job_executor.py # Job execution orchestration +│ └── webhook_worker.py # Async webhook delivery +│ +├── utils/ +│ ├── __init__.py +│ ├── logger.py # StructuredLogger +│ ├── crash_analyzer.py # Crash detection +│ └── health_checks.py # System health +│ +├── tests/ +│ ├── __init__.py +│ ├── conftest.py # Pytest fixtures +│ ├── api/ # API route tests +│ ├── scrapers/ # Scraper tests (mirrors scrapers/) +│ │ └── google_reviews/ +│ │ └── test_v1_0_0.py +│ ├── services/ # Service tests +│ └── integration/ # End-to-end tests +│ +├── migrations/ # Database migrations +│ └── versions/ +│ +├── web/ # Next.js frontend (existing) +│ └── ... +│ +├── docker-compose.yml +├── Dockerfile +├── pyproject.toml # Python dependencies +└── README.md +``` + +### 9.2 Key Conventions + +**Naming:** +- Scraper versions use underscores: `v1_0_0.py` (valid Python module names) +- Version strings use dots: `"1.0.0"` (semantic versioning in data) + +**Imports:** +```python +from scrapers.google_reviews.v1_0_0 import GoogleReviewsScraper +from scrapers.registry import ScraperRegistry +from core.models import Job, Batch +from services.job_service import JobService +``` + +**Scraper Interface:** +Each scraper version implements: +```python +class GoogleReviewsScraper(BaseScraper): + VERSION = "1.0.0" + JOB_TYPE = "google_reviews" + + async def scrape(self, url: str, options: dict) -> ScraperResult: + ... + + def validate_url(self, url: str) -> bool: + ... 
+``` + +### 9.3 Frontend Structure (existing, minor additions) + +``` +web/ +├── app/ +│ ├── dashboard/ # New main dashboard +│ │ ├── page.tsx # Overview +│ │ ├── clients/page.tsx +│ │ ├── scrapers/page.tsx +│ │ └── problems/page.tsx +│ ├── batches/ # New +│ │ ├── page.tsx +│ │ └── [id]/page.tsx +│ ├── jobs/ # Enhanced +│ └── analytics/ # Existing +├── components/ +│ ├── dashboard/ # Dashboard-specific components +│ └── ... +└── ... +``` + +--- + +## 10. Backwards Compatibility + +### 10.1 Existing API +`POST /api/scrape` continues to work as-is: +- Defaults to `job_type: google_reviews` +- No requester required (legacy mode) +- No callback required +- Routes to the same scraper logic + +### 10.2 Existing Database +- All new fields have defaults +- Existing jobs have null requester fields +- `job_type` defaults to `google_reviews` +- Migration adds columns without breaking existing data + +### 10.3 Scraper Migration +- Current scraper code moves to `scrapers/google_reviews/v1_0_0.py` +- Registered in scraper_registry as `stable` with 100% traffic +- Old file `scraper_clean.py` deleted after migration +- All imports updated to new paths + +--- + +## 11. Additional Considerations + +### 11.1 Authentication +- External API clients authenticate via API keys +- API keys stored in `api_keys` table with `client_id` reference +- Keys can be scoped (read-only, submit jobs, admin) +- Rate limits can be per-key + +### 11.2 Error Handling +- All API errors return consistent JSON structure: + ```json + { + "error": { + "code": "VALIDATION_ERROR", + "message": "URL is required", + "details": { ... 
} + } + } + ``` +- Scraper errors captured with crash analysis +- Failed webhooks retry with exponential backoff (max 5 attempts) + +### 11.3 Logging +- All components use StructuredLogger +- Log levels: DEBUG, INFO, WARN, ERROR, FATAL +- Categories: api, scraper, webhook, system +- Logs include correlation IDs for tracing + +### 11.4 Configuration +- Environment-based configuration via `core/config.py` +- Sensitive values from environment variables +- Per-scraper config in scraper_registry.config JSON + +### 11.5 Monitoring +- Health check endpoint: `GET /health` +- Prometheus metrics endpoint: `GET /metrics` (future) +- Dashboard provides operational visibility + +### 11.6 Data Retention +- Define retention policy for completed jobs +- Archive or delete old job data after N days +- Keep aggregate stats for historical reporting + +--- + +## 12. Implementation Phases + +### Phase 0: Project Restructure +- Reorganize files to new structure +- Move `scraper_clean.py` → `scrapers/google_reviews/v1_0_0.py` +- Update all imports +- Verify everything still works + +### Phase 1: Data Model +- Add new fields to jobs table +- Create batches table +- Create scraper_registry table +- Create api_keys table +- Migration preserves existing data + +### Phase 2: Requester & Batch Support +- Update API to accept requester info +- Implement batch submission endpoint +- Store and display requester/batch info + +### Phase 3: Webhooks +- Implement callback delivery service +- Retry logic for failed callbacks +- Track delivery status + +### Phase 4: Scraper Versioning +- Implement scraper registry +- Version routing logic +- Admin endpoints for management + +### Phase 5: Main Dashboard +- Build dashboard pages +- Aggregate queries +- Real-time updates + +### Phase 6: Traffic Management & A/B +- A/B test traffic splitting +- Promote/deprecate workflow +- Performance comparison views + +### Phase 7: Authentication +- API key management +- Client authentication middleware +- Rate limiting 
(optional) + +--- + +## 13. Success Metrics + +- API response time < 200ms for job submission +- Webhook delivery within 5 seconds of job completion +- Dashboard loads in < 2 seconds +- Support 100+ concurrent scraping jobs +- 99% webhook delivery success rate +- Clear visibility into scraper version performance + +--- + +## 14. Open Questions + +1. ~~**Authentication**: How do external clients authenticate? API keys per client?~~ → Resolved: API keys +2. **Rate Limits**: Per-client rate limiting? (deferred to Phase 7) +3. **Retention**: How long to keep completed job data? (needs decision) +4. **Billing**: Track usage for billing purposes? (future consideration) +5. **Project Rename**: Rename folder from `google-reviews-scraper-pro` to `reviewiq`? + +--- + +## 15. Glossary + +| Term | Definition | +|------|------------| +| Job | A single scraping task for one URL | +| Batch | A collection of related jobs submitted together | +| Job Type | Category of scraping (google_reviews, yelp_reviews, etc.) 
| +| Requester | External client/system that requests jobs | +| Scraper Version | Specific implementation of a scraper (v1.0.0, v2.0.0) | +| Variant | Stability tier: stable, beta, canary | +| Callback/Webhook | HTTP POST to notify client of job completion | + +--- + +*Document Version: 1.2* +*Last Updated: 2025-01-24* diff --git a/web/app/jobs/[id]/page.tsx b/web/app/jobs/[id]/page.tsx index 5bffe09..a7ede75 100644 --- a/web/app/jobs/[id]/page.tsx +++ b/web/app/jobs/[id]/page.tsx @@ -47,58 +47,85 @@ function extractBusinessName(job: JobStatus): string { } } +// Valid categories for structured logs +const VALID_CATEGORIES: StructuredLog['category'][] = ['scraper', 'browser', 'network', 'system']; + +// Valid log levels +const VALID_LEVELS: StructuredLog['level'][] = ['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL']; + /** - * Check if a log entry is in the old format (has 'source' property) - * or new structured format (has 'category' property) + * Map source/category strings to valid category values */ -function isOldLogFormat(log: OldLogEntry | StructuredLog): log is OldLogEntry { - return 'source' in log && !('category' in log); +function mapToCategory(source: string | undefined | null): StructuredLog['category'] { + if (!source) return 'scraper'; + const lower = source.toLowerCase(); + if (lower === 'browser' || lower === 'navigation' || lower === 'page') return 'browser'; + if (lower === 'network' || lower === 'api') return 'network'; + if (lower === 'system' || lower === 'memory' || lower === 'chrome') return 'system'; + if (lower === 'scraper') return 'scraper'; + return 'scraper'; // Default to scraper for unknown sources } /** - * Convert old log format to new StructuredLog format + * Map level strings to valid level values */ -function convertOldToStructured(oldLog: OldLogEntry): StructuredLog { - // Map old source to new category - const categoryMap: Record = { - browser: 'browser', - scraper: 'scraper', - network: 'network', - system: 'system', - }; +function 
mapToLevel(level: string | undefined | null): StructuredLog['level'] { + if (!level) return 'INFO'; + const upper = level.toUpperCase(); + if (upper === 'WARNING') return 'WARN'; + if (VALID_LEVELS.includes(upper as StructuredLog['level'])) { + return upper as StructuredLog['level']; + } + return 'INFO'; +} - // Map old level to new level - const levelMap: Record = { - DEBUG: 'DEBUG', - INFO: 'INFO', - WARNING: 'WARN', - WARN: 'WARN', - ERROR: 'ERROR', - FATAL: 'FATAL', - }; +/** + * Normalize any log entry to StructuredLog format + * Handles: new format, old format with 'source', logs without category, edge cases + */ +function normalizeLog(log: Record): StructuredLog { + // Get timestamp + const timestamp = (log.timestamp as string) || new Date().toISOString(); + const timestampMs = (log.timestamp_ms as number) || new Date(timestamp).getTime() || Date.now(); - const timestamp = oldLog.timestamp; - const timestampMs = new Date(timestamp).getTime(); + // Get message + const message = (log.message as string) || ''; + + // Determine category: prefer 'category' field, fall back to 'source' field + let category: StructuredLog['category']; + if (log.category && VALID_CATEGORIES.includes(log.category as StructuredLog['category'])) { + category = log.category as StructuredLog['category']; + } else { + category = mapToCategory((log.category as string) || (log.source as string)); + } + + // Determine level + const level = mapToLevel(log.level as string); return { timestamp, - timestamp_ms: timestampMs || Date.now(), - level: levelMap[oldLog.level?.toUpperCase()] || 'INFO', - category: categoryMap[oldLog.source] || 'system', - message: oldLog.message, + timestamp_ms: timestampMs, + level, + category, + message, + metrics: log.metrics as Record | undefined, + network: log.network as Record | undefined, }; } /** - * Convert array of logs to structured format if needed + * Convert array of logs to structured format + * Robust handling of various log formats (old, new, 
malformed) */ -function normalizeLogsTOStructured(logs: (OldLogEntry | StructuredLog)[]): StructuredLog[] { - return logs.map((log) => { - if (isOldLogFormat(log)) { - return convertOldToStructured(log); - } - return log as StructuredLog; - }); +function normalizeLogsTOStructured(logs: unknown[]): StructuredLog[] { + if (!Array.isArray(logs)) return []; + + return logs + .filter((log): log is Record => { + // Filter out non-objects and nulls + return log != null && typeof log === 'object' && !Array.isArray(log); + }) + .map(normalizeLog); } export default function JobDetailPage() { @@ -190,17 +217,7 @@ export default function JobDetailPage() { const data = JSON.parse(event.data); // Handle {"type": "log", "data": {...}} format const logData = data.data || data; - - const newLog: StructuredLog = { - timestamp: logData.timestamp || new Date().toISOString(), - timestamp_ms: logData.timestamp_ms || Date.now(), - level: logData.level || 'INFO', - category: logData.category || 'system', - message: logData.message || '', - metrics: logData.metrics, - network: logData.network, - }; - + const newLog = normalizeLog(logData); setStructuredLogs((prev) => [...prev, newLog]); } catch (err) { console.error('Failed to parse log event:', err); @@ -347,15 +364,7 @@ export default function JobDetailPage() { // Check for type field to route to correct handler if (data.type === 'log') { const logData = data.data || data; - const newLog: StructuredLog = { - timestamp: logData.timestamp || new Date().toISOString(), - timestamp_ms: logData.timestamp_ms || Date.now(), - level: logData.level || 'INFO', - category: logData.category || 'system', - message: logData.message || '', - metrics: logData.metrics, - network: logData.network, - }; + const newLog = normalizeLog(logData); setStructuredLogs((prev) => [...prev, newLog]); } else if (data.type === 'metrics') { const metricsPayload = data.data || data; diff --git a/web/components/JobDevTools/index.tsx b/web/components/JobDevTools/index.tsx 
index c4854cf..d03d2b6 100644 --- a/web/components/JobDevTools/index.tsx +++ b/web/components/JobDevTools/index.tsx @@ -60,19 +60,19 @@ const TAB_CONFIG: { id: TabType; label: string; icon: typeof Bug; category?: Str ]; const LEVEL_COLORS: Record = { - DEBUG: { bg: 'bg-gray-700', text: 'text-gray-300', border: 'border-gray-600' }, - INFO: { bg: 'bg-blue-900', text: 'text-blue-300', border: 'border-blue-700' }, - WARN: { bg: 'bg-yellow-900', text: 'text-yellow-300', border: 'border-yellow-700' }, - ERROR: { bg: 'bg-red-900', text: 'text-red-300', border: 'border-red-700' }, - FATAL: { bg: 'bg-purple-900', text: 'text-purple-300', border: 'border-purple-700' }, + DEBUG: { bg: 'bg-gray-900', text: 'text-gray-200', border: 'border-gray-700' }, + INFO: { bg: 'bg-gray-900', text: 'text-gray-100', border: 'border-gray-700' }, + WARN: { bg: 'bg-gray-900', text: 'text-amber-200', border: 'border-gray-700' }, + ERROR: { bg: 'bg-gray-900', text: 'text-red-200', border: 'border-gray-700' }, + FATAL: { bg: 'bg-gray-900', text: 'text-fuchsia-200', border: 'border-gray-700' }, }; const LEVEL_BADGE_COLORS: Record = { - DEBUG: 'bg-gray-600 text-gray-200', - INFO: 'bg-blue-600 text-blue-100', - WARN: 'bg-yellow-600 text-yellow-100', - ERROR: 'bg-red-600 text-red-100', - FATAL: 'bg-purple-600 text-purple-100', + DEBUG: 'bg-gray-500 text-white', + INFO: 'bg-blue-500 text-white', + WARN: 'bg-amber-500 text-gray-900', + ERROR: 'bg-red-500 text-white', + FATAL: 'bg-fuchsia-500 text-white', }; export default function JobDevTools({ @@ -263,11 +263,11 @@ export default function JobDevTools({ {/* Log entries - scrollable area */}
{filteredLogs.length === 0 ? ( -
+
- +

No logs to display

-

+

{logs.length > 0 ? 'Try adjusting your filters' : 'Logs will appear here during job execution'} @@ -281,23 +281,23 @@ export default function JobDevTools({ return (

{/* Timestamp */} - + {formatTimestamp(log.timestamp)} {/* Level badge */} {log.level} {/* Category badge */} - + {log.category} @@ -309,14 +309,14 @@ export default function JobDevTools({ {/* Additional data (metrics/network) */} {(log.metrics || log.network) && ( -
+
{log.metrics && ( - metrics: {JSON.stringify(log.metrics)} + metrics: {JSON.stringify(log.metrics)} )} {log.network && ( - network: {JSON.stringify(log.network)} + network: {JSON.stringify(log.network)} )}
)} @@ -329,30 +329,30 @@ export default function JobDevTools({ {/* Reserved space for metrics/session panels (footer) */}
-
+
{metrics && ( <> {metrics.duration_ms !== undefined && ( - Duration: {(metrics.duration_ms / 1000).toFixed(2)}s + Duration: {(metrics.duration_ms / 1000).toFixed(2)}s )} {metrics.reviews_scraped !== undefined && ( - Reviews: {metrics.reviews_scraped} + Reviews: {metrics.reviews_scraped} )} {metrics.memory_mb !== undefined && ( - Memory: {metrics.memory_mb.toFixed(1)}MB + Memory: {metrics.memory_mb.toFixed(1)}MB )} )}
{sessionFingerprint && ( - + Session: {sessionFingerprint.session_id?.slice(0, 8)}... )} {crashReport && ( - + Crash: {crashReport.error_type} )}