# Production Health Check Strategy
## Verify Actual Scraping Works

---

## 🎯 Problem with Basic Health Checks

### What Basic Health Checks Test:

```python
@app.get("/health")
async def health():
    db_ok = await ping_database()       # ✅ DB responds
    redis_ok = await ping_redis()       # ✅ Redis responds
    disk_ok = check_disk_space() < 90   # ✅ Disk not full

    return {"status": "healthy"}
```

### What They DON'T Test:

- ❌ Can we actually scrape Google Maps?
- ❌ Is Chrome working?
- ❌ Are CSS selectors still valid?
- ❌ Is GDPR handling working?
- ❌ Did Google change their page structure?
- ❌ Is our proxy/network working?

### Real-World Failure Example:

```
✅ Database: healthy
✅ Redis: healthy
✅ Disk: 45% used
❌ Actual scraping: BROKEN (Google changed selectors)

→ Health check says "healthy" but all jobs fail!
```

---

## ✅ Solution: Synthetic Monitoring

### Concept: Canary Tests

Run an **actual scraping job** periodically on a known test URL:

```python
TEST_URL = "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/..."
# A stable business that always has reviews

# Every 4-6 hours:
#   1. Run actual scrape on test URL
#   2. Verify we get reviews
#   3. Verify data structure is correct
#   4. Verify scrape time is reasonable
#   5. Alert if anything fails
```

---

## 🏗️ Implementation

### 1. Canary Scraping Endpoint

```python
import asyncio
from datetime import datetime, timedelta

# Use a known stable business
TEST_URL = "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/"

CANARY_MIN_INTERVAL = timedelta(hours=1)  # never scrape more often than this
CANARY_TIMEOUT_SECONDS = 60               # hard limit for one canary scrape
CANARY_MAX_SCRAPE_SECONDS = 30            # slower than this counts as a failure
REQUIRED_REVIEW_FIELDS = ('author', 'rating', 'date_text')

# Store last canary result
canary_state = {
    "last_run": None,
    "last_success": None,
    "last_result": None,
    "consecutive_failures": 0
}


def _canary_failure_response(status, stored_error, reported_error):
    """Record a canary run that raised or timed out and build the 503 reply.

    `stored_error` is kept in `canary_state["last_result"]`;
    `reported_error` is the message sent back to the caller.
    """
    canary_state["consecutive_failures"] += 1
    canary_state["last_result"] = {
        "status": status,
        "error": stored_error
    }
    return JSONResponse(
        status_code=503,
        content={
            "status": status,
            "error": reported_error,
            "consecutive_failures": canary_state["consecutive_failures"]
        }
    )


@app.get("/health/canary")
async def canary_health_check():
    """
    Run a real scraping test to verify the scraper works.

    This is the MOST IMPORTANT health check - it verifies:
    - Chrome can start
    - Google Maps is accessible
    - Selectors still work
    - GDPR handling works
    - We can extract reviews

    Returns HTTP 200 only when the (fresh or cached) canary result is a
    pass; every failure, timeout or error is reported as HTTP 503.
    """
    # Don't run too frequently (rate limit to avoid Google detection)
    if canary_state["last_run"]:
        elapsed = datetime.now() - canary_state["last_run"]
        if elapsed < CANARY_MIN_INTERVAL:
            # Return cached result. A cached failure must still look like a
            # failure to monitors, so the status code follows the cached
            # outcome instead of always being 200. `last_result` can still be
            # None while the very first run is in progress.
            last_result = canary_state["last_result"]
            cached_ok = bool(last_result) and last_result.get("status") == "pass"
            return JSONResponse(
                status_code=200 if cached_ok else 503,
                content={
                    "status": "cached",
                    "last_run": canary_state["last_run"].isoformat(),
                    "last_result": last_result,
                    "cached_for": f"{elapsed.total_seconds():.0f}s"
                }
            )

    # Run canary test
    canary_state["last_run"] = datetime.now()

    try:
        # Run actual scrape with timeout
        result = await asyncio.wait_for(
            fast_scrape_reviews(
                url=TEST_URL,
                headless=True,
                max_scrolls=10  # Limited for canary
            ),
            timeout=CANARY_TIMEOUT_SECONDS  # Fail if takes > 60s
        )

        # A failed scrape may not carry count/time at all; read them
        # defensively so the failure is reported as a failed check and not
        # as an unrelated KeyError.
        reviews_count = result.get('count', 0)
        scrape_time = result.get('time')

        # Validate result
        checks = {
            "scrape_succeeded": bool(result.get('success')),
            "got_reviews": reviews_count > 0,
            "reasonable_count": 10 <= reviews_count <= 500,
            "reasonable_time": (
                scrape_time is not None
                and scrape_time < CANARY_MAX_SCRAPE_SECONDS
            ),
            "data_structure_valid": validate_review_structure(result.get('reviews')),
        }

        all_passed = all(checks.values())

        if all_passed:
            canary_state["consecutive_failures"] = 0
            canary_state["last_success"] = datetime.now()
            canary_state["last_result"] = {
                "status": "pass",
                "reviews_count": reviews_count,
                "scrape_time": scrape_time,
                "checks": checks
            }
            status_code = 200
        else:
            canary_state["consecutive_failures"] += 1
            canary_state["last_result"] = {
                "status": "fail",
                "reviews_count": reviews_count,
                "scrape_time": scrape_time,
                "checks": checks,
                "consecutive_failures": canary_state["consecutive_failures"]
            }
            status_code = 503  # Service Unavailable

        last_success = canary_state["last_success"]
        return JSONResponse(
            status_code=status_code,
            content={
                "status": "pass" if all_passed else "fail",
                "last_run": canary_state["last_run"].isoformat(),
                "last_success": last_success.isoformat() if last_success else None,
                "result": canary_state["last_result"],
                "details": {
                    "test_url": TEST_URL,
                    "reviews_found": reviews_count,
                    "scrape_time_seconds": scrape_time,
                    "checks": checks
                }
            }
        )

    except asyncio.TimeoutError:
        return _canary_failure_response(
            "timeout",
            "Scrape took longer than 60 seconds",
            "Canary scrape timeout (>60s)"
        )

    except Exception as e:
        # Some exceptions have an empty message; fall back to the type name
        message = str(e) or type(e).__name__
        return _canary_failure_response("error", message, message)


def validate_review_structure(reviews):
    """Validate that reviews have expected structure.

    Returns False for a missing or empty review list; otherwise True when
    the first review contains every field in REQUIRED_REVIEW_FIELDS.
    """
    if not reviews:
        return False

    # Check first review has required fields
    first_review = reviews[0]
    return all(field in first_review for field in REQUIRED_REVIEW_FIELDS)
```

---

### 2.
Background Canary Runner

Instead of running on health check endpoint (which gets called frequently), run in background:

```python
import asyncio
from contextlib import asynccontextmanager, suppress
from datetime import datetime, timedelta


class CanaryMonitor:
    """Background task that runs canary tests periodically.

    Attributes:
        interval: time between two canary runs.
        last_run: when the most recent canary started (None before the first).
        last_success: when the most recent canary passed (None until one does).
        consecutive_failures: failed runs since the last pass. Each failed
            run is counted exactly once, inside run_canary().
        running: loop flag; cleared by stop().
    """

    def __init__(self, interval_hours=4):
        self.interval = timedelta(hours=interval_hours)
        self.last_run = None
        self.last_success = None
        self.consecutive_failures = 0
        self.running = False

    async def start(self):
        """Start the background canary monitoring"""
        self.running = True

        while self.running:
            try:
                await self.run_canary()
            except Exception as e:
                # run_canary() has already counted this failure; counting it
                # again here would trigger the alert after 2 failures, not 3.
                log.error(f"Canary test failed: {e!r}")

                # Alert if multiple consecutive failures
                if self.consecutive_failures >= 3:
                    try:
                        await self.send_alert(
                            f"🚨 CRITICAL: Scraper canary failed {self.consecutive_failures} times in a row!"
                        )
                    except Exception as alert_error:
                        # A broken alert channel must not stop the monitoring loop
                        log.error(f"Could not deliver canary alert: {alert_error!r}")

            # Sleep until next run
            await asyncio.sleep(self.interval.total_seconds())

    async def run_canary(self):
        """Run a single canary test.

        Raises:
            Exception: when the scrape raises, times out, or fails validation.
                The failure has been counted and stored before the raise.
        """
        log.info("Running canary scrape test...")

        self.last_run = datetime.now()

        TEST_URL = "https://www.google.com/maps/place/Soho+Factory/@54.6738155,25.2595844,17z/"

        try:
            result = await asyncio.wait_for(
                fast_scrape_reviews(url=TEST_URL, headless=True, max_scrolls=10),
                timeout=60
            )
        except Exception as e:
            # Timeouts and scraper crashes are canary failures too
            await self._record_failure(repr(e))
            raise

        # Validate result. `.get` keeps a failed scrape that carries no
        # count/time from surfacing as a KeyError; ">= 10" matches the
        # "reviews_count < 10" alert rule.
        passed = (
            result.get('success')
            and result.get('count', 0) >= 10
            and result.get('time', float('inf')) < 30
        )

        if not passed:
            log.error(f"❌ Canary test failed: {result}")
            await self._record_failure(result.get('error', 'Unknown error'))
            raise Exception(f"Canary validation failed: {result}")

        log.info(f"✅ Canary test passed: {result['count']} reviews in {result['time']:.1f}s")
        self.consecutive_failures = 0
        self.last_success = datetime.now()

        # Store result in database for tracking
        await self._store_result("""
            INSERT INTO canary_results (timestamp, success, reviews_count, scrape_time)
            VALUES (NOW(), true, %s, %s)
        """, result['count'], result['time'])

    async def _record_failure(self, error_message):
        """Count one failed run and store it for tracking."""
        self.consecutive_failures += 1
        await self._store_result("""
            INSERT INTO canary_results (timestamp, success, error_message)
            VALUES (NOW(), false, %s)
        """, error_message)

    async def _store_result(self, query, *params):
        """Best-effort write of a canary result row.

        History tracking is secondary: a database outage must not turn a
        passing canary into a failure or hide the real scraper error.
        """
        try:
            await db.execute(query, *params)
        except Exception as e:
            log.error(f"Could not store canary result: {e!r}")

    async def send_alert(self, message):
        """Send alert via Slack/email/PagerDuty when canary fails"""
        # Slack webhook. httpx.post() is synchronous and cannot be awaited;
        # async code has to go through AsyncClient.
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    SLACK_WEBHOOK_URL,
                    json={"text": message}
                )
                response.raise_for_status()
        except Exception as e:
            # Still try the e-mail channel when Slack is unreachable
            log.error(f"Slack alert failed: {e!r}")

        # Or email
        await send_email(
            to="oncall@example.com",
            subject="Scraper Canary Failure",
            body=message
        )

    def stop(self):
        """Stop the background monitoring"""
        self.running = False


# In api_server.py startup
canary_monitor = CanaryMonitor(interval_hours=4)

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup. Keep a reference to the task: the event loop only holds a
    # weak reference, so an unreferenced task can be garbage-collected.
    canary_task = asyncio.create_task(canary_monitor.start())
    yield
    # Shutdown. stop() alone would leave the task sleeping until the next
    # interval, so cancel it and wait for the cancellation to finish.
    canary_monitor.stop()
    canary_task.cancel()
    with suppress(asyncio.CancelledError):
        await canary_task
```

---

### 3. Canary Health Check Endpoint (Fast)

```python
@app.get("/health/canary")
async def get_canary_status():
    """
    Return the LATEST canary test result (doesn't run a new test).
    Use this for health checks from load balancers / monitoring systems.
    """
    if canary_monitor.last_run is None:
        return JSONResponse(
            status_code=503,
            content={
                "status": "unknown",
                "message": "No canary tests run yet"
            }
        )

    if not canary_monitor.last_success:
        # Tests have run but none has passed - that is a failure, not "unknown"
        return JSONResponse(
            status_code=503,
            content={
                "status": "failing",
                "message": "No canary test has succeeded yet",
                "consecutive_failures": canary_monitor.consecutive_failures
            }
        )

    # Check if last success was recent enough
    age = datetime.now() - canary_monitor.last_success
    max_age = timedelta(hours=6)

    if age > max_age:
        return JSONResponse(
            status_code=503,
            content={
                "status": "stale",
                "last_success": canary_monitor.last_success.isoformat(),
                "age_hours": age.total_seconds() / 3600,
                "message": f"Last successful canary was {age.total_seconds()/3600:.1f} hours ago"
            }
        )

    # Same threshold as the alerting rule: 3 failures in a row means broken,
    # even when the last success is still inside the freshness window.
    if canary_monitor.consecutive_failures >= 3:
        return JSONResponse(
            status_code=503,
            content={
                "status": "failing",
                "last_success": canary_monitor.last_success.isoformat(),
                "consecutive_failures": canary_monitor.consecutive_failures
            }
        )

    # Recent success - all good!
    return {
        "status": "healthy",
        "last_success": canary_monitor.last_success.isoformat(),
        "age_minutes": age.total_seconds() / 60,
        "consecutive_failures": canary_monitor.consecutive_failures
    }
```

---

## 📊 Complete Health Check Hierarchy

### 1. **Liveness** (Is the app alive?)

```python
@app.get("/health/live")
async def liveness():
    # Simple: can the server respond?
    return {"status": "alive"}
```

**Use**: Kubernetes liveness probe (restart if fails)

---

### 2. **Readiness** (Can the app handle traffic?)

```python
@app.get("/health/ready")
async def readiness():
    # Check dependencies
    db_ok = await ping_database()
    redis_ok = await ping_redis()

    if not (db_ok and redis_ok):
        raise HTTPException(status_code=503, detail="Not ready")
    return {"status": "ready"}
```

**Use**: Kubernetes readiness probe (remove from load balancer if fails)

---

### 3. **Canary** (Does scraping actually work?)

```python
@app.get("/health/canary")
async def canary():
    # Return last canary test result
    last_success = canary_monitor.last_success
    if last_success and datetime.now() - last_success < timedelta(hours=6):
        return {"status": "healthy"}
    return JSONResponse(status_code=503, content={"status": "unhealthy"})
```

**Use**: External monitoring (PagerDuty, DataDog) - alerts if fails

---

### 4. **Detailed** (Full system status)

```python
from datetime import datetime, timezone

@app.get("/health/detailed")
async def detailed_health():
    # Example payload: the component values below are illustrative
    return {
        "status": "healthy",
        "components": {
            "api": {"status": "healthy", "latency_ms": 1},
            "database": {"status": "healthy", "latency_ms": 5},
            "redis": {"status": "healthy", "latency_ms": 2},
            "workers": {"status": "healthy", "active": 4},
            "canary": {
                "status": "healthy",
                "last_success": "2026-01-18T10:30:00Z",
                "age_minutes": 45,
                "consecutive_failures": 0
            }
        },
        # datetime.utcnow() is deprecated; use an explicit UTC timestamp
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
```

**Use**: Monitoring dashboards, debugging

---

## 📈 Monitoring Strategy

### Canary Test Schedule

```
Every 4 hours:
- Run full canary test
- Store result in database
- Alert if fails

Benefits:
✅ Detects Google Maps changes within 4 hours
✅ Detects selector breakage quickly
✅ Low overhead (6 tests/day)
✅ Won't trigger Google rate limits
```

### Alert Rules

```python
# Alert on consecutive failures
if consecutive_failures >= 3:
    send_pagerduty_alert("CRITICAL: Scraper broken")

# Alert on slow canary.
# The canary scrape is cancelled at 60s and already fails validation at 30s,
# so a "> 60" rule could never fire.
if scrape_time > 30:
    send_slack_alert("WARNING: Scraper slow")

# Alert on low review count
if reviews_count < 10:
    send_slack_alert("WARNING: Low review count in canary")
```

---

## 🎯 Canary Database Tracking

```sql
CREATE TABLE canary_results (
    id SERIAL PRIMARY KEY,
    timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
    success BOOLEAN NOT NULL,
    reviews_count INTEGER,
    scrape_time REAL,
    error_message TEXT,
    metadata JSONB
);

CREATE INDEX idx_canary_timestamp ON canary_results(timestamp DESC);

-- Query to see canary health over time
SELECT
    DATE_TRUNC('day', timestamp) as day,
    COUNT(*) as total_tests,
    SUM(CASE WHEN success THEN 1 ELSE 0 END) as successful,
    AVG(scrape_time) as avg_scrape_time,
    AVG(reviews_count) as avg_reviews
FROM canary_results
WHERE timestamp > NOW() - INTERVAL '7 days'
GROUP BY day
ORDER BY day DESC;
```

---

## ✅ Complete Health Check Implementation

```python
# health_checks.py
import asyncio
from datetime import datetime, timedelta, timezone
from typing import Any, Dict


class HealthCheckSystem:
    """Complete health check system for production"""

    def __init__(self):
        self.canary = CanaryMonitor(interval_hours=4)
        # Reference to the background task; an unreferenced task can be
        # garbage-collected before it finishes.
        self._canary_task = None

    async def start(self):
        """Start background health monitoring"""
        self._canary_task = asyncio.create_task(self.canary.start())

    @property
    def is_healthy(self) -> bool:
        """Overall system health.

        True only when the canary has passed within the last 6 hours and
        has fewer than 3 consecutive failures. Always a real bool: False
        (not None) while no canary has succeeded yet.
        """
        last_success = self.canary.last_success
        if last_success is None:
            return False
        return (
            self.canary.consecutive_failures < 3
            and (datetime.now() - last_success) < timedelta(hours=6)
        )

    async def get_status(self) -> Dict[str, Any]:
        """Get complete health status"""
        # check_database() / check_redis() are not shown in this document;
        # the code below treats a None latency as "component is down".
        db_latency = await self.check_database()
        redis_latency = await self.check_redis()

        # A failed dependency degrades the overall status too, not only
        # its own component entry.
        dependencies_ok = db_latency is not None and redis_latency is not None
        last_success = self.canary.last_success

        return {
            "status": "healthy" if self.is_healthy and dependencies_ok else "degraded",
            "components": {
                "database": {
                    "healthy": db_latency is not None,
                    "latency_ms": db_latency
                },
                "redis": {
                    "healthy": redis_latency is not None,
                    "latency_ms": redis_latency
                },
                "canary_scraper": {
                    "healthy": self.canary.consecutive_failures == 0,
                    "last_success": last_success.isoformat() if last_success else None,
                    "consecutive_failures": self.canary.consecutive_failures
                }
            },
            # datetime.utcnow() is deprecated; use an explicit UTC timestamp
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
```

---

## 🚀 Production Recommendations

1. ✅ **Run canary every 4-6 hours** (balanced between freshness and overhead)
2.
✅ **Alert after 3 consecutive failures** (avoid false positives)
3. ✅ **Store canary results in database** (historical tracking)
4. ✅ **Use different health checks for different purposes**:
   - `/health/live` → Kubernetes liveness (restart if fails)
   - `/health/ready` → Kubernetes readiness (route traffic)
   - `/health/canary` → External monitoring (PagerDuty alerts)
5. ✅ **Monitor canary metrics**: scrape time, review count, success rate

---

**The canary test is your MOST IMPORTANT health check** - it's the only one that verifies your core business logic actually works!