实现全面的健康检查端点,用于监控服务健康状况、依赖项状态以及接收流量的就绪情况。
健康检查端点
概述
实现健康检查端点,以监控服务健康状况、依赖项状态以及服务是否已准备好接收流量。
何时使用
- Kubernetes 活跃度和就绪度探针
- 负载均衡器健康检查
- 服务发现和注册
- 监控和报警系统
- 断路器决策
- 自动扩展触发器
- 部署验证
健康检查类型
| 类型 | 目的 | 失败操作 |
|---|---|---|
| 活跃度 | 进程正在运行 | 重启容器 |
| 就绪度 | 准备接收流量 | 从负载均衡器中移除 |
| 启动 | 应用程序已启动 | 延迟其他探针 |
| 深度 | 依赖项健康 | 报警/断路 |
实施示例
1. Express.js 健康检查
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';

import express from 'express';
import Redis from 'ioredis';
import { Pool } from 'pg';
/**
 * Aggregate health report returned by the readiness and deep checks.
 */
interface HealthStatus {
  /** Overall verdict derived from the individual check results. */
  status:
    | 'healthy'
    | 'degraded'
    | 'unhealthy';

  /** ISO-8601 timestamp taken when the report was assembled. */
  timestamp: string;

  /** Milliseconds elapsed since the health-check service was constructed. */
  uptime: number;

  /** Per-dependency results, keyed by check name (e.g. `database`, `redis`). */
  checks: Record<string, CheckResult>;

  /** Application version, when known. */
  version?: string;

  /** Deployment environment name, when known. */
  environment?: string;
}
/**
 * Outcome of a single dependency or resource check.
 */
interface CheckResult {
  /** `pass` = healthy, `warn` = degraded but usable, `fail` = unusable. */
  status:
    | 'pass'
    | 'fail'
    | 'warn';

  /** How long the check took, in milliseconds. */
  time: number;

  /** Human-readable detail describing the observed state. */
  output?: string;

  /** Error message captured when the check threw. */
  error?: string;
}
/**
 * Runs liveness, readiness and deep health checks for the service.
 *
 * Every individual `check*` method catches its own errors and reports them
 * inside the returned `CheckResult`, so the aggregate methods never reject.
 */
class HealthCheckService {
  /** Promise-returning `execFile`, so `df` runs without blocking the event loop. */
  private static readonly execFileAsync = promisify(execFile);

  /** Upper bound for the `df` child process, in milliseconds. */
  private static readonly DISK_CHECK_TIMEOUT_MS = 2000;

  private startTime = Date.now();
  private version = process.env.APP_VERSION || '1.0.0';
  private environment = process.env.NODE_ENV || 'development';

  /**
   * @param db - PostgreSQL pool probed with `SELECT 1`.
   * @param redis - Redis client probed with `PING`.
   */
  constructor(
    private db: Pool,
    private redis: Redis
  ) {}

  /**
   * Liveness check: answers without touching any dependency.
   *
   * @returns always `{ status: 'alive' }`.
   */
  async liveness(): Promise<{ status: string }> {
    return { status: 'alive' };
  }

  /**
   * Readiness check covering the critical dependencies (database and Redis).
   *
   * @returns the aggregate report with `database` and `redis` entries.
   */
  async readiness(): Promise<HealthStatus> {
    const [database, redis] = await Promise.all([
      this.checkDatabase(),
      this.checkRedis()
    ]);

    return this.buildReport({ database, redis });
  }

  /**
   * Deep check covering every dependency and local resource.
   *
   * @returns the aggregate report with `database`, `redis`, `external_api`,
   *   `disk_space` and `memory` entries.
   */
  async deep(): Promise<HealthStatus> {
    const [database, redis, externalApi, diskSpace, memory] = await Promise.all([
      this.checkDatabase(),
      this.checkRedis(),
      this.checkExternalAPI(),
      this.checkDiskSpace(),
      this.checkMemory()
    ]);

    return this.buildReport({
      database,
      redis,
      external_api: externalApi,
      disk_space: diskSpace,
      memory
    });
  }

  /** Wraps individual check results into the report shape shared by `readiness` and `deep`. */
  private buildReport(checks: Record<string, CheckResult>): HealthStatus {
    return {
      status: this.determineStatus(checks),
      timestamp: new Date().toISOString(),
      uptime: Date.now() - this.startTime,
      checks,
      version: this.version,
      environment: this.environment
    };
  }

  /** Extracts a message from anything that can be thrown, not only `Error` instances. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Probes the database with `SELECT 1`; responses slower than 1000 ms are reported as `warn`. */
  private async checkDatabase(): Promise<CheckResult> {
    const startTime = Date.now();
    try {
      await this.db.query('SELECT 1');
      const time = Date.now() - startTime;

      if (time > 1000) {
        return {
          status: 'warn',
          time,
          output: '数据库响应慢'
        };
      }

      return {
        status: 'pass',
        time,
        output: '数据库连接健康'
      };
    } catch (error: unknown) {
      return {
        status: 'fail',
        time: Date.now() - startTime,
        error: HealthCheckService.describeError(error)
      };
    }
  }

  /** Probes Redis with `PING`. */
  private async checkRedis(): Promise<CheckResult> {
    const startTime = Date.now();
    try {
      await this.redis.ping();
      return {
        status: 'pass',
        time: Date.now() - startTime,
        output: 'Redis连接健康'
      };
    } catch (error: unknown) {
      return {
        status: 'fail',
        time: Date.now() - startTime,
        error: HealthCheckService.describeError(error)
      };
    }
  }

  /**
   * Probes the external API's health endpoint with a 5 s timeout.
   *
   * A non-2xx answer or a network error yields `warn`, never `fail`.
   */
  private async checkExternalAPI(): Promise<CheckResult> {
    const startTime = Date.now();
    try {
      const response = await fetch('https://api.example.com/health', {
        signal: AbortSignal.timeout(5000)
      });
      const time = Date.now() - startTime;

      if (!response.ok) {
        return {
          status: 'warn',
          time,
          output: `API返回${response.status}`
        };
      }

      return {
        status: 'pass',
        time,
        output: '外部API健康'
      };
    } catch (error: unknown) {
      return {
        status: 'warn',
        time: Date.now() - startTime,
        error: HealthCheckService.describeError(error)
      };
    }
  }

  /**
   * Reads the usage percentage from the fifth column of the first data line of `df` output.
   *
   * @returns the integer percentage, or `undefined` when the output has no parsable value.
   */
  private static parseDiskUsagePercent(dfOutput: string): number | undefined {
    const dataLine = dfOutput.split('\n')[1];
    const capacityColumn = dataLine?.trim().split(/\s+/)[4];
    if (capacityColumn === undefined) {
      return undefined;
    }
    const percent = Number.parseInt(capacityColumn, 10);
    return Number.isNaN(percent) ? undefined : percent;
  }

  /**
   * Reports usage of the root filesystem: above 90% is `fail`, above 80% is `warn`.
   *
   * `df` runs asynchronously with a timeout so a slow or hung filesystem cannot
   * stall the whole server. `-P` selects the POSIX output format, which keeps
   * each filesystem on a single line.
   */
  private async checkDiskSpace(): Promise<CheckResult> {
    const startTime = Date.now();
    try {
      const { stdout } = await HealthCheckService.execFileAsync('df', ['-P', '/'], {
        timeout: HealthCheckService.DISK_CHECK_TIMEOUT_MS
      });
      const usagePercent = HealthCheckService.parseDiskUsagePercent(stdout);
      if (usagePercent === undefined) {
        // Unparsable output must not be reported as a passing check.
        throw new Error('Unexpected df output');
      }

      let status: CheckResult['status'] = 'pass';
      if (usagePercent > 90) {
        status = 'fail';
      } else if (usagePercent > 80) {
        status = 'warn';
      }

      return {
        status,
        time: Date.now() - startTime,
        output: `磁盘使用率${usagePercent}%`
      };
    } catch (error: unknown) {
      return {
        status: 'warn',
        time: Date.now() - startTime,
        error: HealthCheckService.describeError(error)
      };
    }
  }

  /** Reports V8 heap usage (`heapUsed / heapTotal`); above 90% is `warn`. */
  private async checkMemory(): Promise<CheckResult> {
    const startTime = Date.now();
    try {
      const { heapUsed, heapTotal } = process.memoryUsage();
      const usagePercent = (heapUsed / heapTotal) * 100;

      return {
        status: usagePercent > 90 ? 'warn' : 'pass',
        time: Date.now() - startTime,
        output: `内存使用率${usagePercent.toFixed(2)}%`
      };
    } catch (error: unknown) {
      return {
        status: 'warn',
        time: Date.now() - startTime,
        error: HealthCheckService.describeError(error)
      };
    }
  }

  /**
   * Collapses individual results into one verdict.
   *
   * @returns `unhealthy` if any check failed, else `degraded` if any warned, else `healthy`.
   */
  private determineStatus(
    checks: Record<string, CheckResult>
  ): 'healthy' | 'degraded' | 'unhealthy' {
    const results = Object.values(checks);

    if (results.some(c => c.status === 'fail')) {
      return 'unhealthy';
    }

    if (results.some(c => c.status === 'warn')) {
      return 'degraded';
    }

    return 'healthy';
  }
}
// Route setup: build the shared clients once and hand them to the health-check service.
const app = express();
const db = new Pool({ connectionString: process.env.DATABASE_URL });
const redis = new Redis(process.env.REDIS_URL);
const healthCheck = new HealthCheckService(db, redis);

// Liveness probe (lightweight): always 200.
app.get('/health/live', async (_req, res) => {
  res.status(200).json(await healthCheck.liveness());
});

// Readiness probe (critical dependencies): 503 only when the report is unhealthy.
app.get('/health/ready', async (_req, res) => {
  const report = await healthCheck.readiness();
  const httpStatus = report.status === 'unhealthy' ? 503 : 200;
  res.status(httpStatus).json(report);
});

// Deep health check (all dependencies): healthy and degraded both map to 200.
app.get('/health', async (_req, res) => {
  const report = await healthCheck.deep();
  const httpStatus = report.status === 'unhealthy' ? 503 : 200;
  res.status(httpStatus).json(report);
});

// Startup probe.
app.get('/health/startup', async (_req, res) => {
  // Placeholder: replace with the application's real start-up condition.
  const isReady = true;
  res
    .status(isReady ? 200 : 503)
    .json({ status: isReady ? 'started' : 'starting' });
});
2. Spring Boot Actuator 风格健康检查(Java)
/**
 * REST controller exposing Actuator-style health endpoints under {@code /actuator}.
 *
 * <p>{@code /health} checks the database and Redis, {@code /health/liveness}
 * answers unconditionally, and {@code /health/readiness} checks the database only.
 */
@RestController
@RequestMapping("/actuator")
public class HealthController {

    @Autowired
    private DataSource dataSource;

    @Autowired
    private RedisTemplate<String, String> redisTemplate;

    /**
     * Full health report for the database and Redis.
     *
     * @return 200 with status {@code UP} when every component is up,
     *         otherwise 503 with status {@code DOWN}
     */
    @GetMapping("/health")
    public ResponseEntity<Map<String, Object>> health() {
        // Typed component map, so reading "status" needs no raw-type cast.
        Map<String, Map<String, Object>> components = new HashMap<>();
        components.put("db", checkDatabase());
        components.put("redis", checkRedis());

        boolean anyDown = components.values().stream()
            .anyMatch(component -> "DOWN".equals(component.get("status")));

        Map<String, Object> health = new HashMap<>();
        health.put("status", anyDown ? "DOWN" : "UP");
        health.put("timestamp", Instant.now().toString());
        health.put("components", components);

        if (anyDown) {
            return ResponseEntity.status(503).body(health);
        }
        return ResponseEntity.ok(health);
    }

    /**
     * Liveness probe: no dependency is consulted.
     *
     * @return always 200 with status {@code UP}
     */
    @GetMapping("/health/liveness")
    public ResponseEntity<Map<String, String>> liveness() {
        Map<String, String> response = new HashMap<>();
        response.put("status", "UP");
        return ResponseEntity.ok(response);
    }

    /**
     * Readiness probe based on the database check alone.
     *
     * @return 200 with status {@code UP} when the database is reachable,
     *         otherwise 503 with status {@code DOWN}
     */
    @GetMapping("/health/readiness")
    public ResponseEntity<Map<String, Object>> readiness() {
        Map<String, Object> readiness = new HashMap<>();

        Map<String, Object> dbCheck = checkDatabase();
        readiness.put("database", dbCheck);

        boolean isReady = "UP".equals(dbCheck.get("status"));
        readiness.put("status", isReady ? "UP" : "DOWN");

        if (isReady) {
            return ResponseEntity.ok(readiness);
        }
        return ResponseEntity.status(503).body(readiness);
    }

    /**
     * Runs {@code SELECT 1} against the data source.
     *
     * @return a map with {@code status} plus either {@code responseTime} or {@code error}
     */
    private Map<String, Object> checkDatabase() {
        Map<String, Object> result = new HashMap<>();
        long startTime = System.currentTimeMillis();

        // try-with-resources closes the connection, statement and result set
        // even when the query throws, so a failing probe cannot leak connections.
        try (Connection conn = dataSource.getConnection();
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT 1")) {
            long duration = System.currentTimeMillis() - startTime;
            result.put("status", "UP");
            result.put("responseTime", duration + "ms");
        } catch (Exception e) {
            result.put("status", "DOWN");
            result.put("error", e.getMessage());
        }

        return result;
    }

    /**
     * Reads the {@code health-check} key from Redis; only reachability matters,
     * the returned value is ignored.
     *
     * @return a map with {@code status} plus either {@code responseTime} or {@code error}
     */
    private Map<String, Object> checkRedis() {
        Map<String, Object> result = new HashMap<>();
        long startTime = System.currentTimeMillis();

        try {
            redisTemplate.opsForValue().get("health-check");
            long duration = System.currentTimeMillis() - startTime;
            result.put("status", "UP");
            result.put("responseTime", duration + "ms");
        } catch (Exception e) {
            result.put("status", "DOWN");
            result.put("error", e.getMessage());
        }

        return result;
    }
}
3. Python Flask 健康检查
from flask import Flask, jsonify
from typing import Dict, Any
import psycopg2
import redis
import time
# Flask application instance; the health routes below are registered on it.
app = Flask(__name__)
class HealthCheck:
    """Liveness and readiness checks for the database and Redis."""

    def __init__(self):
        self.start_time = time.time()
        # Placeholder: initialise your database connection pool here.
        self.db_pool = None
        self.redis_client = redis.Redis(host='localhost', port=6379)

    def liveness(self) -> Dict[str, str]:
        """Simple liveness check; no dependency is consulted."""
        return {"status": "alive"}

    def readiness(self) -> Dict[str, Any]:
        """Readiness check including dependencies.

        Returns:
            A dict with ``status`` (``"ready"`` only when every check passed,
            otherwise ``"not_ready"``), the per-dependency ``checks`` and a
            Unix ``timestamp`` in seconds.
        """
        checks = {
            "database": self.check_database(),
            "redis": self.check_redis()
        }

        all_passed = all(c["status"] == "pass" for c in checks.values())

        return {
            "status": "ready" if all_passed else "not_ready",
            "checks": checks,
            "timestamp": time.time()
        }

    def check_database(self) -> Dict[str, Any]:
        """Check database connectivity by running ``SELECT 1``.

        Returns:
            ``{"status": "pass", "time": "<ms>ms"}`` on success, or
            ``{"status": "fail", "error": "<message>"}`` on any exception.
        """
        from contextlib import closing

        start_time = time.time()
        try:
            # closing() guarantees the connection and cursor are released even
            # when the query raises; connect_timeout keeps an unreachable
            # server from hanging the probe.
            with closing(
                psycopg2.connect("dbname=test user=postgres", connect_timeout=3)
            ) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute("SELECT 1")

            duration = (time.time() - start_time) * 1000
            return {
                "status": "pass",
                "time": f"{duration:.2f}ms"
            }
        except Exception as e:
            return {
                "status": "fail",
                "error": str(e)
            }

    def check_redis(self) -> Dict[str, Any]:
        """Check Redis connectivity with ``PING``.

        Returns:
            ``{"status": "pass", "time": "<ms>ms"}`` on success, or
            ``{"status": "fail", "error": "<message>"}`` on any exception.
        """
        start_time = time.time()
        try:
            self.redis_client.ping()
            duration = (time.time() - start_time) * 1000
            return {
                "status": "pass",
                "time": f"{duration:.2f}ms"
            }
        except Exception as e:
            return {
                "status": "fail",
                "error": str(e)
            }
# Single shared checker used by every route.
health_checker = HealthCheck()


@app.route('/health/live')
def liveness():
    """Liveness probe: always answers 200."""
    return jsonify(health_checker.liveness()), 200


@app.route('/health/ready')
def readiness():
    """Readiness probe: 200 when every dependency check passed, otherwise 503."""
    result = health_checker.readiness()
    status_code = 200 if result["status"] == "ready" else 503
    return jsonify(result), status_code


@app.route('/health')
def health():
    """General health endpoint backed by the readiness checks.

    Returns 503 when a check failed, so monitors that only look at the HTTP
    status do not see a failing service as healthy.
    """
    result = health_checker.readiness()
    status_code = 200 if result["status"] == "ready" else 503
    return jsonify(result), status_code


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
最佳实践
✅ 应该做
- 实施独立的活跃度和就绪度探针
- 保持活跃度探针轻量级
- 在就绪度中检查关键依赖项
- 返回适当的HTTP状态码
- 包含响应时间指标
- 设置合理的超时
- 短暂缓存健康检查结果
- 包含版本和环境信息
- 监控健康检查失败
❌ 不应该做
- 让活跃度探针检查依赖项
- 健康检查失败时仍返回 200
- 让健康检查的响应时间过长
- 跳过重要的依赖项检查
- 暴露敏感信息
- 忽略健康检查失败
Kubernetes 配置
# Deployment fragment showing only the probe wiring for the `app` container.
apiVersion: apps/v1
kind: Deployment
spec:
  template:
    spec:
      containers:
        - name: app
          # Liveness: lightweight endpoint that checks no dependencies.
          livenessProbe:
            httpGet:
              path: /health/live
              port: 3000
            initialDelaySeconds: 15
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 3
          # Readiness: endpoint that checks the critical dependencies.
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          # Startup: 30 failures x 5 s period allows up to 150 s for start-up.
          startupProbe:
            httpGet:
              path: /health/startup
              port: 3000
            initialDelaySeconds: 0
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 30