数据库监控
概览
实施全面的数据库监控,用于性能分析、健康检查和主动告警。涵盖指标收集、分析和故障排除策略。
何时使用
- 性能基线建立
- 实时健康监控
- 容量规划
- 查询性能分析
- 资源利用率跟踪
- 告警规则配置
- 事件响应和故障排除
PostgreSQL监控
连接监控
PostgreSQL - 活动连接:
-- 查看当前连接
SELECT
pid,
usename,
application_name,
client_addr,
state,
query_start,
state_change
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY query_start DESC;
-- 每个数据库的连接数
SELECT
datname,
COUNT(*) as connection_count,
MAX(EXTRACT(EPOCH FROM (NOW() - query_start))) as max_query_duration_sec
FROM pg_stat_activity
GROUP BY datname;
-- 查找空闲事务
SELECT
pid,
usename,
state,
query_start,
xact_start,
EXTRACT(EPOCH FROM (NOW() - xact_start)) as transaction_age_sec
FROM pg_stat_activity
WHERE state = 'idle in transaction'
ORDER BY xact_start;
PostgreSQL - 最大连接配置:
-- 查看当前max_connections
SHOW max_connections;
-- 设置max_connections(需要重启)
-- 在postgresql.conf中:
-- max_connections = 200
-- 监控连接池使用情况
SELECT
sum(numbackends) as total_backends,
(SELECT setting::int FROM pg_settings WHERE name = 'max_connections') as max_connections,
ROUND(100.0 * sum(numbackends) /
(SELECT setting::int FROM pg_settings WHERE name = 'max_connections'), 2) as usage_percent
FROM pg_stat_database;
查询性能监控
PostgreSQL - 查询统计:
-- 启用查询统计(pg_stat_statements扩展)
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- 查看最慢的查询
SELECT
query,
calls,
mean_exec_time,
max_exec_time,
total_exec_time
FROM pg_stat_statements
WHERE query NOT LIKE '%pg_stat%'
ORDER BY mean_exec_time DESC
LIMIT 10;
-- 按总执行时间排序的顶级查询
SELECT
SUBSTRING(query, 1, 50) as query_snippet,
calls,
ROUND(total_exec_time::NUMERIC, 2) as total_time_ms,
ROUND(mean_exec_time::NUMERIC, 2) as avg_time_ms,
ROUND(stddev_exec_time::NUMERIC, 2) as stddev_ms
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 10;
-- 重置统计
SELECT pg_stat_statements_reset();
PostgreSQL - 长时间运行的查询:
-- 查找运行超过1分钟的查询
SELECT
pid,
usename,
application_name,
state,
query,
EXTRACT(EPOCH FROM (NOW() - query_start)) as duration_seconds
FROM pg_stat_activity
WHERE (NOW() - query_start) > INTERVAL '1 minute'
ORDER BY query_start;
-- 取消长时间运行的查询
SELECT pg_cancel_backend(pid);
-- 终止卡住的查询
SELECT pg_terminate_backend(pid);
表和索引监控
PostgreSQL - 表统计:
-- 表大小分析
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as size,
ROUND(100.0 * pg_total_relation_size(schemaname||'.'||tablename) /
(SELECT pg_database_size(current_database()))::NUMERIC, 2) as percent_of_db
FROM pg_tables
WHERE schemaname NOT IN ('pg_catalog', 'information_schema')
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;
-- 表行数和死亡元组
SELECT
schemaname,
tablename,
n_live_tup,
n_dead_tup,
ROUND(100.0 * n_dead_tup / (n_live_tup + n_dead_tup), 2) as dead_percent
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC;
-- 当死亡元组超过阈值时触发VACUUM
-- 死亡元组超过20%的表需要VACUUM
SELECT
schemaname,
tablename,
ROUND(100.0 * n_dead_tup / (n_live_tup + n_dead_tup), 2) as dead_percent
FROM pg_stat_user_tables
WHERE n_dead_tup > n_live_tup * 0.2;
PostgreSQL - 索引监控:
-- 未使用的索引(从未扫描过)
SELECT
schemaname,
tablename,
indexname,
idx_scan,
pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
ORDER BY pg_relation_size(indexrelid) DESC;
-- 索引碎片化
SELECT
schemaname,
tablename,
indexname,
ROUND(100.0 * (pg_relation_size(indexrelid) -
pg_relation_size(indexrelid, 'main')) /
pg_relation_size(indexrelid), 2) as fragmentation_percent
FROM pg_stat_user_indexes
WHERE pg_relation_size(indexrelid) > 1000000
ORDER BY fragmentation_percent DESC;
-- 重建碎片化索引
REINDEX INDEX CONCURRENTLY idx_name;
MySQL监控
性能模式
MySQL - 查询统计:
-- 启用性能模式
-- 在my.cnf中:performance_schema = ON
-- 最慢的查询
SELECT
object_schema,
object_name,
COUNT_STAR,
SUM_TIMER_WAIT / 1000000000000 as total_time_sec,
AVG_TIMER_WAIT / 1000000000 as avg_time_ms
FROM performance_schema.table_io_waits_summary_by_table_io_type
ORDER BY SUM_TIMER_WAIT DESC
LIMIT 10;
-- 查询响应时间插件
SELECT
TIME,
COUNT,
TOTAL,
ERRORS
FROM mysql.query_response_time
ORDER BY TIME DESC;
MySQL - 连接监控:
-- 当前连接
SHOW PROCESSLIST;
-- 增强型processlist
SELECT
ID,
USER,
HOST,
DB,
COMMAND,
TIME,
STATE,
INFO
FROM INFORMATION_SCHEMA.PROCESSLIST
WHERE STATE != 'Sleep'
ORDER BY TIME DESC;
-- 终止长时间运行的查询
KILL QUERY process_id;
KILL CONNECTION process_id;
-- 最大连接使用情况
SHOW STATUS LIKE 'Threads%';
SHOW STATUS LIKE 'Max_used_connections';
InnoDB监控
MySQL - InnoDB缓冲池:
-- 缓冲池统计
SHOW STATUS LIKE 'Innodb_buffer_pool%';
-- 计算命中率
-- (Innodb_buffer_pool_read_requests - Innodb_buffer_pool_reads) /
-- Innodb_buffer_pool_read_requests
-- 查看InnoDB事务
SELECT * FROM INFORMATION_SCHEMA.INNODB_TRX
ORDER BY trx_started DESC;
-- 查看InnoDB锁
SELECT * FROM INFORMATION_SCHEMA.INNODB_LOCKS;
-- 监控InnoDB页面
SHOW STATUS LIKE 'Innodb_pages%';
MySQL - 表和索引统计:
-- 表统计
SELECT
TABLE_SCHEMA,
TABLE_NAME,
ROUND(((DATA_LENGTH + INDEX_LENGTH) / 1024 / 1024), 2) as Size_MB,
TABLE_ROWS
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA != 'information_schema'
ORDER BY (DATA_LENGTH + INDEX_LENGTH) DESC;
-- 索引基数
SELECT
TABLE_SCHEMA,
TABLE_NAME,
COLUMN_NAME,
SEQ_IN_INDEX,
CARDINALITY
FROM INFORMATION_SCHEMA.STATISTICS
WHERE TABLE_SCHEMA = 'your_database'
ORDER BY TABLE_NAME, SEQ_IN_INDEX;
实时监控工具
PostgreSQL监控设置
PostgreSQL与Prometheus:
# prometheus.yml配置
scrape_configs:
- job_name: 'postgres'
static_configs:
- targets: ['localhost:9187']
# 使用postgres_exporter
# 下载并运行:
# ./postgres_exporter --web.listen-address=:9187
自定义监控查询:
-- 创建监控函数
CREATE OR REPLACE FUNCTION get_database_metrics()
RETURNS TABLE (
metric_name VARCHAR,
metric_value NUMERIC,
collected_at TIMESTAMP
) AS $$
BEGIN
-- 返回各种指标
RETURN QUERY
SELECT 'connections'::VARCHAR,
(SELECT count(*) FROM pg_stat_activity)::NUMERIC,
NOW();
RETURN QUERY
SELECT 'transactions_per_second',
(SELECT sum(xact_commit + xact_rollback) / 60 FROM pg_stat_database)::NUMERIC,
NOW();
RETURN QUERY
SELECT 'cache_hit_ratio',
ROUND(100.0 * (1 - (
(SELECT sum(heap_blks_read) FROM pg_statio_user_tables)::FLOAT /
((SELECT sum(heap_blks_read + heap_blks_hit) FROM pg_statio_user_tables)::FLOAT)
)), 2)::NUMERIC,
NOW();
END;
$$ LANGUAGE plpgsql;
SELECT * FROM get_database_metrics();
自动化监控仪表板
-- 创建监控表
CREATE TABLE database_metrics_history (
collected_at TIMESTAMP,
metric_name VARCHAR(100),
metric_value NUMERIC,
PRIMARY KEY (collected_at, metric_name)
);
-- 收集指标的函数
CREATE OR REPLACE FUNCTION collect_metrics()
RETURNS void AS $$
BEGIN
INSERT INTO database_metrics_history (collected_at, metric_name, metric_value)
SELECT
NOW(),
'active_connections',
(SELECT count(*) FROM pg_stat_activity WHERE state != 'idle')::NUMERIC
UNION ALL
SELECT
NOW(),
'cache_hit_ratio',
ROUND(100.0 * sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)), 2)
FROM pg_statio_user_tables
UNION ALL
SELECT
NOW(),
'database_size_mb',
pg_database_size(current_database())::NUMERIC / 1024 / 1024
UNION ALL
SELECT
NOW(),
'table_bloat_percent',
ROUND(100.0 * sum(n_dead_tup) / sum(n_live_tup + n_dead_tup), 2)
FROM pg_stat_user_tables;
END;
$$ LANGUAGE plpgsql;
-- 通过cron计划
-- SELECT cron.schedule('collect_metrics', '* * * * *', 'SELECT collect_metrics()');
健康检查
PostgreSQL - 健康检查函数:
CREATE OR REPLACE FUNCTION database_health_check()
RETURNS TABLE (
check_name VARCHAR,
status VARCHAR,
details VARCHAR
) AS $$
BEGIN
-- 检查连接
RETURN QUERY
SELECT
'connections'::VARCHAR,
CASE WHEN (SELECT count(*) FROM pg_stat_activity)::INT /
(SELECT setting::INT FROM pg_settings WHERE name = 'max_connections')::FLOAT > 0.8
THEN 'WARNING' ELSE 'OK' END,
'Active connections: ' || (SELECT count(*) FROM pg_stat_activity)::TEXT;
-- 检查缓存命中率
RETURN QUERY
SELECT
'cache_hit_ratio',
CASE WHEN 100.0 * sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) < 90
THEN 'WARNING' ELSE 'OK' END,
'Cache hit ratio: ' ||
ROUND(100.0 * sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)), 2)::TEXT
FROM pg_statio_user_tables;
-- 检查事务包装
RETURN QUERY
SELECT
'transaction_wraparound'::VARCHAR,
CASE WHEN min(age(datfrozenxid)) > 10000000
THEN 'CRITICAL' ELSE 'OK' END,
'Oldest transaction age: ' || min(age(datfrozenxid))::TEXT
FROM pg_database;
END;
$$ LANGUAGE plpgsql;
SELECT * FROM database_health_check();
告警规则
常见告警条件:
1. 高连接使用率(>max_connections的80%)
2. 查询执行时间超过阈值(>5秒)
3. 缓存命中率低于90%
4. 表膨胀百分比>20%
5. 复制延迟>1分钟
6. 磁盘空间使用率>80%
7. 长时间运行的事务(>30分钟)
8. 索引膨胀百分比>30%
性能调整指标
PostgreSQL - 关键监控指标:
-- 缓存命中率(应>99%)
SELECT
sum(heap_blks_hit)::FLOAT / (sum(heap_blks_hit) + sum(heap_blks_read)) as cache_hit_ratio
FROM pg_statio_user_tables;
-- 每秒事务数
SELECT
sum(xact_commit + xact_rollback) / 60 as txns_per_sec
FROM pg_stat_database;
-- 索引使用率
SELECT
schemaname,
tablename,
indexname,
idx_scan,
idx_tup_read,
idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;
故障排除指南
PostgreSQL - 慢查询诊断:
-- 1. 检查查询计划
EXPLAIN ANALYZE
SELECT * FROM orders WHERE user_id = 123 AND status = 'pending';
-- 2. 检查索引
SELECT * FROM pg_stat_user_indexes WHERE tablename = 'orders';
-- 3. 更新统计信息
ANALYZE orders;
-- 4. 检查表膨胀
SELECT n_dead_tup, n_live_tup FROM pg_stat_user_tables WHERE relname = 'orders';
-- 5. 如有需要,运行VACUUM
VACUUM ANALYZE orders;
最佳实践
✅ 监控关键性能指标(KPIs) ✅ 设置关键指标的告警 ✅ 定期基线性能 ✅ 及时调查异常 ✅ 维护监控历史 ✅ 测试告警规则 ✅ 记录告警程序
❌ 忽略警告 ❌ 跳过基线测量 ❌ 设置过于敏感的告警阈值 ❌ 监控而不采取行动 ❌ 忘记磁盘空间