---
name: resource-monitor
description: 在开发和生产环境中监控系统资源（CPU、内存、磁盘、网络）。
---

资源监控技能

在开发和生产环境中监控系统资源（CPU、内存、磁盘、网络）。

指令

您是一位系统资源监控专家。当被调用时：

监控资源：
- CPU使用率和负载平均值
- 内存使用情况（RAM和交换空间）
- 磁盘使用情况和I/O
- 网络流量和连接
- 进程级别指标
分析模式：
- 识别资源密集型进程
- 检测内存泄漏
- 找到CPU瓶颈
- 监控磁盘空间趋势
- 跟踪网络带宽使用
设置警报：
- CPU使用率阈值
- 内存限制
- 磁盘空间警告
- 异常网络活动
提供建议：
- 资源优化策略
- 扩展建议
- 配置改进
- 性能调优

资源指标

CPU监控

# Current CPU usage: one batch-mode iteration of top, filtered to the "Cpu(s)" summary line
top -bn1 | grep "Cpu(s)"

# Per-core usage for all processors, sampled every second (runs until interrupted)
mpstat -P ALL 1

# Processes sorted by CPU usage, highest first (ps header line plus the top nine)
ps aux --sort=-%cpu | head -10

# Load averages
uptime

# Node.js CPU profiling: record a V8 profile while the app runs,
# then post-process the generated isolate-*.log into a readable report
node --prof app.js
node --prof-process isolate-*.log

内存监控

# Memory usage in human-readable units
free -h

# Detailed memory information from the kernel
cat /proc/meminfo

# Processes sorted by memory usage, highest first (ps header line plus the top nine)
ps aux --sort=-%mem | head -10

# Memory map of a specific process (replace <PID> with the target process ID)
pmap -x <PID>

# Node.js memory usage: start the app with the inspector enabled
node --inspect app.js
# Then attach Chrome DevTools and open the Memory panel

磁盘监控

# Disk space per mounted filesystem, human-readable
df -h

# Extended disk I/O statistics, refreshed every second (runs until interrupted)
iostat -x 1

# Largest entries directly under / (one level deep), biggest first, top 20 lines
du -h --max-depth=1 / | sort -hr | head -20

# Interactive disk-usage browser rooted at /
ncdu /

# Per-process disk I/O monitor (NOTE(review): typically needs root privileges — confirm on target hosts)
iotop

网络监控

# All TCP/UDP sockets (listening and connected), numeric addresses, with owning process
netstat -tunapl

# Socket summary statistics
ss -s

# Bandwidth usage per connection (interactive)
iftop

# Network traffic per interface (interactive)
nload

# Number of TCP sockets in each connection state.
# NR>2 skips netstat's two header lines so they are not counted as bogus states.
netstat -ant | awk 'NR>2 {print $6}' | sort | uniq -c | sort -n

监控脚本

Node.js资源监控器

// resource-monitor.js
const os = require('os');

/**
 * Periodically prints a console dashboard with CPU, memory and process-memory
 * figures for the local machine, followed by threshold warnings.
 */
class ResourceMonitor {
  /**
   * @param {number} [interval=5000] Delay between reports, in milliseconds.
   */
  constructor(interval = 5000) {
    this.interval = interval;
    this.startTime = Date.now();
    // Handle returned by setInterval while the monitor is running, else null.
    this.timer = null;
    // Aggregate CPU tick counters from the previous getCPUUsage() call.
    this.previousCpuSample = null;
  }

  /**
   * Prints one report immediately and then one every `interval` milliseconds.
   * Calling start() while already running is a no-op, so timers never stack.
   */
  start() {
    if (this.timer !== null) {
      return;
    }
    console.log('🔍 资源监控器已启动\n');
    this.logResources();
    this.timer = setInterval(() => this.logResources(), this.interval);
  }

  /**
   * Stops the periodic reporting started by start(). Safe to call when idle.
   */
  stop() {
    if (this.timer !== null) {
      clearInterval(this.timer);
      this.timer = null;
    }
  }

  /**
   * Clears the console and prints the full dashboard: monitor uptime, CPU,
   * system memory, this process's memory, and any threshold warnings.
   */
  logResources() {
    const uptime = Math.floor((Date.now() - this.startTime) / 1000);
    const cpu = this.getCPUUsage();
    const memory = this.getMemoryUsage();
    const [load1, load5, load15] = os.loadavg();

    console.clear();
    console.log('📊 系统资源');
    console.log('='.repeat(50));
    console.log(`运行时间：${this.formatUptime(uptime)}`);
    console.log('');

    console.log('CPU：');
    console.log(`  使用率：${cpu.toFixed(2)}%`);
    console.log(`  负载平均值：${load1.toFixed(2)}, ${load5.toFixed(2)}, ${load15.toFixed(2)}`);
    console.log(`  核心数：${os.cpus().length}`);
    console.log('');

    console.log('内存：');
    console.log(`  总计：${this.formatBytes(memory.total)}`);
    console.log(`  已使用：${this.formatBytes(memory.used)} (${memory.percentage.toFixed(2)}%)`);
    console.log(`  可用：${this.formatBytes(memory.free)}`);
    this.printProgressBar('内存', memory.percentage);
    console.log('');

    const processMemory = process.memoryUsage();
    console.log('进程内存：');
    console.log(`  RSS：${this.formatBytes(processMemory.rss)}`);
    console.log(`  堆总计：${this.formatBytes(processMemory.heapTotal)}`);
    console.log(`  堆使用：${this.formatBytes(processMemory.heapUsed)}`);
    console.log(`  外部：${this.formatBytes(processMemory.external)}`);
    console.log('');

    this.checkThresholds(cpu, memory);
  }

  /**
   * Computes overall CPU utilisation across all cores.
   *
   * os.cpus() exposes tick counters accumulated since boot, so the value is
   * derived from the difference between this call and the previous one. The
   * first call has no earlier sample and falls back to the since-boot average.
   *
   * @returns {number} Usage percentage in the range 0-100; 0 when no ticks
   *   elapsed between samples or no CPU information is available.
   */
  getCPUUsage() {
    const sample = { idle: 0, total: 0 };
    for (const { times } of os.cpus()) {
      sample.idle += times.idle;
      sample.total += Object.values(times).reduce((sum, ticks) => sum + ticks, 0);
    }

    const previous = this.previousCpuSample;
    this.previousCpuSample = sample;

    const idleDelta = previous ? sample.idle - previous.idle : sample.idle;
    const totalDelta = previous ? sample.total - previous.total : sample.total;
    if (totalDelta <= 0) {
      // No elapsed ticks (or no CPU data): avoid dividing by zero.
      return 0;
    }

    const usage = 100 * (1 - idleDelta / totalDelta);
    return Math.min(100, Math.max(0, usage));
  }

  /**
   * Reads system memory totals from the os module.
   *
   * @returns {{total: number, free: number, used: number, percentage: number}}
   *   Sizes in bytes; `percentage` is used/total on a 0-100 scale.
   */
  getMemoryUsage() {
    const total = os.totalmem();
    const free = os.freemem();
    const used = total - free;
    const percentage = (used / total) * 100;

    return { total, free, used, percentage };
  }

  /**
   * Formats a byte count with a binary (1024-based) unit and two decimals.
   *
   * @param {number} bytes Size in bytes.
   * @returns {string} For example "1.50 KB"; values beyond TB stay in TB.
   */
  formatBytes(bytes) {
    const units = ['B', 'KB', 'MB', 'GB', 'TB'];
    let size = bytes;
    let unitIndex = 0;

    while (size >= 1024 && unitIndex < units.length - 1) {
      size /= 1024;
      unitIndex++;
    }

    return `${size.toFixed(2)} ${units[unitIndex]}`;
  }

  /**
   * Formats a duration as "<hours>h <minutes>m <seconds>s".
   *
   * @param {number} seconds Duration in whole seconds.
   * @returns {string}
   */
  formatUptime(seconds) {
    const hours = Math.floor(seconds / 3600);
    const minutes = Math.floor((seconds % 3600) / 60);
    const secs = seconds % 60;
    return `${hours}h ${minutes}m ${secs}s`;
  }

  /**
   * Prints a 40-character coloured bar followed by the percentage.
   *
   * @param {string} label Accepted for call-site readability; not printed.
   * @param {number} percentage Value to display; the bar itself is clamped
   *   to 0-100 so out-of-range input cannot make String.repeat() throw.
   */
  printProgressBar(label, percentage) {
    const width = 40;
    const clamped = Math.min(100, Math.max(0, percentage));
    const filled = Math.floor((width * clamped) / 100);
    const empty = width - filled;
    const bar = '█'.repeat(filled) + '░'.repeat(empty);

    let color = '\x1b[32m'; // green
    if (percentage > 70) color = '\x1b[33m'; // yellow
    if (percentage > 85) color = '\x1b[31m'; // red

    console.log(`  ${color}[${bar}] ${percentage.toFixed(1)}%\x1b[0m`);
  }

  /**
   * Prints a warning section when CPU or memory usage exceeds 80%.
   *
   * @param {number} cpu CPU usage percentage.
   * @param {{percentage: number}} memory Memory usage as returned by getMemoryUsage().
   */
  checkThresholds(cpu, memory) {
    const warnings = [];

    if (cpu > 80) {
      warnings.push(`⚠️  高CPU使用率：${cpu.toFixed(2)}%`);
    }

    if (memory.percentage > 80) {
      warnings.push(`⚠️  高内存使用率：${memory.percentage.toFixed(2)}%`);
    }

    if (warnings.length > 0) {
      console.log('\n警告：');
      warnings.forEach((warning) => console.log(`  ${warning}`));
    }
  }
}

// Start monitoring: print a report immediately, then one every 5000 ms
const monitor = new ResourceMonitor(5000);
monitor.start();

Python资源监控器

# resource_monitor.py
import psutil
import time
from datetime import datetime

class ResourceMonitor:
    """Console dashboard that periodically prints CPU, memory, disk and
    network figures gathered through psutil, followed by threshold warnings."""

    def __init__(self, interval=5):
        """Store the pause between reports.

        Args:
            interval: Seconds to sleep between two reports.
        """
        self.interval = interval

    def start(self):
        """Print reports forever, sleeping ``interval`` seconds between them.

        This call never returns on its own; it runs until the process is
        interrupted.
        """
        print("🔍 资源监控器已启动\n")
        while True:
            self.log_resources()
            time.sleep(self.interval)

    def log_resources(self):
        """Collect one set of measurements and print the full dashboard.

        ``psutil.cpu_percent(interval=1)`` blocks for one second while it
        samples, so each report takes at least that long.
        """
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')
        net = psutil.net_io_counters()

        print("\033[2J\033[H")  # ANSI: clear the screen and move the cursor home
        print("📊 系统资源")
        print("=" * 50)
        print(f"时间：{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        print("CPU：")
        print(f"  使用率：{cpu_percent}%")
        print(f"  核心数：{psutil.cpu_count()}")
        self.print_progress_bar("CPU", cpu_percent)
        print()

        print("内存：")
        print(f"  总计：{self.format_bytes(memory.total)}")
        print(f"  已使用：{self.format_bytes(memory.used)} ({memory.percent}%)")
        print(f"  可用：{self.format_bytes(memory.available)}")
        self.print_progress_bar("内存", memory.percent)
        print()

        print("磁盘：")
        print(f"  总计：{self.format_bytes(disk.total)}")
        print(f"  已使用：{self.format_bytes(disk.used)} ({disk.percent}%)")
        print(f"  可用：{self.format_bytes(disk.free)}")
        self.print_progress_bar("磁盘", disk.percent)
        print()

        print("网络：")
        print(f"  发送：{self.format_bytes(net.bytes_sent)}")
        print(f"  接收：{self.format_bytes(net.bytes_recv)}")
        print()

        self.check_thresholds(cpu_percent, memory.percent, disk.percent)

    def format_bytes(self, bytes):
        """Format a byte count with a binary (1024-based) unit, two decimals.

        Args:
            bytes: Size in bytes.

        Returns:
            A string such as ``"1.50 KB"``; values of 1024 TB or more are
            expressed in PB.
        """
        # Work on a local copy instead of reassigning the parameter, which
        # also shadows the builtin of the same name.
        size = bytes
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f"{size:.2f} {unit}"
            size /= 1024
        return f"{size:.2f} PB"

    def print_progress_bar(self, label, percentage):
        """Print a 40-character coloured bar followed by the percentage.

        Args:
            label: Accepted for call-site readability; not printed.
            percentage: Value to display. The bar is clamped to 0-100 so an
                out-of-range value cannot produce a malformed bar.
        """
        width = 40
        clamped = max(0, min(100, percentage))
        filled = int(width * clamped / 100)
        empty = width - filled
        bar = '█' * filled + '░' * empty

        if percentage > 85:
            color = '\033[91m'  # red
        elif percentage > 70:
            color = '\033[93m'  # yellow
        else:
            color = '\033[92m'  # green

        print(f"  {color}[{bar}] {percentage:.1f}%\033[0m")

    def check_thresholds(self, cpu, memory, disk):
        """Print a warning section for every metric above 80 percent.

        Args:
            cpu: CPU usage percentage.
            memory: Memory usage percentage.
            disk: Disk usage percentage.
        """
        warnings = []

        if cpu > 80:
            warnings.append(f"⚠️  高CPU使用率：{cpu}%")
        if memory > 80:
            warnings.append(f"⚠️  高内存使用率：{memory}%")
        if disk > 80:
            # One decimal avoids float noise such as 18.700000000000003.
            warnings.append(f"⚠️  低磁盘空间：{100 - disk:.1f}% 可用")

        if warnings:
            print("\n警告：")
            for warning in warnings:
                print(f"  {warning}")

# Start monitoring with a 5-second pause between reports (start() loops forever)
monitor = ResourceMonitor(interval=5)
monitor.start()

使用示例

@resource-monitor
@resource-monitor --interval 5
@resource-monitor --alert
@resource-monitor --process node
@resource-monitor --export-metrics

监控报告

# 资源监控报告

**期间**：2024-01-15 00:00 - 23:59
**服务器**：web-server-01
**环境**：生产

---

## 执行摘要

**整体健康状态**：🟢 良好
**关键警报**：0
**警告**：3
**平均CPU**：45%
**平均内存**：62%
**磁盘使用率**：58%

---

## CPU指标

**平均**：45%
**峰值**：87%（在14:30）
**最低**：12%（在03:00）

**负载平均值**：
- 1分钟：2.34
- 5分钟：2.12
- 15分钟：1.98

**CPU占用最高的进程**：
1. node（PID 1234）：34%
2. postgres（PID 5678）：12%
3. redis（PID 9012）：5%

**时间线**：

00:00 ████░░░░░░ 12%
06:00 ████████░░ 35%
12:00 ███████████ 52%
14:30 █████████████████ 87% ⚠️ 峰值
18:00 ████████░░ 38%
23:00 █████░░░░░ 18%


---

## 内存指标

**总计**：16 GB
**平均已使用**：9.92 GB（62%）
**峰值**：13.6 GB（85%）⚠️
**交换空间已使用**：0 GB

**内存细分**：
- 应用程序：6.4 GB（40%）
- 数据库：2.4 GB（15%）
- 缓存：1.12 GB（7%）
- 系统：0.8 GB（5%）
- 可用：5.28 GB（33%）

**内存占用最高的进程**：
1. node（PID 1234）：6.4 GB
2. postgres（PID 5678）：2.4 GB
3. redis（PID 9012）：1.12 GB

**内存时间线**：

00:00 ████████░░ 58%
06:00 ████████░░ 62%
12:00 █████████░ 68%
14:30 █████████████ 85% ⚠️ 峰值
18:00 ████████░░ 65%
23:00 ████████░░ 60%


---

## 磁盘指标

**总计**：500 GB
**已使用**：290 GB（58%）
**可用**：210 GB（42%）

**磁盘I/O**：
- 读取：12.3 GB/天
- 写入：8.7 GB/天
- 平均IOPS：234

**最大目录**：
1. /var/log：45 GB（15.5%）
2. /var/lib/postgresql：89 GB（30.7%）
3. /app/uploads：67 GB（23.1%）
4. /var/lib/redis：23 GB（7.9%）

**增长趋势**：+2.3 GB/天
**预计满时间**：91天

---

## 网络指标

**流量**：
- 发送：234 GB
- 接收：456 GB
- 总计：690 GB

**带宽**：
- 平均：80 Mbps
- 峰值：450 Mbps（在15:00）

**连接**：
- 已建立（ESTABLISHED）：1,234
- TIME_WAIT：456
- CLOSE_WAIT：23

**流量最大的主机**：
1. 192.168.1.100：45 GB
2. 10.0.0.50：34 GB
3. 172.16.0.20：28 GB

---

## 警报与警告

### 关键（0）
无

### 警告（3）

1. **高CPU在14:30**
   - 峰值：87%
   - 持续时间：15分钟
   - 原因：计划报告生成
   - 行动：考虑移动到非高峰时段

2. **高内存在14:30**
   - 峰值：85%
   - 持续时间：20分钟
   - 原因：大数据集处理
   - 行动：实施流式处理或分页

3. **日志目录增长**
   - 大小：45 GB
   - 增长：1.2 GB/天
   - 行动：实施日志轮转和归档

---

## 推荐

### 立即行动
1. ✓ 实施日志轮转（从45 GB减少到<10 GB）
2. ✓ 安排资源密集型任务在非高峰时段
3. ✓ 为应用程序添加内存限制（最大8 GB）

### 短期
1. 监控内存使用趋势以检测潜在泄漏
2. 优化报告生成查询
3. 为频繁访问的数据添加缓存
4. 归档旧数据库数据

### 长期
1. 考虑垂直扩展（升级到32 GB RAM）
2. 在高峰时段实施水平扩展
3. 将文件上传移动到对象存储（如S3）
4. 设置预测性警报

---

## 容量规划

**当前容量**：🟢 良好

**预测**（未来3个月）：
- CPU：将保持在可接受范围内
- 内存：如果趋势持续可能需要升级
- 磁盘：需要处理日志增长
- 网络：当前容量足够

**推荐行动**：
- 每周监控内存使用情况
- 在1周内实施日志归档
- 在6个月内计划存储扩展

警报阈值

CPU

警告：> 70% 持续5分钟
关键：> 85% 持续5分钟

内存

警告：> 80% 已使用
关键：> 90% 已使用

磁盘

警告：> 80% 已使用
关键：> 90% 已使用

网络

警告：> 80% 带宽
关键：连接错误 > 100/分钟

工具与集成

监控工具

Prometheus：指标收集
Grafana：可视化和仪表板
Datadog：全栈监控
New Relic：应用性能监控
CloudWatch：AWS监控
htop：交互式进程查看器
glances：系统监控（CLI）

Node.js监控

// Prometheus integration using prom-client
const client = require('prom-client');

const register = new client.Registry();

// CPU metric
const cpuUsage = new client.Gauge({
  name: 'process_cpu_usage_percent',
  help: '进程CPU使用率百分比',
  registers: [register]
});

// Memory metric
const memoryUsage = new client.Gauge({
  name: 'process_memory_usage_bytes',
  help: '进程内存使用字节数',
  registers: [register]
});

// process.cpuUsage() reports cumulative CPU time in microseconds since the
// process started, so a percentage has to be derived from the CPU time spent
// between two samples divided by the wall-clock time between them.
let lastCpuSample = process.cpuUsage();
let lastSampleTime = process.hrtime.bigint();

// Update the metrics every 5 seconds
setInterval(() => {
  const now = process.hrtime.bigint();
  const currentCpu = process.cpuUsage();

  const busyMicros =
    (currentCpu.user - lastCpuSample.user) +
    (currentCpu.system - lastCpuSample.system);
  // hrtime.bigint() is in nanoseconds; convert to microseconds to match cpuUsage().
  const elapsedMicros = Number(now - lastSampleTime) / 1000;

  if (elapsedMicros > 0) {
    // Relative to a single core: a process busy on several threads can exceed 100.
    cpuUsage.set((100 * busyMicros) / elapsedMicros);
  }

  lastCpuSample = currentCpu;
  lastSampleTime = now;

  const mem = process.memoryUsage();
  memoryUsage.set(mem.heapUsed);
}, 5000);

注意事项

定期监控，而非仅在问题发生时
设置自动警报以处理关键阈值
保留历史数据以进行趋势分析
将资源使用与应用程序事件关联
使用监控数据进行容量规划
建立正常行为的基准
避免过度警报（警报疲劳）
记录异常模式及其原因