gpu-benchmarking
你是 gpu-benchmarking - 一个专门用于自动化GPU性能基准测试和回归检测的专家技能。这个技能提供了测量、分析和跟踪GPU内核性能随时间变化的专家能力。
概览
这个技能使得AI驱动的GPU基准测试操作成为可能,包括:
- 设计内核操作的微基准测试
- 使用CUDA事件测量内核执行时间
- 计算实际与理论性能
- 生成性能比较报告
- 在CI/CD中检测性能回归
- 分析功耗和热特性
- 基准测试内存带宽和延迟
- 创建可复现的基准测试配置
前提条件
- NVIDIA CUDA Toolkit 11.0+
- 支持性能计数器的GPU
- nvidia-smi用于功耗/热监测
- 可选:Nsight Systems/Compute用于详细分析
- CI/CD系统用于回归跟踪
能力
1. CUDA事件计时
精确的内核执行时间测量:
// Benchmark timing wrapper using CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Warmup launch so one-time costs (module load, clock ramp-up) are excluded.
myKernel<<<grid, block>>>(args);
cudaDeviceSynchronize();

// Timed runs: record events around the whole loop, then average.
cudaEventRecord(start);
for (int i = 0; i < NUM_ITERATIONS; i++) {
    myKernel<<<grid, block>>>(args);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);  // blocks host until `stop` has executed on the GPU

float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
float avg_ms = milliseconds / NUM_ITERATIONS;
// Original had the "\n" escapes garbled into literal line breaks (invalid C).
printf("Average kernel time: %.3f ms\n", avg_ms);
printf("Throughput: %.2f GB/s\n", (data_size_bytes / 1e9) / (avg_ms / 1000));
cudaEventDestroy(start);
cudaEventDestroy(stop);
2. 全面的基准测试框架
#include <cuda_runtime.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>
// Aggregated timing statistics for one benchmarked kernel configuration.
struct BenchmarkResult {
float min_ms;          // fastest single timed iteration
float max_ms;          // slowest single timed iteration
float mean_ms;         // arithmetic mean over all samples
float median_ms;       // middle sample after sorting (robust to outliers)
float stddev_ms;       // standard deviation of the samples
float throughput_gbps; // GB/s derived from the median time
float achieved_flops;  // TFLOPS derived from the median time
int iterations;        // number of timed samples collected
};
// Runs `kernel` `iterations` times after `warmup` untimed launches and
// returns timing statistics plus derived throughput / FLOP rates.
//
// `kernel` is a host callable that performs the actual <<<...>>> launch
// itself (a capturing lambda — see benchmark_memory_bandwidth below). The
// original body tried `kernel<<<grid, block>>>()`, which cannot compile for
// the lambdas this file passes in; `grid`/`block` are kept in the signature
// for backward compatibility and reporting only.
//
// data_bytes: bytes read + written per launch (for GB/s).
// flop_count: floating-point ops per launch (for TFLOPS); pass 0 to skip.
template <typename KernelFunc>
BenchmarkResult benchmark_kernel(
    KernelFunc kernel,
    dim3 grid, dim3 block,
    size_t data_bytes,
    size_t flop_count,
    int warmup = 10,
    int iterations = 100
) {
    (void)grid; (void)block;  // launch config is owned by the callable

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Warmup: exclude one-time costs from the timed samples.
    for (int i = 0; i < warmup; i++) {
        kernel();
    }
    cudaDeviceSynchronize();

    // One timing sample per launch.
    std::vector<float> times(iterations);
    for (int i = 0; i < iterations; i++) {
        cudaEventRecord(start);
        kernel();
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        // Original line was garbled to "cudaEventElapsedTime(×[i], ...)".
        cudaEventElapsedTime(&times[i], start, stop);
    }

    // Order statistics.
    std::sort(times.begin(), times.end());
    BenchmarkResult result = {};  // zero-init so no field is ever garbage
    result.iterations = iterations;
    result.min_ms = times.front();
    result.max_ms = times.back();
    result.median_ms = times[iterations / 2];

    float sum = 0, sq_sum = 0;
    for (float t : times) {
        sum += t;
        sq_sum += t * t;
    }
    result.mean_ms = sum / iterations;
    // Clamp before sqrt: float rounding can push the variance slightly negative.
    float variance = sq_sum / iterations - result.mean_ms * result.mean_ms;
    result.stddev_ms = std::sqrt(std::max(variance, 0.0f));

    // Derived rates use the median, which is robust against outlier samples.
    result.throughput_gbps = (data_bytes / 1e9) / (result.median_ms / 1000);
    result.achieved_flops = (flop_count / 1e12) / (result.median_ms / 1000); // TFLOPS

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return result;
}
3. Roofline模型分析
计算理论与实际性能:
// Inputs and outputs of a roofline-model analysis for one kernel.
struct RooflineMetrics {
// Hardware limits
float peak_memory_bandwidth_gbps;
float peak_flops_tflops;
// Kernel characteristics
float arithmetic_intensity; // FLOPs per byte of memory traffic
float achieved_flops_tflops;
float achieved_bandwidth_gbps;
// Efficiency relative to hardware peaks
float compute_efficiency; // % of peak FLOPS
float bandwidth_efficiency; // % of peak bandwidth
bool is_compute_bound;
};
// Builds roofline-model metrics from a benchmark result and the kernel's
// FLOP / byte counts. flop_count and bytes_accessed are per launch.
RooflineMetrics calculate_roofline(
    BenchmarkResult& result,
    size_t flop_count,
    size_t bytes_accessed,
    cudaDeviceProp& props
) {
    // Zero-init: the original set only one of the two efficiency fields,
    // leaving the other uninitialized (garbage when read by reporting code).
    RooflineMetrics metrics = {};

    // Peak DRAM bandwidth: bus width (bits -> bytes) * memory clock
    // (memoryClockRate is in kHz; /1e6 -> GHz) * 2 for DDR.
    metrics.peak_memory_bandwidth_gbps =
        (props.memoryBusWidth / 8.0) * (props.memoryClockRate / 1e6) * 2; // DDR

    // NOTE(review): maxThreadsPerMultiProcessor counts resident-thread
    // slots, NOT FP32 cores, so this grossly overestimates peak FLOPS. A
    // correct value needs a cores-per-SM table keyed on compute capability
    // (cudaDeviceProp.major/minor) — TODO confirm against the GPU datasheet.
    metrics.peak_flops_tflops =
        (props.multiProcessorCount * props.maxThreadsPerMultiProcessor *
        props.clockRate / 1e9) * 2; // FMA = 2 FLOPS

    // Arithmetic intensity: FLOPs per byte of memory traffic.
    metrics.arithmetic_intensity = (float)flop_count / bytes_accessed;

    // Achieved performance from the measurement.
    metrics.achieved_flops_tflops = result.achieved_flops;
    metrics.achieved_bandwidth_gbps = result.throughput_gbps;

    // Ridge point: intensity at which a kernel stops being memory-bound.
    float ridge_point = metrics.peak_flops_tflops / metrics.peak_memory_bandwidth_gbps;
    metrics.is_compute_bound = metrics.arithmetic_intensity > ridge_point;

    // Compute BOTH efficiencies so neither field is ever left undefined.
    metrics.compute_efficiency =
        (metrics.achieved_flops_tflops / metrics.peak_flops_tflops) * 100;
    metrics.bandwidth_efficiency =
        (metrics.achieved_bandwidth_gbps / metrics.peak_memory_bandwidth_gbps) * 100;

    return metrics;
}
4. 内存带宽基准测试
// 全局内存带宽测试
// Streaming copy kernel for device-memory bandwidth measurement.
// Grid-stride loop, so any <<<blocks, threads>>> configuration is correct.
// The explicit size_t casts fix the original's 32-bit overflow: the
// products were evaluated in unsigned int before widening.
__global__ void bandwidthTestCopy(float* dst, const float* src, size_t n) {
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = idx; i < n; i += stride) {
        dst[i] = src[i];
    }
}
// Read-only bandwidth kernel: sums the input so the loads cannot be
// eliminated; thread 0 writes the sum to defeat dead-code elimination.
// size_t casts fix the original's 32-bit overflow in the index products.
__global__ void bandwidthTestRead(float* dst, const float* src, size_t n) {
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    float sum = 0.0f;
    for (size_t i = idx; i < n; i += stride) {
        sum += src[i];
    }
    // Prevent the compiler from optimizing the read loop away.
    if (idx == 0) dst[0] = sum;
}
// Measures device-memory copy and read bandwidth for a buffer of size_mb MiB.
// Results are printed; the buffers are allocated and freed internally.
void benchmark_memory_bandwidth(size_t size_mb) {
    size_t size = size_mb * 1024 * 1024;
    size_t n = size / sizeof(float);

    float *d_src, *d_dst;
    cudaMalloc(&d_src, size);
    cudaMalloc(&d_dst, size);
    // Give the source defined contents; the original read uninitialized
    // device memory (harmless for bandwidth, but undefined behavior).
    cudaMemset(d_src, 0, size);

    int blocks = 256;
    int threads = 256;

    // Copy bandwidth: each element is read once and written once.
    auto copy_result = benchmark_kernel(
        [=]() { bandwidthTestCopy<<<blocks, threads>>>(d_dst, d_src, n); },
        dim3(blocks), dim3(threads),
        size * 2, // read + write
        0
    );
    // Original had the "\n" escapes garbled into literal line breaks.
    printf("Copy Bandwidth: %.2f GB/s\n", copy_result.throughput_gbps);

    // Read-only bandwidth.
    auto read_result = benchmark_kernel(
        [=]() { bandwidthTestRead<<<blocks, threads>>>(d_dst, d_src, n); },
        dim3(blocks), dim3(threads),
        size, // read only
        0
    );
    printf("Read Bandwidth: %.2f GB/s\n", read_result.throughput_gbps);

    cudaFree(d_src);
    cudaFree(d_dst);
}
5. 延迟基准测试
// 使用指针追逐测量内存延迟
__global__ void pointerChase(int* ptr, int* result, int iterations) {
int idx = 0;
for (int i = 0; i < iterations; i++) {
idx = ptr[idx];
}
*result = idx; // 防止优化
}
// Measures average global-memory load latency (ns per dependent load) via a
// single-thread pointer chase over a randomly permuted 4 MB cycle.
float measure_memory_latency() {
    const int N = 1024 * 1024; // 4 MB of int

    // Build one random cycle visiting all N slots: shuffle positions 1..N-1
    // (slot 0 stays first so the links form a single loop), then chain them.
    std::vector<int> h_ptr(N);  // was raw new[]/delete[] — vector is leak-safe
    std::vector<int> indices(N);
    std::iota(indices.begin(), indices.end(), 0);
    // std::random_shuffle was removed in C++17; std::shuffle with a fixed
    // seed also makes the chase pattern reproducible across runs.
    std::mt19937 rng(12345);
    std::shuffle(indices.begin() + 1, indices.end(), rng);
    for (int i = 0; i < N - 1; i++) {
        h_ptr[indices[i]] = indices[i + 1];
    }
    h_ptr[indices[N - 1]] = indices[0];

    int *d_ptr, *d_result;
    cudaMalloc(&d_ptr, N * sizeof(int));
    cudaMalloc(&d_result, sizeof(int));
    cudaMemcpy(d_ptr, h_ptr.data(), N * sizeof(int), cudaMemcpyHostToDevice);

    // Time the chase with CUDA events.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int ITERATIONS = 10000;
    // Warmup launch so first-launch overhead is not counted in the timing.
    pointerChase<<<1, 1>>>(d_ptr, d_result, 1);
    cudaDeviceSynchronize();

    cudaEventRecord(start);
    pointerChase<<<1, 1>>>(d_ptr, d_result, ITERATIONS);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    float latency_ns = (ms * 1e6) / ITERATIONS; // ms -> ns, per dependent load

    // The original leaked both events.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_ptr);
    cudaFree(d_result);
    return latency_ns;
}
6. 功耗和热监测
#!/bin/bash
# power_monitor.sh - sample GPU power/thermals while a benchmark command runs.
# Usage: ./power_monitor.sh "<benchmark command>"
BENCHMARK_CMD=$1
LOG_FILE="power_log.csv"

echo "timestamp,power_w,temp_c,gpu_util,mem_util" > "$LOG_FILE"

# Start power monitoring in the background.
# BUG FIX: nvidia-smi's -l flag takes SECONDS, so `-l 100` sampled once
# every 100 seconds. `-lms 100` samples every 100 ms as intended.
nvidia-smi --query-gpu=timestamp,power.draw,temperature.gpu,utilization.gpu,utilization.memory \
    --format=csv,noheader -lms 100 >> "$LOG_FILE" &
MONITOR_PID=$!

# Run the benchmark under test.
eval "$BENCHMARK_CMD"

# Stop monitoring (ignore errors if it already exited).
kill "$MONITOR_PID" 2>/dev/null
wait "$MONITOR_PID" 2>/dev/null

# Summarize the collected samples.
echo "=== Power Analysis ==="
awk -F',' '
NR>1 {
    power+=$2; temp+=$3; count++
    if($2>max_power) max_power=$2
}
END {
    if (count == 0) { print "No samples collected"; exit }
    print "Average Power: " power/count " W"
    print "Peak Power: " max_power " W"
    print "Average Temperature: " temp/count " C"
}
' "$LOG_FILE"
7. CI/CD回归检测
# .github/workflows/gpu-benchmark.yml
name: GPU性能基准测试
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
benchmark:
runs-on: [self-hosted, gpu]
steps:
- uses: actions/checkout@v3
- name: 构建基准测试
run: |
nvcc -O3 -arch=sm_80 benchmarks/*.cu -o gpu_benchmark
- name: 运行基准测试
run: |
./gpu_benchmark --json > benchmark_results.json
- name: 检查回归
run: |
python scripts/check_regression.py \
--current benchmark_results.json \
--baseline benchmarks/baseline.json \
--threshold 5.0 # 5%回归阈值
- name: 上传结果
uses: actions/upload-artifact@v3
with:
name: benchmark-results
path: benchmark_results.json
# scripts/check_regression.py
"""Compare current benchmark timings against a baseline and fail on regressions."""
import json
import sys
import argparse


def check_regression(current_file, baseline_file, threshold_percent):
    """Exit 1 (after printing details) if any kernel slowed down by more than
    ``threshold_percent`` relative to the baseline; exit 0 otherwise.

    Both files are JSON of the form {"kernels": {"<name>": <time_ms>, ...}}.
    Kernels present in only one file, or with a zero/absent baseline time,
    are skipped (a zero baseline would otherwise raise ZeroDivisionError).
    """
    with open(current_file) as f:
        current = json.load(f)
    with open(baseline_file) as f:
        baseline = json.load(f)

    regressions = []
    for kernel, current_time in current['kernels'].items():
        baseline_time = baseline['kernels'].get(kernel)
        if not baseline_time:  # missing kernel or zero baseline: nothing to compare
            continue
        change_percent = ((current_time - baseline_time) / baseline_time) * 100
        if change_percent > threshold_percent:
            regressions.append({
                'kernel': kernel,
                'baseline_ms': baseline_time,
                'current_ms': current_time,
                'change_percent': change_percent,
            })

    if regressions:
        print("Performance regressions detected:")
        for r in regressions:
            print(f" {r['kernel']}: {r['baseline_ms']:.3f}ms -> {r['current_ms']:.3f}ms ({r['change_percent']:+.1f}%)")
        sys.exit(1)
    else:
        print("No performance regressions detected")
        sys.exit(0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--current', required=True)
    parser.add_argument('--baseline', required=True)
    parser.add_argument('--threshold', type=float, default=5.0)
    args = parser.parse_args()
    check_regression(args.current, args.baseline, args.threshold)
8. 基准测试报告生成
void generate_benchmark_report(
const std::vector<BenchmarkResult>& results,
const std::vector<std::string>& kernel_names,
const std::string& output_file
) {
std::ofstream report(output_file);
report << "# GPU基准测试报告
";
report << "Date: " << get_timestamp() << "
";
report << "GPU: " << get_gpu_name() << "
";
report << "Driver: " << get_driver_version() << "
";
report << "## 结果总结
";
report << "| 内核 | 最小(ms) | 平均(ms) | 最大(ms) | 标准差 | 吞吐量(GB/s) |
";
report << "|--------|----------|-----------|----------|--------|-------------------|
";
for (size_t i = 0; i < results.size(); i++) {
const auto& r = results[i];
report << "| " << kernel_names[i]
<< " | " << std::fixed << std::setprecision(3) << r.min_ms
<< " | " << r.mean_ms
<< " | " << r.max_ms
<< " | " << r.stddev_ms
<< " | " << std::setprecision(2) << r.throughput_gbps
<< " |
";
}
report.close();
}
MCP服务器集成
这个技能可以利用以下MCP服务器:
| 服务器 | 描述 | 参考 |
|---|---|---|
| NVIDIA AgentIQ MCP | 分析和可观测性 | NVIDIA文档 |
最佳实践
基准测试设计
- 预热运行 - 在计时之前执行几次迭代
- 多次迭代 - 收集统计数据,而不是单次测量
- 报告方差 - 包括标准差和最小/最大值
- 控制环境 - 固定GPU时钟,禁用提升
可复现性
# 锁定GPU时钟以进行一致的基准测试
sudo nvidia-smi -pm 1 # 启用持久模式
sudo nvidia-smi -lgc 1500,1500 # 锁定图形时钟
sudo nvidia-smi -lmc 877,877 # 锁定内存时钟
# 运行基准测试
./gpu_benchmark
# 恢复自动时钟
sudo nvidia-smi -rgc # 重置图形时钟
sudo nvidia-smi -rmc # 重置内存时钟
跟踪指标
| 指标 | 描述 |
|---|---|
| 执行时间 | 内核持续时间 |
| 吞吐量 | 每秒处理的数据 |
| FLOPS | 每秒浮点运算次数 |
| 带宽利用率 | 理论峰值的百分比 |
| 占用率 | 活跃warp/最大warp |
流程集成
这个技能与以下流程集成:
- gpu-performance-regression-testing.js - CI/CD集成
- performance-profiling-analysis.js - 详细分析
- occupancy-optimization.js - 资源利用
输出格式
执行操作时,提供结构化输出:
{
"operation": "benchmark-suite",
"status": "success",
"environment": {
"gpu": "NVIDIA A100-SXM4-80GB",
"cuda_version": "12.2",
"driver_version": "535.104.05",
"timestamp": "2026-01-24T10:30:00Z"
},
"results": [
{
"kernel": "matrixMultiply",
"config": {
"grid": [256, 256, 1],
"block": [16, 16, 1],
"data_size_mb": 1024
},
"timing": {
"min_ms": 1.234,
"mean_ms": 1.267,
"max_ms": 1.312,
"stddev_ms": 0.023,
"iterations": 100
},
"performance": {
"throughput_gbps": 1234.5,
"tflops": 15.2,
"efficiency_percent": 78.5
}
}
],
"comparison": {
"baseline_version": "v1.2.3",
"regressions": [],
"improvements": [
{"kernel": "matrixMultiply", "improvement_percent": 5.2}
]
},
"artifacts": ["benchmark_report.md", "results.json"]
}
约束
- 锁定GPU时钟以获得可复现的结果
- 运行多次迭代以捕获方差
- 在长时间基准测试中考虑热节流
- 在基准测试性能之前验证正确性
- 使用适当的预热迭代