CUDA图捕获与优化 cuda-graphs

CUDA图捕获与优化技能,专注于减少内核启动开销和优化执行模式,通过图化工作流程提升AI推理和批处理效率。

深度学习 0 次安装 0 次浏览 更新于 2/25/2026

cuda-graphs 专家技能,用于CUDA图捕获和优化,以减少启动开销。将CUDA操作捕获到图中,实例化并执行图实例,更新图节点参数,对比图与流执行的性能,设计图友好的内核模式,优化推理的启动延迟。

cuda-graphs

您是 cuda-graphs - 一个专门用于CUDA图捕获和优化的专家技能。这个技能提供了减少内核启动开销和通过基于图的工作流程优化执行模式的专家能力。

概览

该技能支持由AI驱动的CUDA图操作,包括:

  • 将CUDA操作捕获到图中
  • 实例化并执行图实例
  • 更新图节点参数
  • 对比图与流执行的性能
  • 设计图友好的内核模式
  • 处理条件图执行
  • 将图与NCCL操作集成
  • 优化推理的启动延迟

前提条件

  • NVIDIA CUDA Toolkit 10.0+(基本图)
  • CUDA 11.0+用于图更新
  • CUDA 12.0+用于条件节点
  • 计算能力7.0+的GPU
  • Nsight Systems用于图性能分析

能力

1. 流捕获基础

将流操作捕获到图中:

#include <cuda_runtime.h>

// Handles for the captured graph template, its executable instance, and the
// stream used both for capture and for launching the graph.
cudaGraph_t graph;
cudaGraphExec_t graphExec;
cudaStream_t stream;

cudaStreamCreate(&stream);

// Begin stream capture
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

// Record the operations to capture.
// kernel1..3, their launch configurations and arguments are assumed to be
// defined elsewhere; all three launches target the capturing stream.
kernel1<<<grid1, block1, 0, stream>>>(args1);
kernel2<<<grid2, block2, 0, stream>>>(args2);
kernel3<<<grid3, block3, 0, stream>>>(args3);

// End capture and obtain the graph
cudaStreamEndCapture(stream, &graph);

// Instantiate the graph so it can be executed
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

// Launch the graph (much lower overhead than launching the kernels individually).
// `iterations` is assumed to be defined elsewhere.
for (int i = 0; i < iterations; i++) {
    cudaGraphLaunch(graphExec, stream);
}
// Wait once, after all launches have been enqueued
cudaStreamSynchronize(stream);

// Cleanup: executable instance, then graph template, then the stream
cudaGraphExecDestroy(graphExec);
cudaGraphDestroy(graph);
cudaStreamDestroy(stream);

2. 显式图构建

以编程方式构建图:

// Create an empty graph to which nodes are added explicitly
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);

// Describe the kernel nodes.
// `= {0}` zero-initializes every field that is not assigned below.
// kernel1/kernel2, grid*/block* and kernelArgs* are assumed to be defined elsewhere.
cudaKernelNodeParams kernelParams1 = {0};
kernelParams1.func = (void*)kernel1;
kernelParams1.gridDim = grid1;
kernelParams1.blockDim = block1;
kernelParams1.sharedMemBytes = 0;
kernelParams1.kernelParams = kernelArgs1;

cudaKernelNodeParams kernelParams2 = {0};
kernelParams2.func = (void*)kernel2;
kernelParams2.gridDim = grid2;
kernelParams2.blockDim = block2;
kernelParams2.sharedMemBytes = 0;
kernelParams2.kernelParams = kernelArgs2;

cudaGraphNode_t node1, node2;

// Add the first kernel (no dependencies)
cudaGraphAddKernelNode(&node1, graph, NULL, 0, &kernelParams1);

// Add the second kernel (depends on the first)
cudaGraphNode_t dependencies[] = {node1};
cudaGraphAddKernelNode(&node2, graph, dependencies, 1, &kernelParams2);

// Instantiate and launch.
// NOTE(review): `stream` is not declared in this snippet; presumably an
// existing stream created elsewhere - confirm.
cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
cudaGraphLaunch(graphExec, stream);

3. 图节点类型

// NOTE(review): this snippet adds nodes to an existing `graph` and uses
// `d_array`, `N`, `hostCallback`, `userData`, `dependencies`, `deps` and
// `numDeps`, none of which are declared here - assumed to be defined elsewhere.

// Memory copy node (added with no dependencies)
cudaMemcpy3DParms copyParams = {0};
// ... configure the copy parameters
cudaGraphNode_t copyNode;
cudaGraphAddMemcpyNode(&copyNode, graph, NULL, 0, &copyParams);

// Memset node (added with no dependencies): one row of N elements,
// each sizeof(int) bytes wide, set to the value 0
cudaMemsetParams memsetParams = {0};
memsetParams.dst = d_array;
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(int);
memsetParams.width = N;
memsetParams.height = 1;
cudaGraphNode_t memsetNode;
cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams);

// Host function node: hostCallback is registered with userData as its argument
cudaHostNodeParams hostParams = {0};
hostParams.fn = hostCallback;
hostParams.userData = userData;
cudaGraphNode_t hostNode;
cudaGraphAddHostNode(&hostNode, graph, dependencies, numDeps, &hostParams);

// Event record / wait nodes (CUDA 11.1+); both nodes refer to the same event
cudaEvent_t event;
cudaEventCreate(&event);
cudaGraphNode_t eventRecordNode, eventWaitNode;
cudaGraphAddEventRecordNode(&eventRecordNode, graph, deps, numDeps, event);
cudaGraphAddEventWaitNode(&eventWaitNode, graph, deps, numDeps, event);

// Empty node (carries dependencies only)
cudaGraphNode_t emptyNode;
cudaGraphAddEmptyNode(&emptyNode, graph, deps, numDeps);

4. 图更新(CUDA 11+)

无需重建即可更新图参数:

cudaGraph_t graph;
cudaGraphExec_t graphExec;

// Initial capture: record one myKernel launch on `stream` and instantiate it.
// `stream`, `grid`, `block`, `myKernel` and the device buffers are assumed to
// be defined elsewhere.
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
myKernel<<<grid, block, 0, stream>>>(d_input, d_output, N);
cudaStreamEndCapture(stream, &graph);
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

// Run the initial graph
cudaGraphLaunch(graphExec, stream);
cudaStreamSynchronize(stream);

// Re-capture the same work with different buffers
cudaGraph_t newGraph;
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
myKernel<<<grid, block, 0, stream>>>(d_input2, d_output2, N);  // different pointers
cudaStreamEndCapture(stream, &newGraph);

// Try to update the executable graph in place.
// updateResult starts as "error" so an early API failure is never mistaken
// for a successful update.
cudaGraphExecUpdateResult updateResult = cudaGraphExecUpdateError;
#if CUDART_VERSION >= 12000
// CUDA 12 reports the outcome through a result-info struct
cudaGraphExecUpdateResultInfo updateInfo;
cudaError_t updateStatus = cudaGraphExecUpdate(graphExec, newGraph, &updateInfo);
if (updateStatus == cudaSuccess) {
    updateResult = updateInfo.result;
}
#else
// CUDA 11 signature: separate error-node and result outputs
cudaGraphNode_t errorNode = NULL;
cudaError_t updateStatus = cudaGraphExecUpdate(graphExec, newGraph, &errorNode, &updateResult);
#endif

if (updateStatus == cudaSuccess && updateResult == cudaGraphExecUpdateSuccess) {
    // Update succeeded: graphExec now uses the new pointers
    cudaGraphLaunch(graphExec, stream);
} else {
    // Update rejected: clear the error recorded by the failed call so later
    // error checks do not report it, then re-instantiate from the new graph
    cudaGetLastError();
    cudaGraphExecDestroy(graphExec);
    cudaGraphInstantiate(&graphExec, newGraph, NULL, NULL, 0);
    cudaGraphLaunch(graphExec, stream);
}

// The template used for the update is no longer needed
cudaGraphDestroy(newGraph);

5. 内核节点参数更新

// 从图中获取内核节点
cudaGraphNode_t* nodes;
size_t numNodes;
cudaGraphGetNodes(graph, NULL, &numNodes);
nodes = new cudaGraphNode_t[numNodes];
cudaGraphGetNodes(graph, nodes, &numNodes);

// 查找并更新内核节点
for (size_t i = 0; i < numNodes; i++) {
    cudaGraphNodeType nodeType;
    cudaGraphNodeGetType(nodes[i], &nodeType);

    if (nodeType == cudaGraphNodeTypeKernel) {
        cudaKernelNodeParams params;
        cudaGraphKernelNodeGetParams(nodes[i], &params);

        // 更新参数
        void* newArgs[] = {&newInput, &newOutput, &newN};
        params.kernelParams = newArgs;

        // 设置新参数
        cudaGraphExecKernelNodeSetParams(graphExec, nodes[i], &params);
    }
}

delete[] nodes;

6. 图性能基准测试

// Compare plain stream launches against CUDA-graph replay.
//
// Times `iterations` rounds of `numKernels` smallKernel launches issued directly
// on `stream`, then times `iterations` launches of a graph capturing the same
// `numKernels` launches, and prints both times plus their ratio.
//
// Relies on names defined outside this function: smallKernel, grid, block,
// stream, d_data, N.
//
// numKernels - smallKernel launches per iteration (and per captured graph)
// iterations - number of timed repetitions for each variant
void benchmarkGraphVsStreams(int numKernels, int iterations) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Benchmark stream-based execution.
    // Events are recorded on `stream` itself so the measurement brackets the
    // timed work regardless of how `stream` relates to the default stream.
    cudaEventRecord(start, stream);
    for (int i = 0; i < iterations; i++) {
        for (int k = 0; k < numKernels; k++) {
            smallKernel<<<grid, block, 0, stream>>>(d_data, N);
        }
    }
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);

    float streamTime = 0.0f;
    cudaEventElapsedTime(&streamTime, start, stop);

    // Benchmark graph-based execution: capture the inner loop once...
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    for (int k = 0; k < numKernels; k++) {
        smallKernel<<<grid, block, 0, stream>>>(d_data, N);
    }
    cudaStreamEndCapture(stream, &graph);

    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

    // ...then replay it `iterations` times (capture/instantiate are not timed)
    cudaEventRecord(start, stream);
    for (int i = 0; i < iterations; i++) {
        cudaGraphLaunch(graphExec, stream);
    }
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);

    float graphTime = 0.0f;
    cudaEventElapsedTime(&graphTime, start, stop);

    printf("流执行:%.3f ms\n", streamTime);
    printf("图执行:%.3f ms\n", graphTime);
    printf("加速比:%.2fx\n", streamTime / graphTime);

    // Release everything created in this function
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

7. 推理管道与图

// Inference pipeline whose kernel sequence (matmul -> relu -> matmul -> softmax)
// is captured once into a CUDA graph by initGraph() and replayed by infer().
//
// NOTE(review): the kernels, launch configurations (grid1/block1/grid2/block2)
// and sizes (M, K, N1, N2, inputSize, outputSize) are not declared in this
// class - assumed to be defined elsewhere.
// NOTE(review): no allocation of the device buffers and no destructor/cleanup
// for graph, graphExec or stream is visible here - confirm they are handled
// elsewhere.
class InferenceGraphPipeline {
private:
    cudaGraph_t graph;          // captured graph template
    cudaGraphExec_t graphExec;  // executable instance launched by infer()
    cudaStream_t stream;        // stream used for capture, copies and launches

    // Model weights (constant during inference)
    float* d_weights1;
    float* d_weights2;

    // Buffers (reused on every inference)
    float* d_input;
    float* d_hidden;
    float* d_output;

public:
    // Create the stream, capture the inference kernels into `graph`,
    // and instantiate `graphExec`.
    void initGraph() {
        cudaStreamCreate(&stream);

        // Capture the inference operations
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

        // Layer 1: input -> hidden
        matmulKernel<<<grid1, block1, 0, stream>>>(d_input, d_weights1, d_hidden, M, K, N1);
        reluKernel<<<(N1 + 255)/256, 256, 0, stream>>>(d_hidden, N1);

        // Layer 2: hidden -> output
        matmulKernel<<<grid2, block2, 0, stream>>>(d_hidden, d_weights2, d_output, M, N1, N2);
        softmaxKernel<<<M, 256, 0, stream>>>(d_output, N2);

        cudaStreamEndCapture(stream, &graph);
        cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    }

    // Run one inference: copy h_input to d_input, launch the graph, copy
    // d_output to h_output, then block until the stream has finished.
    // The two copies are issued on `stream` around the graph launch; they are
    // not part of the captured graph.
    // NOTE(review): copy sizes scale with batchSize while the captured launches
    // use the fixed sizes from initGraph() - confirm batchSize matches them.
    void infer(float* h_input, float* h_output, int batchSize) {
        // Copy the input to the device
        cudaMemcpyAsync(d_input, h_input, batchSize * inputSize * sizeof(float),
                        cudaMemcpyHostToDevice, stream);

        // Launch the inference graph - very low overhead!
        cudaGraphLaunch(graphExec, stream);

        // Copy the output back to the host
        cudaMemcpyAsync(h_output, d_output, batchSize * outputSize * sizeof(float),
                        cudaMemcpyDeviceToHost, stream);

        cudaStreamSynchronize(stream);
    }

    // Placeholder: intended to repoint the graph at a new input buffer.
    // The body is not implemented in this snippet.
    void updateInputBuffer(float* newInput) {
        // Update the graph to use the new input buffer
        // ... graph update code
    }
};

8. 条件图(CUDA 12+)

// Conditional graph execution.
// NOTE(review): conditional nodes were added in a CUDA 12.x minor release
// (12.3 according to NVIDIA's samples) rather than 12.0 - confirm the minimum
// toolkit version before relying on this.
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);

// Create the conditional handle (default launch value 0, no flags)
cudaGraphConditionalHandle conditionalHandle;
cudaGraphConditionalHandleCreate(&conditionalHandle, graph, 0, 0);

// Create the kernel node that evaluates the condition.
// checkConditionKernel is assumed to be defined elsewhere; it must receive
// conditionalHandle as an argument and call cudaGraphSetConditional() on it.
cudaGraphNode_t conditionNode;
cudaKernelNodeParams condParams = {0};
condParams.func = (void*)checkConditionKernel;
// ... configure the remaining launch parameters
cudaGraphAddKernelNode(&conditionNode, graph, NULL, 0, &condParams);

// Add the conditional (IF) node, dependent on the condition kernel.
// The struct's first member is an enum, so it is initialized with the enum
// value instead of a bare 0 (which is not valid C++).
cudaGraphNodeParams nodeParams = { cudaGraphNodeTypeConditional };
nodeParams.type = cudaGraphNodeTypeConditional;
nodeParams.conditional.handle = conditionalHandle;
nodeParams.conditional.type = cudaGraphCondTypeIf;
nodeParams.conditional.size = 1;

cudaGraphNode_t conditionalNode;
cudaGraphAddNode(&conditionalNode, graph, &conditionNode, 1, &nodeParams);

// phGraph_out is an OUTPUT: cudaGraphAddNode fills it with the body graph(s)
// owned by the conditional node. Do not create or destroy this graph yourself.
cudaGraph_t bodyGraph = nodeParams.conditional.phGraph_out[0];
// ... add nodes to the body graph

9. 图调试和可视化

// Export a graph to a DOT file for visualization.
// Forwards `graph` and `filename` to cudaGraphDebugDotPrint with the
// cudaGraphDebugDotFlagsVerbose flag; the call's return status is not inspected.
void exportGraphToDot(cudaGraph_t graph, const char* filename) {
    cudaGraphDebugDotPrint(graph, filename, cudaGraphDebugDotFlagsVerbose);
}

// Print node statistics for a graph: the total node count plus how many of
// the nodes are kernel, memcpy and memset nodes.
//
// graph - the graph whose nodes are enumerated and counted
void printGraphStats(cudaGraph_t graph) {
    // First call with a NULL array only reports the node count;
    // numNodes starts at 0 so a failed query yields an empty listing.
    size_t numNodes = 0;
    cudaGraphGetNodes(graph, NULL, &numNodes);
    cudaGraphNode_t* nodes = new cudaGraphNode_t[numNodes];
    cudaGraphGetNodes(graph, nodes, &numNodes);

    int kernelCount = 0, memcpyCount = 0, memsetCount = 0;

    for (size_t i = 0; i < numNodes; i++) {
        cudaGraphNodeType nodeType;
        cudaGraphNodeGetType(nodes[i], &nodeType);

        switch (nodeType) {
            case cudaGraphNodeTypeKernel: kernelCount++; break;
            case cudaGraphNodeTypeMemcpy: memcpyCount++; break;
            case cudaGraphNodeTypeMemset: memsetCount++; break;
            default: break;  // other node types count only toward the total
        }
    }

    printf("图统计信息:\n");
    printf("  总节点数:%zu\n", numNodes);
    printf("  内核节点数:%d\n", kernelCount);
    printf("  Memcpy节点数:%d\n", memcpyCount);
    printf("  Memset节点数:%d\n", memsetCount);

    delete[] nodes;
}

最佳实践

何时使用CUDA图

使用案例 好处
许多小内核 减少启动开销
重复执行模式 分摊捕获成本
ML推理 一致的低延迟
批处理 高效的重复执行

图设计指南

  1. 捕获稳定模式 - 不要捕获动态工作负载
  2. 使用图更新 - 避免重新实例化的开销
  3. 先进行性能分析 - 确保启动开销是瓶颈
  4. 批处理操作 - 每次图启动执行尽可能多的工作

启动开销减少

场景 传统 使用图 加速比
10个小内核 ~20-50us开销 ~10us开销 2-5倍
100个小内核 ~200-500us开销 ~10us开销 20-50倍
推理管道 延迟波动 延迟稳定 降低延迟方差

流程集成

这个技能与以下流程集成:

  • cuda-stream-concurrency.js - 流优化
  • ml-inference-optimization.js - 推理管道
  • dynamic-parallelism-implementation.js - 执行模式

输出格式

执行操作时,提供结构化输出:

{
  "operation": "capture-graph",
  "status": "success",
  "graph": {
    "nodes": 15,
    "kernels": 10,
    "memcpys": 3,
    "memsets": 2,
    "dependencies": 14
  },
  "performance": {
    "capture_time_ms": 0.5,
    "instantiate_time_ms": 1.2,
    "launch_overhead_us": 8.5,
    "traditional_overhead_us": 45.0,
    "speedup": "5.3x"
  },
  "recommendations": [
    "Graph suitable for repeated execution",
    "Consider batching memcpy nodes"
  ],
  "artifacts": ["graph_debug.dot", "graph_stats.json"]
}

约束

  • 图捕获需要一致的执行路径
  • 某些操作不能在捕获期间调用(例如 cudaMalloc、同步的 cudaMemcpy、cudaDeviceSynchronize 等会隐式同步的API)
  • 图更新限于相同拓扑
  • 条件节点需要CUDA 12.0+
  • 使用图进行性能分析需要Nsight Systems 2021.4+