名称: opencl-runtime
描述: 跨厂商OpenCL运行时管理与内核开发。查询平台/设备,生成可移植的OpenCL C内核代码,处理厂商特定扩展,管理上下文和命令队列,编译和缓存程序。
允许工具: Bash(*) 读取 写入 编辑 全局搜索 Grep 网络获取
元数据:
  作者: babysitter-sdk
  版本: "1.0.0"
  类别: opencl-development
  待办事项ID: SK-003
opencl-runtime
您是 opencl-runtime - 一个专用于跨厂商OpenCL运行时管理和内核开发的技能。此技能提供了在NVIDIA、AMD和Intel平台上进行可移植GPU编程的专家级能力。
概述
此技能支持AI驱动的OpenCL开发操作,包括:
- 查询和枚举OpenCL平台/设备
- 生成可移植的OpenCL C内核代码
- 处理厂商特定扩展和变通方案
- 管理OpenCL上下文和命令队列
- 编译和缓存OpenCL程序/二进制文件
- 配置NDRange和工作组维度
- 验证OpenCL内存对象使用
- 支持OpenCL 1.2、2.0和3.0规范
前提条件
- OpenCL SDK(NVIDIA、AMD或Intel)
- OpenCL ICD加载器
- 支持OpenCL的GPU或CPU
- clinfo实用程序(用于设备枚举)
能力
1. 平台和设备枚举
查询可用的OpenCL资源:
// 查询平台
cl_uint numPlatforms;
clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = malloc(numPlatforms * sizeof(cl_platform_id));
clGetPlatformIDs(numPlatforms, platforms, NULL);
// 获取平台信息
char platformName[128];
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, 128, platformName, NULL);
// 查询设备
cl_uint numDevices;
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
cl_device_id* devices = malloc(numDevices * sizeof(cl_device_id));
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
# Use the clinfo utility
clinfo --list
# Detailed device information
clinfo -a
2. OpenCL内核代码生成
生成可移植内核:
// Basic kernel pattern: element-wise addition, c[i] = a[i] + b[i].
// Each work-item handles the element at its dimension-0 global id;
// work-items whose id is not below n do nothing.
__kernel void vectorAdd(
    __global const float* a,
    __global const float* b,
    __global float* c,
    const int n)
{
    const int idx = get_global_id(0);
    if (idx >= n) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
// 2D kernel pattern: one work-item computes one element of C.
// A is indexed as [row * K + k], B as [k * N + col] and C as [row * N + col];
// dimension 0 selects the row and dimension 1 the column.
__kernel void matrixMultiply(
    __global const float* A,
    __global const float* B,
    __global float* C,
    const int M, const int N, const int K)
{
    const int row = get_global_id(0);
    const int col = get_global_id(1);

    // Work-items outside the M x N output do nothing.
    if (row >= M || col >= N) {
        return;
    }

    float acc = 0.0f;
    int k = 0;
    while (k < K) {
        acc += A[row * K + k] * B[k * N + col];
        ++k;
    }
    C[row * N + col] = acc;
}
// Local-memory kernel: per-work-group sum reduction.
// Every work-group writes one partial sum to output[get_group_id(0)].
// localData is indexed by local id, so it needs one float per work-item
// of the group. Works for any work-group size, not only powers of two.
__kernel void reductionSum(
    __global const float* input,
    __global float* output,
    __local float* localData,
    const int n)
{
    int gid = get_global_id(0);
    int lid = get_local_id(0);
    int groupSize = get_local_size(0);

    // Work-items past the end of the input contribute 0 to the sum.
    localData[lid] = (gid < n) ? input[gid] : 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction. `active` is the number of live partial sums. Each round
    // folds the upper floor(active / 2) entries onto the lower ones and keeps
    // ceil(active / 2) entries, so the middle element of an odd-sized range is
    // carried over instead of being dropped. For power-of-two sizes this is
    // the usual halving scheme.
    for (int active = groupSize; active > 1; ) {
        int kept = (active + 1) / 2;
        if (lid < active / 2) {
            localData[lid] += localData[lid + kept];
        }
        // The loop bounds depend only on groupSize, so every work-item of the
        // group reaches this barrier the same number of times.
        barrier(CLK_LOCAL_MEM_FENCE);
        active = kept;
    }

    if (lid == 0) {
        output[get_group_id(0)] = localData[0];
    }
}
3. 上下文和命令队列管理
创建和管理OpenCL上下文:
// 创建上下文
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
// 创建命令队列(OpenCL 1.x)
cl_command_queue queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
// 创建命令队列(OpenCL 2.0+)
cl_queue_properties props[] = {
CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
0
};
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, props, &err);
4. 程序编译和缓存
编译和缓存OpenCL程序:
// Create the program object from source text.
const char* source = loadKernelSource("kernel.cl");
cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &err);

// Build it with compiler options.
const char* options = "-cl-fast-relaxed-math -cl-mad-enable";
err = clBuildProgram(program, 1, &device, options, NULL, NULL);

// On failure, fetch and print the build log.
if (err != CL_SUCCESS) {
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    // One extra byte so the buffer is always NUL-terminated for printf.
    char* log = malloc(logSize + 1);
    if (log != NULL) {
        log[0] = '\0';
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
        log[logSize] = '\0';
        printf("构建错误:\n%s\n", log);
        free(log);
    }
}

// Fetch the compiled binary for caching; only a successful build has one.
// sizeof(size_t) / sizeof(pointer) is enough because the program was built
// for a single device.
size_t binarySize = 0;
unsigned char* binary = NULL;
if (err == CL_SUCCESS &&
    clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
                     sizeof binarySize, &binarySize, NULL) == CL_SUCCESS &&
    binarySize > 0) {
    binary = malloc(binarySize);
}
if (binary != NULL &&
    clGetProgramInfo(program, CL_PROGRAM_BINARIES,
                     sizeof binary, &binary, NULL) == CL_SUCCESS) {
    saveBinaryToFile("kernel.bin", binary, binarySize);
}

// Load the cached binary into a new program object.
cl_program programFromBinary = NULL;
if (binary != NULL) {
    cl_int binaryStatus = CL_SUCCESS;  // per-device load result
    programFromBinary = clCreateProgramWithBinary(
        context, 1, &device, &binarySize, (const unsigned char**)&binary,
        &binaryStatus, &err);
    if (err == CL_SUCCESS) {
        // A program created from a binary still has to be built before
        // kernels can be created from it.
        err = clBuildProgram(programFromBinary, 1, &device, options, NULL, NULL);
    }
}
5. NDRange配置
配置工作维度:
// 1D NDRange
size_t globalSize = ((n + 255) / 256) * 256;  // round up to a multiple of the work-group size
size_t localSize = 256;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);

// 2D NDRange
// Each dimension is rounded up to a multiple of the 16x16 work-group, as in
// the 1D case: OpenCL 1.x rejects a global size that is not evenly divisible
// by the local size. The padding work-items must be masked off by a bounds
// check inside the kernel.
size_t globalSize2D[2] = {((width + 15) / 16) * 16, ((height + 15) / 16) * 16};
size_t localSize2D[2] = {16, 16};
clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize2D, localSize2D, 0, NULL, NULL);

// Query the maximum work-group size (stays 0 if the query fails).
size_t maxWorkGroupSize = 0;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL);
6. 内存对象管理
创建和管理缓冲区:
// 创建缓冲区
cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &err);
cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
size, hostDataB, &err);
cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size, NULL, &err);
// 写入缓冲区
clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, size, hostDataA, 0, NULL, NULL);
// 从缓冲区读取
clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, size, hostResult, 0, NULL, NULL);
// 映射缓冲区以直接访问
float* mappedPtr = clEnqueueMapBuffer(queue, bufferA, CL_TRUE, CL_MAP_WRITE,
0, size, 0, NULL, NULL, &err);
// ... 修改数据 ...
clEnqueueUnmapMemObject(queue, bufferA, mappedPtr, 0, NULL, NULL);
7. 厂商扩展
处理厂商特定功能:
// Check extensions: fetch the device's extension string.
// The buffer starts empty and is reset on failure (for example when the
// string does not fit), so strstr never scans uninitialised memory.
char extensions[4096] = "";
if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL) != CL_SUCCESS) {
    extensions[0] = '\0';
}

if (strstr(extensions, "cl_khr_fp16")) {
    // Half precision is available
}

if (strstr(extensions, "cl_nv_device_attribute_query")) {
    // NVIDIA-specific queries are available.
    // This attribute is the major compute-capability number, not an SM count.
    cl_uint computeCapabilityMajor = 0;
    clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
                    sizeof(cl_uint), &computeCapabilityMajor, NULL);
}

// AMD-specific
if (strstr(extensions, "cl_amd_device_attribute_query")) {
    cl_uint simdPerCU = 0;
    clGetDeviceInfo(device, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD,
                    sizeof(cl_uint), &simdPerCU, NULL);
}
8. OpenCL版本支持
支持多个OpenCL版本:
// Query the OpenCL version string of the device.
char version[128] = "";
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);

// OpenCL 2.0+ features
#ifdef CL_VERSION_2_0
// Shared virtual memory.
// Capabilities start at 0 so a failed query (for example on a device that
// does not know this parameter) reads as "no SVM support".
cl_device_svm_capabilities svmCaps = 0;
clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(svmCaps), &svmCaps, NULL);
if (svmCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
    void* svmPtr = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
    if (svmPtr != NULL &&
        clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, svmPtr, size, 0, NULL, NULL) == CL_SUCCESS) {
        // ... host writes through svmPtr ...
        clEnqueueSVMUnmap(queue, svmPtr, 0, NULL, NULL);
        // Make sure the unmap has completed before the allocation is released.
        clFinish(queue);
    }
    // Releasing a NULL pointer is a no-op, so no extra guard is needed.
    clSVMFree(context, svmPtr);
}
#endif

// OpenCL 3.0 optional features
#ifdef CL_VERSION_3_0
cl_device_atomic_capabilities atomicCaps = 0;
clGetDeviceInfo(device, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES,
                sizeof(atomicCaps), &atomicCaps, NULL);
#endif
流程集成
此技能与以下流程集成:
- opencl-application-development.js - OpenCL应用程序开发
- hip-porting-cross-platform.js - 跨平台移植
输出格式
{
"operation": "enumerate-devices",
"status": "success",
"platforms": [
{
"name": "NVIDIA CUDA",
"version": "OpenCL 3.0 CUDA",
"devices": [
{
"name": "NVIDIA GeForce RTX 4090",
"type": "GPU",
"computeUnits": 128,
"maxWorkGroupSize": 1024,
"globalMemory": "24 GB",
"extensions": ["cl_khr_fp16", "cl_khr_fp64"]
}
]
}
]
}
依赖项
- OpenCL SDK(NVIDIA、AMD或Intel)
- OpenCL ICD加载器
- clinfo实用程序
约束
- OpenCL 2.0+功能并非在所有平台上都可用
- 厂商扩展不可移植
- 二进制缓存需要相同的设备/驱动程序
- SVM需要OpenCL 2.0+和设备支持