Ray 分布式训练器

概述

使用Ray在集群中进行并行训练、超参数搜索和资源管理的分布式计算技能。

能力

使用Ray Train进行分布式训练
使用Ray Tune进行大规模超参数搜索
集群资源管理
容错与检查点
基于Actor的并行处理
与PyTorch和TensorFlow集成
弹性训练支持
多节点编排

目标流程

分布式训练编排
自动机器学习（AutoML）管道编排
模型训练管道

工具与库

Ray
Ray Train
Ray Tune
Ray Cluster

输入模式（JSON Schema）

{
  "type": "object",
  "required": ["mode", "config"],
  "properties": {
    "mode": {
      "type": "string",
      "enum": ["train", "tune", "cluster"],
      "description": "Ray操作模式"
    },
    "config": {
      "type": "object",
      "properties": {
        "numWorkers": { "type": "integer", "minimum": 1 },
        "useGpu": { "type": "boolean" },
        "resourcesPerWorker": {
          "type": "object",
          "properties": {
            "cpu": { "type": "number", "minimum": 0 },
            "gpu": { "type": "number", "minimum": 0 }
          }
        }
      }
    },
    "trainConfig": {
      "type": "object",
      "properties": {
        "trainerPath": { "type": "string" },
        "framework": { "type": "string", "enum": ["pytorch", "tensorflow", "xgboost"] },
        "scalingConfig": { "type": "object" }
      }
    },
    "tuneConfig": {
      "type": "object",
      "properties": {
        "searchSpace": { "type": "object" },
        "scheduler": { "type": "string" },
        "numSamples": { "type": "integer" },
        "metric": { "type": "string" },
        "mode": { "type": "string", "enum": ["min", "max"] }
      }
    }
  }
}

输出模式（JSON Schema）

{
  "type": "object",
  "required": ["status", "results"],
  "properties": {
    "status": {
      "type": "string",
      "enum": ["success", "error", "partial"]
    },
    "results": {
      "type": "object",
      "properties": {
        "bestConfig": { "type": "object" },
        "bestMetric": { "type": "number" },
        "numTrials": { "type": "integer", "minimum": 0 },
        "completedTrials": { "type": "integer", "minimum": 0 }
      }
    },
    "checkpointPath": {
      "type": "string"
    },
    "clusterStatus": {
      "type": "object",
      "properties": {
        "numNodes": { "type": "integer", "minimum": 0 },
        "totalCpu": { "type": "number", "minimum": 0 },
        "totalGpu": { "type": "number", "minimum": 0 }
      }
    },
    "trainingTime": {
      "type": "number",
      "minimum": 0
    }
  }
}

使用示例

{
  kind: 'skill',
  title: '分布式超参数调优',
  skill: {
    name: 'ray-distributed-trainer',
    context: {
      mode: 'tune',
      config: {
        numWorkers: 4,
        useGpu: true,
        resourcesPerWorker: { cpu: 2, gpu: 1 }
      },
      tuneConfig: {
        searchSpace: {
          lr: { type: 'loguniform', min: 1e-5, max: 1e-1 },
          batchSize: { type: 'choice', values: [16, 32, 64] }
        },
        scheduler: 'asha',
        numSamples: 100,
        metric: 'val_loss',
        mode: 'min'
      }
    }
  }
}