名称: mlflow
描述: 跟踪ML实验、管理模型注册表、部署模型到生产,并用MLflow重现实验——框架无关的ML生命周期平台
版本: 1.0.0
作者: Orchestra Research
许可证: MIT
标签: [MLOps, MLflow, 实验跟踪, 模型注册表, ML生命周期, 部署, 模型版本控制, PyTorch, TensorFlow, Scikit-Learn, HuggingFace]
依赖项: [mlflow, sqlalchemy, boto3]
MLflow: 机器学习生命周期管理平台
何时使用此技能
当您需要以下功能时使用MLflow:
- 跟踪ML实验,包括参数、指标和工件
- 管理模型注册表,支持版本控制和阶段转换
- 部署模型到各种平台(本地、云端、服务端)
- 使用项目配置重现实验
- 比较模型版本和性能指标
- 通过团队工作流协作开展ML项目
- 与任何ML框架集成(框架无关)
用户: 20,000+ 组织 | GitHub星标: 23k+ | 许可证: Apache 2.0
安装
# 安装MLflow
pip install mlflow
# 安装额外依赖
pip install mlflow[extras] # 包括SQLAlchemy、boto3等
# 启动MLflow UI
mlflow ui
# 访问 http://localhost:5000
快速开始
基础跟踪
import mlflow
# 开始一个运行
with mlflow.start_run():
# 记录参数
mlflow.log_param("learning_rate", 0.001)
mlflow.log_param("batch_size", 32)
# 您的训练代码
model = train_model()
# 记录指标
mlflow.log_metric("train_loss", 0.15)
mlflow.log_metric("val_accuracy", 0.92)
# 记录模型
mlflow.sklearn.log_model(model, "model")
自动记录(自动跟踪)
import mlflow
from sklearn.ensemble import RandomForestClassifier
# 启用自动记录
mlflow.autolog()
# 训练(自动记录)
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(X_train, y_train)
# 指标、参数和模型自动记录!
核心概念
1. 实验和运行
实验: 相关运行的逻辑容器
运行: ML代码的单次执行(参数、指标、工件)
import mlflow
# 创建/设置实验
mlflow.set_experiment("my-experiment")
# 开始一个运行
with mlflow.start_run(run_name="baseline-model"):
# 记录参数
mlflow.log_param("model", "ResNet50")
mlflow.log_param("epochs", 10)
# 训练
model = train()
# 记录指标
mlflow.log_metric("accuracy", 0.95)
# 记录模型
mlflow.pytorch.log_model(model, "model")
# 运行ID自动生成
print(f"Run ID: {mlflow.active_run().info.run_id}")
2. 记录参数
with mlflow.start_run():
# 单个参数
mlflow.log_param("learning_rate", 0.001)
# 多个参数
mlflow.log_params({
"batch_size": 32,
"epochs": 50,
"optimizer": "Adam",
"dropout": 0.2
})
# 嵌套参数(作为字典)
config = {
"model": {
"architecture": "ResNet50",
"pretrained": True
},
"training": {
"lr": 0.001,
"weight_decay": 1e-4
}
}
# 作为JSON字符串或单独参数记录
for key, value in config.items():
mlflow.log_param(key, str(value))
3. 记录指标
with mlflow.start_run():
# 训练循环
for epoch in range(NUM_EPOCHS):
train_loss = train_epoch()
val_loss = validate()
# 每步记录指标
mlflow.log_metric("train_loss", train_loss, step=epoch)
mlflow.log_metric("val_loss", val_loss, step=epoch)
# 记录多个指标
mlflow.log_metrics({
"train_accuracy": train_acc,
"val_accuracy": val_acc
}, step=epoch)
# 记录最终指标(无步数)
mlflow.log_metric("final_accuracy", final_acc)
4. 记录工件
with mlflow.start_run():
# 记录文件
model.save('model.pkl')
mlflow.log_artifact('model.pkl')
# 记录目录
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/loss_curve.png')
mlflow.log_artifacts('plots')
# 记录文本
with open('config.txt', 'w') as f:
f.write(str(config))
mlflow.log_artifact('config.txt')
# 记录字典为JSON
mlflow.log_dict({'config': config}, 'config.json')
5. 记录模型
# PyTorch
import mlflow.pytorch
with mlflow.start_run():
model = train_pytorch_model()
mlflow.pytorch.log_model(model, "model")
# Scikit-learn
import mlflow.sklearn
with mlflow.start_run():
model = train_sklearn_model()
mlflow.sklearn.log_model(model, "model")
# Keras/TensorFlow
import mlflow.keras
with mlflow.start_run():
model = train_keras_model()
mlflow.keras.log_model(model, "model")
# HuggingFace Transformers
import mlflow.transformers
with mlflow.start_run():
mlflow.transformers.log_model(
transformers_model={
"model": model,
"tokenizer": tokenizer
},
artifact_path="model"
)
自动记录
自动为流行框架记录指标、参数和模型。
启用自动记录
import mlflow
# 为所有支持的框架启用
mlflow.autolog()
# 或为特定框架启用
mlflow.sklearn.autolog()
mlflow.pytorch.autolog()
mlflow.keras.autolog()
mlflow.xgboost.autolog()
Scikit-learn自动记录
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# 启用自动记录
mlflow.sklearn.autolog()
# 分割数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# 训练(自动记录参数、指标、模型)
with mlflow.start_run():
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)
# 指标如accuracy、f1_score自动记录
# 模型自动记录
# 训练时长自动记录
PyTorch Lightning自动记录
import mlflow
import pytorch_lightning as pl
# 启用自动记录
mlflow.pytorch.autolog()
# 训练
with mlflow.start_run():
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model, datamodule=dm)
# 超参数记录
# 训练指标记录
# 最佳模型检查点记录
模型注册表
用版本控制和阶段转换管理模型生命周期。
注册模型
import mlflow
# 记录并注册模型
with mlflow.start_run():
model = train_model()
# 记录模型
mlflow.sklearn.log_model(
model,
"model",
registered_model_name="my-classifier" # 立即注册
)
# 或稍后注册
run_id = "abc123"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri, "my-classifier")
模型阶段
在阶段间转换模型:None(无)→ Staging(暂存)→ Production(生产)→ Archived(归档)
from mlflow.tracking import MlflowClient
client = MlflowClient()
# 提升到暂存
client.transition_model_version_stage(
name="my-classifier",
version=3,
stage="Staging"
)
# 提升到生产
client.transition_model_version_stage(
name="my-classifier",
version=3,
stage="Production",
archive_existing_versions=True # 归档旧的生产版本
)
# 归档模型
client.transition_model_version_stage(
name="my-classifier",
version=2,
stage="Archived"
)
从注册表加载模型
import mlflow.pyfunc
# 加载最新生产模型
model = mlflow.pyfunc.load_model("models:/my-classifier/Production")
# 加载特定版本
model = mlflow.pyfunc.load_model("models:/my-classifier/3")
# 从暂存加载
model = mlflow.pyfunc.load_model("models:/my-classifier/Staging")
# 使用模型
predictions = model.predict(X_test)
模型版本控制
client = MlflowClient()
# 列出所有版本
versions = client.search_model_versions("name='my-classifier'")
for v in versions:
print(f"版本 {v.version}: {v.current_stage}")
# 按阶段获取最新版本
latest_prod = client.get_latest_versions("my-classifier", stages=["Production"])
latest_staging = client.get_latest_versions("my-classifier", stages=["Staging"])
# 获取模型版本详情
version_info = client.get_model_version(name="my-classifier", version="3")
print(f"运行ID: {version_info.run_id}")
print(f"阶段: {version_info.current_stage}")
print(f"标签: {version_info.tags}")
模型注解
client = MlflowClient()
# 添加描述
client.update_model_version(
name="my-classifier",
version="3",
description="在1M图像上训练的ResNet50分类器,准确率95%"
)
# 添加标签
client.set_model_version_tag(
name="my-classifier",
version="3",
key="validation_status",
value="approved"
)
client.set_model_version_tag(
name="my-classifier",
version="3",
key="deployed_date",
value="2025-01-15"
)
搜索运行
以编程方式查找运行。
from mlflow.tracking import MlflowClient
client = MlflowClient()
# 搜索实验中的所有运行
experiment_id = client.get_experiment_by_name("my-experiment").experiment_id
runs = client.search_runs(
experiment_ids=[experiment_id],
filter_string="metrics.accuracy > 0.9",
order_by=["metrics.accuracy DESC"],
max_results=10
)
for run in runs:
print(f"运行ID: {run.info.run_id}")
print(f"准确率: {run.data.metrics['accuracy']}")
print(f"参数: {run.data.params}")
# 复杂过滤器搜索
runs = client.search_runs(
experiment_ids=[experiment_id],
filter_string="""
metrics.accuracy > 0.9 AND
params.model = 'ResNet50' AND
tags.dataset = 'ImageNet'
""",
order_by=["metrics.f1_score DESC"]
)
集成示例
PyTorch
import mlflow
import torch
import torch.nn as nn
# 启用自动记录
mlflow.pytorch.autolog()
with mlflow.start_run():
# 记录配置
config = {
"lr": 0.001,
"epochs": 10,
"batch_size": 32
}
mlflow.log_params(config)
# 训练
model = create_model()
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
for epoch in range(config["epochs"]):
train_loss = train_epoch(model, optimizer, train_loader)
val_loss, val_acc = validate(model, val_loader)
# 记录指标
mlflow.log_metrics({
"train_loss": train_loss,
"val_loss": val_loss,
"val_accuracy": val_acc
}, step=epoch)
# 记录模型
mlflow.pytorch.log_model(model, "model")
HuggingFace Transformers
import mlflow
from transformers import Trainer, TrainingArguments
# 启用自动记录
mlflow.transformers.autolog()
training_args = TrainingArguments(
output_dir="./results",
num_train_epochs=3,
per_device_train_batch_size=16,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True
)
# 开始MLflow运行
with mlflow.start_run():
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset
)
# 训练(自动记录)
trainer.train()
# 记录最终模型到注册表
mlflow.transformers.log_model(
transformers_model={
"model": trainer.model,
"tokenizer": tokenizer
},
artifact_path="model",
registered_model_name="hf-classifier"
)
XGBoost
import mlflow
import xgboost as xgb
# 启用自动记录
mlflow.xgboost.autolog()
with mlflow.start_run():
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
params = {
'max_depth': 6,
'learning_rate': 0.1,
'objective': 'binary:logistic',
'eval_metric': ['logloss', 'auc']
}
# 训练(自动记录)
model = xgb.train(
params,
dtrain,
num_boost_round=100,
evals=[(dtrain, 'train'), (dval, 'val')],
early_stopping_rounds=10
)
# 模型和指标自动记录
最佳实践
1. 用实验组织
# ✅ 好: 为不同任务分离实验
mlflow.set_experiment("sentiment-analysis")
mlflow.set_experiment("image-classification")
mlflow.set_experiment("recommendation-system")
# ❌ 坏: 所有内容在一个实验
mlflow.set_experiment("all-models")
2. 使用描述性运行名称
# ✅ 好: 描述性名称
with mlflow.start_run(run_name="resnet50-imagenet-lr0.001-bs32"):
train()
# ❌ 坏: 无名称(只有自动生成的随机名称,难以辨认)
with mlflow.start_run():
train()
3. 记录全面元数据
with mlflow.start_run():
# 记录超参数
mlflow.log_params({
"learning_rate": 0.001,
"batch_size": 32,
"epochs": 50
})
# 记录系统信息
mlflow.set_tags({
"dataset": "ImageNet",
"framework": "PyTorch 2.0",
"gpu": "A100",
"git_commit": get_git_commit()
})
# 记录数据信息
mlflow.log_param("train_samples", len(train_dataset))
mlflow.log_param("val_samples", len(val_dataset))
4. 跟踪模型谱系
# 链接运行以理解谱系
with mlflow.start_run(run_name="preprocessing"):
data = preprocess()
mlflow.log_artifact("data.csv")
preprocessing_run_id = mlflow.active_run().info.run_id
with mlflow.start_run(run_name="training"):
# 通过标签引用上游的预处理运行
mlflow.set_tag("preprocessing_run_id", preprocessing_run_id)
model = train(data)
5. 使用模型注册表进行部署
# ✅ 好: 使用注册表进行生产
model_uri = "models:/my-classifier/Production"
model = mlflow.pyfunc.load_model(model_uri)
# ❌ 坏: 硬编码运行ID
model_uri = "runs:/abc123/model"
model = mlflow.pyfunc.load_model(model_uri)
部署
本地服务模型
# 为已注册的模型提供服务
mlflow models serve -m "models:/my-classifier/Production" -p 5001
# 为某次运行中记录的模型提供服务
mlflow models serve -m "runs:/<RUN_ID>/model" -p 5001
# 测试端点
curl http://127.0.0.1:5001/invocations -H 'Content-Type: application/json' -d '{
"inputs": [[1.0, 2.0, 3.0, 4.0]]
}'
部署到云端
# 部署到AWS SageMaker
mlflow sagemaker deploy -m "models:/my-classifier/Production" --region-name us-west-2
# 部署到Azure ML
mlflow azureml deploy -m "models:/my-classifier/Production"
配置
跟踪服务器
# 启动带后端存储的跟踪服务器
mlflow server \
--backend-store-uri postgresql://user:password@localhost/mlflow \
--default-artifact-root s3://my-bucket/mlflow \
--host 0.0.0.0 \
--port 5000
客户端配置
import mlflow
# 设置跟踪URI
mlflow.set_tracking_uri("http://localhost:5000")
# 或使用环境变量
# export MLFLOW_TRACKING_URI=http://localhost:5000
资源
- 文档: https://mlflow.org/docs/latest
- GitHub: https://github.com/mlflow/mlflow (23k+ 星标)
- 示例: https://github.com/mlflow/mlflow/tree/master/examples
- 社区: https://mlflow.org/community
另请参见
- references/tracking.md - 综合跟踪指南
- references/model-registry.md - 模型生命周期管理
- references/deployment.md - 生产部署模式