名称: mlflow
描述: 跟踪ML实验、管理模型注册表、部署模型到生产，并用MLflow重现实验——框架无关的ML生命周期平台
版本: 1.0.0
作者: Orchestra Research
许可证: MIT
标签: [MLOps, MLflow, 实验跟踪, 模型注册表, ML生命周期, 部署, 模型版本控制, PyTorch, TensorFlow, Scikit-Learn, HuggingFace]
依赖项: [mlflow, sqlalchemy, boto3]

MLflow: 机器学习生命周期管理平台

何时使用此技能

当您需要完成以下任务时，请使用MLflow：

跟踪ML实验，包括参数、指标和工件
管理模型注册表，支持版本控制和阶段转换
部署模型到各种平台（本地、云端、服务端）
重现实验，使用项目配置
比较模型版本和性能指标
协作ML项目，支持团队工作流
集成任何ML框架（框架无关）

用户: 20,000+ 组织 | GitHub星标: 23k+ | 许可证: Apache 2.0

安装

# 安装MLflow
pip install mlflow

# 安装额外依赖
pip install mlflow[extras]  # 包括SQLAlchemy、boto3等

# 启动MLflow UI
mlflow ui

# 访问 http://localhost:5000

快速开始

基础跟踪

import mlflow

# 开始一个运行
with mlflow.start_run():
    # 记录参数
    mlflow.log_param("learning_rate", 0.001)
    mlflow.log_param("batch_size", 32)

    # 您的训练代码
    model = train_model()

    # 记录指标
    mlflow.log_metric("train_loss", 0.15)
    mlflow.log_metric("val_accuracy", 0.92)

    # 记录模型
    mlflow.sklearn.log_model(model, "model")

自动记录（自动跟踪）

import mlflow
from sklearn.ensemble import RandomForestClassifier

# 启用自动记录
mlflow.autolog()

# 训练（自动记录）
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(X_train, y_train)

# 指标、参数和模型自动记录！

核心概念

1. 实验和运行

实验: 相关运行的逻辑容器

运行: ML代码的单次执行（参数、指标、工件）

import mlflow

# 创建/设置实验
mlflow.set_experiment("my-experiment")

# 开始一个运行
with mlflow.start_run(run_name="baseline-model"):
    # 记录参数
    mlflow.log_param("model", "ResNet50")
    mlflow.log_param("epochs", 10)

    # 训练
    model = train()

    # 记录指标
    mlflow.log_metric("accuracy", 0.95)

    # 记录模型
    mlflow.pytorch.log_model(model, "model")

# 运行ID自动生成；with块结束后active_run()返回None，因此用last_active_run()读取刚结束的运行
print(f"Run ID: {mlflow.last_active_run().info.run_id}")

2. 记录参数

with mlflow.start_run():
    # 单个参数
    mlflow.log_param("learning_rate", 0.001)

    # 多个参数
    mlflow.log_params({
        "batch_size": 32,
        "epochs": 50,
        "optimizer": "Adam",
        "dropout": 0.2
    })

    # 嵌套参数（作为字典）
    config = {
        "model": {
            "architecture": "ResNet50",
            "pretrained": True
        },
        "training": {
            "lr": 0.001,
            "weight_decay": 1e-4
        }
    }

    # 将每个顶层配置节转为字符串后作为单独参数记录（如需JSON格式，可改用json.dumps(value)）
    for key, value in config.items():
        mlflow.log_param(key, str(value))

3. 记录指标

with mlflow.start_run():
    # 训练循环
    for epoch in range(NUM_EPOCHS):
        train_loss = train_epoch()
        val_loss = validate()

        # 每步记录指标
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_loss", val_loss, step=epoch)

        # 记录多个指标
        mlflow.log_metrics({
            "train_accuracy": train_acc,
            "val_accuracy": val_acc
        }, step=epoch)

    # 记录最终指标（无步数）
    mlflow.log_metric("final_accuracy", final_acc)

4. 记录工件

with mlflow.start_run():
    # 记录文件
    model.save('model.pkl')
    mlflow.log_artifact('model.pkl')

    # 记录目录
    os.makedirs('plots', exist_ok=True)
    plt.savefig('plots/loss_curve.png')
    mlflow.log_artifacts('plots')

    # 记录文本
    with open('config.txt', 'w') as f:
        f.write(str(config))
    mlflow.log_artifact('config.txt')

    # 记录字典为JSON
    mlflow.log_dict({'config': config}, 'config.json')

5. 记录模型

# PyTorch
import mlflow.pytorch

with mlflow.start_run():
    model = train_pytorch_model()
    mlflow.pytorch.log_model(model, "model")

# Scikit-learn
import mlflow.sklearn

with mlflow.start_run():
    model = train_sklearn_model()
    mlflow.sklearn.log_model(model, "model")

# Keras/TensorFlow
import mlflow.keras

with mlflow.start_run():
    model = train_keras_model()
    mlflow.keras.log_model(model, "model")

# HuggingFace Transformers
import mlflow.transformers

with mlflow.start_run():
    mlflow.transformers.log_model(
        transformers_model={
            "model": model,
            "tokenizer": tokenizer
        },
        artifact_path="model"
    )

自动记录

自动为流行框架记录指标、参数和模型。

启用自动记录

import mlflow

# 为所有支持的框架启用
mlflow.autolog()

# 或为特定框架启用
mlflow.sklearn.autolog()
mlflow.pytorch.autolog()
mlflow.keras.autolog()
mlflow.xgboost.autolog()

Scikit-learn自动记录

import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 启用自动记录
mlflow.sklearn.autolog()

# 分割数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 训练（自动记录参数、指标、模型）
with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    # 指标如accuracy、f1_score自动记录
    # 模型自动记录
    # 训练时长自动记录

PyTorch Lightning自动记录

import mlflow
import pytorch_lightning as pl

# 启用自动记录
mlflow.pytorch.autolog()

# 训练
with mlflow.start_run():
    trainer = pl.Trainer(max_epochs=10)
    trainer.fit(model, datamodule=dm)

    # 超参数记录
    # 训练指标记录
    # 最佳模型检查点记录

模型注册表

用版本控制和阶段转换管理模型生命周期。

注册模型

import mlflow

# 记录并注册模型
with mlflow.start_run():
    model = train_model()

    # 记录模型
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="my-classifier"  # 立即注册
    )

# 或稍后注册
run_id = "abc123"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri, "my-classifier")

模型阶段

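在阶段间转换模型：无（None）→ 暂存（Staging）→ 生产（Production）→ 归档（Archived）。代码中的stage参数必须使用英文阶段名。注意：自MLflow 2.9起，模型阶段已被弃用，推荐改用模型版本别名（aliases）和标签。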

from mlflow.tracking import MlflowClient

client = MlflowClient()

# 提升到暂存
client.transition_model_version_stage(
    name="my-classifier",
    version=3,
    stage="Staging"
)

# 提升到生产
client.transition_model_version_stage(
    name="my-classifier",
    version=3,
    stage="Production",
    archive_existing_versions=True  # 归档旧的生产版本
)

# 归档模型
client.transition_model_version_stage(
    name="my-classifier",
    version=2,
    stage="Archived"
)

从注册表加载模型

import mlflow.pyfunc

# 加载最新生产模型
model = mlflow.pyfunc.load_model("models:/my-classifier/Production")

# 加载特定版本
model = mlflow.pyfunc.load_model("models:/my-classifier/3")

# 从暂存加载
model = mlflow.pyfunc.load_model("models:/my-classifier/Staging")

# 使用模型
predictions = model.predict(X_test)

模型版本控制

client = MlflowClient()

# 列出所有版本
versions = client.search_model_versions("name='my-classifier'")

for v in versions:
    print(f"版本 {v.version}: {v.current_stage}")

# 按阶段获取最新版本
latest_prod = client.get_latest_versions("my-classifier", stages=["Production"])
latest_staging = client.get_latest_versions("my-classifier", stages=["Staging"])

# 获取模型版本详情
version_info = client.get_model_version(name="my-classifier", version="3")
print(f"运行ID: {version_info.run_id}")
print(f"阶段: {version_info.current_stage}")
print(f"标签: {version_info.tags}")

模型注解

client = MlflowClient()

# 添加描述
client.update_model_version(
    name="my-classifier",
    version="3",
    description="在1M图像上训练的ResNet50分类器，准确率95%"
)

# 添加标签
client.set_model_version_tag(
    name="my-classifier",
    version="3",
    key="validation_status",
    value="approved"
)

client.set_model_version_tag(
    name="my-classifier",
    version="3",
    key="deployed_date",
    value="2025-01-15"
)

搜索运行

以编程方式查找和筛选运行。

from mlflow.tracking import MlflowClient

client = MlflowClient()

# 搜索实验中的所有运行
experiment_id = client.get_experiment_by_name("my-experiment").experiment_id
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="metrics.accuracy > 0.9",
    order_by=["metrics.accuracy DESC"],
    max_results=10
)

for run in runs:
    print(f"运行ID: {run.info.run_id}")
    print(f"准确率: {run.data.metrics['accuracy']}")
    print(f"参数: {run.data.params}")

# 复杂过滤器搜索
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="""
        metrics.accuracy > 0.9 AND
        params.model = 'ResNet50' AND
        tags.dataset = 'ImageNet'
    """,
    order_by=["metrics.f1_score DESC"]
)

集成示例

PyTorch

import mlflow
import torch
import torch.nn as nn

# 启用自动记录
mlflow.pytorch.autolog()

with mlflow.start_run():
    # 记录配置
    config = {
        "lr": 0.001,
        "epochs": 10,
        "batch_size": 32
    }
    mlflow.log_params(config)

    # 训练
    model = create_model()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

    for epoch in range(config["epochs"]):
        train_loss = train_epoch(model, optimizer, train_loader)
        val_loss, val_acc = validate(model, val_loader)

        # 记录指标
        mlflow.log_metrics({
            "train_loss": train_loss,
            "val_loss": val_loss,
            "val_accuracy": val_acc
        }, step=epoch)

    # 记录模型
    mlflow.pytorch.log_model(model, "model")

HuggingFace Transformers

import mlflow
from transformers import Trainer, TrainingArguments

# 启用自动记录
mlflow.transformers.autolog()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# 开始MLflow运行
with mlflow.start_run():
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    # 训练（自动记录）
    trainer.train()

    # 记录最终模型到注册表
    mlflow.transformers.log_model(
        transformers_model={
            "model": trainer.model,
            "tokenizer": tokenizer
        },
        artifact_path="model",
        registered_model_name="hf-classifier"
    )

XGBoost

import mlflow
import xgboost as xgb

# 启用自动记录
mlflow.xgboost.autolog()

with mlflow.start_run():
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    params = {
        'max_depth': 6,
        'learning_rate': 0.1,
        'objective': 'binary:logistic',
        'eval_metric': ['logloss', 'auc']
    }

    # 训练（自动记录）
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=100,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=10
    )

    # 模型和指标自动记录

最佳实践

1. 用实验组织

# ✅ 好: 为不同任务分离实验
mlflow.set_experiment("sentiment-analysis")
mlflow.set_experiment("image-classification")
mlflow.set_experiment("recommendation-system")

# ❌ 坏: 所有内容在一个实验
mlflow.set_experiment("all-models")

2. 使用描述性运行名称

# ✅ 好: 描述性名称
with mlflow.start_run(run_name="resnet50-imagenet-lr0.001-bs32"):
    train()

# ❌ 坏: 未指定名称（只能依靠自动生成的名称或运行ID来区分运行）
with mlflow.start_run():
    train()

3. 记录全面元数据

with mlflow.start_run():
    # 记录超参数
    mlflow.log_params({
        "learning_rate": 0.001,
        "batch_size": 32,
        "epochs": 50
    })

    # 记录系统信息
    mlflow.set_tags({
        "dataset": "ImageNet",
        "framework": "PyTorch 2.0",
        "gpu": "A100",
        "git_commit": get_git_commit()
    })

    # 记录数据信息
    mlflow.log_param("train_samples", len(train_dataset))
    mlflow.log_param("val_samples", len(val_dataset))

4. 跟踪模型谱系

# 链接运行以理解谱系
with mlflow.start_run(run_name="preprocessing"):
    data = preprocess()
    mlflow.log_artifact("data.csv")
    preprocessing_run_id = mlflow.active_run().info.run_id

with mlflow.start_run(run_name="training"):
    # 通过标签引用上游的预处理运行（这里只记录关联，并不是嵌套的父子运行）
    mlflow.set_tag("preprocessing_run_id", preprocessing_run_id)
    model = train(data)

5. 使用模型注册表进行部署

# ✅ 好: 使用注册表进行生产
model_uri = "models:/my-classifier/Production"
model = mlflow.pyfunc.load_model(model_uri)

# ❌ 坏: 硬编码运行ID
model_uri = "runs:/abc123/model"
model = mlflow.pyfunc.load_model(model_uri)

部署

在本地提供模型服务

# 为注册表中的模型提供服务
mlflow models serve -m "models:/my-classifier/Production" -p 5001

# 为某次运行记录的模型提供服务
mlflow models serve -m "runs:/<RUN_ID>/model" -p 5001

# 测试端点
curl http://127.0.0.1:5001/invocations -H 'Content-Type: application/json' -d '{
  "inputs": [[1.0, 2.0, 3.0, 4.0]]
}'

部署到云端

# 部署到AWS SageMaker
mlflow sagemaker deploy -m "models:/my-classifier/Production" --region-name us-west-2

# 部署到Azure ML
mlflow azureml deploy -m "models:/my-classifier/Production"

配置

跟踪服务器

# 启动带后端存储的跟踪服务器
mlflow server \
  --backend-store-uri postgresql://user:password@localhost/mlflow \
  --default-artifact-root s3://my-bucket/mlflow \
  --host 0.0.0.0 \
  --port 5000

客户端配置

import mlflow

# 设置跟踪URI
mlflow.set_tracking_uri("http://localhost:5000")

# 或使用环境变量
# export MLFLOW_TRACKING_URI=http://localhost:5000

资源

文档: https://mlflow.org/docs/latest
GitHub: https://github.com/mlflow/mlflow (23k+ 星标)
示例: https://github.com/mlflow/mlflow/tree/master/examples
社区: https://mlflow.org/community

另请参见

references/tracking.md - 综合跟踪指南
references/model-registry.md - 模型生命周期管理
references/deployment.md - 生产部署模式