Local Large Language Model Deployment
Overview
A comprehensive guide to deploying large language models locally with Ollama, vLLM, and llama.cpp.
Prerequisites
- Basic understanding of Docker and containerization
- Familiarity with GPU hardware (NVIDIA CUDA)
- Python programming skills
- Comfort with the Linux command line
- Understanding of model quantization and optimization
Core Concepts
- Ollama: user-friendly tool for running LLMs locally through a simple CLI and API
- vLLM: high-performance LLM serving engine with optimized inference
- llama.cpp: lightweight C++ implementation for running LLMs on consumer hardware
- GGUF format: efficient binary format for quantized models, used by llama.cpp
- Quantization: reducing model precision to cut memory usage and improve speed
- Tensor parallelism: splitting model layers across multiple GPUs
- Pipeline parallelism: splitting model pipeline stages across multiple GPUs
- Modelfile: Ollama's configuration file for customizing models
- Context window: maximum number of tokens the model can process
- GPU memory utilization: percentage of GPU memory allocated to the model
- Swap space: CPU memory used as overflow for GPU memory
- Batching: processing multiple prompts simultaneously for efficiency
- Streaming: delivering responses token by token in real time
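To build intuition for the quantization and GPU-memory concepts above, here is a rough back-of-envelope sketch for estimating the VRAM a model's weights will need. The 1.2x overhead factor is an assumption, and the estimate ignores the KV cache and activations, which grow with context length and batch size.
# Rough VRAM estimate for model weights at a given quantization level.
# The overhead factor is a loose assumption; KV cache and activations
# are not included.
def estimate_vram_gb(n_params_billion: float, bits_per_weight: float,
                     overhead: float = 1.2) -> float:
    weight_bytes = n_params_billion * 1e9 * bits_per_weight / 8
    return weight_bytes * overhead / 1024**3

print(f"{estimate_vram_gb(7, 16):.1f} GB")  # 7B at FP16: ~15.6 GB
print(f"{estimate_vram_gb(7, 4):.1f} GB")   # 7B at 4-bit: ~3.9 GB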
1. Ollama Setup and Usage
1.1 Installation
# Linux/macOS
curl -fsSL https://ollama.com/install.sh | sh
# macOS (Homebrew)
brew install ollama
# Windows
# Download the installer from https://ollama.com/download
# Docker
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
1.2 Model Management
# Pull models
ollama pull llama2
ollama pull mistral
ollama pull codellama
# List installed models
ollama list
# Show model information
ollama show llama2
# Remove a model
ollama rm llama2
# Create a custom model
ollama create mymodel -f Modelfile
# Update a model
ollama pull llama2:latest
1.3 API Integration
import json

import requests

class OllamaClient:
    """Minimal client for the Ollama REST API."""

    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url

    def generate(
        self,
        model: str,
        prompt: str,
        stream: bool = False,
        options: dict = None
    ):
        """Generate a completion."""
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": stream
        }
        if options:
            payload["options"] = options
        if stream:
            return self._stream_response(url, payload)
        response = requests.post(url, json=payload)
        return response.json()

    def chat(
        self,
        model: str,
        messages: list,
        stream: bool = False
    ):
        """Chat completion."""
        url = f"{self.base_url}/api/chat"
        payload = {
            "model": model,
            "messages": messages,
            "stream": stream
        }
        if stream:
            return self._stream_response(url, payload)
        response = requests.post(url, json=payload)
        return response.json()

    def _stream_response(self, url: str, payload: dict):
        """Handle a streaming response (one JSON object per line)."""
        response = requests.post(url, json=payload, stream=True)
        for line in response.iter_lines():
            if line:
                chunk = json.loads(line)
                # /api/generate streams "response"; /api/chat streams "message.content"
                yield chunk.get("response") or chunk.get("message", {}).get("content", "")

    def list_models(self):
        """List installed models."""
        url = f"{self.base_url}/api/tags"
        response = requests.get(url)
        return response.json()

# Usage
client = OllamaClient()

# Generate a completion
response = client.generate(
    model="llama2",
    prompt="What is the capital of France?"
)
print(response["response"])

# Chat completion
messages = [
    {"role": "user", "content": "Hello!"}
]
response = client.chat(model="llama2", messages=messages)
print(response["message"]["content"])

# Streaming
for chunk in client.generate(
    model="llama2",
    prompt="Tell me a story",
    stream=True
):
    print(chunk, end="", flush=True)
1.4 Modelfile Customization
# Modelfile for a custom model (uses Llama 3 style chat template tokens)
FROM llama3
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER num_ctx 4096
SYSTEM """
You are a helpful assistant who excels at Python programming.
Always provide clear, well-commented code examples.
"""
TEMPLATE """
{{- if .System }}<|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>
{{- end }}
{{- range .Messages }}
{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|>
{{- end }}
{{ .Content }}<|eot_id|>
{{- end }}
<|start_header_id|>assistant<|end_header_id|>
"""
# Create the custom model
ollama create python-assistant -f Modelfile
# Run the custom model
ollama run python-assistant
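To verify the custom model, one option is to call it through the OllamaClient from section 1.3; the prompt here is just an illustration.
# Exercise the custom model via the client from section 1.3
client = OllamaClient()
response = client.generate(
    model="python-assistant",
    prompt="Write a function that reverses a string."
)
print(response["response"])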
2. vLLM Deployment
2.1 Installation
# Install from PyPI
pip install vllm
# Install from source
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -e .
# Install against a specific CUDA build of PyTorch
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu118
2.2 Server Setup
# vLLM servers are launched from the command line; for programmatic,
# in-process inference use the LLM class shown in section 2.3.

# Start the simple API server
python -m vllm.entrypoints.api_server \
    --model meta-llama/Llama-2-7b-hf \
    --tensor-parallel-size 1 \
    --pipeline-parallel-size 1 \
    --host 0.0.0.0 \
    --port 8000 \
    --dtype auto \
    --max-model-len 4096

# Or start the OpenAI-compatible server
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-2-7b-hf \
    --host 0.0.0.0 \
    --port 8000
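Once the OpenAI-compatible server is up, it can be queried with plain HTTP; a minimal sketch, assuming the server from the previous command is listening on port 8000:
import requests

# Query the OpenAI-compatible vLLM server started above
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "meta-llama/Llama-2-7b-hf",
        "prompt": "Hello, how are you?",
        "max_tokens": 100,
        "temperature": 0.7,
    },
)
print(resp.json()["choices"][0]["text"])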
2.3 Configuration
from vllm import LLM, SamplingParams

# Initialize the LLM
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    tensor_parallel_size=1,
    pipeline_parallel_size=1,
    dtype="auto",
    max_model_len=4096,
    trust_remote_code=False,
    download_dir="./models",
    load_format="auto",
    quantization=None,  # "awq", "gptq", "squeezellm"
    enforce_eager=True,
    gpu_memory_utilization=0.9,
    swap_space=4,
)

# Sampling parameters
sampling_params = SamplingParams(
    n=1,                       # number of output sequences
    best_of=1,                 # number of candidate sequences
    presence_penalty=0.0,      # presence penalty
    frequency_penalty=0.0,     # frequency penalty
    repetition_penalty=1.0,    # repetition penalty
    temperature=0.7,           # sampling temperature
    top_p=0.9,                 # nucleus sampling
    top_k=-1,                  # top-k sampling (-1 = disabled)
    min_p=0.0,                 # minimum probability
    use_beam_search=False,     # use beam search
    length_penalty=1.0,        # length penalty
    early_stopping=False,      # early stopping
    stop=[],                   # stop strings
    stop_token_ids=[],         # stop token IDs
    ignore_eos=False,          # ignore the EOS token
    max_tokens=100,            # maximum tokens to generate
    logprobs=None,             # return log probabilities
    prompt_logprobs=None,      # return prompt log probabilities
    skip_special_tokens=True,  # skip special tokens
    spaces_between_special_tokens=True,
)

# Generate
outputs = llm.generate(
    prompts=["Hello, how are you?"],
    sampling_params=sampling_params
)
for output in outputs:
    print(f"Output: {output.outputs[0].text}")
2.4 Performance Tuning
from vllm import LLM

# Multiple GPUs
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    tensor_parallel_size=2,  # split the model across 2 GPUs
    pipeline_parallel_size=1,
)

# Pipeline parallelism
llm = LLM(
    model="meta-llama/Llama-2-70b-hf",
    tensor_parallel_size=1,
    pipeline_parallel_size=4,  # split across 4 pipeline stages
)

# Quantized models
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    quantization="awq",  # or "gptq", "squeezellm"
)

# Memory optimization
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    gpu_memory_utilization=0.9,  # use 90% of GPU memory
    swap_space=4,                # swap up to 4 GB to CPU if needed
)
3. llama.cpp
3.1 Building
# Clone the repository
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
# Build with CMake
cmake -B build
cmake --build build --config Release
# Or with make
make
# Install the Python bindings
pip install llama-cpp-python
3.2 Model Format (GGUF)
from llama_cpp import Llama

# Load a GGUF model
model = Llama(
    model_path="./models/llama-2-7b-chat.Q4_K_M.gguf",
    n_ctx=4096,        # context size
    n_gpu_layers=-1,   # -1 = offload all layers to the GPU
    seed=42,
    f16_kv=True,       # use FP16 for the KV cache
    logits_all=False,  # return logits for all tokens, not just the last
    vocab_only=False,
    use_mmap=True,     # use memory mapping
    use_mlock=False,   # lock the model in RAM
    embedding=False,   # embedding mode
    n_threads=8,       # number of threads
    n_batch=512,       # batch size
)

# Generate
output = model(
    "Hello, how are you?",
    max_tokens=100,
    stop=["\n", "User:", "Assistant:"],
    echo=False,
    temperature=0.7,
    top_p=0.9,
    top_k=40,
    repeat_penalty=1.0,
    presence_penalty=0.0,
    frequency_penalty=0.0,
)
print(output['choices'][0]['text'])
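llama-cpp-python also exposes a chat-style interface over the same model object; a minimal sketch:
# Chat-style completion with the same Llama instance
chat = model.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=100,
    temperature=0.7,
)
print(chat["choices"][0]["message"]["content"])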
3.3 Server Mode
# Start the llama.cpp server (sampling parameters such as temperature
# and top_p are set per request, not on the command line)
./server \
    --model ./models/llama-2-7b-chat.Q4_K_M.gguf \
    --host 0.0.0.0 \
    --port 8080 \
    --ctx-size 4096 \
    --n-gpu-layers -1 \
    --threads 8 \
    --parallel 4 \
    --batch-size 512

# The server also exposes an OpenAI-compatible API
./server \
    --model ./models/llama-2-7b-chat.Q4_K_M.gguf \
    --port 8080 \
    --host 0.0.0.0 \
    --ctx-size 4096 \
    --n-gpu-layers -1 \
    --log-format json

import requests

# Query the llama.cpp server
url = "http://localhost:8080/completion"
payload = {
    "prompt": "Hello, how are you?",
    "n_predict": 100,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop": ["\n", "User:", "Assistant:"]
}
response = requests.post(url, json=payload)
print(response.json())
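The /completion endpoint can also stream tokens as server-sent events when "stream": true is set; a minimal sketch:
import json

import requests

# Stream tokens from the llama.cpp server; each event line looks like
# "data: {...}" with the token text in the "content" field.
payload = {"prompt": "Tell me a story", "n_predict": 100, "stream": True}
with requests.post("http://localhost:8080/completion",
                   json=payload, stream=True) as r:
    for line in r.iter_lines():
        if line and line.startswith(b"data: "):
            chunk = json.loads(line[len(b"data: "):])
            print(chunk.get("content", ""), end="", flush=True)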
4. Docker Deployment
4.1 Ollama Docker
# Dockerfile for Ollama
FROM ollama/ollama
# Copy custom models
COPY models /root/.ollama/models
# Bind to all interfaces on the default port
ENV OLLAMA_HOST=0.0.0.0:11434
# Expose the port
EXPOSE 11434
# Start the server
CMD ["ollama", "serve"]

# docker-compose.yml
version: '3.8'
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ./ollama-data:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0:11434
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
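After `docker compose up`, it can take a moment before the server accepts connections; a small readiness probe avoids racing it. This is a sketch assuming the default port mapping above; the timeout and polling interval are arbitrary choices.
import time

import requests

# Poll the Ollama root endpoint until the container answers with 200
def wait_ready(url: str = "http://localhost:11434", timeout_s: int = 60) -> None:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return
        except requests.RequestException:
            pass
        time.sleep(1)
    raise TimeoutError("Ollama did not become ready in time")

wait_ready()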
4.2 vLLM Docker
# Dockerfile for vLLM
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
# Install Python
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*
# Install vLLM
RUN pip install vllm
# Expose the port
EXPOSE 8000
# Start the server
CMD ["python3", "-m", "vllm.entrypoints.api_server", \
     "--model", "meta-llama/Llama-2-7b-hf", \
     "--host", "0.0.0.0", \
     "--port", "8000"]
4.3 llama.cpp Docker
# Dockerfile for llama.cpp (the devel image provides nvcc for the CUDA build)
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
# Install dependencies
RUN apt-get update && apt-get install -y \
    git \
    cmake \
    build-essential \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*
# Clone and build llama.cpp with CUDA support
RUN git clone https://github.com/ggerganov/llama.cpp.git /app/llama.cpp
WORKDIR /app/llama.cpp
RUN cmake -B build -DGGML_CUDA=ON && cmake --build build --config Release
# Install the Python bindings
RUN pip install llama-cpp-python
# Expose the port
EXPOSE 8080
# Start the server
CMD ["./build/bin/server", \
     "--model", "/models/llama-2-7b-chat.Q4_K_M.gguf", \
     "--host", "0.0.0.0", \
     "--port", "8080"]
5. Model Selection Guide
5.1 Model Comparison
| Model | Parameters | VRAM Required | Speed | Use Case |
|---|---|---|---|---|
| Llama-2-7B | 7B | 6GB | Fast | General purpose |
| Llama-2-13B | 13B | 12GB | Medium | Complex tasks |
| Llama-2-70B | 70B | 40GB | Slow | High quality |
| Mistral-7B | 7B | 6GB | Fast | General purpose |
| CodeLlama-7B | 7B | 6GB | Fast | Code generation |
| Phi-2 | 2.7B | 4GB | Very fast | Edge deployment |
| TinyLlama-1.1B | 1.1B | 2GB | Very fast | Mobile/embedded |
5.2 Selection Criteria
def select_model(
    available_vram_gb: int,
    use_case: str = "general",
    speed_preference: str = "fast"
) -> str:
    """Select a model based on hardware and workload constraints."""
    # Model specifications
    models = {
        "llama-2-7b": {
            "vram": 6,
            "speed": "fast",
            "use_cases": ["general", "chat"]
        },
        "llama-2-13b": {
            "vram": 12,
            "speed": "medium",
            "use_cases": ["general", "chat", "reasoning"]
        },
        "mistral-7b": {
            "vram": 6,
            "speed": "fast",
            "use_cases": ["general", "chat", "code"]
        },
        "codellama-7b": {
            "vram": 6,
            "speed": "fast",
            "use_cases": ["code", "technical"]
        },
        "phi-2": {
            "vram": 4,
            "speed": "very_fast",
            "use_cases": ["general", "edge"]
        },
    }
    # Filter by VRAM
    suitable_models = {
        name: spec for name, spec in models.items()
        if spec["vram"] <= available_vram_gb
    }
    # Filter by use case
    if use_case != "all":
        suitable_models = {
            name: spec for name, spec in suitable_models.items()
            if use_case in spec["use_cases"]
        }
    # Rank: exact matches for the speed preference first, then faster models
    speed_order = {"very_fast": 0, "fast": 1, "medium": 2, "slow": 3}
    ranked = sorted(
        suitable_models.items(),
        key=lambda x: (
            x[1]["speed"] != speed_preference,
            speed_order.get(x[1]["speed"], 4)
        )
    )
    if not ranked:
        raise ValueError("No suitable model found")
    return ranked[0][0]

# Usage
model = select_model(
    available_vram_gb=8,
    use_case="general",
    speed_preference="fast"
)
print(f"Recommended model: {model}")
6. Quantization Strategies
6.1 Quantization Levels
| Quantization | Size Reduction | Quality Loss | VRAM Savings |
|---|---|---|---|
| FP16 | 2x | None | ~50% |
| Q8_0 | 4x | Minimal | ~75% |
| Q4_K_M | 6x | Low | ~85% |
| Q4_K_S | 6x | Moderate | ~85% |
| Q2_K | 10x | High | ~95% |
6.2 Quantizing with llama.cpp
# Quantize to Q4 (the binary is named llama-quantize in newer builds)
./quantize ./models/llama-2-7b-f16.gguf \
    ./models/llama-2-7b-q4.gguf \
    Q4_K_M
# Quantize to Q8
./quantize ./models/llama-2-7b-f16.gguf \
    ./models/llama-2-7b-q8.gguf \
    Q8_0
# Quantize with an importance matrix for better quality
./quantize --imatrix ./models/llama-2-7b-imatrix.dat \
    ./models/llama-2-7b-f16.gguf \
    ./models/llama-2-7b-q4.gguf \
    Q4_K_M
6.3 Quantizing with AutoGPTQ
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

# Load the tokenizer
model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization configuration
quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    damp_percent=0.01,
    desc_act=False,
    sym=True,
    true_sequential=True,
    model_name_or_path=model_id,
    model_file_base_name="llama-2-7b-gptq",
)

# Load the full-precision model with the quantization config
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)

# Quantize using a small set of calibration examples, then save
examples = [tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")]
model.quantize(examples)
model.save_quantized("./llama-2-7b-gptq")

# Load the quantized model for inference
model = AutoGPTQForCausalLM.from_quantized(
    "./llama-2-7b-gptq",
    device="cuda:0",
    use_triton=False,
    inject_fused_attention=False,
    use_cuda_fp16=True,
)
7. GPU Configuration
7.1 Multi-GPU Setup
# vLLM with tensor parallelism
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-2-70b-hf",
    tensor_parallel_size=4,  # split across 4 GPUs
    pipeline_parallel_size=1,
)

# Launch the server with multiple GPUs
python -m vllm.entrypoints.api_server \
    --model meta-llama/Llama-2-70b-hf \
    --tensor-parallel-size 4 \
    --pipeline-parallel-size 1 \
    --host 0.0.0.0 \
    --port 8000
7.2 GPU Memory Optimization
# Memory-efficient configuration
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    gpu_memory_utilization=0.9,  # keep 10% free
    swap_space=4,                # swap up to 4 GB to CPU if needed
    max_model_len=4096,          # limit the context length
    enforce_eager=True,          # eager mode
)
7.3 GPU Selection
import torch

def select_gpu(gpu_memory_gb: int) -> int:
    """Select the first GPU that satisfies the memory requirement."""
    available_gpus = torch.cuda.device_count()
    for i in range(available_gpus):
        props = torch.cuda.get_device_properties(i)
        total_memory_gb = props.total_memory / 1024**3
        if total_memory_gb >= gpu_memory_gb:
            return i
    raise ValueError(f"No GPU with {gpu_memory_gb}GB available")

# Usage
gpu_id = select_gpu(gpu_memory_gb=12)
torch.cuda.set_device(gpu_id)
8. Performance Optimization
8.1 Batching
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-hf")

# Batch generation
prompts = [
    "Hello, how are you?",
    "What is the capital of France?",
    "Tell me a joke."
]
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=100
)
outputs = llm.generate(prompts, sampling_params)
for i, output in enumerate(outputs):
    print(f"Prompt {i}: {output.outputs[0].text}")
8.2 Caching
import hashlib
import pickle
from pathlib import Path

class ModelCache:
    """Cache model outputs on disk, keyed by model and prompt."""

    def __init__(self, cache_dir="./cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _get_cache_key(self, prompt: str, model: str) -> str:
        """Generate a cache key."""
        key = f"{model}:{prompt}"
        return hashlib.md5(key.encode()).hexdigest()

    def get(self, prompt: str, model: str) -> str:
        """Fetch a cached output, or None on a miss."""
        cache_key = self._get_cache_key(prompt, model)
        cache_file = self.cache_dir / f"{cache_key}.pkl"
        if cache_file.exists():
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        return None

    def set(self, prompt: str, model: str, output: str):
        """Cache an output."""
        cache_key = self._get_cache_key(prompt, model)
        cache_file = self.cache_dir / f"{cache_key}.pkl"
        with open(cache_file, "wb") as f:
            pickle.dump(output, f)

# Usage
cache = ModelCache()

# Check the cache first
cached_output = cache.get("Hello, how are you?", "llama-2-7b")
if cached_output:
    print(f"Cached: {cached_output}")
else:
    # Generate and cache the output
    output = model.generate("Hello, how are you?")
    cache.set("Hello, how are you?", "llama-2-7b", output)
8.3 Streaming Optimization
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-hf")

# Note: the offline LLM API returns completed outputs; true token-by-token
# streaming requires the API server. Here we decode tokens one at a time
# from a finished generation.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=100
)
prompts = ["Tell me a story about a robot."]
outputs = llm.generate(prompts, sampling_params)
tokenizer = llm.get_tokenizer()
for output in outputs:
    for token_id in output.outputs[0].token_ids:
        # Decode and print each token
        print(tokenizer.decode([token_id]), end="", flush=True)
9. Monitoring
9.1 Performance Metrics
import time

import psutil
import torch

class PerformanceMonitor:
    """Monitor model inference performance."""

    def __init__(self):
        self.metrics = []

    def measure_inference(
        self,
        model,
        prompt: str,
        max_tokens: int = 100
    ):
        """Measure a single inference; assumes model(prompt, ...) returns text."""
        # Start timing
        start_time = time.time()
        start_memory = psutil.Process().memory_info().rss / 1024**3  # GB

        # Generate
        output = model(prompt, max_tokens=max_tokens)

        # Stop timing
        end_time = time.time()
        end_memory = psutil.Process().memory_info().rss / 1024**3  # GB

        # GPU memory
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.max_memory_allocated() / 1024**3  # GB
            torch.cuda.reset_peak_memory_stats()
        else:
            gpu_memory = 0

        metrics = {
            "prompt_length": len(prompt),
            "output_length": len(output),
            "latency_ms": (end_time - start_time) * 1000,
            # Character-based rate; substitute real token counts if available
            "tokens_per_second": len(output) / (end_time - start_time),
            "memory_delta_gb": end_memory - start_memory,
            "gpu_memory_gb": gpu_memory
        }
        self.metrics.append(metrics)
        return metrics

    def get_summary(self):
        """Summarize the collected metrics."""
        if not self.metrics:
            return {}
        n = len(self.metrics)
        return {
            "avg_latency_ms": sum(m["latency_ms"] for m in self.metrics) / n,
            "avg_tokens_per_second": sum(m["tokens_per_second"] for m in self.metrics) / n,
            "avg_gpu_memory_gb": sum(m["gpu_memory_gb"] for m in self.metrics) / n,
            "total_inferences": n
        }

# Usage
monitor = PerformanceMonitor()
for i in range(10):
    monitor.measure_inference(model, "Hello, how are you?")
summary = monitor.get_summary()
print(f"Average latency: {summary['avg_latency_ms']:.2f}ms")
print(f"Average tokens/sec: {summary['avg_tokens_per_second']:.2f}")
9.2 Resource Monitoring
import threading
import time

import GPUtil
import psutil

class ResourceMonitor:
    """Periodically print system resource usage from a background thread."""

    def __init__(self, interval=1.0):
        self.interval = interval
        self.running = False
        self._thread = None

    def start(self):
        """Start monitoring without blocking the caller."""
        self.running = True
        self._thread = threading.Thread(target=self._run, daemon=True)
        self._thread.start()

    def _run(self):
        while self.running:
            # GPU stats (GPUtil reports memory in MB)
            for i, gpu in enumerate(GPUtil.getGPUs()):
                print(f"GPU {i}: {gpu.load*100:.1f}% load, "
                      f"{gpu.memoryUsed:.0f}MB/{gpu.memoryTotal:.0f}MB")
            # CPU stats
            cpu_percent = psutil.cpu_percent()
            memory = psutil.virtual_memory()
            print(f"CPU: {cpu_percent:.1f}%")
            print(f"Memory: {memory.percent:.1f}% used "
                  f"({memory.used/1024**3:.2f}GB/{memory.total/1024**3:.2f}GB)")
            time.sleep(self.interval)

    def stop(self):
        """Stop monitoring."""
        self.running = False
        if self._thread is not None:
            self._thread.join()

# Usage
monitor = ResourceMonitor(interval=1.0)
monitor.start()
# ...run your inference...
monitor.stop()
10. Production Checklist
10.1 Pre-Deployment Checklist
- [ ] Model selection
  - [ ] Model tested and validated
  - [ ] Model quantized if needed
  - [ ] Model size fits the available VRAM
- [ ] Performance
  - [ ] Latency meets requirements
  - [ ] Throughput tested
  - [ ] Memory usage optimized
- [ ] Reliability
  - [ ] Error handling implemented
  - [ ] Graceful degradation on failure
  - [ ] Retry logic for transient failures (see the sketch below)
- [ ] Security
  - [ ] API authentication configured
  - [ ] Rate limiting enabled
  - [ ] Input validation in place
- [ ] Monitoring
  - [ ] Metrics collection configured
  - [ ] Logging enabled
  - [ ] Alert thresholds set
- [ ] Deployment
  - [ ] Docker image built
  - [ ] Environment variables configured
  - [ ] Health check endpoint
  - [ ] Automatic restart policy
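For the retry item above, a minimal retry-with-backoff sketch; the endpoint and payload are placeholders, and the backoff schedule is an arbitrary choice:
import time

import requests

# Retry transient inference-server failures with exponential backoff
def generate_with_retry(url: str, payload: dict, retries: int = 3) -> dict:
    for attempt in range(retries):
        try:
            resp = requests.post(url, json=payload, timeout=60)
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries; surface the error
            time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s...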
10.2 Post-Deployment Checklist
- [ ] Validation
  - [ ] Smoke tests passed (see the sketch below)
  - [ ] Load testing completed
  - [ ] Performance metrics monitored
- [ ] Documentation
  - [ ] API documentation updated
  - [ ] Deployment guide written
  - [ ] Known issues documented
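For the smoke-test item, one possible minimal check against an Ollama deployment; the model name and endpoint are assumptions carried over from section 1:
import requests

# Smoke test: one short generation against the deployed endpoint
def smoke_test(base_url: str = "http://localhost:11434") -> bool:
    try:
        r = requests.post(
            f"{base_url}/api/generate",
            json={"model": "llama2", "prompt": "ping", "stream": False},
            timeout=120,
        )
        return r.status_code == 200 and bool(r.json().get("response"))
    except requests.RequestException:
        return False

print("smoke test:", "PASS" if smoke_test() else "FAIL")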
Related Skills
- 06-ai-ml-production/llm-integration
- 06-ai-ml-production/ai-observability
- 06-ai-ml-production/embedding-models
- 06-ai-ml-production/agent-patterns
- 05-ai-ml-core/pytorch-deployment