LLM 路由与编排
智能 LLM 路由、模型选择和成本感知编排(LiteLLM、Portkey、自定义路由器)
执行摘要
随着 LLM 应用的扩展,您需要智能地跨多个模型/提供商路由请求。这项技能涵盖了语义路由、成本感知选择、回退策略和生产编排。
为什么路由很重要
单一模型问题
- 当 OpenAI、Anthropic 或 Cohere 宕机时,没有任何回退方案
- 用 GPT-4 处理简单任务,成本过高
- 区域受限问题(例如仅限美国区域)
- 速率限制阻碍关键流程
- 对简单查询过度设计
路由优势
- ✅ 成本优化→将简单任务路由到 GPT-3.5,复杂任务到 GPT-4
- ✅ 可靠性→回退到次要提供商
- ✅ 性能→当速度重要时使用更快的模型
- ✅ 合规性→根据数据居住要求进行路由
- ✅ 功能一致性→跨提供商统一 API
按请求复杂度路由
from typing import Literal
from pydantic import BaseModel
class LLMRequest(BaseModel):
    """Routing metadata for a single LLM request."""

    # The user's message text; scanned for complexity keywords by routers.
    user_message: str
    # Optional extra context; its length influences the routing tier.
    context: str = ""
    # Request urgency; HIGH is routed to the fastest model.
    urgency: Literal["LOW", "MEDIUM", "HIGH"] = "LOW"
    # When True, force the strong-reasoning model tier.
    requires_reasoning: bool = False
class ComplexityRouter:
    """Route requests to a model tier based on estimated complexity.

    Classification precedence: explicit reasoning flag > urgency >
    context size > complexity keywords.
    """

    # Keywords that typically signal a request needing strong reasoning.
    # frozenset documents that the collection is a fixed membership set.
    COMPLEXITY_INDICATORS = frozenset([
        "explain", "analyze", "evaluate", "compare", "design",
        "solve", "derive", "prove", "calculate", "optimize",
    ])

    # NOTE: the original defined an empty __init__ (only `pass`); the
    # implicit default constructor is equivalent, so it was removed.

    def classify_complexity(self, request: LLMRequest) -> str:
        """Classify a request as COMPLEX, FAST, STANDARD, or SIMPLE.

        Args:
            request: object exposing user_message, context, urgency and
                requires_reasoning attributes (see LLMRequest).
        """
        # Caller explicitly asked for reasoning: always use the strong tier.
        if request.requires_reasoning:
            return "COMPLEX"
        # High urgency overrides everything else: favor latency.
        if request.urgency == "HIGH":
            return "FAST"
        # Large context (>1000 chars) gets the balanced tier.
        if len(request.context) > 1000:
            return "STANDARD"
        # Keyword heuristic, evaluated last (the original computed it up
        # front even when an earlier rule short-circuited).
        message_lower = request.user_message.lower()
        if any(kw in message_lower for kw in self.COMPLEXITY_INDICATORS):
            return "COMPLEX"
        return "SIMPLE"

    def select_model(self, complexity: str) -> str:
        """Map a complexity label to a concrete model name."""
        model_map = {
            "COMPLEX": "gpt-4o",        # best reasoning
            "SIMPLE": "gpt-4o-mini",    # low cost, good enough
            "FAST": "claude-3-haiku",   # fast and cheap
            "STANDARD": "gpt-4o-mini",  # balanced
        }
        # Unknown labels fall back to the cheap default.
        return model_map.get(complexity, "gpt-4o-mini")
按成本路由
class CostAwareRouter:
    """Route requests to minimize cost while meeting a quality bar."""

    # Cost per 1M tokens in USD (approximate, 2025 rates).
    MODEL_COSTS = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3-opus": {"input": 15.00, "output": 75.00},
        "claude-3-sonnet": {"input": 3.00, "output": 15.00},
        "claude-3-haiku": {"input": 0.25, "output": 1.25},
    }

    def calculate_token_estimates(self, text: str) -> dict:
        """Estimate input/output token counts for *text*.

        Rough heuristic: ~4 characters per token; output is assumed to
        be about twice the input length.
        """
        input_tokens = len(text) // 4
        # input_tokens is already an int, so no cast is needed (the
        # original wrapped this in a redundant int()).
        output_tokens = input_tokens * 2
        return {"input": input_tokens, "output": output_tokens}

    def calculate_cost(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return total request cost in USD.

        Raises:
            KeyError: if *model* is not present in MODEL_COSTS.
        """
        costs = self.MODEL_COSTS[model]
        input_cost = (input_tokens / 1_000_000) * costs["input"]
        output_cost = (output_tokens / 1_000_000) * costs["output"]
        return input_cost + output_cost

    def select_cheapest_for_quality(self, min_quality: str = "GOOD") -> str:
        """Pick the cheapest model that meets the quality threshold.

        Cost is ranked by the mean of the input and output per-token
        rates; unknown *min_quality* values fall back to the GOOD tier.
        """
        quality_tiers = {
            "EXCELLENT": ["gpt-4o", "claude-3-opus"],
            "GOOD": ["gpt-4o-mini", "claude-3-sonnet"],
            "BASIC": ["claude-3-haiku"],
        }
        eligible = quality_tiers.get(min_quality, quality_tiers["GOOD"])
        # min() with a key replaces the original manual cheapest-so-far loop.
        return min(
            eligible,
            key=lambda m: (self.MODEL_COSTS[m]["input"] + self.MODEL_COSTS[m]["output"]) / 2,
        )
LiteLLM 集成
LiteLLM 提供了 100+ 提供商的统一 API。
设置
pip install litellm
配置
from litellm import completion
import os

# Set provider API keys (placeholders shown here — in production load
# real keys from a secret manager, never hard-code them).
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."
os.environ["COHERE_API_KEY"] = "..."
基本用法
def call_llm_with_routing(prompt: str, model: str = None, _attempts: int = 2):
    """Unified API call with automatic fallback.

    Args:
        prompt: user message to send.
        model: model name; defaults to the cheap gpt-4o-mini tier.
        _attempts: remaining fallback attempts. Guards against the
            unbounded recursion the original allowed when every provider
            kept failing.

    Raises:
        Exception: the last provider error, once fallbacks are exhausted.
    """
    if model is None:
        model = "gpt-4o-mini"  # cheap default
    try:
        response = completion(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            timeout=10.0,
        )
        return response.choices[0].message["content"]
    except Exception as e:
        print(f"Error with {model}: {e}")
        if _attempts <= 0:
            # Out of fallbacks: surface the original failure.
            raise
        # BUG FIX: the original called select_fallback(), which is not
        # defined anywhere in this document; use the fallback_model()
        # helper defined below instead.
        return call_llm_with_routing(prompt, fallback_model(model), _attempts - 1)
模型别名和回退
# litellm_config.yaml — model aliases with cross-provider fallbacks.
model_list:
  - model_name: primary
    litellm_params:
      model: openai/gpt-4o-mini
  - model_name: primary_fallback
    litellm_params:
      model: anthropic/claude-3-haiku
  - model_name: complex
    litellm_params:
      model: openai/gpt-4o
  - model_name: complex_fallback
    litellm_params:
      model: anthropic/claude-3-sonnet
      # Key is resolved from the environment, not stored in the file.
      api_key: os.environ/ANTHROPIC_API_KEY

# Global LiteLLM settings (fallback order and retry budget).
litellm_settings:
  drop_params: true
  set_verbose: false
  fallback: ["primary_fallback", "complex_fallback"]
  num_retries: 2
重试逻辑
from litellm import completion, RateLimitError, APIError
import time
def call_with_retry(model: str, messages: list, max_retries=3, _use_fallback=True):
    """Call *model* with exponential backoff on rate limits.

    When every attempt fails, one fallback model is tried with the same
    retry budget before giving up.

    Args:
        model: model name passed to litellm.
        messages: chat messages list.
        max_retries: attempts per model.
        _use_fallback: internal flag; False inside the fallback call to
            prevent endless model-to-model recursion.

    Raises:
        Exception: chained to the last provider error once everything fails.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            response = completion(
                model=model,
                messages=messages,
                timeout=30.0,
            )
            return response.choices[0].message["content"]
        except RateLimitError as e:
            last_error = e
            # BUG FIX: don't sleep after the final attempt — the original
            # slept pointlessly before giving up.
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # exponential backoff
                print(f"Rate limited. Waiting {wait_time}s...")
                time.sleep(wait_time)
        except APIError as e:
            last_error = e
            print(f"API Error: {e}")
    # BUG FIX: the original recursed with a budget of
    # max_retries - attempt - 1, which is always 0 on the last attempt,
    # so the fallback never actually ran. Try it once, with recursion
    # disabled so two failing models can't ping-pong forever.
    if _use_fallback:
        print("Trying fallback model...")
        return call_with_retry(fallback_model(model), messages, max_retries, _use_fallback=False)
    # Chain the last provider error for easier debugging.
    raise Exception("All retries failed") from last_error
def fallback_model(original: str) -> str:
    """Return the cross-provider fallback for *original*.

    OpenAI models fall back to Anthropic models of a similar tier and
    vice versa; unknown models default to gpt-4o-mini.
    """
    if original == "gpt-4o":
        return "claude-3-sonnet"
    if original == "gpt-4o-mini":
        return "claude-3-haiku"
    if original == "claude-3-opus":
        return "gpt-4o"
    if original == "claude-3-sonnet":
        return "gpt-4o-mini"
    return "gpt-4o-mini"
Portkey 集成
Portkey 是一个具有可观察性和路由功能的 LLM 网关。
设置
npm install @portkey-ai/portkey-js   # JavaScript
pip install portkey-ai               # Python 示例需要
配置
import Portkey from 'portkey-js';

const portkey = new Portkey({
  apiKey: process.env.PORTKEY_API_KEY,
  // Virtual key identifying the underlying provider.
  // BUG FIX: the original used a Python-style '#' comment here, which
  // is a syntax error in JavaScript.
  virtualKey: "openai-vk-xxx"
});
使用 Portkey 网关
# Python SDK
import portkey

# Custom routing config: try the listed providers in order ("fallback"
# mode), retrying up to 3 times.
config = {
    "strategy": {
        "mode": "fallback",
        "models": [
            {
                "name": "OpenAI GPT-4",
                "virtual_key": "openai-vk-xxx"
            },
            {
                "name": "Anthropic Claude",
                "virtual_key": "anthropic-vk-yyy"
            }
        ]
    },
    "retry": {
        "num_attempts": 3
    }
}

# NOTE(review): verify the call shape (portkey.ChatCompletions.create)
# against the installed Portkey SDK version — the API surface has changed
# across releases.
portkey_response = portkey.ChatCompletions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}],
    config=config
)
语义路由
基于意图/分类路由。
from sentence_transformers import SentenceTransformer
import numpy as np
class SemanticRouter:
    """Route messages by semantic similarity to per-category examples."""

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Route categories, each described by a few example phrases.
        self.routes = {
            "code": ["write a function", "debug this code", "optimize algorithm"],
            "creative": ["write a story", "generate a poem", "creative writing"],
            "analysis": ["analyze data", "summarize report", "compare options"],
            "general": ["hello", "help me", "what is", "how to"],
        }
        # Precompute one centroid embedding per route.
        self.route_embeddings = {}
        for route, examples in self.routes.items():
            embeddings = self.model.encode(examples)
            self.route_embeddings[route] = np.mean(embeddings, axis=0)

    def route(self, user_message: str) -> str:
        """Return the route whose centroid is most cosine-similar to the message.

        Returns None when no routes are configured (matches the original's
        behavior of returning the initial best_route).
        """
        if not self.route_embeddings:
            return None
        message_embedding = self.model.encode([user_message])[0]
        # Hoist the message norm out of the per-route comparison — the
        # original recomputed it for every route.
        message_norm = np.linalg.norm(message_embedding)

        def cosine_to(route: str) -> float:
            emb = self.route_embeddings[route]
            return np.dot(message_embedding, emb) / (message_norm * np.linalg.norm(emb))

        # max() with a key replaces the manual best-so-far loop; ties keep
        # the first route in insertion order, as before.
        return max(self.route_embeddings, key=cosine_to)

    def select_model_for_route(self, route: str) -> str:
        """Map a route category to a model; unknown routes get the cheap default."""
        model_map = {
            "code": "gpt-4o",
            "creative": "gpt-4o",
            "analysis": "gpt-4o-mini",
            "general": "gpt-4o-mini",
        }
        return model_map.get(route, "gpt-4o-mini")
生产模式
1. A/B 测试模型
class ModelABTester:
    """Round-robin A/B tester that alternates models on the same route."""

    def __init__(self):
        self.models = ["gpt-4o-mini", "claude-3-haiku"]
        self.current = 0  # round-robin cursor into self.models

    def get_model(self):
        """Return the next model in round-robin order and advance the cursor."""
        chosen = self.models[self.current]
        self.current = (self.current + 1) % len(self.models)
        return chosen

    def route_with_ab_test(self, request: LLMRequest):
        """Route a request through the next A/B arm and log the outcome."""
        chosen = self.get_model()
        reply = call_llm(chosen, request.user_message)
        # Record which arm served this request for offline analysis.
        log_ab_test(chosen, request, reply)
        return reply
2. 区域路由
class RegionalRouter:
    """Route to providers based on user region / data residency."""

    def __init__(self):
        # Ordered provider preference per region: [primary, fallback].
        self.regional_providers = {
            "us": ["openai", "anthropic"],
            "eu": ["mistral", "openai"],
            "asia": ["openai", "cohere"],
        }

    def route(self, user_region: str, fallback: bool = False):
        """Return a model string for the region (primary or fallback).

        Unknown regions use the US provider list; providers without an
        explicit model mapping fall back to openai/gpt-4o-mini.
        """
        providers = self.regional_providers.get(user_region, self.regional_providers["us"])
        # ROBUSTNESS FIX: the original indexed providers[1] unconditionally
        # when fallback was requested, raising IndexError for any region
        # configured with a single provider.
        if fallback and len(providers) > 1:
            provider = providers[1]
        else:
            provider = providers[0]
        # Mapping table replaces the if/elif chain; also drops the
        # pointless f-prefixes the original put on placeholder-free strings.
        provider_models = {
            "openai": "openai/gpt-4o-mini",
            "anthropic": "anthropic/claude-3-haiku",
            "mistral": "mistral/mistral-7b",
        }
        return provider_models.get(provider, "openai/gpt-4o-mini")
3. 基于超时的路由
from time import time
import threading
class TimeoutAwareRouter:
    """Route based on response-time requirements."""

    def __init__(self, timeout_ms: int = 1000):
        # Maximum time to wait for the primary model, in milliseconds.
        self.timeout_ms = timeout_ms

    def call_with_timeout(self, model: str, messages: list):
        """Call *model*, falling back to a fast model on timeout.

        Raises:
            Exception: whatever the worker thread raised, re-raised here.
        """
        result = [None]
        error = [None]

        def worker():
            try:
                result[0] = completion(model=model, messages=messages)
            except Exception as e:
                error[0] = e

        # BUG FIX: daemon=True so an abandoned (timed-out) worker cannot
        # keep the process alive at shutdown. Note that Python threads
        # cannot be killed — the slow call is merely abandoned, not
        # cancelled (the original comment claimed the thread was killed).
        thread = threading.Thread(target=worker, daemon=True)
        thread.start()
        thread.join(timeout=self.timeout_ms / 1000.0)

        if thread.is_alive():
            # Timed out: abandon the worker and use the fast fallback.
            print(f"{model} timed out, using fallback")
            return self.call_fallback_fast(messages)
        if error[0]:
            raise error[0]
        return result[0]

    def call_fallback_fast(self, messages: list):
        """Fallback using the fastest of the configured models."""
        return completion(
            model="claude-3-haiku",  # fastest
            messages=messages,
        )
生产清单
- ✅ 配置回退(2+ 提供商)
- ✅ 监控速率限制
- ✅ 按模型/端点跟踪成本
- ✅ 监控延迟
- ✅ 错误日志记录以进行调试
- ✅ A/B 测试结果分析
- ✅ 数据居住验证
- ✅ 在提供商之间进行负载均衡
常见陷阱
- ❌ 无回退 → 单点故障
- ❌ 硬编码模型名称 → 使用别名以增加灵活性
- ❌ 忽略速率限制 → 应用在规模扩大时阻塞
- ❌ 不跟踪成本 → 意外账单
- ❌ 无可观察性 → 无法调试路由问题