LLM路由与编排 llm-routing-orchestration

智能跨模型/提供商路由请求,优化成本,提高可靠性和性能,适用于大模型应用。关键词:LLM路由、成本优化、模型选择、回退策略、生产编排。

AI应用 0 次安装 0 次浏览 更新于 3/5/2026

LLM 路由与编排

智能 LLM 路由、模型选择和成本感知编排(LiteLLM、Portkey、自定义路由器)


执行摘要

随着 LLM 应用的扩展,您需要智能地跨多个模型/提供商路由请求。这项技能涵盖了语义路由、成本感知选择、回退策略和生产编排。

为什么路由很重要

单一模型问题

  • 无回退方案——一旦 OpenAI、Anthropic 或 Cohere 宕机即告失败
  • 成本过高——简单任务也在使用 GPT-4
  • 区域问题(仅限美国)
  • 速率限制阻碍关键流程
  • 过度设计简单查询

路由优势

  • 成本优化→将简单任务路由到 GPT-3.5,复杂任务到 GPT-4
  • 可靠性→回退到次要提供商
  • 性能→当速度重要时使用更快的模型
  • 合规性→根据数据居住要求进行路由
  • 功能一致性→跨提供商统一 API

按请求复杂度路由

from typing import Literal
from pydantic import BaseModel

class LLMRequest(BaseModel):
    """Incoming request payload consumed by the routers in this module."""

    user_message: str  # raw user prompt
    context: str = ""  # optional extra context; its length influences tier selection
    urgency: Literal["LOW", "MEDIUM", "HIGH"] = "LOW"  # HIGH routes to the fast tier
    requires_reasoning: bool = False  # True forces the complex tier

class ComplexityRouter:
    """Route a request to a model tier based on estimated complexity.

    Classification priority: explicit reasoning flag > urgency > context
    size > complexity keywords > default SIMPLE.
    """

    # Keywords that usually signal a request needing stronger reasoning.
    # NOTE: matched as substrings, so e.g. "resolve" also matches "solve".
    COMPLEXITY_INDICATORS = [
        "explain", "analyze", "evaluate", "compare", "design",
        "solve", "derive", "prove", "calculate", "optimize"
    ]

    def classify_complexity(self, request: "LLMRequest") -> str:
        """Classify a request as COMPLEX, FAST, STANDARD, or SIMPLE."""
        # Explicit user requirement wins over every heuristic.
        if request.requires_reasoning:
            return "COMPLEX"

        # Urgency override: favor speed over depth.
        if request.urgency == "HIGH":
            return "FAST"

        # Large context suggests a balanced mid-tier model.
        if len(request.context) > 1000:
            return "STANDARD"

        # Keyword scan is done last so the early returns above skip it
        # (the original computed it up front even when unused).
        message_lower = request.user_message.lower()
        if any(kw in message_lower for kw in self.COMPLEXITY_INDICATORS):
            return "COMPLEX"

        return "SIMPLE"

    def select_model(self, complexity: str) -> str:
        """Map a complexity class to a model name (defaults to gpt-4o-mini)."""
        model_map = {
            "COMPLEX": "gpt-4o",         # best reasoning
            "SIMPLE": "gpt-4o-mini",     # cheap, good enough
            "FAST": "claude-3-haiku",    # fast and inexpensive
            "STANDARD": "gpt-4o-mini"    # balanced default
        }
        return model_map.get(complexity, "gpt-4o-mini")

按成本路由

class CostAwareRouter:
    """Route requests to minimize spend while meeting a quality floor."""

    # Cost per 1M tokens, USD (approximate 2025 rates).
    MODEL_COSTS = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3-opus": {"input": 15.00, "output": 75.00},
        "claude-3-sonnet": {"input": 3.00, "output": 15.00},
        "claude-3-haiku": {"input": 0.25, "output": 1.25},
    }

    def calculate_token_estimates(self, text: str) -> dict:
        """Estimate input/output token counts for *text*.

        Uses the rough heuristic of ~4 characters per token and assumes
        the completion is about 2x the prompt length.
        """
        input_tokens = len(text) // 4
        output_tokens = input_tokens * 2  # already an int; no cast needed
        return {"input": input_tokens, "output": output_tokens}

    def calculate_cost(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the total USD cost of a request.

        Raises:
            KeyError: if *model* is not present in MODEL_COSTS.
        """
        costs = self.MODEL_COSTS[model]
        input_cost = (input_tokens / 1_000_000) * costs["input"]
        output_cost = (output_tokens / 1_000_000) * costs["output"]
        return input_cost + output_cost

    def select_cheapest_for_quality(self, min_quality: str = "GOOD") -> str:
        """Select the cheapest model that meets a quality tier.

        Unknown tiers fall back to "GOOD". "Cheapest" is judged by the
        mean of the input and output per-token rates.
        """
        quality_tiers = {
            "EXCELLENT": ["gpt-4o", "claude-3-opus"],
            "GOOD": ["gpt-4o-mini", "claude-3-sonnet"],
            "BASIC": ["claude-3-haiku"]
        }
        eligible_models = quality_tiers.get(min_quality, quality_tiers["GOOD"])
        # min() with a key replaces the original manual search loop.
        return min(
            eligible_models,
            key=lambda m: (self.MODEL_COSTS[m]["input"] + self.MODEL_COSTS[m]["output"]) / 2,
        )

LiteLLM 集成

LiteLLM 提供了 100+ 提供商的统一 API。

设置

pip install litellm

配置

from litellm import completion
import os

# Provider API keys (placeholders — in production load real keys from a
# secret manager; never hard-code them in source).
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."
os.environ["COHERE_API_KEY"] = "..."

基本用法

def call_llm_with_routing(prompt: str, model: str = None, _depth: int = 0):
    """Unified LLM call with automatic fallback on provider errors.

    Args:
        prompt: User prompt, sent as a single user message.
        model: Model to call; defaults to the cheap general-purpose tier.
        _depth: Internal recursion counter bounding the fallback chain.

    Returns:
        The assistant message content.

    Raises:
        Exception: re-raises the provider error once fallbacks are exhausted.
    """
    if model is None:
        model = "gpt-4o-mini"  # cheap default

    try:
        response = completion(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            timeout=10.0
        )
        return response.choices[0].message["content"]

    except Exception as e:
        print(f"Error with {model}: {e}")

        # BUG FIX: the original called an undefined `select_fallback` and
        # recursed without limit. Use the module's `fallback_model` and
        # bound the chain to two fallback hops.
        if _depth >= 2:
            raise
        return call_llm_with_routing(prompt, fallback_model(model), _depth + 1)

模型别名和回退

# litellm_config.yaml — model aliases with cross-provider fallbacks
model_list:
  - model_name: primary
    litellm_params:
      model: openai/gpt-4o-mini

  - model_name: primary_fallback
    litellm_params:
      model: anthropic/claude-3-haiku

  - model_name: complex
    litellm_params:
      model: openai/gpt-4o

  - model_name: complex_fallback
    litellm_params:
      model: anthropic/claude-3-sonnet
      api_key: os.environ/ANTHROPIC_API_KEY

# Global settings. NOTE(review): LiteLLM documents fallbacks as a
# per-model "fallbacks" mapping — confirm this flat "fallback" list key
# against the installed LiteLLM version.
litellm_settings:
  drop_params: true
  set_verbose: false
  fallback: ["primary_fallback", "complex_fallback"]
  num_retries: 2

重试逻辑

from litellm import completion, RateLimitError, APIError
import time

def call_with_retry(model: str, messages: list, max_retries=3):
    """Call a model with exponential backoff and a last-resort fallback.

    Args:
        model: Model identifier to try first.
        messages: Chat messages payload.
        max_retries: Attempts against *model* before switching models.

    Returns:
        The assistant message content.

    Raises:
        Exception: if every retry (and the fallback chain) fails.
    """
    for attempt in range(max_retries):
        try:
            response = completion(
                model=model,
                messages=messages,
                timeout=30.0
            )
            return response.choices[0].message["content"]

        except RateLimitError:
            wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
            print(f"Rate limited. Waiting {wait_time}s...")
            time.sleep(wait_time)

        except APIError as e:
            print(f"API Error: {e}")

            if attempt == max_retries - 1:
                # Final attempt failed: switch to the fallback model.
                # BUG FIX: the original passed `max_retries - attempt - 1`,
                # which is 0 on the last attempt, so range(0) never ran and
                # the fallback model was never actually called.
                print("Trying fallback model...")
                return call_with_retry(fallback_model(model), messages, max_retries)

    raise Exception("All retries failed")

def fallback_model(original: str) -> str:
    """Return the cross-provider fallback for *original*.

    Any model not listed falls back to "gpt-4o-mini".
    """
    pairs = {
        "gpt-4o": "claude-3-sonnet",
        "gpt-4o-mini": "claude-3-haiku",
        "claude-3-opus": "gpt-4o",
        "claude-3-sonnet": "gpt-4o-mini",
    }
    if original in pairs:
        return pairs[original]
    return "gpt-4o-mini"

Portkey 集成

Portkey 是一个具有可观察性和路由功能的 LLM 网关。

设置

npm install @portkey-ai/portkey-js

配置

import Portkey from 'portkey-js';

// Portkey client bound to one provider via a virtual key.
const portkey = new Portkey({
  apiKey: process.env.PORTKEY_API_KEY,
  virtualKey: "openai-vk-xxx"  // provider virtual key (BUG FIX: was a Python-style "#" comment, invalid in JS)
});

使用 Portkey 网关

# Python
import portkey

# Custom routing config: try OpenAI first, fall back to Anthropic,
# with up to 3 retry attempts per request.
config = {
    "strategy": {
        "mode": "fallback",
        "models": [
            {
                "name": "OpenAI GPT-4",
                "virtual_key": "openai-vk-xxx"
            },
            {
                "name": "Anthropic Claude",
                "virtual_key": "anthropic-vk-yyy"
            }
        ]
    },
    "retry": {
        "num_attempts": 3
    }
}

portkey_response = portkey.ChatCompletions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}],
    config=config
)

语义路由

基于意图/分类路由。

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticRouter:
    """Route user messages to a category via embedding similarity."""

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # Example utterances defining each routing category.
        self.routes = {
            "code": ["write a function", "debug this code", "optimize algorithm"],
            "creative": ["write a story", "generate a poem", "creative writing"],
            "analysis": ["analyze data", "summarize report", "compare options"],
            "general": ["hello", "help me", "what is", "how to"]
        }

        # Each category is represented by the centroid of its examples.
        self.route_embeddings = {
            name: np.mean(self.model.encode(samples), axis=0)
            for name, samples in self.routes.items()
        }

    def route(self, user_message: str) -> str:
        """Return the category whose centroid is most similar to the message."""
        query_vec = self.model.encode([user_message])[0]
        query_norm = np.linalg.norm(query_vec)

        def cosine_to_query(item):
            _, centroid = item
            return np.dot(query_vec, centroid) / (query_norm * np.linalg.norm(centroid))

        # Pick the best category by cosine similarity (first wins on ties,
        # matching dict insertion order).
        best_name, _ = max(self.route_embeddings.items(), key=cosine_to_query)
        return best_name

    def select_model_for_route(self, route: str) -> str:
        """Pick a model for the routed category (default: gpt-4o-mini)."""
        choices = {
            "code": "gpt-4o",
            "creative": "gpt-4o",
            "analysis": "gpt-4o-mini",
            "general": "gpt-4o-mini"
        }
        return choices.get(route, "gpt-4o-mini")

生产模式

1. A/B 测试模型

class ModelABTester:
    """Round-robin A/B tester alternating between candidate models."""

    def __init__(self):
        # Candidate models under comparison.
        self.models = ["gpt-4o-mini", "claude-3-haiku"]
        # Index of the model the next call will receive.
        self.current = 0

    def get_model(self):
        """Return the next model in round-robin order and advance the index."""
        idx = self.current
        self.current = (idx + 1) % len(self.models)
        return self.models[idx]

    def route_with_ab_test(self, request: LLMRequest):
        """Serve *request* with the next model and record the pairing."""
        chosen = self.get_model()

        # Issue the request against the selected arm.
        response = call_llm(chosen, request.user_message)

        # Record model/request/response for offline analysis.
        log_ab_test(chosen, request, response)

        return response

2. 区域路由

class RegionalRouter:
    """Route to a provider/model based on user region (data residency)."""

    # Default model for each provider; providers missing here (e.g.
    # cohere) fall through to the global default.
    _PROVIDER_MODELS = {
        "openai": "openai/gpt-4o-mini",
        "anthropic": "anthropic/claude-3-haiku",
        "mistral": "mistral/mistral-7b",
    }

    def __init__(self):
        # Ordered provider preference per region: [primary, fallback].
        # (Also fixes the misindented "asia" entry in the original.)
        self.regional_providers = {
            "us": ["openai", "anthropic"],
            "eu": ["mistral", "openai"],
            "asia": ["openai", "cohere"]
        }

    def route(self, user_region: str, fallback: bool = False):
        """Return a provider-qualified model for *user_region*.

        Unknown regions use the "us" preference list; *fallback* selects
        the secondary provider instead of the primary.
        """
        providers = self.regional_providers.get(user_region, self.regional_providers["us"])
        provider = providers[1] if fallback else providers[0]
        # Dict dispatch replaces the if/elif chain; the original also
        # used f-strings with no placeholders (pointless f prefix).
        return self._PROVIDER_MODELS.get(provider, "openai/gpt-4o-mini")

3. 基于超时的路由

from time import time
import threading

class TimeoutAwareRouter:
    """Route based on response-time requirements: call a model in a worker
    thread and switch to a fast fallback if it exceeds the deadline."""

    def __init__(self, timeout_ms: int = 1000):
        # Deadline in milliseconds applied to each primary-model call.
        self.timeout_ms = timeout_ms

    def call_with_timeout(self, model: str, messages: list):
        """Call *model*; on timeout, return the fast-fallback response instead.

        Re-raises any exception the worker captured when the call finishes
        in time but fails.
        """

        # Single-element lists act as mutable cells shared with the worker.
        result = [None]
        error = [None]

        def worker():
            try:
                result[0] = completion(model=model, messages=messages)
            except Exception as e:
                error[0] = e

        thread = threading.Thread(target=worker)
        thread.start()
        thread.join(timeout=self.timeout_ms / 1000.0)

        if thread.is_alive():
            # Deadline exceeded — use the fallback. NOTE(review): the thread
            # is NOT killed (Python can't force-stop threads); the original
            # call keeps running in the background and its result/cost is
            # simply discarded.
            print(f"{model} timed out, using fallback")
            return self.call_fallback_fast(messages)

        if error[0]:
            raise error[0]

        return result[0]

    def call_fallback_fast(self, messages: list):
        """Fallback using the fastest (and cheapest) listed model."""
        return completion(
            model="claude-3-haiku",  # fastest option in this guide's lineup
            messages=messages
        )

生产清单

  • ✅ 配置回退(2+ 提供商)
  • ✅ 监控速率限制
  • ✅ 按模型/端点跟踪成本
  • ✅ 监控延迟
  • ✅ 错误日志记录以进行调试
  • ✅ A/B 测试结果分析
  • ✅ 数据居住验证
  • ✅ 在提供商之间进行负载均衡

常见陷阱

❌ 无回退 → 单点故障
❌ 硬编码模型名称 → 应使用别名以增加灵活性
❌ 忽略速率限制 → 应用在规模扩大时被阻塞
❌ 不跟踪成本 → 产生意外账单
❌ 无可观察性 → 无法调试路由问题

资源