feat(router): 集成NVIDIA多头分类器实现3-tier智能路由

- 新增nvidia_router.py: 手动加载NVIDIA prompt-task-and-complexity-classifier模型 - DeBERTa-v3-base backbone + 8个分类头(task_type/creativity/reasoning/domain等) - 综合多维度评分实现simple/medium/complex三级路由 - 映射: simple->qwen-flash, medium->qwen-plus, complex->qwen-max - main.py切换到NVIDIA路由替代RouteLLM BERT二分类 - 移除LiteLLM依赖解决版本冲突,使用原生httpx调用 - 版本升级至v0.3.0
2026-04-18 01:21:31 +08:00
parent f9cc7973b9
commit 59c03516e4
3 changed files with 343 additions and 27 deletions
--- a/main.py
+++ b/main.py
@@ -13,7 +13,7 @@ from litellm import acompletion
 import litellm
 from config import MODEL_CONFIG, ROUTING_THRESHOLDS, DEFAULT_ROUTING, DASHSCOPE_API_KEY
-from bert_router import get_bert_router, route_with_bert
+from nvidia_router import get_nvidia_router, select_model_by_nvidia
 # 配置 LiteLLM 使用 DashScope (Qwen)
 if DASHSCOPE_API_KEY:
@@ -21,15 +21,15 @@ if DASHSCOPE_API_KEY:
    # Qwen 使用 OpenAI 兼容接口，但需要通过 api_base 指定
    litellm.api_base = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-# BERT Router 实例（延迟加载）
+# NVIDIA Router 实例（延迟加载）
-_bert_router = None
+_nvidia_router = None
 def get_router():
-    """获取 BERT Router 实例（延迟加载）"""
+    """获取 NVIDIA Router 实例（延迟加载）"""
-    global _bert_router
+    global _nvidia_router
-    if _bert_router is None:
+    if _nvidia_router is None:
-        _bert_router = get_bert_router()
+        _nvidia_router = get_nvidia_router()
-    return _bert_router
+    return _nvidia_router
 # 调用历史记录
@@ -69,8 +69,8 @@ class StatsResponse(BaseModel):
 app = FastAPI(
    title="LLM Router MVP",
-    description="基于 LiteLLM 的多提供商路由服务",
+    description="基于 LiteLLM + NVIDIA 分类器的多提供商路由服务（支持3-tier智能路由）",
-    version="0.2.0",
+    version="0.3.0",
 )
@@ -102,30 +102,26 @@ def select_model_by_length(messages: List[Message]) -> str:
        return DEFAULT_ROUTING["complex"]
-def select_model_by_bert(messages: List[Message]) -> str:
+def select_model_by_nvidia_classifier(messages: List[Message]) -> str:
    """
-    基于 BERT 分类器选择模型
+    基于 NVIDIA 多头分类器选择模型（3-tier路由）
-    BERT 输出: strong / weak
+    NVIDIA 输出: 多维度复杂度评分
    映射到 Qwen 模型:
-    - strong -> qwen-max (复杂任务)
+    - simple  -> qwen-flash (简单任务)
-    - weak -> qwen-flash (简单任务)
+    - medium  -> qwen-plus  (中等任务)
    - complex -> qwen-max   (复杂任务)
    """
    # 取最后一条用户消息作为查询
    query = messages[-1].content if messages else ""
    try:
        router = get_router()
-        complexity = router.predict(query)
+        model = router.select_model(query)
-        
+        return model
        # BERT 二分类映射到三模型
        if complexity == "strong":
            return "qwen-max"
        else:
            return "qwen-flash"
    except Exception as e:
-        # BERT 失败时回退到 token 长度策略
+        # NVIDIA 分类器失败时回退到 token 长度策略
-        print(f"BERT routing failed: {e}, falling back to token length")
+        print(f"NVIDIA routing failed: {e}, falling back to token length")
        return select_model_by_length(messages)
@@ -180,8 +176,8 @@ async def chat_completions(request: ChatRequest):
    if request.model:
        model_key = request.model
    else:
-        # 使用 BERT 智能路由（替代原来的 token 长度路由）
+        # 使用 NVIDIA 多头分类器智能路由（支持3-tier）
-        model_key = select_model_by_bert(request.messages)
+        model_key = select_model_by_nvidia_classifier(request.messages)
    # 获取 LiteLLM 模型名称
    provider_model = get_provider_model(model_key)
@@ -293,7 +289,7 @@ async def get_stats():
@app.get("/health")
 async def health_check():
    """健康检查"""
-    return {"status": "healthy", "version": "0.2.0"}
+    return {"status": "healthy", "version": "0.3.0", "router": "nvidia-3tier"}
 if __name__ == "__main__":
--- a/nvidia_router.py
+++ b/nvidia_router.py
@@ -0,0 +1,318 @@
 """
 NVIDIA Prompt Task & Complexity Classifier Router
 手动加载自定义多头模型，支持3-tier路由
 模型: nvidia/prompt-task-and-complexity-classifier (184M参数)
 架构: DeBERTa-v3-base backbone + 8个分类头
 输出: task_type(12类), creativity(3类), reasoning(2类), 
      domain_knowledge(4类), complexity_score 等多维度
 """
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, DebertaV2Model, AutoConfig
 from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download
 from typing import Dict, Optional
 import logging
 logger = logging.getLogger(__name__)
 class ClassificationHead(nn.Module):
    """单个分类头"""
    def __init__(self, input_dim: int, num_classes: int, dropout: float = 0.2):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(input_dim, num_classes)
    def forward(self, x):
        x = self.dropout(x)
        return self.fc(x)
 class NvidiaMultiHeadClassifier(nn.Module):
    """
    NVIDIA 多头分类器
    DeBERTa backbone + 8个独立分类头
    """
    def __init__(self, config):
        super().__init__()
        self.config = config
        # DeBERTa backbone
        self.backbone = DebertaV2Model.from_pretrained(
            config.base_model, 
            ignore_mismatched_sizes=True
        )
        hidden_size = 768  # DeBERTa-v3-base
        dropout = config.fc_dropout if hasattr(config, 'fc_dropout') else 0.2
        # 8个分类头 (与 state_dict 中的 head_0 ~ head_7 对应)
        target_sizes = config.target_sizes
        self.head_0 = ClassificationHead(hidden_size, target_sizes["task_type"], dropout)       # 12类
        self.head_1 = ClassificationHead(hidden_size, target_sizes["creativity_scope"], dropout)  # 3类
        self.head_2 = ClassificationHead(hidden_size, target_sizes["reasoning"], dropout)         # 2类
        self.head_3 = ClassificationHead(hidden_size, target_sizes["contextual_knowledge"], dropout) # 2类
        self.head_4 = ClassificationHead(hidden_size, target_sizes["number_of_few_shots"], dropout)  # 6类
        self.head_5 = ClassificationHead(hidden_size, target_sizes["domain_knowledge"], dropout)     # 4类
        self.head_6 = ClassificationHead(hidden_size, target_sizes["no_label_reason"], dropout)      # 1类
        self.head_7 = ClassificationHead(hidden_size, target_sizes["constraint_ct"], dropout)        # 2类
        # Head 名称映射
        self.head_names = [
            "task_type",            # head_0: 12类
            "creativity_scope",     # head_1: 3类
            "reasoning",            # head_2: 2类
            "contextual_knowledge", # head_3: 2类
            "number_of_few_shots",  # head_4: 6类
            "domain_knowledge",     # head_5: 4类
            "no_label_reason",      # head_6: 1类
            "constraint_ct",        # head_7: 2类
        ]
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # 使用 [CLS] token 的隐层
        cls_output = outputs.last_hidden_state[:, 0]
        # 各头输出
        head_outputs = {
            "task_type": self.head_0(cls_output),
            "creativity_scope": self.head_1(cls_output),
            "reasoning": self.head_2(cls_output),
            "contextual_knowledge": self.head_3(cls_output),
            "number_of_few_shots": self.head_4(cls_output),
            "domain_knowledge": self.head_5(cls_output),
            "no_label_reason": self.head_6(cls_output),
            "constraint_ct": self.head_7(cls_output),
        }
        return head_outputs
 class NvidiaComplexityRouter:
    """NVIDIA 多头分类器路由封装"""
    MODEL_NAME = "nvidia/prompt-task-and-complexity-classifier"
    # Task type 映射
    TASK_TYPE_MAP = {
        0: "Brainstorming", 1: "Chatbot", 2: "Classification",
        3: "Closed QA", 4: "Code Generation", 5: "Extraction",
        6: "Open QA", 7: "Other", 8: "Rewrite",
        9: "Summarization", 10: "Text Generation", 11: "Unknown"
    }
    # Domain knowledge 映射 
    DOMAIN_MAP = {0: "High", 1: "Low", 2: "Medium", 3: "No"}
    # Creativity 映射
    CREATIVITY_MAP = {0: "High", 1: "Low", 2: "No"}
    def __init__(self, device: str = "cpu"):
        self.device = device
        self.tokenizer = None
        self.model = None
        self.config = None
        self._initialized = False
    def initialize(self):
        """延迟加载模型"""
        if self._initialized:
            return
        logger.info(f"Loading NVIDIA classifier: {self.MODEL_NAME}")
        # 1. 加载 config
        self.config = AutoConfig.from_pretrained(self.MODEL_NAME)
        # 2. 加载 tokenizer (slow模式，兼容性好)
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME, use_fast=False)
        # 3. 构建模型并加载权重
        self.model = NvidiaMultiHeadClassifier(self.config)
        model_path = hf_hub_download(self.MODEL_NAME, "model.safetensors")
        state_dict = load_file(model_path)
        self.model.load_state_dict(state_dict, strict=False)
        self.model.to(self.device)
        self.model.eval()
        self._initialized = True
        logger.info("NVIDIA classifier loaded successfully")
    def predict(self, query: str) -> Dict:
        """
        预测查询的多维度特征
        Returns:
            {
                "tier": "simple" | "medium" | "complex",
                "complexity_score": float (0-1),
                "task_type": str,
                "domain_knowledge": str,
                "reasoning": bool,
                "creativity": str
            }
        """
        if not self._initialized:
            self.initialize()
        inputs = self.tokenizer(
            query, return_tensors="pt", truncation=True, max_length=512, padding=True
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        # 解析各头输出
        task_type_idx = torch.argmax(outputs["task_type"], dim=-1).item()
        task_type = self.TASK_TYPE_MAP.get(task_type_idx, "Unknown")
        domain_idx = torch.argmax(outputs["domain_knowledge"], dim=-1).item()
        domain = self.DOMAIN_MAP.get(domain_idx, "Unknown")
        creativity_idx = torch.argmax(outputs["creativity_scope"], dim=-1).item()
        creativity = self.CREATIVITY_MAP.get(creativity_idx, "Unknown")
        reasoning_idx = torch.argmax(outputs["reasoning"], dim=-1).item()
        needs_reasoning = reasoning_idx == 1
        # 计算综合复杂度评分 (0-1)
        complexity_score = self._compute_complexity_score(
            domain=domain,
            creativity=creativity,
            needs_reasoning=needs_reasoning,
            task_type=task_type
        )
        tier = self._score_to_tier(complexity_score)
        return {
            "tier": tier,
            "complexity_score": complexity_score,
            "task_type": task_type,
            "domain_knowledge": domain,
            "reasoning": needs_reasoning,
            "creativity": creativity,
        }
    def _compute_complexity_score(self, domain, creativity, needs_reasoning, task_type) -> float:
        """
        综合多维度计算复杂度评分 (0-1)
        权重:
        - domain_knowledge: 40% (High=1.0, Medium=0.6, Low=0.3, No=0.0)
        - reasoning: 30% (Yes=1.0, No=0.0)
        - creativity: 20% (High=1.0, Low=0.4, No=0.0)
        - task_type: 10% (Code=0.8, QA=0.5, Chatbot=0.2, ...)
        """
        domain_scores = {"High": 1.0, "Medium": 0.6, "Low": 0.3, "No": 0.0}
        creativity_scores = {"High": 1.0, "Low": 0.4, "No": 0.0}
        task_complexity = {
            "Code Generation": 0.8, "Text Generation": 0.7,
            "Summarization": 0.6, "Rewrite": 0.5,
            "Open QA": 0.5, "Closed QA": 0.4,
            "Classification": 0.3, "Extraction": 0.3,
            "Brainstorming": 0.6, "Chatbot": 0.2,
            "Other": 0.5, "Unknown": 0.5,
        }
        score = (
            0.4 * domain_scores.get(domain, 0.5) +
            0.3 * (1.0 if needs_reasoning else 0.0) +
            0.2 * creativity_scores.get(creativity, 0.5) +
            0.1 * task_complexity.get(task_type, 0.5)
        )
        return round(score, 3)
    def _score_to_tier(self, score: float) -> str:
        if score < 0.35:
            return "simple"
        elif score < 0.65:
            return "medium"
        else:
            return "complex"
    def select_model(self, query: str) -> str:
        """直接返回推荐的模型名称"""
        result = self.predict(query)
        model_map = {
            "simple": "qwen-flash",
            "medium": "qwen-plus",
            "complex": "qwen-max"
        }
        return model_map[result["tier"]]
    def benchmark(self, queries: list) -> Dict:
        """批量测试"""
        import time
        results = []
        for query in queries:
            start = time.time()
            result = self.predict(query)
            elapsed = (time.time() - start) * 1000
            results.append({
                "query": query[:50],
                "tier": result["tier"],
                "score": result["complexity_score"],
                "task": result["task_type"],
                "domain": result["domain_knowledge"],
                "reasoning": result["reasoning"],
                "time_ms": round(elapsed, 1)
            })
        times = [r["time_ms"] for r in results]
        return {
            "avg_ms": round(sum(times) / len(times), 1),
            "results": results
        }
 # 全局单例
 _router_instance: Optional[NvidiaComplexityRouter] = None
 def get_nvidia_router() -> NvidiaComplexityRouter:
    global _router_instance
    if _router_instance is None:
        _router_instance = NvidiaComplexityRouter()
    return _router_instance
 def select_model_by_nvidia(query: str) -> str:
    return get_nvidia_router().select_model(query)
 if __name__ == "__main__":
    test_queries = [
        "你好",
        "What is 2+2?",
        "Explain quantum computing principles in detail",
        "Write a quicksort algorithm in Python with error handling",
        "Analyze this 10-page research paper and summarize the key innovations",
        "Rewrite this sentence to be more concise",
        "Generate a creative story about a robot",
    ]
    router = NvidiaComplexityRouter()
    print("=" * 80)
    print("NVIDIA Prompt Task & Complexity Classifier - 3-Tier Router Test")
    print("=" * 80)
    for query in test_queries:
        result = router.predict(query)
        model = router.select_model(query)
        print(f"\nQuery: {query}")
        print(f"  Tier:       {result['tier']}")
        print(f"  Score:      {result['complexity_score']}")
        print(f"  Task:       {result['task_type']}")
        print(f"  Domain:     {result['domain_knowledge']}")
        print(f"  Reasoning:  {result['reasoning']}")
        print(f"  Creativity: {result['creativity']}")
        print(f"  -> Model:   {model}")
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,7 @@ httpx>=0.25.0
 python-dotenv>=1.0.0
 transformers>=4.30.0
 torch>=2.0.0
 # NVIDIA Multi-head Classifier for 3-tier routing
 # nvidia/prompt-task-and-complexity-classifier will be loaded via transformers
 pytest>=7.4.0
 pytest-asyncio>=0.21.0