feat(stats): 完善调用记录详情并持久化到JSONL文件

- log_call保存完整request/routing/llm三层数据（含NVIDIA分类原始输出）
- 新增/stats/raw接口返回原始调用记录（支持分页）
- /stats摘要新增tier_distribution、task_type_distribution、avg_routing_ms
- 调用历史持久化到data/call_history.jsonl，重启自动恢复
- data/目录加入.gitignore
2026-04-18 01:58:33 +08:00
parent a247df34a5
commit 1e273e3670
2 changed files with 186 additions and 76 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,9 @@ __pycache__/
 .env
 .venv

+# Data (call history logs)
+data/
+
 # IDE
 .vscode/
 .idea/
--- a/main.py
+++ b/main.py
@@ -4,8 +4,11 @@ MVP版 LLM 路由服务
 支持: OpenAI, Anthropic, Gemini, Ollama 等 100+ 提供商
 """
 import time
+import json
+import os
 import tiktoken
 from typing import List, Dict, Any, Optional
+from pathlib import Path

 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
@@ -32,9 +35,29 @@ def get_router():
    return _nvidia_router


-# 调用历史记录
+# 调用历史 - JSON 文件持久化
+CALL_LOG_DIR = Path(__file__).parent / "data"
+CALL_LOG_DIR.mkdir(exist_ok=True)
+CALL_LOG_FILE = CALL_LOG_DIR / "call_history.jsonl"
+
+# 内存缓存（启动时从文件加载）
 call_history: List[Dict[str, Any]] = []

+def _load_history():
+    """启动时从 JSONL 文件加载历史记录"""
+    if CALL_LOG_FILE.exists():
+        with open(CALL_LOG_FILE, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    try:
+                        call_history.append(json.loads(line))
+                    except json.JSONDecodeError:
+                        continue
+    print(f"Loaded {len(call_history)} historical records from {CALL_LOG_FILE}")
+
+_load_history()
+

 class Message(BaseModel):
    role: str
@@ -58,15 +81,6 @@ class ChatResponse(BaseModel):
    latency_ms: float


-class StatsResponse(BaseModel):
-    total_calls: int
-    total_cost_usd: float
-    avg_latency_ms: float
-    model_distribution: Dict[str, int]
-    provider_distribution: Dict[str, int]
-    recent_calls: List[Dict[str, Any]]
-
-
 app = FastAPI(
    title="LLM Router MVP",
    description="基于 LiteLLM + NVIDIA 分类器的多提供商路由服务（支持3-tier智能路由）",
@@ -102,27 +116,46 @@ def select_model_by_length(messages: List[Message]) -> str:
        return DEFAULT_ROUTING["complex"]


-def select_model_by_nvidia_classifier(messages: List[Message]) -> str:
+def select_model_by_nvidia_classifier(messages: List[Message]) -> tuple:
    """
    基于 NVIDIA 多头分类器选择模型（3-tier路由）
    
-    NVIDIA 输出: 多维度复杂度评分
-    映射到 Qwen 模型:
-    - simple  -> qwen-flash (简单任务)
-    - medium  -> qwen-plus  (中等任务)
-    - complex -> qwen-max   (复杂任务)
+    Returns:
+        (model_key, routing_detail) - 模型名称 + 路由分类细节
    """
-    # 取最后一条用户消息作为查询
    query = messages[-1].content if messages else ""
    
    try:
        router = get_router()
-        model = router.select_model(query)
-        return model
+        start = time.time()
+        result = router.predict(query)
+        routing_ms = (time.time() - start) * 1000
+        
+        model_map = {"simple": "qwen-flash", "medium": "qwen-plus", "complex": "qwen-max"}
+        model_key = model_map[result["tier"]]
+        
+        routing_detail = {
+            "method": "nvidia_classifier",
+            "query": query,
+            "routing_latency_ms": round(routing_ms, 2),
+            "tier": result["tier"],
+            "complexity_score": result["complexity_score"],
+            "task_type": result["task_type"],
+            "domain_knowledge": result["domain_knowledge"],
+            "reasoning": result["reasoning"],
+            "creativity": result["creativity"],
+        }
+        return model_key, routing_detail
    except Exception as e:
-        # NVIDIA 分类器失败时回退到 token 长度策略
        print(f"NVIDIA routing failed: {e}, falling back to token length")
-        return select_model_by_length(messages)
+        model_key = select_model_by_length(messages)
+        routing_detail = {
+            "method": "fallback_token_length",
+            "query": query,
+            "routing_latency_ms": 0,
+            "error": str(e),
+        }
+        return model_key, routing_detail


 def get_provider_model(model_key: str) -> str:
@@ -154,61 +187,114 @@ def get_provider_from_model(model_name: str) -> str:
    return "unknown"


-def log_call(model: str, provider: str, cost: float, latency_ms: float, tokens: int):
-    """记录调用历史"""
-    call_history.append({
-        "model": model,
-        "provider": provider,
-        "cost_usd": cost,
-        "latency_ms": latency_ms,
-        "tokens": tokens,
+def log_call(
+    model: str,
+    provider: str,
+    cost: float,
+    latency_ms: float,
+    input_tokens: int,
+    output_tokens: int,
+    messages: List[Dict[str, str]],
+    response_content: str,
+    response_id: str,
+    routing_detail: Optional[Dict[str, Any]],
+    request_params: Dict[str, Any],
+):
+    """记录完整调用历史（含路由细节 + LLM 原始数据，供后续调优）"""
+    record = {
        "timestamp": time.time(),
-    })
+        # 请求
+        "request": {
+            "messages": messages,
+            "temperature": request_params.get("temperature"),
+            "max_tokens": request_params.get("max_tokens"),
+            "user_specified_model": request_params.get("user_specified_model"),
+        },
+        # 路由决策
+        "routing": routing_detail,
+        # LLM 调用
+        "llm": {
+            "model": model,
+            "provider": provider,
+            "response_id": response_id,
+            "response_content": response_content,
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "total_tokens": input_tokens + output_tokens,
+            "cost_usd": cost,
+            "llm_latency_ms": round(latency_ms, 2),
+        },
+    }
+    call_history.append(record)
+    
+    # 追加写入 JSONL 文件
+    with open(CALL_LOG_FILE, "a", encoding="utf-8") as f:
+        f.write(json.dumps(record, ensure_ascii=False) + "\n")


@app.post("/v1/chat/completions", response_model=ChatResponse)
 async def chat_completions(request: ChatRequest):
    """
    聊天完成接口
-    如果 request.model 未指定，则根据 token 长度自动路由
+    如果 request.model 未指定，则使用 NVIDIA 分类器智能路由
    """
-    # 选择模型
+    routing_detail = None
+    
    if request.model:
        model_key = request.model
+        routing_detail = {
+            "method": "user_specified",
+            "query": request.messages[-1].content if request.messages else "",
+        }
    else:
-        # 使用 NVIDIA 多头分类器智能路由（支持3-tier）
-        model_key = select_model_by_nvidia_classifier(request.messages)
+        model_key, routing_detail = select_model_by_nvidia_classifier(request.messages)
    
    # 获取 LiteLLM 模型名称
    provider_model = get_provider_model(model_key)
    provider = get_provider_from_model(provider_model)
+    messages_raw = [{"role": m.role, "content": m.content} for m in request.messages]
    
    start_time = time.time()
    
    try:
-        # 使用 LiteLLM 统一调用
        response = await acompletion(
            model=provider_model,
-            messages=[{"role": m.role, "content": m.content} for m in request.messages],
+            messages=messages_raw,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )
        
        latency_ms = (time.time() - start_time) * 1000
        
-        # 计算成本
        input_tokens = response.usage.prompt_tokens
        output_tokens = response.usage.completion_tokens
        cost = calculate_cost(model_key, input_tokens, output_tokens)
+        response_content = response.choices[0].message.content
        
-        # 记录调用
-        log_call(model_key, provider, cost, latency_ms, input_tokens + output_tokens)
+        # 记录完整调用数据
+        log_call(
+            model=model_key,
+            provider=provider,
+            cost=cost,
+            latency_ms=latency_ms,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            messages=messages_raw,
+            response_content=response_content,
+            response_id=response.id,
+            routing_detail=routing_detail,
+            request_params={
+                "temperature": request.temperature,
+                "max_tokens": request.max_tokens,
+                "user_specified_model": request.model,
+            },
+        )
        
        return ChatResponse(
            id=response.id,
            model=model_key,
            provider=provider,
-            content=response.choices[0].message.content,
+            content=response_content,
            usage={
                "prompt_tokens": input_tokens,
                "completion_tokens": output_tokens,
@@ -238,52 +324,73 @@ async def list_models():
    }


-@app.get("/stats", response_model=StatsResponse)
+@app.get("/stats")
 async def get_stats():
-    """获取调用统计"""
+    """获取调用统计摘要"""
    if not call_history:
-        return StatsResponse(
-            total_calls=0,
-            total_cost_usd=0.0,
-            avg_latency_ms=0.0,
-            model_distribution={},
-            provider_distribution={},
-            recent_calls=[],
-        )
+        return {
+            "total_calls": 0,
+            "total_cost_usd": 0.0,
+            "avg_latency_ms": 0.0,
+            "model_distribution": {},
+            "tier_distribution": {},
+            "task_type_distribution": {},
+        }
    
    total_calls = len(call_history)
-    total_cost = sum(c["cost_usd"] for c in call_history)
-    avg_latency = sum(c["latency_ms"] for c in call_history) / total_calls
+    total_cost = sum(c["llm"]["cost_usd"] for c in call_history)
+    avg_latency = sum(c["llm"]["llm_latency_ms"] for c in call_history) / total_calls
    
-    # 模型分布
    model_dist: Dict[str, int] = {}
-    provider_dist: Dict[str, int] = {}
+    tier_dist: Dict[str, int] = {}
+    task_dist: Dict[str, int] = {}
+    
    for call in call_history:
-        model = call["model"]
-        provider = call["provider"]
+        model = call["llm"]["model"]
        model_dist[model] = model_dist.get(model, 0) + 1
-        provider_dist[provider] = provider_dist.get(provider, 0) + 1
        
-    # 最近 10 条记录
-    recent = [
-        {
-            "model": c["model"],
-            "provider": c["provider"],
-            "cost_usd": round(c["cost_usd"], 6),
-            "latency_ms": round(c["latency_ms"], 2),
-            "tokens": c["tokens"],
-        }
-        for c in call_history[-10:]
-    ]
+        routing = call.get("routing") or {}
+        if routing.get("tier"):
+            tier = routing["tier"]
+            tier_dist[tier] = tier_dist.get(tier, 0) + 1
+        if routing.get("task_type"):
+            task = routing["task_type"]
+            task_dist[task] = task_dist.get(task, 0) + 1
    
-    return StatsResponse(
-        total_calls=total_calls,
-        total_cost_usd=round(total_cost, 6),
-        avg_latency_ms=round(avg_latency, 2),
-        model_distribution=model_dist,
-        provider_distribution=provider_dist,
-        recent_calls=recent,
-    )
+    return {
+        "total_calls": total_calls,
+        "total_cost_usd": round(total_cost, 6),
+        "avg_latency_ms": round(avg_latency, 2),
+        "avg_routing_ms": round(
+            sum(c.get("routing", {}).get("routing_latency_ms", 0) for c in call_history) / total_calls, 2
+        ),
+        "model_distribution": model_dist,
+        "tier_distribution": tier_dist,
+        "task_type_distribution": task_dist,
+    }
+
+
+@app.get("/stats/raw")
+async def get_stats_raw(limit: int = 50, offset: int = 0):
+    """
+    获取原始调用记录（含路由分类细节 + LLM 完整数据）
+    用于后续调优和分析
+    
+    参数:
+    - limit: 返回条数（默认50）
+    - offset: 偏移量（默认0，从最新开始）
+    """
+    total = len(call_history)
+    # 倒序返回（最新在前）
+    records = list(reversed(call_history))
+    page = records[offset:offset + limit]
+    
+    return {
+        "total": total,
+        "limit": limit,
+        "offset": offset,
+        "records": page,
+    }


@app.get("/health")