From 508118cc50134c768e88cb3aa65a539a82ad1a7e Mon Sep 17 00:00:00 2001 From: aszerW Date: Sun, 19 Apr 2026 00:58:51 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=20max=5Ftokens=20?= =?UTF-8?q?=E4=B8=BA=200=20=E6=88=96=20None=20=E6=97=B6=E5=93=8D=E5=BA=94?= =?UTF-8?q?=E5=86=85=E5=AE=B9=E8=A2=AB=E6=88=AA=E6=96=AD=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 问题: Swagger UI 测试时 max_tokens 默认值为 0,导致 DashScope API 返回的响应内容只有 1 个 token(被截断) 修复: - 非流式和流式响应中,当 max_tokens 为 None 或 ≤0 时不传给后端 API - 让 DashScope 使用自己的默认 max_tokens 值(通常 2048/4096) - 使用 completion_kwargs 字典动态构建请求参数 效果: - Swagger UI 中 max_tokens 留空或设为 0 都能返回完整响应 - 需要限制输出时可手动设置合理的 max_tokens 值 --- main.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index 0ed6433..ebbae13 100644 --- a/main.py +++ b/main.py @@ -291,13 +291,18 @@ async def chat_completions(request: ChatCompletionRequest): # 3. 非流式响应 start_time = time.time() + + # 构建请求参数(过滤掉 None 和 0 的 max_tokens) + completion_kwargs = { + "model": provider_model, + "messages": messages_raw, + "temperature": request.temperature, + } + if request.max_tokens and request.max_tokens > 0: + completion_kwargs["max_tokens"] = request.max_tokens + try: - response = await acompletion( - model=provider_model, - messages=messages_raw, - temperature=request.temperature, - max_tokens=request.max_tokens, - ) + response = await acompletion(**completion_kwargs) latency_ms = (time.time() - start_time) * 1000 input_tokens = response.usage.prompt_tokens @@ -338,13 +343,17 @@ async def _stream_response( output_tokens = 0 try: - response = await acompletion( - model=provider_model, - messages=messages_raw, - temperature=request.temperature, - max_tokens=request.max_tokens, - stream=True, - ) + # 构建请求参数(过滤掉 None 和 0 的 max_tokens) + completion_kwargs = { + "model": provider_model, + "messages": messages_raw, + "temperature": request.temperature, + "stream": True, + } + if request.max_tokens and request.max_tokens > 0: + completion_kwargs["max_tokens"] = request.max_tokens + + response = await acompletion(**completion_kwargs) async for chunk in response: delta = chunk.choices[0].delta