diff --git a/main.py b/main.py index 0ed6433..ebbae13 100644 --- a/main.py +++ b/main.py @@ -291,13 +291,18 @@ async def chat_completions(request: ChatCompletionRequest): # 3. 非流式响应 start_time = time.time() + + # 构建请求参数(过滤掉 None 和 0 的 max_tokens) + completion_kwargs = { + "model": provider_model, + "messages": messages_raw, + "temperature": request.temperature, + } + if request.max_tokens and request.max_tokens > 0: + completion_kwargs["max_tokens"] = request.max_tokens + try: - response = await acompletion( - model=provider_model, - messages=messages_raw, - temperature=request.temperature, - max_tokens=request.max_tokens, - ) + response = await acompletion(**completion_kwargs) latency_ms = (time.time() - start_time) * 1000 input_tokens = response.usage.prompt_tokens @@ -338,13 +343,17 @@ async def _stream_response( output_tokens = 0 try: - response = await acompletion( - model=provider_model, - messages=messages_raw, - temperature=request.temperature, - max_tokens=request.max_tokens, - stream=True, - ) + # 构建请求参数(过滤掉 None 和 0 的 max_tokens) + completion_kwargs = { + "model": provider_model, + "messages": messages_raw, + "temperature": request.temperature, + "stream": True, + } + if request.max_tokens and request.max_tokens > 0: + completion_kwargs["max_tokens"] = request.max_tokens + + response = await acompletion(**completion_kwargs) async for chunk in response: delta = chunk.choices[0].delta