diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index ed0fdec25..d2127f83f 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -160,7 +160,6 @@ async def async_request_openai_completions( if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, - "temperature": 0.0, "repetition_penalty": 1.0, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, @@ -294,7 +293,6 @@ async def async_request_openai_chat_completions( "messages": [ {"role": "user", "content": content}, ], - "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, "stream": True, "stream_options": { @@ -389,7 +387,6 @@ async def async_request_openai_audio( "model": request_func_input.model_name if request_func_input.model_name else request_func_input.model, - "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, "stream": True, "language": "en", diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 679d305c9..19d98f659 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1419,8 +1419,7 @@ def add_cli_args(parser: argparse.ArgumentParser): type=float, default=None, help="Temperature sampling parameter. Only has effect on " - "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).", + "openai-compatible backends.", ) sampling_group.add_argument( "--frequency-penalty", @@ -1634,7 +1633,12 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: ) if "temperature" not in sampling_params: - sampling_params["temperature"] = 0.0 # Default to greedy decoding. + print( + "WARNING: vllm bench serve no longer sets temperature==0 (greedy) " + "in requests by default. The default will be determined on the " + "server side and can be model/API specific. " + "For the old behavior, include --temperature=0." + ) default_percentile_metrics = "ttft,tpot,itl" else: