[Benchmark] Don't default to `temperature==0` in `vllm bench serve` (#32723)

Signed-off-by: Nick Hill <nickhill123@gmail.com>

[Benchmark] Don't default to `temperature==0` in `vllm bench serve` (#32723)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
098b2d66 · Nick Hill · GitHub · 8ebf271b · 098b2d66 · 098b2d66
Unverified commit 098b2d66, authored Jan 22, 2026 by Nick Hill; committed by GitHub on Jan 22, 2026.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 7 additions and 6 deletions

vllm/benchmarks/lib/endpoint_request_func.py vllm/benchmarks/lib/endpoint_request_func.py +0 -3

vllm/benchmarks/serve.py vllm/benchmarks/serve.py +7 -3

No files found.
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -160,7 +160,6 @@ async def async_request_openai_completions(
        if request_func_input.model_name
        else request_func_input.model,
        "prompt": request_func_input.prompt,
-        "temperature": 0.0,
        "repetition_penalty": 1.0,
        "max_tokens": request_func_input.output_len,
        "logprobs": request_func_input.logprobs,
@@ -294,7 +293,6 @@ async def async_request_openai_chat_completions(
        "messages": [
            {"role": "user", "content": content},
        ],
-        "temperature": 0.0,
        "max_completion_tokens": request_func_input.output_len,
        "stream": True,
        "stream_options": {
@@ -389,7 +387,6 @@ async def async_request_openai_audio(
        "model": request_func_input.model_name
        if request_func_input.model_name
        else request_func_input.model,
-        "temperature": 0.0,
        "max_completion_tokens": request_func_input.output_len,
        "stream": True,
        "language": "en",

--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1419,8 +1419,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
        type=float,
        default=None,
        help="Temperature sampling parameter. Only has effect on "
-        "openai-compatible backends. If not specified, default to greedy "
+        "openai-compatible backends.",
-        "decoding (i.e. temperature==0.0).",
    )
    sampling_group.add_argument(
        "--frequency-penalty",
@@ -1634,7 +1633,12 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
            )
        if "temperature" not in sampling_params:
-            sampling_params["temperature"] = 0.0  # Default to greedy decoding.
+            print(
+                "WARNING: vllm bench serve no longer sets temperature==0 (greedy) "
+                "in requests by default. The default will be determined on the "
+                "server side and can be model/API specific. "
+                "For the old behavior, include --temperature=0."
+            )
        default_percentile_metrics = "ttft,tpot,itl"
    else: