[Benchmark] Use truncation by default for pooling benchmarks (#26992)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Benchmark] Use truncation by default for pooling benchmarks (#26992)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
17838e50 · Cyrus Leung · GitHub · 44c85556 · 17838e50
Unverified commit 17838e50, authored Oct 16, 2025 by Cyrus Leung; committed by GitHub on Oct 16, 2025
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 7 deletions

vllm/benchmarks/lib/endpoint_request_func.py vllm/benchmarks/lib/endpoint_request_func.py +9 -7

No files found.
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -527,6 +527,9 @@ async def async_request_openai_embeddings(
        if request_func_input.model_name
        else request_func_input.model,
        "input": request_func_input.prompt,
+        # Many embedding models have a short context length;
+        # truncating prompts avoids dropping some of the requests.
+        "truncate_prompt_tokens": -1,
    }
    _update_payload_common(payload, request_func_input)

@@ -564,6 +567,9 @@ async def async_request_vllm_rerank(
        else request_func_input.model,
        "query": request_func_input.prompt[0],
        "documents": request_func_input.prompt[1:],
+        # Many reranker models have a short context length;
+        # truncating prompts avoids dropping some of the requests.
+        "truncate_prompt_tokens": -1,
    }

    headers = {
@@ -599,6 +605,9 @@ async def async_request_openai_embeddings_chat(
        "messages": [
            {"role": "user", "content": content},
        ],
+        # Many embedding models have a short context length;
+        # truncating prompts avoids dropping some of the requests.
+        "truncate_prompt_tokens": -1,
    }
    _update_payload_common(payload, request_func_input)

@@ -634,13 +643,6 @@ def _preprocess_clip(request_func_input: RequestFuncInput):
        # Image input
        request_func_input.prompt = ""

-    # max_model_len=77 is too short for most datasets,
-    # so by default we truncate the prompt to max_model_len
-    if request_func_input.extra_body is None:
-        request_func_input.extra_body = {}
-    if "truncate_prompt_tokens" not in request_func_input.extra_body:
-        request_func_input.extra_body["truncate_prompt_tokens"] = -1
-

 def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
    if request_func_input.multi_modal_content: