Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
599e4335
Unverified
Commit
599e4335
authored
Feb 10, 2026
by
mgazz
Committed by
GitHub
Feb 10, 2026
Browse files
Support benchmarking of Geospatial models (#33922)
Signed-off-by:
Michele Gazzetti
<
michele.gazzetti1@ibm.com
>
parent
a1946570
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
110 additions
and
56 deletions
+110
-56
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+30
-24
vllm/benchmarks/lib/endpoint_request_func.py
vllm/benchmarks/lib/endpoint_request_func.py
+32
-0
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+48
-32
No files found.
vllm/benchmarks/datasets.py
View file @
599e4335
...
@@ -2072,6 +2072,9 @@ class CustomDataset(BenchmarkDataset):
...
@@ -2072,6 +2072,9 @@ class CustomDataset(BenchmarkDataset):
break
break
prompt
=
item
[
"prompt"
]
prompt
=
item
[
"prompt"
]
if
tokenizer
is
None
:
new_output_len
=
1
else
:
new_output_len
=
output_len
new_output_len
=
output_len
if
output_len
is
None
or
output_len
==
-
1
:
if
output_len
is
None
or
output_len
==
-
1
:
# check that the request has an 'output_tokens' field
# check that the request has an 'output_tokens' field
...
@@ -2089,6 +2092,9 @@ class CustomDataset(BenchmarkDataset):
...
@@ -2089,6 +2092,9 @@ class CustomDataset(BenchmarkDataset):
f
"'
{
item
[
'output_tokens'
]
}
'. Must be an integer."
f
"'
{
item
[
'output_tokens'
]
}
'. Must be an integer."
)
from
e
)
from
e
if
tokenizer
is
None
:
prompt_len
=
1
else
:
# apply template
# apply template
if
not
skip_chat_template
:
if
not
skip_chat_template
:
prompt
=
tokenizer
.
apply_chat_template
(
prompt
=
tokenizer
.
apply_chat_template
(
...
...
vllm/benchmarks/lib/endpoint_request_func.py
View file @
599e4335
...
@@ -746,6 +746,37 @@ async def async_request_infinity_embeddings_clip(
...
@@ -746,6 +746,37 @@ async def async_request_infinity_embeddings_clip(
)
)
async def async_request_vllm_pooling(
    request_func_input: RequestFuncInput,
    session: aiohttp.ClientSession,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    """Issue a single benchmark request against the vLLM Pooling API.

    The request body starts from the model identifier (the served
    ``model_name`` when it is truthy, otherwise ``model``) together with
    ``truncate_prompt_tokens = -1``, and is then merged with
    ``request_func_input.prompt`` using the ``|`` operator. Keys coming
    from the prompt therefore win over the defaults set here.

    NOTE(review): the merge presumes ``request_func_input.prompt`` is a
    mapping (e.g. an already-structured pooling payload) rather than a
    plain string -- confirm against the dataset that feeds this backend.

    Args:
        request_func_input: Description of the request (URL, model, prompt
            and the common options consumed by the shared helpers).
        session: HTTP client session used to send the request.
        pbar: Optional progress bar forwarded to the request runner.

    Returns:
        Whatever ``_run_pooling_request`` produces for this request.
    """
    target_url = request_func_input.api_url
    _validate_api_url(target_url, "vLLM Pooling API", "pooling")

    # Defaults first; the prompt mapping is merged on top of them.
    request_body = {
        "model": request_func_input.model_name or request_func_input.model,
        "truncate_prompt_tokens": -1,
    } | request_func_input.prompt
    _update_payload_common(request_body, request_func_input)

    request_headers = _get_headers("application/json")
    _update_headers_common(request_headers, request_func_input)

    response = await _run_pooling_request(
        session,
        target_url,
        payload=request_body,
        headers=request_headers,
        pbar=pbar,
    )
    return response
# TODO: Add more request functions for different API protocols.
# TODO: Add more request functions for different API protocols.
ASYNC_REQUEST_FUNCS
:
dict
[
str
,
RequestFunc
]
=
{
ASYNC_REQUEST_FUNCS
:
dict
[
str
,
RequestFunc
]
=
{
"vllm"
:
async_request_openai_completions
,
"vllm"
:
async_request_openai_completions
,
...
@@ -760,6 +791,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
...
@@ -760,6 +791,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
"infinity-embeddings"
:
async_request_infinity_embeddings
,
"infinity-embeddings"
:
async_request_infinity_embeddings
,
"infinity-embeddings-clip"
:
async_request_infinity_embeddings_clip
,
"infinity-embeddings-clip"
:
async_request_infinity_embeddings_clip
,
# (Infinity embedding server does not support vlm2vec)
# (Infinity embedding server does not support vlm2vec)
"vllm-pooling"
:
async_request_vllm_pooling
,
"vllm-rerank"
:
async_request_vllm_rerank
,
"vllm-rerank"
:
async_request_vllm_rerank
,
}
}
...
...
vllm/benchmarks/serve.py
View file @
599e4335
...
@@ -423,6 +423,9 @@ def calculate_metrics(
...
@@ -423,6 +423,9 @@ def calculate_metrics(
output_len
=
outputs
[
i
].
output_tokens
output_len
=
outputs
[
i
].
output_tokens
if
not
output_len
:
if
not
output_len
:
if
tokenizer
is
None
:
output_len
=
1
else
:
# We use the tokenizer to count the number of output tokens
# We use the tokenizer to count the number of output tokens
# for some serving backends instead of looking at
# for some serving backends instead of looking at
# len(outputs[i].itl) since multiple output tokens may be
# len(outputs[i].itl) since multiple output tokens may be
...
@@ -919,7 +922,7 @@ async def benchmark(
...
@@ -919,7 +922,7 @@ async def benchmark(
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request rate configured (RPS):"
,
request_rate
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request rate configured (RPS):"
,
request_rate
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
benchmark_duration
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
benchmark_duration
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
if
isinstance
(
metrics
,
BenchmarkMetrics
):
if
isinstance
(
metrics
,
BenchmarkMetrics
)
and
tokenizer
:
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
print
(
print
(
"{:<40} {:<10.2f}"
.
format
(
"{:<40} {:<10.2f}"
.
format
(
...
@@ -933,6 +936,7 @@ async def benchmark(
...
@@ -933,6 +936,7 @@ async def benchmark(
)
)
)
)
if
isinstance
(
metrics
,
BenchmarkMetrics
):
if
isinstance
(
metrics
,
BenchmarkMetrics
):
if
tokenizer
:
print
(
print
(
"{:<40} {:<10.2f}"
.
format
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
...
@@ -940,7 +944,8 @@ async def benchmark(
...
@@ -940,7 +944,8 @@ async def benchmark(
)
)
print
(
print
(
"{:<40} {:<10.2f}"
.
format
(
"{:<40} {:<10.2f}"
.
format
(
"Peak output token throughput (tok/s):"
,
metrics
.
max_output_tokens_per_s
"Peak output token throughput (tok/s):"
,
metrics
.
max_output_tokens_per_s
,
)
)
)
)
print
(
print
(
...
@@ -954,6 +959,7 @@ async def benchmark(
...
@@ -954,6 +959,7 @@ async def benchmark(
"RTFx (Inverse Real-Time Factor):"
,
metrics
.
rtfx
"RTFx (Inverse Real-Time Factor):"
,
metrics
.
rtfx
)
)
)
)
if
tokenizer
:
print
(
print
(
"{:<40} {:<10.2f}"
.
format
(
"{:<40} {:<10.2f}"
.
format
(
"Total token throughput (tok/s):"
,
metrics
.
total_token_throughput
"Total token throughput (tok/s):"
,
metrics
.
total_token_throughput
...
@@ -1047,7 +1053,7 @@ async def benchmark(
...
@@ -1047,7 +1053,7 @@ async def benchmark(
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"P
{
p_word
}
{
metric_name
}
(ms):"
,
value
))
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"P
{
p_word
}
{
metric_name
}
(ms):"
,
value
))
result
[
f
"p
{
p_word
}
_
{
metric_attribute_name
}
_ms"
]
=
value
result
[
f
"p
{
p_word
}
_
{
metric_attribute_name
}
_ms"
]
=
value
if
task_type
==
TaskType
.
GENERATION
:
if
task_type
==
TaskType
.
GENERATION
and
tokenizer
:
process_one_metric
(
"ttft"
,
"TTFT"
,
"Time to First Token"
)
process_one_metric
(
"ttft"
,
"TTFT"
,
"Time to First Token"
)
process_one_metric
(
"tpot"
,
"TPOT"
,
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
"tpot"
,
"TPOT"
,
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
"itl"
,
"ITL"
,
"Inter-token Latency"
)
process_one_metric
(
"itl"
,
"ITL"
,
"Inter-token Latency"
)
...
@@ -1519,6 +1525,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -1519,6 +1525,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
type
=
json
.
loads
,
type
=
json
.
loads
,
default
=
None
,
default
=
None
,
)
)
parser
.
add_argument
(
"--skip-tokenizer-init"
,
action
=
"store_true"
,
default
=
False
,
help
=
"Skip initialization of tokenizer and detokenizer"
,
)
parser
.
add_argument
(
parser
.
add_argument
(
"--insecure"
,
"--insecure"
,
...
@@ -1599,9 +1611,13 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
...
@@ -1599,9 +1611,13 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
model_name
=
args
.
served_model_name
model_name
=
args
.
served_model_name
model_id
=
args
.
model
model_id
=
args
.
model
if
args
.
skip_tokenizer_init
:
tokenizer_id
=
None
tokenizer_mode
=
None
tokenizer
=
None
else
:
tokenizer_id
=
args
.
tokenizer
if
args
.
tokenizer
is
not
None
else
model_id
tokenizer_id
=
args
.
tokenizer
if
args
.
tokenizer
is
not
None
else
model_id
tokenizer_mode
=
args
.
tokenizer_mode
tokenizer_mode
=
args
.
tokenizer_mode
tokenizer
=
get_tokenizer
(
tokenizer
=
get_tokenizer
(
tokenizer_id
,
tokenizer_id
,
tokenizer_mode
=
tokenizer_mode
,
tokenizer_mode
=
tokenizer_mode
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment