Unverified commit 599e4335, authored by mgazz and committed by GitHub
Browse files

Support benchmarking of Geospatial models (#33922)


Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>
parent a1946570
...@@ -2072,32 +2072,38 @@ class CustomDataset(BenchmarkDataset): ...@@ -2072,32 +2072,38 @@ class CustomDataset(BenchmarkDataset):
break break
prompt = item["prompt"] prompt = item["prompt"]
new_output_len = output_len if tokenizer is None:
if output_len is None or output_len == -1: new_output_len = 1
# check that the request has an 'output_tokens' field else:
if "output_tokens" not in item: new_output_len = output_len
raise ValueError( if output_len is None or output_len == -1:
"If no output length is provided the " # check that the request has an 'output_tokens' field
"custom dataset must contain an 'output_tokens' field." if "output_tokens" not in item:
raise ValueError(
"If no output length is provided the "
"custom dataset must contain an 'output_tokens' field."
)
# Use number of output tokens from the request data
try:
new_output_len = int(item["output_tokens"])
except (ValueError, TypeError) as e:
raise ValueError(
f"Invalid value for 'output_tokens' in custom dataset: "
f"'{item['output_tokens']}'. Must be an integer."
) from e
if tokenizer is None:
prompt_len = 1
else:
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
) )
# Use number of output tokens from the request data
try:
new_output_len = int(item["output_tokens"])
except (ValueError, TypeError) as e:
raise ValueError(
f"Invalid value for 'output_tokens' in custom dataset: "
f"'{item['output_tokens']}'. Must be an integer."
) from e
# apply template prompt_len = len(tokenizer(prompt).input_ids)
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
......
...@@ -746,6 +746,37 @@ async def async_request_infinity_embeddings_clip( ...@@ -746,6 +746,37 @@ async def async_request_infinity_embeddings_clip(
) )
async def async_request_vllm_pooling(
    request_func_input: RequestFuncInput,
    session: aiohttp.ClientSession,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    """Build and dispatch one request against the vLLM Pooling API.

    The request body starts from two default fields (``model`` and
    ``truncate_prompt_tokens``) and is then merged with
    ``request_func_input.prompt`` using the dict union operator, so any key
    present in the prompt overrides the default of the same name.

    Args:
        request_func_input: Request description. This function reads
            ``api_url``, ``model_name``, ``model`` and ``prompt`` from it and
            forwards the whole object to the shared payload/header helpers.
            ``prompt`` is merged with ``|`` and is therefore expected to be a
            dict of request fields rather than a plain string.
        session: HTTP client session forwarded to ``_run_pooling_request``.
        pbar: Optional progress bar forwarded to ``_run_pooling_request``.

    Returns:
        Whatever ``_run_pooling_request`` returns for this request.
    """
    endpoint = request_func_input.api_url
    _validate_api_url(endpoint, "vLLM Pooling API", "pooling")

    # Prefer the served model name when one is set; otherwise use the model id.
    served_model = request_func_input.model_name or request_func_input.model
    default_fields = {
        "model": served_model,
        "truncate_prompt_tokens": -1,
    }

    # Right-hand operand wins on key collisions: prompt fields take precedence.
    request_body = default_fields | request_func_input.prompt
    _update_payload_common(request_body, request_func_input)

    request_headers = _get_headers("application/json")
    _update_headers_common(request_headers, request_func_input)

    return await _run_pooling_request(
        session,
        endpoint,
        payload=request_body,
        headers=request_headers,
        pbar=pbar,
    )
# TODO: Add more request functions for different API protocols. # TODO: Add more request functions for different API protocols.
ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
"vllm": async_request_openai_completions, "vllm": async_request_openai_completions,
...@@ -760,6 +791,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { ...@@ -760,6 +791,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
"infinity-embeddings": async_request_infinity_embeddings, "infinity-embeddings": async_request_infinity_embeddings,
"infinity-embeddings-clip": async_request_infinity_embeddings_clip, "infinity-embeddings-clip": async_request_infinity_embeddings_clip,
# (Infinity embedding server does not support vlm2vec) # (Infinity embedding server does not support vlm2vec)
"vllm-pooling": async_request_vllm_pooling,
"vllm-rerank": async_request_vllm_rerank, "vllm-rerank": async_request_vllm_rerank,
} }
......
...@@ -423,16 +423,19 @@ def calculate_metrics( ...@@ -423,16 +423,19 @@ def calculate_metrics(
output_len = outputs[i].output_tokens output_len = outputs[i].output_tokens
if not output_len: if not output_len:
# We use the tokenizer to count the number of output tokens if tokenizer is None:
# for some serving backends instead of looking at output_len = 1
# len(outputs[i].itl) since multiple output tokens may be else:
# bundled together # We use the tokenizer to count the number of output tokens
# Note : this may inflate the output token count slightly # for some serving backends instead of looking at
output_len = len( # len(outputs[i].itl) since multiple output tokens may be
tokenizer( # bundled together
outputs[i].generated_text, add_special_tokens=False # Note : this may inflate the output token count slightly
).input_ids output_len = len(
) tokenizer(
outputs[i].generated_text, add_special_tokens=False
).input_ids
)
actual_output_lens.append(output_len) actual_output_lens.append(output_len)
total_input += input_requests[i].prompt_len total_input += input_requests[i].prompt_len
tpot = 0 tpot = 0
...@@ -919,7 +922,7 @@ async def benchmark( ...@@ -919,7 +922,7 @@ async def benchmark(
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
if isinstance(metrics, BenchmarkMetrics): if isinstance(metrics, BenchmarkMetrics) and tokenizer:
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
print( print(
"{:<40} {:<10.2f}".format( "{:<40} {:<10.2f}".format(
...@@ -933,16 +936,18 @@ async def benchmark( ...@@ -933,16 +936,18 @@ async def benchmark(
) )
) )
if isinstance(metrics, BenchmarkMetrics): if isinstance(metrics, BenchmarkMetrics):
print( if tokenizer:
"{:<40} {:<10.2f}".format( print(
"Output token throughput (tok/s):", metrics.output_throughput "{:<40} {:<10.2f}".format(
"Output token throughput (tok/s):", metrics.output_throughput
)
) )
) print(
print( "{:<40} {:<10.2f}".format(
"{:<40} {:<10.2f}".format( "Peak output token throughput (tok/s):",
"Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s metrics.max_output_tokens_per_s,
)
) )
)
print( print(
"{:<40} {:<10.2f}".format( "{:<40} {:<10.2f}".format(
"Peak concurrent requests:", metrics.max_concurrent_requests "Peak concurrent requests:", metrics.max_concurrent_requests
...@@ -954,11 +959,12 @@ async def benchmark( ...@@ -954,11 +959,12 @@ async def benchmark(
"RTFx (Inverse Real-Time Factor):", metrics.rtfx "RTFx (Inverse Real-Time Factor):", metrics.rtfx
) )
) )
print( if tokenizer:
"{:<40} {:<10.2f}".format( print(
"Total token throughput (tok/s):", metrics.total_token_throughput "{:<40} {:<10.2f}".format(
"Total token throughput (tok/s):", metrics.total_token_throughput
)
) )
)
if isinstance(metrics, BenchmarkMetrics): if isinstance(metrics, BenchmarkMetrics):
result = { result = {
...@@ -1047,7 +1053,7 @@ async def benchmark( ...@@ -1047,7 +1053,7 @@ async def benchmark(
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value result[f"p{p_word}_{metric_attribute_name}_ms"] = value
if task_type == TaskType.GENERATION: if task_type == TaskType.GENERATION and tokenizer:
process_one_metric("ttft", "TTFT", "Time to First Token") process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("itl", "ITL", "Inter-token Latency")
...@@ -1519,6 +1525,12 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -1519,6 +1525,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
type=json.loads, type=json.loads,
default=None, default=None,
) )
parser.add_argument(
"--skip-tokenizer-init",
action="store_true",
default=False,
help="Skip initialization of tokenizer and detokenizer",
)
parser.add_argument( parser.add_argument(
"--insecure", "--insecure",
...@@ -1599,14 +1611,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: ...@@ -1599,14 +1611,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
model_name = args.served_model_name model_name = args.served_model_name
model_id = args.model model_id = args.model
tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id if args.skip_tokenizer_init:
tokenizer_mode = args.tokenizer_mode tokenizer_id = None
tokenizer_mode = None
tokenizer = get_tokenizer( tokenizer = None
tokenizer_id, else:
tokenizer_mode=tokenizer_mode, tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id
trust_remote_code=args.trust_remote_code, tokenizer_mode = args.tokenizer_mode
) tokenizer = get_tokenizer(
tokenizer_id,
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code,
)
if args.dataset_name is None: if args.dataset_name is None:
raise ValueError( raise ValueError(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment