"git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "1a42e442815b4f9c2774ea4ddce229c36ec83378"
Unverified commit 0a894cc3, authored by Ryan McCormick, committed by GitHub
Browse files

fix: Check nvext for ignore_eos and set min_tokens for benchmark consistency (#988)

parent dc3ae2b7
...@@ -25,6 +25,8 @@ osl=150
 # Concurrency levels to test
 for concurrency in 1 2 4 8 16 32 64 128 256; do
+  # NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
+  # `ignore_eos` since they are not in the official OpenAI spec.
   genai-perf profile \
     --model ${model} \
     --tokenizer ${model} \
...@@ -40,6 +42,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
     --extra-inputs max_tokens:${osl} \
     --extra-inputs min_tokens:${osl} \
     --extra-inputs ignore_eos:true \
+    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
     --concurrency ${concurrency} \
     --request-count $(($concurrency*10)) \
     --warmup-request-count $(($concurrency*2)) \
......
...@@ -156,6 +156,20 @@ class Processor(ChatProcessorMixin):
             raise ValueError(
                 "max_tokens and max_completion_tokens must be the same"
             )
+        # min_tokens isn't currently propagated through the Rust OpenAI HTTP frontend,
+        # and ignore_eos is passed through the 'nvext' field, so set both when found.
+        if raw_request.nvext:
+            ignore_eos = raw_request.nvext.get("ignore_eos")
+            raw_request.ignore_eos = ignore_eos
+            # If ignore_eos is True, set min_tokens to max_tokens to guarantee
+            # the full expected OSL for consistent benchmarking purposes.
+            if ignore_eos:
+                logger.debug(
+                    f"[preprocessor] `ignore_eos` detected, setting `min_tokens` to `max_completion_tokens`: {raw_request.max_completion_tokens}"
+                )
+                raw_request.min_tokens = raw_request.max_completion_tokens
         async for response in self._generate(raw_request, RequestType.CHAT):
             yield response
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment