fix: Check nvext for ignore_eos and set min_tokens for benchmark consistency (#988)

0a894cc3 · Ryan McCormick · GitHub · dc3ae2b7 · 0a894cc3 · 0a894cc3
Unverified Commit 0a894cc3 authored May 07, 2025 by Ryan McCormick Committed by GitHub May 07, 2025
Show whitespace changes
Inline Side-by-side

Showing with 17 additions and 0 deletions

examples/llm/benchmarks/perf.sh examples/llm/benchmarks/perf.sh +3 -0

examples/tensorrt_llm/components/processor.py examples/tensorrt_llm/components/processor.py +14 -0

No files found.
--- a/examples/llm/benchmarks/perf.sh
+++ b/examples/llm/benchmarks/perf.sh
@@ -25,6 +25,8 @@ osl=150
 # Concurrency levels to test
 for concurrency in 1 2 4 8 16 32 64 128 256; do

+  # NOTE: For the Dynamo HTTP OpenAI frontend, use `nvext` for fields like
+  # `ignore_eos` since they are not in the official OpenAI spec.
  genai-perf profile \
    --model ${model} \
    --tokenizer ${model} \
@@ -40,6 +42,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
    --extra-inputs max_tokens:${osl} \
    --extra-inputs min_tokens:${osl} \
    --extra-inputs ignore_eos:true \
+    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
    --concurrency ${concurrency} \
    --request-count $(($concurrency*10)) \
    --warmup-request-count $(($concurrency*2)) \

--- a/examples/tensorrt_llm/components/processor.py
+++ b/examples/tensorrt_llm/components/processor.py
@@ -156,6 +156,20 @@ class Processor(ChatProcessorMixin):
                    raise ValueError(
                        "max_tokens and max_completion_tokens must be the same"
                    )
+
+        # min_tokens isn't currently propagated through the Rust OpenAI HTTP frontend,
+        # and ignore_eos is passed through the 'nvext' field, so set both when found.
+        if raw_request.nvext:
+            ignore_eos = raw_request.nvext.get("ignore_eos")
+            raw_request.ignore_eos = ignore_eos
+            # If ignore_eos is True, set min_tokens to max_completion_tokens to
+            # guarantee the full expected OSL for consistent benchmarking purposes.
+            if ignore_eos:
+                logger.debug(
+                    f"[preprocessor] `ignore_eos` detected, setting `min_tokens` to `max_completion_tokens`: {raw_request.max_completion_tokens}"
+                )
+                raw_request.min_tokens = raw_request.max_completion_tokens
+
        async for response in self._generate(raw_request, RequestType.CHAT):
            yield response