feat: Add ignore_eos/nvext support for legacy completions (#1080)

7275d496 · Ryan McCormick · GitHub · b813befa · 7275d496
Unverified Commit 7275d496 authored May 14, 2025 by Ryan McCormick Committed by GitHub May 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 0 deletions

examples/tensorrt_llm/components/processor.py examples/tensorrt_llm/components/processor.py +13 -0

No files found.
--- a/examples/tensorrt_llm/components/processor.py
+++ b/examples/tensorrt_llm/components/processor.py
@@ -174,5 +174,18 @@ class Processor(ChatProcessorMixin):

    @dynamo_endpoint(name="completions")
    async def completions(self, raw_request: DynamoTRTLLMCompletionRequest):
+        # min_tokens isn't currently propagated through the Rust OpenAI HTTP frontend,
+        # and ignore_eos is passed through the 'nvext' field, so set both when found.
+        if raw_request.nvext:
+            ignore_eos = raw_request.nvext.get("ignore_eos")
+            raw_request.ignore_eos = ignore_eos
+            # NOTE(review): presumably None above when 'nvext' lacks the key; confirm intent.
+            # If ignore_eos is truthy, pin min_tokens to max_tokens for the full expected OSL.
+            if ignore_eos:
+                logger.debug(
+                    f"[preprocessor] `ignore_eos` detected, setting `min_tokens` to `max_tokens`: {raw_request.max_tokens}"
+                )
+                raw_request.min_tokens = raw_request.max_tokens
+
        async for response in self._generate(raw_request, RequestType.COMPLETION):
            yield response