fix: prevent null second token output & reduce time to second token l… (#1861)

Co-authored-by: root <root@H20-GPU-24.cm.cluster>

fix: prevent null second token output & reduce time to second token l… (#1861)
Co-authored-by: root <root@H20-GPU-24.cm.cluster>
382fa2c2 · Ethan-ES · GitHub · 83345419 · 382fa2c2
Unverified commit 382fa2c2, authored Jul 11, 2025 by Ethan-ES; committed by GitHub on Jul 10, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 0 additions and 6 deletions

examples/tensorrt_llm/common/base_engine.py examples/tensorrt_llm/common/base_engine.py +0 -6

No files found.
--- a/examples/tensorrt_llm/common/base_engine.py
+++ b/examples/tensorrt_llm/common/base_engine.py
@@ -314,7 +314,6 @@ class BaseTensorrtLLMEngine:
            ):
                yield remote_prefill_response
                return
-            num_output_tokens_so_far = len(remote_prefill_response["token_ids"])
            # Decode the disaggregated params from the remote prefill response
            # Decode the disaggregated params from the remote prefill response
@@ -326,11 +325,6 @@ class BaseTensorrtLLMEngine:
                )
            )
-            # Send the first token response to the client
-            first_token_response = remote_prefill_response
-            first_token_response.pop("disaggregated_params")
-            yield first_token_response
            # Set the disaggregated params to generation_only for the rest of the generation
            disaggregated_params.request_type = "generation_only"