Unverified commit 382fa2c2, authored by Ethan-ES and committed by GitHub
Browse files

fix: prevent null second token output & reduce time to second token l… (#1861)


Co-authored-by: root <root@H20-GPU-24.cm.cluster>
parent 83345419
...@@ -314,7 +314,6 @@ class BaseTensorrtLLMEngine: ...@@ -314,7 +314,6 @@ class BaseTensorrtLLMEngine:
): ):
yield remote_prefill_response yield remote_prefill_response
return return
num_output_tokens_so_far = len(remote_prefill_response["token_ids"])
# Decode the disaggregated params from the remote prefill response # Decode the disaggregated params from the remote prefill response
# Decode the disaggregated params from the remote prefill response # Decode the disaggregated params from the remote prefill response
...@@ -326,11 +325,6 @@ class BaseTensorrtLLMEngine: ...@@ -326,11 +325,6 @@ class BaseTensorrtLLMEngine:
) )
) )
# Send the first token response to the client
first_token_response = remote_prefill_response
first_token_response.pop("disaggregated_params")
yield first_token_response
# Set the disaggregated params to generation_only for the rest of the generation # Set the disaggregated params to generation_only for the rest of the generation
disaggregated_params.request_type = "generation_only" disaggregated_params.request_type = "generation_only"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment