Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
05d839c1
Unverified
Commit
05d839c1
authored
Aug 29, 2025
by
Raghavan
Committed by
GitHub
Aug 28, 2025
Browse files
Fix(async): Add support for truncate_prompt_tokens in AsyncLLM (#23800)
parent
6597d7a4
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
0 deletions
+20
-0
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+20
-0
No files found.
vllm/v1/engine/async_llm.py
View file @
05d839c1
...
...
@@ -15,6 +15,7 @@ import vllm.envs as envs
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.utils
import
_validate_truncation_size
from
vllm.envs
import
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
from
vllm.inputs
import
PromptType
from
vllm.inputs.preprocess
import
InputPreprocessor
...
...
@@ -348,6 +349,15 @@ class AsyncLLM(EngineClient):
# to handle startup failure gracefully in the OpenAI server.
self
.
_run_output_handler
()
tokenization_kwargs
:
dict
[
str
,
Any
]
=
{}
truncate_prompt_tokens
=
sampling_params
.
truncate_prompt_tokens
_validate_truncation_size
(
self
.
model_config
.
max_model_len
,
truncate_prompt_tokens
,
tokenization_kwargs
,
)
q
=
await
self
.
add_request
(
request_id
,
prompt
,
...
...
@@ -355,6 +365,7 @@ class AsyncLLM(EngineClient):
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
priority
,
tokenization_kwargs
=
tokenization_kwargs
,
data_parallel_rank
=
data_parallel_rank
,
)
...
...
@@ -481,6 +492,7 @@ class AsyncLLM(EngineClient):
lora_request
:
Optional
[
LoRARequest
]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
priority
:
int
=
0
,
truncate_prompt_tokens
:
Optional
[
int
]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
AsyncGenerator
[
PoolingRequestOutput
,
None
]:
"""
...
...
@@ -503,6 +515,14 @@ class AsyncLLM(EngineClient):
# to handle startup failure gracefully in the OpenAI server.
self
.
_run_output_handler
()
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
dict
[
str
,
Any
]()
_validate_truncation_size
(
self
.
model_config
.
max_model_len
,
truncate_prompt_tokens
,
tokenization_kwargs
,
)
q
=
await
self
.
add_request
(
request_id
,
prompt
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment