Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
44554a00
Unverified
Commit
44554a00
authored
Jul 22, 2025
by
Wang Yijun
Committed by
GitHub
Jul 22, 2025
Browse files
Add tokenization_kwargs to encode for embedding model truncation (#21033)
parent
226b452a
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
20 additions
and
3 deletions
+20
-3
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+6
-0
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+12
-3
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+2
-0
No files found.
vllm/engine/async_llm_engine.py
View file @
44554a00
...
...
@@ -438,6 +438,7 @@ class _AsyncLLMEngine(LLMEngine):
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
priority
:
int
=
0
,
data_parallel_rank
:
Optional
[
int
]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
None
:
"""
Async version of
...
...
@@ -468,6 +469,7 @@ class _AsyncLLMEngine(LLMEngine):
prompt
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
tokenization_kwargs
=
tokenization_kwargs
,
)
if
isinstance
(
params
,
SamplingParams
)
and
\
...
...
@@ -862,6 +864,7 @@ class AsyncLLMEngine(EngineClient):
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
priority
:
int
=
0
,
data_parallel_rank
:
Optional
[
int
]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
AsyncGenerator
[
Union
[
RequestOutput
,
PoolingRequestOutput
],
None
]:
if
not
self
.
is_running
:
if
self
.
start_engine_loop
:
...
...
@@ -889,6 +892,7 @@ class AsyncLLMEngine(EngineClient):
prompt_adapter_request
=
prompt_adapter_request
,
priority
=
priority
,
data_parallel_rank
=
data_parallel_rank
,
tokenization_kwargs
=
tokenization_kwargs
,
)
return
stream
.
generator
()
...
...
@@ -996,6 +1000,7 @@ class AsyncLLMEngine(EngineClient):
lora_request
:
Optional
[
LoRARequest
]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
priority
:
int
=
0
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
AsyncGenerator
[
PoolingRequestOutput
,
None
]:
"""Generate outputs for a request from a pooling model.
...
...
@@ -1070,6 +1075,7 @@ class AsyncLLMEngine(EngineClient):
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
priority
,
tokenization_kwargs
=
tokenization_kwargs
,
):
yield
LLMEngine
.
validate_output
(
output
,
PoolingRequestOutput
)
except
asyncio
.
CancelledError
:
...
...
vllm/entrypoints/llm.py
View file @
44554a00
...
...
@@ -965,6 +965,7 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
list
[
PoolingRequestOutput
]:
...
...
...
@@ -981,6 +982,7 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
list
[
PoolingRequestOutput
]:
...
...
...
@@ -997,6 +999,7 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
list
[
PoolingRequestOutput
]:
...
...
...
@@ -1014,6 +1017,7 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
list
[
PoolingRequestOutput
]:
...
...
...
@@ -1031,6 +1035,7 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
list
[
PoolingRequestOutput
]:
...
...
...
@@ -1046,6 +1051,7 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
list
[
PoolingRequestOutput
]:
...
...
...
@@ -1066,6 +1072,7 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
list
[
PoolingRequestOutput
]:
"""Apply pooling to the hidden states corresponding to the input
prompts.
...
...
@@ -1131,9 +1138,11 @@ class LLM:
for
pooling_param
in
pooling_params
:
pooling_param
.
verify
(
pooling_task
,
model_config
)
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
dict
[
str
,
Any
]()
_validate_truncation_size
(
model_config
.
max_model_len
,
truncate_prompt_tokens
,
tokenization_kwargs
)
truncate_prompt_tokens
,
tokenization_kwargs
)
self
.
_validate_and_add_requests
(
prompts
=
parsed_prompts
,
...
...
vllm/v1/engine/async_llm.py
View file @
44554a00
...
...
@@ -437,6 +437,7 @@ class AsyncLLM(EngineClient):
lora_request
:
Optional
[
LoRARequest
]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
priority
:
int
=
0
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
AsyncGenerator
[
PoolingRequestOutput
,
None
]:
"""
Main function called by the API server to kick off a request
...
...
@@ -465,6 +466,7 @@ class AsyncLLM(EngineClient):
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
priority
,
tokenization_kwargs
=
tokenization_kwargs
,
)
# The output_handler task pushes items into the queue.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment