Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
82ec66f5
Unverified
Commit
82ec66f5
authored
Jul 23, 2025
by
Michael Goin
Committed by
GitHub
Jul 23, 2025
Browse files
[V0 Deprecation] Remove Prompt Adapters (#20588)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
78c13e30
Changes
60
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
9 additions
and
560 deletions
+9
-560
vllm/prompt_adapter/request.py
vllm/prompt_adapter/request.py
+0
-37
vllm/prompt_adapter/utils.py
vllm/prompt_adapter/utils.py
+0
-98
vllm/prompt_adapter/worker_manager.py
vllm/prompt_adapter/worker_manager.py
+0
-179
vllm/sequence.py
vllm/sequence.py
+2
-37
vllm/utils/__init__.py
vllm/utils/__init__.py
+0
-5
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+1
-6
vllm/v1/engine/llm_engine.py
vllm/v1/engine/llm_engine.py
+1
-4
vllm/v1/engine/processor.py
vllm/v1/engine/processor.py
+0
-6
vllm/v1/utils.py
vllm/v1/utils.py
+0
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+0
-1
vllm/v1/worker/tpu_model_runner.py
vllm/v1/worker/tpu_model_runner.py
+0
-1
vllm/v1/worker/tpu_worker.py
vllm/v1/worker/tpu_worker.py
+0
-1
vllm/worker/enc_dec_model_runner.py
vllm/worker/enc_dec_model_runner.py
+3
-4
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+2
-149
vllm/worker/model_runner_base.py
vllm/worker/model_runner_base.py
+0
-1
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_model_runner.py
+0
-3
vllm/worker/pooling_model_runner.py
vllm/worker/pooling_model_runner.py
+0
-7
vllm/worker/utils.py
vllm/worker/utils.py
+0
-4
vllm/worker/worker.py
vllm/worker/worker.py
+0
-14
vllm/worker/worker_base.py
vllm/worker/worker_base.py
+0
-1
No files found.
vllm/prompt_adapter/request.py
deleted
100644 → 0
View file @
78c13e30
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
msgspec
from
vllm.adapter_commons.request
import
AdapterRequest
class PromptAdapterRequest(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        frozen=True):  # type: ignore[call-arg]
    """
    Immutable description of one prompt adapter to use for a request.

    The struct carries the adapter's name, integer id, local checkpoint
    path and number of virtual tokens.  The ``adapter_id``, ``name`` and
    ``local_path`` properties expose the same values under shorter,
    adapter-agnostic names.

    The declaration order of the fields is significant because the struct
    is declared ``array_like``; do not reorder them.
    """
    # NOTE: in Python 3 a ``__metaclass__`` class attribute does not change
    # the metaclass; this is only an ordinary class-level attribute.
    __metaclass__ = AdapterRequest

    prompt_adapter_name: str
    prompt_adapter_id: int
    prompt_adapter_local_path: str
    prompt_adapter_num_virtual_tokens: int

    def __hash__(self):
        """Defer to the hash implementation of the parent class."""
        return super().__hash__()

    @property
    def adapter_id(self):
        """Alias for ``prompt_adapter_id``."""
        return self.prompt_adapter_id

    @property
    def name(self):
        """Alias for ``prompt_adapter_name``."""
        return self.prompt_adapter_name

    @property
    def local_path(self):
        """Alias for ``prompt_adapter_local_path``."""
        return self.prompt_adapter_local_path
vllm/prompt_adapter/utils.py
deleted
100644 → 0
View file @
78c13e30
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
import
os
from
typing
import
Optional
import
torch
from
huggingface_hub
import
file_exists
,
hf_hub_download
from
huggingface_hub.utils
import
EntryNotFoundError
from
safetensors.torch
import
load_file
as
safe_load_file
from
vllm.platforms
import
current_platform
# File name of the adapter checkpoint that load_peft_weights reads with
# torch.load when no safetensors file is found.
WEIGHTS_NAME
=
"adapter_model.bin"
# File name of the adapter checkpoint in safetensors format; load_peft_weights
# looks for this file before falling back to WEIGHTS_NAME.
SAFETENSORS_WEIGHTS_NAME
=
"adapter_model.safetensors"
# Get current device name based on available devices
def infer_device() -> str:
    """Pick the default device name for the current platform.

    Returns:
        ``"cuda"`` when ``current_platform.is_cuda_alike()`` is true,
        otherwise ``"cpu"``.
    """
    return "cuda" if current_platform.is_cuda_alike() else "cpu"
def load_peft_weights(model_id: str,
                      device: Optional[str] = None,
                      **hf_hub_download_kwargs) -> dict:
    """Load PEFT adapter weights from a local directory or the Hugging Face Hub.

    The weights file is searched for in this order:

    1. ``SAFETENSORS_WEIGHTS_NAME`` in the local directory,
    2. ``WEIGHTS_NAME`` in the local directory,
    3. ``SAFETENSORS_WEIGHTS_NAME`` in the hub repository,
    4. ``WEIGHTS_NAME`` in the hub repository.

    Args:
        model_id (`str`):
            The local path to the adapter weights or the name of the adapter
            to load from the Hugging Face Hub.
        device (`str`, optional):
            The device to load the weights onto. When ``None`` the result of
            ``infer_device()`` is used.
        hf_hub_download_kwargs (`dict`):
            Additional arguments to pass to the `hf_hub_download` method when
            loading from the Hugging Face Hub. The ``subfolder``, ``token``,
            ``use_auth_token``, ``revision`` and ``repo_type`` entries are
            also consulted here when probing for the safetensors file.

    Returns:
        The object produced by the safetensors loader or by ``torch.load``
        for the weights file that was found.

    Raises:
        ValueError: If the hub download of ``WEIGHTS_NAME`` fails with
            ``EntryNotFoundError`` (i.e. neither weights file was found).
    """
    # Hub file names always use "/" separators, independent of the host OS.
    import posixpath

    subfolder = hf_hub_download_kwargs.get("subfolder")
    path = (os.path.join(model_id, subfolder)
            if subfolder is not None else model_id)

    if device is None:
        device = infer_device()

    local_safetensors = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
    local_bin = os.path.join(path, WEIGHTS_NAME)

    if os.path.exists(local_safetensors):
        filename = local_safetensors
        use_safetensors = True
    elif os.path.exists(local_bin):
        filename = local_bin
        use_safetensors = False
    else:
        # Nothing on disk: fall back to the hub. ``use_auth_token`` is
        # accepted as an alternative spelling of ``token``.
        token = hf_hub_download_kwargs.get("token")
        if token is None:
            token = hf_hub_download_kwargs.get("use_auth_token")

        hub_filename = (posixpath.join(subfolder, SAFETENSORS_WEIGHTS_NAME)
                        if subfolder is not None else
                        SAFETENSORS_WEIGHTS_NAME)
        has_remote_safetensors_file = file_exists(
            repo_id=model_id,
            filename=hub_filename,
            revision=hf_hub_download_kwargs.get("revision"),
            repo_type=hf_hub_download_kwargs.get("repo_type"),
            token=token,
        )
        use_safetensors = has_remote_safetensors_file

        if has_remote_safetensors_file:
            # Priority 1: load safetensors weights
            filename = hf_hub_download(
                model_id,
                SAFETENSORS_WEIGHTS_NAME,
                **hf_hub_download_kwargs,
            )
        else:
            try:
                filename = hf_hub_download(model_id, WEIGHTS_NAME,
                                           **hf_hub_download_kwargs)
            except EntryNotFoundError as err:
                # Implicit string concatenation keeps source indentation out
                # of the message; chaining preserves the underlying cause.
                raise ValueError(
                    f"Can't find weights for {model_id} in {model_id} or "
                    f"in the Hugging Face Hub. "
                    f"Please check that the file {WEIGHTS_NAME} or "
                    f"{SAFETENSORS_WEIGHTS_NAME} is present at {model_id}."
                ) from err

    if use_safetensors:
        adapters_weights = safe_load_file(filename, device=device)
    else:
        # weights_only=True restricts unpickling to tensor data.
        adapters_weights = torch.load(filename,
                                      map_location=torch.device(device),
                                      weights_only=True)

    return adapters_weights
vllm/prompt_adapter/worker_manager.py
deleted
100644 → 0
View file @
78c13e30
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
logging
from
typing
import
Any
,
Optional
,
Set
,
Type
import
torch
from
vllm.adapter_commons.utils
import
(
add_adapter_worker
,
apply_adapters_worker
,
list_adapters_worker
,
set_active_adapters_worker
)
from
vllm.adapter_commons.worker_manager
import
AbstractWorkerManager
from
vllm.config
import
PromptAdapterConfig
from
vllm.prompt_adapter.models
import
(
LRUCachePromptAdapterModelManager
,
PromptAdapterModel
,
PromptAdapterModelManager
,
create_prompt_adapter_manager
)
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
logger
=
logging
.
getLogger
(
__name__
)
class WorkerPromptAdapterManager(AbstractWorkerManager):
    """Worker-side manager for prompt_adapter models.

    On every request the requested prompt_adapters are loaded (unless they
    are already loaded) and every other prompt_adapter is unloaded.

    Most methods forward to the helper functions imported from
    ``vllm.adapter_commons.utils`` or to the underlying adapter manager that
    ``create_prompt_adapter_manager`` installs in ``self._adapter_manager``.
    """

    _manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager

    def __init__(
        self,
        max_num_seqs: int,
        max_num_batched_tokens: int,
        device: torch.device,
        prompt_adapter_config: PromptAdapterConfig,
        prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel
    ):
        """Store the configuration, then initialise the base class.

        ``self._adapter_manager`` is only declared here; it is assigned by
        ``create_prompt_adapter_manager``.
        """
        self._adapter_manager: PromptAdapterModelManager
        self.prompt_adapter_config = prompt_adapter_config
        self._prompt_adapter_model_cls = prompt_adapter_model_cls
        self.max_num_batched_tokens = max_num_batched_tokens
        self.max_num_seqs = max_num_seqs
        super().__init__(device)

    @property
    def is_enabled(self) -> bool:
        """Always ``True`` for this manager."""
        return True

    def create_prompt_adapter_manager(
        self,
        model: torch.nn.Module,
    ) -> Any:
        """Build the adapter manager for ``model`` and return its ``model``.

        The created manager (of class ``self._manager_cls``) is kept in
        ``self._adapter_manager``.
        """
        manager = create_prompt_adapter_manager(
            model,
            max_num_seqs=self.max_num_seqs,
            max_num_batched_tokens=self.max_num_batched_tokens,
            prompt_adapter_config=self.prompt_adapter_config,
            prompt_adapter_manager_cls=self._manager_cls,
        )
        self._adapter_manager = manager
        return manager.model

    def _load_adapter(
            self, prompt_adapter_request: PromptAdapterRequest
    ) -> PromptAdapterModel:
        """Load the adapter described by the request from its local path.

        Raises:
            RuntimeError: If anything goes wrong while loading; the original
                exception is attached as the cause.
        """
        try:
            return self._prompt_adapter_model_cls.from_local_checkpoint(
                prompt_adapter_request.prompt_adapter_local_path,
                prompt_adapter_id=prompt_adapter_request.prompt_adapter_id,
                num_virtual_tokens=prompt_adapter_request.
                prompt_adapter_num_virtual_tokens,
                config=self.prompt_adapter_config,
                device=str(self.device),
            )
        except Exception as e:
            raise RuntimeError(
                "Loading prompt_adapter "
                f"{prompt_adapter_request.prompt_adapter_local_path}"
                " failed") from e

    def add_dummy_prompt_adapter(
            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
        """Do nothing and report success."""
        return True

    def pin_adapter(self, adapter_id: int) -> bool:
        """Forward to the adapter manager's ``pin_adapter``."""
        return self._adapter_manager.pin_adapter(adapter_id)

    def set_active_adapters(self, requests: Set[Any],
                            mapping: Optional[Any]) -> None:
        """Forward to ``set_active_adapters_worker``.

        The helper receives ``self._apply_adapters`` and the adapter
        manager's ``set_adapter_mapping`` as callbacks.
        """
        set_active_adapters_worker(
            requests,
            mapping,
            self._apply_adapters,
            self._adapter_manager.set_adapter_mapping,
        )

    def add_adapter(self, adapter_request: Any) -> bool:
        """Forward to ``add_adapter_worker`` and return its result."""
        manager = self._adapter_manager
        return add_adapter_worker(
            adapter_request,
            self.list_adapters,
            self._load_adapter,
            manager.add_adapter,
            manager.activate_adapter,
        )

    def _apply_adapters(self, adapter_requests: Set[Any]) -> None:
        """Forward to ``apply_adapters_worker``."""
        apply_adapters_worker(
            adapter_requests,
            self.list_adapters,
            self._adapter_manager.adapter_slots,
            self.remove_adapter,
            self.add_adapter,
        )

    def remove_adapter(self, adapter_id: int) -> bool:
        """Forward to the adapter manager's ``remove_adapter``."""
        return self._adapter_manager.remove_adapter(adapter_id)

    def remove_all_adapters(self):
        """Forward to the adapter manager's ``remove_all_adapters``."""
        self._adapter_manager.remove_all_adapters()

    def list_adapters(self) -> Set[int]:
        """Return the result of ``list_adapters_worker`` for the manager."""
        return list_adapters_worker(self._adapter_manager.list_adapters)
class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager):
    """Worker-side prompt_adapter manager backed by an LRU cache.

    On every request the requested prompt_adapters are loaded (unless they
    are already loaded), and the least recently used prompt_adapters are
    unloaded if the cache is above capacity.
    """

    _prompt_adapter_manager_cls: Type[
        LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager

    def create_prompt_adapter_manager(
        self,
        model: torch.nn.Module,
    ) -> Any:
        """Build the LRU adapter manager for ``model`` and return its ``model``.

        The created manager (of class ``self._prompt_adapter_manager_cls``)
        is kept in ``self._adapter_manager``.
        """
        manager = create_prompt_adapter_manager(
            model,
            max_num_seqs=self.max_num_seqs,
            max_num_batched_tokens=self.max_num_batched_tokens,
            prompt_adapter_config=self.prompt_adapter_config,
            prompt_adapter_manager_cls=self._prompt_adapter_manager_cls)
        self._adapter_manager: LRUCachePromptAdapterModelManager = manager
        return manager.model

    def _apply_adapters(
            self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None:
        """Add every truthy request, keyed by its ``prompt_adapter_id``.

        Raises:
            RuntimeError: If the number of distinct adapter ids exceeds the
                adapter manager's ``prompt_adapter_slots``.
        """
        # Falsy entries are skipped; a later request with the same id
        # replaces an earlier one.
        requests_by_id = {}
        for request in prompt_adapter_requests:
            if request:
                requests_by_id[request.prompt_adapter_id] = request

        slots = self._adapter_manager.prompt_adapter_slots
        if len(requests_by_id) > slots:
            raise RuntimeError(
                "Number of requested prompt_adapters "
                f"({len(requests_by_id)}) is greater "
                "than the number of GPU prompt_adapter slots "
                f"({slots}).")

        for request in requests_by_id.values():
            self.add_adapter(request)

    def add_adapter(self,
                    prompt_adapter_request: PromptAdapterRequest) -> bool:
        """Load the adapter if it is not listed yet, then activate it.

        Returns:
            For a newly loaded adapter, the result of the adapter manager's
            ``add_adapter``; for an already listed adapter, whether
            ``get_adapter`` returned something other than ``None``.
        """
        manager = self._adapter_manager
        adapter_id = prompt_adapter_request.prompt_adapter_id

        if adapter_id in self.list_adapters():
            # Already loaded: just touch it to update its position in the
            # caches.
            loaded = manager.get_adapter(adapter_id) is not None
        else:
            # Make room before loading the new prompt_adapter to save memory.
            if len(manager) + 1 > manager.capacity:
                manager.remove_oldest_adapter()
            loaded = manager.add_adapter(
                self._load_adapter(prompt_adapter_request))

        manager.activate_adapter(adapter_id)
        return loaded
vllm/sequence.py
View file @
82ec66f5
...
...
@@ -19,7 +19,6 @@ from vllm.inputs import SingletonInputs
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
MultiModalKwargs
,
MultiModalPlaceholderDict
from
vllm.pooling_params
import
PoolingParams
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sampling_params
import
RequestOutputKind
,
SamplingParams
VLLM_TOKEN_ID_ARRAY_TYPE
=
"l"
...
...
@@ -458,7 +457,6 @@ class Sequence:
block size used by the block manager and cache engine.
eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM.
lora_request: LoRA request.
prompt_adapter_request: Prompt Adapter request.
"""
def
__init__
(
...
...
@@ -468,14 +466,12 @@ class Sequence:
block_size
:
int
,
eos_token_id
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
)
->
None
:
self
.
seq_id
=
seq_id
self
.
inputs
=
inputs
self
.
block_size
=
block_size
self
.
eos_token_id
=
eos_token_id
self
.
lora_request
=
lora_request
self
.
prompt_adapter_request
=
prompt_adapter_request
self
.
data
=
SequenceData
.
from_seqs
(
self
.
prompt_token_ids
,
...
...
@@ -537,11 +533,6 @@ class Sequence:
def
lora_int_id
(
self
)
->
int
:
return
self
.
lora_request
.
lora_int_id
if
self
.
lora_request
else
0
@
property
def
prompt_adapter_id
(
self
)
->
int
:
return
self
.
prompt_adapter_request
.
prompt_adapter_id
\
if
self
.
prompt_adapter_request
else
0
def
get_output_text_to_return
(
self
,
buffer_length
:
int
,
delta
:
bool
)
->
str
:
"""If delta is True, only new text since the last call to
...
...
@@ -601,12 +592,12 @@ class Sequence:
designed for prefix caching mode. The final sequence hash is determined
by applying token_ids from the sequence's blocks.
"""
if
self
.
prompt_adapter_id
==
0
and
self
.
lora_int_id
==
0
:
if
self
.
lora_int_id
==
0
:
return
None
# NOTE: If there are additional factors influencing the block aside from
# token_ids, include them as input parameters to the hash.
return
hash
(
(
self
.
prompt_adapter_id
,
self
.
lora_int_id
)
)
return
hash
(
self
.
lora_int_id
)
def
num_hashed_tokens_of_block
(
self
,
logical_idx
:
int
):
return
logical_idx
*
self
.
block_size
+
self
.
block_size
...
...
@@ -707,7 +698,6 @@ class SequenceGroup:
encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
prompt_adapter_request: Prompt Adapter request.
priority: User-defined priority of the request.
draft_size: The number of speculative tokens plus one from the target
model; equal to max number of tokens a step can generate
...
...
@@ -725,7 +715,6 @@ class SequenceGroup:
pooled_data
:
Optional
[
torch
.
Tensor
]
=
None
,
encoder_seq
:
Optional
[
Sequence
]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
priority
:
int
=
0
,
draft_size
:
int
=
1
)
->
None
:
self
.
request_id
=
request_id
...
...
@@ -747,7 +736,6 @@ class SequenceGroup:
self
.
state
=
SequenceGroupState
()
self
.
pooling_params
=
pooling_params
self
.
pooled_data
=
pooled_data
self
.
prompt_adapter_request
=
prompt_adapter_request
self
.
encoder_seq
=
encoder_seq
self
.
trace_headers
=
trace_headers
self
.
priority
=
priority
...
...
@@ -802,16 +790,6 @@ class SequenceGroup:
def
lora_int_id
(
self
)
->
int
:
return
self
.
lora_request
.
lora_int_id
if
self
.
lora_request
else
0
@
property
def
prompt_adapter_id
(
self
)
->
int
:
return
self
.
prompt_adapter_request
.
prompt_adapter_id
\
if
self
.
prompt_adapter_request
else
0
@
property
def
prompt_adapter_num_virtual_tokens
(
self
)
->
int
:
return
self
.
prompt_adapter_request
.
prompt_adapter_num_virtual_tokens
\
if
self
.
prompt_adapter_request
else
0
def
init_multi_step
(
self
,
num_steps
:
int
)
->
None
:
self
.
state
.
num_steps
=
num_steps
self
.
state
.
current_step
=
0
...
...
@@ -1011,7 +989,6 @@ class SequenceGroupMetadata(
(SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder
model.
prompt_adapter_request: Prompt Adapter request.
"""
request_id
:
str
...
...
@@ -1030,7 +1007,6 @@ class SequenceGroupMetadata(
multi_modal_placeholders
:
Optional
[
MultiModalPlaceholderDict
]
=
None
encoder_seq_data
:
Optional
[
SequenceData
]
=
None
cross_block_table
:
Optional
[
list
[
int
]]
=
None
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
token_chunk_size
:
Optional
[
int
]
=
None
### Stateful fields that are lazily defined. ###
...
...
@@ -1052,16 +1028,6 @@ class SequenceGroupMetadata(
def
lora_int_id
(
self
)
->
int
:
return
self
.
lora_request
.
lora_int_id
if
self
.
lora_request
else
0
@
property
def
prompt_adapter_id
(
self
)
->
int
:
return
self
.
prompt_adapter_request
.
prompt_adapter_id
\
if
self
.
prompt_adapter_request
else
0
@
property
def
prompt_adapter_num_virtual_tokens
(
self
)
->
int
:
return
self
.
prompt_adapter_request
.
prompt_adapter_num_virtual_tokens
\
if
self
.
prompt_adapter_request
else
0
# Multi-Step Chunked-Prefill property
@
property
def
is_single_step_prompt
(
self
)
->
bool
:
...
...
@@ -1525,7 +1491,6 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
pooled_data
=
seq_group
.
pooled_data
,
encoder_seq
=
seq_group
.
encoder_seq
,
trace_headers
=
seq_group
.
trace_headers
,
prompt_adapter_request
=
seq_group
.
prompt_adapter_request
,
priority
=
seq_group
.
priority
,
)
...
...
vllm/utils/__init__.py
View file @
82ec66f5
...
...
@@ -128,10 +128,6 @@ STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only "
"backends currently supported with encoder/"
"decoder models."
)
STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER
=
(
"Prompt adapters are not "
"currently supported with encoder/"
"decoder models."
)
# Efficiently import all enc/dec error strings
# rather than having to import all of the above
STR_NOT_IMPL_ENC_DEC_ERR_STRS
=
{
...
...
@@ -145,7 +141,6 @@ STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
"STR_NOT_IMPL_ENC_DEC_MM"
:
STR_NOT_IMPL_ENC_DEC_MM
,
"STR_NOT_IMPL_ENC_DEC_SPEC_DEC"
:
STR_NOT_IMPL_ENC_DEC_SPEC_DEC
,
"STR_NOT_IMPL_ENC_DEC_BACKEND"
:
STR_NOT_IMPL_ENC_DEC_BACKEND
,
"STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER"
:
STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER
,
}
# Constants related to forcing the attention backend selection
...
...
vllm/v1/engine/async_llm.py
View file @
82ec66f5
...
...
@@ -20,7 +20,6 @@ from vllm.lora.request import LoRARequest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.config
import
(
maybe_register_config_serialize_by_value
)
...
...
@@ -221,7 +220,6 @@ class AsyncLLM(EngineClient):
lora_request
:
Optional
[
LoRARequest
]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
priority
:
int
=
0
,
data_parallel_rank
:
Optional
[
int
]
=
None
,
)
->
RequestOutputCollector
:
...
...
@@ -238,8 +236,7 @@ class AsyncLLM(EngineClient):
# Convert Input --> Request.
prompt_str
,
request
=
self
.
processor
.
process_inputs
(
request_id
,
prompt
,
params
,
arrival_time
,
lora_request
,
tokenization_kwargs
,
trace_headers
,
prompt_adapter_request
,
priority
,
data_parallel_rank
)
tokenization_kwargs
,
trace_headers
,
priority
,
data_parallel_rank
)
if
is_pooling
or
params
.
n
==
1
:
await
self
.
_add_request
(
request
,
prompt_str
,
None
,
0
,
queue
)
...
...
@@ -283,7 +280,6 @@ class AsyncLLM(EngineClient):
request_id
:
str
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
priority
:
int
=
0
,
data_parallel_rank
:
Optional
[
int
]
=
None
,
)
->
AsyncGenerator
[
RequestOutput
,
None
]:
...
...
@@ -314,7 +310,6 @@ class AsyncLLM(EngineClient):
sampling_params
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
prompt_adapter_request
=
prompt_adapter_request
,
priority
=
priority
,
data_parallel_rank
=
data_parallel_rank
,
)
...
...
vllm/v1/engine/llm_engine.py
View file @
82ec66f5
...
...
@@ -17,7 +17,6 @@ from vllm.lora.request import LoRARequest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.tokenizer_group
import
(
TokenizerGroup
,
init_tokenizer_from_configs
)
...
...
@@ -192,7 +191,6 @@ class LLMEngine:
lora_request
:
Optional
[
LoRARequest
]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
priority
:
int
=
0
,
)
->
None
:
# Validate the request_id type.
...
...
@@ -203,8 +201,7 @@ class LLMEngine:
# Process raw inputs into the request.
prompt_str
,
request
=
self
.
processor
.
process_inputs
(
request_id
,
prompt
,
params
,
arrival_time
,
lora_request
,
tokenization_kwargs
,
trace_headers
,
prompt_adapter_request
,
priority
)
tokenization_kwargs
,
trace_headers
,
priority
)
n
=
params
.
n
if
isinstance
(
params
,
SamplingParams
)
else
1
...
...
vllm/v1/engine/processor.py
View file @
82ec66f5
...
...
@@ -16,7 +16,6 @@ from vllm.multimodal.inputs import PlaceholderRange
from
vllm.multimodal.processing
import
EncDecMultiModalProcessor
from
vllm.multimodal.utils
import
merge_and_sort_multimodal_metadata
from
vllm.pooling_params
import
PoolingParams
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.tokenizer_group
import
TokenizerGroup
from
vllm.v1.engine
import
EngineCoreRequest
...
...
@@ -226,7 +225,6 @@ class Processor:
lora_request
:
Optional
[
LoRARequest
]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
priority
:
int
=
0
,
data_parallel_rank
:
Optional
[
int
]
=
None
,
)
->
tuple
[
Optional
[
str
],
EngineCoreRequest
]:
...
...
@@ -237,8 +235,6 @@ class Processor:
self
.
_validate_params
(
params
,
lora_request
)
if
trace_headers
is
not
None
:
raise
ValueError
(
"V1 does not support tracing yet."
)
if
prompt_adapter_request
is
not
None
:
raise
ValueError
(
"V1 does not support prompt_adapter_request."
)
data_parallel_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
if
data_parallel_rank
is
not
None
and
not
(
0
<=
data_parallel_rank
<
...
...
@@ -253,12 +249,10 @@ class Processor:
# 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly.
# 3. Apply prompt adapter to prompt token ids if one exists.
processed_inputs
:
ProcessorInputs
=
self
.
input_preprocessor
.
preprocess
(
prompt
,
tokenization_kwargs
=
tokenization_kwargs
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
return_mm_hashes
=
self
.
use_hash
,
)
from
vllm.platforms
import
current_platform
...
...
vllm/v1/utils.py
View file @
82ec66f5
...
...
@@ -318,8 +318,6 @@ def report_usage_stats(
# Feature flags
"enable_lora"
:
bool
(
vllm_config
.
lora_config
),
"enable_prompt_adapter"
:
bool
(
vllm_config
.
prompt_adapter_config
),
"enable_prefix_caching"
:
vllm_config
.
cache_config
.
enable_prefix_caching
,
"enforce_eager"
:
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
82ec66f5
...
...
@@ -104,7 +104,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self
.
parallel_config
=
vllm_config
.
parallel_config
self
.
scheduler_config
=
vllm_config
.
scheduler_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
self
.
observability_config
=
vllm_config
.
observability_config
from
vllm.model_executor.models.utils
import
set_cpu_offload_max_bytes
...
...
vllm/v1/worker/tpu_model_runner.py
View file @
82ec66f5
...
...
@@ -114,7 +114,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
self
.
original_parallel_config
=
original_parallel_config
self
.
scheduler_config
=
vllm_config
.
scheduler_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
self
.
observability_config
=
vllm_config
.
observability_config
self
.
device_config
=
vllm_config
.
device_config
...
...
vllm/v1/worker/tpu_worker.py
View file @
82ec66f5
...
...
@@ -62,7 +62,6 @@ class TPUWorker:
self
.
scheduler_config
=
vllm_config
.
scheduler_config
self
.
device_config
=
vllm_config
.
device_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
self
.
observability_config
=
vllm_config
.
observability_config
self
.
parallel_config
.
rank
=
rank
...
...
vllm/worker/enc_dec_model_runner.py
View file @
82ec66f5
...
...
@@ -91,10 +91,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
'''
EncoderDecoderModelRunner constructor.
`lora_config` and `prompt_adapter_config` are
unused (since these features are not yet supported for encoder/decoder
models) but these arguments are present here for compatibility with
the base-class constructor.
`lora_config` is unused (since this feature is not yet supported
for encoder/decoder models) but this argument is present here for
compatibility with the base-class constructor.
'''
self
.
_maybe_force_supported_attention_backend
()
...
...
vllm/worker/model_runner.py
View file @
82ec66f5
...
...
@@ -45,10 +45,6 @@ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
MultiModalKwargs
,
MultiModalPlaceholderMap
,
MultiModalRegistry
)
from
vllm.prompt_adapter.layers
import
PromptAdapterMapping
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.prompt_adapter.worker_manager
import
(
LRUCacheWorkerPromptAdapterManager
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.utils
import
(
DeviceMemoryProfiler
,
GiB_bytes
,
PyObjectCache
,
...
...
@@ -95,8 +91,6 @@ class ModelInputForGPU(ModelRunnerInputBase):
lora_mapping
:
Optional
[
"LoRAMapping"
]
=
None
lora_requests
:
Optional
[
Set
[
LoRARequest
]]
=
None
attn_metadata
:
Optional
[
"AttentionMetadata"
]
=
None
prompt_adapter_mapping
:
Optional
[
PromptAdapterMapping
]
=
None
prompt_adapter_requests
:
Optional
[
Set
[
PromptAdapterRequest
]]
=
None
multi_modal_kwargs
:
Optional
[
BatchedTensorInputs
]
=
None
request_ids_to_seq_ids
:
Optional
[
Dict
[
str
,
List
[
int
]]]
=
None
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
...
...
@@ -113,8 +107,6 @@ class ModelInputForGPU(ModelRunnerInputBase):
"lora_requests"
:
self
.
lora_requests
,
"lora_mapping"
:
self
.
lora_mapping
,
"multi_modal_kwargs"
:
self
.
multi_modal_kwargs
,
"prompt_adapter_mapping"
:
self
.
prompt_adapter_mapping
,
"prompt_adapter_requests"
:
self
.
prompt_adapter_requests
,
"virtual_engine"
:
self
.
virtual_engine
,
"request_ids_to_seq_ids"
:
self
.
request_ids_to_seq_ids
,
"finished_requests_ids"
:
self
.
finished_requests_ids
,
...
...
@@ -164,8 +156,6 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
"lora_requests"
:
self
.
lora_requests
,
"lora_mapping"
:
self
.
lora_mapping
,
"multi_modal_kwargs"
:
self
.
multi_modal_kwargs
,
"prompt_adapter_mapping"
:
self
.
prompt_adapter_mapping
,
"prompt_adapter_requests"
:
self
.
prompt_adapter_requests
,
"virtual_engine"
:
self
.
virtual_engine
,
"request_ids_to_seq_ids"
:
self
.
request_ids_to_seq_ids
,
"finished_requests_ids"
:
self
.
finished_requests_ids
,
...
...
@@ -212,8 +202,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
lora_index_mapping
.
clear
()
# type: ignore
self
.
lora_prompt_mapping
.
clear
()
# type: ignore
self
.
lora_requests
.
clear
()
# type: ignore
self
.
prompt_adapter_index_mapping
.
clear
()
# type: ignore
self
.
prompt_adapter_prompt_mapping
.
clear
()
# type: ignore
def
__init__
(
self
,
...
...
@@ -252,11 +240,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
lora_prompt_mapping
:
Optional
[
List
[
List
[
int
]]]
=
None
,
lora_requests
:
Optional
[
Set
[
LoRARequest
]]
=
None
,
# Prompt adapter inputs.
prompt_adapter_index_mapping
:
Optional
[
List
[
int
]]
=
None
,
prompt_adapter_prompt_mapping
:
Optional
[
List
[
int
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
# Multi-modal inputs.
multi_modal_kwargs
:
Optional
[
MultiModalKwargs
]
=
None
,
multi_modal_placeholder_maps
:
Optional
[
Dict
[
...
...
@@ -360,18 +343,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
else
:
self
.
lora_requests
.
clear
()
if
prompt_adapter_index_mapping
:
self
.
prompt_adapter_index_mapping
=
\
prompt_adapter_index_mapping
else
:
self
.
prompt_adapter_index_mapping
.
clear
()
if
prompt_adapter_prompt_mapping
:
self
.
prompt_adapter_prompt_mapping
=
\
prompt_adapter_prompt_mapping
else
:
self
.
prompt_adapter_prompt_mapping
.
clear
()
else
:
self
.
input_tokens
=
input_tokens
or
[]
self
.
inputs_embeds
=
inputs_embeds
...
...
@@ -390,12 +361,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
lora_prompt_mapping
=
lora_prompt_mapping
or
[]
self
.
lora_requests
=
lora_requests
or
set
()
self
.
prompt_adapter_index_mapping
=
(
prompt_adapter_index_mapping
or
[])
self
.
prompt_adapter_prompt_mapping
=
(
prompt_adapter_prompt_mapping
or
[])
self
.
prompt_adapter_request
=
prompt_adapter_request
self
.
multi_modal_kwargs
=
multi_modal_kwargs
self
.
multi_modal_placeholder_maps
=
multi_modal_placeholder_maps
self
.
prefix_cache_hit
=
prefix_cache_hit
...
...
@@ -485,7 +450,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Compute functions for each sequence group.
# WARNING: The order of the functions matters!
self
.
per_seq_group_compute_fns
=
[
self
.
_compute_prompt_adapter_input
,
self
.
_compute_multi_modal_input
,
]
...
...
@@ -496,8 +460,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
sliding_window
=
self
.
runner
.
sliding_window
self
.
block_size
=
self
.
runner
.
block_size
self
.
enable_lora
=
self
.
runner
.
lora_config
is
not
None
self
.
enable_prompt_adapter
=
(
self
.
runner
.
prompt_adapter_config
is
not
None
)
# Attention metadata inputs.
if
self
.
attn_backend
is
not
None
:
...
...
@@ -693,34 +655,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
else
:
inter_data
.
lora_prompt_mapping
.
append
([])
def
_compute_prompt_adapter_input
(
self
,
inter_data
:
InterDataForSeqGroup
,
seq_group_metadata
:
SequenceGroupMetadata
):
"""If prompt adapter is enabled, compute index and prompt mapping.
"""
# Note that when is_prompt=True, we expect only one sequence
# in the group.
if
not
self
.
enable_prompt_adapter
:
return
prompt_adapter_id
=
seq_group_metadata
.
prompt_adapter_id
if
prompt_adapter_id
<=
0
or
not
inter_data
.
is_prompt
:
return
# We expect only one sequence in the group when is_prompt=True.
assert
inter_data
.
n_seqs
==
1
query_len
=
inter_data
.
query_lens
[
0
]
inter_data
.
prompt_adapter_request
=
(
seq_group_metadata
.
prompt_adapter_request
)
num_tokens
=
seq_group_metadata
.
prompt_adapter_num_virtual_tokens
inter_data
.
prompt_adapter_index_mapping
=
[
prompt_adapter_id
]
*
num_tokens
+
[
0
]
*
(
query_len
-
num_tokens
)
inter_data
.
prompt_adapter_prompt_mapping
=
[
prompt_adapter_id
]
*
(
query_len
if
seq_group_metadata
.
sampling_params
and
seq_group_metadata
.
sampling_params
.
prompt_logprobs
else
1
)
def
_compute_multi_modal_input
(
self
,
inter_data
:
InterDataForSeqGroup
,
seq_group_metadata
:
SequenceGroupMetadata
):
"""If multi-modal data is given, add it to the input."""
...
...
@@ -1009,29 +943,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prompt_mapping
=
lora_prompt_mapping
,
is_prefill
=
not
self
.
decode_only
))
# Prompt adapter data.
prompt_adapter_requests
:
Set
[
PromptAdapterRequest
]
=
set
()
prompt_adapter_mapping
=
None
if
self
.
enable_prompt_adapter
:
prompt_adapter_requests
=
set
(
data
.
prompt_adapter_request
for
data
in
self
.
inter_data_list
if
data
.
prompt_adapter_request
is
not
None
)
prompt_adapter_index_mapping
=
flatten_2d_lists
([
inter_data
.
prompt_adapter_index_mapping
for
inter_data
in
self
.
inter_data_list
])
if
cuda_graph_pad_size
:
prompt_adapter_index_mapping
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
prompt_adapter_prompt_mapping
=
flatten_2d_lists
([
inter_data
.
prompt_adapter_prompt_mapping
for
inter_data
in
self
.
inter_data_list
])
prompt_adapter_mapping
=
PromptAdapterMapping
(
prompt_adapter_index_mapping
,
prompt_adapter_prompt_mapping
,
)
# Multi-modal data.
multi_modal_kwargs_list
=
[
data
.
multi_modal_kwargs
for
data
in
self
.
inter_data_list
...
...
@@ -1051,9 +962,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
lora_requests
=
lora_requests
,
multi_modal_kwargs
=
multi_modal_kwargs
,
request_ids_to_seq_ids
=
request_ids_to_seq_ids
,
finished_requests_ids
=
self
.
finished_requests_ids
,
prompt_adapter_mapping
=
prompt_adapter_mapping
,
prompt_adapter_requests
=
prompt_adapter_requests
)
finished_requests_ids
=
self
.
finished_requests_ids
)
class
GPUModelRunnerBase
(
ModelRunnerBase
[
TModelInputForGPU
]):
...
...
@@ -1148,7 +1057,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self
.
model
:
nn
.
Module
# Set after load_model
# Set after load_model.
self
.
lora_manager
:
Optional
[
LRUCacheWorkerLoRAManager
]
=
None
self
.
prompt_adapter_manager
:
LRUCacheWorkerPromptAdapterManager
=
None
self
.
sampler
=
get_sampler
()
set_cpu_offload_max_bytes
(
...
...
@@ -1207,14 +1115,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
logger
.
info
(
"Model loading took %.4f GiB and %.6f seconds"
,
self
.
model_memory_usage
/
GiB_bytes
,
time_after_load
-
time_before_load
)
if
self
.
prompt_adapter_config
:
self
.
prompt_adapter_manager
=
LRUCacheWorkerPromptAdapterManager
(
self
.
scheduler_config
.
max_num_seqs
,
self
.
scheduler_config
.
max_num_batched_tokens
,
self
.
device
,
self
.
prompt_adapter_config
)
self
.
model
=
(
self
.
prompt_adapter_manager
.
create_prompt_adapter_manager
(
self
.
model
))
if
self
.
vllm_config
.
compilation_config
.
level
==
\
CompilationLevel
.
DYNAMO_AS_IS
and
supports_dynamo
():
...
...
@@ -1466,40 +1367,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
raise
RuntimeError
(
"LoRA is not enabled."
)
return
self
.
lora_manager
.
list_adapters
()
def
remove_all_prompt_adapters
(
self
):
if
not
self
.
prompt_adapter_manager
:
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
self
.
prompt_adapter_manager
.
remove_all_adapters
()
def
set_active_prompt_adapters
(
self
,
prompt_adapter_requests
:
Set
[
PromptAdapterRequest
],
prompt_adapter_mapping
:
PromptAdapterMapping
)
->
None
:
if
not
self
.
prompt_adapter_manager
:
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
self
.
prompt_adapter_manager
.
set_active_adapters
(
prompt_adapter_requests
,
prompt_adapter_mapping
)
def
add_prompt_adapter
(
self
,
prompt_adapter_request
:
PromptAdapterRequest
)
->
bool
:
if
not
self
.
prompt_adapter_manager
:
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
return
self
.
prompt_adapter_manager
.
add_adapter
(
prompt_adapter_request
)
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
if
not
self
.
prompt_adapter_manager
:
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
return
self
.
prompt_adapter_manager
.
remove_adapter
(
prompt_adapter_id
)
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
if
not
self
.
prompt_adapter_manager
:
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
return
self
.
prompt_adapter_manager
.
pin_adapter
(
prompt_adapter_id
)
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
if
not
self
.
prompt_adapter_manager
:
raise
RuntimeError
(
"PromptAdapter is not enabled."
)
return
self
.
prompt_adapter_manager
.
list_adapters
()
@
torch
.
inference_mode
()
def
capture_model
(
self
,
kv_caches
:
List
[
List
[
torch
.
Tensor
]])
->
None
:
"""Cuda graph capture a model.
...
...
@@ -1609,13 +1476,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self
.
set_active_loras
(
set
([
dummy_lora_request
]),
lora_mapping
)
if
self
.
prompt_adapter_config
:
prompt_adapter_mapping
=
PromptAdapterMapping
(
[
-
1
]
*
batch_size
,
[
-
1
]
*
batch_size
,
)
self
.
set_active_prompt_adapters
(
set
(),
prompt_adapter_mapping
)
graph_runner
=
CUDAGraphRunner
(
self
.
model
,
self
.
attn_backend
.
get_name
(),
self
.
attn_state
.
graph_clone
(
batch_size
),
...
...
@@ -1776,13 +1636,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
self
.
set_active_loras
(
model_input
.
lora_requests
,
model_input
.
lora_mapping
)
if
self
.
prompt_adapter_config
:
assert
model_input
.
prompt_adapter_requests
is
not
None
assert
model_input
.
prompt_adapter_mapping
is
not
None
self
.
set_active_prompt_adapters
(
model_input
.
prompt_adapter_requests
,
model_input
.
prompt_adapter_mapping
)
self
.
attn_state
.
begin_forward
(
model_input
)
# Currently cuda graph is only supported by the decode phase.
...
...
vllm/worker/model_runner_base.py
View file @
82ec66f5
...
...
@@ -190,7 +190,6 @@ class ModelRunnerBase(ABC, Generic[T]):
self
.
scheduler_config
=
vllm_config
.
scheduler_config
self
.
device_config
=
vllm_config
.
device_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
self
.
observability_config
=
vllm_config
.
observability_config
# Map of request_id -> generator used for seeded random sampling
...
...
vllm/worker/multi_step_model_runner.py
View file @
82ec66f5
...
...
@@ -288,9 +288,6 @@ class StatefulModelInput(BroadcastableModelInput):
assert
fmi
.
lora_requests
is
not
None
assert
len
(
fmi
.
lora_requests
)
==
0
assert
fmi
.
attn_metadata
is
not
None
assert
fmi
.
prompt_adapter_mapping
is
None
assert
fmi
.
prompt_adapter_requests
is
not
None
assert
len
(
fmi
.
prompt_adapter_requests
)
==
0
assert
fmi
.
multi_modal_kwargs
is
not
None
assert
len
(
fmi
.
multi_modal_kwargs
)
==
0
...
...
vllm/worker/pooling_model_runner.py
View file @
82ec66f5
...
...
@@ -64,13 +64,6 @@ class PoolingModelRunner(
self
.
set_active_loras
(
model_input
.
lora_requests
,
model_input
.
lora_mapping
)
if
self
.
prompt_adapter_config
:
assert
model_input
.
prompt_adapter_requests
is
not
None
assert
model_input
.
prompt_adapter_mapping
is
not
None
self
.
set_active_prompt_adapters
(
model_input
.
prompt_adapter_requests
,
model_input
.
prompt_adapter_mapping
)
# Currently cuda graph is only supported by the decode phase.
assert
model_input
.
attn_metadata
is
not
None
prefill_meta
=
model_input
.
attn_metadata
.
prefill_metadata
...
...
vllm/worker/utils.py
View file @
82ec66f5
...
...
@@ -47,7 +47,3 @@ def assert_enc_dec_mr_supported_scenario(
if
enc_dec_mr
.
scheduler_config
.
num_lookahead_slots
>
0
:
raise
NotImplementedError
(
STR_NOT_IMPL_ENC_DEC_ERR_STRS
[
'STR_NOT_IMPL_ENC_DEC_SPEC_DEC'
])
if
enc_dec_mr
.
prompt_adapter_config
is
not
None
:
raise
NotImplementedError
(
STR_NOT_IMPL_ENC_DEC_ERR_STRS
[
'STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER'
])
vllm/worker/worker.py
View file @
82ec66f5
...
...
@@ -22,7 +22,6 @@ from vllm.model_executor import set_random_seed
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
from
vllm.platforms
import
current_platform
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
(
ExecuteModelRequest
,
IntermediateTensors
,
SequenceGroupMetadata
,
SequenceGroupMetadataDelta
)
from
vllm.utils
import
(
GiB_bytes
,
MemorySnapshot
,
bind_kv_cache
,
...
...
@@ -513,19 +512,6 @@ class Worker(LocalOrDistributedWorkerBase):
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
model_runner
.
list_loras
()
def
add_prompt_adapter
(
self
,
prompt_adapter_request
:
PromptAdapterRequest
)
->
bool
:
return
self
.
model_runner
.
add_prompt_adapter
(
prompt_adapter_request
)
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
return
self
.
model_runner
.
remove_lora
(
prompt_adapter_id
)
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
return
self
.
model_runner
.
pin_prompt_adapter
(
prompt_adapter_id
)
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
return
self
.
model_runner
.
list_prompt_adapters
()
@
property
def
max_model_len
(
self
)
->
int
:
return
self
.
model_config
.
max_model_len
...
...
vllm/worker/worker_base.py
View file @
82ec66f5
...
...
@@ -49,7 +49,6 @@ class WorkerBase:
self
.
scheduler_config
=
vllm_config
.
scheduler_config
self
.
device_config
=
vllm_config
.
device_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
self
.
observability_config
=
vllm_config
.
observability_config
self
.
kv_transfer_config
=
vllm_config
.
kv_transfer_config
self
.
compilation_config
=
vllm_config
.
compilation_config
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment