"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "ad60a973fbee9102ca542c7eaa388c02fd8581ce"
Unverified Commit 82ec66f5 authored by Michael Goin's avatar Michael Goin Committed by GitHub
Browse files

[V0 Deprecation] Remove Prompt Adapters (#20588)


Signed-off-by: default avatarmgoin <mgoin64@gmail.com>
parent 78c13e30
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import msgspec
from vllm.adapter_commons.request import AdapterRequest
class PromptAdapterRequest(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
omit_defaults=True, # type: ignore[call-arg]
frozen=True): # type: ignore[call-arg]
"""
Request for a Prompt adapter.
"""
__metaclass__ = AdapterRequest
prompt_adapter_name: str
prompt_adapter_id: int
prompt_adapter_local_path: str
prompt_adapter_num_virtual_tokens: int
def __hash__(self):
return super().__hash__()
@property
def adapter_id(self):
return self.prompt_adapter_id
@property
def name(self):
return self.prompt_adapter_name
@property
def local_path(self):
return self.prompt_adapter_local_path
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
import os
from typing import Optional
import torch
from huggingface_hub import file_exists, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from safetensors.torch import load_file as safe_load_file
from vllm.platforms import current_platform
WEIGHTS_NAME = "adapter_model.bin"
SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
# Get current device name based on available devices
def infer_device() -> str:
if current_platform.is_cuda_alike():
return "cuda"
return "cpu"
def load_peft_weights(model_id: str,
device: Optional[str] = None,
**hf_hub_download_kwargs) -> dict:
r"""
A helper method to load the PEFT weights from the HuggingFace Hub or locally
Args:
model_id (`str`):
The local path to the adapter weights or the name of the adapter to
load from the HuggingFace Hub.
device (`str`):
The device to load the weights onto.
hf_hub_download_kwargs (`dict`):
Additional arguments to pass to the `hf_hub_download` method when
loading from the HuggingFace Hub.
"""
path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if
hf_hub_download_kwargs.get("subfolder") is not None else model_id)
if device is None:
device = infer_device()
if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)):
filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
use_safetensors = True
elif os.path.exists(os.path.join(path, WEIGHTS_NAME)):
filename = os.path.join(path, WEIGHTS_NAME)
use_safetensors = False
else:
token = hf_hub_download_kwargs.get("token")
if token is None:
token = hf_hub_download_kwargs.get("use_auth_token")
hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"],
SAFETENSORS_WEIGHTS_NAME)
if hf_hub_download_kwargs.get("subfolder") is not None
else SAFETENSORS_WEIGHTS_NAME)
has_remote_safetensors_file = file_exists(
repo_id=model_id,
filename=hub_filename,
revision=hf_hub_download_kwargs.get("revision"),
repo_type=hf_hub_download_kwargs.get("repo_type"),
token=token,
)
use_safetensors = has_remote_safetensors_file
if has_remote_safetensors_file:
# Priority 1: load safetensors weights
filename = hf_hub_download(
model_id,
SAFETENSORS_WEIGHTS_NAME,
**hf_hub_download_kwargs,
)
else:
try:
filename = hf_hub_download(model_id, WEIGHTS_NAME,
**hf_hub_download_kwargs)
except EntryNotFoundError:
raise ValueError( # noqa: B904
f"Can't find weights for {model_id} in {model_id} or \
in the Hugging Face Hub. "
f"Please check that the file {WEIGHTS_NAME} or \
{SAFETENSORS_WEIGHTS_NAME} is present at {model_id}.")
if use_safetensors:
adapters_weights = safe_load_file(filename, device=device)
else:
adapters_weights = torch.load(filename,
map_location=torch.device(device),
weights_only=True)
return adapters_weights
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
from typing import Any, Optional, Set, Type
import torch
from vllm.adapter_commons.utils import (add_adapter_worker,
apply_adapters_worker,
list_adapters_worker,
set_active_adapters_worker)
from vllm.adapter_commons.worker_manager import AbstractWorkerManager
from vllm.config import PromptAdapterConfig
from vllm.prompt_adapter.models import (LRUCachePromptAdapterModelManager,
PromptAdapterModel,
PromptAdapterModelManager,
create_prompt_adapter_manager)
from vllm.prompt_adapter.request import PromptAdapterRequest
logger = logging.getLogger(__name__)
class WorkerPromptAdapterManager(AbstractWorkerManager):
"""WorkerPromptAdapterManager that manages
prompt_adapter models on the worker side.
Every request, the requested prompt_adapters will be
loaded (unless they are already loaded),
and every other prompt_adapter will be unloaded."""
_manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager
def __init__(
self,
max_num_seqs: int,
max_num_batched_tokens: int,
device: torch.device,
prompt_adapter_config: PromptAdapterConfig,
prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel
):
self._adapter_manager: PromptAdapterModelManager
self.max_num_seqs = max_num_seqs
self.max_num_batched_tokens = max_num_batched_tokens
self._prompt_adapter_model_cls = prompt_adapter_model_cls
self.prompt_adapter_config = prompt_adapter_config
super().__init__(device)
@property
def is_enabled(self) -> bool:
return True
def create_prompt_adapter_manager(
self,
model: torch.nn.Module,
) -> Any:
prompt_adapter_manager = create_prompt_adapter_manager(
model,
max_num_seqs=self.max_num_seqs,
max_num_batched_tokens=self.max_num_batched_tokens,
prompt_adapter_config=self.prompt_adapter_config,
prompt_adapter_manager_cls=self._manager_cls,
)
self._adapter_manager = prompt_adapter_manager
return prompt_adapter_manager.model
def _load_adapter(
self, prompt_adapter_request: PromptAdapterRequest
) -> PromptAdapterModel:
try:
prompt_adapter = (
self._prompt_adapter_model_cls.from_local_checkpoint(
prompt_adapter_request.prompt_adapter_local_path,
prompt_adapter_id=prompt_adapter_request.prompt_adapter_id,
num_virtual_tokens=prompt_adapter_request.
prompt_adapter_num_virtual_tokens,
config=self.prompt_adapter_config,
device=str(self.device),
))
except Exception as e:
raise RuntimeError(
f"Loading prompt_adapter "
f"{prompt_adapter_request.prompt_adapter_local_path}"
f" failed") from e
return prompt_adapter
def add_dummy_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
return True
def pin_adapter(self, adapter_id: int) -> bool:
return self._adapter_manager.pin_adapter(adapter_id)
def set_active_adapters(self, requests: Set[Any],
mapping: Optional[Any]) -> None:
set_active_adapters_worker(requests, mapping, self._apply_adapters,
self._adapter_manager.set_adapter_mapping)
def add_adapter(self, adapter_request: Any) -> bool:
return add_adapter_worker(adapter_request, self.list_adapters,
self._load_adapter,
self._adapter_manager.add_adapter,
self._adapter_manager.activate_adapter)
def _apply_adapters(self, adapter_requests: Set[Any]) -> None:
apply_adapters_worker(adapter_requests, self.list_adapters,
self._adapter_manager.adapter_slots,
self.remove_adapter, self.add_adapter)
def remove_adapter(self, adapter_id: int) -> bool:
return self._adapter_manager.remove_adapter(adapter_id)
def remove_all_adapters(self):
self._adapter_manager.remove_all_adapters()
def list_adapters(self) -> Set[int]:
return list_adapters_worker(self._adapter_manager.list_adapters)
class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager):
"""WorkerPromptAdapterManager that manages
prompt_adapter models on the worker side.
Uses an LRU Cache. Every request, the requested
prompt_adapters will be loaded (unless they are already loaded)
and least recently used prompt_adapters will
be unloaded if the cache is above capacity."""
_prompt_adapter_manager_cls: Type[
LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager
def create_prompt_adapter_manager(
self,
model: torch.nn.Module,
) -> Any:
prompt_adapter_manager = create_prompt_adapter_manager(
model,
max_num_seqs=self.max_num_seqs,
max_num_batched_tokens=self.max_num_batched_tokens,
prompt_adapter_config=self.prompt_adapter_config,
prompt_adapter_manager_cls=self._prompt_adapter_manager_cls)
self._adapter_manager: LRUCachePromptAdapterModelManager = (
prompt_adapter_manager)
return prompt_adapter_manager.model
def _apply_adapters(
self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None:
prompt_adapters_map = {
prompt_adapter_request.prompt_adapter_id: prompt_adapter_request
for prompt_adapter_request in prompt_adapter_requests
if prompt_adapter_request
}
if len(prompt_adapters_map
) > self._adapter_manager.prompt_adapter_slots:
raise RuntimeError(
f"Number of requested prompt_adapters "
f"({len(prompt_adapters_map)}) is greater "
"than the number of GPU prompt_adapter slots "
f"({self._adapter_manager.prompt_adapter_slots}).")
for prompt_adapter in prompt_adapters_map.values():
self.add_adapter(prompt_adapter)
def add_adapter(self,
prompt_adapter_request: PromptAdapterRequest) -> bool:
if prompt_adapter_request.prompt_adapter_id not in self.list_adapters(
):
# Remove before we load the new prompt_adapter to save memory
if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
self._adapter_manager.remove_oldest_adapter()
prompt_adapter = self._load_adapter(prompt_adapter_request)
loaded = self._adapter_manager.add_adapter(prompt_adapter)
else:
# If the prompt_adapter is already loaded, just touch it to
# update its position in the caches
loaded = self._adapter_manager.get_adapter(
prompt_adapter_request.prompt_adapter_id) is not None
self._adapter_manager.activate_adapter(
prompt_adapter_request.prompt_adapter_id)
return loaded
...@@ -19,7 +19,6 @@ from vllm.inputs import SingletonInputs ...@@ -19,7 +19,6 @@ from vllm.inputs import SingletonInputs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.sampling_params import RequestOutputKind, SamplingParams
VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_TOKEN_ID_ARRAY_TYPE = "l"
...@@ -458,7 +457,6 @@ class Sequence: ...@@ -458,7 +457,6 @@ class Sequence:
block size used by the block manager and cache engine. block size used by the block manager and cache engine.
eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM.
lora_request: LoRA request. lora_request: LoRA request.
prompt_adapter_request: Prompt Adapter request.
""" """
def __init__( def __init__(
...@@ -468,14 +466,12 @@ class Sequence: ...@@ -468,14 +466,12 @@ class Sequence:
block_size: int, block_size: int,
eos_token_id: Optional[int] = None, eos_token_id: Optional[int] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> None: ) -> None:
self.seq_id = seq_id self.seq_id = seq_id
self.inputs = inputs self.inputs = inputs
self.block_size = block_size self.block_size = block_size
self.eos_token_id = eos_token_id self.eos_token_id = eos_token_id
self.lora_request = lora_request self.lora_request = lora_request
self.prompt_adapter_request = prompt_adapter_request
self.data = SequenceData.from_seqs( self.data = SequenceData.from_seqs(
self.prompt_token_ids, self.prompt_token_ids,
...@@ -537,11 +533,6 @@ class Sequence: ...@@ -537,11 +533,6 @@ class Sequence:
def lora_int_id(self) -> int: def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0 return self.lora_request.lora_int_id if self.lora_request else 0
@property
def prompt_adapter_id(self) -> int:
return self.prompt_adapter_request.prompt_adapter_id \
if self.prompt_adapter_request else 0
def get_output_text_to_return(self, buffer_length: int, def get_output_text_to_return(self, buffer_length: int,
delta: bool) -> str: delta: bool) -> str:
"""If delta is True, only new text since the last call to """If delta is True, only new text since the last call to
...@@ -601,12 +592,12 @@ class Sequence: ...@@ -601,12 +592,12 @@ class Sequence:
designed for prefix caching mode. The final sequence hash is determined designed for prefix caching mode. The final sequence hash is determined
by applying token_ids from the sequence's blocks. by applying token_ids from the sequence's blocks.
""" """
if self.prompt_adapter_id == 0 and self.lora_int_id == 0: if self.lora_int_id == 0:
return None return None
# NOTE: If there are additional factors influencing the block aside from # NOTE: If there are additional factors influencing the block aside from
# token_ids, include them as input parameters to the hash. # token_ids, include them as input parameters to the hash.
return hash((self.prompt_adapter_id, self.lora_int_id)) return hash(self.lora_int_id)
def num_hashed_tokens_of_block(self, logical_idx: int): def num_hashed_tokens_of_block(self, logical_idx: int):
return logical_idx * self.block_size + self.block_size return logical_idx * self.block_size + self.block_size
...@@ -707,7 +698,6 @@ class SequenceGroup: ...@@ -707,7 +698,6 @@ class SequenceGroup:
encoder_seq: Optional, the single encoder sequence. Should be None encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model. unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers. trace_headers: OpenTelemetry trace headers.
prompt_adapter_request: Prompt Adapter request.
priority: User-defined priority of the request. priority: User-defined priority of the request.
draft_size: The number of speculative tokens plus one from the target draft_size: The number of speculative tokens plus one from the target
model; equal to max number of tokens a step can generate model; equal to max number of tokens a step can generate
...@@ -725,7 +715,6 @@ class SequenceGroup: ...@@ -725,7 +715,6 @@ class SequenceGroup:
pooled_data: Optional[torch.Tensor] = None, pooled_data: Optional[torch.Tensor] = None,
encoder_seq: Optional[Sequence] = None, encoder_seq: Optional[Sequence] = None,
trace_headers: Optional[Mapping[str, str]] = None, trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0, priority: int = 0,
draft_size: int = 1) -> None: draft_size: int = 1) -> None:
self.request_id = request_id self.request_id = request_id
...@@ -747,7 +736,6 @@ class SequenceGroup: ...@@ -747,7 +736,6 @@ class SequenceGroup:
self.state = SequenceGroupState() self.state = SequenceGroupState()
self.pooling_params = pooling_params self.pooling_params = pooling_params
self.pooled_data = pooled_data self.pooled_data = pooled_data
self.prompt_adapter_request = prompt_adapter_request
self.encoder_seq = encoder_seq self.encoder_seq = encoder_seq
self.trace_headers = trace_headers self.trace_headers = trace_headers
self.priority = priority self.priority = priority
...@@ -802,16 +790,6 @@ class SequenceGroup: ...@@ -802,16 +790,6 @@ class SequenceGroup:
def lora_int_id(self) -> int: def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0 return self.lora_request.lora_int_id if self.lora_request else 0
@property
def prompt_adapter_id(self) -> int:
return self.prompt_adapter_request.prompt_adapter_id \
if self.prompt_adapter_request else 0
@property
def prompt_adapter_num_virtual_tokens(self) -> int:
return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\
if self.prompt_adapter_request else 0
def init_multi_step(self, num_steps: int) -> None: def init_multi_step(self, num_steps: int) -> None:
self.state.num_steps = num_steps self.state.num_steps = num_steps
self.state.current_step = 0 self.state.current_step = 0
...@@ -1011,7 +989,6 @@ class SequenceGroupMetadata( ...@@ -1011,7 +989,6 @@ class SequenceGroupMetadata(
(SequenceGroup.encoder_seq). Should be None (SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder unless you are working with an encoder/decoder
model. model.
prompt_adapter_request: Prompt Adapter request.
""" """
request_id: str request_id: str
...@@ -1030,7 +1007,6 @@ class SequenceGroupMetadata( ...@@ -1030,7 +1007,6 @@ class SequenceGroupMetadata(
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
encoder_seq_data: Optional[SequenceData] = None encoder_seq_data: Optional[SequenceData] = None
cross_block_table: Optional[list[int]] = None cross_block_table: Optional[list[int]] = None
prompt_adapter_request: Optional[PromptAdapterRequest] = None
token_chunk_size: Optional[int] = None token_chunk_size: Optional[int] = None
### Stateful fields that are lazily defined. ### ### Stateful fields that are lazily defined. ###
...@@ -1052,16 +1028,6 @@ class SequenceGroupMetadata( ...@@ -1052,16 +1028,6 @@ class SequenceGroupMetadata(
def lora_int_id(self) -> int: def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0 return self.lora_request.lora_int_id if self.lora_request else 0
@property
def prompt_adapter_id(self) -> int:
return self.prompt_adapter_request.prompt_adapter_id \
if self.prompt_adapter_request else 0
@property
def prompt_adapter_num_virtual_tokens(self) -> int:
return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \
if self.prompt_adapter_request else 0
# Multi-Step Chunked-Prefill property # Multi-Step Chunked-Prefill property
@property @property
def is_single_step_prompt(self) -> bool: def is_single_step_prompt(self) -> bool:
...@@ -1525,7 +1491,6 @@ class ParallelSampleSequenceGroup(SequenceGroupBase): ...@@ -1525,7 +1491,6 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
pooled_data=seq_group.pooled_data, pooled_data=seq_group.pooled_data,
encoder_seq=seq_group.encoder_seq, encoder_seq=seq_group.encoder_seq,
trace_headers=seq_group.trace_headers, trace_headers=seq_group.trace_headers,
prompt_adapter_request=seq_group.prompt_adapter_request,
priority=seq_group.priority, priority=seq_group.priority,
) )
......
...@@ -128,10 +128,6 @@ STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only " ...@@ -128,10 +128,6 @@ STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only "
"backends currently supported with encoder/" "backends currently supported with encoder/"
"decoder models.") "decoder models.")
STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not "
"currently supported with encoder/"
"decoder models.")
# Efficiently import all enc/dec error strings # Efficiently import all enc/dec error strings
# rather than having to import all of the above # rather than having to import all of the above
STR_NOT_IMPL_ENC_DEC_ERR_STRS = { STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
...@@ -145,7 +141,6 @@ STR_NOT_IMPL_ENC_DEC_ERR_STRS = { ...@@ -145,7 +141,6 @@ STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
"STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM, "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM,
"STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC,
"STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND,
"STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER,
} }
# Constants related to forcing the attention backend selection # Constants related to forcing the attention backend selection
......
...@@ -20,7 +20,6 @@ from vllm.lora.request import LoRARequest ...@@ -20,7 +20,6 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.config import ( from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value) maybe_register_config_serialize_by_value)
...@@ -221,7 +220,6 @@ class AsyncLLM(EngineClient): ...@@ -221,7 +220,6 @@ class AsyncLLM(EngineClient):
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None, trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0, priority: int = 0,
data_parallel_rank: Optional[int] = None, data_parallel_rank: Optional[int] = None,
) -> RequestOutputCollector: ) -> RequestOutputCollector:
...@@ -238,8 +236,7 @@ class AsyncLLM(EngineClient): ...@@ -238,8 +236,7 @@ class AsyncLLM(EngineClient):
# Convert Input --> Request. # Convert Input --> Request.
prompt_str, request = self.processor.process_inputs( prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request, request_id, prompt, params, arrival_time, lora_request,
tokenization_kwargs, trace_headers, prompt_adapter_request, tokenization_kwargs, trace_headers, priority, data_parallel_rank)
priority, data_parallel_rank)
if is_pooling or params.n == 1: if is_pooling or params.n == 1:
await self._add_request(request, prompt_str, None, 0, queue) await self._add_request(request, prompt_str, None, 0, queue)
...@@ -283,7 +280,6 @@ class AsyncLLM(EngineClient): ...@@ -283,7 +280,6 @@ class AsyncLLM(EngineClient):
request_id: str, request_id: str,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None, trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0, priority: int = 0,
data_parallel_rank: Optional[int] = None, data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]: ) -> AsyncGenerator[RequestOutput, None]:
...@@ -314,7 +310,6 @@ class AsyncLLM(EngineClient): ...@@ -314,7 +310,6 @@ class AsyncLLM(EngineClient):
sampling_params, sampling_params,
lora_request=lora_request, lora_request=lora_request,
trace_headers=trace_headers, trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
priority=priority, priority=priority,
data_parallel_rank=data_parallel_rank, data_parallel_rank=data_parallel_rank,
) )
......
...@@ -17,7 +17,6 @@ from vllm.lora.request import LoRARequest ...@@ -17,7 +17,6 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import ( from vllm.transformers_utils.tokenizer_group import (
TokenizerGroup, init_tokenizer_from_configs) TokenizerGroup, init_tokenizer_from_configs)
...@@ -192,7 +191,6 @@ class LLMEngine: ...@@ -192,7 +191,6 @@ class LLMEngine:
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None, trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0, priority: int = 0,
) -> None: ) -> None:
# Validate the request_id type. # Validate the request_id type.
...@@ -203,8 +201,7 @@ class LLMEngine: ...@@ -203,8 +201,7 @@ class LLMEngine:
# Process raw inputs into the request. # Process raw inputs into the request.
prompt_str, request = self.processor.process_inputs( prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request, request_id, prompt, params, arrival_time, lora_request,
tokenization_kwargs, trace_headers, prompt_adapter_request, tokenization_kwargs, trace_headers, priority)
priority)
n = params.n if isinstance(params, SamplingParams) else 1 n = params.n if isinstance(params, SamplingParams) else 1
......
...@@ -16,7 +16,6 @@ from vllm.multimodal.inputs import PlaceholderRange ...@@ -16,7 +16,6 @@ from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import merge_and_sort_multimodal_metadata from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
...@@ -226,7 +225,6 @@ class Processor: ...@@ -226,7 +225,6 @@ class Processor:
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None, trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0, priority: int = 0,
data_parallel_rank: Optional[int] = None, data_parallel_rank: Optional[int] = None,
) -> tuple[Optional[str], EngineCoreRequest]: ) -> tuple[Optional[str], EngineCoreRequest]:
...@@ -237,8 +235,6 @@ class Processor: ...@@ -237,8 +235,6 @@ class Processor:
self._validate_params(params, lora_request) self._validate_params(params, lora_request)
if trace_headers is not None: if trace_headers is not None:
raise ValueError("V1 does not support tracing yet.") raise ValueError("V1 does not support tracing yet.")
if prompt_adapter_request is not None:
raise ValueError("V1 does not support prompt_adapter_request.")
data_parallel_size = self.vllm_config.parallel_config.data_parallel_size data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
if data_parallel_rank is not None and not (0 <= data_parallel_rank < if data_parallel_rank is not None and not (0 <= data_parallel_rank <
...@@ -253,12 +249,10 @@ class Processor: ...@@ -253,12 +249,10 @@ class Processor:
# 1. Tokenize text prompt, with LoRA request if one exists. # 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess # 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly. # multimodal data and expand prompt token ids accordingly.
# 3. Apply prompt adapter to prompt token ids if one exists.
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=self.use_hash, return_mm_hashes=self.use_hash,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
......
...@@ -318,8 +318,6 @@ def report_usage_stats( ...@@ -318,8 +318,6 @@ def report_usage_stats(
# Feature flags # Feature flags
"enable_lora": "enable_lora":
bool(vllm_config.lora_config), bool(vllm_config.lora_config),
"enable_prompt_adapter":
bool(vllm_config.prompt_adapter_config),
"enable_prefix_caching": "enable_prefix_caching":
vllm_config.cache_config.enable_prefix_caching, vllm_config.cache_config.enable_prefix_caching,
"enforce_eager": "enforce_eager":
......
...@@ -104,7 +104,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -104,7 +104,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.parallel_config = vllm_config.parallel_config self.parallel_config = vllm_config.parallel_config
self.scheduler_config = vllm_config.scheduler_config self.scheduler_config = vllm_config.scheduler_config
self.speculative_config = vllm_config.speculative_config self.speculative_config = vllm_config.speculative_config
self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config self.observability_config = vllm_config.observability_config
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
......
...@@ -114,7 +114,6 @@ class TPUModelRunner(LoRAModelRunnerMixin): ...@@ -114,7 +114,6 @@ class TPUModelRunner(LoRAModelRunnerMixin):
self.original_parallel_config = original_parallel_config self.original_parallel_config = original_parallel_config
self.scheduler_config = vllm_config.scheduler_config self.scheduler_config = vllm_config.scheduler_config
self.speculative_config = vllm_config.speculative_config self.speculative_config = vllm_config.speculative_config
self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config self.observability_config = vllm_config.observability_config
self.device_config = vllm_config.device_config self.device_config = vllm_config.device_config
......
...@@ -62,7 +62,6 @@ class TPUWorker: ...@@ -62,7 +62,6 @@ class TPUWorker:
self.scheduler_config = vllm_config.scheduler_config self.scheduler_config = vllm_config.scheduler_config
self.device_config = vllm_config.device_config self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config self.speculative_config = vllm_config.speculative_config
self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config self.observability_config = vllm_config.observability_config
self.parallel_config.rank = rank self.parallel_config.rank = rank
......
...@@ -91,10 +91,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): ...@@ -91,10 +91,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
''' '''
EncoderDecoderModelRunner constructor. EncoderDecoderModelRunner constructor.
`lora_config` and `prompt_adapter_config` are `lora_config` is unused (since these features are not yet supported
unused (since these features are not yet supported for encoder/decoder for encoder/decoder models) but these arguments are present here for
models) but these arguments are present here for compatibility with compatibility with the base-class constructor.
the base-class constructor.
''' '''
self._maybe_force_supported_attention_backend() self._maybe_force_supported_attention_backend()
......
...@@ -45,10 +45,6 @@ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes ...@@ -45,10 +45,6 @@ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalKwargs, MultiModalPlaceholderMap, MultiModalKwargs, MultiModalPlaceholderMap,
MultiModalRegistry) MultiModalRegistry)
from vllm.prompt_adapter.layers import PromptAdapterMapping
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.prompt_adapter.worker_manager import (
LRUCacheWorkerPromptAdapterManager)
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache,
...@@ -95,8 +91,6 @@ class ModelInputForGPU(ModelRunnerInputBase): ...@@ -95,8 +91,6 @@ class ModelInputForGPU(ModelRunnerInputBase):
lora_mapping: Optional["LoRAMapping"] = None lora_mapping: Optional["LoRAMapping"] = None
lora_requests: Optional[Set[LoRARequest]] = None lora_requests: Optional[Set[LoRARequest]] = None
attn_metadata: Optional["AttentionMetadata"] = None attn_metadata: Optional["AttentionMetadata"] = None
prompt_adapter_mapping: Optional[PromptAdapterMapping] = None
prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None
multi_modal_kwargs: Optional[BatchedTensorInputs] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None
request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
finished_requests_ids: Optional[List[str]] = None finished_requests_ids: Optional[List[str]] = None
...@@ -113,8 +107,6 @@ class ModelInputForGPU(ModelRunnerInputBase): ...@@ -113,8 +107,6 @@ class ModelInputForGPU(ModelRunnerInputBase):
"lora_requests": self.lora_requests, "lora_requests": self.lora_requests,
"lora_mapping": self.lora_mapping, "lora_mapping": self.lora_mapping,
"multi_modal_kwargs": self.multi_modal_kwargs, "multi_modal_kwargs": self.multi_modal_kwargs,
"prompt_adapter_mapping": self.prompt_adapter_mapping,
"prompt_adapter_requests": self.prompt_adapter_requests,
"virtual_engine": self.virtual_engine, "virtual_engine": self.virtual_engine,
"request_ids_to_seq_ids": self.request_ids_to_seq_ids, "request_ids_to_seq_ids": self.request_ids_to_seq_ids,
"finished_requests_ids": self.finished_requests_ids, "finished_requests_ids": self.finished_requests_ids,
...@@ -164,8 +156,6 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): ...@@ -164,8 +156,6 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
"lora_requests": self.lora_requests, "lora_requests": self.lora_requests,
"lora_mapping": self.lora_mapping, "lora_mapping": self.lora_mapping,
"multi_modal_kwargs": self.multi_modal_kwargs, "multi_modal_kwargs": self.multi_modal_kwargs,
"prompt_adapter_mapping": self.prompt_adapter_mapping,
"prompt_adapter_requests": self.prompt_adapter_requests,
"virtual_engine": self.virtual_engine, "virtual_engine": self.virtual_engine,
"request_ids_to_seq_ids": self.request_ids_to_seq_ids, "request_ids_to_seq_ids": self.request_ids_to_seq_ids,
"finished_requests_ids": self.finished_requests_ids, "finished_requests_ids": self.finished_requests_ids,
...@@ -212,8 +202,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -212,8 +202,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self.lora_index_mapping.clear() # type: ignore self.lora_index_mapping.clear() # type: ignore
self.lora_prompt_mapping.clear() # type: ignore self.lora_prompt_mapping.clear() # type: ignore
self.lora_requests.clear() # type: ignore self.lora_requests.clear() # type: ignore
self.prompt_adapter_index_mapping.clear() # type: ignore
self.prompt_adapter_prompt_mapping.clear() # type: ignore
def __init__( def __init__(
self, self,
...@@ -252,11 +240,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -252,11 +240,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
lora_prompt_mapping: Optional[List[List[int]]] = None, lora_prompt_mapping: Optional[List[List[int]]] = None,
lora_requests: Optional[Set[LoRARequest]] = None, lora_requests: Optional[Set[LoRARequest]] = None,
# Prompt adapter inputs.
prompt_adapter_index_mapping: Optional[List[int]] = None,
prompt_adapter_prompt_mapping: Optional[List[int]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
# Multi-modal inputs. # Multi-modal inputs.
multi_modal_kwargs: Optional[MultiModalKwargs] = None, multi_modal_kwargs: Optional[MultiModalKwargs] = None,
multi_modal_placeholder_maps: Optional[Dict[ multi_modal_placeholder_maps: Optional[Dict[
...@@ -360,18 +343,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -360,18 +343,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
else: else:
self.lora_requests.clear() self.lora_requests.clear()
if prompt_adapter_index_mapping:
self.prompt_adapter_index_mapping = \
prompt_adapter_index_mapping
else:
self.prompt_adapter_index_mapping.clear()
if prompt_adapter_prompt_mapping:
self.prompt_adapter_prompt_mapping = \
prompt_adapter_prompt_mapping
else:
self.prompt_adapter_prompt_mapping.clear()
else: else:
self.input_tokens = input_tokens or [] self.input_tokens = input_tokens or []
self.inputs_embeds = inputs_embeds self.inputs_embeds = inputs_embeds
...@@ -390,12 +361,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -390,12 +361,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self.lora_prompt_mapping = lora_prompt_mapping or [] self.lora_prompt_mapping = lora_prompt_mapping or []
self.lora_requests = lora_requests or set() self.lora_requests = lora_requests or set()
self.prompt_adapter_index_mapping = (
prompt_adapter_index_mapping or [])
self.prompt_adapter_prompt_mapping = (
prompt_adapter_prompt_mapping or [])
self.prompt_adapter_request = prompt_adapter_request
self.multi_modal_kwargs = multi_modal_kwargs self.multi_modal_kwargs = multi_modal_kwargs
self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.multi_modal_placeholder_maps = multi_modal_placeholder_maps
self.prefix_cache_hit = prefix_cache_hit self.prefix_cache_hit = prefix_cache_hit
...@@ -485,7 +450,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -485,7 +450,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Compute functions for each sequence group. # Compute functions for each sequence group.
# WARNING: The order of the functions matters! # WARNING: The order of the functions matters!
self.per_seq_group_compute_fns = [ self.per_seq_group_compute_fns = [
self._compute_prompt_adapter_input,
self._compute_multi_modal_input, self._compute_multi_modal_input,
] ]
...@@ -496,8 +460,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -496,8 +460,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self.sliding_window = self.runner.sliding_window self.sliding_window = self.runner.sliding_window
self.block_size = self.runner.block_size self.block_size = self.runner.block_size
self.enable_lora = self.runner.lora_config is not None self.enable_lora = self.runner.lora_config is not None
self.enable_prompt_adapter = (self.runner.prompt_adapter_config
is not None)
# Attention metadata inputs. # Attention metadata inputs.
if self.attn_backend is not None: if self.attn_backend is not None:
...@@ -693,34 +655,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -693,34 +655,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
else: else:
inter_data.lora_prompt_mapping.append([]) inter_data.lora_prompt_mapping.append([])
def _compute_prompt_adapter_input(
self, inter_data: InterDataForSeqGroup,
seq_group_metadata: SequenceGroupMetadata):
"""If prompt adapter is enabled, compute index and prompt mapping.
"""
# Note that when is_prompt=True, we expect only one sequence
# in the group.
if not self.enable_prompt_adapter:
return
prompt_adapter_id = seq_group_metadata.prompt_adapter_id
if prompt_adapter_id <= 0 or not inter_data.is_prompt:
return
# We expect only one sequence in the group when is_prompt=True.
assert inter_data.n_seqs == 1
query_len = inter_data.query_lens[0]
inter_data.prompt_adapter_request = (
seq_group_metadata.prompt_adapter_request)
num_tokens = seq_group_metadata.prompt_adapter_num_virtual_tokens
inter_data.prompt_adapter_index_mapping = [
prompt_adapter_id
] * num_tokens + [0] * (query_len - num_tokens)
inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * (
query_len if seq_group_metadata.sampling_params
and seq_group_metadata.sampling_params.prompt_logprobs else 1)
def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
seq_group_metadata: SequenceGroupMetadata): seq_group_metadata: SequenceGroupMetadata):
"""If multi-modal data is given, add it to the input.""" """If multi-modal data is given, add it to the input."""
...@@ -1009,29 +943,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -1009,29 +943,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prompt_mapping=lora_prompt_mapping, prompt_mapping=lora_prompt_mapping,
is_prefill=not self.decode_only)) is_prefill=not self.decode_only))
# Prompt adapter data.
prompt_adapter_requests: Set[PromptAdapterRequest] = set()
prompt_adapter_mapping = None
if self.enable_prompt_adapter:
prompt_adapter_requests = set(
data.prompt_adapter_request for data in self.inter_data_list
if data.prompt_adapter_request is not None)
prompt_adapter_index_mapping = flatten_2d_lists([
inter_data.prompt_adapter_index_mapping
for inter_data in self.inter_data_list
])
if cuda_graph_pad_size:
prompt_adapter_index_mapping.extend(
itertools.repeat(0, cuda_graph_pad_size))
prompt_adapter_prompt_mapping = flatten_2d_lists([
inter_data.prompt_adapter_prompt_mapping
for inter_data in self.inter_data_list
])
prompt_adapter_mapping = PromptAdapterMapping(
prompt_adapter_index_mapping,
prompt_adapter_prompt_mapping,
)
# Multi-modal data. # Multi-modal data.
multi_modal_kwargs_list = [ multi_modal_kwargs_list = [
data.multi_modal_kwargs for data in self.inter_data_list data.multi_modal_kwargs for data in self.inter_data_list
...@@ -1051,9 +962,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -1051,9 +962,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
lora_requests=lora_requests, lora_requests=lora_requests,
multi_modal_kwargs=multi_modal_kwargs, multi_modal_kwargs=multi_modal_kwargs,
request_ids_to_seq_ids=request_ids_to_seq_ids, request_ids_to_seq_ids=request_ids_to_seq_ids,
finished_requests_ids=self.finished_requests_ids, finished_requests_ids=self.finished_requests_ids)
prompt_adapter_mapping=prompt_adapter_mapping,
prompt_adapter_requests=prompt_adapter_requests)
class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...@@ -1148,7 +1057,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1148,7 +1057,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self.model: nn.Module # Set after load_model self.model: nn.Module # Set after load_model
# Set after load_model. # Set after load_model.
self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None
self.sampler = get_sampler() self.sampler = get_sampler()
set_cpu_offload_max_bytes( set_cpu_offload_max_bytes(
...@@ -1207,14 +1115,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1207,14 +1115,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
logger.info("Model loading took %.4f GiB and %.6f seconds", logger.info("Model loading took %.4f GiB and %.6f seconds",
self.model_memory_usage / GiB_bytes, self.model_memory_usage / GiB_bytes,
time_after_load - time_before_load) time_after_load - time_before_load)
if self.prompt_adapter_config:
self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
self.scheduler_config.max_num_seqs,
self.scheduler_config.max_num_batched_tokens, self.device,
self.prompt_adapter_config)
self.model = (
self.prompt_adapter_manager.create_prompt_adapter_manager(
self.model))
if self.vllm_config.compilation_config.level ==\ if self.vllm_config.compilation_config.level ==\
CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): CompilationLevel.DYNAMO_AS_IS and supports_dynamo():
...@@ -1466,40 +1367,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1466,40 +1367,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
raise RuntimeError("LoRA is not enabled.") raise RuntimeError("LoRA is not enabled.")
return self.lora_manager.list_adapters() return self.lora_manager.list_adapters()
def remove_all_prompt_adapters(self):
if not self.prompt_adapter_manager:
raise RuntimeError("PromptAdapter is not enabled.")
self.prompt_adapter_manager.remove_all_adapters()
def set_active_prompt_adapters(
self, prompt_adapter_requests: Set[PromptAdapterRequest],
prompt_adapter_mapping: PromptAdapterMapping) -> None:
if not self.prompt_adapter_manager:
raise RuntimeError("PromptAdapter is not enabled.")
self.prompt_adapter_manager.set_active_adapters(
prompt_adapter_requests, prompt_adapter_mapping)
def add_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
if not self.prompt_adapter_manager:
raise RuntimeError("PromptAdapter is not enabled.")
return self.prompt_adapter_manager.add_adapter(prompt_adapter_request)
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
if not self.prompt_adapter_manager:
raise RuntimeError("PromptAdapter is not enabled.")
return self.prompt_adapter_manager.remove_adapter(prompt_adapter_id)
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
if not self.prompt_adapter_manager:
raise RuntimeError("PromptAdapter is not enabled.")
return self.prompt_adapter_manager.pin_adapter(prompt_adapter_id)
def list_prompt_adapters(self) -> Set[int]:
if not self.prompt_adapter_manager:
raise RuntimeError("PromptAdapter is not enabled.")
return self.prompt_adapter_manager.list_adapters()
@torch.inference_mode() @torch.inference_mode()
def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
"""Cuda graph capture a model. """Cuda graph capture a model.
...@@ -1609,13 +1476,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1609,13 +1476,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self.set_active_loras(set([dummy_lora_request]), self.set_active_loras(set([dummy_lora_request]),
lora_mapping) lora_mapping)
if self.prompt_adapter_config:
prompt_adapter_mapping = PromptAdapterMapping(
[-1] * batch_size,
[-1] * batch_size,
)
self.set_active_prompt_adapters(
set(), prompt_adapter_mapping)
graph_runner = CUDAGraphRunner( graph_runner = CUDAGraphRunner(
self.model, self.attn_backend.get_name(), self.model, self.attn_backend.get_name(),
self.attn_state.graph_clone(batch_size), self.attn_state.graph_clone(batch_size),
...@@ -1776,13 +1636,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): ...@@ -1776,13 +1636,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
self.set_active_loras(model_input.lora_requests, self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping) model_input.lora_mapping)
if self.prompt_adapter_config:
assert model_input.prompt_adapter_requests is not None
assert model_input.prompt_adapter_mapping is not None
self.set_active_prompt_adapters(
model_input.prompt_adapter_requests,
model_input.prompt_adapter_mapping)
self.attn_state.begin_forward(model_input) self.attn_state.begin_forward(model_input)
# Currently cuda graph is only supported by the decode phase. # Currently cuda graph is only supported by the decode phase.
......
...@@ -190,7 +190,6 @@ class ModelRunnerBase(ABC, Generic[T]): ...@@ -190,7 +190,6 @@ class ModelRunnerBase(ABC, Generic[T]):
self.scheduler_config = vllm_config.scheduler_config self.scheduler_config = vllm_config.scheduler_config
self.device_config = vllm_config.device_config self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config self.speculative_config = vllm_config.speculative_config
self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config self.observability_config = vllm_config.observability_config
# Map of request_id -> generator used for seeded random sampling # Map of request_id -> generator used for seeded random sampling
......
...@@ -288,9 +288,6 @@ class StatefulModelInput(BroadcastableModelInput): ...@@ -288,9 +288,6 @@ class StatefulModelInput(BroadcastableModelInput):
assert fmi.lora_requests is not None assert fmi.lora_requests is not None
assert len(fmi.lora_requests) == 0 assert len(fmi.lora_requests) == 0
assert fmi.attn_metadata is not None assert fmi.attn_metadata is not None
assert fmi.prompt_adapter_mapping is None
assert fmi.prompt_adapter_requests is not None
assert len(fmi.prompt_adapter_requests) == 0
assert fmi.multi_modal_kwargs is not None assert fmi.multi_modal_kwargs is not None
assert len(fmi.multi_modal_kwargs) == 0 assert len(fmi.multi_modal_kwargs) == 0
......
...@@ -64,13 +64,6 @@ class PoolingModelRunner( ...@@ -64,13 +64,6 @@ class PoolingModelRunner(
self.set_active_loras(model_input.lora_requests, self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping) model_input.lora_mapping)
if self.prompt_adapter_config:
assert model_input.prompt_adapter_requests is not None
assert model_input.prompt_adapter_mapping is not None
self.set_active_prompt_adapters(
model_input.prompt_adapter_requests,
model_input.prompt_adapter_mapping)
# Currently cuda graph is only supported by the decode phase. # Currently cuda graph is only supported by the decode phase.
assert model_input.attn_metadata is not None assert model_input.attn_metadata is not None
prefill_meta = model_input.attn_metadata.prefill_metadata prefill_meta = model_input.attn_metadata.prefill_metadata
......
...@@ -47,7 +47,3 @@ def assert_enc_dec_mr_supported_scenario( ...@@ -47,7 +47,3 @@ def assert_enc_dec_mr_supported_scenario(
if enc_dec_mr.scheduler_config.num_lookahead_slots > 0: if enc_dec_mr.scheduler_config.num_lookahead_slots > 0:
raise NotImplementedError( raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC']) STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC'])
if enc_dec_mr.prompt_adapter_config is not None:
raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[
'STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER'])
...@@ -22,7 +22,6 @@ from vllm.model_executor import set_random_seed ...@@ -22,7 +22,6 @@ from vllm.model_executor import set_random_seed
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
SequenceGroupMetadata, SequenceGroupMetadataDelta) SequenceGroupMetadata, SequenceGroupMetadataDelta)
from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
...@@ -513,19 +512,6 @@ class Worker(LocalOrDistributedWorkerBase): ...@@ -513,19 +512,6 @@ class Worker(LocalOrDistributedWorkerBase):
def list_loras(self) -> Set[int]: def list_loras(self) -> Set[int]:
return self.model_runner.list_loras() return self.model_runner.list_loras()
def add_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
return self.model_runner.add_prompt_adapter(prompt_adapter_request)
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
return self.model_runner.remove_lora(prompt_adapter_id)
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
return self.model_runner.pin_prompt_adapter(prompt_adapter_id)
def list_prompt_adapters(self) -> Set[int]:
return self.model_runner.list_prompt_adapters()
@property @property
def max_model_len(self) -> int: def max_model_len(self) -> int:
return self.model_config.max_model_len return self.model_config.max_model_len
......
...@@ -49,7 +49,6 @@ class WorkerBase: ...@@ -49,7 +49,6 @@ class WorkerBase:
self.scheduler_config = vllm_config.scheduler_config self.scheduler_config = vllm_config.scheduler_config
self.device_config = vllm_config.device_config self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config self.speculative_config = vllm_config.speculative_config
self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config self.observability_config = vllm_config.observability_config
self.kv_transfer_config = vllm_config.kv_transfer_config self.kv_transfer_config = vllm_config.kv_transfer_config
self.compilation_config = vllm_config.compilation_config self.compilation_config = vllm_config.compilation_config
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment