Commit 539aa992 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.2' into v0.6.2-dev

parents 93872128 7193774b
from transformers.models.mllama import configuration_mllama as mllama_hf_config
class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
'''
Use this class to override is_encoder_decoder:
- transformers regards mllama as is_encoder_decoder=False
- vllm needs is_encoder_decoder=True to enable cross-attention
'''
def __init__(
self,
**kwargs,
):
super().__init__(**kwargs)
self.is_encoder_decoder = True
class MllamaConfig(mllama_hf_config.MllamaConfig):
def __init__(
self,
text_config=None,
**kwargs,
):
if isinstance(text_config, dict):
text_config = MllamaTextConfig(**text_config)
super().__init__(text_config=text_config, **kwargs)
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Solar model configuration"""
from transformers import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class SolarConfig(PretrainedConfig):
r"""
This is the configuration class to store
the configuration of a [`SolarModel`].
It is used to instantiate an LLaMA model
according to the specified arguments,
defining the model architecture.
Instantiating a configuration with the
defaults will yield a similar
configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`]
and can be used to control the model outputs.
Read the documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model.
Defines the number of different tokens
that can be represented by the `inputs_ids`
passed when calling [`SolarModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer
in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that
should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`,
the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model
will use Multi Query Attention (MQA)
otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint,
each group key and value head should be constructed
by meanpooling all the original heads within that group.
For more details checkout [this paper]
(https://arxiv.org/pdf/2305.13245.pdf).
If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string)
in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
Solar 1 supports up to 2048 tokens,
Solar 2 up to 4096, CodeSolar up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of
the truncated_normal_initializer for initializing
all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return
the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank
used during pretraining.
Please refer to [this
document](https://huggingface.co/docs/
transformers/main/
perf_train_gpu_many#tensor-parallelism)
to understand more about it. This value is
necessary to ensure exact reproducibility
of the pretraining results.
Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for
the RoPE embeddings.
Currently supports two scaling
strategies: linear and dynamic.
Their scaling factor must be a float greater than 1.
The expected format is
`{"type": strategy name, "factor": scaling factor}`.
When using this flag, don't update
`max_position_embeddings` to the expected new maximum.
See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking
API changes in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value
and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj
layers in the MLP layers.
sliding_window (`int`, *optional*, defaults to 2047):
Sliding window attention window size. If not specified,
will default to `2047`.
```python
>>> from transformers import SolarModel, SolarConfig
>>> # Initializing a Solar-pro style configuration
>>> configuration = SolarConfig()
>>> # Initializing a model from the Solar-pro style configuration
>>> model = SolarModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "solar"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
sliding_window=2047,
bskcn_1=None,
bskcn_2=None,
bskcn_3=None,
bskcn_4=None,
bskcn_tv=None,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.sliding_window = sliding_window
self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if (not isinstance(self.rope_scaling, dict)
or len(self.rope_scaling) != 2):
raise ValueError(
"`rope_scaling` must be a dictionary with two fields,"
" `type` and `factor`, "
f"got {self.rope_scaling}")
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in [
"linear",
"dynamic",
]:
raise ValueError(f"`rope_scaling`'s type field must be one of "
f"['linear', 'dynamic'], got {rope_scaling_type}")
if (rope_scaling_factor is None
or not isinstance(rope_scaling_factor, float)
or rope_scaling_factor <= 1.0):
raise ValueError(
f"`rope_scaling`'s factor field must be a float > 1,"
f" got {rope_scaling_factor}")
from typing import Dict, List, Optional, Tuple
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
Sequence, SequenceGroup)
from .tokenizer import AnyTokenizer
from .tokenizer_group import BaseTokenizerGroup
# Used eg. for marking rejected tokens in spec decoding.
INVALID_TOKEN_ID = -1
class Detokenizer:
"""Provides methods to decode the output of a model into text."""
......@@ -61,7 +59,7 @@ class Detokenizer:
continue
for token_id, sample_logprob in prompt_logprobs_for_token.items():
if (sample_logprob.decoded_token is None
and token_id != INVALID_TOKEN_ID):
and token_id != VLLM_INVALID_TOKEN_ID):
prompt_token_ids_with_token = (
prompt_token_ids[:token_position] + [token_id])
(new_tokens, new_text, new_prefix_offset,
......@@ -143,7 +141,7 @@ class Detokenizer:
continue
if (sample_logprob.decoded_token is None
and token_id != INVALID_TOKEN_ID):
and token_id != VLLM_INVALID_TOKEN_ID):
all_input_ids_with_logprob = previous_tokens + [token_id]
(_, new_text, _, _) = detokenize_incrementally(
tokenizer=tokenizer,
......@@ -282,14 +280,14 @@ def detokenize_incrementally(
assert prev_tokens is not None
# If the new token id is out of bounds, return an empty string.
if new_token_id >= len(tokenizer):
new_tokens = [""]
else:
if 0 <= new_token_id < len(tokenizer):
# Put new_token_id in a list so skip_special_tokens is respected
new_tokens = tokenizer.convert_ids_to_tokens(
[new_token_id], skip_special_tokens=skip_special_tokens)
if isinstance(new_tokens, str):
new_tokens = [new_tokens]
else:
new_tokens = [""]
output_tokens = prev_tokens + new_tokens
# If this is the first iteration, return all tokens.
......
from typing import cast
def get_video_processor(
processor_name: str,
trust_remote_code: bool = False,
):
"""
Gets a processor for the given model name via HuggingFace.
"""
from transformers import AutoProcessor
try:
processor = AutoProcessor.from_pretrained(processor_name)
video_processor = processor.video_processor
except ValueError as e:
if not trust_remote_code:
err_msg = (
"Failed to load the processor. If the processor is "
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
return video_processor
def get_image_processor(
processor_name: str,
*args,
trust_remote_code: bool = False,
**kwargs,
):
"""Gets an image processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor
try:
processor = AutoImageProcessor.from_pretrained(
processor_name,
*args,
trust_remote_code=trust_remote_code,
**kwargs)
except ValueError as e:
# If the error pertains to the processor class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
# Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
if not trust_remote_code:
err_msg = (
"Failed to load the image processor. If the image processor is "
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
return cast(BaseImageProcessor, processor)
from typing import cast
from typing import Any, cast
def get_processor(
processor_name: str,
*args,
*args: Any,
trust_remote_code: bool = False,
**kwargs,
**kwargs: Any,
):
"""Gets a processor for the given model name via HuggingFace."""
"""Load a processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor
......@@ -35,3 +35,60 @@ def get_processor(
raise e
return cast(ProcessorMixin, processor)
def get_image_processor(
processor_name: str,
*args: Any,
trust_remote_code: bool = False,
**kwargs: Any,
):
"""Load an image processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor
try:
processor = AutoImageProcessor.from_pretrained(
processor_name,
*args,
trust_remote_code=trust_remote_code,
**kwargs)
except ValueError as e:
# If the error pertains to the processor class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
# Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
if not trust_remote_code:
err_msg = (
"Failed to load the image processor. If the image processor is "
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
return cast(BaseImageProcessor, processor)
def get_video_processor(
processor_name: str,
*args: Any,
trust_remote_code: bool = False,
**kwargs: Any,
):
"""Load a video processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers.image_processing_utils import BaseImageProcessor
processor = get_processor(
processor_name,
*args,
trust_remote_code=trust_remote_code,
**kwargs,
)
return cast(BaseImageProcessor, processor.video_processor)
......@@ -111,7 +111,6 @@ def get_tokenizer(
'encoding and decoding.',
FutureWarning,
stacklevel=2)
if tokenizer_mode == "mistral":
tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
revision=revision)
......
......@@ -165,10 +165,9 @@ class MistralTokenizer:
messages: List["ChatCompletionMessageParam"],
tools: Optional[Dict[str, Any]] = None,
**kwargs) -> List[int]:
assert tools is None, "`tools` are not yet supported."
request = ChatCompletionRequest(
messages=messages) # type: ignore[type-var]
request = ChatCompletionRequest(messages=messages,
tools=tools) # type: ignore[type-var]
encoded = self.mistral.encode_chat_completion(request)
# encode-decode to get clean prompt
......@@ -176,9 +175,29 @@ class MistralTokenizer:
def convert_tokens_to_string(self, tokens: List[str]) -> str:
if isinstance(self.tokenizer, Tekkenizer):
return "".join(tokens)
tokens = [
t for t in tokens
if t not in self.tokenizer._all_special_tokens
]
if any(isinstance(t, bytes) for t in tokens):
# we need to encode and decode all tokens again
shift = self.tokenizer.num_special_tokens
byte_tokens = [
t.encode("utf-8") if not isinstance(t, bytes) else t
for t in tokens
]
ids = [
self.tokenizer._tekken_token2id_nospecial[t] + shift
for t in byte_tokens
]
decoded = self.tokenizer.decode(ids)
else:
decoded = "".join(tokens)
else:
return self.tokenizer.decode(tokens) # type: ignore[arg-type]
decoded = self.tokenizer.decode(tokens) # type: ignore[arg-type]
return decoded
def decode(self, ids: Union[List[int], int]) -> str:
if isinstance(ids, int):
......@@ -200,4 +219,11 @@ class MistralTokenizer:
self.tokenizer)
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
if any(t.strip() == "�" for t in tokens):
# if any stripped decoded token is undefined
# because it's invalid unicode then pass bytes
# See: https://github.com/vllm-project/vllm/pull/8640
tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
return tokens
......@@ -35,8 +35,8 @@ class LibEntry(triton.KernelInterface):
dns_key = [
arg.dtype if hasattr(
arg, "data_ptr") else type(arg) if not isinstance(arg, int)
else "i32" if -(2**31) <= arg and arg <= 2**31 -
1 else "u64" if 2**63 <= arg and arg <= 2**64 - 1 else "i64"
else "i32" if arg >= -(2**31) and arg <= 2**31 -
1 else "u64" if arg >= 2**63 and arg <= 2**64 - 1 else "i64"
for arg in dns_args
]
# const args passed by position
......
import math
# This is a hardcoded limit in Triton (max block size).
MAX_TRITON_N_COLS = 131072
def get_num_triton_sampler_splits(n_cols: int) -> int:
"""Get the number of splits to use for Triton sampling.
Triton has a limit on the number of columns it can handle, so we need to
split the tensor and call the kernel multiple times if it's too large.
"""
return math.ceil(n_cols / MAX_TRITON_N_COLS)
......@@ -17,6 +17,7 @@ import torch
import vllm.envs as envs
from vllm.connections import global_http_connection
from vllm.platforms import current_platform
from vllm.version import __version__ as VLLM_VERSION
_config_home = envs.VLLM_CONFIG_ROOT
......@@ -151,7 +152,7 @@ class UsageMessage:
usage_context: UsageContext,
extra_kvs: Dict[str, Any]) -> None:
# Platform information
if torch.cuda.is_available():
if current_platform.is_cuda_alike():
device_property = torch.cuda.get_device_properties(0)
self.gpu_count = torch.cuda.device_count()
self.gpu_type = device_property.name
......
......@@ -4,7 +4,10 @@ import contextlib
import datetime
import enum
import gc
import inspect
import ipaddress
import os
import random
import socket
import subprocess
import sys
......@@ -12,6 +15,7 @@ import tempfile
import threading
import uuid
import warnings
import weakref
from asyncio import FIRST_COMPLETED, ensure_future
from functools import lru_cache, partial, wraps
from platform import uname
......@@ -31,6 +35,7 @@ from typing_extensions import ParamSpec, TypeIs, assert_never
import vllm.envs as envs
from vllm.logger import enable_trace_function_call, init_logger
from vllm.platforms import current_platform
logger = init_logger(__name__)
......@@ -70,10 +75,6 @@ STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ("Speculative decoding is not "
"currently supported with encoder/"
"decoder models.")
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH = ("CUDAGraph is not "
"currently supported with encoder/"
"decoder models.")
STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers is the only backend "
"currently supported with encoder/"
"decoder models.")
......@@ -97,7 +98,6 @@ STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
"STR_NOT_IMPL_ENC_DEC_PP": STR_NOT_IMPL_ENC_DEC_PP,
"STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM,
"STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC,
"STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH": STR_NOT_IMPL_ENC_DEC_CUDAGRAPH,
"STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND,
"STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER,
"STR_NOT_IMPL_ENC_DEC_CPU": STR_NOT_IMPL_ENC_DEC_CPU
......@@ -269,7 +269,7 @@ class LRUCache(Generic[T]):
class PyObjectCache:
"""Used to cache python objects to avoid object allocations
"""Used to cache python objects to avoid object allocations
across scheduler iterations.
"""
......@@ -288,7 +288,7 @@ class PyObjectCache:
self._obj_cache.append(self._obj_builder())
def get_object(self):
"""Returns a pre-allocated cached object. If there is not enough
"""Returns a pre-allocated cached object. If there is not enough
objects, then the cache size will double.
"""
if self._index >= len(self._obj_cache):
......@@ -377,6 +377,22 @@ def get_cpu_memory() -> int:
return psutil.virtual_memory().total
def seed_everything(seed: int) -> None:
"""
Set the seed of each random module.
Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
"""
random.seed(seed)
np.random.seed(seed)
if current_platform.is_cuda_alike():
torch.cuda.manual_seed_all(seed)
if is_xpu():
torch.xpu.manual_seed_all(seed)
def random_uuid() -> str:
return str(uuid.uuid4().hex)
......@@ -518,6 +534,14 @@ def get_ip() -> str:
return "0.0.0.0"
def is_valid_ipv6_address(address: str) -> bool:
try:
ipaddress.IPv6Address(address)
return True
except ValueError:
return False
def get_distributed_init_method(ip: str, port: int) -> str:
# Brackets are not permitted in ipv4 addresses,
# see https://github.com/python/cpython/issues/103848
......@@ -638,9 +662,7 @@ def create_kv_caches_with_random_flash(
seed: int = 0,
device: Optional[str] = "cuda",
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
seed_everything(seed)
torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
......@@ -682,9 +704,7 @@ def create_kv_caches_with_random(
f"Does not support key cache of type fp8 with head_size {head_size}"
)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
seed_everything(seed)
torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
......@@ -747,14 +767,14 @@ def is_pin_memory_available() -> bool:
return True
class CudaMemoryProfiler:
class DeviceMemoryProfiler:
def __init__(self, device: Optional[torch.types.Device] = None):
self.device = device
def current_memory_usage(self) -> float:
# Return the memory usage in bytes.
if torch.cuda.is_available():
if current_platform.is_cuda_alike():
torch.cuda.reset_peak_memory_stats(self.device)
mem = torch.cuda.max_memory_allocated(self.device)
elif is_xpu():
......@@ -836,15 +856,6 @@ def async_tensor_h2d(
return t.to(device=target_device, non_blocking=True)
def maybe_expand_dim(tensor: torch.Tensor,
target_dims: int,
size: int = 1) -> torch.Tensor:
"""Expand the tensor to the target_dims."""
if tensor.ndim < target_dims:
tensor = tensor.view(-1, *([size] * (target_dims - tensor.ndim)))
return tensor
def get_dtype_size(dtype: torch.dtype) -> int:
"""Get the size of the data type in bytes."""
return torch.tensor([], dtype=dtype).element_size()
......@@ -1069,7 +1080,7 @@ def _cuda_device_count_stateless(
def cuda_device_count_stateless() -> int:
"""Get number of CUDA devices, caching based on the value of
CUDA_VISIBLE_DEVICES at the time of call.
This should be used instead of torch.cuda.device_count()
unless CUDA_VISIBLE_DEVICES has already been set to the desired
value."""
......@@ -1079,6 +1090,20 @@ def cuda_device_count_stateless() -> int:
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]:
"""Make an instance method that weakly references
its associated instance and no-ops once that
instance is collected."""
ref = weakref.ref(bound_method.__self__) # type: ignore[attr-defined]
unbound = bound_method.__func__ # type: ignore[attr-defined]
def weak_bound(*args, **kwargs) -> None:
if inst := ref():
unbound(inst, *args, **kwargs)
return weak_bound
#From: https://stackoverflow.com/a/4104188/2749989
def run_once(f: Callable[P, None]) -> Callable[P, None]:
......@@ -1121,10 +1146,10 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
def _pull_args_from_config(args: List[str]) -> List[str]:
"""Method to pull arguments specified in the config file
into the command-line args variable.
The arguments in config file will be inserted between
The arguments in config file will be inserted between
the argument list.
example:
```yaml
port: 12323
......@@ -1135,21 +1160,21 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
--config config.yaml -tp 2
$: args = [
"serve,chat,complete",
"facebook/opt-12B",
'--config', 'config.yaml',
"facebook/opt-12B",
'--config', 'config.yaml',
'-tp', '2'
]
$: args = [
"serve,chat,complete",
"facebook/opt-12B",
'--port', '12323',
'--tensor-parallel-size', '4',
"facebook/opt-12B",
'--port', '12323',
'--tensor-parallel-size', '4',
'-tp', '2'
]
```
Please note how the config args are inserted after the sub command.
this way the order of priorities is maintained when these are args
this way the order of priorities is maintained when these are args
parsed by super().
"""
assert args.count(
......@@ -1175,7 +1200,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
@staticmethod
def _load_config_file(file_path: str) -> List[str]:
"""Loads a yaml file and returns the key value pairs as a
"""Loads a yaml file and returns the key value pairs as a
flattened list with argparse like pattern
```yaml
port: 12323
......@@ -1186,7 +1211,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
'--port': '12323',
'--tensor-parallel-size': '4'
]
"""
extension: str = file_path.split('.')[-1]
......@@ -1222,6 +1247,53 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
return await task(*args, **kwargs)
def get_allowed_kwarg_only_overrides(
callable: Callable[..., object],
overrides: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
"""
Given a callable which has one or more keyword only params and a dict
mapping param names to values, drop values that can be not be kwarg
expanded to overwrite one or more keyword-only args. This is used in a
few places to handle custom processor overrides for multimodal models,
e.g., for profiling when processor options provided by the user
may affect the number of mm tokens per instance.
Args:
callable: Callable which takes 0 or more keyword only arguments.
overrides: Potential overrides to be used when invoking the callable.
Returns:
Dictionary containing the kwargs to be leveraged which may be used
to overwrite one or more keyword only arguments when invoking the
callable.
"""
if not overrides:
return {}
allowed_override_names = [
name for name, param in inspect.signature(callable).parameters.items()
if param.kind == inspect.Parameter.KEYWORD_ONLY
]
# Drop any mm_processor_kwargs provided by the user that are
# not kwarg names accepted by the provided input processor.
filtered_overrides = {
kwarg_name: val
for kwarg_name, val in overrides.items()
if kwarg_name in allowed_override_names
}
# If anything is dropped, log a warning
dropped_keys = overrides.keys() - filtered_overrides.keys()
if dropped_keys:
logger.warning(
"The following intended overrides are not keyword-only args "
"and and will be dropped: %s", dropped_keys)
return filtered_overrides
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
......@@ -1230,6 +1302,12 @@ def supports_dynamo() -> bool:
return base_torch_version >= Version("2.4.0")
# Some backends use pytorch version < 2.4.0 which doesn't
# support `torch.library.custom_op`.
def supports_custom_op() -> bool:
return hasattr(torch.library, "custom_op")
class AtomicCounter:
"""An atomic, thread-safe counter"""
......
import warnings
try:
import vllm.commit_id
__commit__ = vllm.commit_id.__commit__
from ._version import __version__, __version_tuple__
except Exception as e:
import warnings
warnings.warn(f"Failed to read commit hash:\n{e}",
RuntimeWarning,
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.6.1.post2"
__version__ = "dev"
__version_tuple__ = (0, 0, __version__)
import dataclasses
import weakref
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
......@@ -10,14 +12,16 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader import get_model
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalInputs)
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.sequence import (IntermediateTensors, SequenceData,
SequenceGroupMetadata)
from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS, make_tensor_with_pad
from vllm.worker.model_runner_base import (
ModelRunnerBase, ModelRunnerInputBase,
ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
_add_attn_metadata_broadcastable_dict,
_add_sampling_metadata_broadcastable_dict,
_init_attn_metadata_from_tensor_dict,
......@@ -32,16 +36,17 @@ _PAD_SLOT_ID = -1
@dataclass(frozen=True)
class CPUModelInput(ModelRunnerInputBase):
class ModelInputForCPU(ModelRunnerInputBase):
"""
Used by the CPUModelRunner.
Base class contains metadata needed for the base model forward pass on CPU
"""
input_tokens: Optional[torch.Tensor] = None
input_positions: Optional[torch.Tensor] = None
attn_metadata: Optional["AttentionMetadata"] = None
sampling_metadata: Optional["SamplingMetadata"] = None
multi_modal_kwargs: Optional[BatchedTensorInputs] = None
virtual_engine: Optional[int] = None
seq_lens: Optional[List[int]] = None
query_lens: Optional[List[int]] = None
def as_broadcastable_tensor_dict(
self) -> Dict[str, Union[int, torch.Tensor]]:
......@@ -51,16 +56,44 @@ class CPUModelInput(ModelRunnerInputBase):
"multi_modal_kwargs": self.multi_modal_kwargs,
}
_add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
return tensor_dict
@classmethod
def from_broadcasted_tensor_dict(
cls: Type["ModelInputForCPU"],
tensor_dict: Dict[str, Any],
attn_backend: Optional["AttentionBackend"] = None
) -> "ModelInputForCPU":
if attn_backend is not None:
tensor_dict = _init_attn_metadata_from_tensor_dict(
attn_backend, tensor_dict)
return cls(**tensor_dict)
@dataclass(frozen=True)
class ModelInputForCPUWithSamplingMetadata(ModelInputForCPU):
"""
Used by the ModelRunner.
"""
sampling_metadata: Optional["SamplingMetadata"] = None
def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
tensor_dict = {
"input_tokens": self.input_tokens,
"input_positions": self.input_positions,
}
_add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
_add_sampling_metadata_broadcastable_dict(tensor_dict,
self.sampling_metadata)
return tensor_dict
@classmethod
def from_broadcasted_tensor_dict(
cls: Type["CPUModelInput"],
tensor_dict: Dict[str, Any],
attn_backend: Optional["AttentionBackend"] = None
) -> "CPUModelInput":
cls,
tensor_dict: Dict[str, Any],
attn_backend: Optional["AttentionBackend"] = None,
) -> "ModelInputForCPUWithSamplingMetadata":
tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict)
if attn_backend is not None:
tensor_dict = _init_attn_metadata_from_tensor_dict(
......@@ -68,71 +101,83 @@ class CPUModelInput(ModelRunnerInputBase):
return cls(**tensor_dict)
class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
def __init__(
self,
model_config: ModelConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
cache_config: CacheConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
kv_cache_dtype: Optional[str] = "auto",
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
is_driver_worker: bool = False,
*args,
**kwargs,
):
self.model_config = model_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
# Currently, CPU worker doesn't support chunked prefill.
assert self.scheduler_config.chunked_prefill_enabled is False
self.device_config = device_config
self.cache_config = cache_config
self.lora_config = lora_config
self.prompt_adapter_config = prompt_adapter_config
self.load_config = load_config
self.is_driver_worker = is_driver_worker
def __init__(self,
runner: "CPUModelRunner",
finished_requests_ids: Optional[List[str]] = None) -> None:
super().__init__()
self.seq_group_metadata_list: List[SequenceGroupMetadata] = []
self.runner = runner
self.model_input_cls = self.runner._model_input_cls
self.attn_backend = self.runner.attn_backend
self.sliding_window = self.runner.sliding_window
self.block_size = self.runner.block_size
self.device = self.runner.device
self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
self.device = self.device_config.device
def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
self.seq_group_metadata_list.append(seq_group_metadata)
self.kv_cache_dtype = kv_cache_dtype
self.sliding_window = model_config.get_sliding_window()
self.block_size = cache_config.block_size
self.attn_backend = get_attn_backend(
self.model_config.get_num_attention_heads(self.parallel_config),
self.model_config.get_head_size(),
self.model_config.get_num_kv_heads(self.parallel_config),
self.model_config.get_sliding_window(),
self.model_config.dtype,
self.kv_cache_dtype,
self.block_size,
)
# Multi-modal data support
self.mm_registry = MULTIMODAL_REGISTRY
self.multi_modal_input_mapper = self.mm_registry \
.create_input_mapper(self.model_config)
self.mm_registry.init_mm_limits_per_prompt(self.model_config)
# Lazy initialization.
self.model: nn.Module # Set after init_Model
def build(self) -> ModelInputForCPU:
multi_modal_kwargs = None
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt = self.seq_group_metadata_list[0].is_prompt
# Prepare input tensors.
if is_prompt:
(input_tokens, input_positions, attn_metadata, seq_lens,
multi_modal_kwargs) = self._prepare_prompt(
self.seq_group_metadata_list)
else:
(input_tokens, input_positions,
attn_metadata) = self._prepare_decode(
self.seq_group_metadata_list)
seq_lens = []
if self.model_config.is_encoder_decoder_model:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CPU'])
return self.model_input_cls(
input_tokens=input_tokens,
input_positions=input_positions,
attn_metadata=attn_metadata,
multi_modal_kwargs=multi_modal_kwargs,
# query_lens is not needed if chunked prefill is not
# supported. Since CPU worker doesn't support chunked prefill
# just use seq_lens instead.
seq_lens=seq_lens,
query_lens=seq_lens,
)
def load_model(self) -> None:
self.model = get_model(model_config=self.model_config,
load_config=self.load_config,
device_config=self.device_config,
lora_config=self.lora_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
cache_config=self.cache_config)
def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data,
computed_len: int):
mm_kwargs = self.multi_modal_input_mapper(mm_data)
# special processing for mrope position deltas.
mrope_positions = None
if self.runner.model_is_mrope:
image_grid_thw = mm_kwargs.get("image_grid_thw", None)
video_grid_thw = mm_kwargs.get("video_grid_thw", None)
assert image_grid_thw is not None or video_grid_thw is not None, (
"mrope embedding type requires multi-modal input mapper "
"returns 'image_grid_thw' or 'video_grid_thw'.")
hf_config = self.runner.model_config.hf_config
token_ids = seq_data.get_token_ids()
mrope_positions, mrope_position_delta = \
MRotaryEmbedding.get_input_positions(
token_ids,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
image_token_id=hf_config.image_token_id,
video_token_id=hf_config.video_token_id,
vision_start_token_id=hf_config.vision_start_token_id,
vision_end_token_id=hf_config.vision_end_token_id,
spatial_merge_size=hf_config.vision_config.
spatial_merge_size,
context_len=computed_len,
)
seq_data.mrope_position_delta = mrope_position_delta
return mm_kwargs, mrope_positions
def _prepare_prompt(
self,
......@@ -142,6 +187,8 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
assert len(seq_group_metadata_list) > 0
input_tokens: List[int] = []
input_positions: List[int] = []
input_mrope_positions: List[List[int]] = [[] for _ in range(3)]
slot_mapping: List[int] = []
seq_lens: List[int] = []
multi_modal_inputs_list: List[MultiModalInputs] = []
......@@ -160,15 +207,20 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
seq_lens.append(seq_len) # Prompt token num
input_tokens.extend(prompt_tokens) # Token ids
mrope_positions = None
if (mm_data := seq_group_metadata.multi_modal_data):
mm_kwargs, mrope_positions = self._compute_multi_modal_input(
seq_data, mm_data, computed_len)
multi_modal_inputs_list.append(mm_kwargs)
# Token position ids
# NOTE(woosuk): Here we assume that the first token in the prompt
# is always the first token in the sequence.
input_positions.extend(list(range(computed_len, seq_len)))
mm_data = seq_group_metadata.multi_modal_data
if mm_data:
mm_kwargs = self.multi_modal_input_mapper(mm_data)
multi_modal_inputs_list.append(mm_kwargs)
if mrope_positions:
for idx in range(3):
input_mrope_positions[idx].extend(mrope_positions[idx])
else:
input_positions.extend(list(range(computed_len, seq_len)))
# Compute the slot mapping.
block_table = seq_group_metadata.block_tables[seq_id]
......@@ -192,12 +244,18 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
slot = block_number * self.block_size + block_offset
slot_mapping.append(slot)
if any(input_mrope_positions):
input_positions = None # type: ignore
else:
input_mrope_positions = None # type: ignore
num_prompt_tokens = len(input_tokens)
input_tokens = torch.tensor(input_tokens,
dtype=torch.long,
device=self.device) # type: ignore
input_positions = torch.tensor(input_positions,
input_positions = torch.tensor(input_positions
or input_mrope_positions,
dtype=torch.long,
device=self.device) # type: ignore
slot_mapping = torch.tensor(slot_mapping,
......@@ -228,6 +286,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
assert len(seq_group_metadata_list) > 0
input_tokens: List[int] = []
input_positions: List[int] = []
input_mrope_positions: List[List[int]] = [[] for _ in range(3)]
slot_mapping: List[int] = []
seq_lens: List[int] = []
block_tables: List[List[int]] = []
......@@ -245,7 +304,17 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
seq_len = seq_data.get_len()
position = seq_len - 1
input_positions.append(position)
if seq_data.mrope_position_delta is not None:
context_len = seq_data.get_num_computed_tokens()
next_pos = MRotaryEmbedding.get_next_input_positions(
seq_data.mrope_position_delta,
context_len,
seq_len,
)
for idx in range(3):
input_mrope_positions[idx].extend(next_pos[idx])
else:
input_positions.append(position)
seq_len = seq_len if self.sliding_window is None else min(
seq_len, self.sliding_window)
......@@ -263,12 +332,18 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
block_table = block_table[-sliding_window_blocks:]
block_tables.append(block_table)
if any(input_mrope_positions):
input_positions = None # type: ignore
else:
input_mrope_positions = None # type: ignore
max_decode_seq_len = max(seq_lens)
input_tokens = torch.tensor(input_tokens,
dtype=torch.long,
device=self.device)
input_positions = torch.tensor(input_positions,
input_positions = torch.tensor(input_positions
or input_mrope_positions,
dtype=torch.long,
device=self.device)
slot_mapping = torch.tensor(slot_mapping,
......@@ -302,56 +377,139 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
attn_metadata,
)
class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]):
_model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = (
ModelInputForCPUWithSamplingMetadata)
_builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
def __init__(
self,
model_config: ModelConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
cache_config: CacheConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
kv_cache_dtype: Optional[str] = "auto",
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
is_driver_worker: bool = False,
*args,
**kwargs,
):
self.model_config = model_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
# Currently, CPU worker doesn't support chunked prefill.
assert self.scheduler_config.chunked_prefill_enabled is False
self.device_config = device_config
self.cache_config = cache_config
self.lora_config = lora_config
self.prompt_adapter_config = prompt_adapter_config
self.load_config = load_config
self.is_driver_worker = is_driver_worker
self.device = self.device_config.device
self.kv_cache_dtype = kv_cache_dtype
self.sliding_window = model_config.get_sliding_window()
self.block_size = cache_config.block_size
self.attn_backend = get_attn_backend(
self.model_config.get_num_attention_heads(self.parallel_config),
self.model_config.get_head_size(),
self.model_config.get_num_kv_heads(self.parallel_config),
self.model_config.get_sliding_window(),
self.model_config.dtype,
self.kv_cache_dtype,
self.block_size,
)
# Multi-modal data support
self.mm_registry = MULTIMODAL_REGISTRY
self.multi_modal_input_mapper = self.mm_registry \
.create_input_mapper(self.model_config)
self.mm_registry.init_mm_limits_per_prompt(self.model_config)
# Lazy initialization.
self.model: nn.Module # Set after init_Model
if self.model_config.is_encoder_decoder_model:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CPU'])
@property
def model_is_mrope(self) -> bool:
"""Detect if the model has "mrope" rope_scaling type.
mrope requires keep "rope_deltas" between prompt and decoding phases."""
rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
if rope_scaling is None:
return False
return rope_scaling.get("type", None) == "mrope"
def load_model(self) -> None:
self.model = get_model(model_config=self.model_config,
load_config=self.load_config,
device_config=self.device_config,
lora_config=self.lora_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
cache_config=self.cache_config)
def make_model_input_from_broadcasted_tensor_dict(
self,
tensor_dict: Dict[str, Any],
) -> CPUModelInput:
return CPUModelInput.from_broadcasted_tensor_dict(
) -> ModelInputForCPU:
return ModelInputForCPU.from_broadcasted_tensor_dict(
tensor_dict,
attn_backend=self.attn_backend,
)
def _prepare_model_input_tensors(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
finished_requests_ids: Optional[List[str]] = None
) -> ModelInputForCPUWithSamplingMetadata:
"""Helper method to prepare the model input based on a given sequence
group. Prepares metadata needed for the base model forward pass but not
metadata for possible additional steps, e.g., sampling.
"""
builder = self._builder_cls(weakref.proxy(self), finished_requests_ids)
for seq_group_metadata in seq_group_metadata_list:
builder.add_seq_group(seq_group_metadata)
return builder.build() # type: ignore
def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
virtual_engine: int = 0,
finished_requests_ids: Optional[List[str]] = None
) -> CPUModelInput:
multi_modal_kwargs = None
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt = seq_group_metadata_list[0].is_prompt
# Prepare input tensors.
if is_prompt:
(input_tokens, input_positions, attn_metadata, seq_lens,
multi_modal_kwargs
) = self._prepare_prompt(seq_group_metadata_list)
else:
(input_tokens, input_positions,
attn_metadata) = self._prepare_decode(seq_group_metadata_list)
seq_lens = []
sampling_metadata = SamplingMetadata.prepare(
seq_group_metadata_list,
seq_lens,
# query_lens is not needed if chunked prefill is not
# supported. Since CPU worker doesn't support chunked prefill
# just use seq_lens instead.
seq_lens,
self.device,
pin_memory=False,
generators=self.get_generators(finished_requests_ids))
return CPUModelInput(
input_tokens=input_tokens,
input_positions=input_positions,
attn_metadata=attn_metadata,
sampling_metadata=sampling_metadata,
multi_modal_kwargs=multi_modal_kwargs,
)
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
virtual_engine: int = 0,
finished_requests_ids: Optional[List[str]] = None
) -> ModelInputForCPUWithSamplingMetadata:
"""Prepare the model input based on a given sequence group, including
metadata for the sampling step.
"""
model_input = self._prepare_model_input_tensors(
seq_group_metadata_list, finished_requests_ids)
# Sampling metadata is only required for the final pp group
generators = self.get_generators(finished_requests_ids)
sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
model_input.seq_lens,
model_input.query_lens,
self.device,
pin_memory=False,
generators=generators)
return dataclasses.replace(model_input,
sampling_metadata=sampling_metadata,
virtual_engine=virtual_engine)
@torch.no_grad()
def execute_model(
self,
model_input: CPUModelInput,
model_input: ModelInputForCPUWithSamplingMetadata,
kv_caches: List[torch.Tensor],
intermediate_tensors: Optional[IntermediateTensors] = None,
num_steps: int = 1,
......@@ -372,6 +530,8 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
model_input.attn_metadata,
**MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
device=self.device),
"intermediate_tensors":
intermediate_tensors,
}
hidden_states = model_executable(**execute_model_kwargs)
......
import dataclasses
import itertools
from typing import Any, Dict, List, Optional, Tuple, Type, cast
import torch
......@@ -17,14 +18,16 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs,
MultiModalRegistry)
from vllm.sampling_params import SamplingParams
from vllm.sequence import (IntermediateTensors, PoolerOutput,
SequenceGroupMetadata)
from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad
from vllm.worker.model_runner import (GPUModelRunnerBase,
ModelInputForGPUBuilder,
ModelInputForGPUWithSamplingMetadata)
ModelInputForGPUWithSamplingMetadata,
_get_graph_batch_size)
from vllm.worker.model_runner_base import (
_add_attn_metadata_broadcastable_dict,
_add_sampling_metadata_broadcastable_dict)
......@@ -50,6 +53,7 @@ class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata):
"virtual_engine": self.virtual_engine,
"request_ids_to_seq_ids": self.request_ids_to_seq_ids,
"finished_requests_ids": self.finished_requests_ids,
"multi_modal_kwargs": self.multi_modal_kwargs,
}
_add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
_add_sampling_metadata_broadcastable_dict(tensor_dict,
......@@ -178,12 +182,22 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
raise ValueError("num_steps > 1 is not supported in "
"EncoderDecoderModelRunner")
model_executable = self.model
if (model_input.attn_metadata is not None
and model_input.attn_metadata.prefill_metadata is None
and model_input.attn_metadata.decode_metadata.use_cuda_graph):
assert model_input.input_tokens is not None
graph_batch_size = model_input.input_tokens.shape[0]
model_executable = self.graph_runners[
model_input.virtual_engine][graph_batch_size]
else:
model_executable = self.model
seqlen_agnostic_kwargs = {
"finished_requests_ids": model_input.finished_requests_ids,
"request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
} if self.has_seqlen_agnostic else {}
multi_modal_kwargs = model_input.multi_modal_kwargs or {}
hidden_or_intermediate_states = model_executable(
input_ids=model_input.input_tokens,
positions=model_input.input_positions,
......@@ -192,6 +206,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
kv_caches=kv_caches,
attn_metadata=model_input.attn_metadata,
intermediate_tensors=intermediate_tensors,
**MultiModalInputs.as_kwargs(multi_modal_kwargs,
device=self.device),
**seqlen_agnostic_kwargs)
logits = self.model.compute_logits(hidden_or_intermediate_states,
......@@ -200,6 +216,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
if not self.is_driver_worker:
return []
if model_input.async_callback is not None:
model_input.async_callback()
# Sample the next token.
output: SamplerOutput = self.model.sample(
logits=logits,
......@@ -231,14 +250,12 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
"""
model_input = self._prepare_model_input_tensors(
seq_group_metadata_list, finished_requests_ids)
(
attn_metadata,
encoder_input_tokens_tensor,
encoder_input_positions_tensor,
) = (self._prepare_encoder_model_input_tensors(seq_group_metadata_list,
model_input))
# Inject attn_metadata encoder/cross-attention fields &
# encoder input tokens/positions into model_input.
# Frozen dataclass fields cannot be modified, so use
......@@ -277,8 +294,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
self.model_config)
if max_mm_tokens > 0:
raise NotImplementedError(
"Multi-modal encoder-decoder models are not supported yet")
logger.info("Starting profile run for multi-modal models.")
batch_size = 0
for group_id in range(max_num_seqs):
......@@ -286,24 +302,39 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
(group_id < max_num_batched_tokens % max_num_seqs))
batch_size += seq_len
seq_data, _ = self.input_registry \
.dummy_data_for_profiling(self.model_config,
decoder_seq_data, decoder_dummy_multi_modal_data \
= self.input_registry.dummy_data_for_profiling(
self.model_config,
seq_len,
self.mm_registry)
self.mm_registry,
is_encoder_data=False)
encoder_seq_data, encoder_dummy_multi_modal_data \
= self.input_registry.dummy_data_for_profiling(
self.model_config,
seq_len,
self.mm_registry,
is_encoder_data=True)
# Having more tokens is over-conservative but otherwise fine
assert len(seq_data.prompt_token_ids) >= seq_len, (
assert len(decoder_seq_data.prompt_token_ids) >= seq_len, (
f"Expected at least {seq_len} dummy tokens for profiling, "
f"but got: {len(seq_data.prompt_token_ids)}")
f"but got: {len(decoder_seq_data.prompt_token_ids)}")
assert decoder_dummy_multi_modal_data is None or \
encoder_dummy_multi_modal_data is None, (
"Multi-modal data can't be provided in both encoder and decoder"
)
seq = SequenceGroupMetadata(
request_id=str(group_id),
is_prompt=True,
seq_data={group_id: seq_data},
seq_data={group_id: decoder_seq_data},
sampling_params=sampling_params,
block_tables=None,
encoder_seq_data=seq_data,
encoder_seq_data=encoder_seq_data,
cross_block_table=None,
multi_modal_data=decoder_dummy_multi_modal_data
or encoder_dummy_multi_modal_data,
)
seqs.append(seq)
......@@ -424,24 +455,42 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
encoder_input_tokens_tensor = self._empty_long_tensor()
encoder_input_positions_tensor = self._empty_long_tensor()
cross_slot_mapping_tensor = self._empty_long_tensor()
# Extract cross-attention block tables &
# seq len from each sequence group metadata.
# Cross-attention block tables are empty
# during vLLM memory profiling.
cross_block_tables = []
for seq_group_metadata in seq_group_metadata_list:
encoder_seq_lens.append(
seq_group_metadata.encoder_seq_data.get_len())
cross_block_table = seq_group_metadata.cross_block_table
cross_block_tables.append([] if (
cross_block_table is None) else cross_block_table)
for _ in range(len(seq_group_metadata.seq_data)):
encoder_seq_lens.append(
seq_group_metadata.encoder_seq_data.get_len())
cross_block_table = seq_group_metadata.cross_block_table
cross_block_tables.append([] if (
cross_block_table is None) else cross_block_table)
if (model_input.attn_metadata is not None
and model_input.attn_metadata.use_cuda_graph):
# We will be using CUDA graph replay for this decode.
max_len_of_block_table = self.get_max_block_per_batch()
batch_size = len(encoder_seq_lens)
graph_batch_size = _get_graph_batch_size(batch_size)
assert graph_batch_size >= batch_size
cuda_graph_pad_size = graph_batch_size - batch_size
# extend the cross_block_tables and encoder_seq_lens to match
# the graph_batch_size.
cross_block_tables.extend([[]
for _ in range(cuda_graph_pad_size)
])
encoder_seq_lens.extend(
itertools.repeat(1, cuda_graph_pad_size))
else:
max_len_of_block_table = max(
len(block_table) for block_table in cross_block_tables)
# Convert cross-attention block tables to encoder input tensor
cross_block_tables = make_tensor_with_pad(
cross_block_tables,
max_len=max(
len(block_table) for block_table in cross_block_tables),
max_len=max_len_of_block_table,
pad=0,
dtype=torch.int32,
device=self.device,
......
......@@ -45,7 +45,7 @@ from vllm.prompt_adapter.worker_manager import (
LRUCacheWorkerPromptAdapterManager)
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import (CudaMemoryProfiler, PyObjectCache, async_tensor_h2d,
from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d,
flatten_2d_lists, is_hip, is_pin_memory_available,
supports_dynamo)
from vllm.worker.model_runner_base import (
......@@ -243,6 +243,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prefix_cache_hit: bool = False,
reinit: bool = False,
reinit_use_defaults: bool = False,
encoder_seq_len: int = 0,
):
if reinit:
assert len(self.seq_ids) == len(seq_ids) # type: ignore
......@@ -256,6 +257,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self.block_tables = block_tables
self.computed_block_nums = computed_block_nums
self.n_seqs = n_seqs
self.encoder_seq_len = encoder_seq_len
if reinit:
if len(self.seq_ids) == 1 and reinit_use_defaults:
......@@ -702,6 +704,11 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
assert n_seqs == 1
self.decode_only = False
encoder_seq_len = 0
if self.runner.model_config.is_encoder_decoder_model:
encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len()
inter_data = self.init_cached_inter_data(
request_id=seq_group_metadata.request_id,
seq_ids=seq_ids,
......@@ -709,7 +716,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
block_tables=seq_group_metadata.block_tables,
computed_block_nums=seq_group_metadata.computed_block_nums,
reinit=True,
reinit_use_defaults=True)
reinit_use_defaults=True,
encoder_seq_len=encoder_seq_len)
self.inter_data_list.append(inter_data)
......@@ -719,11 +727,15 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for per_seq_group_fn in self.per_seq_group_compute_fns:
per_seq_group_fn(inter_data, seq_group_metadata)
def _use_captured_graph(self, batch_size: int,
max_decode_seq_len: int) -> bool:
def _use_captured_graph(self,
batch_size: int,
max_decode_seq_len: int,
max_encoder_seq_len: int = 0) -> bool:
return (self.decode_only and not self.runner.model_config.enforce_eager
and batch_size <= self.runner.max_batchsize_to_capture
and max_decode_seq_len <= self.runner.max_seq_len_to_capture)
and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
and max_decode_seq_len <= self.runner.max_seq_len_to_capture
and max_encoder_seq_len <= self.runner.max_seq_len_to_capture
and batch_size <= self.runner.max_batchsize_to_capture)
def build(self) -> ModelInputForGPU:
"""Finalize the builder intermediate data and
......@@ -763,15 +775,18 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
input_positions.extend(cur_input_positions)
seq_lens = []
query_lens = []
max_decode_seq_len = 0
max_encoder_seq_len = 0
for inter_data in self.inter_data_list:
seq_lens.extend(inter_data.seq_lens)
query_lens.extend(inter_data.query_lens)
if not inter_data.is_prompt:
max_decode_seq_len = max(max_decode_seq_len,
max(inter_data.seq_lens))
query_lens = []
for inter_data in self.inter_data_list:
query_lens.extend(inter_data.query_lens)
if self.runner.model_config.is_encoder_decoder_model:
max_encoder_seq_len = max(max_encoder_seq_len,
inter_data.encoder_seq_len)
# Mapping from request IDs to sequence IDs. Used for Jamba models
# that manages the cache by itself.
......@@ -781,8 +796,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
}
batch_size = len(input_tokens)
use_captured_graph = self._use_captured_graph(batch_size,
max_decode_seq_len)
use_captured_graph = self._use_captured_graph(
batch_size,
max_decode_seq_len,
max_encoder_seq_len=max_encoder_seq_len)
# If cuda graph can be used, pad tensors accordingly.
# See `capture_model` API for more details.
......@@ -995,7 +1012,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
def load_model(self) -> None:
logger.info("Starting to load model %s...", self.model_config.model)
with CudaMemoryProfiler() as m:
with DeviceMemoryProfiler() as m:
self.model = get_model(model_config=self.model_config,
device_config=self.device_config,
load_config=self.load_config,
......@@ -1064,8 +1081,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"This may lead to less accurate results!")
if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo():
from vllm.compilation.backends import vllm_backend
from vllm.plugins import get_torch_compile_backend
backend = get_torch_compile_backend() or "eager"
backend = get_torch_compile_backend() or vllm_backend
self.model = torch.compile(
self.model,
fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
......@@ -1363,7 +1381,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
for batch_size in reversed(batch_size_capture_list):
attn_metadata = (
self.attn_state.graph_capture_get_metadata_for_batch(
batch_size))
batch_size,
is_encoder_decoder_model=self.model_config.
is_encoder_decoder_model))
if self.lora_config:
lora_mapping = LoRAMapping(
......@@ -1379,10 +1399,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
)
self.set_active_prompt_adapters(
set(), prompt_adapter_mapping)
graph_runner = CUDAGraphRunner(
self.model, self.attn_backend.get_name(),
self.attn_state.graph_clone(batch_size))
self.attn_state.graph_clone(batch_size),
self.model_config.is_encoder_decoder_model)
capture_inputs = {
"input_ids":
......@@ -1419,6 +1439,12 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self.model.get_seqlen_agnostic_capture_inputs(
batch_size)
})
if self.model_config.is_encoder_decoder_model:
# add the additional inputs to capture for
# encoder-decoder models.
self._update_inputs_to_capture_for_enc_dec_model(
capture_inputs)
graph_runner.capture(**capture_inputs)
self.graph_memory_pool = graph_runner.graph.pool()
self.graph_runners[virtual_engine][batch_size] = (
......@@ -1429,6 +1455,24 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# This usually takes < 10 seconds.
logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
def _update_inputs_to_capture_for_enc_dec_model(self,
capture_inputs: Dict[str,
Any]):
"""
Updates the set of input tensors needed for CUDA graph capture in an
encoder-decoder model.
This method modifies the provided `capture_inputs` dictionary by
adding tensors specific to encoder-decoder specific models that
need to be captured for CUDA Graph replay.
"""
# During the decode phase encoder_input_ids and encoder_positions are
# unset. Do the same thing for graph capture.
capture_inputs["encoder_input_ids"] = torch.tensor(
[], dtype=torch.long).cuda()
capture_inputs["encoder_positions"] = torch.tensor(
[], dtype=torch.long).cuda()
@property
def vocab_size(self) -> int:
return self.model_config.get_vocab_size()
......@@ -1628,7 +1672,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
class CUDAGraphRunner:
def __init__(self, model: nn.Module, backend_name: str,
attn_state: AttentionState):
attn_state: AttentionState, is_encoder_decoder_model: bool):
self.model = model
self.backend_name = backend_name
self.attn_state = attn_state
......@@ -1637,6 +1681,7 @@ class CUDAGraphRunner:
self.output_buffers: Dict[str, torch.Tensor] = {}
self._graph: Optional[torch.cuda.CUDAGraph] = None
self._is_encoder_decoder_model = is_encoder_decoder_model
@property
def graph(self):
......@@ -1670,8 +1715,9 @@ class CUDAGraphRunner:
intermediate_tensors=intermediate_inputs,
**kwargs,
)
# Wait for the warm up operations to finish before proceeding with
# Graph Capture.
torch.cuda.synchronize()
# Capture the graph.
self._graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream):
......@@ -1703,10 +1749,14 @@ class CUDAGraphRunner:
# Save the input and output buffers.
self.input_buffers = {
"input_ids": input_ids,
"positions": positions,
"kv_caches": kv_caches,
**self.attn_state.get_graph_input_buffers(attn_metadata),
"input_ids":
input_ids,
"positions":
positions,
"kv_caches":
kv_caches,
**self.attn_state.get_graph_input_buffers(
attn_metadata, self._is_encoder_decoder_model),
**kwargs,
}
if intermediate_inputs is not None:
......@@ -1736,8 +1786,8 @@ class CUDAGraphRunner:
self.input_buffers["positions"].copy_(positions, non_blocking=True)
self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping,
non_blocking=True)
self.attn_state.prepare_graph_input_buffers(self.input_buffers,
attn_metadata)
self.attn_state.prepare_graph_input_buffers(
self.input_buffers, attn_metadata, self._is_encoder_decoder_model)
if "seqlen_agnostic_capture_inputs" in self.input_buffers:
self.model.copy_inputs_before_cuda_graphs(self.input_buffers,
**kwargs)
......@@ -1751,6 +1801,12 @@ class CUDAGraphRunner:
if key != "model_execute_time" and key != "model_forward_time":
self.input_buffers[key].copy_(intermediate_tensors[key],
non_blocking=True)
if self._is_encoder_decoder_model:
self.input_buffers["encoder_input_ids"].copy_(
kwargs['encoder_input_ids'], non_blocking=True)
self.input_buffers["encoder_positions"].copy_(
kwargs['encoder_positions'], non_blocking=True)
# Run the graph.
self.graph.replay()
# Return the output tensor.
......
......@@ -3,11 +3,13 @@ import pickle
from abc import ABC, abstractmethod
from datetime import datetime
from functools import wraps
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type,
TypeVar)
from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List,
Optional, Type, TypeVar)
import torch
from torch import is_tensor
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
......@@ -17,6 +19,8 @@ if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionBackend
from vllm.model_executor import SamplingMetadata
logger = init_logger(__name__)
T = TypeVar('T', bound="BroadcastableModelInput")
......@@ -113,6 +117,8 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
except Exception as err:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
filename = f"/tmp/err_{func.__name__}_input_{timestamp}.pkl"
logger.info("Writing input of failed execution to %s...",
filename)
with open(filename, "wb") as filep:
dumped_inputs = {
k: v
......@@ -122,7 +128,27 @@ def dump_input_when_exception(exclude_args: Optional[List[int]] = None,
for i, arg in enumerate(args):
if i not in (exclude_args or []):
dumped_inputs[f"arg_{i}"] = arg
pickle.dump(dumped_inputs, filep)
# Only persist dtype and shape for kvcache tensors
# (can be way to big otherwise)
if (kv_caches := dumped_inputs.get("kv_caches")) \
and isinstance(kv_caches, Iterable):
dumped_inputs["kv_caches"] = [(t.dtype, t.shape)
for t in kv_caches
if is_tensor(t)]
try:
pickle.dump(dumped_inputs, filep)
except Exception as pickle_err:
logger.warning(
"Failed to pickle inputs of failed execution: %s",
str(pickle_err))
raise type(err)(f"Error in model execution: "
f"{str(err)}") from err
logger.info(
"Completed writing input of failed execution to %s.",
filename)
raise type(err)(
f"Error in model execution (input dumped to {filename}): "
f"{str(err)}") from err
......
......@@ -29,7 +29,7 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
MULTI_STEP_ATTENTION_BACKENDS = ["flash-attn", "flashinfer"]
MULTI_STEP_ATTENTION_BACKENDS = ["flash-attn", "rocm-flash-attn", "flashinfer"]
def seq_output_builder():
......@@ -614,34 +614,66 @@ def _pythonize_sampler_output(
frozen_model_input = model_input.frozen_model_input
assert frozen_model_input.sampling_metadata is not None
sampling_metadata = frozen_model_input.sampling_metadata
# samples generation should have been skipped
assert not output.outputs
pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries]
# CPU GPU sync
pinned_buffer = pinned_buffer.copy_(sampled_token_ids, non_blocking=False)
# We guarantee output tensors are ready, so it is safe to
# pythonize the sampler output & obtain CPU-side logprobs.
#
# However we should check whether logprobs pythonization may
# be skipped entirely, i.e. because no logprobs were requested
# or pythonization was not deferred. To that end,
#
# * `prompt_logprobs_are_requested_for_prefill` signals that
# there are *any* prefill-phase requests which specify that
# prompt logprobs should be returned.
#
# * `any_logprobs_are_requested` signals that there are any
# requests which (1) specify that sample logprobs should be
# returned, or (2) are in the prefill phase AND specify that
# prompt logprobs should be returned.
#
# Later on, these flags cause adjustments to the pythonization
# process to accommodate logprobs.
seq_groups = sampling_metadata.seq_groups
prompt_logprobs_are_requested_for_prefill = any([
sg.sampling_params.prompt_logprobs is not None and sg.is_prompt
for sg in seq_groups
])
any_logprobs_are_requested = (
prompt_logprobs_are_requested_for_prefill
or any([sg.sampling_params.logprobs is not None for sg in seq_groups]))
if prompt_logprobs_are_requested_for_prefill:
# CPU GPU sync, after gathering *only* sampled tokens (since
# requesting prompt logprobs leads `sampled_token_ids` to
# include prompt token ids in addition to sampled token ids.)
sample_idx_tensor = torch.tensor(
[sdx for sg in seq_groups for sdx in sg.sample_indices])
pinned_buffer = pinned_buffer.copy_(
sampled_token_ids[sample_idx_tensor, :], non_blocking=False)
else:
# CPU GPU sync
pinned_buffer = pinned_buffer.copy_(sampled_token_ids,
non_blocking=False)
# this will not block as the tensors are already on CPU
samples_list = pinned_buffer.tolist()
sampling_metadata = frozen_model_input.sampling_metadata
skip_sampler_cpu_output = (
frozen_model_input.sampling_metadata.skip_sampler_cpu_output)
# We are guaranteed output tensors are ready, so it is safe to
# pythonize the sampler output & obtain CPU-side logprobs.
#
# However this computation may be skipped entirely
# if no pythonization was deferred.
seq_groups = sampling_metadata.seq_groups
logprobs_are_requested = any([
sg.sampling_params.logprobs is not None
or sg.sampling_params.prompt_logprobs is not None for sg in seq_groups
])
# *Don't* skip logprobs pythonization *if*:
# * Any requests require logprobs to be returned in this
# iteration AND
# * These requests are being scheduled in a fashion which
# defers pythonization (i.e. multi-step scheduling.)
do_pythonize_logprobs = (skip_sampler_cpu_output
and logprobs_are_requested)
and any_logprobs_are_requested)
(
prompt_logprobs,
sample_logprobs,
......@@ -666,7 +698,7 @@ def _pythonize_sampler_output(
prompt_logprobs[sgdx],
sample_logprobs[sgdx],
)
elif logprobs_are_requested:
elif any_logprobs_are_requested:
(
group_prompt_logprobs,
group_sample_logprobs,
......@@ -696,7 +728,7 @@ def _pythonize_sampler_output(
seq_output.parent_seq_id = seq_ids[parent_id]
seq_output.output_token = next_token_id
if logprobs_are_requested:
if any_logprobs_are_requested:
seq_output.logprobs = group_sample_logprobs[tdx]
else:
logprobs = next(iter(seq_output.logprobs.values()))
......@@ -714,7 +746,7 @@ def _pythonize_sampler_output(
seq_outputs.append(
SequenceOutput(seq_ids[parent_id], next_token_id,
(group_sample_logprobs[tdx]
if logprobs_are_requested else {
if any_logprobs_are_requested else {
next_token_id:
Logprob(logprob=float('inf'),
rank=None,
......@@ -722,12 +754,12 @@ def _pythonize_sampler_output(
})))
if cache is not None:
completion_seq_group_output.prompt_logprobs = \
group_prompt_logprobs if logprobs_are_requested else None
group_prompt_logprobs if any_logprobs_are_requested else None
output.outputs.append(completion_seq_group_output)
else:
output.outputs.append(
CompletionSequenceGroupOutput(
seq_outputs, (group_prompt_logprobs
if logprobs_are_requested else None)))
if any_logprobs_are_requested else None)))
assert len(output.outputs) > 0
import dataclasses
from typing import Dict, Optional, Tuple
import torch
from vllm.distributed import broadcast_tensor_dict
from vllm.sequence import ExecuteModelRequest
from vllm.worker.tpu_model_runner import ModelInputForTPU
from vllm.worker.tpu_worker import TPUWorker
from vllm.worker.worker_base import WorkerInput
class MultiStepTPUWorker(TPUWorker):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.cached_model_input: Optional[ModelInputForTPU] = None
def _get_driver_input_and_broadcast(
self, execute_model_req: ExecuteModelRequest
) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]:
assert self.is_driver_worker
assert execute_model_req.virtual_engine == 0
is_first_multi_step = execute_model_req.is_first_multi_step
is_last_step = execute_model_req.is_last_step
if is_first_multi_step:
worker_input: WorkerInput = self.prepare_worker_input(
execute_model_req=execute_model_req)
worker_input = dataclasses.replace(
worker_input,
num_steps=execute_model_req.num_lookahead_slots + 1)
model_input: ModelInputForTPU = (
self.model_runner.prepare_model_input(
execute_model_req.seq_group_metadata_list,
execute_model_req.virtual_engine,
execute_model_req.finished_requests_ids))
if execute_model_req.async_callback:
model_input = dataclasses.replace(
model_input,
async_callback=execute_model_req.async_callback)
else:
assert self.cached_model_input is not None
model_input = self.cached_model_input
worker_input = WorkerInput()
model_input = dataclasses.replace(
model_input,
is_first_multi_step=is_first_multi_step,
is_last_step=is_last_step)
if self.do_metadata_broadcast:
if is_first_multi_step:
broadcast_data = worker_input.as_broadcastable_tensor_dict()
broadcast_data.update(
model_input.as_broadcastable_tensor_dict())
broadcast_tensor_dict(broadcast_data, src=0)
else:
broadcast_data = {
"is_first_multi_step": is_first_multi_step,
"is_last_step": is_last_step,
}
broadcast_tensor_dict(broadcast_data, src=0)
# Retuning empty dict here to keep this compatible with
# `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`
return model_input, worker_input, {}
def prepare_input(
self,
execute_model_req: Optional[ExecuteModelRequest] = None,
) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str,
torch.Tensor]]]:
if self.is_driver_worker:
if execute_model_req is None:
if self.do_metadata_broadcast:
broadcast_tensor_dict({}, src=0)
return None
model_input, worker_input, _ = self._get_driver_input_and_broadcast(
execute_model_req)
if model_input.is_first_multi_step:
self.cached_model_input = model_input
return model_input, worker_input, {}
else:
broadcast_data = broadcast_tensor_dict(src=0)
if not broadcast_data:
return None
if len(broadcast_data) == 2:
assert self.cached_model_input is not None
self.cached_model_input = dataclasses.replace(
self.cached_model_input,
is_first_multi_step=broadcast_data["is_first_multi_step"],
is_last_step=broadcast_data["is_last_step"])
empty_worker_input = WorkerInput()
return self.cached_model_input, empty_worker_input, {}
worker_input = WorkerInput.from_broadcasted_tensor_dict(
broadcast_data)
model_input = (
self.model_runner.
make_model_input_from_broadcasted_tensor_dict(broadcast_data))
self.cached_model_input = model_input
return model_input, worker_input, {}
......@@ -51,6 +51,8 @@ class ModelInputForTPU(ModelRunnerInputBase):
num_samples: int
best_of: List[int]
seq_groups: List[List[int]]
is_first_multi_step: bool = True
is_last_step: bool = True
virtual_engine: int = 0
async_callback: Optional[Callable] = None
......@@ -65,6 +67,8 @@ class ModelInputForTPU(ModelRunnerInputBase):
"num_samples": self.num_samples,
"best_of": self.best_of,
"seq_groups": self.seq_groups,
"is_first_multi_step": self.is_first_multi_step,
"is_last_step": self.is_last_step,
"virtual_engine": self.virtual_engine,
}
_add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
......@@ -118,6 +122,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
self.block_size,
False,
)
self.cached_step_outputs: List[torch.Tensor] = []
def load_model(self) -> None:
self.device = self.device_config.device
......@@ -518,97 +523,159 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
num_steps: int = 1,
) -> List[SamplerOutput]:
assert intermediate_tensors is None
if num_steps > 1:
raise ValueError(
"TPUModelRunner does not support multi-step execution.")
def _execute_model(*args):
"""Move input args from CPU to device and execute the model."""
new_args = []
for arg in args:
if isinstance(arg, torch.Tensor):
arg = arg.to(self.device)
elif isinstance(arg, AttentionMetadata):
arg.slot_mapping = arg.slot_mapping.to(self.device)
if getattr(arg, "block_tables", None) is not None:
arg.block_tables = arg.block_tables.to(self.device)
if getattr(arg, "context_lens", None) is not None:
arg.context_lens = arg.context_lens.to(self.device)
new_args.append(arg)
return self.model(*new_args, is_prompt=is_prompt)
num_prefills = model_input.attn_metadata.num_prefills
is_prompt = num_prefills > 0
if not model_input.is_first_multi_step:
if not model_input.is_last_step:
return []
use_async_out_proc = model_input.async_callback is not None
sampler_outputs = []
num_outputs = len(self.cached_step_outputs)
for i in range(num_outputs):
next_token_ids = self.cached_step_outputs.pop(0)
next_token_ids = next_token_ids.cpu().tolist()
sampler_output = _make_decode_output(next_token_ids,
model_input.seq_groups)
sampler_outputs.append(sampler_output)
if i < num_outputs - 1 and use_async_out_proc:
assert model_input.async_callback is not None
ctx = model_input.async_callback.keywords[ # type: ignore
"ctx"]
ctx.append_output(
outputs=[sampler_output],
seq_group_metadata_list=ctx.seq_group_metadata_list,
scheduler_outputs=ctx.scheduler_outputs,
is_async=False,
is_last_step=False)
model_input.async_callback()
if use_async_out_proc:
return [sampler_outputs[-1]]
else:
return sampler_outputs
is_prompt = model_input.attn_metadata.num_prefills > 0
if is_prompt:
assert num_steps == 1
# NOTE(woosuk): Since the FlashAttention kernel does not support
# ragged inputs, we split the prompts into different batches and
# process them separately. This is a temporary hack that should be
# optimized by using SplashAttention.
next_token_ids = []
orig_slot_mapping = model_input.attn_metadata.slot_mapping
batch_size = model_input.input_lens.shape[0]
start_idx = 0
next_token_ids = []
for i in range(batch_size):
# Get the actual prefill_len.
prefill_len = model_input.input_lens[i:i + 1].item()
prefill_len = _get_padded_prefill_len(prefill_len)
end_idx = start_idx + prefill_len
model_input.attn_metadata.slot_mapping = orig_slot_mapping[
None, start_idx:end_idx]
model_input.attn_metadata.num_prefills = 1
output_token_ids = _execute_model(
model_input.token_ids[None, start_idx:end_idx],
model_input.position_ids[None, start_idx:end_idx],
model_input.attn_metadata, model_input.input_lens[i:i + 1],
model_input.t[i:i + 1], model_input.p[i:i + 1],
model_input.num_samples, kv_caches)
if i == 0 and model_input.async_callback is not None:
model_input.async_callback()
# Retrieve the outputs to CPU.
next_token_ids += output_token_ids.cpu().tolist()
token_ids = model_input.token_ids[None, start_idx:end_idx].to(
self.device)
position_ids = model_input.position_ids[None,
start_idx:end_idx].to(
self.device)
attn_metadata = model_input.attn_metadata
attn_metadata.num_prefills = 1
attn_metadata.slot_mapping = orig_slot_mapping[
None, start_idx:end_idx].to(self.device)
input_lens = model_input.input_lens[i:i + 1].to(self.device)
t = model_input.t[i:i + 1].to(self.device)
p = model_input.p[i:i + 1].to(self.device)
output_token_ids = self.model(token_ids,
position_ids,
attn_metadata,
input_lens,
t,
p,
model_input.num_samples,
kv_caches,
is_prompt=True)
next_token_ids.append(output_token_ids[0])
start_idx = end_idx
else:
# Execute the model.
output_token_ids = _execute_model(
model_input.token_ids, model_input.position_ids,
model_input.attn_metadata, model_input.input_lens,
model_input.t, model_input.p, model_input.num_samples,
kv_caches)
if model_input.async_callback is not None:
model_input.async_callback()
# Retrieve the outputs to CPU.
next_token_ids = output_token_ids.cpu().tolist()
# NOTE(woosuk): Minimal code to construct the sampler outputs.
# The TPU backend does not reuse the sampler, since the TPU backend
# does not support the advanced sampling parameters such as logprobs.
zero_logprob = Logprob(0.0)
batch_idx = 0
sampler_outputs = []
for seq_group in model_input.seq_groups:
seq_ids = seq_group
seq_outputs = []
if is_prompt:
next_token_ids = [
output_token_ids.cpu().tolist()
for output_token_ids in next_token_ids
]
# NOTE(woosuk): Minimal code to construct the sampler outputs.
# The TPU backend does not reuse the sampler, since the TPU backend
# does not support advanced sampling parameters such as logprobs.
zero_logprob = Logprob(0.0)
sampler_outputs = []
for i, seq_group in enumerate(model_input.seq_groups):
seq_ids = seq_group
assert len(seq_ids) == 1
seq_id = seq_ids[0]
for i in range(model_input.best_of[batch_idx]):
next_token_id = next_token_ids[batch_idx][i]
seq_outputs = []
for j in range(model_input.best_of[i]):
next_token_id = next_token_ids[i][j]
seq_outputs.append(
SequenceOutput(seq_id, next_token_id,
{next_token_id: zero_logprob}))
batch_idx += 1
else:
for seq_id in seq_ids:
next_token_id = next_token_ids[batch_idx]
seq_outputs.append(
SequenceOutput(seq_id, next_token_id,
{next_token_id: zero_logprob}))
batch_idx += 1
sampler_outputs.append(
CompletionSequenceGroupOutput(seq_outputs, None))
return [SamplerOutput(sampler_outputs)]
sampler_outputs.append(
CompletionSequenceGroupOutput(seq_outputs, None))
return [SamplerOutput(sampler_outputs)]
else:
token_ids = model_input.token_ids.to(self.device)
position_ids = model_input.position_ids.to(self.device)
attn_metadata = model_input.attn_metadata
attn_metadata.slot_mapping = attn_metadata.slot_mapping.to(
self.device)
attn_metadata.block_tables = attn_metadata.block_tables.to(
self.device)
attn_metadata.context_lens = attn_metadata.context_lens.to(
self.device)
t = model_input.t.to(self.device)
p = model_input.p.to(self.device)
input_lens = model_input.input_lens.to(self.device)
for i in range(num_steps):
slot_mapping = attn_metadata.slot_mapping
output_token_ids = self.model(token_ids,
position_ids,
attn_metadata,
input_lens,
t,
p,
model_input.num_samples,
kv_caches,
is_prompt=False)
self.cached_step_outputs.append(output_token_ids)
if i < num_steps - 1:
# Prepare the inputs for the next step.
token_ids = output_token_ids.unsqueeze(dim=1).int()
position_ids = position_ids + 1
attn_metadata.context_lens = attn_metadata.context_lens + 1
block_tables = attn_metadata.block_tables
block_number = block_tables.gather(
1,
position_ids.long() // self.block_size)
block_offset = position_ids % self.block_size
is_padding = slot_mapping == _PAD_SLOT_ID
slot_mapping = block_number * self.block_size + block_offset
slot_mapping = slot_mapping.long()
slot_mapping = torch.where(is_padding, _PAD_SLOT_ID,
slot_mapping)
attn_metadata.slot_mapping = slot_mapping
if model_input.async_callback is not None:
model_input.async_callback()
if num_steps > 1:
return []
# Retrieve the outputs to CPU.
next_token_ids = self.cached_step_outputs.pop(0)
next_token_ids = next_token_ids.cpu().tolist()
sampler_output = _make_decode_output(next_token_ids,
model_input.seq_groups)
return [sampler_output]
class ModelWrapper(TorchCompileWrapperWithCustomDispatcher):
......@@ -756,3 +823,24 @@ def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor:
cutoff_logit = torch.gather(logits_sorted, -1, cutoff_index)
logits = logits.masked_fill_(logits < cutoff_logit, -float("inf"))
return logits
def _make_decode_output(
next_token_ids: List[int],
seq_groups: List[List[int]],
) -> SamplerOutput:
zero_logprob = Logprob(0.0)
sampler_outputs = []
batch_idx = 0
for seq_group in seq_groups:
seq_ids = seq_group
seq_outputs = []
for seq_id in seq_ids:
next_token_id = next_token_ids[batch_idx]
seq_outputs.append(
SequenceOutput(seq_id, next_token_id,
{next_token_id: zero_logprob}))
batch_idx += 1
sampler_outputs.append(CompletionSequenceGroupOutput(
seq_outputs, None))
return SamplerOutput(sampler_outputs)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment