update

e019635f · xuxzh1 · 64def8e2 · 64def8e2 · 64def8e2 · 64def8e2
Commit e019635f authored Nov 01, 2024 by xuxzh1 🎱
11 changed files
--- a/server/vllm/vllm/transformers_utils/configs/__init__.py
+++ b/server/vllm/vllm/transformers_utils/configs/__init__.py
-from vllm.transformers_utils.configs.mpt import MPTConfig
-from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
-from vllm.transformers_utils.configs.aquila import AquilaConfig
-from vllm.transformers_utils.configs.qwen import QWenConfig
-# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
-# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
-# `FalconConfig` class from the official HuggingFace transformers library.
-from vllm.transformers_utils.configs.falcon import RWConfig
-__all__ = [
-    "MPTConfig",
-    "BaiChuanConfig",
-    "AquilaConfig",
-    "QWenConfig",
-    "RWConfig",
-]
--- a/server/vllm/vllm/transformers_utils/configs/aquila.py
+++ b/server/vllm/vllm/transformers_utils/configs/aquila.py
-# coding=utf-8
-# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Aquila model configuration"""
-from transformers import PretrainedConfig
-class AquilaConfig(PretrainedConfig):
-    model_type = "aquila"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    def __init__(
-        self,
-        vocab_size=100008,
-        hidden_size=4096,
-        intermediate_size=11008,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.006,
-        rms_norm_eps=1e-5,
-        use_cache=True,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        tie_word_embeddings=False,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.num_attention_heads = num_attention_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
--- a/server/vllm/vllm/transformers_utils/configs/baichuan.py
+++ b/server/vllm/vllm/transformers_utils/configs/baichuan.py
-# coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from transformers.configuration_utils import PretrainedConfig
-class BaiChuanConfig(PretrainedConfig):
-    model_type = "baichuan"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    def __init__(
-        self,
-        vocab_size=64000,
-        hidden_size=4096,
-        intermediate_size=11008,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        hidden_act="silu",
-        max_position_embeddings=4096,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        tie_word_embeddings=False,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
--- a/server/vllm/vllm/transformers_utils/configs/falcon.py
+++ b/server/vllm/vllm/transformers_utils/configs/falcon.py
-# Adapted from
-# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Falcon configuration"""
-from transformers.configuration_utils import PretrainedConfig
-class RWConfig(PretrainedConfig):
-    model_type = "falcon"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {
-        "num_hidden_layers": "n_layer",
-        "num_attention_heads": "n_head",
-        "num_kv_heads": "n_head_kv",
-    }
-    def __init__(
-        self,
-        vocab_size=250880,
-        hidden_size=64,
-        n_layer=2,
-        n_head=8,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        use_cache=True,
-        bos_token_id=1,
-        eos_token_id=2,
-        hidden_dropout=0.0,
-        attention_dropout=0.0,
-        multi_query=True,
-        n_head_kv=None,
-        alibi=False,
-        bias=False,
-        parallel_attn=False,
-        new_decoder_architecture=False,
-        **kwargs,
-    ) -> None:
-        self.vocab_size = vocab_size
-        # Backward compatibility with n_embed kwarg
-        n_embed = kwargs.pop("n_embed", None)
-        self.hidden_size = hidden_size if n_embed is None else n_embed
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.use_cache = use_cache
-        self.hidden_dropout = hidden_dropout
-        self.attention_dropout = attention_dropout
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.multi_query = multi_query
-        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
-        self.alibi = alibi
-        self.bias = bias
-        self.parallel_attn = parallel_attn
-        self.new_decoder_architecture = new_decoder_architecture
-        if self.hidden_size == 8192:
-            # Hack for falcon-40b
-            self.new_decoder_architecture = True
-        super().__init__(bos_token_id=bos_token_id,
-                         eos_token_id=eos_token_id,
-                         **kwargs)
-    @property
-    def head_dim(self):
-        return self.hidden_size // self.n_head
-    @property
-    def rotary(self):
-        return not self.alibi
--- a/server/vllm/vllm/transformers_utils/configs/mpt.py
+++ b/server/vllm/vllm/transformers_utils/configs/mpt.py
-# Adapted from
-# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
-from typing import Any, Dict, Optional, Union
-from transformers import PretrainedConfig
-_ATTN_CONFIG_DEFAULTS = {
-    "attn_type": "multihead_attention",
-    "attn_pdrop": 0.0,
-    "attn_impl": "triton",
-    "qk_ln": False,
-    "clip_qkv": None,
-    "softmax_scale": None,
-    "prefix_lm": False,
-    "attn_uses_sequence_id": False,
-    "alibi": False,
-    "alibi_bias_max": 8,
-}
-class MPTConfig(PretrainedConfig):
-    model_type = "mpt"
-    attribute_map = {
-        "hidden_size": "d_model",
-        "num_attention_heads": "n_heads",
-        "num_hidden_layers": "n_layers",
-    }
-    def __init__(
-        self,
-        d_model: int = 2048,
-        n_heads: int = 16,
-        n_layers: int = 24,
-        expansion_ratio: int = 4,
-        max_seq_len: int = 2048,
-        vocab_size: int = 50368,
-        resid_pdrop: float = 0.0,
-        emb_pdrop: float = 0.0,
-        learned_pos_emb: bool = True,
-        attn_config: Optional[Dict[str, Any]] = None,
-        init_device: str = "cpu",
-        logit_scale: Optional[Union[float, str]] = None,
-        no_bias: bool = False,
-        verbose: int = 0,
-        embedding_fraction: float = 1.0,
-        norm_type: str = "low_precision_layernorm",
-        use_cache: bool = False,
-        **kwargs,
-    ) -> None:
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.expansion_ratio = expansion_ratio
-        self.max_seq_len = max_seq_len
-        self.vocab_size = vocab_size
-        self.resid_pdrop = resid_pdrop
-        self.emb_pdrop = emb_pdrop
-        self.learned_pos_emb = learned_pos_emb
-        if attn_config is None:
-            self.attn_config = _ATTN_CONFIG_DEFAULTS
-        else:
-            self.attn_config = attn_config
-        self.init_device = init_device
-        self.logit_scale = logit_scale
-        self.no_bias = no_bias
-        self.verbose = verbose
-        self.embedding_fraction = embedding_fraction
-        self.norm_type = norm_type
-        self.use_cache = use_cache
-        if "name" in kwargs:
-            del kwargs["name"]
-        if "loss_fn" in kwargs:
-            del kwargs["loss_fn"]
-        super().__init__(**kwargs)
--- a/server/vllm/vllm/transformers_utils/configs/qwen.py
+++ b/server/vllm/vllm/transformers_utils/configs/qwen.py
-# Copyright (c) Alibaba Cloud.
-# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
-from transformers import PretrainedConfig
-class QWenConfig(PretrainedConfig):
-    model_type = "qwen"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    def __init__(
-        self,
-        vocab_size=151936,
-        hidden_size=4096,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        emb_dropout_prob=0.0,
-        attn_dropout_prob=0.0,
-        layer_norm_epsilon=1e-6,
-        initializer_range=0.02,
-        max_position_embeddings=8192,
-        scale_attn_weights=True,
-        use_cache=True,
-        bf16=False,
-        fp16=False,
-        fp32=False,
-        kv_channels=128,
-        rotary_pct=1.0,
-        rotary_emb_base=10000,
-        use_dynamic_ntk=True,
-        use_logn_attn=True,
-        use_flash_attn="auto",
-        intermediate_size=22016,
-        no_bias=True,
-        tie_word_embeddings=False,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.emb_dropout_prob = emb_dropout_prob
-        self.attn_dropout_prob = attn_dropout_prob
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.scale_attn_weights = scale_attn_weights
-        self.use_cache = use_cache
-        self.max_position_embeddings = max_position_embeddings
-        self.bf16 = bf16
-        self.fp16 = fp16
-        self.fp32 = fp32
-        self.kv_channels = kv_channels
-        self.rotary_pct = rotary_pct
-        self.rotary_emb_base = rotary_emb_base
-        self.use_dynamic_ntk = use_dynamic_ntk
-        self.use_logn_attn = use_logn_attn
-        self.use_flash_attn = use_flash_attn
-        self.no_bias = no_bias
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/server/vllm/vllm/transformers_utils/tokenizer.py
+++ b/server/vllm/vllm/transformers_utils/tokenizer.py
-from typing import List, Optional, Tuple, Union
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
-                          PreTrainedTokenizerFast)
-from vllm.logger import init_logger
-logger = init_logger(__name__)
-# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
-_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"
-def get_tokenizer(
-    tokenizer_name: str,
-    *args,
-    tokenizer_mode: str = "auto",
-    trust_remote_code: bool = False,
-    tokenizer_revision: Optional[str] = None,
-    **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    """Gets a tokenizer for the given model name via Huggingface."""
-    if tokenizer_mode == "slow":
-        if kwargs.get("use_fast", False):
-            raise ValueError(
-                "Cannot use the fast tokenizer in slow tokenizer mode.")
-        kwargs["use_fast"] = False
-    if ("llama" in tokenizer_name.lower() and kwargs.get("use_fast", True)
-            and tokenizer_name != _FAST_LLAMA_TOKENIZER):
-        logger.info(
-            "For some LLaMA V1 models, initializing the fast tokenizer may "
-            "take a long time. To reduce the initialization time, consider "
-            f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
-            "tokenizer.")
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            tokenizer_revision=tokenizer_revision,
-            **kwargs)
-    except TypeError as e:
-        # The LLaMA tokenizer causes a protobuf error in some environments.
-        err_msg = (
-            "Failed to load the tokenizer. If you are using a LLaMA V1 model "
-            f"consider using '{_FAST_LLAMA_TOKENIZER}' instead of the "
-            "original tokenizer.")
-        raise RuntimeError(err_msg) from e
-    except ValueError as e:
-        # If the error pertains to the tokenizer class not existing or not
-        # currently being imported, suggest using the --trust-remote-code flag.
-        if (not trust_remote_code and
-            ("does not exist or is not currently imported." in str(e)
-             or "requires you to execute the tokenizer file" in str(e))):
-            err_msg = (
-                "Failed to load the tokenizer. If the tokenizer is a custom "
-                "tokenizer not yet available in the HuggingFace transformers "
-                "library, consider setting `trust_remote_code=True` in LLM "
-                "or using the `--trust-remote-code` flag in the CLI.")
-            raise RuntimeError(err_msg) from e
-        else:
-            raise e
-    if not isinstance(tokenizer, PreTrainedTokenizerFast):
-        logger.warning(
-            "Using a slow tokenizer. This might cause a significant "
-            "slowdown. Consider using a fast tokenizer instead.")
-    return tokenizer
-def _convert_tokens_to_string_with_added_encoders(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    output_tokens: List[str],
-    skip_special_tokens: bool,
-) -> str:
-    # Adapted from
-    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
-    # NOTE(woosuk): The following code is slow because it runs a for loop over
-    # the output_tokens. In Python, running a for loop over a list can be slow
-    # even when the loop body is very simple.
-    sub_texts = []
-    current_sub_text = []
-    all_special_tokens = set(tokenizer.all_special_tokens)
-    for token in output_tokens:
-        if skip_special_tokens and token in all_special_tokens:
-            continue
-        if token in tokenizer.get_added_vocab():
-            if current_sub_text:
-                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
-                sub_texts.append(sub_text)
-                current_sub_text = []
-            sub_texts.append(token)
-        else:
-            current_sub_text.append(token)
-    if current_sub_text:
-        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
-        sub_texts.append(sub_text)
-    return " ".join(sub_texts)
-# Based on
-# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
-# under Apache 2.0 license
-def detokenize_incrementally(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    all_input_ids: List[int],
-    prev_tokens: Optional[List[str]],
-    prefix_offset: int = 0,
-    read_offset: int = 0,
-    skip_special_tokens: bool = False,
-) -> Tuple[List[str], str, int, int]:
-    new_token_id = all_input_ids[-1]
-    # This is the first iteration for this sequence
-    if prev_tokens is None:
-        new_tokens = tokenizer.convert_ids_to_tokens(
-            all_input_ids, skip_special_tokens=skip_special_tokens)
-        output_tokens = new_tokens
-        # 5 is an arbitrary value that should work for all
-        # tokenizers (bigger = more conservative).
-        # Subtract 1 extra to account for the generated token.
-        prefix_offset = max(len(output_tokens) - 6, 0)
-        read_offset = max(len(output_tokens) - 1, 0)
-    else:
-        # Put new_token_id in a list so skip_special_tokens is respected
-        new_tokens = tokenizer.convert_ids_to_tokens(
-            [new_token_id], skip_special_tokens=skip_special_tokens)
-        output_tokens = prev_tokens + new_tokens
-    # The prefix text is necessary only to defeat cleanup algorithms in
-    # the decode which decide to add a space or not depending on the
-    # surrounding ids.
-    if tokenizer.is_fast or not tokenizer.get_added_vocab():
-        prefix_text = tokenizer.convert_tokens_to_string(
-            output_tokens[prefix_offset:read_offset])
-        new_text = tokenizer.convert_tokens_to_string(
-            output_tokens[prefix_offset:])
-    else:
-        prefix_text = _convert_tokens_to_string_with_added_encoders(
-            tokenizer,
-            output_tokens[prefix_offset:read_offset],
-            skip_special_tokens=skip_special_tokens)
-        new_text = _convert_tokens_to_string_with_added_encoders(
-            tokenizer,
-            output_tokens[prefix_offset:],
-            skip_special_tokens=skip_special_tokens)
-    if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
-        # utf-8 char at the end means it's a potential unfinished byte sequence
-        # from byte fallback tokenization.
-        # If it's in the middle, it's probably a real invalid id generated
-        # by the model
-        new_text = new_text[len(prefix_text):]
-        return new_tokens, new_text, read_offset, len(output_tokens)
-    else:
-        return new_tokens, "", prefix_offset, read_offset
--- a/server/vllm/vllm/utils.py
+++ b/server/vllm/vllm/utils.py
-import enum
-import uuid
-from platform import uname
-import psutil
-import torch
-from vllm import cuda_utils
-class Device(enum.Enum):
-    GPU = enum.auto()
-    CPU = enum.auto()
-class Counter:
-    def __init__(self, start: int = 0) -> None:
-        self.counter = start
-    def __next__(self) -> int:
-        i = self.counter
-        self.counter += 1
-        return i
-    def reset(self) -> None:
-        self.counter = 0
-def get_max_shared_memory_bytes(gpu: int = 0) -> int:
-    """Returns the maximum shared memory per thread block in bytes."""
-    # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
-    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97  # pylint: disable=invalid-name
-    max_shared_mem = cuda_utils.get_device_attribute(
-        cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
-    return int(max_shared_mem)
-def get_gpu_memory(gpu: int = 0) -> int:
-    """Returns the total memory of the GPU in bytes."""
-    return torch.cuda.get_device_properties(gpu).total_memory
-def get_cpu_memory() -> int:
-    """Returns the total CPU memory of the node in bytes."""
-    return psutil.virtual_memory().total
-def random_uuid() -> str:
-    return str(uuid.uuid4().hex)
-def in_wsl() -> bool:
-    # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
--- a/server/vllm/vllm/worker/__init__.py
+++ b/server/vllm/vllm/worker/__init__.py
--- a/server/vllm/vllm/worker/cache_engine.py
+++ b/server/vllm/vllm/worker/cache_engine.py
-"""CacheEngine class for managing the KV cache."""
-from typing import Dict, List, Tuple
-import torch
-from vllm import cache_ops
-from vllm.config import CacheConfig, ModelConfig, ParallelConfig
-from vllm.logger import init_logger
-from vllm.utils import in_wsl
-logger = init_logger(__name__)
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-class CacheEngine:
-    """Manages the KV cache.
-    This class is responsible for initializing and managing the GPU and CPU KV
-    caches. It also provides methods for performing KV cache operations, such
-    as swapping and copying.
-    """
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-    ) -> None:
-        self.cache_config = cache_config
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.head_size = model_config.get_head_size()
-        self.num_layers = model_config.get_num_layers(parallel_config)
-        self.num_heads = model_config.get_num_kv_heads(parallel_config)
-        self.dtype = model_config.dtype
-        self.block_size = cache_config.block_size
-        self.num_gpu_blocks = cache_config.num_gpu_blocks
-        self.num_cpu_blocks = cache_config.num_cpu_blocks
-        # Initialize the cache.
-        self.gpu_cache = self.allocate_gpu_cache()
-        self.cpu_cache = self.allocate_cpu_cache()
-        # Initialize the stream for caching operations.
-        self.cache_stream = torch.cuda.Stream()
-        assert self.cache_stream != torch.cuda.current_stream()
-        # Initialize the events for stream synchronization.
-        self.events = [torch.cuda.Event() for _ in range(self.num_layers)]
-    def get_key_block_shape(self) -> Tuple[int, int, int, int]:
-        element_size = torch.tensor([], dtype=self.dtype).element_size()
-        x = 16 // element_size
-        return (
-            self.num_heads,
-            self.head_size // x,
-            self.block_size,
-            x,
-        )
-    def get_value_block_shape(self) -> Tuple[int, int, int]:
-        return (
-            self.num_heads,
-            self.head_size,
-            self.block_size,
-        )
-    def allocate_gpu_cache(self) -> List[KVCache]:
-        gpu_cache: List[KVCache] = []
-        key_block_shape = self.get_key_block_shape()
-        value_block_shape = self.get_value_block_shape()
-        for _ in range(self.num_layers):
-            key_blocks = torch.empty(
-                size=(self.num_gpu_blocks, *key_block_shape),
-                dtype=self.dtype,
-                device="cuda",
-            )
-            value_blocks = torch.empty(
-                size=(self.num_gpu_blocks, *value_block_shape),
-                dtype=self.dtype,
-                device="cuda",
-            )
-            gpu_cache.append((key_blocks, value_blocks))
-        return gpu_cache
-    def allocate_cpu_cache(self) -> List[KVCache]:
-        cpu_cache: List[KVCache] = []
-        key_block_shape = self.get_key_block_shape()
-        value_block_shape = self.get_value_block_shape()
-        pin_memory = not in_wsl()
-        if not pin_memory:
-            # Pinning memory in WSL is not supported.
-            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
-            logger.warning("Using 'pin_memory=False' as WSL is detected. "
-                           "This may slow down the performance.")
-        for _ in range(self.num_layers):
-            key_blocks = torch.empty(
-                size=(self.num_cpu_blocks, *key_block_shape),
-                dtype=self.dtype,
-                pin_memory=pin_memory,
-            )
-            value_blocks = torch.empty(
-                size=(self.num_cpu_blocks, *value_block_shape),
-                dtype=self.dtype,
-                pin_memory=pin_memory,
-            )
-            cpu_cache.append((key_blocks, value_blocks))
-        return cpu_cache
-    def _swap(
-        self,
-        src: List[KVCache],
-        dst: List[KVCache],
-        src_to_dst: Dict[int, int],
-    ) -> None:
-        with torch.cuda.stream(self.cache_stream):
-            for i in range(self.num_layers):
-                src_key_cache, src_value_cache = src[i]
-                dst_key_cache, dst_value_cache = dst[i]
-                # Copy the key blocks.
-                cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
-                # Copy the value blocks.
-                cache_ops.swap_blocks(src_value_cache, dst_value_cache,
-                                      src_to_dst)
-                event = self.events[i]
-                event.record(stream=self.cache_stream)
-    def swap_in(self, src_to_dst: Dict[int, int]) -> None:
-        self._swap(self.cpu_cache, self.gpu_cache, src_to_dst)
-    def swap_out(self, src_to_dst: Dict[int, int]) -> None:
-        self._swap(self.gpu_cache, self.cpu_cache, src_to_dst)
-    def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
-        key_caches = [key_cache for key_cache, _ in self.gpu_cache]
-        value_caches = [value_cache for _, value_cache in self.gpu_cache]
-        # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU.
-        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
-    @staticmethod
-    def get_cache_block_size(
-        block_size: int,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-    ) -> int:
-        head_size = model_config.get_head_size()
-        num_heads = model_config.get_num_kv_heads(parallel_config)
-        num_layers = model_config.get_num_layers(parallel_config)
-        key_cache_block = block_size * num_heads * head_size
-        value_cache_block = key_cache_block
-        total = num_layers * (key_cache_block + value_cache_block)
-        dtype_size = _get_dtype_size(model_config.dtype)
-        return dtype_size * total
-def _get_dtype_size(dtype: torch.dtype) -> int:
-    return torch.tensor([], dtype=dtype).element_size()
--- a/server/vllm/vllm/worker/worker.py
+++ b/server/vllm/vllm/worker/worker.py
-"""A GPU worker class."""
-import os
-from typing import Dict, List, Tuple, Optional
-import torch
-import torch.distributed
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
-from vllm.model_executor import get_model, InputMetadata, set_random_seed
-from vllm.model_executor.parallel_utils.parallel_state import (
-    initialize_model_parallel)
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
-from vllm.worker.cache_engine import CacheEngine
-from vllm.utils import get_gpu_memory, get_max_shared_memory_bytes
-class Worker:
-    """A worker class that executes (a partition of) the model on a GPU.
-    Each worker is associated with a single GPU. The worker is responsible for
-    maintaining the KV cache and executing the model on the GPU. In case of
-    distributed inference, each worker is assigned a partition of the model.
-    """
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        rank: Optional[int] = None,
-        distributed_init_method: Optional[str] = None,
-    ) -> None:
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.rank = rank
-        self.distributed_init_method = distributed_init_method
-        # Uninitialized cache engine. Will be initialized by
-        # self.init_cache_engine().
-        self.cache_config = None
-        self.block_size = None
-        self.sliding_window = None
-        self.cache_engine = None
-        self.cache_events = None
-        self.gpu_cache = None
-    def init_model(self):
-        # This env var set by Ray causes exceptions with graph building.
-        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
-        # Env vars will be set by Ray.
-        self.rank = self.rank if self.rank is not None else int(
-            os.getenv("RANK", "-1"))
-        local_rank = int(os.getenv("LOCAL_RANK", "0"))
-        self.device = torch.device(f"cuda:{local_rank}")
-        if self.rank < 0:
-            raise ValueError("Invalid or unspecified rank.")
-        torch.cuda.set_device(self.device)
-        _check_if_gpu_supports_dtype(self.model_config.dtype)
-        # Initialize the distributed environment.
-        _init_distributed_environment(self.parallel_config, self.rank,
-                                      self.distributed_init_method)
-        # Initialize the model.
-        set_random_seed(self.model_config.seed)
-        self.model = get_model(self.model_config)
-    @torch.inference_mode()
-    def profile_num_available_blocks(
-        self,
-        block_size: int,
-        gpu_memory_utilization: float,
-        cpu_swap_space: int,
-    ) -> Tuple[int, int]:
-        # Profile the memory usage of the model and get the maximum number of
-        # cache blocks that can be allocated with the remaining free memory.
-        torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
-        # Profile memory usage with max_num_sequences sequences and the total
-        # number of tokens equal to max_num_batched_tokens.
-        # Enable top-k sampling to reflect the accurate memory usage.
-        vocab_size = self.model.config.vocab_size
-        sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
-        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
-        max_num_seqs = self.scheduler_config.max_num_seqs
-        seqs = []
-        for group_id in range(max_num_seqs):
-            seq_len = (max_num_batched_tokens // max_num_seqs +
-                       (group_id < max_num_batched_tokens % max_num_seqs))
-            seq_data = SequenceData([0] * seq_len)
-            seq = SequenceGroupMetadata(
-                request_id=str(group_id),
-                is_prompt=True,
-                seq_data={group_id: seq_data},
-                sampling_params=sampling_params,
-                block_tables=None,
-            )
-            seqs.append(seq)
-        input_tokens, input_positions, input_metadata = self._prepare_inputs(
-            seqs)
-        # Execute the model.
-        num_layers = self.model_config.get_num_layers(self.parallel_config)
-        self.model(
-            input_ids=input_tokens,
-            positions=input_positions,
-            kv_caches=[(None, None)] * num_layers,
-            input_metadata=input_metadata,
-            cache_events=None,
-        )
-        # Calculate the number of blocks that can be allocated with the
-        # profiled peak memory.
-        torch.cuda.synchronize()
-        peak_memory = torch.cuda.max_memory_allocated()
-        total_gpu_memory = get_gpu_memory()
-        cache_block_size = CacheEngine.get_cache_block_size(
-            block_size, self.model_config, self.parallel_config)
-        num_gpu_blocks = int(
-            (total_gpu_memory * gpu_memory_utilization - peak_memory) //
-            cache_block_size)
-        num_cpu_blocks = int(cpu_swap_space // cache_block_size)
-        num_gpu_blocks = max(num_gpu_blocks, 0)
-        num_cpu_blocks = max(num_cpu_blocks, 0)
-        torch.cuda.empty_cache()
-        # Reset the seed to ensure that the random state is not affected by
-        # the model initialization and profiling.
-        set_random_seed(self.model_config.seed)
-        return num_gpu_blocks, num_cpu_blocks
-    def init_cache_engine(self, cache_config: CacheConfig) -> None:
-        self.cache_config = cache_config
-        self.block_size = cache_config.block_size
-        self.sliding_window = cache_config.sliding_window
-        if self.sliding_window is None:
-            max_seq_len = self.scheduler_config.max_model_len
-        else:
-            max_seq_len = min(self.scheduler_config.max_model_len,
-                              self.sliding_window)
-        _check_if_can_support_max_seq_len(max_seq_len, self.block_size)
-        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
-                                        self.parallel_config)
-        self.cache_events = self.cache_engine.events
-        self.gpu_cache = self.cache_engine.gpu_cache
-    def _prepare_inputs(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata]:
-        seq_groups: List[Tuple[List[int], SamplingParams]] = []
-        input_tokens: List[List[int]] = []
-        input_positions: List[List[int]] = []
-        slot_mapping: List[List[int]] = []
-        # Add prompt tokens.
-        prompt_lens: List[int] = []
-        for seq_group_metadata in seq_group_metadata_list:
-            if not seq_group_metadata.is_prompt:
-                continue
-            seq_ids = list(seq_group_metadata.seq_data.keys())
-            sampling_params = seq_group_metadata.sampling_params
-            seq_groups.append((seq_ids, sampling_params))
-            # Use any sequence in the group.
-            seq_id = seq_ids[0]
-            seq_data = seq_group_metadata.seq_data[seq_id]
-            prompt_tokens = seq_data.get_token_ids()
-            prompt_len = len(prompt_tokens)
-            prompt_lens.append(prompt_len)
-            input_tokens.append(prompt_tokens)
-            # NOTE(woosuk): Here we assume that the first token in the prompt
-            # is always the first token in the sequence.
-            input_positions.append(list(range(prompt_len)))
-            if seq_group_metadata.block_tables is None:
-                # During memory profiling, the block tables are not initialized
-                # yet. In this case, we just use a dummy slot mapping.
-                slot_mapping.append([0] * prompt_len)
-                continue
-            # Compute the slot mapping.
-            slot_mapping.append([])
-            block_table = seq_group_metadata.block_tables[seq_id]
-            for i in range(prompt_len):
-                block_number = block_table[i // self.block_size]
-                block_offset = i % self.block_size
-                slot = block_number * self.block_size + block_offset
-                slot_mapping[-1].append(slot)
-        # Add generation tokens.
-        max_context_len = 0
-        max_num_blocks_per_seq = 0
-        context_lens: List[int] = []
-        generation_block_tables: List[List[int]] = []
-        for seq_group_metadata in seq_group_metadata_list:
-            if seq_group_metadata.is_prompt:
-                continue
-            seq_ids = list(seq_group_metadata.seq_data.keys())
-            sampling_params = seq_group_metadata.sampling_params
-            seq_groups.append((seq_ids, sampling_params))
-            for seq_id in seq_ids:
-                seq_data = seq_group_metadata.seq_data[seq_id]
-                generation_token = seq_data.get_last_token_id()
-                input_tokens.append([generation_token])
-                context_len = seq_data.get_len()
-                position = context_len - 1
-                if self.sliding_window is not None:
-                    context_len = min(context_len, self.sliding_window)
-                input_positions.append([position])
-                block_table = seq_group_metadata.block_tables[seq_id]
-                max_context_len = max(max_context_len, context_len)
-                max_num_blocks_per_seq = max(max_num_blocks_per_seq,
-                                             len(block_table))
-                context_lens.append(context_len)
-                block_number = block_table[position // self.block_size]
-                block_offset = position % self.block_size
-                slot = block_number * self.block_size + block_offset
-                slot_mapping.append([slot])
-                if self.sliding_window is not None:
-                    sliding_window_blocks = (self.sliding_window //
-                                             self.block_size)
-                    block_table = block_table[-sliding_window_blocks:]
-                generation_block_tables.append(block_table)
-        max_seq_len = max(prompt_lens) if prompt_lens else 1
-        padded_input_tokens = [
-            _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens
-        ]
-        padded_input_positions = [
-            _pad_to_max(positions, max_seq_len, pad=0)
-            for positions in input_positions
-        ]
-        padded_slot_mapping = [
-            _pad_to_max(mapping, max_seq_len, pad=-1)
-            for mapping in slot_mapping
-        ]
-        padded_block_tables = [
-            _pad_to_max(block_table, max_num_blocks_per_seq, pad=0)
-            for block_table in generation_block_tables
-        ]
-        # Convert to tensors.
-        tokens_tensor = torch.tensor(padded_input_tokens,
-                                     dtype=torch.long,
-                                     device="cuda")
-        positions_tensor = torch.tensor(padded_input_positions,
-                                        dtype=torch.long,
-                                        device="cuda")
-        slot_mapping_tensor = torch.tensor(padded_slot_mapping,
-                                           dtype=torch.int,
-                                           device="cuda")
-        context_lens_tensor = torch.tensor(context_lens,
-                                           dtype=torch.int,
-                                           device="cuda")
-        block_tables_tensor = torch.tensor(padded_block_tables,
-                                           dtype=torch.int,
-                                           device="cuda")
-        seq_data: Dict[int, SequenceData] = {}
-        for seq_group_metadata in seq_group_metadata_list:
-            seq_data.update(seq_group_metadata.seq_data)
-        input_metadata = InputMetadata(
-            seq_groups=seq_groups,
-            seq_data=seq_data,
-            prompt_lens=prompt_lens,
-            slot_mapping=slot_mapping_tensor,
-            context_lens=context_lens_tensor,
-            max_context_len=max_context_len,
-            block_tables=block_tables_tensor,
-            sliding_window=self.sliding_window,
-        )
-        return tokens_tensor, positions_tensor, input_metadata
-    @torch.inference_mode()
-    def execute_model(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-    ) -> SamplerOutput:
-        # Issue cache operations.
-        issued_cache_op = False
-        if blocks_to_swap_in:
-            self.cache_engine.swap_in(blocks_to_swap_in)
-            issued_cache_op = True
-        if blocks_to_swap_out:
-            self.cache_engine.swap_out(blocks_to_swap_out)
-            issued_cache_op = True
-        if blocks_to_copy:
-            self.cache_engine.copy(blocks_to_copy)
-            issued_cache_op = True
-        if issued_cache_op:
-            cache_events = self.cache_events
-        else:
-            cache_events = None
-        # If there is no input, we don't need to execute the model.
-        if not seq_group_metadata_list:
-            if cache_events is not None:
-                for event in cache_events:
-                    event.wait()
-            return {}
-        # Prepare input tensors.
-        input_tokens, input_positions, input_metadata = self._prepare_inputs(
-            seq_group_metadata_list)
-        # Execute the model.
-        output = self.model(
-            input_ids=input_tokens,
-            positions=input_positions,
-            kv_caches=self.gpu_cache,
-            input_metadata=input_metadata,
-            cache_events=cache_events,
-        )
-        return output
-def _init_distributed_environment(
-    parallel_config: ParallelConfig,
-    rank: int,
-    distributed_init_method: Optional[str] = None,
-) -> None:
-    """Initialize the distributed environment."""
-    if torch.distributed.is_initialized():
-        torch_world_size = torch.distributed.get_world_size()
-        if torch_world_size != parallel_config.world_size:
-            raise RuntimeError(
-                "torch.distributed is already initialized but the torch world "
-                "size does not match parallel_config.world_size "
-                f"({torch_world_size} vs. {parallel_config.world_size}).")
-    elif not distributed_init_method:
-        raise ValueError(
-            "distributed_init_method must be set if torch.distributed "
-            "is not already initialized")
-    else:
-        torch.distributed.init_process_group(
-            backend="nccl",
-            world_size=parallel_config.world_size,
-            rank=rank,
-            init_method=distributed_init_method,
-        )
-    # A small all_reduce for warmup.
-    torch.distributed.all_reduce(torch.zeros(1).cuda())
-    initialize_model_parallel(parallel_config.tensor_parallel_size,
-                              parallel_config.pipeline_parallel_size)
-def _pad_to_alignment(x: List[int], multiple_of: int, pad: int) -> List[int]:
-    return x + [pad] * ((-len(x)) % multiple_of)
-def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
-    return x + [pad] * (max_len - len(x))
-def _check_if_can_support_max_seq_len(max_seq_len: int,
-                                      block_size: int) -> None:
-    # Follows the logic in
-    # attention_kernels.cu::single_query_cached_kv_attention_launcher
-    max_shared_mem = get_max_shared_memory_bytes()
-    float32_bytes = torch.finfo(torch.float).bits // 8
-    padded_max_seq_len = (
-        (max_seq_len + block_size - 1) / block_size) * block_size
-    # padded_max_seq_len + extra buffer
-    required_shared_mem = (padded_max_seq_len + 512) * float32_bytes
-    if padded_max_seq_len * float32_bytes > max_shared_mem:
-        raise RuntimeError(
-            f"vLLM cannot currently support max_model_len={max_seq_len} "
-            f"with block_size={block_size} on GPU with compute "
-            f"capability {torch.cuda.get_device_capability()} "
-            f"(required shared memory {required_shared_mem} > "
-            f"available shared memory {max_shared_mem}). "
-            "This will be fixed in a future release.")
-def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
-    # Check if the GPU supports the dtype.
-    if torch_dtype == torch.bfloat16:
-        compute_capability = torch.cuda.get_device_capability()
-        if compute_capability[0] < 8:
-            gpu_name = torch.cuda.get_device_name()
-            raise ValueError(
-                "Bfloat16 is only supported on GPUs with compute capability "
-                f"of at least 8.0. Your {gpu_name} GPU has compute capability "
-                f"{compute_capability[0]}.{compute_capability[1]}.")