update

e019635f · xuxzh1 · 64def8e2 · 64def8e2 · 64def8e2 · 64def8e2
Commit e019635f authored Nov 01, 2024 by xuxzh1 🎱
20 changed files
--- a/server/vllm/vllm/model_executor/models/llama.py
+++ b/server/vllm/vllm/model_executor/models/llama.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only LLaMA model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import Any, Dict, List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import LlamaConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.layers.quantized_linear import ParallelLinear
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import VocabParallelEmbedding
-from vllm.model_executor.quantization_utils import QuantizationConfig
-from vllm.model_executor.weight_utils import (
-    convert_pyslice_to_tensor, hf_model_weights_iterator,
-    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab)
-from vllm.sequence import SamplerOutput
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class LlamaMLP(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
-        super().__init__()
-        self.gate_up_proj = ParallelLinear.column(hidden_size,
-                                                  2 * intermediate_size,
-                                                  bias=False,
-                                                  gather_output=False,
-                                                  quant_config=quant_config)
-        self.down_proj = ParallelLinear.row(intermediate_size,
-                                            hidden_size,
-                                            bias=False,
-                                            input_is_parallel=True,
-                                            quant_config=quant_config)
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
-
-
-class LlamaAttention(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
-        super().__init__()
-        self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.total_num_kv_heads = num_kv_heads
-        if self.total_num_kv_heads >= tp_size:
-            # Number of KV heads is greater than TP size, so we partition
-            # the KV heads across multiple tensor parallel GPUs.
-            assert self.total_num_kv_heads % tp_size == 0
-        else:
-            # Number of KV heads is less than TP size, so we replicate
-            # the KV heads across multiple tensor parallel GPUs.
-            assert tp_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        num_kv_heads_replicas = max(1, tp_size // self.total_num_kv_heads)
-        self.head_dim = hidden_size // self.total_num_heads
-        self.q_size = self.num_heads * self.head_dim
-        self.kv_size = self.num_kv_heads * self.head_dim
-        self.scaling = self.head_dim**-0.5
-        self.rope_theta = rope_theta
-        self.max_position_embeddings = max_position_embeddings
-
-        self.qkv_proj = ParallelLinear.column(
-            hidden_size,
-            (self.total_num_heads +
-             2 * self.total_num_kv_heads * num_kv_heads_replicas) *
-            self.head_dim,
-            bias=False,
-            gather_output=False,
-            quant_config=quant_config,
-        )
-        self.o_proj = ParallelLinear.row(
-            self.total_num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-            input_is_parallel=True,
-            quant_config=quant_config,
-        )
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
-            self.head_dim,
-            self.scaling,
-            base=self.rope_theta,
-            max_position=self.max_position_embeddings,
-            rotary_dim=self.head_dim,
-            num_kv_heads=self.num_kv_heads,
-            rope_scaling=rope_scaling)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
-class LlamaDecoderLayer(nn.Module):
-
-    def __init__(
-        self,
-        config: LlamaConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        # Requires transformers > 4.32.0
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        self.self_attn = LlamaAttention(
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
-            max_position_embeddings=max_position_embeddings,
-            quant_config=quant_config,
-        )
-        self.mlp = LlamaMLP(
-            hidden_size=self.hidden_size,
-            intermediate_size=config.intermediate_size,
-            hidden_act=config.hidden_act,
-            quant_config=quant_config,
-        )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        # Self Attention
-        residual = hidden_states
-        hidden_states = self.input_layernorm(hidden_states)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-        return hidden_states
-
-
-class LlamaModel(nn.Module):
-
-    def __init__(
-        self,
-        config: LlamaConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
-        super().__init__()
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.embed_tokens = VocabParallelEmbedding(
-            vocab_size,
-            config.hidden_size,
-        )
-        self.layers = nn.ModuleList([
-            LlamaDecoderLayer(config, quant_config)
-            for _ in range(config.num_hidden_layers)
-        ])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
-        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            layer = self.layers[i]
-            hidden_states = layer(
-                positions,
-                hidden_states,
-                kv_caches[i],
-                input_metadata,
-                cache_event,
-            )
-        hidden_states = self.norm(hidden_states)
-        return hidden_states
-
-
-class LlamaForCausalLM(nn.Module):
-
-    def __init__(
-        self,
-        config: LlamaConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
-        super().__init__()
-        self.config = config
-        self.quant_config = quant_config
-        self.model = LlamaModel(config, quant_config)
-        vocab_size = ((config.vocab_size + 63) // 64) * 64
-        # NOTE: The LM head is not quantized.
-        self.lm_head = ParallelLinear.column(config.hidden_size,
-                                             vocab_size,
-                                             bias=False,
-                                             gather_output=False,
-                                             quant_config=None)
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    _column_parallel_layers = []
-    _row_parallel_layers = ["o_proj", "down_proj"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        if self.quant_config is None:
-            weight_suffixes = ["weight"]
-        else:
-            weight_suffixes = self.quant_config.get_tp_tensor_names()
-
-        column_parallel_weights: List[str] = []
-        for layer in self._column_parallel_layers:
-            for suffix in weight_suffixes:
-                column_parallel_weights.append(f"{layer}.{suffix}")
-        row_parallel_weights: List[str] = []
-        for layer in self._row_parallel_layers:
-            for suffix in weight_suffixes:
-                row_parallel_weights.append(f"{layer}.{suffix}")
-
-        tp_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        q_proj_shard_size = (self.config.hidden_size // tp_size)
-        num_kv_heads_replicas = max(1,
-                                    tp_size // self.config.num_key_value_heads)
-        num_kv_heads_per_gpu = max(1,
-                                   self.config.num_key_value_heads // tp_size)
-        kv_proj_shard_size = (self.config.hidden_size //
-                              self.config.num_attention_heads *
-                              num_kv_heads_per_gpu)
-        attention_weight_specs = [
-            # (weight_name, shard_size, offset)
-            ("q_proj", q_proj_shard_size, 0),
-            ("k_proj", kv_proj_shard_size, q_proj_shard_size),
-            ("v_proj", kv_proj_shard_size,
-             q_proj_shard_size + kv_proj_shard_size),
-        ]
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            is_packed = False
-            is_transposed = False
-            if self.quant_config is not None:
-                is_packed = self.quant_config.is_packed(name)
-                is_transposed = self.quant_config.is_transposed(name)
-            if is_transposed:
-                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-                loaded_weight = loaded_weight.T
-
-            is_attention_weight = False
-            for weight_name, shard_size, offset in attention_weight_specs:
-                if weight_name not in name:
-                    continue
-                param = state_dict[name.replace(weight_name, "qkv_proj")]
-                if is_transposed:
-                    param = param.T
-
-                if is_packed:
-                    shard_size //= self.quant_config.pack_factor
-                    offset //= self.quant_config.pack_factor
-
-                if weight_name in ["k_proj", "v_proj"]:
-                    shard_id = tp_rank // num_kv_heads_replicas
-                else:
-                    shard_id = tp_rank
-                loaded_weight = loaded_weight[shard_size *
-                                              shard_id:shard_size *
-                                              (shard_id + 1)]
-                param_slice = param.data[offset:offset + shard_size]
-                assert param_slice.shape == loaded_weight.shape
-
-                param_slice.copy_(loaded_weight)
-                is_attention_weight = True
-                break
-            if is_attention_weight:
-                continue
-
-            is_gate_up_weight = False
-            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
-                if weight_name not in name:
-                    continue
-                param = state_dict[name.replace(weight_name, "gate_up_proj")]
-                if is_transposed:
-                    param = param.T
-
-                shard_size = param.shape[0] // 2
-                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
-                                              (tp_rank + 1)]
-                param_slice = param.data[shard_size * stride_id:shard_size *
-                                         (stride_id + 1)]
-                assert param_slice.shape == loaded_weight.shape
-                param_slice.copy_(loaded_weight)
-                is_gate_up_weight = True
-                break
-            if is_gate_up_weight:
-                continue
-
-            param = state_dict[name]
-            if is_transposed:
-                param = param.T
-
-            if "embed_tokens" in name or "lm_head" in name:
-                load_padded_tensor_parallel_vocab(param, loaded_weight,
-                                                  tp_rank)
-                continue
-
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         column_parallel_weights,
-                                         row_parallel_weights, tp_rank)
--- a/server/vllm/vllm/model_executor/models/mpt.py
+++ b/server/vllm/vllm/model_executor/models/mpt.py
-# coding=utf-8
-# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
-import math
-from typing import List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
-                                              hf_model_weights_iterator,
-                                              load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-from vllm.transformers_utils.configs.mpt import MPTConfig
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-def _get_alibi_slopes(
-    total_num_heads: int,
-    alibi_bias_max: int,
-) -> torch.Tensor:
-    next_power_of_2 = 2**math.ceil(math.log2(total_num_heads))
-    m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32)
-    m = m.mul(alibi_bias_max / next_power_of_2)
-    slopes = 1.0 / torch.pow(2, m)
-    if next_power_of_2 != total_num_heads:
-        slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads]
-    return slopes
-
-
-class MPTAttention(nn.Module):
-
-    def __init__(self, config: MPTConfig):
-        super().__init__()
-        self.d_model = config.d_model
-        self.total_num_heads = config.n_heads
-        self.clip_qkv = config.attn_config["clip_qkv"]
-        self.qk_ln = config.attn_config["qk_ln"]
-        self.alibi_bias_max = config.attn_config["alibi_bias_max"]
-        assert not config.attn_config["prefix_lm"]
-        assert config.attn_config["alibi"]
-
-        self.qkv_proj = ColumnParallelLinear(
-            self.d_model,
-            3 * self.d_model,
-            bias=not config.no_bias,
-            gather_output=False,
-        )
-        if self.qk_ln:
-            self.q_ln = nn.LayerNorm(self.d_model)
-            self.k_ln = nn.LayerNorm(self.d_model)
-        self.out_proj = RowParallelLinear(
-            self.d_model,
-            self.d_model,
-            bias=not config.no_bias,
-            input_is_parallel=True,
-        )
-
-        tp_world_size = get_tensor_model_parallel_world_size()
-        assert self.total_num_heads % tp_world_size == 0
-        self.num_heads = self.total_num_heads // tp_world_size
-
-        # Create the alibi slopes and slice them.
-        tp_rank = get_tensor_model_parallel_rank()
-        head_start = tp_rank * self.num_heads
-        head_end = (tp_rank + 1) * self.num_heads
-        alibi_slopes = _get_alibi_slopes(self.total_num_heads,
-                                         self.alibi_bias_max)
-        alibi_slopes = alibi_slopes[head_start:head_end].tolist()
-
-        self.head_dim = self.d_model // self.total_num_heads
-        scaling = self.head_dim**-0.5
-        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
-                                            scaling, alibi_slopes)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        del position_ids  # unused.
-        qkv, _ = self.qkv_proj(hidden_states)
-        if self.clip_qkv is not None:
-            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
-        if self.qk_ln:
-            q = self.q_ln(q)
-            k = self.k_ln(k)
-        k_cache, v_cache = kv_cache
-        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
-                                cache_event)
-        output, _ = self.out_proj(attn_output)
-        return output
-
-
-class MPTMLP(nn.Module):
-
-    def __init__(self, config: MPTConfig):
-        super().__init__()
-        hidden_size = config.d_model
-        expansion_ratio = config.expansion_ratio
-        intermediate_size = expansion_ratio * hidden_size
-        self.up_proj = ColumnParallelLinear(
-            hidden_size,
-            intermediate_size,
-            bias=not config.no_bias,
-            gather_output=False,
-        )
-        self.act = get_act_fn("gelu")
-        self.down_proj = RowParallelLinear(
-            intermediate_size,
-            hidden_size,
-            bias=not config.no_bias,
-            input_is_parallel=True,
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x, _ = self.up_proj(x)
-        x = self.act(x)
-        x, _ = self.down_proj(x)
-        return x
-
-
-class MPTBlock(nn.Module):
-
-    def __init__(self, config: MPTConfig):
-        super().__init__()
-        hidden_size = config.d_model
-        self.norm_1 = nn.LayerNorm(hidden_size)
-        self.attn = MPTAttention(config)
-        self.norm_2 = nn.LayerNorm(hidden_size)
-        self.ffn = MPTMLP(config)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        x = self.norm_1(hidden_states)
-        x = self.attn(
-            position_ids=position_ids,
-            hidden_states=x,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        hidden_states = hidden_states + x
-        x = self.norm_2(hidden_states)
-        x = self.ffn(x)
-        hidden_states = hidden_states + x
-        return hidden_states
-
-
-class MPTModel(nn.Module):
-
-    def __init__(self, config: MPTConfig):
-        super().__init__()
-        assert config.embedding_fraction == 1.0
-        assert config.norm_type == "low_precision_layernorm"
-
-        self.wte = VocabParallelEmbedding(
-            config.vocab_size,
-            config.d_model,
-        )
-        self.blocks = nn.ModuleList(
-            [MPTBlock(config) for _ in range(config.n_layers)])
-        self.norm_f = nn.LayerNorm(config.d_model)
-        if config.no_bias:
-            for module in self.modules():
-                if hasattr(module, "bias"):
-                    if isinstance(module.bias, nn.Parameter):
-                        # Remove the bias term in Linear and LayerNorm.
-                        module.register_parameter("bias", None)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        hidden_states = self.wte(input_ids)
-        for i in range(len(self.blocks)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            block = self.blocks[i]
-            hidden_states = block(
-                position_ids,
-                hidden_states,
-                kv_caches[i],
-                input_metadata,
-                cache_event,
-            )
-        hidden_states = self.norm_f(hidden_states)
-        return hidden_states
-
-
-class MPTForCausalLM(nn.Module):
-
-    def __init__(self, config: MPTConfig):
-        super().__init__()
-        self.config = config
-        assert config.tie_word_embeddings
-
-        self.transformer = MPTModel(config)
-        # TODO(zhuohan): create a new weight after implementing pipeline
-        #                parallelism
-        self.lm_head_weight = self.transformer.wte.weight
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = ["wte.weight", "up_proj.weight", "up_proj.bias"]
-    _row_parallel_weights = ["out_proj.weight", "down_proj.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        tp_world_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "Wqkv" in name:
-                # NOTE(woosuk): MPT's fused QKV has the shape of
-                # [3 * num_heads * head_size, hidden_size].
-                # When tensor model parallelism is used, we need to shard
-                # the weight along the hidden dimension.
-                total_num_heads = self.config.num_attention_heads
-                hidden_size = self.config.hidden_size
-                head_size = hidden_size // total_num_heads
-                num_heads = total_num_heads // tp_world_size
-                head_start = tp_rank * num_heads
-                head_end = (tp_rank + 1) * num_heads
-                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-                if name.endswith(".weight"):
-                    loaded_weight = loaded_weight.view(3, total_num_heads,
-                                                       head_size, hidden_size)
-                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
-                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
-                elif name.endswith(".bias"):
-                    loaded_weight = loaded_weight.view(3, total_num_heads,
-                                                       head_size)
-                    loaded_weight = loaded_weight[:, head_start:head_end, :]
-                    loaded_weight = loaded_weight.reshape(-1)
-                else:
-                    raise ValueError(f"Unexpected parameter name {name}")
-                name = name.replace("Wqkv", "qkv_proj")
-            param = state_dict[name]
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights, tp_rank)
--- a/server/vllm/vllm/model_executor/models/opt.py
+++ b/server/vllm/vllm/model_executor/models/opt.py
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights
-# reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only OPT model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import OPTConfig
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttention
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
-                                              load_tensor_parallel_weights)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
-                                                       ColumnParallelLinear,
-                                                       RowParallelLinear)
-from vllm.sequence import SamplerOutput
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class OPTLearnedPositionalEmbedding(nn.Embedding):
-
-    def __init__(self, num_embeddings: int, embedding_dim: int):
-        # OPT is set up so that if padding_idx is specified then offset the
-        # embedding ids by 2 and adjust num_embeddings appropriately. Other
-        # models don't have this hack
-        self.offset = 2
-        super().__init__(num_embeddings + self.offset, embedding_dim)
-
-    def forward(self, positions: torch.Tensor):
-        return super().forward(positions + self.offset)
-
-
-class OPTAttention(nn.Module):
-
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        bias: bool = True,
-    ) -> None:
-        super().__init__()
-        self.embed_dim = embed_dim
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
-        total_num_heads = num_heads
-        assert num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = total_num_heads // tensor_model_parallel_world_size
-        self.head_dim = embed_dim // total_num_heads
-        self.scaling = self.head_dim**-0.5
-
-        self.qkv_proj = ColumnParallelLinear(
-            embed_dim,
-            3 * embed_dim,
-            bias=bias,
-            gather_output=False,
-        )
-        self.out_proj = RowParallelLinear(
-            embed_dim,
-            embed_dim,
-            bias=bias,
-            input_is_parallel=True,
-        )
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scale=self.scaling)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
-        key_cache, value_cache = kv_cache
-        attn_output = self.attn(q, k, v, key_cache, value_cache,
-                                input_metadata, cache_event)
-        output, _ = self.out_proj(attn_output)
-        return output
-
-
-class OPTDecoderLayer(nn.Module):
-
-    def __init__(self, config: OPTConfig):
-        super().__init__()
-        self.config = config
-        self.embed_dim = config.hidden_size
-        self.self_attn = OPTAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.num_attention_heads,
-            bias=config.enable_bias,
-        )
-        self.do_layer_norm_before = config.do_layer_norm_before
-        self.activation_fn = get_act_fn(config.activation_function)
-
-        self.self_attn_layer_norm = nn.LayerNorm(
-            self.embed_dim,
-            elementwise_affine=config.layer_norm_elementwise_affine)
-        self.fc1 = ColumnParallelLinear(
-            self.embed_dim,
-            config.ffn_dim,
-            bias=config.enable_bias,
-            gather_output=False,
-        )
-        self.fc2 = RowParallelLinear(
-            config.ffn_dim,
-            self.embed_dim,
-            bias=config.enable_bias,
-            input_is_parallel=True,
-        )
-        self.final_layer_norm = nn.LayerNorm(
-            self.embed_dim,
-            elementwise_affine=config.layer_norm_elementwise_affine)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        # Self Attention
-        residual = hidden_states
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-        hidden_states = self.self_attn(hidden_states=hidden_states,
-                                       kv_cache=kv_cache,
-                                       input_metadata=input_metadata,
-                                       cache_event=cache_event)
-        hidden_states = residual + hidden_states
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
-
-        # Fully Connected
-        residual = hidden_states
-        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
-        if self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states, _ = self.fc1(hidden_states)
-        hidden_states = self.activation_fn(hidden_states)
-        hidden_states, _ = self.fc2(hidden_states)
-        hidden_states = residual + hidden_states
-        # 350m applies layer norm AFTER attention
-        if not self.do_layer_norm_before:
-            hidden_states = self.final_layer_norm(hidden_states)
-        return hidden_states
-
-
-class OPTDecoder(nn.Module):
-
-    def __init__(self, config: OPTConfig):
-        super().__init__()
-        self.config = config
-        self.padding_idx = config.pad_token_id
-        self.max_target_positions = config.max_position_embeddings
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
-            config.word_embed_proj_dim,
-        )
-        # Positional embeddings are replicated (not sharded).
-        self.embed_positions = OPTLearnedPositionalEmbedding(
-            config.max_position_embeddings, config.hidden_size)
-
-        # Project out & in will be replicated if they exist.
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_out = nn.Linear(config.hidden_size,
-                                         config.word_embed_proj_dim,
-                                         bias=False)
-        else:
-            self.project_out = None
-
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_in = nn.Linear(config.word_embed_proj_dim,
-                                        config.hidden_size,
-                                        bias=False)
-        else:
-            self.project_in = None
-
-        # Note that the only purpose of `config._remove_final_layer_norm` is to
-        # keep backward compatibility with checkpoints that have been fine-tuned
-        # before transformers v4.20.1
-        # see https://github.com/facebookresearch/metaseq/pull/164
-        if config.do_layer_norm_before and not config._remove_final_layer_norm:
-            self.final_layer_norm = nn.LayerNorm(
-                config.hidden_size,
-                elementwise_affine=config.layer_norm_elementwise_affine)
-        else:
-            self.final_layer_norm = None
-
-        self.layers = nn.ModuleList(
-            [OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        inputs_embeds = self.embed_tokens(input_ids)
-        pos_embeds = self.embed_positions(positions)
-        if self.project_in is not None:
-            inputs_embeds = self.project_in(inputs_embeds)
-        hidden_states = inputs_embeds + pos_embeds
-
-        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            layer = self.layers[i]
-            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
-                                  cache_event)
-
-        if self.final_layer_norm is not None:
-            hidden_states = self.final_layer_norm(hidden_states)
-        if self.project_out is not None:
-            hidden_states = self.project_out(hidden_states)
-        return hidden_states
-
-
-class OPTModel(nn.Module):
-
-    def __init__(self, config: OPTConfig):
-        super().__init__()
-        self.decoder = OPTDecoder(config)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        return self.decoder(input_ids, positions, kv_caches, input_metadata,
-                            cache_events)
-
-
-class OPTForCausalLM(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.model = OPTModel(config)
-        # TODO(zhuohan): create a new weight after implementing pipeline
-        #                parallelism
-        self.lm_head_weight = self.model.decoder.embed_tokens.weight
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = [
-        "embed_tokens.weight", "fc1.weight", "fc1.bias"
-    ]
-    _row_parallel_weights = ["out_proj.weight", "fc2.weight"]
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "lm_head.weight" in name:
-                continue
-
-            if name.startswith("decoder."):
-                name = "model." + name
-
-            is_attention_weight = False
-            for stride_id, att_weight_name in enumerate(
-                ["q_proj", "k_proj", "v_proj"]):
-                if att_weight_name not in name:
-                    continue
-                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
-                shard_size = param.shape[0] // 3
-                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank:shard_size *
-                    (tensor_model_parallel_rank + 1)]
-                param_slice = param.data[shard_size * stride_id:shard_size *
-                                         (stride_id + 1)]
-                assert param_slice.shape == loaded_weight.shape
-                param_slice.copy_(loaded_weight)
-                is_attention_weight = True
-                break
-            if is_attention_weight:
-                continue
-
-            param = state_dict[name]
-            load_tensor_parallel_weights(param, loaded_weight, name,
-                                         self._column_parallel_weights,
-                                         self._row_parallel_weights,
-                                         tensor_model_parallel_rank)
--- a/server/vllm/vllm/model_executor/models/qwen.py
+++ b/server/vllm/vllm/model_executor/models/qwen.py
-# coding=utf-8
-# Adapted from
-# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
-# Copyright (c) Alibaba Cloud.
-# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
-"""Inference-only QWen model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
-from typing import Any, Dict, List, Optional, Tuple
-
-import torch
-from torch import nn
-
-from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (
-    convert_pyslice_to_tensor,
-    hf_model_weights_iterator,
-    load_padded_tensor_parallel_vocab,
-    load_tensor_parallel_weights,
-)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.parallel_utils.layers import (
-    VocabParallelEmbedding,
-    ColumnParallelLinear,
-    RowParallelLinear,
-)
-from vllm.sequence import SamplerOutput
-from vllm.transformers_utils.configs.qwen import QWenConfig
-
-KVCache = Tuple[torch.Tensor, torch.Tensor]
-
-
-class QWenMLP(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str = "silu",
-    ):
-        super().__init__()
-        self.gate_up_proj = ColumnParallelLinear(
-            hidden_size,
-            2 * intermediate_size,
-            bias=False,
-            gather_output=False,
-        )
-        self.c_proj = RowParallelLinear(
-            intermediate_size,
-            hidden_size,
-            bias=False,
-            input_is_parallel=True,
-        )
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-    def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.c_proj(x)
-        return x
-
-
-class QWenAttention(nn.Module):
-
-    def __init__(self,
-                 hidden_size: int,
-                 num_heads: int,
-                 max_position_embeddings: int,
-                 rope_theta: float = 10000,
-                 rope_scaling: Optional[Dict[str, Any]] = None):
-        super().__init__()
-        self.hidden_size = hidden_size
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
-        )
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = (self.total_num_heads //
-                          tensor_model_parallel_world_size)
-        self.head_dim = hidden_size // self.total_num_heads
-
-        # pylint: disable=invalid-name
-        self.c_attn = ColumnParallelLinear(
-            hidden_size,
-            3 * hidden_size,
-            bias=True,
-            gather_output=False,
-        )
-        self.c_proj = RowParallelLinear(
-            self.total_num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-            input_is_parallel=True,
-        )
-        self.scaling = self.head_dim**-0.5
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
-            self.head_dim,
-            self.scaling,
-            rotary_dim=self.head_dim,
-            base=rope_theta,
-            max_position=max_position_embeddings,
-            rope_scaling=rope_scaling)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        qkv, _ = self.c_attn(hidden_states)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
-
-        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
-
-        output, _ = self.c_proj(attn_output)
-        return output
-
-
-class QWenBlock(nn.Module):
-
-    def __init__(self, config: QWenConfig):
-        super().__init__()
-        self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
-
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        self.attn = QWenAttention(config.hidden_size,
-                                  config.num_attention_heads,
-                                  config.max_position_embeddings,
-                                  rope_theta=rope_theta,
-                                  rope_scaling=rope_scaling)
-
-        self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
-
-        self.mlp = QWenMLP(config.hidden_size, config.intermediate_size // 2)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        # Self Attention
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
-        hidden_states = self.attn(
-            positions=positions,
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-        return hidden_states
-
-
-class QWenModel(nn.Module):
-
-    def __init__(self, config: QWenConfig):
-        super().__init__()
-        self.config = config
-        self.vocab_size = config.vocab_size
-
-        vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.wte = VocabParallelEmbedding(
-            vocab_size,
-            config.hidden_size,
-        )
-        self.h = nn.ModuleList(
-            [QWenBlock(config) for _ in range(config.num_hidden_layers)])
-        self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> torch.Tensor:
-        hidden_states = self.wte(input_ids)
-        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
-            layer = self.h[i]
-            hidden_states = layer(
-                positions,
-                hidden_states,
-                kv_caches[i],
-                input_metadata,
-                cache_event,
-            )
-        hidden_states = self.ln_f(hidden_states)
-        return hidden_states
-
-
-class QWenLMHeadModel(nn.Module):
-
-    def __init__(self, config: QWenConfig):
-        super().__init__()
-        self.config = config
-        self.transformer = QWenModel(config)
-        vocab_size = ((config.vocab_size + 63) // 64) * 64
-        self.lm_head = ColumnParallelLinear(
-            config.hidden_size,
-            vocab_size,
-            bias=False,
-            gather_output=False,
-        )
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
-        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
-        hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
-        return next_tokens
-
-    _column_parallel_weights = []
-    _row_parallel_weights = ["c_proj.weight"]
-
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
-        tp_world_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        state_dict = self.state_dict()
-
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-
-            if "c_attn" in name:
-                total_num_heads = self.config.num_attention_heads
-                hidden_size = self.config.hidden_size
-                head_size = hidden_size // total_num_heads
-                num_heads = total_num_heads // tp_world_size
-                head_start = tp_rank * num_heads
-                head_end = (tp_rank + 1) * num_heads
-
-                if "weight" in name:
-                    loaded_weight = loaded_weight.view(3, total_num_heads,
-                                                       head_size, hidden_size)
-                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
-                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
-                elif "bias" in name:
-                    loaded_weight = loaded_weight.view(3, total_num_heads,
-                                                       head_size)
-                    loaded_weight = loaded_weight[:, head_start:head_end, :]
-                    loaded_weight = loaded_weight.reshape(-1)
-
-            is_gate_up_weight = False
-            for stride_id, weight_name in enumerate(["w2", "w1"]):
-                if weight_name not in name:
-                    continue
-                param = state_dict[name.replace(weight_name, "gate_up_proj")]
-                shard_size = param.shape[0] // 2
-                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
-                                              (tp_rank + 1)]
-                param_slice = param.data[shard_size * stride_id:shard_size *
-                                         (stride_id + 1)]
-                assert param_slice.shape == loaded_weight.shape
-                param_slice.copy_(loaded_weight)
-                is_gate_up_weight = True
-                break
-            if is_gate_up_weight:
-                continue
-
-            param = state_dict[name]
-
-            if "wte" in name or "lm_head" in name:
-                load_padded_tensor_parallel_vocab(param, loaded_weight,
-                                                  tp_rank)
-                continue
-
-            load_tensor_parallel_weights(
-                param,
-                loaded_weight,
-                name,
-                self._column_parallel_weights,
-                self._row_parallel_weights,
-                tp_rank,
-            )
--- a/server/vllm/vllm/model_executor/parallel_utils/README.md
+++ b/server/vllm/vllm/model_executor/parallel_utils/README.md
-The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference.
\ No newline at end of file
--- a/server/vllm/vllm/model_executor/parallel_utils/__init__.py
+++ b/server/vllm/vllm/model_executor/parallel_utils/__init__.py
--- a/server/vllm/vllm/model_executor/parallel_utils/communication_op.py
+++ b/server/vllm/vllm/model_executor/parallel_utils/communication_op.py
-import torch
-
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size,
-    get_tensor_model_parallel_group,
-)
-
-
-def tensor_model_parallel_all_reduce(input_):
-    """All-reduce the input tensor across model parallel group.
-
-    NOTE: This operation is applied in-place on the input tensor.
-    """
-    # Bypass the function if we are using only 1 GPU.
-    if get_tensor_model_parallel_world_size() == 1:
-        return input_
-    # All-reduce.
-    torch.distributed.all_reduce(input_,
-                                 group=get_tensor_model_parallel_group())
-    return input_
-
-
-def tensor_model_parallel_all_gather(input_, dim=-1):
-    """All-gather the input tensor across model parallel group."""
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-    assert -input_.dim() <= dim < input_.dim(), (
-        f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
-    if dim < 0:
-        # Convert negative dim to positive.
-        dim += input_.dim()
-    input_size = input_.size()
-    # Allocate output tensor.
-    output_tensor = torch.empty((world_size, ) + input_size,
-                                dtype=input_.dtype,
-                                device=input_.device)
-    # All-gather.
-    torch.distributed.all_gather_into_tensor(
-        output_tensor, input_, group=get_tensor_model_parallel_group())
-    # Reshape
-    output_tensor = output_tensor.movedim(0, dim)
-    output_tensor = output_tensor.reshape(input_size[:dim] +
-                                          (world_size * input_size[dim], ) +
-                                          input_size[dim + 1:])
-    return output_tensor
--- a/server/vllm/vllm/model_executor/parallel_utils/layers.py
+++ b/server/vllm/vllm/model_executor/parallel_utils/layers.py
-# Copyright 2023 The vLLM team.
-# Adapted from
-# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-# Parts of the code here are adapted from PyTorch
-# repo: https://github.com/pytorch/pytorch
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-from torch.nn.parameter import Parameter
-
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.quantization_utils import QuantizationConfig
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
-
-from vllm.model_executor.parallel_utils.utils import (
-    divide,
-    VocabUtility,
-    split_tensor_along_last_dim,
-)
-
-
-class VocabParallelEmbedding(torch.nn.Module):
-    """Embedding parallelized in the vocabulary dimension.
-
-    This is mainly adapted from torch.nn.Embedding and all the default
-    values are kept.
-    Arguments:
-        num_embeddings: vocabulary size.
-        embedding_dim: size of hidden state.
-        params_dtype: type of the parameters.
-    """
-
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 params_dtype: Optional[torch.dtype] = None):
-        super().__init__()
-
-        # Keep the input dimensions.
-        self.num_embeddings = num_embeddings
-        self.embedding_dim = embedding_dim
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        self.tp_size = get_tensor_model_parallel_world_size()
-        # TODO: Handle vocab padding here.
-        # Divide the weight matrix along the vocaburaly dimension.
-        self.vocab_start_index, self.vocab_end_index = (
-            VocabUtility.vocab_range_from_global_vocab_size(
-                self.num_embeddings, get_tensor_model_parallel_rank(),
-                self.tp_size))
-        self.num_embeddings_per_partition = (self.vocab_end_index -
-                                             self.vocab_start_index)
-
-        self.weight = Parameter(
-            torch.empty(self.num_embeddings_per_partition,
-                        self.embedding_dim,
-                        device=torch.cuda.current_device(),
-                        dtype=params_dtype))
-
-    def forward(self, input_):
-        if self.tp_size > 1:
-            # Build the mask.
-            input_mask = ((input_ < self.vocab_start_index) |
-                          (input_ >= self.vocab_end_index))
-            # Mask the input.
-            masked_input = input_.clone() - self.vocab_start_index
-            masked_input[input_mask] = 0
-        else:
-            masked_input = input_
-            # Get the embeddings.
-        output_parallel = F.embedding(masked_input, self.weight)
-        # Mask the output embedding.
-        if self.tp_size > 1:
-            output_parallel[input_mask, :] = 0.0
-        # Reduce across all the model parallel GPUs.
-        output = tensor_model_parallel_all_reduce(output_parallel)
-        return output
-
-
-class ColumnParallelLinear(torch.nn.Module):
-    """Linear layer with column parallelism.
-
-    The linear layer is defined as Y = XA + b. A is parallelized along
-    its second dimension as A = [A_1, ..., A_p].
-
-    Arguments:
-        input_size: first dimension of matrix A.
-        output_size: second dimension of matrix A.
-
-    Keyword Arguments
-        bias: If true, add bias
-        gather_output: If true, call all-gather on output and make Y available
-                       to all GPUs, otherwise, every GPU will have its output
-                       which is Y_i = XA_i
-        skip_bias_add: This was added to enable performance optimizations where
-                       bias can be fused with other element-wise operations. we
-                       skip adding bias but instead return it.
-        params_dtype: Data type for the parameters.
-        quant_config: Quantization configuration.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        bias: bool = True,
-        gather_output: bool = True,
-        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ):
-        super().__init__()
-
-        # Keep input parameters
-        self.input_size = input_size
-        self.output_size = output_size
-        self.gather_output = gather_output
-        # Divide the weight matrix along the last dimension.
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.output_size_per_partition = divide(output_size, self.tp_size)
-        self.skip_bias_add = skip_bias_add
-        self.quant_config = quant_config
-
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        # Parameters.
-        # NOTE: torch.nn.functional.linear performs XA^T + b and as a result
-        # we allocate the transpose.
-        self.create_weights(params_dtype)
-
-        if bias:
-            self.bias = Parameter(
-                torch.empty(self.output_size_per_partition,
-                            device=torch.cuda.current_device(),
-                            dtype=params_dtype))
-        else:
-            self.register_parameter('bias', None)
-
-    def create_weights(self, dtype: torch.dtype) -> None:
-        self.weight = Parameter(
-            torch.empty(self.output_size_per_partition,
-                        self.input_size,
-                        device=torch.cuda.current_device(),
-                        dtype=dtype))
-
-    def apply_weights(
-        self,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        return F.linear(x, self.weight, bias)
-
-    def forward(self, input_):
-        """Forward of ColumnParallelLinear
-
-        Args:
-            input_: Tensor whose last dimension is `input_size`.
-
-        Returns:
-            - output
-            - bias
-        """
-        bias = self.bias if not self.skip_bias_add else None
-
-        input_parallel = input_
-        # Matrix multiply.
-        output_parallel = self.apply_weights(input_parallel, bias)
-        if self.gather_output:
-            # All-gather across the partitions.
-            output = tensor_model_parallel_all_gather(output_parallel)
-        else:
-            output = output_parallel
-        output_bias = self.bias if self.skip_bias_add else None
-        return output, output_bias
-
-
-class RowParallelLinear(torch.nn.Module):
-    """Linear layer with row parallelism.
-
-    The linear layer is defined as Y = XA + b. A is parallelized along
-    its first dimension and X along its second dimension as:
-               -   -
-              | A_1 |
-              | .   |
-          A = | .   |        X = [X_1, ..., X_p]
-              | .   |
-              | A_p |
-               -   -
-    Arguments:
-        input_size: first dimension of matrix A.
-        output_size: second dimension of matrix A.
-
-    Keyword Arguments:
-        bias: If true, add bias. Note that bias is not parallelized.
-        input_is_parallel: If true, we assume that the input is already
-                           split across the GPUs and we do not split
-                           again.
-        skip_bias_add: This was added to enable performance optimization where
-                       bias can be fused with other element-wise operations.
-                       We skip adding bias but instead return it.
-        params_dtype: Data type for the parameters.
-        quant_config: Quantization configuration.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        bias: bool = True,
-        input_is_parallel: bool = False,
-        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        reduce_results: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
-    ):
-        super().__init__()
-        # Keep input parameters
-        self.input_size = input_size
-        self.output_size = output_size
-        self.input_is_parallel = input_is_parallel
-        self.reduce_results = reduce_results
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        # Divide the weight matrix along the last dimension.
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.input_size_per_partition = divide(input_size, self.tp_size)
-        self.skip_bias_add = skip_bias_add
-        self.quant_config = quant_config
-
-        self.create_weights(params_dtype)
-
-        if not reduce_results and (bias and not skip_bias_add):
-            raise ValueError('When not reduce the results, adding bias to the '
-                             'results can lead to incorrect results')
-
-        if bias:
-            self.bias = Parameter(
-                torch.empty(self.output_size,
-                            device=torch.cuda.current_device(),
-                            dtype=params_dtype))
-
-            # Always initialize bias to zero.
-            with torch.no_grad():
-                self.bias.zero_()
-        else:
-            self.register_parameter('bias', None)
-
-    def create_weights(self, dtype: torch.dtype) -> None:
-        self.weight = Parameter(
-            torch.empty(self.output_size,
-                        self.input_size_per_partition,
-                        device=torch.cuda.current_device(),
-                        dtype=dtype))
-
-    def apply_weights(self, x: torch.Tensor) -> torch.Tensor:
-        return F.linear(x, self.weight)
-
-    def forward(self, input_):
-        """Forward of RowParallelLinear
-
-        Args:
-            input_: tensor whose last dimension is `input_size`. If
-                    `input_is_parallel` is set, then the last dimension
-                    is `input_size // tp_size`.
-
-        Returns:
-            - output
-            - bias
-        """
-        # Set up backprop all-reduce.
-        if self.input_is_parallel:
-            input_parallel = input_
-        else:
-            # TODO: simplify code below
-            tp_rank = get_tensor_model_parallel_rank()
-            splitted_input = split_tensor_along_last_dim(
-                input_, num_partitions=self.tp_size)
-            input_parallel = splitted_input[tp_rank].contiguous()
-
-        # Matrix multiply.
-        output_parallel = self.apply_weights(input_parallel)
-        if self.reduce_results and self.tp_size > 1:
-            output_ = tensor_model_parallel_all_reduce(output_parallel)
-        else:
-            output_ = output_parallel
-
-        if not self.skip_bias_add:
-            output = output_ + self.bias if self.bias is not None else output_
-            output_bias = None
-        else:
-            output = output_
-            output_bias = self.bias
-        return output, output_bias
--- a/server/vllm/vllm/model_executor/parallel_utils/parallel_state.py
+++ b/server/vllm/vllm/model_executor/parallel_utils/parallel_state.py
-# Copyright 2023 The vLLM team.
-# Adapted from
-# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-"""Model and data parallel groups."""
-
-import torch
-
-# Tensor model parallel group that the current rank belongs to.
-_TENSOR_MODEL_PARALLEL_GROUP = None
-# Pipeline model parallel group that the current rank belongs to.
-_PIPELINE_MODEL_PARALLEL_GROUP = None
-
-# A list of global ranks for each pipeline group to ease calculation of the
-# source rank when broadcasting from the first or last pipeline stage.
-_PIPELINE_GLOBAL_RANKS = None
-
-
-def initialize_model_parallel(
-    tensor_model_parallel_size: int = 1,
-    pipeline_model_parallel_size: int = 1,
-) -> None:
-    """
-    Initialize model parallel groups.
-
-    Arguments:
-        tensor_model_parallel_size: number of GPUs used for tensor model
-            parallelism.
-        pipeline_model_parallel_size: number of GPUs used for pipeline model
-            parallelism.
-
-    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
-    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
-    the model pipeline. The present function will
-    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
-        4 tensor model-parallel groups:
-            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
-        2 pipeline model-parallel groups:
-            [g0, g2, g4, g6], [g1, g3, g5, g7]
-    Note that for efficiency, the caller should make sure adjacent ranks
-    are on the same DGX box. For example if we are using 2 DGX-1 boxes
-    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
-    ranks 8 to 15 belong to the second box.
-    """
-    # Get world size and rank. Ensure some consistencies.
-    assert torch.distributed.is_initialized()
-    world_size: int = torch.distributed.get_world_size()
-
-    if (world_size !=
-            tensor_model_parallel_size * pipeline_model_parallel_size):
-        raise RuntimeError(
-            f"world_size ({world_size}) is not equal to "
-            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
-            f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
-
-    num_tensor_model_parallel_groups: int = (world_size //
-                                             tensor_model_parallel_size)
-    num_pipeline_model_parallel_groups: int = (world_size //
-                                               pipeline_model_parallel_size)
-    rank = torch.distributed.get_rank()
-
-    # Build the tensor model-parallel groups.
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    assert _TENSOR_MODEL_PARALLEL_GROUP is None, (
-        "tensor model parallel group is already initialized")
-    for i in range(num_tensor_model_parallel_groups):
-        ranks = range(i * tensor_model_parallel_size,
-                      (i + 1) * tensor_model_parallel_size)
-        group = torch.distributed.new_group(ranks)
-        if rank in ranks:
-            _TENSOR_MODEL_PARALLEL_GROUP = group
-
-    # Build the pipeline model-parallel groups.
-    global _PIPELINE_MODEL_PARALLEL_GROUP
-    global _PIPELINE_GLOBAL_RANKS
-    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, (
-        "pipeline model parallel group is already initialized")
-    for i in range(num_pipeline_model_parallel_groups):
-        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
-        group = torch.distributed.new_group(ranks)
-        if rank in ranks:
-            _PIPELINE_MODEL_PARALLEL_GROUP = group
-            _PIPELINE_GLOBAL_RANKS = ranks
-
-
-def model_parallel_is_initialized():
-    """Check if model and data parallel groups are initialized."""
-    return (_TENSOR_MODEL_PARALLEL_GROUP is not None
-            and _PIPELINE_MODEL_PARALLEL_GROUP is not None)
-
-
-def get_tensor_model_parallel_group():
-    """Get the tensor model parallel group the caller rank belongs to."""
-    assert _TENSOR_MODEL_PARALLEL_GROUP is not None, (
-        "tenosr model parallel group is not initialized")
-    return _TENSOR_MODEL_PARALLEL_GROUP
-
-
-def get_pipeline_model_parallel_group():
-    """Get the pipeline model parallel group the caller rank belongs to."""
-    assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, (
-        "pipeline model parallel group is not initialized")
-    return _PIPELINE_MODEL_PARALLEL_GROUP
-
-
-def get_tensor_model_parallel_world_size():
-    """Return world size for the tensor model parallel group."""
-    return torch.distributed.get_world_size(
-        group=get_tensor_model_parallel_group())
-
-
-def get_pipeline_model_parallel_world_size():
-    """Return world size for the pipeline model parallel group."""
-    return torch.distributed.get_world_size(
-        group=get_pipeline_model_parallel_group())
-
-
-def get_tensor_model_parallel_rank():
-    """Return my rank for the tensor model parallel group."""
-    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
-
-
-def get_pipeline_model_parallel_rank():
-    """Return my rank for the pipeline model parallel group."""
-    return torch.distributed.get_rank(
-        group=get_pipeline_model_parallel_group())
-
-
-def get_tensor_model_parallel_src_rank():
-    """Calculate the global rank corresponding to the first local rank
-    in the tensor model parallel group."""
-    global_rank = torch.distributed.get_rank()
-    local_world_size = get_tensor_model_parallel_world_size()
-    return (global_rank // local_world_size) * local_world_size
-
-
-def get_pipeline_model_parallel_first_rank():
-    """Return the global rank of the first process in the pipeline for the
-    current tensor parallel group"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    return _PIPELINE_GLOBAL_RANKS[0]
-
-
-def get_pipeline_model_parallel_last_rank():
-    """Return the global rank of the last process in the pipeline for the
-    current tensor parallel group"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    last_rank_local = get_pipeline_model_parallel_world_size() - 1
-    return _PIPELINE_GLOBAL_RANKS[last_rank_local]
-
-
-def get_pipeline_model_parallel_next_rank():
-    """Return the global rank that follows the caller in the pipeline"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
-
-
-def get_pipeline_model_parallel_prev_rank():
-    """Return the global rank that preceeds the caller in the pipeline"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, (
-        "Pipeline parallel group is not initialized")
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
-
-
-def destroy_model_parallel():
-    """Set the groups to none."""
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    _TENSOR_MODEL_PARALLEL_GROUP = None
-    global _PIPELINE_MODEL_PARALLEL_GROUP
-    _PIPELINE_MODEL_PARALLEL_GROUP = None
-    global _PIPELINE_GLOBAL_RANKS
-    _PIPELINE_GLOBAL_RANKS = None
--- a/server/vllm/vllm/model_executor/parallel_utils/utils.py
+++ b/server/vllm/vllm/model_executor/parallel_utils/utils.py
-# Copyright 2023 The vLLM team.
-# Adapted from
-# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-from typing import List, Sequence
-
-import torch
-
-
-def ensure_divisibility(numerator, denominator):
-    """Ensure that numerator is divisible by the denominator."""
-    assert numerator % denominator == 0, "{} is not divisible by {}".format(
-        numerator, denominator)
-
-
-def divide(numerator, denominator):
-    """Ensure that numerator is divisible by the denominator and return
-    the division value."""
-    ensure_divisibility(numerator, denominator)
-    return numerator // denominator
-
-
-def split_tensor_along_last_dim(
-    tensor: torch.Tensor,
-    num_partitions: int,
-    contiguous_split_chunks: bool = False,
-) -> List[torch.Tensor]:
-    """ Split a tensor along its last dimension.
-
-        Arguments:
-            tensor: input tensor.
-            num_partitions: number of partitions to split the tensor
-            contiguous_split_chunks: If True, make each chunk contiguous
-                                     in memory.
-
-        Returns:
-            A list of Tensors
-    """
-    # Get the size and dimension.
-    last_dim = tensor.dim() - 1
-    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
-    # Split.
-    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
-    # NOTE: torch.split does not create contiguous tensors by default.
-    if contiguous_split_chunks:
-        return tuple(chunk.contiguous() for chunk in tensor_list)
-
-    return tensor_list
-
-
-class VocabUtility:
-    """ Split the vocabulary into `world_size` chunks and return the first
-        and last index of the vocabulary belonging to the `rank`
-        partition: Note that indices in [fist, last)
-
-    """
-
-    @staticmethod
-    def vocab_range_from_per_partition_vocab_size(
-            per_partition_vocab_size: int, rank: int) -> Sequence[int]:
-        index_f = rank * per_partition_vocab_size
-        index_l = index_f + per_partition_vocab_size
-        return index_f, index_l
-
-    @staticmethod
-    def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int,
-                                           world_size: int) -> Sequence[int]:
-        per_partition_vocab_size = divide(global_vocab_size, world_size)
-        return VocabUtility.vocab_range_from_per_partition_vocab_size(
-            per_partition_vocab_size, rank)
--- a/server/vllm/vllm/model_executor/quantization_utils/__init__.py
+++ b/server/vllm/vllm/model_executor/quantization_utils/__init__.py
-from typing import Type
-
-from vllm.model_executor.quantization_utils.awq import AWQConfig
-from vllm.model_executor.quantization_utils.base import QuantizationConfig
-
-_QUANTIZATION_REGISTRY = {
-    "awq": AWQConfig,
-}
-
-
-def get_quant_class(quantization: str) -> Type[QuantizationConfig]:
-    if quantization not in _QUANTIZATION_REGISTRY:
-        raise ValueError(f"Invalid quantization method: {quantization}")
-    return _QUANTIZATION_REGISTRY[quantization]
-
-
-__all__ = [
-    "QuantizationConfig",
-    "get_quant_class",
-]
--- a/server/vllm/vllm/model_executor/quantization_utils/awq.py
+++ b/server/vllm/vllm/model_executor/quantization_utils/awq.py
-from typing import Any, Dict, List
-
-import torch
-
-from vllm.model_executor.quantization_utils.base import QuantizationConfig
-
-
-class AWQConfig(QuantizationConfig):
-    """Config class for AWQ.
-
-    Reference: https://arxiv.org/abs/2306.00978
-    """
-
-    def __init__(
-        self,
-        weight_bits: int,
-        group_size: int,
-        zero_point: bool,
-    ) -> None:
-        self.weight_bits = weight_bits
-        self.group_size = group_size
-        self.zero_point = zero_point
-
-        if self.weight_bits != 4:
-            raise ValueError(
-                "Currently, only 4-bit weight quantization is supported for "
-                f"AWQ, but got {self.weight_bits} bits.")
-        self.pack_factor = 32 // self.weight_bits
-
-    def __repr__(self) -> str:
-        return (f"AWQConfig(weight_bits={self.weight_bits}, "
-                f"group_size={self.group_size}, "
-                f"zero_point={self.zero_point})")
-
-    @classmethod
-    def get_name(cls) -> str:
-        return "awq"
-
-    @classmethod
-    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
-        return [torch.half]
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        # The AWQ kernel only supports Turing or newer GPUs.
-        return 75
-
-    @classmethod
-    def get_config_filenames(cls) -> List[str]:
-        return [
-            "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
-            "quantize_config.json",  # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq  # pylint: disable=line-too-long
-        ]
-
-    @classmethod
-    def from_config(cls, config: Dict[str, Any]) -> "AWQConfig":
-        weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
-        group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
-        zero_point = cls.get_from_keys(config, ["zero_point"])
-        return cls(weight_bits, group_size, zero_point)
-
-    @classmethod
-    def get_packed_tensor_names(cls) -> List[str]:
-        return ["qweight", "qzeros"]
-
-    @classmethod
-    def get_transposed_tensor_names(cls) -> List[str]:
-        return ["qweight", "qzeros", "scales"]
-
-    @classmethod
-    def get_tp_tensor_names(cls) -> List[str]:
-        return ["qweight", "qzeros", "scales"]
--- a/server/vllm/vllm/model_executor/quantization_utils/base.py
+++ b/server/vllm/vllm/model_executor/quantization_utils/base.py
-from typing import Any, Dict, List
-
-import torch
-
-
-class QuantizationConfig:
-
-    @classmethod
-    def get_name(cls) -> str:
-        """Name of the quantization method."""
-        raise NotImplementedError
-
-    @classmethod
-    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
-        """List of supported activation dtypes."""
-        raise NotImplementedError
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        """Minimum GPU capability to support the quantization method.
-
-        E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
-        This requirement is due to the custom CUDA kernels used by the
-        quantization method.
-        """
-        raise NotImplementedError
-
-    @classmethod
-    def get_config_filenames(cls) -> List[str]:
-        """List of filenames to search for in the model directory."""
-        raise NotImplementedError
-
-    @classmethod
-    def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
-        """Create a config class from the model's quantization config."""
-        raise NotImplementedError
-
-    @staticmethod
-    def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
-        """Get a value from the model's quantization config."""
-        for key in keys:
-            if key in config:
-                return config[key]
-        raise ValueError(f"Cannot find any of {keys} in the model's "
-                         "quantization config.")
-
-    @classmethod
-    def get_packed_tensor_names(cls) -> List[str]:
-        raise NotImplementedError
-
-    @classmethod
-    def is_packed(cls, tensor_name: str) -> bool:
-        """Returns True if a tensor is packed.
-
-        A tensor is considered packed if each element in the tensor is a
-        packed representation of multiple elements in the original tensor.
-        For example, an INT32 element in the tensor may represent 8 INT4
-        elements in the original tensor.
-        """
-        return any(tag in tensor_name for tag in cls.get_packed_tensor_names())
-
-    @classmethod
-    def get_transposed_tensor_names(cls) -> List[str]:
-        raise NotImplementedError
-
-    @classmethod
-    def is_transposed(cls, tensor_name: str) -> bool:
-        """Returns True if a tensor is transposed relative to nn.Linear.weight.
-        """
-        return any(tag in tensor_name
-                   for tag in cls.get_transposed_tensor_names())
-
-    @classmethod
-    def get_tp_tensor_names(cls) -> List[str]:
-        raise NotImplementedError
--- a/server/vllm/vllm/model_executor/utils.py
+++ b/server/vllm/vllm/model_executor/utils.py
-"""Utils for model executor."""
-import random
-
-import numpy as np
-import torch
-
-
-def set_random_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
--- a/server/vllm/vllm/model_executor/weight_utils.py
+++ b/server/vllm/vllm/model_executor/weight_utils.py
-"""Utilities for downloading and initializing model weights."""
-import filelock
-import glob
-import json
-import os
-from collections import defaultdict
-from typing import Any, Iterator, List, Optional, Tuple
-
-from huggingface_hub import snapshot_download
-from safetensors.torch import load_file, save_file, safe_open
-import numpy as np
-import torch
-from tqdm.auto import tqdm
-
-from vllm.logger import init_logger
-from vllm.model_executor.quantization_utils import get_quant_class
-from vllm.model_executor.quantization_utils.base import QuantizationConfig
-
-logger = init_logger(__name__)
-
-
-class Disabledtqdm(tqdm):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs, disable=True)
-
-
-def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
-    lock_dir = cache_dir if cache_dir is not None else "/tmp"
-    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
-    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
-    return lock
-
-
-def _shared_pointers(tensors):
-    ptrs = defaultdict(list)
-    for k, v in tensors.items():
-        ptrs[v.data_ptr()].append(k)
-    failing = []
-    for _, names in ptrs.items():
-        if len(names) > 1:
-            failing.append(names)
-    return failing
-
-
-def convert_bin_to_safetensor_file(
-    pt_filename: str,
-    sf_filename: str,
-) -> None:
-    loaded = torch.load(pt_filename, map_location="cpu")
-    if "state_dict" in loaded:
-        loaded = loaded["state_dict"]
-    shared = _shared_pointers(loaded)
-    for shared_weights in shared:
-        for name in shared_weights[1:]:
-            loaded.pop(name)
-
-    # For tensors to be contiguous
-    loaded = {k: v.contiguous() for k, v in loaded.items()}
-
-    dirname = os.path.dirname(sf_filename)
-    os.makedirs(dirname, exist_ok=True)
-    save_file(loaded, sf_filename, metadata={"format": "pt"})
-
-    # check file size
-    sf_size = os.stat(sf_filename).st_size
-    pt_size = os.stat(pt_filename).st_size
-    if (sf_size - pt_size) / pt_size > 0.01:
-        raise RuntimeError(f"""The file size different is more than 1%:
-         - {sf_filename}: {sf_size}
-         - {pt_filename}: {pt_size}
-         """)
-
-    # check if the tensors are the same
-    reloaded = load_file(sf_filename)
-    for k in loaded:
-        pt_tensor = loaded[k]
-        sf_tensor = reloaded[k]
-        if not torch.equal(pt_tensor, sf_tensor):
-            raise RuntimeError(f"The output tensors do not match for key {k}")
-
-
-# TODO(woosuk): Move this to other place.
-def get_quant_config(
-    quantization: str,
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-) -> QuantizationConfig:
-    is_local = os.path.isdir(model_name_or_path)
-    if not is_local:
-        # Download the config files.
-        with get_lock(model_name_or_path, cache_dir):
-            hf_folder = snapshot_download(model_name_or_path,
-                                          allow_patterns="*.json",
-                                          cache_dir=cache_dir,
-                                          tqdm_class=Disabledtqdm)
-    else:
-        hf_folder = model_name_or_path
-    config_files = glob.glob(os.path.join(hf_folder, "*.json"))
-
-    quant_cls = get_quant_class(quantization)
-    quant_config_files = [
-        f for f in config_files if any(
-            f.endswith(x) for x in quant_cls.get_config_filenames())
-    ]
-    if len(quant_config_files) == 0:
-        raise ValueError(f"Cannot find the config file for {quantization}")
-    if len(quant_config_files) > 1:
-        raise ValueError(f"Found multiple config files for {quantization}: "
-                         f"{quant_config_files}")
-
-    quant_config_file = quant_config_files[0]
-    with open(quant_config_file, "r") as f:
-        config = json.load(f)
-    return quant_cls.from_config(config)
-
-
-def prepare_hf_model_weights(
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-    use_safetensors: bool = False,
-    fall_back_to_pt: bool = True,
-    revision: Optional[str] = None,
-) -> Tuple[str, List[str], bool]:
-    # Download model weights from huggingface.
-    is_local = os.path.isdir(model_name_or_path)
-    if use_safetensors:
-        allow_patterns = ["*.safetensors"]
-    else:
-        # Some quantized models use .pt files for storing the weights.
-        allow_patterns = ["*.bin", "*.pt"]
-    if not is_local:
-        # Use file lock to prevent multiple processes from
-        # downloading the same model weights at the same time.
-        with get_lock(model_name_or_path, cache_dir):
-            hf_folder = snapshot_download(model_name_or_path,
-                                          allow_patterns=allow_patterns,
-                                          cache_dir=cache_dir,
-                                          tqdm_class=Disabledtqdm,
-                                          revision=revision)
-    else:
-        hf_folder = model_name_or_path
-    hf_weights_files: List[str] = []
-    for pattern in allow_patterns:
-        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
-    if not use_safetensors:
-        # Exclude files that are not needed for inference.
-        # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
-        blacklist = [
-            "training_args.bin",
-            "optimizer.bin",
-            "optimizer.pt",
-            "scheduler.pt",
-            "scaler.pt",
-        ]
-        hf_weights_files = [
-            f for f in hf_weights_files
-            if not any(f.endswith(x) for x in blacklist)
-        ]
-
-    if len(hf_weights_files) == 0 and use_safetensors and fall_back_to_pt:
-        return prepare_hf_model_weights(model_name_or_path,
-                                        cache_dir=cache_dir,
-                                        use_safetensors=False,
-                                        fall_back_to_pt=False,
-                                        revision=revision)
-
-    if len(hf_weights_files) == 0:
-        raise RuntimeError(
-            f"Cannot find any model weights with `{model_name_or_path}`")
-
-    return hf_folder, hf_weights_files, use_safetensors
-
-
-def hf_model_weights_iterator(
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-    load_format: str = "auto",
-    revision: Optional[str] = None,
-) -> Iterator[Tuple[str, torch.Tensor]]:
-    use_safetensors = False
-    use_np_cache = False
-    fall_back_to_pt = False
-    if load_format == "auto":
-        use_safetensors = True
-        fall_back_to_pt = True
-    elif load_format == "safetensors":
-        use_safetensors = True
-    elif load_format == "pt":
-        pass
-    elif load_format == "npcache":
-        use_np_cache = True
-    else:
-        raise ValueError(f"Unknown load_format: {load_format}")
-
-    hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(
-        model_name_or_path,
-        cache_dir=cache_dir,
-        use_safetensors=use_safetensors,
-        fall_back_to_pt=fall_back_to_pt,
-        revision=revision)
-
-    if use_np_cache:
-        # Currently np_cache only support *.bin checkpoints
-        assert use_safetensors is False
-
-        # Convert the model weights from torch tensors to numpy arrays for
-        # faster loading.
-        np_folder = os.path.join(hf_folder, "np")
-        os.makedirs(np_folder, exist_ok=True)
-        weight_names_file = os.path.join(np_folder, "weight_names.json")
-        # Use file lock to prevent multiple processes from
-        # dumping the same model weights to numpy at the same time.
-        with get_lock(model_name_or_path, cache_dir):
-            if not os.path.exists(weight_names_file):
-                weight_names = []
-                for bin_file in hf_weights_files:
-                    state = torch.load(bin_file, map_location="cpu")
-                    for name, param in state.items():
-                        param_path = os.path.join(np_folder, name)
-                        with open(param_path, "wb") as f:
-                            np.save(f, param.cpu().detach().numpy())
-                        weight_names.append(name)
-                with open(weight_names_file, "w") as f:
-                    json.dump(weight_names, f)
-
-        with open(weight_names_file, "r") as f:
-            weight_names = json.load(f)
-
-        for name in weight_names:
-            param_path = os.path.join(np_folder, name)
-            with open(param_path, "rb") as f:
-                param = np.load(f)
-            yield name, torch.from_numpy(param)
-    elif use_safetensors:
-        for st_file in hf_weights_files:
-            with safe_open(st_file, framework="pt") as f:
-                for name in f.keys():
-                    param = f.get_slice(name)
-                    yield name, param
-    else:
-        for bin_file in hf_weights_files:
-            state = torch.load(bin_file, map_location="cpu")
-            for name, param in state.items():
-                yield name, param
-            del state
-            torch.cuda.empty_cache()
-
-
-def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
-    """convert PySafeSlice object from safetensors to torch.Tensor
-
-    PySafeSlice object supports indexing, which is done before loading the
-    actual tensor and can reduce the amount of memory being read into the
-    memory. However, it does not support more advanced functionalities
-    like `.view()` or `.t()`. Therefore, if we need to modify the loaded
-    tensor with these more complicated operators, we need to convert to
-    tensor first.
-    """
-    if not isinstance(x, torch.Tensor):
-        x = x[:]
-    return x
-
-
-def load_padded_tensor_parallel_vocab(
-    param: torch.Tensor,
-    loaded_weight: Any,  # `torch.Tensor` or `PySafeSlice`
-    tensor_model_parallel_rank: int,
-) -> None:
-    shard_size = param.shape[0]
-    start_idx = tensor_model_parallel_rank * shard_size
-    end_idx = (tensor_model_parallel_rank + 1) * shard_size
-    loaded_weight = loaded_weight[start_idx:end_idx]
-    loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-    param[:loaded_weight.shape[0]].copy_(loaded_weight)
-
-
-def load_tensor_parallel_weights(
-    param: torch.Tensor,
-    loaded_weight: Any,  # `torch.Tensor` or `PySafeSlice`
-    param_name: str,
-    column_parallel_weight_names: List[str],
-    row_parallel_weight_names: List[str],
-    tensor_model_parallel_rank: int,
-) -> None:
-    for p in column_parallel_weight_names:
-        if p in param_name:
-            shard_size = param.shape[0]
-            start_idx = tensor_model_parallel_rank * shard_size
-            end_idx = (tensor_model_parallel_rank + 1) * shard_size
-            loaded_weight = loaded_weight[start_idx:end_idx]
-            break
-    for p in row_parallel_weight_names:
-        if p in param_name:
-            shard_size = param.shape[1]
-            start_idx = tensor_model_parallel_rank * shard_size
-            end_idx = (tensor_model_parallel_rank + 1) * shard_size
-            loaded_weight = loaded_weight[:, start_idx:end_idx]
-            break
-
-    loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-    assert param.shape == loaded_weight.shape, (
-        f"{param_name} shape mismatch between model and checkpoint: "
-        f"{param.shape} != {loaded_weight.shape}")
-    param.data.copy_(loaded_weight)
-
-
-def initialize_dummy_weights(
-    model: torch.nn.Module,
-    low: float = -1e-3,
-    high: float = 1e-3,
-) -> None:
-    """Initialize model weights with random values.
-
-    The model weights must be randomly initialized for accurate performance
-    measurements. Additionally, the model weights should not cause NaNs in the
-    forward pass. We empirically found that initializing the weights with
-    values between -1e-3 and 1e-3 works well for most models.
-    """
-    for param in model.state_dict().values():
-        param.data.uniform_(low, high)
--- a/server/vllm/vllm/outputs.py
+++ b/server/vllm/vllm/outputs.py
-from typing import List, Optional
-
-from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
-                           SequenceStatus)
-
-
-class CompletionOutput:
-    """The output data of one completion output of a request.
-
-    Args:
-        index: The index of the output in the request.
-        text: The generated output text.
-        token_ids: The token IDs of the generated output text.
-        cumulative_logprob: The cumulative log probability of the generated
-            output text.
-        logprobs: The log probabilities of the top probability words at each
-            position if the logprobs are requested.
-        finish_reason: The reason why the sequence is finished.
-    """
-
-    def __init__(
-        self,
-        index: int,
-        text: str,
-        token_ids: List[int],
-        cumulative_logprob: float,
-        logprobs: Optional[SampleLogprobs],
-        finish_reason: Optional[str] = None,
-    ) -> None:
-        self.index = index
-        self.text = text
-        self.token_ids = token_ids
-        self.cumulative_logprob = cumulative_logprob
-        self.logprobs = logprobs
-        self.finish_reason = finish_reason
-
-    def finished(self) -> bool:
-        return self.finish_reason is not None
-
-    def __repr__(self) -> str:
-        return (f"CompletionOutput(index={self.index}, "
-                f"text={self.text!r}, "
-                f"token_ids={self.token_ids}, "
-                f"cumulative_logprob={self.cumulative_logprob}, "
-                f"logprobs={self.logprobs}, "
-                f"finish_reason={self.finish_reason})")
-
-
-class RequestOutput:
-    """The output data of a request to the LLM.
-
-    Args:
-        request_id: The unique ID of the request.
-        prompt: The prompt string of the request.
-        prompt_token_ids: The token IDs of the prompt.
-        outputs: The output sequences of the request.
-        finished: Whether the whole request is finished.
-    """
-
-    def __init__(
-        self,
-        request_id: str,
-        prompt: str,
-        prompt_token_ids: List[int],
-        prompt_logprobs: Optional[PromptLogprobs],
-        outputs: List[CompletionOutput],
-        finished: bool,
-    ) -> None:
-        self.request_id = request_id
-        self.prompt = prompt
-        self.prompt_token_ids = prompt_token_ids
-        self.prompt_logprobs = prompt_logprobs
-        self.outputs = outputs
-        self.finished = finished
-
-    @classmethod
-    def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
-        # Get the top-n sequences.
-        n = seq_group.sampling_params.n
-        seqs = seq_group.get_seqs()
-        if seq_group.sampling_params.use_beam_search:
-            sorting_key = lambda seq: seq.get_beam_search_score(
-                seq_group.sampling_params.length_penalty)
-        else:
-            sorting_key = lambda seq: seq.get_cumulative_logprob()
-        sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
-        top_n_seqs = sorted_seqs[:n]
-
-        # Create the outputs.
-        outputs: List[CompletionOutput] = []
-        for seq in top_n_seqs:
-            logprobs = seq.output_logprobs
-            if seq_group.sampling_params.logprobs is None:
-                # NOTE: We need to take care of this case because the sequence
-                # always has the logprobs of the sampled tokens even if the
-                # logprobs are not requested.
-                logprobs = None
-            finshed_reason = SequenceStatus.get_finished_reason(seq.status)
-            output = CompletionOutput(seqs.index(seq), seq.output_text,
-                                      seq.get_output_token_ids(),
-                                      seq.get_cumulative_logprob(), logprobs,
-                                      finshed_reason)
-            outputs.append(output)
-
-        # Every sequence in the sequence group should have the same prompt.
-        prompt = seq_group.prompt
-        prompt_token_ids = seq_group.prompt_token_ids
-        prompt_logprobs = seq_group.prompt_logprobs
-        finished = seq_group.is_finished()
-        return cls(seq_group.request_id, prompt, prompt_token_ids,
-                   prompt_logprobs, outputs, finished)
-
-    def __repr__(self) -> str:
-        return (f"RequestOutput(request_id={self.request_id}, "
-                f"prompt={self.prompt!r}, "
-                f"prompt_token_ids={self.prompt_token_ids}, "
-                f"prompt_logprobs={self.prompt_logprobs}, "
-                f"outputs={self.outputs}, "
-                f"finished={self.finished})")
--- a/server/vllm/vllm/sampling_params.py
+++ b/server/vllm/vllm/sampling_params.py
-"""Sampling parameters for text generation."""
-from enum import IntEnum
-from functools import cached_property
-from typing import List, Optional, Union
-
-_SAMPLING_EPS = 1e-5
-
-
-class SamplingType(IntEnum):
-    GREEDY = 0
-    RANDOM = 1
-    BEAM = 2
-
-
-class SamplingParams:
-    """Sampling parameters for text generation.
-
-    Overall, we follow the sampling parameters from the OpenAI text completion
-    API (https://platform.openai.com/docs/api-reference/completions/create).
-    In addition, we support beam search, which is not supported by OpenAI.
-
-    Args:
-        n: Number of output sequences to return for the given prompt.
-        best_of: Number of output sequences that are generated from the prompt.
-            From these `best_of` sequences, the top `n` sequences are returned.
-            `best_of` must be greater than or equal to `n`. This is treated as
-            the beam width when `use_beam_search` is True. By default, `best_of`
-            is set to `n`.
-        presence_penalty: Float that penalizes new tokens based on whether they
-            appear in the generated text so far. Values > 0 encourage the model
-            to use new tokens, while values < 0 encourage the model to repeat
-            tokens.
-        frequency_penalty: Float that penalizes new tokens based on their
-            frequency in the generated text so far. Values > 0 encourage the
-            model to use new tokens, while values < 0 encourage the model to
-            repeat tokens.
-        temperature: Float that controls the randomness of the sampling. Lower
-            values make the model more deterministic, while higher values make
-            the model more random. Zero means greedy sampling.
-        top_p: Float that controls the cumulative probability of the top tokens
-            to consider. Must be in (0, 1]. Set to 1 to consider all tokens.
-        top_k: Integer that controls the number of top tokens to consider. Set
-            to -1 to consider all tokens.
-        use_beam_search: Whether to use beam search instead of sampling.
-        length_penalty: Float that penalizes sequences based on their length.
-            Used in beam search.
-        early_stopping: Controls the stopping condition for beam search. It
-            accepts the following values: `True`, where the generation stops as
-            soon as there are `best_of` complete candidates; `False`, where an
-            heuristic is applied and the generation stops when is it very
-            unlikely to find better candidates; `"never"`, where the beam search
-            procedure only stops when there cannot be better candidates
-            (canonical beam search algorithm).
-        stop: List of strings that stop the generation when they are generated.
-            The returned output will not contain the stop strings.
-        stop_token_ids: List of tokens that stop the generation when they are
-            generated. The returned output will contain the stop tokens unless
-            the stop tokens are sepcial tokens.
-        ignore_eos: Whether to ignore the EOS token and continue generating
-            tokens after the EOS token is generated.
-        max_tokens: Maximum number of tokens to generate per output sequence.
-        logprobs: Number of log probabilities to return per output token.
-            Note that the implementation follows the OpenAI API: The return
-            result includes the log probabilities on the `logprobs` most likely
-            tokens, as well the chosen tokens. The API will always return the
-            log probability of the sampled token, so there  may be up to
-            `logprobs+1` elements in the response.
-        prompt_logprobs: Number of log probabilities to return per prompt token.
-        skip_special_tokens: Whether to skip special tokens in the output.
-    """
-
-    def __init__(
-        self,
-        n: int = 1,
-        best_of: Optional[int] = None,
-        presence_penalty: float = 0.0,
-        frequency_penalty: float = 0.0,
-        temperature: float = 1.0,
-        top_p: float = 1.0,
-        top_k: int = -1,
-        use_beam_search: bool = False,
-        length_penalty: float = 1.0,
-        early_stopping: Union[bool, str] = False,
-        stop: Optional[Union[str, List[str]]] = None,
-        stop_token_ids: Optional[List[int]] = None,
-        ignore_eos: bool = False,
-        max_tokens: int = 16,
-        logprobs: Optional[int] = None,
-        prompt_logprobs: Optional[int] = None,
-        skip_special_tokens: bool = True,
-    ) -> None:
-        self.n = n
-        self.best_of = best_of if best_of is not None else n
-        self.presence_penalty = presence_penalty
-        self.frequency_penalty = frequency_penalty
-        self.temperature = temperature
-        self.top_p = top_p
-        self.top_k = top_k
-        self.use_beam_search = use_beam_search
-        self.length_penalty = length_penalty
-        self.early_stopping = early_stopping
-        if stop is None:
-            self.stop = []
-        elif isinstance(stop, str):
-            self.stop = [stop]
-        else:
-            self.stop = list(stop)
-        if stop_token_ids is None:
-            self.stop_token_ids = []
-        else:
-            self.stop_token_ids = list(stop_token_ids)
-        self.ignore_eos = ignore_eos
-        self.max_tokens = max_tokens
-        self.logprobs = logprobs
-        self.prompt_logprobs = prompt_logprobs
-        self.skip_special_tokens = skip_special_tokens
-
-        self._verify_args()
-        if self.use_beam_search:
-            self._verify_beam_search()
-        else:
-            self._verify_non_beam_search()
-            if self.temperature < _SAMPLING_EPS:
-                # Zero temperature means greedy sampling.
-                self._verify_greedy_sampling()
-
-    def _verify_args(self) -> None:
-        if self.n < 1:
-            raise ValueError(f"n must be at least 1, got {self.n}.")
-        if self.best_of < self.n:
-            raise ValueError(f"best_of must be greater than or equal to n, "
-                             f"got n={self.n} and best_of={self.best_of}.")
-        if not -2.0 <= self.presence_penalty <= 2.0:
-            raise ValueError("presence_penalty must be in [-2, 2], got "
-                             f"{self.presence_penalty}.")
-        if not -2.0 <= self.frequency_penalty <= 2.0:
-            raise ValueError("frequency_penalty must be in [-2, 2], got "
-                             f"{self.frequency_penalty}.")
-        if self.temperature < 0.0:
-            raise ValueError(
-                f"temperature must be non-negative, got {self.temperature}.")
-        if not 0.0 < self.top_p <= 1.0:
-            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
-        if self.top_k < -1 or self.top_k == 0:
-            raise ValueError(f"top_k must be -1 (disable), or at least 1, "
-                             f"got {self.top_k}.")
-        if self.max_tokens < 1:
-            raise ValueError(
-                f"max_tokens must be at least 1, got {self.max_tokens}.")
-        if self.logprobs is not None and self.logprobs < 0:
-            raise ValueError(
-                f"logprobs must be non-negative, got {self.logprobs}.")
-        if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
-            raise ValueError(f"prompt_logprobs must be non-negative, got "
-                             f"{self.prompt_logprobs}.")
-
-    def _verify_beam_search(self) -> None:
-        if self.best_of == 1:
-            raise ValueError("best_of must be greater than 1 when using beam "
-                             f"search. Got {self.best_of}.")
-        if self.temperature > _SAMPLING_EPS:
-            raise ValueError("temperature must be 0 when using beam search.")
-        if self.top_p < 1.0 - _SAMPLING_EPS:
-            raise ValueError("top_p must be 1 when using beam search.")
-        if self.top_k != -1:
-            raise ValueError("top_k must be -1 when using beam search.")
-        if self.early_stopping not in [True, False, "never"]:
-            raise ValueError(
-                f"early_stopping must be True, False, or 'never', "
-                f"got {self.early_stopping}.")
-
-    def _verify_non_beam_search(self) -> None:
-        if self.early_stopping is not False:
-            raise ValueError("early_stopping is not effective and must be "
-                             "False when not using beam search.")
-        if (self.length_penalty < 1.0 - _SAMPLING_EPS
-                or self.length_penalty > 1.0 + _SAMPLING_EPS):
-            raise ValueError(
-                "length_penalty is not effective and must be the "
-                "default value of 1.0 when not using beam search.")
-
-    def _verify_greedy_sampling(self) -> None:
-        if self.best_of > 1:
-            raise ValueError("best_of must be 1 when using greedy sampling."
-                             f"Got {self.best_of}.")
-        if self.top_p < 1.0 - _SAMPLING_EPS:
-            raise ValueError("top_p must be 1 when using greedy sampling.")
-        if self.top_k != -1:
-            raise ValueError("top_k must be -1 when using greedy sampling.")
-
-    @cached_property
-    def sampling_type(self) -> SamplingType:
-        if self.use_beam_search:
-            return SamplingType.BEAM
-        if self.temperature < _SAMPLING_EPS:
-            return SamplingType.GREEDY
-        return SamplingType.RANDOM
-
-    def __repr__(self) -> str:
-        return (f"SamplingParams(n={self.n}, "
-                f"best_of={self.best_of}, "
-                f"presence_penalty={self.presence_penalty}, "
-                f"frequency_penalty={self.frequency_penalty}, "
-                f"temperature={self.temperature}, "
-                f"top_p={self.top_p}, "
-                f"top_k={self.top_k}, "
-                f"use_beam_search={self.use_beam_search}, "
-                f"length_penalty={self.length_penalty}, "
-                f"early_stopping={self.early_stopping}, "
-                f"stop={self.stop}, "
-                f"ignore_eos={self.ignore_eos}, "
-                f"max_tokens={self.max_tokens}, "
-                f"logprobs={self.logprobs}, "
-                f"prompt_logprobs={self.prompt_logprobs}, "
-                f"skip_special_tokens={self.skip_special_tokens})")
--- a/server/vllm/vllm/sequence.py
+++ b/server/vllm/vllm/sequence.py
-"""Sequence and its related classes."""
-import copy
-import enum
-from typing import Dict, List, Optional, Union
-
-from vllm.block import LogicalTokenBlock
-from vllm.sampling_params import SamplingParams
-
-PromptLogprobs = List[Optional[Dict[int, float]]]
-SampleLogprobs = List[Dict[int, float]]
-
-
-class SequenceStatus(enum.Enum):
-    """Status of a sequence."""
-    WAITING = enum.auto()
-    RUNNING = enum.auto()
-    SWAPPED = enum.auto()
-    FINISHED_STOPPED = enum.auto()
-    FINISHED_LENGTH_CAPPED = enum.auto()
-    FINISHED_ABORTED = enum.auto()
-    FINISHED_IGNORED = enum.auto()
-
-    @staticmethod
-    def is_finished(status: "SequenceStatus") -> bool:
-        return status in [
-            SequenceStatus.FINISHED_STOPPED,
-            SequenceStatus.FINISHED_LENGTH_CAPPED,
-            SequenceStatus.FINISHED_ABORTED,
-            SequenceStatus.FINISHED_IGNORED,
-        ]
-
-    @staticmethod
-    def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
-        if status == SequenceStatus.FINISHED_STOPPED:
-            finish_reason = "stop"
-        elif status == SequenceStatus.FINISHED_LENGTH_CAPPED:
-            finish_reason = "length"
-        elif status == SequenceStatus.FINISHED_ABORTED:
-            finish_reason = "abort"
-        elif status == SequenceStatus.FINISHED_IGNORED:
-            # The ignored sequences are the sequences whose prompt lengths
-            # are longer than the model's length cap. Therefore, the stop
-            # reason should also be "length" as in OpenAI API.
-            finish_reason = "length"
-        else:
-            finish_reason = None
-        return finish_reason
-
-
-class SequenceData:
-    """Data associated with a sequence.
-
-
-    Args:
-        prompt_token_ids: The token IDs of the prompt.
-
-    Attributes:
-        prompt_token_ids: The token IDs of the prompt.
-        output_token_ids: The token IDs of the output.
-        cumulative_logprob: The cumulative log probability of the output.
-    """
-
-    def __init__(
-        self,
-        prompt_token_ids: List[int],
-    ) -> None:
-        self.prompt_token_ids = prompt_token_ids
-        self.output_token_ids: List[int] = []
-        self.cumulative_logprob = 0.0
-
-    def append_token_id(self, token_id: int, logprob: float) -> None:
-        self.output_token_ids.append(token_id)
-        self.cumulative_logprob += logprob
-
-    def get_len(self) -> int:
-        return len(self.output_token_ids) + len(self.prompt_token_ids)
-
-    def get_prompt_len(self) -> int:
-        return len(self.prompt_token_ids)
-
-    def get_output_len(self) -> int:
-        return len(self.output_token_ids)
-
-    def get_token_ids(self) -> List[int]:
-        return self.prompt_token_ids + self.output_token_ids
-
-    def get_last_token_id(self) -> int:
-        if not self.output_token_ids:
-            return self.prompt_token_ids[-1]
-        return self.output_token_ids[-1]
-
-    def __repr__(self) -> str:
-        return (f"SequenceData("
-                f"prompt_token_ids={self.prompt_token_ids}, "
-                f"output_token_ids={self.output_token_ids}, "
-                f"cumulative_logprob={self.cumulative_logprob})")
-
-
-class Sequence:
-    """Stores the data, status, and block information of a sequence.
-
-    Args:
-        seq_id: The ID of the sequence.
-        prompt: The prompt of the sequence.
-        prompt_token_ids: The token IDs of the prompt.
-        block_size: The block size of the sequence. Should be the same as the
-            block size used by the block manager and cache engine.
-    """
-
-    def __init__(
-        self,
-        seq_id: int,
-        prompt: str,
-        prompt_token_ids: List[int],
-        block_size: int,
-    ) -> None:
-        self.seq_id = seq_id
-        self.prompt = prompt
-        self.block_size = block_size
-
-        self.data = SequenceData(prompt_token_ids)
-        self.output_logprobs: SampleLogprobs = []
-        self.output_text = ""
-
-        self.logical_token_blocks: List[LogicalTokenBlock] = []
-        # Initialize the logical token blocks with the prompt token ids.
-        self._append_tokens_to_blocks(prompt_token_ids)
-        self.status = SequenceStatus.WAITING
-
-        # Used for incremental detokenization
-        self.prefix_offset = 0
-        self.read_offset = 0
-        # Input + output tokens
-        self.tokens: Optional[List[str]] = None
-
-    def _append_logical_block(self) -> None:
-        block = LogicalTokenBlock(
-            block_number=len(self.logical_token_blocks),
-            block_size=self.block_size,
-        )
-        self.logical_token_blocks.append(block)
-
-    def _append_tokens_to_blocks(self, token_ids: List[int]) -> None:
-        cursor = 0
-        while cursor < len(token_ids):
-            if not self.logical_token_blocks:
-                self._append_logical_block()
-
-            last_block = self.logical_token_blocks[-1]
-            if last_block.is_full():
-                self._append_logical_block()
-                last_block = self.logical_token_blocks[-1]
-
-            num_empty_slots = last_block.get_num_empty_slots()
-            last_block.append_tokens(token_ids[cursor:cursor +
-                                               num_empty_slots])
-            cursor += num_empty_slots
-
-    def append_token_id(
-        self,
-        token_id: int,
-        logprobs: Dict[int, float],
-    ) -> None:
-        assert token_id in logprobs
-        self._append_tokens_to_blocks([token_id])
-        self.output_logprobs.append(logprobs)
-        self.data.append_token_id(token_id, logprobs[token_id])
-
-    def get_len(self) -> int:
-        return self.data.get_len()
-
-    def get_prompt_len(self) -> int:
-        return self.data.get_prompt_len()
-
-    def get_output_len(self) -> int:
-        return self.data.get_output_len()
-
-    def get_token_ids(self) -> List[int]:
-        return self.data.get_token_ids()
-
-    def get_last_token_id(self) -> int:
-        return self.data.get_last_token_id()
-
-    def get_output_token_ids(self) -> List[int]:
-        return self.data.output_token_ids
-
-    def get_cumulative_logprob(self) -> float:
-        return self.data.cumulative_logprob
-
-    def get_beam_search_score(self,
-                              length_penalty: float = 0.0,
-                              seq_len: Optional[int] = None,
-                              eos_token_id: Optional[int] = None) -> float:
-        """Calculate the beam search score with length penalty.
-
-        Adapted from
-
-        https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938
-        """
-        if seq_len is None:
-            seq_len = self.get_len()
-            # NOTE: HF implementation does not count the EOS token
-            # towards the length, we align with that here for testing.
-            if (eos_token_id is not None
-                    and self.get_last_token_id() == eos_token_id):
-                seq_len -= 1
-        return self.get_cumulative_logprob() / (seq_len**length_penalty)
-
-    def is_finished(self) -> bool:
-        return SequenceStatus.is_finished(self.status)
-
-    def fork(self, new_seq_id: int) -> "Sequence":
-        new_seq = copy.deepcopy(self)
-        new_seq.seq_id = new_seq_id
-        return new_seq
-
-    def __repr__(self) -> str:
-        return (f"Sequence(seq_id={self.seq_id}, "
-                f"status={self.status.name}, "
-                f"num_blocks={len(self.logical_token_blocks)})")
-
-
-class SequenceGroup:
-    """A group of sequences that are generated from the same prompt.
-
-    Args:
-        request_id: The ID of the request.
-        seqs: The list of sequences.
-        sampling_params: The sampling parameters used to generate the outputs.
-        arrival_time: The arrival time of the request.
-    """
-
-    def __init__(
-        self,
-        request_id: str,
-        seqs: List[Sequence],
-        sampling_params: SamplingParams,
-        arrival_time: float,
-    ) -> None:
-        self.request_id = request_id
-        self.seqs_dict = {seq.seq_id: seq for seq in seqs}
-        self.sampling_params = sampling_params
-        self.arrival_time = arrival_time
-        self.prompt_logprobs: Optional[PromptLogprobs] = None
-
-    @property
-    def prompt(self) -> str:
-        # All sequences in the group should have the same prompt.
-        # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt
-
-    @property
-    def prompt_token_ids(self) -> List[int]:
-        # All sequences in the group should have the same prompt.
-        # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).data.prompt_token_ids
-
-    def get_max_num_running_seqs(self) -> int:
-        """The maximum number of sequences running in parallel in the remaining
-        lifetime of the request."""
-        if self.sampling_params.use_beam_search:
-            # For beam search, maximally there will always be `best_of` beam
-            # candidates running in the future.
-            return self.sampling_params.best_of
-        else:
-            if self.sampling_params.best_of > self.num_seqs():
-                # At prompt stage, the sequence group is not yet filled up
-                # and only have one sequence running. However, in the
-                # generation stage, we will have `best_of` sequences running.
-                return self.sampling_params.best_of
-            # At sampling stages, return the number of actual sequences
-            # that are not finished yet.
-            return self.num_unfinished_seqs()
-
-    def get_seqs(
-        self,
-        status: Optional[SequenceStatus] = None,
-    ) -> List[Sequence]:
-        if status is None:
-            return list(self.seqs_dict.values())
-        else:
-            return [
-                seq for seq in self.seqs_dict.values() if seq.status == status
-            ]
-
-    def get_unfinished_seqs(self) -> List[Sequence]:
-        return [
-            seq for seq in self.seqs_dict.values() if not seq.is_finished()
-        ]
-
-    def get_finished_seqs(self) -> List[Sequence]:
-        return [seq for seq in self.seqs_dict.values() if seq.is_finished()]
-
-    def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
-        return len(self.get_seqs(status))
-
-    def num_unfinished_seqs(self) -> int:
-        return len(self.get_unfinished_seqs())
-
-    def num_finished_seqs(self) -> int:
-        return len(self.get_finished_seqs())
-
-    def find(self, seq_id: int) -> Sequence:
-        if seq_id not in self.seqs_dict:
-            raise ValueError(f"Sequence {seq_id} not found.")
-        return self.seqs_dict[seq_id]
-
-    def add(self, seq: Sequence) -> None:
-        if seq.seq_id in self.seqs_dict:
-            raise ValueError(f"Sequence {seq.seq_id} already exists.")
-        self.seqs_dict[seq.seq_id] = seq
-
-    def remove(self, seq_id: int) -> None:
-        if seq_id not in self.seqs_dict:
-            raise ValueError(f"Sequence {seq_id} not found.")
-        del self.seqs_dict[seq_id]
-
-    def is_finished(self) -> bool:
-        return all(seq.is_finished() for seq in self.get_seqs())
-
-    def __repr__(self) -> str:
-        return (f"SequenceGroup(request_id={self.request_id}, "
-                f"sampling_params={self.sampling_params}, "
-                f"num_seqs={len(self.seqs_dict)})")
-
-
-class SequenceGroupMetadata:
-    """Metadata for a sequence group. Used to create `InputMetadata`.
-
-
-    Args:
-        request_id: The ID of the request.
-        is_prompt: Whether the request is at prompt stage.
-        seq_data: The sequence data. (Seq id -> sequence data)
-        sampling_params: The sampling parameters used to generate the outputs.
-        block_tables: The block tables. (Seq id -> list of physical block
-            numbers)
-    """
-
-    def __init__(
-        self,
-        request_id: str,
-        is_prompt: bool,
-        seq_data: Dict[int, SequenceData],
-        sampling_params: SamplingParams,
-        block_tables: Dict[int, List[int]],
-    ) -> None:
-        self.request_id = request_id
-        self.is_prompt = is_prompt
-        self.seq_data = seq_data
-        self.sampling_params = sampling_params
-        self.block_tables = block_tables
-
-
-class SequenceOutputs:
-    """The model output associated with a sequence.
-
-    Args:
-        parent_seq_id: The ID of the parent sequence (for forking in beam
-            search).
-        output_token: The output token ID.
-        logprobs: The logprobs of the output token.
-            (Token id -> logP(x_i+1 | x_0, ..., x_i))
-    """
-
-    def __init__(
-        self,
-        parent_seq_id: int,
-        output_token: int,
-        logprobs: Dict[int, float],
-    ) -> None:
-        self.parent_seq_id = parent_seq_id
-        self.output_token = output_token
-        self.logprobs = logprobs
-
-    def __repr__(self) -> str:
-        return (f"SequenceOutputs(parent_seq_id={self.parent_seq_id}, "
-                f"output_token={self.output_token}, "
-                f"logprobs={self.logprobs})")
-
-    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, SequenceOutputs):
-            raise NotImplementedError()
-        return (self.parent_seq_id == other.parent_seq_id
-                and self.output_token == other.output_token
-                and self.logprobs == other.logprobs)
-
-
-class SequenceGroupOutputs:
-    """The model outputs associated with a sequence group."""
-
-    def __init__(
-        self,
-        samples: List[SequenceOutputs],
-        prompt_logprobs: Optional[PromptLogprobs],
-    ) -> None:
-        self.samples = samples
-        self.prompt_logprobs = prompt_logprobs
-
-    def __repr__(self) -> str:
-        return (f"SequenceGroupOutputs(samples={self.samples}, "
-                f"prompt_logprobs={self.prompt_logprobs})")
-
-
-# For each sequence group, we generate a list of SequenceOutputs object,
-# each of which contains one possible candidate for the next token.
-SamplerOutput = List[SequenceGroupOutputs]
--- a/server/vllm/vllm/transformers_utils/__init__.py
+++ b/server/vllm/vllm/transformers_utils/__init__.py
--- a/server/vllm/vllm/transformers_utils/config.py
+++ b/server/vllm/vllm/transformers_utils/config.py
-from typing import Optional
-
-from transformers import AutoConfig, PretrainedConfig
-
-from vllm.transformers_utils.configs import *  # pylint: disable=wildcard-import
-
-_CONFIG_REGISTRY = {
-    "mpt": MPTConfig,
-    "baichuan": BaiChuanConfig,
-    "aquila": AquilaConfig,
-    "qwen": QWenConfig,
-    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
-    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
-}
-
-
-def get_config(model: str,
-               trust_remote_code: bool,
-               revision: Optional[str] = None) -> PretrainedConfig:
-    try:
-        config = AutoConfig.from_pretrained(
-            model, trust_remote_code=trust_remote_code, revision=revision)
-    except ValueError as e:
-        if (not trust_remote_code and
-                "requires you to execute the configuration file" in str(e)):
-            err_msg = (
-                "Failed to load the model config. If the model is a custom "
-                "model not yet available in the HuggingFace transformers "
-                "library, consider setting `trust_remote_code=True` in LLM "
-                "or using the `--trust-remote-code` flag in the CLI.")
-            raise RuntimeError(err_msg) from e
-        else:
-            raise e
-    if config.model_type in _CONFIG_REGISTRY:
-        config_class = _CONFIG_REGISTRY[config.model_type]
-        config = config_class.from_pretrained(model, revision=revision)
-    return config